import re
from pathlib import Path
import numpy as np
import pandas as pd
import pooch
import anndata as ad
import proteopy as pp
from proteopy.utils.anndata import check_proteodata
def _parse_sample_id(col: str) -> str:
"""Parse and clean sample identifiers from raw column names.
Remove technical prefixes, suffixes, and file extensions from column
names in the downloaded PRIDE data to extract meaningful sample
identifiers.
Parameters
----------
col : str
Raw column name from the PRIDE CSV file containing sample
identifier and technical annotations.
Returns
-------
str
Cleaned sample identifier with technical metadata removed.
Examples
--------
>>> col = "[1] 20181222_QX0_OzKa_SA_CD34pos_DIA_P1.raw.PG.Quantity"
>>> _parse_sample_id(col)
'P1'
"""
col = re.sub(r"^\[\d+\]\s*", "", col)
col = col.replace(".PG.Quantity", "")
col = re.sub(r"\.raw$", "", col)
col = Path(col).stem
col = col.replace("20181222_QX0_OzKa_SA_CD34pos_", "")
col = col.replace("DIA_", "")
col = col.replace("_181226121547", "")
return col
# [docs] -- documentation-link residue from page extraction, not module code
def karayel_2020(
    fill_na: float | int | None = None,
) -> ad.AnnData:
    """Load Karayel 2020 erythropoiesis proteomics dataset.

    Download and process the protein-level DIA-MS dataset from
    Karayel et al. [1]_ studying dynamic phosphosignaling
    networks during human erythropoiesis. The study quantified
    ~7,400 proteins from CD34+ hematopoietic stem/progenitor
    cells (HSPCs) isolated from healthy donors, across five
    sequential erythroid differentiation stages with four
    biological replicates each (20 samples total). Cells were
    FACS-sorted using CD235a, CD49d, and Band 3 surface
    markers. The differentiation stages are:

    - Progenitor: CFU-E progenitor cells
      (CD34+ HSPCs, negative fraction)
    - ProE&EBaso: Proerythroblasts and early basophilic
      erythroblasts
    - LBaso: Late basophilic erythroblasts
    - Poly: Polychromatic erythroblasts
    - Ortho: Orthochromatic erythroblasts

    Data are sourced from the PRIDE archive (`PXD017276
    <https://proteomecentral.proteomexchange.org/cgi/
    GetDataset?ID=PXD017276>`_). Protein quantities marked
    as ``Filtered`` in the original data are converted to
    ``np.nan``. Samples collected at day 7 are excluded.

    Sample annotation (``.obs``) includes:

    - ``sample_id``: Unique identifier (cell_type_replicate).
    - ``cell_type``: Differentiation stage abbreviation (first
      ``_``-separated token of ``sample_id``).
    - ``replicate``: Replicate identifier (last ``_``-separated
      token of ``sample_id``).

    Variable annotation (``.var``) includes:

    - ``protein_id``: Protein group identifier (matches
      ``.var_names``).
    - ``gene_id``: Associated gene name(s).

    Parameters
    ----------
    fill_na : float | int | None, optional
        If not ``None``, replace ``np.nan`` in ``.X``
        with this value. NumPy integer and floating scalars
        (e.g. ``np.int64(0)``) are accepted as well; booleans
        are rejected.

    Returns
    -------
    AnnData
        Protein-level quantification data. ``.X`` contains
        protein intensities (samples x proteins).

    Raises
    ------
    TypeError
        If ``fill_na`` is a boolean or is not a real number.
    urllib.error.URLError
        If the download from the PRIDE archive fails.
        NOTE(review): the download is delegated to
        ``pooch.retrieve``; confirm which exception type it
        actually raises on a network or hash failure.

    Examples
    --------
    >>> import proteopy as pr
    >>> adata = pr.datasets.karayel_2020()
    >>> adata
    AnnData object with n_obs × n_vars
        obs: 'sample_id', 'cell_type', 'replicate'
        var: 'protein_id', 'gene_id'
    >>> adata.obs['cell_type'].unique()
    ['Progenitor', 'ProE&EBaso', 'LBaso', 'Poly', 'Ortho']

    References
    ----------
    .. [1] Karayel Ö, Xu P, Bludau I, Velan Bhoopalan S,
       Yao Y, Ana Rita FC, Santos A, Schulman BA, Alpi AF,
       Weiss MJ, and Mann M. "Integrative proteomics
       reveals principles of dynamic phosphosignaling
       networks in human erythropoiesis."
       *Molecular Systems Biology*, 16(12):MSB20209813,
       2020. :doi:`10.15252/msb.20209813`.
    """
    # ``bool`` is a subclass of ``int`` and is therefore rejected
    # explicitly. NumPy integer/floating scalars are accepted so that
    # values such as ``np.int64(0)`` or ``np.float32(0)`` do not fail
    # the check; ``np.bool_`` is in neither hierarchy and stays rejected.
    if fill_na is not None and (
        isinstance(fill_na, bool)
        or not isinstance(
            fill_na, (int, float, np.integer, np.floating)
        )
    ):
        raise TypeError(
            f"fill_na must be float, int, or None, "
            f"got {type(fill_na).__name__}"
        )

    # Download from PRIDE archive (cached locally and verified against
    # the pinned SHA-256 digest by pooch).
    url = (
        "https://ftp.pride.ebi.ac.uk/pride/data/archive/2020/10/"
        "PXD017276/20190213_CD34_Phospho_study_DIA_proteome_Report.csv"
    )
    file_path = pooch.retrieve(
        url=url,
        known_hash=(
            "sha256:"
            "b69cb93a0d1ef03efb3e29ddc4fcf1d28b09"
            "73de8105b919a04fb710dfe66326"
        ),
        fname="karayel_2020_proteome_report.csv",
        path=pooch.os_cache("proteopy"),
    )
    df = pd.read_csv(file_path)

    # Per-sample quantity columns; the "Filtered" marker becomes NaN so
    # that the columns can be cast to float.
    quant_cols = [c for c in df.columns if c.endswith(".PG.Quantity")]
    df[quant_cols] = (
        df[quant_cols].replace("Filtered", np.nan).astype(float)
    )

    # Wide to long format
    long_df = (
        df[["PG.ProteinGroups"] + quant_cols]
        .melt(
            id_vars="PG.ProteinGroups",
            var_name="raw_col",
            value_name="intensity",
        )
    )

    # Clean sample IDs and map to cell type names
    long_df["sample_id"] = long_df["raw_col"].map(_parse_sample_id)
    long_df = long_df.drop(columns=["raw_col"])
    long_df = long_df.rename(columns={"PG.ProteinGroups": "protein_id"})

    # Literal (non-regex) substring replacements, applied in this order.
    stage_names = {
        "Negativefrac": "Progenitor",
        "P1andP2": "ProE&EBaso",
        "P3": "LBaso",
        "P4": "Poly",
        "P5": "Ortho",
    }
    for raw_label, stage in stage_names.items():
        long_df["sample_id"] = long_df["sample_id"].str.replace(
            raw_label, stage, regex=False
        )

    # Exclude day 7 samples
    karayel_2020_quant = long_df[
        ~long_df["sample_id"].str.contains("_D7")
    ]

    # Build sample annotation: one row per remaining sample, with the
    # cell type and replicate taken from the first and last
    # "_"-separated tokens of the sample ID.
    karayel_2020_meta_obs = (
        karayel_2020_quant[["sample_id"]]
        .drop_duplicates()
        .reset_index(drop=True)
    )
    sample_tokens = karayel_2020_meta_obs["sample_id"].str.split("_")
    karayel_2020_meta_obs["cell_type"] = sample_tokens.str[0]
    karayel_2020_meta_obs["replicate"] = sample_tokens.str[-1]

    # Build protein annotation
    karayel_2020_meta_var = (
        df[["PG.ProteinGroups", "PG.Genes"]]
        .drop_duplicates()
        .reset_index(drop=True)
    )
    karayel_2020_meta_var = karayel_2020_meta_var.rename(
        columns={
            "PG.ProteinGroups": "protein_id",
            "PG.Genes": "gene_id",
        }
    )

    # Assemble AnnData
    adata = pp.read.long(
        intensities=karayel_2020_quant,
        level="protein",
        sample_annotation=karayel_2020_meta_obs,
        var_annotation=karayel_2020_meta_var,
    )

    if fill_na is not None:
        adata.X[np.isnan(adata.X)] = fill_na

    check_proteodata(adata)
    return adata