Source code for proteopy.datasets.karayel_2020

import re
from pathlib import Path

import numpy as np
import pandas as pd
import pooch

import anndata as ad
import proteopy as pp
from proteopy.utils.anndata import check_proteodata


def _parse_sample_id(col: str) -> str:
    """Parse and clean sample identifiers from raw column names.

    Remove technical prefixes, suffixes, and file extensions from column
    names in the downloaded PRIDE data to extract meaningful sample
    identifiers.

    Parameters
    ----------
    col : str
        Raw column name from the PRIDE CSV file containing sample
        identifier and technical annotations.

    Returns
    -------
    str
        Cleaned sample identifier with technical metadata removed.

    Examples
    --------
    >>> col = "[1] 20181222_QX0_OzKa_SA_CD34pos_DIA_P1.raw.PG.Quantity"
    >>> _parse_sample_id(col)
    'P1'
    """
    col = re.sub(r"^\[\d+\]\s*", "", col)
    col = col.replace(".PG.Quantity", "")
    col = re.sub(r"\.raw$", "", col)
    col = Path(col).stem
    col = col.replace("20181222_QX0_OzKa_SA_CD34pos_", "")
    col = col.replace("DIA_", "")
    col = col.replace("_181226121547", "")
    return col


def karayel_2020(
    fill_na: float | int | None = None,
) -> ad.AnnData:
    """Load Karayel 2020 erythropoiesis proteomics dataset.

    Download and process the protein-level DIA-MS dataset from Karayel
    et al. [1]_ studying dynamic phosphosignaling networks during human
    erythropoiesis. The study quantified ~7,400 proteins from CD34+
    hematopoietic stem/progenitor cells (HSPCs) isolated from healthy
    donors, across five sequential erythroid differentiation stages with
    four biological replicates each (20 samples total). Cells were
    FACS-sorted using CD235a, CD49d, and Band 3 surface markers.

    The differentiation stages are:

    - Progenitor: CFU-E progenitor cells (CD34+ HSPCs, negative fraction)
    - ProE&EBaso: Proerythroblasts and early basophilic erythroblasts
    - LBaso: Late basophilic erythroblasts
    - Poly: Polychromatic erythroblasts
    - Ortho: Orthochromatic erythroblasts

    Data are sourced from the PRIDE archive (`PXD017276
    <https://proteomecentral.proteomexchange.org/cgi/
    GetDataset?ID=PXD017276>`_). Protein quantities marked as
    ``Filtered`` in the original data are converted to ``np.nan``.
    Samples whose identifier contains ``_D7`` (day 7) are excluded.

    Sample annotation (``.obs``) includes:

    - ``sample_id``: Unique identifier (cell_type_replicate).
    - ``cell_type``: Differentiation stage name, taken as the first
      ``_``-separated token of ``sample_id``.
    - ``replicate``: Replicate identifier, taken as the last
      ``_``-separated token of ``sample_id``.

    Variable annotation (``.var``) includes:

    - ``protein_id``: Protein group identifier (``PG.ProteinGroups``
      column of the report).
    - ``gene_id``: Associated gene name(s) (``PG.Genes`` column).

    Parameters
    ----------
    fill_na : float | int | None, optional
        If not ``None``, replace ``np.nan`` in ``.X`` with this value.

    Returns
    -------
    AnnData
        Protein-level quantification data assembled by
        ``proteopy.read.long`` and validated with ``check_proteodata``.

    Raises
    ------
    TypeError
        If ``fill_na`` is neither ``None`` nor an ``int``/``float``
        (``bool`` is rejected as well).

    Notes
    -----
    The report is fetched with ``pooch.retrieve`` into the ``proteopy``
    OS cache directory and checked against a pinned SHA-256 hash. Any
    exception raised by ``pooch.retrieve`` (for example on a failed
    download) propagates unchanged.

    Examples
    --------
    >>> import proteopy as pr
    >>> adata = pr.datasets.karayel_2020()
    >>> adata
    AnnData object with n_obs × n_vars
        obs: 'sample_id', 'cell_type', 'replicate'
        var: 'protein_id', 'gene_id'
    >>> adata.obs['cell_type'].unique()
    ['Progenitor', 'ProE&EBaso', 'LBaso', 'Poly', 'Ortho']

    References
    ----------
    .. [1] Karayel Ö, Xu P, Bludau I, Velan Bhoopalan S, Yao Y,
       Ana Rita FC, Santos A, Schulman BA, Alpi AF, Weiss MJ, and
       Mann M. "Integrative proteomics reveals principles of dynamic
       phosphosignaling networks in human erythropoiesis."
       *Molecular Systems Biology*, 16(12):MSB20209813, 2020.
       :doi:`10.15252/msb.20209813`.
    """
    # Validate before doing any network work. ``bool`` is a subclass of
    # ``int`` and therefore has to be rejected explicitly.
    if fill_na is not None:
        is_plain_number = (
            isinstance(fill_na, (int, float))
            and not isinstance(fill_na, bool)
        )
        if not is_plain_number:
            raise TypeError(
                f"fill_na must be float, int, or None, "
                f"got {type(fill_na).__name__}"
            )

    # Fetch the protein report from the PRIDE archive (cached locally).
    report_url = (
        "https://ftp.pride.ebi.ac.uk/pride/data/archive/2020/10/"
        "PXD017276/20190213_CD34_Phospho_study_DIA_proteome_Report.csv"
    )
    report_hash = (
        "sha256:"
        "b69cb93a0d1ef03efb3e29ddc4fcf1d28b09"
        "73de8105b919a04fb710dfe66326"
    )
    report_path = pooch.retrieve(
        url=report_url,
        known_hash=report_hash,
        fname="karayel_2020_proteome_report.csv",
        path=pooch.os_cache("proteopy"),
    )

    report = pd.read_csv(report_path)

    # Quantity columns: turn the "Filtered" marker into NaN, then make
    # the columns numeric.
    quantity_columns = [
        name for name in report.columns if name.endswith(".PG.Quantity")
    ]
    unfiltered = report[quantity_columns].replace("Filtered", np.nan)
    report[quantity_columns] = unfiltered.astype(float)

    # Reshape to one row per (protein group, sample column).
    wide = report[["PG.ProteinGroups"] + quantity_columns]
    tidy = wide.melt(
        id_vars="PG.ProteinGroups",
        var_name="raw_col",
        value_name="intensity",
    )

    # Derive clean sample identifiers from the raw column headers.
    tidy["sample_id"] = tidy["raw_col"].map(_parse_sample_id)
    tidy = tidy.drop(columns=["raw_col"])
    tidy = tidy.rename(columns={"PG.ProteinGroups": "protein_id"})

    # Translate sorting-gate labels into stage names. The replacements
    # are applied one after another in this (insertion) order.
    stage_labels = {
        'Negativefrac': 'Progenitor',
        'P1andP2': 'ProE&EBaso',
        'P3': 'LBaso',
        'P4': 'Poly',
        'P5': 'Ortho',
    }
    relabelled = tidy['sample_id']
    for gate_label, stage_name in stage_labels.items():
        relabelled = relabelled.str.replace(
            gate_label, stage_name, regex=False
        )
    tidy['sample_id'] = relabelled

    # Exclude day 7 samples
    is_day7 = tidy["sample_id"].str.contains('_D7')
    intensities = tidy[~is_day7]

    # Sample annotation: one row per remaining sample.
    obs_table = (
        intensities[['sample_id']]
        .drop_duplicates()
        .reset_index(drop=True)
    )
    id_tokens = obs_table["sample_id"].str.split("_")
    obs_table["cell_type"] = id_tokens.str[0]
    obs_table["replicate"] = id_tokens.str[-1]

    # Protein annotation: protein group with its gene name(s).
    var_table = (
        report[['PG.ProteinGroups', 'PG.Genes']]
        .drop_duplicates()
        .reset_index(drop=True)
    )
    var_table = var_table.rename(
        columns={
            'PG.ProteinGroups': 'protein_id',
            'PG.Genes': 'gene_id',
        }
    )

    # Assemble the AnnData object from the long-format tables.
    adata = pp.read.long(
        intensities=intensities,
        level='protein',
        sample_annotation=obs_table,
        var_annotation=var_table,
    )

    if fill_na is not None:
        missing = np.isnan(adata.X)
        adata.X[missing] = fill_na

    check_proteodata(adata)

    return adata