# Source code for proteopy.datasets.karayel_2020

import re
from pathlib import Path

import numpy as np
import pandas as pd
import pooch

import anndata as ad
import proteopy as pp
from proteopy.utils.anndata import check_proteodata


def _parse_sample_id(col: str) -> str:
    """Parse and clean sample identifiers from raw column names.

    Remove technical prefixes, suffixes, and file extensions from column
    names in the downloaded PRIDE data to extract meaningful sample
    identifiers.

    Parameters
    ----------
    col : str
        Raw column name from the PRIDE CSV file containing sample
        identifier and technical annotations.

    Returns
    -------
    str
        Cleaned sample identifier with technical metadata removed.

    Examples
    --------
    >>> col = "[1] 20181222_QX0_OzKa_SA_CD34pos_DIA_P1.raw.PG.Quantity"
    >>> _parse_sample_id(col)
    'P1'
    """
    col = re.sub(r"^\[\d+\]\s*", "", col)
    col = col.replace(".PG.Quantity", "")
    col = re.sub(r"\.raw$", "", col)
    col = Path(col).stem
    col = col.replace("20181222_QX0_OzKa_SA_CD34pos_", "")
    col = col.replace("DIA_", "")
    col = col.replace("_181226121547", "")
    return col



# [docs]
def karayel_2020(
    fill_na: float | int | None = None,
) -> ad.AnnData:
    """Load Karayel 2020 erythropoiesis proteomics dataset.

    Download and process the protein-level DIA-MS dataset from
    Karayel et al. [1]_ studying dynamic phosphosignaling
    networks during human erythropoiesis. The study quantified
    ~7,400 proteins from CD34+ hematopoietic stem/progenitor
    cells (HSPCs) isolated from healthy donors, across five
    sequential erythroid differentiation stages with four
    biological replicates each (20 samples total). Cells were
    FACS-sorted using CD235a, CD49d, and Band 3 surface
    markers. The differentiation stages are:

    - Progenitor: CFU-E progenitor cells
      (CD34+ HSPCs, negative fraction)
    - ProE&EBaso: Proerythroblasts and early basophilic
      erythroblasts
    - LBaso: Late basophilic erythroblasts
    - Poly: Polychromatic erythroblasts
    - Ortho: Orthochromatic erythroblasts

    Data are sourced from the PRIDE archive (`PXD017276
    <https://proteomecentral.proteomexchange.org/cgi/
    GetDataset?ID=PXD017276>`_). The report file is fetched
    with ``pooch`` into the ``proteopy`` OS cache directory and
    checked against a pinned SHA-256 hash. Protein quantities
    marked as ``Filtered`` in the original data are converted
    to ``np.nan``. Samples collected at day 7 are excluded.

    Sample annotation (``.obs``) includes:

    - ``sample_id``: Unique identifier (cell_type_replicate).
    - ``cell_type``: Differentiation stage abbreviation
      (first ``_``-separated token of ``sample_id``).
    - ``replicate``: Replicate identifier (last
      ``_``-separated token of ``sample_id``).

    Variable annotation (``.var``) includes:

    - ``protein_id``: Protein group identifier (matches
      ``.var_names``).
    - ``gene_id``: Associated gene name(s).

    Parameters
    ----------
    fill_na : float | int | None, optional
        If not ``None``, replace ``np.nan`` in ``.X``
        with this value. ``bool`` values are rejected.

    Returns
    -------
    AnnData
        Protein-level quantification data. ``.X`` contains
        protein intensities (samples x proteins).

    Raises
    ------
    TypeError
        If ``fill_na`` is not a ``float``, an ``int``, or
        ``None`` (``bool`` is not accepted).
    urllib.error.URLError
        If the download from the PRIDE archive fails.

    Examples
    --------
    >>> import proteopy as pr
    >>> adata = pr.datasets.karayel_2020()
    >>> adata
    AnnData object with n_obs × n_vars
        obs: 'sample_id', 'cell_type', 'replicate'
        var: 'protein_id', 'gene_id'

    >>> adata.obs['cell_type'].unique()
    ['Progenitor', 'ProE&EBaso', 'LBaso', 'Poly', 'Ortho']

    References
    ----------
    .. [1] Karayel Ö, Xu P, Bludau I, Velan Bhoopalan S,
       Yao Y, Ana Rita FC, Santos A, Schulman BA, Alpi AF,
       Weiss MJ, and Mann M. "Integrative proteomics
       reveals principles of dynamic phosphosignaling
       networks in human erythropoiesis."
       *Molecular Systems Biology*, 16(12):MSB20209813,
       2020. :doi:`10.15252/msb.20209813`.
    """
    # Validate before any download. ``bool`` is a subclass of ``int``
    # and therefore has to be rejected explicitly.
    if fill_na is not None and (
        isinstance(fill_na, bool)
        or not isinstance(fill_na, (int, float))
    ):
        raise TypeError(
            f"fill_na must be float, int, or None, "
            f"got {type(fill_na).__name__}"
        )

    # Download from PRIDE archive (cached; content pinned by SHA-256).
    # NOTE(review): the docstring documents urllib.error.URLError for a
    # failed download; confirm which exception pooch.retrieve actually
    # raises with its default downloader.
    url = (
        "https://ftp.pride.ebi.ac.uk/pride/data/archive/2020/10/"
        "PXD017276/20190213_CD34_Phospho_study_DIA_proteome_Report.csv"
    )
    file_path = pooch.retrieve(
        url=url,
        known_hash=(
            "sha256:"
            "b69cb93a0d1ef03efb3e29ddc4fcf1d28b09"
            "73de8105b919a04fb710dfe66326"
        ),
        fname="karayel_2020_proteome_report.csv",
        path=pooch.os_cache("proteopy"),
    )
    df = pd.read_csv(file_path)

    # Per-sample quantity columns; "Filtered" marks a missing value.
    quant_cols = [c for c in df.columns if c.endswith(".PG.Quantity")]
    df[quant_cols] = df[quant_cols].replace("Filtered", np.nan).astype(float)

    # Sorting-gate labels used in the raw file names -> stage names.
    # Order matters: 'P1andP2' must be handled as a whole before the
    # single-gate labels.
    stage_labels = {
        'Negativefrac': 'Progenitor',
        'P1andP2': 'ProE&EBaso',
        'P3': 'LBaso',
        'P4': 'Poly',
        'P5': 'Ortho',
    }

    # The cleaned sample ID depends only on the column name, so it is
    # resolved once per quantity column rather than once per melted row.
    sample_id_by_col = {}
    for raw_col in quant_cols:
        sample_id = _parse_sample_id(raw_col)
        for raw_label, stage in stage_labels.items():
            sample_id = sample_id.replace(raw_label, stage)
        sample_id_by_col[raw_col] = sample_id

    # Wide to long format
    long = (
        df[["PG.ProteinGroups"] + quant_cols]
        .melt(
            id_vars="PG.ProteinGroups",
            var_name="raw_col",
            value_name="intensity",
        )
    )

    # Attach cleaned sample IDs (already mapped to cell type names)
    long["sample_id"] = long["raw_col"].map(sample_id_by_col)
    long = long.drop(columns=["raw_col"])
    long = long.rename(columns={"PG.ProteinGroups": "protein_id"})

    # Exclude day 7 samples ('_D7' is matched as a literal substring)
    is_day7 = long["sample_id"].str.contains('_D7', regex=False)
    karayel_2020_quant = long[~is_day7]

    # Build sample annotation: one row per remaining sample; cell type
    # and replicate are the first and last '_'-separated ID tokens.
    karayel_2020_meta_obs = (
        karayel_2020_quant[['sample_id']]
        .drop_duplicates()
        .reset_index(drop=True)
    )
    id_tokens = karayel_2020_meta_obs["sample_id"].str.split("_")
    karayel_2020_meta_obs["cell_type"] = id_tokens.str[0]
    karayel_2020_meta_obs["replicate"] = id_tokens.str[-1]

    # Build protein annotation
    karayel_2020_meta_var = (
        df[['PG.ProteinGroups', 'PG.Genes']]
        .drop_duplicates()
        .reset_index(drop=True)
    )
    karayel_2020_meta_var = karayel_2020_meta_var.rename(columns={
        'PG.ProteinGroups': 'protein_id',
        'PG.Genes': 'gene_id'
    })

    # Assemble AnnData
    adata = pp.read.long(
        intensities=karayel_2020_quant,
        level='protein',
        sample_annotation=karayel_2020_meta_obs,
        var_annotation=karayel_2020_meta_var,
    )

    if fill_na is not None:
        adata.X[np.isnan(adata.X)] = fill_na

    check_proteodata(adata)
    return adata