# Source code for proteopy.download.karayel_2020

from pathlib import Path

import pandas as pd

from proteopy.datasets import karayel_2020 as _load_karayel_2020
from proteopy.utils.string import detect_separator_from_extension


# Stem shared by all three default output file names.
_DEFAULT_STEM = "karayel-2020_ms-proteomics_human-erythropoiesis"

# Default destinations used by ``karayel_2020`` when the caller passes no
# explicit paths. They are bare file names with no directory component.
_DEFAULT_INTENSITIES = f"{_DEFAULT_STEM}_intensities.tsv"
_DEFAULT_VAR = f"{_DEFAULT_STEM}_protein-annotation.tsv"
_DEFAULT_SAMPLE = f"{_DEFAULT_STEM}_sample-annotation.tsv"


def _check_karayel_2020_types(
    intensities_path,
    var_annotation_path,
    sample_annotation_path,
    sep,
    fill_na,
    force,
):
    """Validate the argument types given to :func:`karayel_2020`.

    Arguments are checked in a fixed order (the three paths, then
    ``sep``, ``fill_na`` and ``force``); the first offending argument
    raises and later ones are not inspected.

    Raises
    ------
    TypeError
        If a path is neither ``str`` nor ``Path``, if ``sep`` is
        neither ``str`` nor ``None``, if ``fill_na`` is not an
        ``int``/``float`` (``bool`` is rejected) or ``None``, or if
        ``force`` is not a ``bool``.
    """
    path_args = {
        "intensities_path": intensities_path,
        "var_annotation_path": var_annotation_path,
        "sample_annotation_path": sample_annotation_path,
    }
    for arg_name, arg in path_args.items():
        if isinstance(arg, (str, Path)):
            continue
        raise TypeError(
            f"{arg_name} must be str or Path, got {type(arg).__name__}"
        )

    if not (sep is None or isinstance(sep, str)):
        raise TypeError(
            f"sep must be str or None, got {type(sep).__name__}"
        )

    # bool is a subclass of int, so it has to be excluded explicitly.
    fill_is_number = (
        isinstance(fill_na, (int, float))
        and not isinstance(fill_na, bool)
    )
    if fill_na is not None and not fill_is_number:
        raise TypeError(
            "fill_na must be float, int, or None, "
            f"got {type(fill_na).__name__}"
        )

    if isinstance(force, bool):
        return
    raise TypeError(f"force must be bool, got {type(force).__name__}")


def _check_karayel_2020_paths(
    intensities_path,
    var_annotation_path,
    sample_annotation_path,
    force,
):
    """Convert the output paths to ``Path`` and validate them.

    The resolved (absolute) form of each path is used only for the
    checks; the returned paths are the plain ``Path(...)`` conversions
    of the inputs.

    Returns
    -------
    tuple of Path
        ``(intensities_path, var_annotation_path,
        sample_annotation_path)``.

    Raises
    ------
    ValueError
        If two of the paths resolve to the same location.
    FileExistsError
        If ``force`` is false and any resolved path already exists.
    """
    arg_names = (
        "intensities_path",
        "var_annotation_path",
        "sample_annotation_path",
    )
    as_given = (
        Path(intensities_path),
        Path(var_annotation_path),
        Path(sample_annotation_path),
    )
    targets = [candidate.resolve() for candidate in as_given]

    # Reject any pair of arguments pointing at the same file.
    claimed_by: dict[Path, str] = {}
    for arg_name, target in zip(arg_names, targets):
        earlier = claimed_by.get(target)
        if earlier is not None:
            raise ValueError(
                f"{arg_name} and {earlier} resolve to "
                f"the same path: {target}"
            )
        claimed_by[target] = arg_name

    if force:
        return as_given

    # Without force, refuse to clobber anything already on disk.
    for arg_name, target in zip(arg_names, targets):
        if target.exists():
            raise FileExistsError(
                f"{arg_name} already exists: {target}. "
                "Use force=True to overwrite."
            )
    return as_given



# [docs]  (Sphinx viewcode link marker left over from the HTML page)
def karayel_2020(
    intensities_path: str | Path = _DEFAULT_INTENSITIES,
    var_annotation_path: str | Path = _DEFAULT_VAR,
    sample_annotation_path: str | Path = _DEFAULT_SAMPLE,
    *,
    sep: str | None = None,
    fill_na: float | int | None = None,
    force: bool = False,
) -> None:
    """Save Karayel 2020 erythropoiesis dataset to disk.

    Download and process the protein-level DIA-MS dataset from
    Karayel et al. [1]_ and save it as three tabular files:
    intensities in long format, protein annotations, and sample
    annotations.

    The study quantified ~7,400 proteins from CD34+
    hematopoietic stem/progenitor cells (HSPCs) isolated from
    healthy donors, across five sequential erythroid
    differentiation stages with four biological replicates each
    (20 samples total). Cells were FACS-sorted using CD235a,
    CD49d, and Band 3 surface markers. The differentiation
    stages are:

    - Progenitor: CFU-E progenitor cells
      (CD34+ HSPCs, negative fraction)
    - ProE&EBaso: Proerythroblasts and early basophilic
      erythroblasts
    - LBaso: Late basophilic erythroblasts
    - Poly: Polychromatic erythroblasts
    - Ortho: Orthochromatic erythroblasts

    Data are sourced from the PRIDE archive (`PXD017276
    <https://proteomecentral.proteomexchange.org/cgi/
    GetDataset?ID=PXD017276>`_). Protein quantities marked
    as ``Filtered`` in the original data are converted to
    ``np.nan``. Samples collected at day 7 are excluded.

    Parameters
    ----------
    intensities_path : str | Path, optional
        Destination path for the intensities file. Columns:
        ``sample_id``, ``protein_id``, ``intensity``.
    var_annotation_path : str | Path, optional
        Destination path for the protein annotation file.
        Columns: ``protein_id``, ``gene_id``.
    sample_annotation_path : str | Path, optional
        Destination path for the sample annotation file.
        Columns: ``sample_id``, ``cell_type``, ``replicate``.
    sep : str | None, optional
        Column separator for all output files. When ``None``,
        the separator is inferred from each file extension via
        ``detect_separator_from_extension()``
        (``.tsv`` → tab, ``.csv`` → comma).
    fill_na : float | int | None, optional
        If not ``None``, replace NaN values in the long-format
        intensities DataFrame with this value before saving.
    force : bool, optional
        If ``True``, overwrite existing files at the output
        paths. Otherwise, raise ``FileExistsError`` when a
        destination file already exists.

    Returns
    -------
    None
        Writes files to disk; does not return a value.

    Raises
    ------
    TypeError
        If a path is not ``str``/``Path``, ``sep`` is not
        ``str``/``None``, ``fill_na`` is not ``int``/``float``/
        ``None`` (``bool`` is rejected), or ``force`` is not
        ``bool``.
    ValueError
        If two of the output paths resolve to the same file.
    FileExistsError
        If ``force`` is ``False`` and an output path already
        exists.

    Notes
    -----
    All argument checks and separator resolution happen before
    the dataset is loaded, and all three tables are built before
    the first file is written, so a failure in those steps does
    not leave a partial set of output files behind. Missing
    parent directories of the output paths are created.

    Examples
    --------
    >>> import proteopy as pr
    >>> pr.download.karayel_2020(
    ...     intensities_path="intensities.tsv",
    ...     var_annotation_path="protein_annotations.tsv",
    ...     sample_annotation_path="sample_annotations.tsv",
    ... )

    References
    ----------
    .. [1] Karayel Ö, Xu P, Bludau I, Velan Bhoopalan S,
       Yao Y, Ana Rita FC, Santos A, Schulman BA, Alpi AF,
       Weiss MJ, and Mann M. "Integrative proteomics
       reveals principles of dynamic phosphosignaling
       networks in human erythropoiesis."
       *Molecular Systems Biology*, 16(12):MSB20209813,
       2020. DOI: 10.15252/msb.20209813.
    """
    _check_karayel_2020_types(
        intensities_path,
        var_annotation_path,
        sample_annotation_path,
        sep,
        fill_na,
        force,
    )
    intensities_path, var_annotation_path, sample_annotation_path = (
        _check_karayel_2020_paths(
            intensities_path,
            var_annotation_path,
            sample_annotation_path,
            force,
        )
    )

    # Resolve the separators *before* loading the dataset so that any
    # problem with the output file names surfaces without paying for
    # the load first.
    # NOTE(review): presumably detect_separator_from_extension raises
    # for unsupported extensions; confirm against its implementation.
    if sep is None:
        sep_intensities = detect_separator_from_extension(
            intensities_path,
        )
        sep_var = detect_separator_from_extension(
            var_annotation_path,
        )
        sep_sample = detect_separator_from_extension(
            sample_annotation_path,
        )
    else:
        sep_intensities = sep_var = sep_sample = sep

    adata = _load_karayel_2020()

    # Melt .X to long format: sample_id, protein_id, intensity
    df_x = pd.DataFrame(
        adata.X,
        index=adata.obs_names,
        columns=adata.var_names,
    )
    df_x.index.name = "sample_id"
    df_long = df_x.reset_index().melt(
        id_vars="sample_id",
        var_name="protein_id",
        value_name="intensity",
    )
    if fill_na is not None:
        df_long["intensity"] = df_long["intensity"].fillna(fill_na)

    # Column subsets of the .var / .obs annotation tables.
    df_var = adata.var[["protein_id", "gene_id"]]
    df_obs = adata.obs[["sample_id", "cell_type", "replicate"]]

    # Write only once every table has been built successfully, so an
    # error above (e.g. a missing annotation column) cannot leave a
    # partial set of output files on disk.
    outputs = (
        (df_long, intensities_path, sep_intensities),
        (df_var, var_annotation_path, sep_var),
        (df_obs, sample_annotation_path, sep_sample),
    )
    for frame, destination, delimiter in outputs:
        destination.parent.mkdir(parents=True, exist_ok=True)
        frame.to_csv(destination, sep=delimiter, index=False)