# Source code for proteopy.download.karayel_2020

from pathlib import Path

import pandas as pd

from proteopy.datasets import karayel_2020 as _load_karayel_2020
from proteopy.utils.string import detect_separator_from_extension


# Default output file names used by ``karayel_2020`` when the caller does not
# supply destinations. They are bare file names (no directory component);
# all three share the same stem and differ only in the table suffix.
_DEFAULT_INTENSITIES = (
    "karayel-2020_ms-proteomics_human-erythropoiesis"
    "_intensities.tsv"
)
_DEFAULT_VAR = (
    "karayel-2020_ms-proteomics_human-erythropoiesis"
    "_protein-annotation.tsv"
)
_DEFAULT_SAMPLE = (
    "karayel-2020_ms-proteomics_human-erythropoiesis"
    "_sample-annotation.tsv"
)


def _check_karayel_2020_types(
    intensities_path,
    var_annotation_path,
    sample_annotation_path,
    sep,
    fill_na,
    force,
):
    """Validate the argument types passed to :func:`karayel_2020`.

    Parameters
    ----------
    intensities_path, var_annotation_path, sample_annotation_path
        Output destinations; each must be a ``str`` or ``Path``.
    sep
        Column separator; must be a ``str`` or ``None``.
    fill_na
        NaN replacement; must be an ``int`` or ``float`` (but not a
        ``bool``), or ``None``.
    force
        Overwrite flag; must be a ``bool``.

    Raises
    ------
    TypeError
        On the first argument (checked in the order listed above) whose
        type is not accepted.
    """
    path_arguments = {
        "intensities_path": intensities_path,
        "var_annotation_path": var_annotation_path,
        "sample_annotation_path": sample_annotation_path,
    }
    for label, candidate in path_arguments.items():
        if isinstance(candidate, (str, Path)):
            continue
        raise TypeError(
            f"{label} must be str or Path, "
            f"got {type(candidate).__name__}"
        )

    if not (sep is None or isinstance(sep, str)):
        raise TypeError(
            f"sep must be str or None, "
            f"got {type(sep).__name__}"
        )

    if fill_na is not None:
        # bool is a subclass of int, so it has to be excluded explicitly.
        is_plain_number = (
            isinstance(fill_na, (int, float))
            and not isinstance(fill_na, bool)
        )
        if not is_plain_number:
            raise TypeError(
                f"fill_na must be float, int, or None, "
                f"got {type(fill_na).__name__}"
            )

    if not isinstance(force, bool):
        raise TypeError(
            f"force must be bool, "
            f"got {type(force).__name__}"
        )


def _check_karayel_2020_paths(
    intensities_path,
    var_annotation_path,
    sample_annotation_path,
    force,
):
    """Convert the destinations to ``Path`` and reject unusable ones.

    Parameters
    ----------
    intensities_path, var_annotation_path, sample_annotation_path
        Output destinations (``str`` or ``Path``).
    force
        When falsy, an already-existing destination is an error.

    Returns
    -------
    tuple of Path
        The three destinations as ``Path`` objects, in argument order.
        These are the paths as given, not their resolved forms.

    Raises
    ------
    ValueError
        If two destinations resolve to the same location.
    FileExistsError
        If ``force`` is falsy and a resolved destination already exists.
    """
    as_given = {
        "intensities_path": Path(intensities_path),
        "var_annotation_path": Path(var_annotation_path),
        "sample_annotation_path": Path(sample_annotation_path),
    }
    # Comparisons are done on resolved paths so that different spellings
    # of the same location are recognised as identical.
    resolved_by_name = {
        label: path.resolve() for label, path in as_given.items()
    }

    first_owner = {}
    for label, target in resolved_by_name.items():
        earlier = first_owner.setdefault(target, label)
        if earlier != label:
            raise ValueError(
                f"{label} and {earlier} resolve to "
                f"the same path: {target}"
            )

    if not force:
        # Report the first destination (in argument order) that exists.
        clash = next(
            (
                (label, target)
                for label, target in resolved_by_name.items()
                if target.exists()
            ),
            None,
        )
        if clash is not None:
            label, target = clash
            raise FileExistsError(
                f"{label} already exists: {target}. "
                "Use force=True to overwrite."
            )

    return tuple(as_given.values())


def karayel_2020(
    intensities_path: str | Path = _DEFAULT_INTENSITIES,
    var_annotation_path: str | Path = _DEFAULT_VAR,
    sample_annotation_path: str | Path = _DEFAULT_SAMPLE,
    *,
    sep: str | None = None,
    fill_na: float | int | None = None,
    force: bool = False,
) -> None:
    """Save Karayel 2020 erythropoiesis dataset to disk.

    Download and process the protein-level DIA-MS dataset from
    Karayel et al. [1]_ and save it as three tabular files:
    intensities in long format, protein annotations, and sample
    annotations.

    The study quantified ~7,400 proteins from CD34+ hematopoietic
    stem/progenitor cells (HSPCs) isolated from healthy donors, across
    five sequential erythroid differentiation stages with four
    biological replicates each (20 samples total). Cells were
    FACS-sorted using CD235a, CD49d, and Band 3 surface markers.

    The differentiation stages are:

    - Progenitor: CFU-E progenitor cells (CD34+ HSPCs, negative
      fraction)
    - ProE&EBaso: Proerythroblasts and early basophilic erythroblasts
    - LBaso: Late basophilic erythroblasts
    - Poly: Polychromatic erythroblasts
    - Ortho: Orthochromatic erythroblasts

    Data are sourced from the PRIDE archive
    (`PXD017276 <https://proteomecentral.proteomexchange.org/cgi/
    GetDataset?ID=PXD017276>`_). Protein quantities marked as
    ``Filtered`` in the original data are converted to ``np.nan``.
    Samples collected at day 7 are excluded.

    Parameters
    ----------
    intensities_path : str | Path, optional
        Destination path for the intensities file. Columns:
        ``sample_id``, ``protein_id``, ``intensity``.
    var_annotation_path : str | Path, optional
        Destination path for the protein annotation file. Columns:
        ``protein_id``, ``gene_id``.
    sample_annotation_path : str | Path, optional
        Destination path for the sample annotation file. Columns:
        ``sample_id``, ``cell_type``, ``replicate``.
    sep : str | None, optional
        Column separator for all output files. When ``None``, the
        separator is inferred from each file extension via
        ``detect_separator_from_extension()`` (``.tsv`` → tab,
        ``.csv`` → comma).
    fill_na : float | int | None, optional
        If not ``None``, replace NaN values in the long-format
        intensities DataFrame with this value before saving.
    force : bool, optional
        If ``True``, overwrite existing files at the output paths.
        Otherwise, raise ``FileExistsError`` when a destination file
        already exists.

    Returns
    -------
    None
        Writes files to disk; does not return a value.

    Examples
    --------
    >>> import proteopy as pr
    >>> pr.download.karayel_2020(
    ...     intensities_path="intensities.tsv",
    ...     var_annotation_path="protein_annotations.tsv",
    ...     sample_annotation_path="sample_annotations.tsv",
    ... )

    References
    ----------
    .. [1] Karayel Ö, Xu P, Bludau I, Velan Bhoopalan S, Yao Y,
       Ana Rita FC, Santos A, Schulman BA, Alpi AF, Weiss MJ, and
       Mann M. "Integrative proteomics reveals principles of dynamic
       phosphosignaling networks in human erythropoiesis."
       *Molecular Systems Biology*, 16(12):MSB20209813, 2020.
       DOI: 10.15252/msb.20209813.
    """

    def _write_table(table, destination, delimiter):
        # Create the parent directory if needed, then write the table
        # without its index.
        destination.parent.mkdir(parents=True, exist_ok=True)
        table.to_csv(destination, sep=delimiter, index=False)

    # Validate everything up front so that bad arguments or clashing
    # destinations are reported before the dataset is loaded.
    _check_karayel_2020_types(
        intensities_path,
        var_annotation_path,
        sample_annotation_path,
        sep,
        fill_na,
        force,
    )
    destinations = _check_karayel_2020_paths(
        intensities_path,
        var_annotation_path,
        sample_annotation_path,
        force,
    )
    intensities_file, var_file, sample_file = destinations

    adata = _load_karayel_2020()

    # One separator per output: either the caller's choice for all three
    # or one inferred from each destination's extension.
    if sep is None:
        delimiters = [
            detect_separator_from_extension(destination)
            for destination in destinations
        ]
    else:
        delimiters = [sep] * 3
    intensities_sep, var_sep, sample_sep = delimiters

    # Reshape .X from wide (samples x proteins) to long format with
    # columns sample_id, protein_id, intensity.
    wide = pd.DataFrame(
        adata.X,
        index=adata.obs_names,
        columns=adata.var_names,
    )
    wide.index.name = "sample_id"
    long_table = pd.melt(
        wide.reset_index(),
        id_vars="sample_id",
        var_name="protein_id",
        value_name="intensity",
    )
    if fill_na is not None:
        long_table["intensity"] = long_table["intensity"].fillna(fill_na)
    _write_table(long_table, intensities_file, intensities_sep)

    # Protein (.var) annotation.
    protein_table = adata.var[["protein_id", "gene_id"]].copy()
    _write_table(protein_table, var_file, var_sep)

    # Sample (.obs) annotation.
    sample_table = adata.obs[
        ["sample_id", "cell_type", "replicate"]
    ].copy()
    _write_table(sample_table, sample_file, sample_sep)