Source code for proteopy.download.williams_2018

from pathlib import Path

import pandas as pd

from proteopy.datasets import williams_2018 as _load_williams_2018
from proteopy.utils.string import detect_separator_from_extension


# Default output filenames, used when the caller does not supply explicit
# destination paths. Kept as single literals so they are easy to grep for.
_DEFAULT_INTENSITIES = "williams-2018_ms-proteomics_mouse-tissue_intensities.tsv"
_DEFAULT_VAR = "williams-2018_ms-proteomics_mouse-tissue_peptide-annotation.tsv"
_DEFAULT_SAMPLE = "williams-2018_ms-proteomics_mouse-tissue_sample-annotation.tsv"


def _check_williams_2018_types(
    intensities_path,
    var_annotation_path,
    sample_annotation_path,
    sep,
    fill_na,
    force,
):
    """Validate the argument types accepted by :func:`williams_2018`.

    Checks run in a fixed order (the three paths, then ``sep``,
    ``fill_na`` and ``force``); the first offending argument raises.

    Raises
    ------
    TypeError
        If a path is not ``str``/``Path``, ``sep`` is not ``str``/``None``,
        ``fill_na`` is not a non-bool ``int``/``float`` or ``None``, or
        ``force`` is not ``bool``.
    """
    path_arguments = {
        "intensities_path": intensities_path,
        "var_annotation_path": var_annotation_path,
        "sample_annotation_path": sample_annotation_path,
    }
    for arg_name, arg in path_arguments.items():
        if isinstance(arg, (str, Path)):
            continue
        raise TypeError(
            f"{arg_name} must be str or Path, "
            f"got {type(arg).__name__}"
        )

    if not (sep is None or isinstance(sep, str)):
        raise TypeError(
            f"sep must be str or None, "
            f"got {type(sep).__name__}"
        )

    # bool is a subclass of int, so it has to be excluded explicitly.
    fill_na_is_number = (
        isinstance(fill_na, (int, float))
        and not isinstance(fill_na, bool)
    )
    if fill_na is not None and not fill_na_is_number:
        raise TypeError(
            f"fill_na must be float, int, or None, "
            f"got {type(fill_na).__name__}"
        )

    if not isinstance(force, bool):
        raise TypeError(
            f"force must be bool, "
            f"got {type(force).__name__}"
        )


def _check_williams_2018_paths(
    intensities_path,
    var_annotation_path,
    sample_annotation_path,
    force,
):
    """Convert the output paths to ``Path`` and reject unusable targets.

    The resolved (absolute) form of each path is used for the checks
    only; the returned ``Path`` objects are the unresolved conversions
    of the inputs.

    Returns
    -------
    tuple of Path
        ``(intensities_path, var_annotation_path, sample_annotation_path)``.

    Raises
    ------
    ValueError
        If two of the paths resolve to the same location.
    FileExistsError
        If ``force`` is false and any resolved path already exists.
    """
    targets = (
        ("intensities_path", Path(intensities_path)),
        ("var_annotation_path", Path(var_annotation_path)),
        ("sample_annotation_path", Path(sample_annotation_path)),
    )
    resolved_targets = [
        (label, target.resolve()) for label, target in targets
    ]

    # Maps each resolved location to the first argument that claimed it.
    claimed_by = {}
    for label, location in resolved_targets:
        earlier_label = claimed_by.get(location)
        if earlier_label is not None:
            raise ValueError(
                f"{label} and {earlier_label} resolve to "
                f"the same path: {location}"
            )
        claimed_by[location] = label

    if not force:
        for label, location in resolved_targets:
            if location.exists():
                raise FileExistsError(
                    f"{label} already exists: {location}. "
                    "Use force=True to overwrite."
                )

    (_, out_intensities), (_, out_var), (_, out_sample) = targets
    return out_intensities, out_var, out_sample


def williams_2018(
    intensities_path: str | Path = _DEFAULT_INTENSITIES,
    var_annotation_path: str | Path = _DEFAULT_VAR,
    sample_annotation_path: str | Path = _DEFAULT_SAMPLE,
    *,
    sep: str | None = None,
    fill_na: float | int | None = None,
    force: bool = False,
) -> None:
    """Save Williams 2018 SWATH-MS mouse tissue dataset to disk.

    Download and process the peptide-level SWATH-MS dataset from
    Williams et al. (2018) [1]_ and write it out as three tabular
    files: intensities in long format, peptide annotations, and sample
    annotations. The dataset consists of the protein expression of
    eight genetically diverse BXD mouse strains across five tissues.
    Only the whole cell fraction is included; peptide intensities from
    different charge states are summed per peptide sequence. Data are
    sourced from the Elsevier supplementary archive
    (DOI: 10.1074/mcp.RA118.000554).

    Parameters
    ----------
    intensities_path : str | Path, optional
        Destination path for the intensities file. Columns:
        ``sample_id``, ``peptide_id``, ``intensity``.
    var_annotation_path : str | Path, optional
        Destination path for the peptide annotation file. Columns:
        ``peptide_id``, ``protein_id``, ``gene_id``.
    sample_annotation_path : str | Path, optional
        Destination path for the sample annotation file. Columns:
        ``sample_id``, ``tissue``, ``mouse_id``.
    sep : str | None, optional
        Column separator for all output files. When ``None``, the
        separator is inferred from each file extension via
        ``detect_separator_from_extension()`` (``.tsv`` → tab,
        ``.csv`` → comma).
    fill_na : float | int | None, optional
        If not ``None``, replace NaN values in the long-format
        intensities DataFrame with this value before saving.
    force : bool, optional
        If ``True``, overwrite existing files at the output paths.
        Otherwise, raise ``FileExistsError`` when a destination file
        already exists.

    Returns
    -------
    None
        Writes files to disk; does not return a value.

    Raises
    ------
    TypeError
        If an argument has an unsupported type.
    ValueError
        If two output paths resolve to the same location.
    FileExistsError
        If ``force`` is ``False`` and a destination already exists.

    Examples
    --------
    >>> import proteopy as pr
    >>> pr.download.williams_2018(
    ...     intensities_path="intensities.tsv",
    ...     var_annotation_path="peptide_annotations.tsv",
    ...     sample_annotation_path="sample_annotations.tsv",
    ... )

    References
    ----------
    .. [1] Williams EG, Wu Y, Wolski W, Kim JY, Lan J, Hasan M,
       Halter C, Jha P, Ryu D, Auwerx J, and Aebersold R.
       "Quantifying and Localizing the Mitochondrial Proteome Across
       Five Tissues in A Mouse Population." *Molecular & Cellular
       Proteomics*, 2018, 17(9):1766-1777.
       DOI: 10.1074/mcp.RA118.000554.
    """
    # Validate everything up front so no data is loaded for a call
    # that would be rejected anyway.
    _check_williams_2018_types(
        intensities_path,
        var_annotation_path,
        sample_annotation_path,
        sep,
        fill_na,
        force,
    )
    checked_paths = _check_williams_2018_paths(
        intensities_path,
        var_annotation_path,
        sample_annotation_path,
        force,
    )
    intensities_path, var_annotation_path, sample_annotation_path = (
        checked_paths
    )

    adata = _load_williams_2018()

    # An explicit separator applies to every file; otherwise each file
    # gets the separator implied by its own extension.
    if sep is not None:
        sep_intensities = sep_var = sep_sample = sep
    else:
        sep_intensities = detect_separator_from_extension(
            intensities_path,
        )
        sep_var = detect_separator_from_extension(var_annotation_path)
        sep_sample = detect_separator_from_extension(
            sample_annotation_path,
        )

    # Intensities: wide matrix (.X) -> long table with one row per
    # (sample_id, peptide_id) pair.
    wide = pd.DataFrame(
        adata.X,
        index=adata.obs_names,
        columns=adata.var_names,
    )
    wide.index.name = "sample_id"
    long_table = wide.reset_index().melt(
        id_vars="sample_id",
        var_name="peptide_id",
        value_name="intensity",
    )
    if fill_na is not None:
        long_table["intensity"] = long_table["intensity"].fillna(fill_na)
    intensities_path.parent.mkdir(parents=True, exist_ok=True)
    long_table.to_csv(intensities_path, sep=sep_intensities, index=False)

    # Peptide (.var) annotation.
    var_columns = ["peptide_id", "protein_id", "gene_id"]
    var_table = adata.var[var_columns].copy()
    var_annotation_path.parent.mkdir(parents=True, exist_ok=True)
    var_table.to_csv(var_annotation_path, sep=sep_var, index=False)

    # Sample (.obs) annotation.
    obs_columns = ["sample_id", "tissue", "mouse_id"]
    obs_table = adata.obs[obs_columns].copy()
    sample_annotation_path.parent.mkdir(parents=True, exist_ok=True)
    obs_table.to_csv(sample_annotation_path, sep=sep_sample, index=False)