Source code for proteopy.datasets.williams_2018

import warnings
import zipfile
from pathlib import Path

import anndata as ad
import numpy as np
import pandas as pd
import pooch

from proteopy.utils.anndata import check_proteodata

_KNOWN_HASH = (
    "sha256:"
    "58c2ea5cfdda5dc1bc91eec2d9c3fb1f56ccadcccd81ae3980877f6710c5a96d"
)



[docs]
def williams_2018(
    fill_na: float | int | None = None,
) -> ad.AnnData:
    """Load Williams 2018 mouse multi-tissue proteomics dataset.

    Download, process and format as an
    :class:`~anndata.AnnData` object the peptide-level
    SWATH-MS dataset from Williams et al. (2018) [1]_
    quantifying protein expression across five tissues
    in eight genetically diverse BXD mouse strains. Only
    the whole cell fraction is included; peptide
    intensities from different charge states are summed
    per peptide sequence. By default, missing values
    are represented as ``np.nan``.

    Sample annotation (``.obs``) includes:
        - ``sample_id``: Unique sample identifier
        - ``tissue``: Tissue type (Brain, BAT, Heart, Liver, Quad)
        - ``mouse_id``: BXD mouse strain identifier

    Variable annotation (``.var``) includes:
        - ``peptide_id``: Peptide sequence (matches ``.var_names``)
        - ``protein_id``: UniProt protein identifier
        - ``gene_id``: Gene symbol

    Data are sourced from the Elsevier supplementary archive
    (DOI: 10.1074/mcp.RA118.000554).

    Parameters
    ----------
    fill_na : float | int | None, optional
        If not ``None``, replace ``np.nan`` in ``.X``
        with this value.

    Returns
    -------
    ad.AnnData
        AnnData object with peptide-level quantification data.
        ``.X`` contains peptide intensities (samples x peptides).

    Raises
    ------
    urllib.error.URLError
        If download from the Elsevier CDN fails.

    Examples
    --------
    >>> import proteopy as pr
    >>> adata = pr.datasets.williams_2018()
    >>> adata
    AnnData object with n_obs x n_vars
        obs: 'sample_id', 'tissue', 'mouse_id'
        var: 'peptide_id', 'protein_id', 'gene_id'

    References
    ----------
    .. [1] Williams EG, Wu Y, Wolski W, Kim JY, Lan J, Hasan M,
       Halter C, Jha P, Ryu D, Auwerx J, and Aebersold R.
       "Quantifying and Localizing the Mitochondrial Proteome
       Across Five Tissues in A Mouse Population." Molecular &
       Cellular Proteomics, 2018, 17(9):1766-1777.
       DOI: 10.1074/mcp.RA118.000554.
    """
    if fill_na is not None and not isinstance(
        fill_na, (int, float),
    ):
        raise TypeError(
            f"fill_na must be float, int, or None, "
            f"got {type(fill_na).__name__}"
        )

    url = (
        "https://ars.els-cdn.com/content/image/"
        "1-s2.0-S1535947620320569-mmc1.zip"
    )
    zip_path = pooch.retrieve(
        url=url,
        known_hash=_KNOWN_HASH,
        fname="williams_2018_mmc1.zip",
        path=pooch.os_cache("proteopy"),
    )

    cache_dir = Path(zip_path).parent
    xlsx_name = "134784_1_supp_121511_p7byjt.xlsx"
    xlsx_path = cache_dir / xlsx_name

    if not xlsx_path.exists():
        with zipfile.ZipFile(zip_path, "r") as zf:
            zf.extract(xlsx_name, path=cache_dir)

    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            message="Unknown extension is not supported",
            category=UserWarning,
            module="openpyxl",
        )
        df = pd.read_excel(xlsx_path)

    # Select metadata columns
    meta_cols = {
        "Unnamed: 0": "peptide_id",
        "Unnamed: 3": "protein_id",
        "Unnamed: 4": "gene_id",
    }

    # Select intensity columns: named cols where row 0 == "Intensity",
    # excluding _mito fractions
    intensity_cols = [
        c for c in df.columns
        if "Unnamed" not in str(c)
        and df[c].iloc[0] == "Intensity"
        and "_mito" not in str(c)
    ]

    df = df[list(meta_cols.keys()) + intensity_cols]

    # Remove _WholeCell suffix from sample column names
    df = df.rename(columns={
        c: c.replace("_WholeCell", "")
        for c in intensity_cols
    })
    df = df.rename(columns=meta_cols)

    # Drop the first row (secondary header)
    df = df.iloc[1:].reset_index(drop=True)

    # Extract peptide sequence (remove prefixes and suffixes)
    df["peptide_id"] = (
        df["peptide_id"].str.split("_").str[1]
    )

    # Verify protein_id and gene_id are consistent
    # across charge states of the same peptide
    meta_check = (
        df.groupby("peptide_id")[["protein_id", "gene_id"]]
        .nunique()
    )
    inconsistent = meta_check[
        (meta_check["protein_id"] > 1)
        | (meta_check["gene_id"] > 1)
    ]
    if not inconsistent.empty:
        raise ValueError(
            "Inconsistent protein_id or gene_id "
            "across charge states for peptides:\n"
            f"{inconsistent.index.tolist()}"
        )

    # Sum intensities across charge states of the same peptide
    sample_cols = [
        c for c in df.columns
        if c not in ("peptide_id", "protein_id", "gene_id")
    ]
    df[sample_cols] = df[sample_cols].astype(float)
    var = (
        df.groupby("peptide_id")[["protein_id", "gene_id"]]
        .first()
    )
    var["peptide_id"] = var.index
    X = (
        df.groupby("peptide_id")[sample_cols]
        .sum()
        .values.T
    )

    # Build obs annotation with tissue and mouse_id
    obs = pd.DataFrame({"sample_id": sample_cols})
    parts = obs["sample_id"].str.split(
        "_", n=1, expand=True,
    )
    parts.columns = ["p1", "p2"]
    tissue_first = parts["p1"].str.fullmatch(
        r"Brain|BAT|Heart|Liver|Quad"
    )
    obs["tissue"] = np.where(
        tissue_first, parts["p1"], parts["p2"],
    )
    obs["mouse_id"] = np.where(
        tissue_first, parts["p2"], parts["p1"],
    )
    obs = obs.set_index("sample_id")
    obs.index.name = None
    obs["sample_id"] = obs.index

    # Construct anndata
    adata = ad.AnnData(X=X, obs=obs, var=var)
    adata.X[adata.X == 0] = np.nan

    if fill_na is not None:
        adata.X[np.isnan(adata.X)] = fill_na

    check_proteodata(adata)

    return adata