Source code for proteopy.pl.copf

from __future__ import annotations

from pathlib import Path
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import anndata as ad
from matplotlib.axes import Axes
from adjustText import adjust_text

from proteopy.utils.anndata import check_proteodata


[docs]
def proteoform_scores(
    adata: ad.AnnData,
    *,
    adj: bool = True,
    pval_threshold: float | int | None = None,
    score_threshold: float | int | None = None,
    log_scores: bool = False,
    protein_id_key: str | None = None,
    highlight_prots: list[str] | None = None,
    protein_label_fontsize: int | float = 8,
    protein_label_color: str = "black",
    show: bool = True,
    save: str | Path | None = None,
    ax: Axes | None = None,
) -> Axes:
    """Scatter plot of COPF proteoform scores vs. p-values.

    Parameters
    ----------
    adata : AnnData
        :class:`~anndata.AnnData` with COPF score annotations in ``.var``.
    adj : bool
        Use adjusted ``proteoform_score_pval_adj`` values when ``True``.
    pval_threshold : float | int | None
        Maximum p-value used to highlight points. ``None`` disables filtering
        by p-value.
    score_threshold : float | int | None
        Minimum proteoform score used to highlight points. ``None`` disables
        score-based filtering.
    log_scores : bool
        Plot p-values on a log-scaled y-axis when ``True``; otherwise use a
        linear scale.
    protein_id_key : str | None
        Column in ``.var`` whose values are used as display labels
        instead of ``protein_id``. 1-to-1 mapping between ``protein_id`` and
        ``protein_id_key`` is enforced.
    highlight_prots : list[str] | None
        Protein IDs to highlight with text labels on the scatter
        plot. When ``protein_id_key`` is set, values must come
        from the ``protein_id_key`` column.
    protein_label_fontsize : int | float
        Font size for the highlight labels.
    protein_label_color : str
        Color for the highlight labels and connector lines.
    show : bool
        Call :func:`matplotlib.pyplot.show` when ``True``.
    save : str | Path | None
        File path to save the figure. ``None`` skips saving.
    ax : matplotlib.axes.Axes | None
        Matplotlib Axes object to plot onto. If ``None``, a new
        figure and axes are created.

    Returns
    -------
    matplotlib.axes.Axes
        The Axes object used for plotting.

    Examples
    --------
    Basic scatter plot of proteoform scores:

    >>> import proteopy as pr
    >>> adata = pr.read.long(...)
    >>> pr.tl.pairwise_peptide_correlations(adata)
    >>> pr.tl.peptide_dendograms_by_correlation(
    ...     adata,
    ...     method='agglomerative-hierarchical-clustering',
    ... )
    >>> pr.tl.peptide_clusters_from_dendograms(
    ...     adata,
    ...     n_clusters=2,
    ...     min_peptides_per_cluster=2,
    ... )
    >>> pr.tl.proteoform_scores(adata, min_pval_adj=0.4)
    >>> pr.pl.proteoform_scores(adata)

    Highlight specific proteins by ``protein_id``:

    >>> pr.pl.proteoform_scores(
    ...     adata,
    ...     highlight_prots=["P12345", "Q67890"],
    ... )

    Highlight proteins using an alternative label column:

    >>> pr.pl.proteoform_scores(
    ...     adata,
    ...     protein_id_key="gene_name",
    ...     highlight_prots=["GAPDH", "ACTB"],
    ...     protein_label_color="red",
    ...     protein_label_fontsize=10,
    ... )
    """

    check_proteodata(adata)

    if adj:
        pval_col = "proteoform_score_pval_adj"
    else:
        pval_col = "proteoform_score_pval"

    required_cols = {"proteoform_score", pval_col}
    missing = required_cols.difference(adata.var.columns)
    if missing:
        missing_str = ", ".join(sorted(missing))
        raise ValueError(
            "Missing required columns in `adata.var`: " f"{missing_str}"
        )

    var = adata.var.loc[:, ["proteoform_score", pval_col]].copy()
    var = var.drop_duplicates()
    var = var.dropna(subset=["proteoform_score", pval_col])

    # Filter out invalid p-values before plotting.
    finite_mask = np.isfinite(var[pval_col])
    if not finite_mask.all():
        warnings.warn(
            "Dropping entries with non-finite p-values.",
            RuntimeWarning,
        )
        var = var.loc[finite_mask]

    if log_scores:
        positive_mask = var[pval_col] > 0
        if not positive_mask.all():
            warnings.warn(
                "Dropping non-positive p-values before log-transforming.",
                RuntimeWarning,
            )
            var = var.loc[positive_mask]
        plot_pvals = -np.log10(var[pval_col])
        if adj:
            ylabel = "-log10(adj. p-value)"
        else:
            ylabel = "-log10(p-value)"
    else:
        non_negative = var[pval_col] >= 0
        if not non_negative.all():
            warnings.warn(
                "Dropping negative p-values before plotting.",
                RuntimeWarning,
            )
            var = var.loc[non_negative]
        plot_pvals = var[pval_col]
        ylabel = "adj. p-value" if adj else "p-value"

    if var.empty:
        raise ValueError("No valid proteoform scores available for plotting.")

    def _validate_threshold(
        value: float | int | None,
        *,
        name: str,
        allow_zero: bool = False,
        upper_bound: float | None = None,
    ) -> float | int | None:
        if value is None:
            return None
        if isinstance(value, bool):
            raise ValueError(f"{name} must be a number, not bool.")
        if not isinstance(value, (int, float, np.integer, np.floating)):
            raise ValueError(f"{name} must be a real number.")
        if not np.isfinite(value):
            raise ValueError(f"{name} must be a finite number.")
        if not allow_zero and value <= 0:
            raise ValueError(f"{name} must be greater than 0.")
        if upper_bound is not None and value > upper_bound:
            raise ValueError(
                f"{name} must be less than or equal to {upper_bound}."
            )
        return value

    pval_threshold = _validate_threshold(
        pval_threshold,
        name="pval_threshold",
        allow_zero=False,
        upper_bound=1.0,
    )
    score_threshold = _validate_threshold(
        score_threshold,
        name="score_threshold",
        allow_zero=True,
    )

    if pval_threshold is not None:
        if log_scores:
            pval_threshold_line = -np.log10(pval_threshold)
        else:
            pval_threshold_line = pval_threshold
    else:
        pval_threshold_line = None

    mask = pd.Series(True, index=var.index)
    has_condition = False
    if score_threshold is not None:
        mask &= var["proteoform_score"] >= score_threshold
        has_condition = True
    if pval_threshold is not None:
        mask &= var[pval_col] <= pval_threshold
        has_condition = True
    if not has_condition:
        mask[:] = False

    var["is_above_threshold"] = mask
    var["plot_pval"] = plot_pvals

    if ax is not None:
        _ax = ax
        _fig = _ax.get_figure()
    else:
        _fig, _ax = plt.subplots()
    sns.scatterplot(
        data=var,
        x="proteoform_score",
        y="plot_pval",
        hue="is_above_threshold",
        palette={True: "#008A1D", False: "#BDBDBD"},
        alpha=0.5,
        s=30,
        edgecolor=None,
        legend=False,
        ax=_ax,
    )

    # -- Highlight selected proteins with text labels --------
    if highlight_prots is not None:
        if not isinstance(highlight_prots, list) or not all(
            isinstance(v, str) for v in highlight_prots
        ):
            raise TypeError("`highlight_prots` must be a list of strings.")

        # Build protein_id <-> display label mapping.
        if protein_id_key is not None:
            if protein_id_key not in adata.var.columns:
                raise ValueError(
                    f"Column '{protein_id_key}' not found "
                    "in `adata.var`."
                )
            # Validate 1-to-1 mapping.
            mapping_df = adata.var[
                ["protein_id", protein_id_key]
            ].drop_duplicates()
            dup_proteins = (
                mapping_df
                .groupby("protein_id")[protein_id_key]
                .nunique()
            )
            bad = dup_proteins[dup_proteins > 1]
            if not bad.empty:
                raise ValueError(
                    "1-to-1 mapping violation between "
                    f"'protein_id' and '{protein_id_key}' "
                    "for protein(s): "
                    f"{sorted(bad.index.tolist())}"
                )
            pid_to_label = dict(
                zip(
                    mapping_df["protein_id"],
                    mapping_df[protein_id_key],
                )
            )
            label_to_pid = dict(
                zip(
                    mapping_df[protein_id_key],
                    mapping_df["protein_id"],
                )
            )

            # highlight_prots may contain protein_id_key
            # values — resolve them to protein_ids.
            known_labels = set(mapping_df[protein_id_key])
            resolved_pids = set()
            unknown = (set(highlight_prots) - known_labels)
            if unknown:
                raise ValueError(
                    "The following values from "
                    "`highlight_prots` are not found in "
                    f"`adata.var['{protein_id_key}']`: "
                    f"{sorted(unknown)}"
                )
            highlight_pids = {
                label_to_pid[v] for v in highlight_prots
            }
        else:
            pid_to_label = None
            known_ids = set(adata.var["protein_id"])
            unknown = set(highlight_prots) - known_ids
            if unknown:
                raise ValueError(
                    "The following protein IDs from "
                    "`highlight_prots` are not found in "
                    "`adata.var['protein_id']`: "
                    f"{sorted(unknown)}"
                )
            highlight_pids = set(highlight_prots)

        # Map var index back to protein_id for the
        # deduplicated var DataFrame.
        pid_series = adata.var.loc[var.index, "protein_id"]
        highlight_mask = pid_series.isin(highlight_pids)
        var_highlight = var.loc[highlight_mask.values]

        if not var_highlight.empty:
            texts = []
            for idx in var_highlight.index:
                pid = pid_series.loc[idx]
                if pid_to_label is not None:
                    label = pid_to_label[pid]
                else:
                    label = pid
                texts.append(
                    _ax.text(
                        var_highlight.loc[idx, "proteoform_score"],
                        var_highlight.loc[idx, "plot_pval"],
                        label,
                        fontsize=protein_label_fontsize,
                        color=protein_label_color,
                    )
                )
            adjust_text(
                texts,
                x=var["proteoform_score"].values,
                y=var["plot_pval"].values,
                ax=_ax,
                force_points=(2.0, 2.0),
                force_text=(1.0, 1.0),
                expand=(2.0, 2.0),
                arrowprops=dict(
                    arrowstyle="-",
                    color="grey",
                    lw=0.5,
                ),
            )

    if score_threshold is not None:
        _ax.axvline(
            score_threshold,
            color="#A2A2A2",
            linestyle="--",
        )
    if pval_threshold_line is not None:
        _ax.axhline(
            pval_threshold_line,
            color="#A2A2A2",
            linestyle="--",
        )

    _ax.set_xlabel("Proteoform Score")
    _ax.set_ylabel(ylabel)
    _fig.tight_layout()

    if save is not None:
        if not isinstance(save, (str, Path)):
            raise TypeError(
                "`save` must be a path-like object or None."
            )
        _fig.savefig(save, dpi=300, bbox_inches="tight")
    if show:
        plt.show()

    return _ax