Source code for proteopy.pl.copf
from __future__ import annotations
from pathlib import Path
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import anndata as ad
from matplotlib.axes import Axes
from adjustText import adjust_text
from proteopy.utils.anndata import check_proteodata
[docs]
def proteoform_scores(
adata: ad.AnnData,
*,
adj: bool = True,
pval_threshold: float | int | None = None,
score_threshold: float | int | None = None,
log_scores: bool = False,
protein_id_key: str | None = None,
highlight_prots: list[str] | None = None,
protein_label_fontsize: int | float = 8,
protein_label_color: str = "black",
show: bool = True,
save: str | Path | None = None,
ax: Axes | None = None,
) -> Axes:
"""Scatter plot of COPF proteoform scores vs. p-values.
Parameters
----------
adata : AnnData
:class:`~anndata.AnnData` with COPF score annotations in ``.var``.
adj : bool
Use adjusted ``proteoform_score_pval_adj`` values when ``True``.
pval_threshold : float | int | None
Maximum p-value used to highlight points. ``None`` disables filtering
by p-value.
score_threshold : float | int | None
Minimum proteoform score used to highlight points. ``None`` disables
score-based filtering.
log_scores : bool
Plot p-values on a log-scaled y-axis when ``True``; otherwise use a
linear scale.
protein_id_key : str | None
Column in ``.var`` whose values are used as display labels
instead of ``protein_id``. 1-to-1 mapping between ``protein_id`` and
``protein_id_key`` is enforced.
highlight_prots : list[str] | None
Protein IDs to highlight with text labels on the scatter
plot. When ``protein_id_key`` is set, values must come
from the ``protein_id_key`` column.
protein_label_fontsize : int | float
Font size for the highlight labels.
protein_label_color : str
Color for the highlight labels and connector lines.
show : bool
Call :func:`matplotlib.pyplot.show` when ``True``.
save : str | Path | None
File path to save the figure. ``None`` skips saving.
ax : matplotlib.axes.Axes | None
Matplotlib Axes object to plot onto. If ``None``, a new
figure and axes are created.
Returns
-------
matplotlib.axes.Axes
The Axes object used for plotting.
Examples
--------
Basic scatter plot of proteoform scores:
>>> import proteopy as pr
>>> adata = pr.read.long(...)
>>> pr.tl.pairwise_peptide_correlations(adata)
>>> pr.tl.peptide_dendograms_by_correlation(
... adata,
... method='agglomerative-hierarchical-clustering',
... )
>>> pr.tl.peptide_clusters_from_dendograms(
... adata,
... n_clusters=2,
... min_peptides_per_cluster=2,
... )
>>> pr.tl.proteoform_scores(adata, min_pval_adj=0.4)
>>> pr.pl.proteoform_scores(adata)
Highlight specific proteins by ``protein_id``:
>>> pr.pl.proteoform_scores(
... adata,
... highlight_prots=["P12345", "Q67890"],
... )
Highlight proteins using an alternative label column:
>>> pr.pl.proteoform_scores(
... adata,
... protein_id_key="gene_name",
... highlight_prots=["GAPDH", "ACTB"],
... protein_label_color="red",
... protein_label_fontsize=10,
... )
"""
check_proteodata(adata)
if adj:
pval_col = "proteoform_score_pval_adj"
else:
pval_col = "proteoform_score_pval"
required_cols = {"proteoform_score", pval_col}
missing = required_cols.difference(adata.var.columns)
if missing:
missing_str = ", ".join(sorted(missing))
raise ValueError(
"Missing required columns in `adata.var`: " f"{missing_str}"
)
var = adata.var.loc[:, ["proteoform_score", pval_col]].copy()
var = var.drop_duplicates()
var = var.dropna(subset=["proteoform_score", pval_col])
# Filter out invalid p-values before plotting.
finite_mask = np.isfinite(var[pval_col])
if not finite_mask.all():
warnings.warn(
"Dropping entries with non-finite p-values.",
RuntimeWarning,
)
var = var.loc[finite_mask]
if log_scores:
positive_mask = var[pval_col] > 0
if not positive_mask.all():
warnings.warn(
"Dropping non-positive p-values before log-transforming.",
RuntimeWarning,
)
var = var.loc[positive_mask]
plot_pvals = -np.log10(var[pval_col])
if adj:
ylabel = "-log10(adj. p-value)"
else:
ylabel = "-log10(p-value)"
else:
non_negative = var[pval_col] >= 0
if not non_negative.all():
warnings.warn(
"Dropping negative p-values before plotting.",
RuntimeWarning,
)
var = var.loc[non_negative]
plot_pvals = var[pval_col]
ylabel = "adj. p-value" if adj else "p-value"
if var.empty:
raise ValueError("No valid proteoform scores available for plotting.")
def _validate_threshold(
value: float | int | None,
*,
name: str,
allow_zero: bool = False,
upper_bound: float | None = None,
) -> float | int | None:
if value is None:
return None
if isinstance(value, bool):
raise ValueError(f"{name} must be a number, not bool.")
if not isinstance(value, (int, float, np.integer, np.floating)):
raise ValueError(f"{name} must be a real number.")
if not np.isfinite(value):
raise ValueError(f"{name} must be a finite number.")
if not allow_zero and value <= 0:
raise ValueError(f"{name} must be greater than 0.")
if upper_bound is not None and value > upper_bound:
raise ValueError(
f"{name} must be less than or equal to {upper_bound}."
)
return value
pval_threshold = _validate_threshold(
pval_threshold,
name="pval_threshold",
allow_zero=False,
upper_bound=1.0,
)
score_threshold = _validate_threshold(
score_threshold,
name="score_threshold",
allow_zero=True,
)
if pval_threshold is not None:
if log_scores:
pval_threshold_line = -np.log10(pval_threshold)
else:
pval_threshold_line = pval_threshold
else:
pval_threshold_line = None
mask = pd.Series(True, index=var.index)
has_condition = False
if score_threshold is not None:
mask &= var["proteoform_score"] >= score_threshold
has_condition = True
if pval_threshold is not None:
mask &= var[pval_col] <= pval_threshold
has_condition = True
if not has_condition:
mask[:] = False
var["is_above_threshold"] = mask
var["plot_pval"] = plot_pvals
if ax is not None:
_ax = ax
_fig = _ax.get_figure()
else:
_fig, _ax = plt.subplots()
sns.scatterplot(
data=var,
x="proteoform_score",
y="plot_pval",
hue="is_above_threshold",
palette={True: "#008A1D", False: "#BDBDBD"},
alpha=0.5,
s=30,
edgecolor=None,
legend=False,
ax=_ax,
)
# -- Highlight selected proteins with text labels --------
if highlight_prots is not None:
if not isinstance(highlight_prots, list) or not all(
isinstance(v, str) for v in highlight_prots
):
raise TypeError("`highlight_prots` must be a list of strings.")
# Build protein_id <-> display label mapping.
if protein_id_key is not None:
if protein_id_key not in adata.var.columns:
raise ValueError(
f"Column '{protein_id_key}' not found "
"in `adata.var`."
)
# Validate 1-to-1 mapping.
mapping_df = adata.var[
["protein_id", protein_id_key]
].drop_duplicates()
dup_proteins = (
mapping_df
.groupby("protein_id")[protein_id_key]
.nunique()
)
bad = dup_proteins[dup_proteins > 1]
if not bad.empty:
raise ValueError(
"1-to-1 mapping violation between "
f"'protein_id' and '{protein_id_key}' "
"for protein(s): "
f"{sorted(bad.index.tolist())}"
)
pid_to_label = dict(
zip(
mapping_df["protein_id"],
mapping_df[protein_id_key],
)
)
label_to_pid = dict(
zip(
mapping_df[protein_id_key],
mapping_df["protein_id"],
)
)
# highlight_prots may contain protein_id_key
# values — resolve them to protein_ids.
known_labels = set(mapping_df[protein_id_key])
resolved_pids = set()
unknown = (set(highlight_prots) - known_labels)
if unknown:
raise ValueError(
"The following values from "
"`highlight_prots` are not found in "
f"`adata.var['{protein_id_key}']`: "
f"{sorted(unknown)}"
)
highlight_pids = {
label_to_pid[v] for v in highlight_prots
}
else:
pid_to_label = None
known_ids = set(adata.var["protein_id"])
unknown = set(highlight_prots) - known_ids
if unknown:
raise ValueError(
"The following protein IDs from "
"`highlight_prots` are not found in "
"`adata.var['protein_id']`: "
f"{sorted(unknown)}"
)
highlight_pids = set(highlight_prots)
# Map var index back to protein_id for the
# deduplicated var DataFrame.
pid_series = adata.var.loc[var.index, "protein_id"]
highlight_mask = pid_series.isin(highlight_pids)
var_highlight = var.loc[highlight_mask.values]
if not var_highlight.empty:
texts = []
for idx in var_highlight.index:
pid = pid_series.loc[idx]
if pid_to_label is not None:
label = pid_to_label[pid]
else:
label = pid
texts.append(
_ax.text(
var_highlight.loc[idx, "proteoform_score"],
var_highlight.loc[idx, "plot_pval"],
label,
fontsize=protein_label_fontsize,
color=protein_label_color,
)
)
adjust_text(
texts,
x=var["proteoform_score"].values,
y=var["plot_pval"].values,
ax=_ax,
force_points=(2.0, 2.0),
force_text=(1.0, 1.0),
expand=(2.0, 2.0),
arrowprops=dict(
arrowstyle="-",
color="grey",
lw=0.5,
),
)
if score_threshold is not None:
_ax.axvline(
score_threshold,
color="#A2A2A2",
linestyle="--",
)
if pval_threshold_line is not None:
_ax.axhline(
pval_threshold_line,
color="#A2A2A2",
linestyle="--",
)
_ax.set_xlabel("Proteoform Score")
_ax.set_ylabel(ylabel)
_fig.tight_layout()
if save is not None:
if not isinstance(save, (str, Path)):
raise TypeError(
"`save` must be a path-like object or None."
)
_fig.savefig(save, dpi=300, bbox_inches="tight")
if show:
plt.show()
return _ax