Source code for proteopy.pl.sequence

from __future__ import annotations

from collections.abc import Callable, Sequence as SequenceABC
from pathlib import Path
from typing import Union

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.axes import Axes
from matplotlib.colors import Colormap
import anndata as ad

from proteopy.utils.anndata import check_proteodata
from proteopy.utils.matplotlib import _resolve_color_scheme

ColorScheme = Union[
    str, dict, SequenceABC, Colormap, Callable, None
]


def _find_sequence_positions(
    seq: str,
    ref_sequence: str,
) -> list[int]:
    """Find all start positions of ``seq`` within ``ref_sequence``."""
    positions = []
    start = 0
    while True:
        idx = ref_sequence.find(seq, start)
        if idx == -1:
            break
        positions.append(idx)
        start = idx + 1
    return positions


def _resolve_seq_coord(
    name: str,
    entry: dict,
    ref_len: int,
) -> dict[str, dict]:
    """Validate and resolve a seq_coord entry to coordinates."""
    start, end = entry["seq_coord"]
    group = entry["group"]
    if start >= end:
        raise ValueError(
            f"Invalid interval [{start}, {end}) "
            f"for '{name}': start must be less "
            f"than end."
        )
    if start < 0 or end > ref_len:
        raise ValueError(
            f"Sequence coordinate [{start}, {end}) "
            f"for '{name}' is out of bounds "
            f"[0, {ref_len})."
        )
    return {
        name: {
            "start": start,
            "end": end,
            "group": group,
        }
    }


def _resolve_seq_string(
    name: str,
    entry: dict,
    ref_sequence: str,
    allow_multi_match: bool,
) -> dict[str, dict]:
    """Locate a seq string in the reference and return coordinates."""
    seq = entry["seq"]
    group = entry["group"]
    if not seq:
        raise ValueError(
            f"Entry '{name}' has an empty "
            f"'seq' string."
        )
    positions = _find_sequence_positions(
        seq, ref_sequence,
    )

    if not positions:
        raise ValueError(
            f"Sequence '{name}' ('{seq}') was not "
            f"found in the reference sequence."
        )

    if len(positions) > 1 and not allow_multi_match:
        raise ValueError(
            f"Sequence '{name}' ('{seq}') matches "
            f"the reference at {len(positions)} "
            f"positions ({positions}). Set "
            f"allow_multi_match=True to show all "
            f"matches as separate bars."
        )

    result = {}
    if len(positions) == 1:
        result[name] = {
            "start": positions[0],
            "end": positions[0] + len(seq),
            "group": group,
        }
    else:
        for i, pos in enumerate(positions, start=1):
            match_name = f"{name} (match {i})"
            result[match_name] = {
                "start": pos,
                "end": pos + len(seq),
                "group": group,
            }
    return result


def _resolve_sequences(
    ref_sequence: str,
    sequences: dict[str, dict],
    allow_multi_match: bool,
) -> dict[str, dict]:
    """Resolve each sequence entry to absolute ``[start, end)`` coordinates.

    For entries with a ``"seq"`` key the amino-acid string is located
    within ``ref_sequence`` via substring search.  Entries with a
    ``"seq_coord"`` key are validated against the reference length.
    When ``allow_multi_match`` is ``True`` and a substring appears
    more than once, one entry per match is emitted with the suffix
    ``" (match N)"``.

    Parameters
    ----------
    ref_sequence : str
        Full reference protein sequence.
    sequences : dict[str, dict]
        Mapping of sequence name to a dict containing
        ``"group"`` and either ``"seq"`` or ``"seq_coord"``.
    allow_multi_match : bool
        If ``False``, raise ``ValueError`` when a ``"seq"``
        string matches the reference at more than one position.

    Returns
    -------
    dict[str, dict]
        Mapping of (possibly expanded) sequence names to dicts
        with ``"start"``, ``"end"``, and ``"group"`` keys.

    Raises
    ------
    ValueError
        If a ``"seq"`` string is not found in the reference,
        if coordinates are out of bounds, if an entry lacks
        both ``"seq"`` and ``"seq_coord"``, or if a sequence
        multi-matches and ``allow_multi_match`` is ``False``.
    """
    ref_len = len(ref_sequence)
    resolved = {}

    for name, entry in sequences.items():
        if "seq" in entry and "seq_coord" in entry:
            raise ValueError(
                f"Entry '{name}' has both 'seq' and "
                f"'seq_coord'; provide only one."
            )

        if "seq_coord" in entry:
            result = _resolve_seq_coord(name, entry, ref_len)
        elif "seq" in entry:
            result = _resolve_seq_string(
                name, entry, ref_sequence, allow_multi_match,
            )
        else:
            raise ValueError(
                f"Entry '{name}' must have either 'seq' "
                f"or 'seq_coord' key."
            )
        resolved.update(result)

    return resolved


def _check_overlaps(
    groups: dict[str, list[tuple[int, int, str]]],
) -> None:
    """Raise ``ValueError`` if any two sequences within a group overlap."""
    for group_name, intervals in groups.items():
        sorted_intervals = sorted(intervals, key=lambda x: x[0])
        for i in range(len(sorted_intervals) - 1):
            _, end_a, name_a = sorted_intervals[i]
            start_b, _, name_b = sorted_intervals[i + 1]
            if end_a > start_b:
                raise ValueError(
                    f"Overlapping sequences in group "
                    f"'{group_name}': '{name_a}' "
                    f"[..., {end_a}) overlaps with "
                    f"'{name_b}' [{start_b}, ...). "
                    f"Set allow_overlaps=True to permit "
                    f"overlapping sequences."
                )


def _group_sequences(
    resolved: dict[str, dict],
    order: list[str] | None,
) -> tuple[list[str], dict[str, list]]:
    """Build group ordering and entries from resolved sequences."""
    group_order = []
    group_entries = {}
    for name, entry in resolved.items():
        g = entry["group"]
        if g not in group_entries:
            group_order.append(g)
            group_entries[g] = []
        group_entries[g].append(
            (entry["start"], entry["end"], name)
        )

    if order is not None:
        unknown = set(order) - set(group_order)
        if unknown:
            raise ValueError(
                f"Groups in 'order' not found in "
                f"sequences: {sorted(unknown)}."
            )
        group_order = list(order)

    return group_order, group_entries


def _plot_sequences_on_reference(
    ref_sequence: str,
    sequences: dict[str, dict],
    color_scheme: ColorScheme = None,
    allow_overlaps: bool = False,
    allow_multi_match: bool = False,
    title: str | None = None,
    order: list[str] | None = None,
    figsize: tuple[float, float] | None = None,
    ax: Axes | None = None,
) -> Axes:
    """Render sequences as horizontal bars aligned to a reference sequence.

    Draws a grey reference bar at the bottom and one colored broken-barh
    row per group above it. Each entry in ``sequences`` must supply either a
    ``"seq"`` key (amino-acid string to locate by substring search) or a
    ``"seq_coord"`` key (pre-computed ``[start, end)`` tuple), plus a
    ``"group"`` key that determines row assignment and legend label.

    Parameters
    ----------
    ref_sequence : str
        Full reference protein sequence against which all sequences are
        plotted.
    sequences : dict[str, dict]
        Mapping of sequence name to a dict with keys:

        - ``"group"`` (*str*) -- row label for this sequence.
        - ``"seq"`` (*str*, optional) -- amino-acid string; matched by
          substring search within ``ref_sequence``.
        - ``"seq_coord"`` (*tuple[int, int]*, optional) -- explicit
          ``[start, end)`` coordinates; mutually exclusive with
          ``"seq"``.
    color_scheme : str | dict | list | Colormap | callable | None
        One color per group.
    allow_overlaps : bool
        Skip overlap validation when ``True``; raise ``ValueError``
        for any two sequences in the same group that overlap when
        ``False``.
    allow_multi_match : bool
        When ``True``, sequences matching the reference at more than
        one position are plotted as separate bars. When ``False``, a
        ``ValueError`` is raised for multi-matching sequences.
    title : str | None
        Axes title.
    order : list[str] | None
        Explicit ordering of group rows. Groups absent from ``order`` are
        excluded.
    figsize : tuple[float, float] | None
        Figure dimensions ``(width, height)`` in inches passed to
        ``plt.subplots``.
    ax : matplotlib.axes.Axes | None
        Matplotlib Axes object to plot onto. If ``None``, a new
        figure and axes are created. The function always returns
        the Axes object used for plotting.

    Returns
    -------
    matplotlib.axes.Axes
        The Axes object used for plotting.
    """
    resolved = _resolve_sequences(
        ref_sequence, sequences, allow_multi_match,
    )

    group_order, group_entries = _group_sequences(resolved, order)

    if not allow_overlaps:
        _check_overlaps(group_entries)

    # Resolve colors
    colors = _resolve_color_scheme(color_scheme, group_order)
    color_map = {}
    if colors is not None:
        color_map = dict(zip(group_order, colors))

    # Create figure or use provided axes
    if ax is None:
        if figsize is None:
            figsize = (12, 1.5 + len(group_order) * 0.8)
        _, _ax = plt.subplots(figsize=figsize)
    else:
        _ax = ax

    ref_len = len(ref_sequence)
    bar_height = 0.6

    # Reference bar at y=0
    _ax.broken_barh(
        [(0, ref_len)],
        (0, bar_height),
        facecolors="lightgrey",
        edgecolors="grey",
        linewidth=0.5,
    )

    # Group bars stacked upward
    y_labels = ["Reference"]
    y_positions = [bar_height / 2]

    for i, group_name in enumerate(group_order):
        y_base = (i + 1) * (bar_height + 0.3)
        intervals = [
            (entry[0], entry[1] - entry[0])
            for entry in group_entries[group_name]
        ]
        face_color = color_map.get(group_name, "C0")
        _ax.broken_barh(
            intervals,
            (y_base, bar_height),
            facecolors=face_color,
            edgecolors="black",
            linewidth=0.5,
        )
        y_labels.append(group_name)
        y_positions.append(y_base + bar_height / 2)

    # Configure axes
    _ax.set_xlim(0, ref_len)
    _ax.set_yticks(y_positions)
    _ax.set_yticklabels(y_labels)
    _ax.set_xlabel("Position")

    if title is not None:
        _ax.set_title(title)

    # Remove top and right spines
    _ax.spines["top"].set_visible(False)
    _ax.spines["right"].set_visible(False)

    return _ax


def _extract_peptide_sequences(
    var_sub: pd.DataFrame,
    alt_pep_sequence_key: str | None,
) -> pd.Series | pd.Index:
    """Return peptide sequences from var subset."""
    if alt_pep_sequence_key is not None:
        if alt_pep_sequence_key not in var_sub.columns:
            raise KeyError(
                f"Column '{alt_pep_sequence_key}' not "
                f"found in adata.var."
            )
        return var_sub[alt_pep_sequence_key]
    return var_sub.index


def _extract_groups(
    var_sub: pd.DataFrame,
    group_by: str | None,
    filter_value: str,
) -> pd.Series:
    """Return group labels for each peptide in var subset."""
    if group_by is not None:
        if group_by not in var_sub.columns:
            raise KeyError(
                f"Column '{group_by}' not found in "
                f"adata.var."
            )
        return var_sub[group_by]
    return pd.Series(
        filter_value, index=var_sub.index,
    )


def _build_sequences_dict(
    var_sub: pd.DataFrame,
    pep_seqs: pd.Series | pd.Index,
    groups: pd.Series,
    add_sequences: dict[str, dict] | None,
) -> dict[str, dict]:
    """Build sequences dict from peptides and merge additions."""
    sequences = {}
    for pep_id, seq_val, grp_val in zip(
        var_sub.index, pep_seqs, groups,
    ):
        if pd.isna(grp_val):
            continue
        sequences[pep_id] = {
            "seq": str(seq_val),
            "group": str(grp_val),
        }

    if add_sequences is not None:
        conflicts = set(sequences) & set(add_sequences)
        if conflicts:
            raise ValueError(
                f"Name conflicts between peptide "
                f"sequences and additional sequences: "
                f"{sorted(conflicts)}."
            )
        sequences.update(add_sequences)

    return sequences


[docs] def peptides_on_sequence( adata: ad.AnnData, filter_key: str, filter_value: str, group_by: str | None = None, ref_sequence: str | None = None, alt_pep_sequence_key: str | None = None, add_sequences: dict[str, dict] | None = None, allow_overlaps: bool = False, allow_multi_match: bool = False, color_scheme: ColorScheme = None, title: str | None = None, order: list[str] | None = None, figsize: tuple[float, float] | None = None, show: bool = True, save: str | Path | None = None, ax: Axes | None = None, ) -> Axes: """Plot peptide coverage on a reference sequence as horizontal bars. Filters peptides from a peptide-level :class:`~anndata.AnnData` using the ``filter_key`` and ``filter_value`` parameters, which are then drawn as broken bars on top of a grey reference bar. Extra sequences (e.g., domains or post-translational modification sites) can be overlaid via the ``add_sequences`` parameter. Parameters ---------- adata : AnnData Peptide-level :class:`~anndata.AnnData`. filter_key : str Column in ``adata.var`` used to select peptides. filter_value : str Value in the ``filter_key`` column to match. group_by : str | None Column in ``adata.var`` used to assign peptides to colored rows. When ``None``, all peptides are placed in a single row labeled ``filter_value``. ref_sequence : str | None Full reference protein sequence as an amino-acid string. alt_pep_sequence_key : str | None Column in ``adata.var`` whose values are used as peptide amino-acid strings. When ``None``, ``adata.var_names`` (i.e., ``peptide_id``) are used. add_sequences : dict[str, dict] | None Additional named sequences to overlay (e.g., domains). Each value must be a dict with a ``"group"`` key and either a ``"seq"`` or ``"seq_coord"`` key. Coordinates are 0-based throughout. ``"seq_coord"`` must be a half-open ``[start, end)`` tuple where ``start >= 0`` and ``end <= len(ref_sequence)``. allow_overlaps : bool When ``False``, raise ``ValueError`` if two sequences in the same group overlap positionally. allow_multi_match : bool When ``True``, a peptide sequence matching the reference at multiple positions is shown as one bar per match, labeled ``"<name> (match N)"``. When ``False``, a ``ValueError`` is raised for ambiguous matches. color_scheme : str | dict | list | Colormap | callable | None Color specification for groups. title : str | None Axes title. order : list[str] | None Explicit ordering of group rows. figsize : tuple[float, float] | None Figure dimensions ``(width, height)`` in inches. show : bool Call ``plt.show()`` at the end. save : str | Path | None Path at which to save the figure (300 dpi, ``bbox_inches="tight"``). When ``None``, the figure is not saved. ax : matplotlib.axes.Axes | None Matplotlib Axes object to plot onto. If ``None``, a new figure and axes are created. The function always returns the Axes object used for plotting. Returns ------- matplotlib.axes.Axes The Axes object used for plotting. Raises ------ ValueError If ``adata`` is not at peptide level. ValueError If ``ref_sequence`` is ``None``. ValueError If no peptides match ``filter_value`` in ``adata.var[filter_key]``. ValueError If a peptide sequence is not found in the reference. ValueError If a peptide matches the reference at multiple positions and ``allow_multi_match=False``. ValueError If two sequences in the same group overlap and ``allow_overlaps=False``. KeyError If ``filter_key``, ``alt_pep_sequence_key``, or ``group_by`` is not found in ``adata.var``. Examples -------- Build a minimal peptide-level AnnData with two proteins. ``"ELPGW"`` belongs to P2 and is excluded by the filter. >>> import numpy as np >>> import pandas as pd >>> import anndata as ad >>> import proteopy as pr >>> ref = "MKTAYIAKQRQISFVKSHFSRQDEEP" >>> peptides = ["KTAYIAK", "QRQISFVK", "SHFSRQ", "ELPGW"] >>> var = pd.DataFrame( ... { ... "peptide_id": peptides, ... "protein_id": ["P1", "P1", "P1", "P2"], ... "proteoform": ["A", "A", "B", "A"], ... }, ... index=peptides, ... ) >>> obs = pd.DataFrame( ... {"sample_id": ["S1", "S2"]}, ... index=["S1", "S2"], ... ) >>> adata = ad.AnnData( ... X=np.ones((2, 4)), ... obs=obs, ... var=var, ... ) Filter to P1; group peptides by proteoform. >>> pr.pl.peptides_on_sequence( ... adata, ... filter_key="protein_id", ... filter_value="P1", ... ref_sequence=ref, ... group_by="proteoform", ... ) """ _, level = check_proteodata(adata) if level != "peptide": raise ValueError( "peptides_on_sequence requires peptide-level " "data. The provided AnnData is at the " f"'{level}' level." ) if ref_sequence is None: raise ValueError( "'ref_sequence' must be provided." ) # Filter .var var = adata.var if filter_key not in var.columns: raise KeyError( f"Column '{filter_key}' not found in " f"adata.var." ) mask = var[filter_key] == filter_value if not mask.any(): raise ValueError( f"No peptides found where " f"adata.var['{filter_key}'] == " f"'{filter_value}'." ) var_sub = var.loc[mask] pep_seqs = _extract_peptide_sequences( var_sub, alt_pep_sequence_key, ) groups = _extract_groups( var_sub, group_by, filter_value, ) sequences = _build_sequences_dict( var_sub, pep_seqs, groups, add_sequences, ) # Resolve title: default to filter_value if title is None: title = str(filter_value) # Plot _ax = _plot_sequences_on_reference( ref_sequence=ref_sequence, sequences=sequences, color_scheme=color_scheme, allow_overlaps=allow_overlaps, allow_multi_match=allow_multi_match, title=title, order=order, figsize=figsize, ax=ax, ) fig = _ax.figure if ax is None: fig.tight_layout() if save is not None: fig.savefig(save, dpi=300, bbox_inches="tight") if show: plt.show() return _ax
[docs] def peptides_on_prot_sequence( adata: ad.AnnData, protein_id: str, group_by: str | None = None, ref_sequence: str | None = None, alt_pep_sequence_key: str | None = None, add_sequences: dict[str, dict] | None = None, allow_overlaps: bool = False, allow_multi_match: bool = False, color_scheme: ColorScheme = None, title: str | None = None, order: list[str] | None = None, figsize: tuple[float, float] | None = None, show: bool = True, save: str | Path | None = None, ax: Axes | None = None, ) -> Axes: """Plot peptide coverage on a protein reference sequence. Filters peptides belonging to the given ``protein_id`` and draws them as broken bars on top of a grey reference bar. Extra sequences (e.g., domains or post-translational modification sites) can be overlaid via the ``add_sequences`` parameter. Parameters ---------- adata : AnnData Peptide-level :class:`~anndata.AnnData`. protein_id : str Value in ``adata.var["protein_id"]`` to select peptides for. group_by : str | None Column in ``adata.var`` used to assign peptides to colored rows. When ``None``, all peptides are placed in a single row labeled ``protein_id``. ref_sequence : str | None Full reference protein sequence as an amino-acid string. alt_pep_sequence_key : str | None Column in ``adata.var`` whose values are used as peptide amino-acid strings. When ``None``, ``adata.var_names`` (i.e., ``peptide_id``) are used. add_sequences : dict[str, dict] | None Additional named sequences to overlay (e.g., domains). Each value must be a dict with a ``"group"`` key and either a ``"seq"`` or ``"seq_coord"`` key. Coordinates are 0-based throughout. ``"seq_coord"`` must be a half-open ``[start, end)`` tuple where ``start >= 0`` and ``end <= len(ref_sequence)``. allow_overlaps : bool When ``False``, raise ``ValueError`` if two sequences in the same group overlap positionally. allow_multi_match : bool When ``True``, a peptide sequence matching the reference at multiple positions is shown as one bar per match, labeled ``"<name> (match N)"``. When ``False``, a ``ValueError`` is raised for ambiguous matches. color_scheme : str | dict | list | Colormap | callable | None Color specification for groups. title : str | None Axes title. order : list[str] | None Explicit ordering of group rows. figsize : tuple[float, float] | None Figure dimensions ``(width, height)`` in inches. show : bool Call ``plt.show()`` at the end. save : str | Path | None Path at which to save the figure (300 dpi, ``bbox_inches="tight"``). When ``None``, the figure is not saved. ax : matplotlib.axes.Axes | None Matplotlib Axes object to plot onto. If ``None``, a new figure and axes are created. The function always returns the Axes object used for plotting. Returns ------- matplotlib.axes.Axes The Axes object used for plotting. Raises ------ ValueError If ``adata`` is not at peptide level. ValueError If ``ref_sequence`` is ``None``. ValueError If no peptides match ``protein_id`` in ``adata.var["protein_id"]``. ValueError If a peptide sequence is not found in the reference. ValueError If a peptide matches the reference at multiple positions and ``allow_multi_match=False``. ValueError If two sequences in the same group overlap and ``allow_overlaps=False``. KeyError If ``alt_pep_sequence_key`` or ``group_by`` is not found in ``adata.var``. Examples -------- Build a minimal peptide-level AnnData with two proteins. ``"ELPGW"`` belongs to P2 and is excluded by the filter. >>> import numpy as np >>> import pandas as pd >>> import anndata as ad >>> import proteopy as pr >>> ref = "MKTAYIAKQRQISFVKSHFSRQDEEP" >>> peptides = [ ... "KTAYIAK", "QRQISFVK", "SHFSRQ", "ELPGW", ... ] >>> var = pd.DataFrame( ... { ... "peptide_id": peptides, ... "protein_id": ["P1", "P1", "P1", "P2"], ... "proteoform": ["A", "A", "B", "A"], ... }, ... index=peptides, ... ) >>> obs = pd.DataFrame( ... {"sample_id": ["S1", "S2"]}, ... index=["S1", "S2"], ... ) >>> adata = ad.AnnData( ... X=np.ones((2, 4)), ... obs=obs, ... var=var, ... ) Filter to P1; group peptides by proteoform. >>> pr.pl.peptides_on_prot_sequence( ... adata, ... protein_id="P1", ... ref_sequence=ref, ... group_by="proteoform", ... ) """ return peptides_on_sequence( adata, filter_key="protein_id", filter_value=protein_id, group_by=group_by, ref_sequence=ref_sequence, alt_pep_sequence_key=alt_pep_sequence_key, add_sequences=add_sequences, allow_overlaps=allow_overlaps, allow_multi_match=allow_multi_match, color_scheme=color_scheme, title=title, order=order, figsize=figsize, show=show, save=save, ax=ax, )