Source code for proteopy.pl.sequence

from __future__ import annotations

from collections.abc import Callable, Sequence as SequenceABC
from pathlib import Path
from typing import Union

import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.axes import Axes
from matplotlib.colors import Colormap
import anndata as ad

from proteopy.utils.anndata import check_proteodata
from proteopy.utils.matplotlib import _resolve_color_scheme

ColorScheme = Union[
    str, dict, SequenceABC, Colormap, Callable, None
]


def _find_sequence_positions(
    seq: str,
    ref_sequence: str,
) -> list[int]:
    """Find all start positions of ``seq`` within ``ref_sequence``."""
    positions = []
    start = 0
    while True:
        idx = ref_sequence.find(seq, start)
        if idx == -1:
            break
        positions.append(idx)
        start = idx + 1
    return positions


def _resolve_seq_coord(
    name: str,
    entry: dict,
    ref_len: int,
) -> dict[str, dict]:
    """Validate and resolve a seq_coord entry to coordinates."""
    start, end = entry["seq_coord"]
    group = entry["group"]
    if start >= end:
        raise ValueError(
            f"Invalid interval [{start}, {end}) "
            f"for '{name}': start must be less "
            f"than end."
        )
    if start < 0 or end > ref_len:
        raise ValueError(
            f"Sequence coordinate [{start}, {end}) "
            f"for '{name}' is out of bounds "
            f"[0, {ref_len})."
        )
    return {
        name: {
            "start": start,
            "end": end,
            "group": group,
        }
    }


def _resolve_seq_string(
    name: str,
    entry: dict,
    ref_sequence: str,
    allow_multi_match: bool,
) -> dict[str, dict]:
    """Locate a seq string in the reference and return coordinates."""
    seq = entry["seq"]
    group = entry["group"]
    if not seq:
        raise ValueError(
            f"Entry '{name}' has an empty "
            f"'seq' string."
        )
    positions = _find_sequence_positions(
        seq, ref_sequence,
    )

    if not positions:
        raise ValueError(
            f"Sequence '{name}' ('{seq}') was not "
            f"found in the reference sequence."
        )

    if len(positions) > 1 and not allow_multi_match:
        raise ValueError(
            f"Sequence '{name}' ('{seq}') matches "
            f"the reference at {len(positions)} "
            f"positions ({positions}). Set "
            f"allow_multi_match=True to show all "
            f"matches as separate bars."
        )

    result = {}
    if len(positions) == 1:
        result[name] = {
            "start": positions[0],
            "end": positions[0] + len(seq),
            "group": group,
        }
    else:
        for i, pos in enumerate(positions, start=1):
            match_name = f"{name} (match {i})"
            result[match_name] = {
                "start": pos,
                "end": pos + len(seq),
                "group": group,
            }
    return result


def _resolve_sequences(
    ref_sequence: str,
    sequences: dict[str, dict],
    allow_multi_match: bool,
) -> dict[str, dict]:
    """Resolve each sequence entry to absolute ``[start, end)`` coordinates.

    For entries with a ``"seq"`` key the amino-acid string is located
    within ``ref_sequence`` via substring search.  Entries with a
    ``"seq_coord"`` key are validated against the reference length.
    When ``allow_multi_match`` is ``True`` and a substring appears
    more than once, one entry per match is emitted with the suffix
    ``" (match N)"``.

    Parameters
    ----------
    ref_sequence : str
        Full reference protein sequence.
    sequences : dict[str, dict]
        Mapping of sequence name to a dict containing
        ``"group"`` and either ``"seq"`` or ``"seq_coord"``.
    allow_multi_match : bool
        If ``False``, raise ``ValueError`` when a ``"seq"``
        string matches the reference at more than one position.

    Returns
    -------
    dict[str, dict]
        Mapping of (possibly expanded) sequence names to dicts
        with ``"start"``, ``"end"``, and ``"group"`` keys.

    Raises
    ------
    ValueError
        If a ``"seq"`` string is not found in the reference,
        if coordinates are out of bounds, if an entry lacks
        both ``"seq"`` and ``"seq_coord"``, or if a sequence
        multi-matches and ``allow_multi_match`` is ``False``.
    """
    ref_len = len(ref_sequence)
    resolved = {}

    for name, entry in sequences.items():
        if "seq" in entry and "seq_coord" in entry:
            raise ValueError(
                f"Entry '{name}' has both 'seq' and "
                f"'seq_coord'; provide only one."
            )

        if "seq_coord" in entry:
            result = _resolve_seq_coord(name, entry, ref_len)
        elif "seq" in entry:
            result = _resolve_seq_string(
                name, entry, ref_sequence, allow_multi_match,
            )
        else:
            raise ValueError(
                f"Entry '{name}' must have either 'seq' "
                f"or 'seq_coord' key."
            )
        resolved.update(result)

    return resolved


def _check_overlaps(
    groups: dict[str, list[tuple[int, int, str]]],
) -> None:
    """Raise ``ValueError`` if any two sequences within a group overlap."""
    for group_name, intervals in groups.items():
        sorted_intervals = sorted(intervals, key=lambda x: x[0])
        for i in range(len(sorted_intervals) - 1):
            _, end_a, name_a = sorted_intervals[i]
            start_b, _, name_b = sorted_intervals[i + 1]
            if end_a > start_b:
                raise ValueError(
                    f"Overlapping sequences in group "
                    f"'{group_name}': '{name_a}' "
                    f"[..., {end_a}) overlaps with "
                    f"'{name_b}' [{start_b}, ...). "
                    f"Set allow_overlaps=True to permit "
                    f"overlapping sequences."
                )


def _group_sequences(
    resolved: dict[str, dict],
    order: list[str] | None,
) -> tuple[list[str], dict[str, list]]:
    """Build group ordering and entries from resolved sequences."""
    group_order = []
    group_entries = {}
    for name, entry in resolved.items():
        g = entry["group"]
        if g not in group_entries:
            group_order.append(g)
            group_entries[g] = []
        group_entries[g].append(
            (entry["start"], entry["end"], name)
        )

    if order is not None:
        unknown = set(order) - set(group_order)
        if unknown:
            raise ValueError(
                f"Groups in 'order' not found in "
                f"sequences: {sorted(unknown)}."
            )
        group_order = list(order)

    return group_order, group_entries


def _plot_sequences_on_reference(
    ref_sequence: str,
    sequences: dict[str, dict],
    color_scheme: ColorScheme = None,
    allow_overlaps: bool = False,
    allow_multi_match: bool = False,
    title: str | None = None,
    order: list[str] | None = None,
    figsize: tuple[float, float] | None = None,
    ax: Axes | None = None,
) -> Axes:
    """Render sequences as horizontal bars aligned to a reference sequence.

    Draws a grey reference bar at the bottom and one colored broken-barh
    row per group above it. Each entry in ``sequences`` must supply either a
    ``"seq"`` key (amino-acid string to locate by substring search) or a
    ``"seq_coord"`` key (pre-computed ``[start, end)`` tuple), plus a
    ``"group"`` key that determines row assignment and legend label.

    Parameters
    ----------
    ref_sequence : str
        Full reference protein sequence against which all sequences are
        plotted.
    sequences : dict[str, dict]
        Mapping of sequence name to a dict with keys:

        - ``"group"`` (*str*) -- row label for this sequence.
        - ``"seq"`` (*str*, optional) -- amino-acid string; matched by
          substring search within ``ref_sequence``.
        - ``"seq_coord"`` (*tuple[int, int]*, optional) -- explicit
          ``[start, end)`` coordinates; mutually exclusive with
          ``"seq"``.
    color_scheme : str | dict | list | Colormap | callable | None
        One color per group.
    allow_overlaps : bool
        Skip overlap validation when ``True``; raise ``ValueError``
        for any two sequences in the same group that overlap when
        ``False``.
    allow_multi_match : bool
        When ``True``, sequences matching the reference at more than
        one position are plotted as separate bars. When ``False``, a
        ``ValueError`` is raised for multi-matching sequences.
    title : str | None
        Axes title.
    order : list[str] | None
        Explicit ordering of group rows. Groups absent from ``order`` are
        excluded.
    figsize : tuple[float, float] | None
        Figure dimensions ``(width, height)`` in inches passed to
        ``plt.subplots``.
    ax : matplotlib.axes.Axes | None
        Matplotlib Axes object to plot onto. If ``None``, a new
        figure and axes are created. The function always returns
        the Axes object used for plotting.

    Returns
    -------
    matplotlib.axes.Axes
        The Axes object used for plotting.
    """
    resolved = _resolve_sequences(
        ref_sequence, sequences, allow_multi_match,
    )

    group_order, group_entries = _group_sequences(resolved, order)

    if not allow_overlaps:
        _check_overlaps(group_entries)

    # Resolve colors
    colors = _resolve_color_scheme(color_scheme, group_order)
    color_map = {}
    if colors is not None:
        color_map = dict(zip(group_order, colors))

    # Create figure or use provided axes
    if ax is None:
        if figsize is None:
            figsize = (12, 1.5 + len(group_order) * 0.8)
        _, _ax = plt.subplots(figsize=figsize)
    else:
        _ax = ax

    ref_len = len(ref_sequence)
    bar_height = 0.6

    # Reference bar at y=0
    _ax.broken_barh(
        [(0, ref_len)],
        (0, bar_height),
        facecolors="lightgrey",
        edgecolors="grey",
        linewidth=0.5,
    )

    # Group bars stacked upward
    y_labels = ["Reference"]
    y_positions = [bar_height / 2]

    for i, group_name in enumerate(group_order):
        y_base = (i + 1) * (bar_height + 0.3)
        intervals = [
            (entry[0], entry[1] - entry[0])
            for entry in group_entries[group_name]
        ]
        face_color = color_map.get(group_name, "C0")
        _ax.broken_barh(
            intervals,
            (y_base, bar_height),
            facecolors=face_color,
            edgecolors="black",
            linewidth=0.5,
        )
        y_labels.append(group_name)
        y_positions.append(y_base + bar_height / 2)

    # Configure axes
    _ax.set_xlim(0, ref_len)
    _ax.set_yticks(y_positions)
    _ax.set_yticklabels(y_labels)
    _ax.set_xlabel("Position")

    if title is not None:
        _ax.set_title(title)

    # Remove top and right spines
    _ax.spines["top"].set_visible(False)
    _ax.spines["right"].set_visible(False)

    return _ax


def _extract_peptide_sequences(
    var_sub: pd.DataFrame,
    alt_pep_sequence_key: str | None,
) -> pd.Series | pd.Index:
    """Return peptide sequences from var subset."""
    if alt_pep_sequence_key is not None:
        if alt_pep_sequence_key not in var_sub.columns:
            raise KeyError(
                f"Column '{alt_pep_sequence_key}' not "
                f"found in adata.var."
            )
        return var_sub[alt_pep_sequence_key]
    return var_sub.index


def _extract_groups(
    var_sub: pd.DataFrame,
    group_by: str | None,
    filter_value: str,
) -> pd.Series:
    """Return group labels for each peptide in var subset."""
    if group_by is not None:
        if group_by not in var_sub.columns:
            raise KeyError(
                f"Column '{group_by}' not found in "
                f"adata.var."
            )
        return var_sub[group_by]
    return pd.Series(
        filter_value, index=var_sub.index,
    )


def _build_sequences_dict(
    var_sub: pd.DataFrame,
    pep_seqs: pd.Series | pd.Index,
    groups: pd.Series,
    add_sequences: dict[str, dict] | None,
) -> dict[str, dict]:
    """Build sequences dict from peptides and merge additions."""
    sequences = {}
    for pep_id, seq_val, grp_val in zip(
        var_sub.index, pep_seqs, groups,
    ):
        if pd.isna(grp_val):
            continue
        sequences[pep_id] = {
            "seq": str(seq_val),
            "group": str(grp_val),
        }

    if add_sequences is not None:
        conflicts = set(sequences) & set(add_sequences)
        if conflicts:
            raise ValueError(
                f"Name conflicts between peptide "
                f"sequences and additional sequences: "
                f"{sorted(conflicts)}."
            )
        sequences.update(add_sequences)

    return sequences



[docs]
def peptides_on_sequence(
    adata: ad.AnnData,
    filter_key: str,
    filter_value: str,
    group_by: str | None = None,
    ref_sequence: str | None = None,
    alt_pep_sequence_key: str | None = None,
    add_sequences: dict[str, dict] | None = None,
    allow_overlaps: bool = False,
    allow_multi_match: bool = False,
    color_scheme: ColorScheme = None,
    title: str | None = None,
    order: list[str] | None = None,
    figsize: tuple[float, float] | None = None,
    show: bool = True,
    save: str | Path | None = None,
    ax: Axes | None = None,
) -> Axes:
    """Plot peptide coverage on a reference sequence as horizontal bars.

    Filters peptides from a peptide-level :class:`~anndata.AnnData` using the
    ``filter_key`` and ``filter_value`` parameters, which are then drawn as
    broken bars on top of a grey reference bar. Extra sequences (e.g., domains
    or post-translational modification sites) can be overlaid via the
    ``add_sequences`` parameter.

    Parameters
    ----------
    adata : AnnData
        Peptide-level :class:`~anndata.AnnData`.
    filter_key : str
        Column in ``adata.var`` used to select peptides.
    filter_value : str
        Value in the ``filter_key`` column to match.
    group_by : str | None
        Column in ``adata.var`` used to assign peptides to
        colored rows. When ``None``, all peptides are placed
        in a single row labeled ``filter_value``.
    ref_sequence : str | None
        Full reference protein sequence as an amino-acid string.
    alt_pep_sequence_key : str | None
        Column in ``adata.var`` whose values are used as peptide
        amino-acid strings. When ``None``, ``adata.var_names``
        (i.e., ``peptide_id``) are used.
    add_sequences : dict[str, dict] | None
        Additional named sequences to overlay (e.g., domains).
        Each value must be a dict with a ``"group"`` key and
        either a ``"seq"`` or ``"seq_coord"`` key. Coordinates
        are 0-based throughout. ``"seq_coord"`` must be a
        half-open ``[start, end)`` tuple where ``start >= 0``
        and ``end <= len(ref_sequence)``.
    allow_overlaps : bool
        When ``False``, raise ``ValueError`` if two sequences in
        the same group overlap positionally.
    allow_multi_match : bool
        When ``True``, a peptide sequence matching the reference at
        multiple positions is shown as one bar per match, labeled
        ``"<name> (match N)"``. When ``False``, a ``ValueError``
        is raised for ambiguous matches.
    color_scheme : str | dict | list | Colormap | callable | None
        Color specification for groups.
    title : str | None
        Axes title.
    order : list[str] | None
        Explicit ordering of group rows.
    figsize : tuple[float, float] | None
        Figure dimensions ``(width, height)`` in inches.
    show : bool
        Call ``plt.show()`` at the end.
    save : str | Path | None
        Path at which to save the figure (300 dpi,
        ``bbox_inches="tight"``). When ``None``, the figure is
        not saved.
    ax : matplotlib.axes.Axes | None
        Matplotlib Axes object to plot onto. If ``None``, a new
        figure and axes are created. The function always returns
        the Axes object used for plotting.

    Returns
    -------
    matplotlib.axes.Axes
        The Axes object used for plotting.

    Raises
    ------
    ValueError
        If ``adata`` is not at peptide level.
    ValueError
        If ``ref_sequence`` is ``None``.
    ValueError
        If no peptides match ``filter_value`` in
        ``adata.var[filter_key]``.
    ValueError
        If a peptide sequence is not found in the reference.
    ValueError
        If a peptide matches the reference at multiple positions
        and ``allow_multi_match=False``.
    ValueError
        If two sequences in the same group overlap and
        ``allow_overlaps=False``.
    KeyError
        If ``filter_key``, ``alt_pep_sequence_key``, or
        ``group_by`` is not found in ``adata.var``.

    Examples
    --------
    Build a minimal peptide-level AnnData with two proteins.
    ``"ELPGW"`` belongs to P2 and is excluded by the filter.

    >>> import numpy as np
    >>> import pandas as pd
    >>> import anndata as ad
    >>> import proteopy as pr
    >>> ref = "MKTAYIAKQRQISFVKSHFSRQDEEP"
    >>> peptides = ["KTAYIAK", "QRQISFVK", "SHFSRQ", "ELPGW"]
    >>> var = pd.DataFrame(
    ...     {
    ...         "peptide_id": peptides,
    ...         "protein_id": ["P1", "P1", "P1", "P2"],
    ...         "proteoform": ["A", "A", "B", "A"],
    ...     },
    ...     index=peptides,
    ... )
    >>> obs = pd.DataFrame(
    ...     {"sample_id": ["S1", "S2"]},
    ...     index=["S1", "S2"],
    ... )
    >>> adata = ad.AnnData(
    ...     X=np.ones((2, 4)),
    ...     obs=obs,
    ...     var=var,
    ... )

    Filter to P1; group peptides by proteoform.

    >>> pr.pl.peptides_on_sequence(
    ...     adata,
    ...     filter_key="protein_id",
    ...     filter_value="P1",
    ...     ref_sequence=ref,
    ...     group_by="proteoform",
    ... )
    """
    _, level = check_proteodata(adata)
    if level != "peptide":
        raise ValueError(
            "peptides_on_sequence requires peptide-level "
            "data. The provided AnnData is at the "
            f"'{level}' level."
        )

    if ref_sequence is None:
        raise ValueError(
            "'ref_sequence' must be provided."
        )

    # Filter .var
    var = adata.var
    if filter_key not in var.columns:
        raise KeyError(
            f"Column '{filter_key}' not found in "
            f"adata.var."
        )
    mask = var[filter_key] == filter_value
    if not mask.any():
        raise ValueError(
            f"No peptides found where "
            f"adata.var['{filter_key}'] == "
            f"'{filter_value}'."
        )
    var_sub = var.loc[mask]

    pep_seqs = _extract_peptide_sequences(
        var_sub, alt_pep_sequence_key,
    )
    groups = _extract_groups(
        var_sub, group_by, filter_value,
    )
    sequences = _build_sequences_dict(
        var_sub, pep_seqs, groups, add_sequences,
    )

    # Resolve title: default to filter_value
    if title is None:
        title = str(filter_value)

    # Plot
    _ax = _plot_sequences_on_reference(
        ref_sequence=ref_sequence,
        sequences=sequences,
        color_scheme=color_scheme,
        allow_overlaps=allow_overlaps,
        allow_multi_match=allow_multi_match,
        title=title,
        order=order,
        figsize=figsize,
        ax=ax,
    )

    fig = _ax.figure

    if ax is None:
        fig.tight_layout()
    if save is not None:
        fig.savefig(save, dpi=300, bbox_inches="tight")
    if show:
        plt.show()
    return _ax




[docs]
def peptides_on_prot_sequence(
    adata: ad.AnnData,
    protein_id: str,
    group_by: str | None = None,
    ref_sequence: str | None = None,
    alt_pep_sequence_key: str | None = None,
    add_sequences: dict[str, dict] | None = None,
    allow_overlaps: bool = False,
    allow_multi_match: bool = False,
    color_scheme: ColorScheme = None,
    title: str | None = None,
    order: list[str] | None = None,
    figsize: tuple[float, float] | None = None,
    show: bool = True,
    save: str | Path | None = None,
    ax: Axes | None = None,
) -> Axes:
    """Plot peptide coverage on a protein reference sequence.

    Filters peptides belonging to the given ``protein_id`` and draws them as
    broken bars on top of a grey reference bar. Extra sequences (e.g., domains
    or post-translational modification sites) can be overlaid via the
    ``add_sequences`` parameter.

    Parameters
    ----------
    adata : AnnData
        Peptide-level :class:`~anndata.AnnData`.
    protein_id : str
        Value in ``adata.var["protein_id"]`` to select
        peptides for.
    group_by : str | None
        Column in ``adata.var`` used to assign peptides to
        colored rows. When ``None``, all peptides are placed
        in a single row labeled ``protein_id``.
    ref_sequence : str | None
        Full reference protein sequence as an amino-acid
        string.
    alt_pep_sequence_key : str | None
        Column in ``adata.var`` whose values are used as
        peptide amino-acid strings. When ``None``,
        ``adata.var_names`` (i.e., ``peptide_id``) are used.
    add_sequences : dict[str, dict] | None
        Additional named sequences to overlay (e.g.,
        domains). Each value must be a dict with a
        ``"group"`` key and either a ``"seq"`` or
        ``"seq_coord"`` key. Coordinates are 0-based
        throughout. ``"seq_coord"`` must be a half-open
        ``[start, end)`` tuple where ``start >= 0`` and
        ``end <= len(ref_sequence)``.
    allow_overlaps : bool
        When ``False``, raise ``ValueError`` if two
        sequences in the same group overlap positionally.
    allow_multi_match : bool
        When ``True``, a peptide sequence matching the
        reference at multiple positions is shown as one bar
        per match, labeled ``"<name> (match N)"``. When
        ``False``, a ``ValueError`` is raised for ambiguous
        matches.
    color_scheme : str | dict | list | Colormap | callable | None
        Color specification for groups.
    title : str | None
        Axes title.
    order : list[str] | None
        Explicit ordering of group rows.
    figsize : tuple[float, float] | None
        Figure dimensions ``(width, height)`` in inches.
    show : bool
        Call ``plt.show()`` at the end.
    save : str | Path | None
        Path at which to save the figure (300 dpi,
        ``bbox_inches="tight"``). When ``None``, the figure
        is not saved.
    ax : matplotlib.axes.Axes | None
        Matplotlib Axes object to plot onto. If ``None``, a
        new figure and axes are created. The function always
        returns the Axes object used for plotting.

    Returns
    -------
    matplotlib.axes.Axes
        The Axes object used for plotting.

    Raises
    ------
    ValueError
        If ``adata`` is not at peptide level.
    ValueError
        If ``ref_sequence`` is ``None``.
    ValueError
        If no peptides match ``protein_id`` in
        ``adata.var["protein_id"]``.
    ValueError
        If a peptide sequence is not found in the reference.
    ValueError
        If a peptide matches the reference at multiple
        positions and ``allow_multi_match=False``.
    ValueError
        If two sequences in the same group overlap and
        ``allow_overlaps=False``.
    KeyError
        If ``alt_pep_sequence_key`` or ``group_by`` is not
        found in ``adata.var``.

    Examples
    --------
    Build a minimal peptide-level AnnData with two proteins.
    ``"ELPGW"`` belongs to P2 and is excluded by the filter.

    >>> import numpy as np
    >>> import pandas as pd
    >>> import anndata as ad
    >>> import proteopy as pr
    >>> ref = "MKTAYIAKQRQISFVKSHFSRQDEEP"
    >>> peptides = [
    ...     "KTAYIAK", "QRQISFVK", "SHFSRQ", "ELPGW",
    ... ]
    >>> var = pd.DataFrame(
    ...     {
    ...         "peptide_id": peptides,
    ...         "protein_id": ["P1", "P1", "P1", "P2"],
    ...         "proteoform": ["A", "A", "B", "A"],
    ...     },
    ...     index=peptides,
    ... )
    >>> obs = pd.DataFrame(
    ...     {"sample_id": ["S1", "S2"]},
    ...     index=["S1", "S2"],
    ... )
    >>> adata = ad.AnnData(
    ...     X=np.ones((2, 4)),
    ...     obs=obs,
    ...     var=var,
    ... )

    Filter to P1; group peptides by proteoform.

    >>> pr.pl.peptides_on_prot_sequence(
    ...     adata,
    ...     protein_id="P1",
    ...     ref_sequence=ref,
    ...     group_by="proteoform",
    ... )
    """
    return peptides_on_sequence(
        adata,
        filter_key="protein_id",
        filter_value=protein_id,
        group_by=group_by,
        ref_sequence=ref_sequence,
        alt_pep_sequence_key=alt_pep_sequence_key,
        add_sequences=add_sequences,
        allow_overlaps=allow_overlaps,
        allow_multi_match=allow_multi_match,
        color_scheme=color_scheme,
        title=title,
        order=order,
        figsize=figsize,
        show=show,
        save=save,
        ax=ax,
    )