# Source code for redeem_properties

"""
redeem_properties
====================
Python bindings for the redeem-properties Rust crate, exposing peptide property
prediction models for retention time (RT), collisional cross-section (CCS), and
MS2 fragment intensities.

All three model classes delegate inference to the compiled Rust extension
(``_lib``).  The ``predict_df`` convenience methods additionally return results
as a ``pandas`` or ``polars`` DataFrame.

Quick start
-----------
>>> import redeem_properties as rp
>>>
>>> # Download pretrained models (only needed once)
>>> rp.download_pretrained_models()
>>>
>>> rt_model  = rp.RTModel.from_pretrained("rt")
>>> ccs_model = rp.CCSModel.from_pretrained("ccs")
>>> ms2_model = rp.MS2Model.from_pretrained("ms2")
>>>
>>> # numpy arrays / list[dict]
>>> rt_values   = rt_model.predict(["PEPTIDE", "SEQU[+42.0106]ENCE"])
>>> ccs_results = ccs_model.predict(["PEPTIDE"], charges=[2])
>>> ms2_results = ms2_model.predict(["PEPTIDE"], charges=[2], nces=[20])
>>>
>>> # pandas DataFrames
>>> rt_df  = rt_model.predict_df(["PEPTIDE", "SEQU[+42.0106]ENCE"])
>>> ccs_df = ccs_model.predict_df(["PEPTIDE"], charges=[2])
>>> ms2_df = ms2_model.predict_df(["PEPTIDE"], charges=[2], nces=[20])
"""

from __future__ import annotations

from importlib.metadata import PackageNotFoundError, version
from typing import Optional

from redeem_properties._lib import (  # noqa: F401  (re-exported)
    CCSModel as _CCSLib,
    MS2Model as _MS2Lib,
    RTModel as _RTLib,
    locate_pretrained as locate_pretrained,
    validate_pretrained as validate_pretrained,
    download_pretrained_models as download_pretrained_models,
    compute_precursor_mz as compute_precursor_mz,
    compute_fragment_mzs as compute_fragment_mzs,
    compute_peptide_mz_info as compute_peptide_mz_info,
    match_fragment_mzs as match_fragment_mzs,
    ccs_to_mobility as ccs_to_mobility,
)

# Public API of the package.  Every helper that is re-exported from the
# compiled extension with the explicit ``name as name`` idiom is listed here
# so that ``from redeem_properties import *`` and documentation tooling see
# the same surface as the import block above.
__all__ = [
    "__version__",
    "RTModel",
    "CCSModel",
    "MS2Model",
    "PropertyPrediction",
    "locate_pretrained",
    "validate_pretrained",
    "download_pretrained_models",
    "compute_precursor_mz",
    "compute_fragment_mzs",
    "compute_peptide_mz_info",
    "match_fragment_mzs",
    "ccs_to_mobility",
]


def _detect_version() -> str:
    """Look up the installed distribution version of this package.

    The distribution may be registered under either of two names, so both are
    tried in order.

    Returns
    -------
    str
        The version reported by ``importlib.metadata`` for the first
        distribution name that is installed, or ``"0+unknown"`` when neither
        name can be found.
    """
    candidates = ("redeem_properties", "redeem-properties-py")
    for candidate in candidates:
        try:
            found = version(candidate)
        except PackageNotFoundError:
            # Not installed under this name; try the next spelling.
            pass
        else:
            return found
    return "0+unknown"


__version__ = _detect_version()


# ---------------------------------------------------------------------------
# Helper
# ---------------------------------------------------------------------------


def _make_df(data: dict, framework: str):
    """Turn a mapping of column name -> values into a DataFrame.

    The backing library is imported lazily so that neither pandas nor polars
    is a hard dependency of the package.

    Parameters
    ----------
    data:
        Column dictionary handed unchanged to the DataFrame constructor.
    framework:
        ``'pandas'`` or ``'polars'``.

    Returns
    -------
    pandas.DataFrame or polars.DataFrame
        A frame built by the selected library.

    Raises
    ------
    ValueError
        If *framework* is neither ``'pandas'`` nor ``'polars'``.
    ImportError
        If the selected library is not installed.
    """
    # Reject unsupported names up front so the two supported branches below
    # can be written as straight-line code.
    if framework not in ("pandas", "polars"):
        raise ValueError(f"Unknown framework '{framework}'. Use 'pandas' or 'polars'.")

    if framework == "polars":
        try:
            import polars as pl
        except ImportError as exc:
            raise ImportError(
                "polars is required for predict_df(framework='polars'). "
                "Install it with: pip install redeem-properties-py[polars]"
            ) from exc
        return pl.DataFrame(data)

    try:
        import pandas as pd
    except ImportError as exc:
        raise ImportError(
            "pandas is required for predict_df(framework='pandas'). "
            "Install it with: pip install redeem-properties-py[pandas]"
        ) from exc
    return pd.DataFrame(data)


def _expand_inputs(
    peptides: list[str],
    charges: int | list[int] | None = None,
    nces: int | float | list[int] | list[float] | None = None,
    instruments: str | list[Optional[str]] | None = None,
) -> tuple[list[str], list[int] | None, list[int] | None, list[Optional[str]] | None]:
    """Align peptides, charges, NCEs and instruments to one common length.

    Parameters
    ----------
    peptides:
        Peptide sequences.  A bare string is treated as a single peptide
        rather than being iterated character by character.
    charges:
        ``None`` (no charges), a single int (repeated for every peptide), or
        a list.  A list whose length equals ``len(peptides)`` is paired 1:1;
        any other length yields the Cartesian product, with charges varying
        fastest within each peptide.
    nces:
        ``None``, a single number, a one-element list, or a list whose length
        equals the expanded length.  Every value is passed through ``int()``,
        so fractional values are truncated.
    instruments:
        ``None``, a single string, a one-element list, or a list whose length
        equals the expanded length.

    Returns
    -------
    tuple
        ``(peptides, charges, nces, instruments)``.  The peptide entry is
        always a new list; each of the other entries is either ``None`` (when
        the corresponding argument was ``None``) or a new list of the expanded
        length.  Caller-owned lists are never returned by reference.

    Raises
    ------
    TypeError
        If *charges*, *nces* or *instruments* has an unsupported type.
    ValueError
        If a list passed as *nces* or *instruments* has a length other than 1
        or the expanded length.
    """
    # A str is iterable, so list("PEPTIDE") would silently become seven
    # one-residue "peptides".  Interpret it as a single sequence instead.
    if isinstance(peptides, str):
        peptides = [peptides]

    # 1. Expand peptides and charges
    if charges is None:
        exp_pep = list(peptides)
        exp_charge = None
    elif isinstance(charges, int):
        exp_pep = list(peptides)
        exp_charge = [charges] * len(exp_pep)
    elif isinstance(charges, list):
        if len(charges) == len(peptides):
            # Equal lengths are interpreted as a 1:1 mapping.
            exp_pep = list(peptides)
            exp_charge = list(charges)
        else:
            # Cartesian product: every peptide at every charge state.
            pairs = [(pep, chg) for pep in peptides for chg in charges]
            exp_pep = [pep for pep, _ in pairs]
            exp_charge = [chg for _, chg in pairs]
    else:
        raise TypeError("charges must be an int or a list of ints")

    n = len(exp_pep)

    # 2. Broadcast nces
    exp_nces = None
    if nces is not None:
        if isinstance(nces, (int, float)):
            exp_nces = [int(nces)] * n
        elif isinstance(nces, list):
            if len(nces) == 1:
                exp_nces = [int(nces[0])] * n
            elif len(nces) == n:
                exp_nces = [int(x) for x in nces]
            else:
                raise ValueError(
                    f"nces must be a single value, a list of length 1, or match the expanded length {n}"
                )
        else:
            raise TypeError("nces must be an int, float, or list")

    # 3. Broadcast instruments
    exp_inst = None
    if instruments is not None:
        if isinstance(instruments, str):
            exp_inst = [instruments] * n
        elif isinstance(instruments, list):
            if len(instruments) == 1:
                exp_inst = [instruments[0]] * n
            elif len(instruments) == n:
                # Copy so the result never aliases the caller's list, in line
                # with how peptides, charges and nces are handled above.
                exp_inst = list(instruments)
            else:
                raise ValueError(
                    f"instruments must be a single value, a list of length 1, or match the expanded length {n}"
                )
        else:
            raise TypeError("instruments must be a string or list")

    return exp_pep, exp_charge, exp_nces, exp_inst


# ---------------------------------------------------------------------------
# RTModel
# ---------------------------------------------------------------------------


class RTModel:
    """Retention time prediction model.

    Parameters
    ----------
    model_path:
        Path to the ``.pth`` model weights file.
    arch:
        Model architecture string (e.g. ``"rt_cnn_lstm"``).
    constants_path:
        Optional path to the ``.yaml`` constants file.
    use_cuda:
        Whether to run inference on GPU (requires CUDA build). Default ``False``.
    """

    def __init__(
        self,
        model_path: str,
        arch: str,
        constants_path: Optional[str] = None,
        use_cuda: bool = False,
    ) -> None:
        self._inner = _RTLib(
            model_path, arch, constants_path=constants_path, use_cuda=use_cuda
        )
        # preserve construction metadata for nicer repr/str
        self._model_path = model_path
        self._arch = arch

    @classmethod
    def from_pretrained(cls, name: str, use_cuda: bool = False) -> "RTModel":
        """Load an RTModel from the shipped pretrained weights.

        Accepted *name* values (case-insensitive): ``"rt"``,
        ``"alphapeptdeep-rt-cnn-lstm"``, ``"redeem-rt-cnn-tf"``.

        Parameters
        ----------
        name:
            Pretrained model identifier.
        use_cuda:
            Whether to run inference on GPU. Default ``False``.
        """
        # Bypass __init__: the extension builds the inner model itself.
        obj = cls.__new__(cls)
        obj._inner = _RTLib.from_pretrained(name, use_cuda=use_cuda)
        # remember the requested pretrained name and located path (best-effort)
        obj._requested_name = name
        try:
            obj._model_path = locate_pretrained(name)
        except Exception:
            obj._model_path = None
        obj._arch = None
        return obj

    def __repr__(self) -> str:
        """Return ``<RTModel arch=... params=... path=...>``.

        Never raises: if the parameter count cannot be obtained it is shown
        as ``unknown``.
        """
        arch = getattr(self, "_arch", None) or getattr(self, "_requested_name", None)
        path = getattr(self, "_model_path", None)
        try:
            param_count = self.param_count()
        except Exception:
            # repr() is used while debugging and logging; a failing inner
            # model must not turn printing the wrapper into a crash.
            param_count = "unknown"
        return f"<RTModel arch={arch!r} params={param_count} path={path!r}>"

    def __str__(self) -> str:
        return self.__repr__()

    def predict(self, peptides: list[str]):
        """Predict retention times for a list of peptides.

        Peptides may contain inline modification annotations (``[+X.X]``
        mass-shift or ``(UniMod:N)`` notation).

        Parameters
        ----------
        peptides:
            List of peptide sequences.

        Returns
        -------
        numpy.ndarray
            1-D float32 array of predicted RT values, one per peptide.
        """
        return self._inner.predict(peptides)

    def param_count(self) -> int:
        """Return total number of parameters in the loaded model (if available).

        This delegates to the compiled Rust extension when present.

        Raises
        ------
        AttributeError
            If the underlying extension does not expose ``param_count``.
        RuntimeError
            If the underlying call fails; the original exception is attached
            as ``__cause__``.
        """
        if not hasattr(self._inner, "param_count"):
            raise AttributeError("underlying model does not expose 'param_count'")
        try:
            return self._inner.param_count()
        except Exception as e:
            raise RuntimeError(
                f"failed to get param_count from inner model: {e}"
            ) from e

    def summary(self) -> str:
        """Return a compact/detailed model summary string delegated to the Rust extension.

        Prefers the inner model's ``summary_pretty`` output, then its
        ``summary`` output.  If neither is available or both fail, falls back
        to ``"<arch> params=<n>"`` or, when the parameter count cannot be
        obtained either, just ``"<arch>"``.
        """
        # Deliberately best-effort: a failing summary method is skipped so
        # the next, simpler representation can be tried.
        for attr in ("summary_pretty", "summary"):
            if hasattr(self._inner, attr):
                try:
                    return getattr(self._inner, attr)()
                except Exception:
                    pass

        # Fallback: try a compact repr using arch/requested name
        arch = getattr(self, "_arch", None) or getattr(self, "_requested_name", None)
        try:
            # if param_count available, include it
            pc = self.param_count() if hasattr(self._inner, "param_count") else None
            if pc is not None:
                return f"{arch} params={pc}"
        except Exception:
            pass
        return f"{arch}"

    def predict_df(self, peptides: list[str], framework: str = "pandas"):
        """Predict retention times and return the result as a DataFrame.

        Parameters
        ----------
        peptides:
            List of peptide sequences (inline modifications supported).
        framework:
            ``'pandas'`` (default) or ``'polars'``.

        Returns
        -------
        pandas.DataFrame or polars.DataFrame
            Columns: ``peptide`` (str), ``rt`` (float32).
        """
        rt_values = self.predict(peptides)
        return _make_df({"peptide": peptides, "rt": rt_values}, framework)
# ---------------------------------------------------------------------------
# CCSModel
# ---------------------------------------------------------------------------
class CCSModel:
    """Collisional cross-section prediction model.

    Parameters
    ----------
    model_path:
        Path to the ``.pth`` model weights file.
    arch:
        Model architecture string (e.g. ``"ccs_cnn_lstm"``).
    constants_path:
        Path to the ``.yaml`` constants file.  Described as required, although
        the signature defaults it to ``None``; the value is forwarded to the
        extension unchanged.
    use_cuda:
        Whether to run inference on GPU. Default ``False``.
    """

    def __init__(
        self,
        model_path: str,
        arch: str,
        constants_path: Optional[str] = None,
        use_cuda: bool = False,
    ) -> None:
        self._inner = _CCSLib(model_path, arch, constants_path, use_cuda=use_cuda)
        # preserve construction metadata for nicer repr/str
        self._model_path = model_path
        self._arch = arch

    @classmethod
    def from_pretrained(cls, name: str, use_cuda: bool = False) -> "CCSModel":
        """Load a CCSModel from the shipped pretrained weights.

        Accepted *name* values (case-insensitive): ``"ccs"``,
        ``"alphapeptdeep-ccs-cnn-lstm"``, ``"redeem-ccs-cnn-tf"``.

        Parameters
        ----------
        name:
            Pretrained model identifier.
        use_cuda:
            Whether to run inference on GPU. Default ``False``.
        """
        # Bypass __init__: the extension builds the inner model itself.
        obj = cls.__new__(cls)
        obj._inner = _CCSLib.from_pretrained(name, use_cuda=use_cuda)
        obj._requested_name = name
        # Locating the weights file is best-effort metadata for repr only.
        try:
            obj._model_path = locate_pretrained(name)
        except Exception:
            obj._model_path = None
        obj._arch = None
        return obj

    def __repr__(self) -> str:
        """Return ``<CCSModel arch=... params=... path=...>``.

        Never raises: if the parameter count cannot be obtained it is shown
        as ``unknown``.
        """
        arch = getattr(self, "_arch", None) or getattr(self, "_requested_name", None)
        path = getattr(self, "_model_path", None)
        try:
            param_count = self.param_count()
        except Exception:
            # repr() is used while debugging and logging; a failing inner
            # model must not turn printing the wrapper into a crash.
            param_count = "unknown"
        return f"<CCSModel arch={arch!r} params={param_count} path={path!r}>"

    def __str__(self) -> str:
        return self.__repr__()

    def predict(self, peptides: list[str], charges: int | list[int]):
        """Predict CCS values for a list of peptides.

        Parameters
        ----------
        peptides:
            List of peptide sequences (inline modifications supported).
        charges:
            Charge state per peptide. If a single integer is provided, it is
            broadcast to all peptides. If a list of charges is provided and
            its length differs from the number of peptides, a Cartesian
            product is performed (predicting each peptide at each charge
            state).

        Returns
        -------
        list[dict]
            One dict per peptide with keys:

            * ``"ccs"`` – predicted CCS value (Ų).
            * ``"charge"`` – charge state used for the prediction.
        """
        peptides, exp_charges, _, _ = _expand_inputs(peptides, charges=charges)
        return self._inner.predict(peptides, exp_charges)

    def param_count(self) -> int:
        """Return total number of parameters in the loaded model (if available).

        Raises
        ------
        AttributeError
            If the underlying extension does not expose ``param_count``.
        RuntimeError
            If the underlying call fails; the original exception is attached
            as ``__cause__``.
        """
        if not hasattr(self._inner, "param_count"):
            raise AttributeError("underlying model does not expose 'param_count'")
        try:
            return self._inner.param_count()
        except Exception as e:
            raise RuntimeError(
                f"failed to get param_count from inner model: {e}"
            ) from e

    def summary(self) -> str:
        """Return a model summary string delegated to the Rust extension.

        Prefers the inner model's ``summary_pretty`` output, then its
        ``summary`` output.  If neither is available or both fail, falls back
        to ``"<arch> params=<n>"`` or, when the parameter count cannot be
        obtained either, just ``"<arch>"``.
        """
        # Deliberately best-effort: a failing summary method is skipped so
        # the next, simpler representation can be tried.
        for attr in ("summary_pretty", "summary"):
            if hasattr(self._inner, attr):
                try:
                    return getattr(self._inner, attr)()
                except Exception:
                    pass

        # Fallback: compact repr using arch/requested name
        arch = getattr(self, "_arch", None) or getattr(self, "_requested_name", None)
        try:
            pc = self.param_count() if hasattr(self._inner, "param_count") else None
            if pc is not None:
                return f"{arch} params={pc}"
        except Exception:
            pass
        return f"{arch}"

    def predict_df(
        self,
        peptides: list[str],
        charges: int | list[int],
        annotate_mobility: bool = False,
        framework: str = "pandas",
    ):
        """Predict CCS values and return the result as a DataFrame.

        Parameters
        ----------
        peptides:
            List of peptide sequences (inline modifications supported).
        charges:
            Charge state per peptide. If a single integer is provided, it is
            broadcast to all peptides. If a list of charges is provided and
            its length differs from the number of peptides, a Cartesian
            product is performed (predicting each peptide at each charge
            state).
        annotate_mobility:
            If ``True``, compute and append an ``ion_mobility`` column
            converted from the predicted CCS value. Rows for which the
            conversion fails hold ``NaN``. Default ``False``.
        framework:
            ``'pandas'`` (default) or ``'polars'``.

        Returns
        -------
        pandas.DataFrame or polars.DataFrame
            Columns: ``peptide`` (str), ``ccs`` (float32), ``charge`` (int),
            and optionally ``ion_mobility`` (float).
        """
        # Expand here as well so the ``peptide`` column lines up row-for-row
        # with the results; predict() sees equal-length inputs and therefore
        # treats them as a 1:1 mapping.
        peptides, exp_charges, _, _ = _expand_inputs(peptides, charges=charges)
        results = self.predict(peptides, exp_charges)  # type: ignore

        data = {
            "peptide": peptides,
            "ccs": [r["ccs"] for r in results],
            "charge": [r["charge"] for r in results],
        }

        if annotate_mobility:
            mobility_col = []
            for pep, ch, res in zip(peptides, exp_charges, results):  # type: ignore
                # Best-effort per row: one bad peptide must not discard the
                # whole batch, so failures are recorded as NaN.
                try:
                    mz = compute_precursor_mz(pep, ch)
                    mob = ccs_to_mobility(res["ccs"], float(ch), mz)
                    mobility_col.append(mob)
                except Exception:
                    mobility_col.append(float("nan"))
            data["ion_mobility"] = mobility_col

        return _make_df(data, framework)
# ---------------------------------------------------------------------------
# MS2Model
# ---------------------------------------------------------------------------
class MS2Model:
    """MS2 fragment intensity prediction model.

    Parameters
    ----------
    model_path:
        Path to the ``.pth`` model weights file.
    arch:
        Model architecture string (e.g. ``"ms2_bert"``).
    constants_path:
        Path to the ``.yaml`` constants file.  Described as required, although
        the signature defaults it to ``None``; the value is forwarded to the
        extension unchanged.
    use_cuda:
        Whether to run inference on GPU. Default ``False``.
    """

    def __init__(
        self,
        model_path: str,
        arch: str,
        constants_path: Optional[str] = None,
        use_cuda: bool = False,
    ) -> None:
        self._inner = _MS2Lib(model_path, arch, constants_path, use_cuda=use_cuda)
        # preserve construction metadata for nicer repr/str
        self._model_path = model_path
        self._arch = arch

    @classmethod
    def from_pretrained(cls, name: str, use_cuda: bool = False) -> "MS2Model":
        """Load an MS2Model from the shipped pretrained weights.

        Accepted *name* values (case-insensitive): ``"ms2"``,
        ``"alphapeptdeep-ms2-bert"``.

        Parameters
        ----------
        name:
            Pretrained model identifier.
        use_cuda:
            Whether to run inference on GPU. Default ``False``.
        """
        # Bypass __init__: the extension builds the inner model itself.
        obj = cls.__new__(cls)
        obj._inner = _MS2Lib.from_pretrained(name, use_cuda=use_cuda)
        obj._requested_name = name
        # Locating the weights file is best-effort metadata for repr only.
        try:
            obj._model_path = locate_pretrained(name)
        except Exception:
            obj._model_path = None
        obj._arch = None
        return obj

    def __repr__(self) -> str:
        """Return ``<MS2Model arch=... params=... path=...>``.

        Never raises: if the parameter count cannot be obtained it is shown
        as ``unknown``.
        """
        arch = getattr(self, "_arch", None) or getattr(self, "_requested_name", None)
        path = getattr(self, "_model_path", None)
        try:
            param_count = (
                self.param_count() if hasattr(self._inner, "param_count") else "unknown"
            )
        except Exception:
            # repr() is used while debugging and logging; a failing inner
            # model must not turn printing the wrapper into a crash.
            param_count = "unknown"
        return f"<MS2Model arch={arch!r} params={param_count} path={path!r}>"

    def __str__(self) -> str:
        return self.__repr__()
[docs] def predict( self, peptides: list[str], charges: int | list[int], nces: int | float | list[int] | list[float], instruments: str | list[Optional[str]] | None = None, multiplier: float = 10_000.0, ): """Predict MS2 fragment intensities for a list of peptides. Parameters ---------- peptides: List of peptide sequences (inline modifications supported). charges: Charge state per peptide. If a single integer is provided, it is broadcast to all peptides. If a list of charges is provided and its length differs from the number of peptides, a Cartesian product is performed (predicting each peptide at each charge state). nces: Normalized collision energy per peptide. Can be a single value (broadcast to all) or a list matching the expanded length. instruments: Instrument name per peptide (optional). Can be a single string (broadcast to all) or a list matching the expanded length. multiplier: Scalar to multiply predicted intensities by (default 10_000.0). Use e.g. ``10000.0`` to scale normalized outputs into typical intensity ranges. Returns ------- list[dict] One dict per peptide with keys: * ``"intensities"`` – 2-D float32 array ``(n_positions, 8)``. * ``"ion_types"`` – list of 8 ion-type strings. * ``"ion_charges"`` – list of 8 fragment charge integers. * ``"b_ordinals"`` – 1-D int array ``[1, …, n_positions]``. * ``"y_ordinals"`` – 1-D int array ``[n_positions, …, 1]``. """ peptides, exp_charges, exp_nces, exp_inst = _expand_inputs( peptides, charges=charges, nces=nces, instruments=instruments ) results = self._inner.predict( peptides, exp_charges, exp_nces, instruments=exp_inst ) # Optionally scale predicted intensities by a multiplier before returning. 
if multiplier is not None and multiplier != 1.0: try: for r in results: # r["intensities"] is a numpy.ndarray; multiply in-place for efficiency r_int = r.get("intensities") if r_int is not None: r_int *= multiplier except Exception: # If in-place scaling fails for some reason, fall back to non-mutating scaling scaled = [] for r in results: r2 = dict(r) arr = r2.get("intensities") if arr is not None: r2["intensities"] = arr * multiplier scaled.append(r2) results = scaled return results
[docs] def param_count(self) -> int: """Return total number of parameters in the loaded model (if available).""" if hasattr(self._inner, "param_count"): try: return self._inner.param_count() except Exception as e: raise RuntimeError(f"failed to get param_count from inner model: {e}") raise AttributeError("underlying model does not expose 'param_count'")
[docs] def summary(self) -> str: """Return a model summary string delegated to the Rust extension. Prefers the pretty hierarchical summary when available. """ # Prefer pretty hierarchical summary if available if hasattr(self._inner, "summary_pretty"): try: return self._inner.summary_pretty() except Exception: pass if hasattr(self._inner, "summary"): try: return self._inner.summary() except Exception: pass # Fallback: compact repr using arch/requested name arch = getattr(self, "_arch", None) or getattr(self, "_requested_name", None) try: pc = self.param_count() if hasattr(self._inner, "param_count") else None if pc is not None: return f"{arch} params={pc}" except Exception: pass return f"{arch}"
[docs] def predict_df( self, peptides: list[str], charges: int | list[int], nces: int | float | list[int] | list[float], instruments: str | list[Optional[str]] | None = None, multiplier: float = 10_000.0, exclude_zeros: bool = True, annotate_mz: bool = False, framework: str = "pandas", ): """Predict MS2 fragment intensities and return a long-format DataFrame. Each row represents one (peptide, ion_type, fragment_charge, ordinal) combination. Parameters ---------- peptides: List of peptide sequences (inline modifications supported). charges: Precursor charge state per peptide. If a single integer is provided, it is broadcast to all peptides. If a list of charges is provided and its length differs from the number of peptides, a Cartesian product is performed (predicting each peptide at each charge state). nces: Normalized collision energy per peptide. Can be a single value (broadcast to all) or a list matching the expanded length. instruments: Instrument name per peptide (optional). Can be a single string (broadcast to all) or a list matching the expanded length. multiplier: Scalar to multiply predicted intensities by (default 10_000.0). Use e.g. ``10000.0`` to scale normalized outputs into typical intensity ranges. exclude_zeros: If True, exclude rows where all predicted intensities are zero. annotate_mz: If ``True``, append a ``mz`` column with the theoretical monoisotopic m/z for each fragment ion (computed via *rustyms*). Neutral-loss ions (``b_nl``, ``y_nl``) receive ``NaN``. Default ``False``. framework: ``'pandas'`` (default) or ``'polars'``. Returns ------- pandas.DataFrame or polars.DataFrame Columns: ``peptide``, ``ion_type``, ``fragment_charge``, ``ordinal``, ``intensity``, and optionally ``mz``. Example ------- >>> df = ms2_model.predict_df( ... ["AGHCEWQMKYR"], ... charges=[2], nces=[20], instruments=["QE"], ... ) >>> df.head() peptide ion_type fragment_charge ordinal intensity 0 AGHCEWQMKYR b 1 1 0.123 1 AGHCEWQMKYR b 2 1 0.045 ... 
""" peptides, exp_charges, exp_nces, exp_inst = _expand_inputs( peptides, charges=charges, nces=nces, instruments=instruments ) results = self.predict( peptides, exp_charges, exp_nces, instruments=exp_inst, multiplier=multiplier, # type: ignore ) b_ion_types = {"b", "b_nl"} pep_col: list[str] = [] ion_type_col: list[str] = [] frag_charge_col: list[int] = [] ordinal_col: list[int] = [] intensity_col: list[float] = [] mz_col: list[float] = [] precursor_mz_col: list[float] = [] for pep, charge, res in zip(peptides, exp_charges, results): # type: ignore intensities = res["intensities"] ion_types = res["ion_types"] frag_charges = res["ion_charges"] b_ords = res["b_ordinals"] y_ords = res["y_ordinals"] n_pos, n_types = intensities.shape # Pre-compute theoretical fragment m/z for this peptide (if needed) frag_mz_lookup: dict[tuple[str, int, int], float] | None = None _precursor_mz = float("nan") if annotate_mz: max_frag_charge = max(int(fc) for fc in frag_charges) try: _precursor_mz = compute_precursor_mz(pep, charge) frag_info = compute_fragment_mzs(pep, max_frag_charge) frag_mz_lookup = { (t, c, o): m for t, c, o, m in zip( frag_info["ion_types"], frag_info["charges"], frag_info["ordinals"], frag_info["mzs"], ) } except Exception: frag_mz_lookup = {} for r in range(n_pos): for c in range(n_types): t = ion_types[c] ordinal = int(b_ords[r]) if t in b_ion_types else int(y_ords[r]) val = float(intensities[r, c]) # If requested, skip individual ion rows with zero intensity. 
if exclude_zeros and val == 0.0: continue pep_col.append(pep) ion_type_col.append(t) frag_charge_col.append(int(frag_charges[c])) ordinal_col.append(ordinal) intensity_col.append(val) if annotate_mz: precursor_mz_col.append(_precursor_mz) if frag_mz_lookup is not None: # Strip _nl suffix for lookup; NL ions won't match → NaN base_type = t.replace("_nl", "") mz_col.append( frag_mz_lookup.get( (base_type, int(frag_charges[c]), ordinal), float("nan"), ) ) else: mz_col.append(float("nan")) data: dict = { "peptide": pep_col, "ion_type": ion_type_col, "fragment_charge": frag_charge_col, "ordinal": ordinal_col, "intensity": intensity_col, } if annotate_mz: data["precursor_mz"] = precursor_mz_col data["mz"] = mz_col return _make_df(data, framework)
# --------------------------------------------------------------------------- # PropertyPrediction – unified RT + CCS + MS2 predictor # ---------------------------------------------------------------------------
class PropertyPrediction:
    """Unified peptide property predictor combining RT, CCS, and MS2 models.

    Each model is **optional**.  When a model is ``None`` its columns are
    omitted from the output.  By default the constructor loads the shipped
    pretrained weights for all three models; pass ``predict_rt=False``,
    ``predict_ccs=False``, or ``predict_ms2=False`` to skip a model entirely.
    If loading a pretrained default fails, a ``RuntimeWarning`` is emitted
    and that model is disabled (set to ``None``).

    Parameters
    ----------
    rt_model:
        An :class:`RTModel` instance, or ``None`` to load the pretrained
        default.  Ignored when *predict_rt* is ``False``.
    ccs_model:
        A :class:`CCSModel` instance, or ``None`` to load the pretrained
        default.  Ignored when *predict_ccs* is ``False``.
    ms2_model:
        An :class:`MS2Model` instance, or ``None`` to load the pretrained
        default.  Ignored when *predict_ms2* is ``False``.
    predict_rt:
        Whether to include retention-time predictions.  Default ``True``.
    predict_ccs:
        Whether to include CCS predictions.  Default ``True``.
    predict_ms2:
        Whether to include MS2 fragment-intensity predictions.  Default
        ``True``.
    use_cuda:
        Forwarded to ``from_pretrained`` when constructing default models.
        Default ``False``.

    Examples
    --------
    >>> import redeem_properties as rp
    >>> prop = rp.PropertyPrediction()          # all three pretrained models
    >>> df = prop.predict_df(
    ...     ["PEPTIDE", "AGHCEWQMKYR"],
    ...     charges=[2, 2], nces=[20, 20], instruments=["QE", "QE"],
    ... )
    >>> df.columns.tolist()
    ['peptide', 'charge', 'nce', 'instrument', 'rt', 'ccs', 'precursor_mz',
     'ion_type', 'fragment_charge', 'ordinal', 'intensity', 'mz']

    Only RT + CCS (skip MS2):

    >>> prop = rp.PropertyPrediction(predict_ms2=False)
    >>> df = prop.predict_df(["PEPTIDE"], charges=[2])
    """

    def __init__(
        self,
        rt_model: Optional[RTModel] = None,
        ccs_model: Optional[CCSModel] = None,
        ms2_model: Optional[MS2Model] = None,
        *,
        predict_rt: bool = True,
        predict_ccs: bool = True,
        predict_ms2: bool = True,
        use_cuda: bool = False,
    ) -> None:
        # The loaders are lambdas so that from_pretrained is only evaluated
        # (inside the helper's try block) when a default model is needed.
        self.rt_model: Optional[RTModel] = self._resolve_model(
            predict_rt,
            rt_model,
            "RT",
            lambda: RTModel.from_pretrained("rt", use_cuda=use_cuda),
        )
        self.ccs_model: Optional[CCSModel] = self._resolve_model(
            predict_ccs,
            ccs_model,
            "CCS",
            lambda: CCSModel.from_pretrained("ccs", use_cuda=use_cuda),
        )
        self.ms2_model: Optional[MS2Model] = self._resolve_model(
            predict_ms2,
            ms2_model,
            "MS2",
            lambda: MS2Model.from_pretrained("ms2", use_cuda=use_cuda),
        )

    @staticmethod
    def _resolve_model(enabled: bool, provided, label: str, load_default):
        """Pick the model for one property: disabled, user-supplied, or default."""
        if not enabled:
            return None
        if provided is not None:
            return provided
        try:
            return load_default()
        except Exception as exc:
            # Best-effort: a missing pretrained model disables that property
            # rather than failing construction, but the user is told why the
            # corresponding columns will be absent.
            import warnings

            warnings.warn(
                f"could not load pretrained {label} model; "
                f"{label} predictions are disabled: {exc}",
                RuntimeWarning,
                stacklevel=3,
            )
            return None

    # -----------------------------------------------------------------

    @staticmethod
    def _describe_model(label: str, model) -> str:
        """Format one ``label=arch params=N path=P`` fragment of the repr."""
        arch = getattr(model, "_arch", None) or getattr(
            model, "_requested_name", None
        )
        # A repr must never raise: any failure while counting parameters is
        # reported as "unknown".
        try:
            if hasattr(model._inner, "param_count"):
                params = model.param_count()
            else:
                params = "unknown"
        except Exception:
            params = "unknown"
        path = getattr(model, "_model_path", None)
        return f"\n{label}={arch!r} params={params} path={path!r}"

    def __repr__(self) -> str:
        labelled = (
            ("rt", self.rt_model),
            ("ccs", self.ccs_model),
            ("ms2", self.ms2_model),
        )
        parts = [
            self._describe_model(label, model)
            for label, model in labelled
            if model is not None
        ]
        return f"<PropertyPrediction {' '.join(parts)}\n>"

    def __str__(self) -> str:
        return self.__repr__()

    # -----------------------------------------------------------------

    def _run_models(self, peptides, exp_charges, exp_nces, exp_inst, multiplier) -> dict:
        """Run every enabled model on already-expanded inputs.

        Returns a dict with the keys ``"rt"``, ``"ccs"`` and ``"ms2"`` for
        the models that are enabled.  Raises ``ValueError`` when an enabled
        model is missing a required input.
        """
        out: dict = {}
        if self.rt_model is not None:
            out["rt"] = self.rt_model.predict(peptides)

        if self.ccs_model is not None:
            if exp_charges is None:
                raise ValueError("charges are required for CCS prediction")
            out["ccs"] = self.ccs_model.predict(peptides, exp_charges)

        if self.ms2_model is not None:
            if exp_charges is None:
                raise ValueError("charges are required for MS2 prediction")
            if exp_nces is None:
                raise ValueError("nces are required for MS2 prediction")
            out["ms2"] = self.ms2_model.predict(
                peptides,
                exp_charges,
                exp_nces,
                instruments=exp_inst,
                multiplier=multiplier,
            )
        return out

    def predict(
        self,
        peptides: list[str],
        charges: int | list[int] | None = None,
        nces: int | float | list[int] | list[float] | None = None,
        instruments: str | list[Optional[str]] | None = None,
        multiplier: float = 10_000.0,
    ) -> dict:
        """Run enabled models and return raw results in a dict.

        Parameters
        ----------
        peptides:
            List of peptide sequences (inline modifications supported).
        charges:
            Charge state per peptide (required for CCS and MS2).  If a
            single integer is provided, it is broadcast to all peptides.
            If a list of charges is provided and its length differs from
            the number of peptides, a Cartesian product is performed.
        nces:
            Normalized collision energy per peptide (required for MS2).
            Can be a single value (broadcast to all) or a list matching the
            expanded length.
        instruments:
            Instrument name per peptide (optional, used by MS2).  Can be a
            single string (broadcast to all) or a list matching the expanded
            length.
        multiplier:
            Scalar applied to MS2 predicted intensities (default 10 000).

        Returns
        -------
        dict
            Keys that may be present: ``"rt"``, ``"ccs"`` (list[dict]),
            ``"ms2"`` (list[dict]) -- one per enabled model.

        Raises
        ------
        ValueError
            If CCS or MS2 is enabled without *charges*, or MS2 is enabled
            without *nces*.
        """
        peptides, exp_charges, exp_nces, exp_inst = _expand_inputs(
            peptides, charges=charges, nces=nces, instruments=instruments
        )
        return self._run_models(peptides, exp_charges, exp_nces, exp_inst, multiplier)

    # -----------------------------------------------------------------

    @staticmethod
    def _precursor_mz_or_nan(peptide, charge) -> float:
        """Precursor m/z for one peptide, or NaN if it cannot be computed."""
        try:
            return compute_precursor_mz(peptide, charge)
        except Exception:
            return float("nan")

    @staticmethod
    def _mobility_or_nan(ccs, charge, precursor_mz) -> float:
        """Ion mobility from CCS, or NaN if the conversion is not possible."""
        if precursor_mz != precursor_mz:  # NaN: precursor m/z unavailable
            return float("nan")
        try:
            return ccs_to_mobility(ccs, float(charge), precursor_mz)
        except Exception:
            return float("nan")

    @staticmethod
    def _fragment_mz_lookup(peptide, frag_charges) -> dict:
        """Map ``(ion_type, charge, ordinal)`` to fragment m/z; ``{}`` on failure."""
        try:
            max_frag_charge = max(int(fc) for fc in frag_charges)
            frag_info = compute_fragment_mzs(peptide, max_frag_charge)
            return {
                (t, c, o): m
                for t, c, o, m in zip(
                    frag_info["ion_types"],
                    frag_info["charges"],
                    frag_info["ordinals"],
                    frag_info["mzs"],
                )
            }
        except Exception:
            return {}

    def _fragment_columns(
        self,
        peptides,
        exp_charges,
        exp_nces,
        exp_inst,
        rt_values,
        ccs_values,
        ms2_results,
        exclude_zeros: bool,
        annotate_mz: bool,
        annotate_mobility: bool,
    ) -> dict:
        """Build long-format columns: one row per retained fragment ion.

        Only called when MS2 ran, which guarantees that *exp_charges* and
        *exp_nces* are not ``None``.
        """
        nan = float("nan")
        b_ion_types = {"b", "b_nl"}

        pep_col: list[str] = []
        charge_col: list[int] = []
        nce_col: list = []
        instrument_col: list[Optional[str]] = []
        rt_col: list[float] = []
        ccs_col: list[float] = []
        mobility_col: list[float] = []
        precursor_mz_col: list[float] = []
        ion_type_col: list[str] = []
        frag_charge_col: list[int] = []
        ordinal_col: list[int] = []
        intensity_col: list[float] = []
        mz_col: list[float] = []

        for idx, (pep, res) in enumerate(zip(peptides, ms2_results)):
            intensities = res["intensities"]
            ion_types = res["ion_types"]
            frag_charges = res["ion_charges"]
            b_ords = res["b_ordinals"]
            y_ords = res["y_ordinals"]
            n_pos, n_types = intensities.shape

            # Scalar values repeated on every fragment row of this peptide.
            charge = exp_charges[idx]
            nce = exp_nces[idx]
            instrument = exp_inst[idx] if exp_inst is not None else None
            rt = float(rt_values[idx]) if rt_values is not None else nan
            ccs = float(ccs_values[idx]) if ccs_values is not None else nan

            precursor_mz = nan
            if annotate_mz or annotate_mobility:
                precursor_mz = self._precursor_mz_or_nan(pep, charge)

            frag_mz_lookup: dict = {}
            if annotate_mz:
                frag_mz_lookup = self._fragment_mz_lookup(pep, frag_charges)

            ion_mobility = nan
            if annotate_mobility and ccs_values is not None:
                ion_mobility = self._mobility_or_nan(ccs, charge, precursor_mz)

            # Per-column metadata does not depend on the row; resolve it once.
            column_meta = [
                (ion_types[c], int(frag_charges[c])) for c in range(n_types)
            ]

            for r in range(n_pos):
                for c, (ion_type, frag_charge) in enumerate(column_meta):
                    val = float(intensities[r, c])
                    if exclude_zeros and val == 0.0:
                        continue
                    if ion_type in b_ion_types:
                        ordinal = int(b_ords[r])
                    else:
                        ordinal = int(y_ords[r])

                    pep_col.append(pep)
                    charge_col.append(charge)
                    nce_col.append(nce)
                    instrument_col.append(instrument)
                    rt_col.append(rt)
                    ccs_col.append(ccs)
                    if annotate_mobility:
                        mobility_col.append(ion_mobility)
                    ion_type_col.append(ion_type)
                    frag_charge_col.append(frag_charge)
                    ordinal_col.append(ordinal)
                    intensity_col.append(val)
                    if annotate_mz:
                        precursor_mz_col.append(precursor_mz)
                        # "_nl" is stripped, so neutral-loss ion types are
                        # looked up under their base ion type.
                        base_type = ion_type.replace("_nl", "")
                        mz_col.append(
                            frag_mz_lookup.get((base_type, frag_charge, ordinal), nan)
                        )

        # Insertion order defines the column order of the DataFrame.
        data: dict = {"peptide": pep_col, "charge": charge_col, "nce": nce_col}
        if exp_inst is not None:
            data["instrument"] = instrument_col
        if rt_values is not None:
            data["rt"] = rt_col
        if ccs_values is not None:
            data["ccs"] = ccs_col
        if annotate_mobility:
            data["ion_mobility"] = mobility_col
        if annotate_mz:
            data["precursor_mz"] = precursor_mz_col
        data["ion_type"] = ion_type_col
        data["fragment_charge"] = frag_charge_col
        data["ordinal"] = ordinal_col
        data["intensity"] = intensity_col
        if annotate_mz:
            data["mz"] = mz_col
        return data

    def _scalar_columns(
        self,
        peptides,
        exp_charges,
        rt_values,
        ccs_values,
        annotate_mz: bool,
        annotate_mobility: bool,
    ) -> dict:
        """Build one-row-per-peptide columns for the case where MS2 is disabled."""
        data: dict = {"peptide": list(peptides)}
        if exp_charges is not None:
            data["charge"] = list(exp_charges)
        if rt_values is not None:
            data["rt"] = [float(v) for v in rt_values]
        if ccs_values is not None:
            data["ccs"] = ccs_values

        if exp_charges is None:
            # Both m/z and mobility need a charge state.
            return data

        want_mobility = annotate_mobility and ccs_values is not None
        if not (want_mobility or annotate_mz):
            return data

        # Precursor m/z feeds both optional columns; compute it once.
        precursor = [
            self._precursor_mz_or_nan(pep, ch)
            for pep, ch in zip(peptides, exp_charges)
        ]
        if want_mobility:
            data["ion_mobility"] = [
                self._mobility_or_nan(ccs, ch, mz)
                for ccs, ch, mz in zip(ccs_values, exp_charges, precursor)
            ]
        if annotate_mz:
            data["precursor_mz"] = precursor
        return data

    def predict_df(
        self,
        peptides: list[str],
        charges: int | list[int] | None = None,
        nces: int | float | list[int] | list[float] | None = None,
        instruments: str | list[Optional[str]] | None = None,
        multiplier: float = 10_000.0,
        exclude_zeros: bool = True,
        annotate_mz: bool = True,
        annotate_mobility: bool = False,
        framework: str = "pandas",
    ):
        """Predict all enabled properties and return a single long-format DataFrame.

        When MS2 is enabled every fragment row is emitted; the scalar RT and
        CCS values are broadcast (repeated) across those rows so that each
        row is fully self-contained.

        When MS2 is **disabled** the DataFrame contains one row per peptide
        with only the scalar columns that are enabled.

        Parameters
        ----------
        peptides:
            List of peptide sequences (inline modifications supported).
        charges:
            Charge state per peptide (required for CCS and MS2).  If a
            single integer is provided, it is broadcast to all peptides.
            If a list of charges is provided and its length differs from
            the number of peptides, a Cartesian product is performed.
        nces:
            Normalized collision energy per peptide (required for MS2).
            Can be a single value (broadcast to all) or a list matching the
            expanded length.
        instruments:
            Instrument name per peptide (optional, used by MS2).  Can be a
            single string (broadcast to all) or a list matching the expanded
            length.
        multiplier:
            Scalar applied to MS2 predicted intensities (default 10 000).
        exclude_zeros:
            If ``True``, individual zero-intensity fragment rows are dropped.
        annotate_mz:
            If ``True`` (default), compute and add m/z columns.  When MS2 is
            enabled a ``precursor_mz`` column and a per-fragment ``mz``
            column are added.  When MS2 is disabled only ``precursor_mz`` is
            added, and only if *charges* were provided.  Values that cannot
            be computed are ``NaN``.
        annotate_mobility:
            If ``True``, compute and append an ``ion_mobility`` column
            converted from the predicted CCS value.  Values are ``NaN``
            where the conversion is not possible.  Default ``False``.
        framework:
            ``'pandas'`` (default) or ``'polars'``.

        Returns
        -------
        pandas.DataFrame or polars.DataFrame
            Possible columns (depending on which models are enabled):
            ``peptide``, ``charge``, ``nce``, ``instrument``, ``rt``,
            ``ccs``, ``ion_mobility``, ``precursor_mz``, ``ion_type``,
            ``fragment_charge``, ``ordinal``, ``intensity``, ``mz``.

        Raises
        ------
        ValueError
            If CCS or MS2 is enabled without *charges*, or MS2 is enabled
            without *nces*.
        """
        peptides, exp_charges, exp_nces, exp_inst = _expand_inputs(
            peptides, charges=charges, nces=nces, instruments=instruments
        )
        raw = self._run_models(peptides, exp_charges, exp_nces, exp_inst, multiplier)

        rt_values = raw.get("rt")
        ccs_values = [r["ccs"] for r in raw["ccs"]] if "ccs" in raw else None
        ms2_results = raw.get("ms2")

        if ms2_results is not None:
            data = self._fragment_columns(
                peptides,
                exp_charges,
                exp_nces,
                exp_inst,
                rt_values,
                ccs_values,
                ms2_results,
                exclude_zeros,
                annotate_mz,
                annotate_mobility,
            )
        else:
            data = self._scalar_columns(
                peptides,
                exp_charges,
                rt_values,
                ccs_values,
                annotate_mz,
                annotate_mobility,
            )
        return _make_df(data, framework)