"""
redeem_properties
====================
Python bindings for the redeem-properties Rust crate, exposing peptide property
prediction models for retention time (RT), collisional cross-section (CCS), and
MS2 fragment intensities.
All three model classes delegate inference to the compiled Rust extension
(``_lib``). The ``predict_df`` convenience methods additionally return results
as a ``pandas`` or ``polars`` DataFrame.
Quick start
-----------
>>> import redeem_properties as rp
>>>
>>> # Download pretrained models (only needed once)
>>> rp.download_pretrained_models()
>>>
>>> rt_model = rp.RTModel.from_pretrained("rt")
>>> ccs_model = rp.CCSModel.from_pretrained("ccs")
>>> ms2_model = rp.MS2Model.from_pretrained("ms2")
>>>
>>> # numpy arrays / list[dict]
>>> rt_values = rt_model.predict(["PEPTIDE", "SEQU[+42.0106]ENCE"])
>>> ccs_results = ccs_model.predict(["PEPTIDE"], charges=[2])
>>> ms2_results = ms2_model.predict(["PEPTIDE"], charges=[2], nces=[20])
>>>
>>> # pandas DataFrames
>>> rt_df = rt_model.predict_df(["PEPTIDE", "SEQU[+42.0106]ENCE"])
>>> ccs_df = ccs_model.predict_df(["PEPTIDE"], charges=[2])
>>> ms2_df = ms2_model.predict_df(["PEPTIDE"], charges=[2], nces=[20])
"""
from __future__ import annotations
from importlib.metadata import PackageNotFoundError, version
from typing import Optional
from redeem_properties._lib import ( # noqa: F401 (re-exported)
CCSModel as _CCSLib,
MS2Model as _MS2Lib,
RTModel as _RTLib,
locate_pretrained as locate_pretrained,
validate_pretrained as validate_pretrained,
download_pretrained_models as download_pretrained_models,
compute_precursor_mz as compute_precursor_mz,
compute_fragment_mzs as compute_fragment_mzs,
compute_peptide_mz_info as compute_peptide_mz_info,
match_fragment_mzs as match_fragment_mzs,
ccs_to_mobility as ccs_to_mobility,
)
# Public API of the package. Every helper that is explicitly re-exported from
# ``_lib`` in the import block above is listed here, so that
# ``from redeem_properties import *`` exposes the same names as attribute
# access on the package does.
__all__ = [
    "__version__",
    "RTModel",
    "CCSModel",
    "MS2Model",
    "PropertyPrediction",
    "locate_pretrained",
    # Re-exported above but previously missing from this list.
    "validate_pretrained",
    "download_pretrained_models",
    "compute_precursor_mz",
    "compute_fragment_mzs",
    "compute_peptide_mz_info",
    "match_fragment_mzs",
    "ccs_to_mobility",
]
def _detect_version() -> str:
    """Look up the installed distribution version of this package.

    The distribution names ``"redeem_properties"`` and
    ``"redeem-properties-py"`` are queried in that order; the first one that
    is installed wins.

    Returns
    -------
    str
        The version string reported by ``importlib.metadata``, or
        ``"0+unknown"`` when neither distribution is installed.
    """
    candidates = ("redeem_properties", "redeem-properties-py")
    for candidate in candidates:
        try:
            found = version(candidate)
        except PackageNotFoundError:
            # Not installed under this name; try the next candidate.
            pass
        else:
            return found
    return "0+unknown"


# Resolved once at import time.
__version__ = _detect_version()
# ---------------------------------------------------------------------------
# Helper
# ---------------------------------------------------------------------------
def _make_df(data: dict, framework: str):
    """Turn a column dict into a DataFrame of the requested flavour.

    Parameters
    ----------
    data:
        Mapping of column name to column values; passed unchanged to the
        DataFrame constructor.
    framework:
        ``'pandas'`` or ``'polars'``.

    Returns
    -------
    pandas.DataFrame or polars.DataFrame

    Raises
    ------
    ValueError
        If *framework* is neither ``'pandas'`` nor ``'polars'``.
    ImportError
        If the selected library is not installed.
    """
    # Reject unknown names first so the remaining code only has two cases.
    if framework not in ("pandas", "polars"):
        raise ValueError(f"Unknown framework '{framework}'. Use 'pandas' or 'polars'.")

    # The libraries are optional dependencies, so they are imported lazily and
    # only the one that was asked for is touched.
    if framework == "pandas":
        try:
            import pandas as frame_lib
        except ImportError as exc:
            raise ImportError(
                "pandas is required for predict_df(framework='pandas'). "
                "Install it with: pip install redeem-properties-py[pandas]"
            ) from exc
    else:
        try:
            import polars as frame_lib
        except ImportError as exc:
            raise ImportError(
                "polars is required for predict_df(framework='polars'). "
                "Install it with: pip install redeem-properties-py[polars]"
            ) from exc

    return frame_lib.DataFrame(data)
def _expand_inputs(
    peptides: list[str],
    charges: int | list[int] | None = None,
    nces: int | float | list[int] | list[float] | None = None,
    instruments: str | list[Optional[str]] | None = None,
) -> tuple[list[str], list[int] | None, list[int] | None, list[Optional[str]] | None]:
    """Align peptides, charges, NCEs and instruments to one common length.

    Parameters
    ----------
    peptides:
        Peptide sequences. A bare string is rejected because iterating it
        would yield one "peptide" per character.
    charges:
        ``None`` (no charges), a single int (repeated for every peptide), or
        a list/tuple. A sequence with the same length as *peptides* is taken
        as a 1:1 mapping; any other length produces the Cartesian product
        (every peptide at every charge, peptide-major order).
    nces:
        ``None``, a single number, a sequence of length 1, or a sequence
        matching the expanded length. Every value is converted with
        ``int()``, so fractional energies are truncated.
    instruments:
        ``None``, a single string, a sequence of length 1, or a sequence
        matching the expanded length.

    Returns
    -------
    tuple
        ``(peptides, charges, nces, instruments)``. The first element is
        always a new list; the others are new lists or ``None`` when the
        corresponding argument was ``None``. The caller's sequences are never
        returned by reference.

    Raises
    ------
    TypeError
        If an argument has an unsupported type.
    ValueError
        If *nces* or *instruments* has a length that is neither 1 nor the
        expanded length.
    """
    # ``list("PEPTIDE")`` would silently become ['P', 'E', 'P', ...]; fail
    # loudly instead of predicting properties for single residues.
    if isinstance(peptides, str):
        raise TypeError("peptides must be a list of strings, not a single string")
    pep_list = list(peptides)

    # 1. Expand peptides and charges
    if charges is None:
        exp_pep = pep_list
        exp_charge = None
    elif isinstance(charges, int):
        exp_pep = pep_list
        exp_charge = [charges] * len(pep_list)
    elif isinstance(charges, (list, tuple)):
        if len(charges) == len(pep_list):
            # Equal lengths are interpreted as a 1:1 mapping.
            exp_pep = pep_list
            exp_charge = list(charges)
        else:
            # Cartesian product, peptide-major.
            pairs = [(pep, charge) for pep in pep_list for charge in charges]
            exp_pep = [pep for pep, _ in pairs]
            exp_charge = [charge for _, charge in pairs]
    else:
        raise TypeError("charges must be an int or a list of ints")

    n = len(exp_pep)

    # 2. Broadcast nces
    exp_nces = None
    if nces is not None:
        if isinstance(nces, (int, float)):
            exp_nces = [int(nces)] * n
        elif isinstance(nces, (list, tuple)):
            exp_nces = [int(x) for x in _broadcast_list(nces, n, "nces")]
        else:
            raise TypeError("nces must be an int, float, or list")

    # 3. Broadcast instruments
    exp_inst = None
    if instruments is not None:
        if isinstance(instruments, str):
            exp_inst = [instruments] * n
        elif isinstance(instruments, (list, tuple)):
            exp_inst = _broadcast_list(instruments, n, "instruments")
        else:
            raise TypeError("instruments must be a string or list")

    return exp_pep, exp_charge, exp_nces, exp_inst


def _broadcast_list(values, n: int, name: str) -> list:
    """Return *values* as a new list of length *n*, repeating a single item.

    Raises ``ValueError`` (naming the argument *name*) when the length of
    *values* is neither 1 nor *n*.
    """
    if len(values) == 1:
        return [values[0]] * n
    if len(values) == n:
        return list(values)
    raise ValueError(
        f"{name} must be a single value, a list of length 1, or match the expanded length {n}"
    )
# ---------------------------------------------------------------------------
# RTModel
# ---------------------------------------------------------------------------
[docs]
class RTModel:
    """Retention time prediction model.

    Thin wrapper around the compiled extension class (imported as
    ``_RTLib``); the extension object is kept in ``self._inner`` and all
    inference is delegated to it.

    Parameters
    ----------
    model_path:
        Path to the ``.pth`` model weights file.
    arch:
        Model architecture string (e.g. ``"rt_cnn_lstm"``).
    constants_path:
        Optional path to the ``.yaml`` constants file.
    use_cuda:
        Whether to run inference on GPU (requires CUDA build). Default ``False``.
    """

    def __init__(
        self,
        model_path: str,
        arch: str,
        constants_path: Optional[str] = None,
        use_cuda: bool = False,
    ) -> None:
        self._inner = _RTLib(
            model_path, arch, constants_path=constants_path, use_cuda=use_cuda
        )
        # preserve construction metadata for nicer repr/str
        self._model_path = model_path
        self._arch = arch

    @classmethod
    def from_pretrained(cls, name: str, use_cuda: bool = False) -> "RTModel":
        """Load an RTModel from the shipped pretrained weights.

        Accepted *name* values (case-insensitive):
        ``"rt"``, ``"alphapeptdeep-rt-cnn-lstm"``, ``"redeem-rt-cnn-tf"``.

        Parameters
        ----------
        name:
            Pretrained model identifier.
        use_cuda:
            Whether to run inference on GPU. Default ``False``.
        """
        # ``__init__`` is bypassed on purpose: it would build the extension
        # object from a weights path, whereas here the extension resolves the
        # weights itself from *name*.
        obj = cls.__new__(cls)
        obj._inner = _RTLib.from_pretrained(name, use_cuda=use_cuda)
        # remember the requested pretrained name and located path (best-effort)
        obj._requested_name = name
        try:
            obj._model_path = locate_pretrained(name)
        except Exception:
            # The path is only used for display, so a lookup failure must not
            # prevent the model from being returned.
            obj._model_path = None
        obj._arch = None
        return obj

    def __repr__(self) -> str:
        """Return ``<RTModel arch=... params=... path=...>``.

        Never raises: when the parameter count is unavailable or the
        extension fails to report it, ``params=unknown`` is shown instead.
        """
        arch = getattr(self, "_arch", None) or getattr(self, "_requested_name", None)
        path = getattr(self, "_model_path", None)
        try:
            param_count = self.param_count()
        except (AttributeError, RuntimeError):
            # AttributeError: extension has no param_count (or no inner model);
            # RuntimeError: the extension call itself failed.
            param_count = "unknown"
        return f"<RTModel arch={arch!r} params={param_count} path={path!r}>"

    def __str__(self) -> str:
        return self.__repr__()

    def predict(self, peptides: list[str]):
        """Predict retention times for a list of peptides.

        Peptides may contain inline modification annotations
        (``[+X.X]`` mass-shift or ``(UniMod:N)`` notation).

        Parameters
        ----------
        peptides:
            List of peptide sequences.

        Returns
        -------
        numpy.ndarray
            1-D float32 array of predicted RT values, one per peptide.
        """
        return self._inner.predict(peptides)

    def param_count(self) -> int:
        """Return total number of parameters in the loaded model (if available).

        This delegates to the compiled Rust extension when present.

        Raises
        ------
        AttributeError
            If the underlying extension does not expose ``param_count``.
        RuntimeError
            If the extension call fails; the original exception is chained
            as ``__cause__``.
        """
        if not hasattr(self._inner, "param_count"):
            raise AttributeError("underlying model does not expose 'param_count'")
        try:
            return self._inner.param_count()
        except Exception as e:
            raise RuntimeError(
                f"failed to get param_count from inner model: {e}"
            ) from e

    def summary(self) -> str:
        """Return a compact/detailed model summary string.

        The extension's ``summary_pretty`` is preferred, then its
        ``summary``. If neither exists or both fail, a compact fallback built
        from the architecture (or requested pretrained name) and, when
        available, the parameter count is returned.
        """
        for method_name in ("summary_pretty", "summary"):
            if hasattr(self._inner, method_name):
                try:
                    return getattr(self._inner, method_name)()
                except Exception:
                    # Best-effort: fall through to the next, simpler option.
                    continue

        # Fallback: try a compact repr using arch/requested name
        arch = getattr(self, "_arch", None) or getattr(self, "_requested_name", None)
        try:
            pc = self.param_count()
        except (AttributeError, RuntimeError):
            pc = None
        if pc is not None:
            return f"{arch} params={pc}"
        return f"{arch}"

    def predict_df(self, peptides: list[str], framework: str = "pandas"):
        """Predict retention times and return the result as a DataFrame.

        Parameters
        ----------
        peptides:
            List of peptide sequences (inline modifications supported).
        framework:
            ``'pandas'`` (default) or ``'polars'``.

        Returns
        -------
        pandas.DataFrame or polars.DataFrame
            Columns: ``peptide`` (str), ``rt`` (float32).
        """
        rt_values = self.predict(peptides)
        return _make_df({"peptide": peptides, "rt": rt_values}, framework)
# ---------------------------------------------------------------------------
# CCSModel
# ---------------------------------------------------------------------------
[docs]
class CCSModel:
    """Collisional cross-section prediction model.

    Thin wrapper around the compiled extension class (imported as
    ``_CCSLib``); the extension object is kept in ``self._inner`` and all
    inference is delegated to it.

    Parameters
    ----------
    model_path:
        Path to the ``.pth`` model weights file.
    arch:
        Model architecture string (e.g. ``"ccs_cnn_lstm"``).
    constants_path:
        Path to the ``.yaml`` constants file. Documented as required by the
        model, although the signature defaults to ``None``; the value is
        forwarded unchanged to the extension.
    use_cuda:
        Whether to run inference on GPU. Default ``False``.
    """

    def __init__(
        self,
        model_path: str,
        arch: str,
        constants_path: Optional[str] = None,
        use_cuda: bool = False,
    ) -> None:
        self._inner = _CCSLib(model_path, arch, constants_path, use_cuda=use_cuda)
        # preserve construction metadata for nicer repr/str
        self._model_path = model_path
        self._arch = arch

    @classmethod
    def from_pretrained(cls, name: str, use_cuda: bool = False) -> "CCSModel":
        """Load a CCSModel from the shipped pretrained weights.

        Accepted *name* values (case-insensitive):
        ``"ccs"``, ``"alphapeptdeep-ccs-cnn-lstm"``, ``"redeem-ccs-cnn-tf"``.

        Parameters
        ----------
        name:
            Pretrained model identifier.
        use_cuda:
            Whether to run inference on GPU. Default ``False``.
        """
        # ``__init__`` is bypassed on purpose: the extension resolves the
        # weights itself from *name*.
        obj = cls.__new__(cls)
        obj._inner = _CCSLib.from_pretrained(name, use_cuda=use_cuda)
        obj._requested_name = name
        try:
            obj._model_path = locate_pretrained(name)
        except Exception:
            # Display-only metadata; a lookup failure must not block loading.
            obj._model_path = None
        obj._arch = None
        return obj

    def __repr__(self) -> str:
        """Return ``<CCSModel arch=... params=... path=...>``.

        Never raises: when the parameter count is unavailable or the
        extension fails to report it, ``params=unknown`` is shown instead.
        """
        arch = getattr(self, "_arch", None) or getattr(self, "_requested_name", None)
        path = getattr(self, "_model_path", None)
        try:
            param_count = self.param_count()
        except (AttributeError, RuntimeError):
            param_count = "unknown"
        return f"<CCSModel arch={arch!r} params={param_count} path={path!r}>"

    def __str__(self) -> str:
        return self.__repr__()

    def predict(self, peptides: list[str], charges: int | list[int]):
        """Predict CCS values for a list of peptides.

        Parameters
        ----------
        peptides:
            List of peptide sequences (inline modifications supported).
        charges:
            Charge state per peptide. If a single integer is provided,
            it is broadcast to all peptides. If a list of charges is provided
            and its length differs from the number of peptides, a Cartesian
            product is performed (predicting each peptide at each charge state).

        Returns
        -------
        list[dict]
            One dict per (expanded) peptide with keys:

            * ``"ccs"`` – predicted CCS value (Ų).
            * ``"charge"`` – charge state used for the prediction.
        """
        peptides, exp_charges, _, _ = _expand_inputs(peptides, charges=charges)
        return self._inner.predict(peptides, exp_charges)

    def param_count(self) -> int:
        """Return total number of parameters in the loaded model (if available).

        Raises
        ------
        AttributeError
            If the underlying extension does not expose ``param_count``.
        RuntimeError
            If the extension call fails; the original exception is chained
            as ``__cause__``.
        """
        if not hasattr(self._inner, "param_count"):
            raise AttributeError("underlying model does not expose 'param_count'")
        try:
            return self._inner.param_count()
        except Exception as e:
            raise RuntimeError(
                f"failed to get param_count from inner model: {e}"
            ) from e

    def summary(self) -> str:
        """Return a model summary string delegated to the Rust extension.

        The extension's ``summary_pretty`` is preferred, then its
        ``summary``. If neither exists or both fail, a compact fallback built
        from the architecture (or requested pretrained name) and, when
        available, the parameter count is returned.
        """
        for method_name in ("summary_pretty", "summary"):
            if hasattr(self._inner, method_name):
                try:
                    return getattr(self._inner, method_name)()
                except Exception:
                    # Best-effort: fall through to the next, simpler option.
                    continue

        # Fallback: compact repr using arch/requested name
        arch = getattr(self, "_arch", None) or getattr(self, "_requested_name", None)
        try:
            pc = self.param_count()
        except (AttributeError, RuntimeError):
            pc = None
        if pc is not None:
            return f"{arch} params={pc}"
        return f"{arch}"

    def predict_df(
        self,
        peptides: list[str],
        charges: int | list[int],
        annotate_mobility: bool = False,
        framework: str = "pandas",
    ):
        """Predict CCS values and return the result as a DataFrame.

        Parameters
        ----------
        peptides:
            List of peptide sequences (inline modifications supported).
        charges:
            Charge state per peptide. If a single integer is provided,
            it is broadcast to all peptides. If a list of charges is provided
            and its length differs from the number of peptides, a Cartesian
            product is performed (predicting each peptide at each charge state).
        annotate_mobility:
            If ``True``, compute and append an ``ion_mobility`` column
            converted from the predicted CCS value. Rows for which the
            conversion fails receive ``NaN``. Default ``False``.
        framework:
            ``'pandas'`` (default) or ``'polars'``.

        Returns
        -------
        pandas.DataFrame or polars.DataFrame
            Columns: ``peptide`` (str), ``ccs`` (float32), ``charge`` (int),
            and optionally ``ion_mobility`` (float).
        """
        # Expand here as well: the expanded peptide list is needed for the
        # ``peptide`` column. ``predict`` expands again, which is a no-op on
        # already aligned inputs.
        peptides, exp_charges, _, _ = _expand_inputs(peptides, charges=charges)
        results = self.predict(peptides, exp_charges)  # type: ignore

        data = {
            "peptide": peptides,
            "ccs": [r["ccs"] for r in results],
            "charge": [r["charge"] for r in results],
        }

        if annotate_mobility:
            mobility_col = []
            for pep, ch, res in zip(peptides, exp_charges, results):  # type: ignore
                try:
                    mz = compute_precursor_mz(pep, ch)
                    mob = ccs_to_mobility(res["ccs"], float(ch), mz)
                    mobility_col.append(mob)
                except Exception:
                    # Best-effort annotation: keep the row, mark it missing.
                    mobility_col.append(float("nan"))
            data["ion_mobility"] = mobility_col

        return _make_df(data, framework)
# ---------------------------------------------------------------------------
# MS2Model
# ---------------------------------------------------------------------------
[docs]
class MS2Model:
    """MS2 fragment intensity prediction model.

    Thin wrapper around the compiled extension class (imported as
    ``_MS2Lib``); the extension object is kept in ``self._inner`` and all
    inference is delegated to it.

    Parameters
    ----------
    model_path:
        Path to the ``.pth`` model weights file.
    arch:
        Model architecture string (e.g. ``"ms2_bert"``).
    constants_path:
        Path to the ``.yaml`` constants file. Documented as required by the
        model, although the signature defaults to ``None``; the value is
        forwarded unchanged to the extension.
    use_cuda:
        Whether to run inference on GPU. Default ``False``.
    """

    def __init__(
        self,
        model_path: str,
        arch: str,
        constants_path: Optional[str] = None,
        use_cuda: bool = False,
    ) -> None:
        self._inner = _MS2Lib(model_path, arch, constants_path, use_cuda=use_cuda)
        # preserve construction metadata for nicer repr/str
        self._model_path = model_path
        self._arch = arch

    @classmethod
    def from_pretrained(cls, name: str, use_cuda: bool = False) -> "MS2Model":
        """Load an MS2Model from the shipped pretrained weights.

        Accepted *name* values (case-insensitive):
        ``"ms2"``, ``"alphapeptdeep-ms2-bert"``.

        Parameters
        ----------
        name:
            Pretrained model identifier.
        use_cuda:
            Whether to run inference on GPU. Default ``False``.
        """
        # ``__init__`` is bypassed on purpose: the extension resolves the
        # weights itself from *name*.
        obj = cls.__new__(cls)
        obj._inner = _MS2Lib.from_pretrained(name, use_cuda=use_cuda)
        obj._requested_name = name
        try:
            obj._model_path = locate_pretrained(name)
        except Exception:
            # Display-only metadata; a lookup failure must not block loading.
            obj._model_path = None
        obj._arch = None
        return obj

    def __repr__(self) -> str:
        """Return ``<MS2Model arch=... params=... path=...>``.

        Never raises: when the parameter count is unavailable or the
        extension fails to report it, ``params=unknown`` is shown instead.
        """
        arch = getattr(self, "_arch", None) or getattr(self, "_requested_name", None)
        path = getattr(self, "_model_path", None)
        try:
            param_count = self.param_count()
        except (AttributeError, RuntimeError):
            param_count = "unknown"
        return f"<MS2Model arch={arch!r} params={param_count} path={path!r}>"

    def __str__(self) -> str:
        return self.__repr__()

    @staticmethod
    def _scale_intensities(result: dict, multiplier: float) -> dict:
        """Scale the ``"intensities"`` entry of one result by *multiplier*.

        The array is scaled in place when possible and *result* itself is
        returned. If the in-place operation is rejected (for example a
        read-only or integer array), a shallow copy of *result* holding a
        newly scaled array is returned and the original array is left
        untouched. Results without an ``"intensities"`` entry are returned
        unchanged.
        """
        arr = result.get("intensities")
        if arr is None:
            return result
        try:
            arr *= multiplier
        except (TypeError, ValueError):
            rescaled = dict(result)
            rescaled["intensities"] = arr * multiplier
            return rescaled
        return result

    @staticmethod
    def _fragment_mz(
        lookup: dict, ion_type: str, fragment_charge: int, ordinal: int
    ) -> float:
        """Return the theoretical m/z for one fragment, or ``NaN``.

        Neutral-loss ion types (names ending in ``"_nl"``) always get
        ``NaN``: *lookup* only holds the plain ion series, and reusing the
        plain ion's m/z for a neutral-loss ion would be wrong.
        """
        if ion_type.endswith("_nl"):
            return float("nan")
        return lookup.get((ion_type, fragment_charge, ordinal), float("nan"))

    def predict(
        self,
        peptides: list[str],
        charges: int | list[int],
        nces: int | float | list[int] | list[float],
        instruments: str | list[Optional[str]] | None = None,
        multiplier: float = 10_000.0,
    ):
        """Predict MS2 fragment intensities for a list of peptides.

        Parameters
        ----------
        peptides:
            List of peptide sequences (inline modifications supported).
        charges:
            Charge state per peptide. If a single integer is provided,
            it is broadcast to all peptides. If a list of charges is provided
            and its length differs from the number of peptides, a Cartesian
            product is performed (predicting each peptide at each charge state).
        nces:
            Normalized collision energy per peptide. Can be a single value
            (broadcast to all) or a list matching the expanded length.
        instruments:
            Instrument name per peptide (optional). Can be a single string
            (broadcast to all) or a list matching the expanded length.
        multiplier:
            Scalar to multiply predicted intensities by (default 10_000.0),
            e.g. to scale normalized outputs into typical intensity ranges.
            ``None`` or ``1.0`` leaves the predictions unscaled.

        Returns
        -------
        list[dict]
            One dict per (expanded) peptide with keys:

            * ``"intensities"`` – 2-D float32 array ``(n_positions, 8)``.
            * ``"ion_types"`` – list of 8 ion-type strings.
            * ``"ion_charges"`` – list of 8 fragment charge integers.
            * ``"b_ordinals"`` – 1-D int array ``[1, …, n_positions]``.
            * ``"y_ordinals"`` – 1-D int array ``[n_positions, …, 1]``.
        """
        peptides, exp_charges, exp_nces, exp_inst = _expand_inputs(
            peptides, charges=charges, nces=nces, instruments=instruments
        )
        results = self._inner.predict(
            peptides, exp_charges, exp_nces, instruments=exp_inst
        )

        if multiplier is not None and multiplier != 1.0:
            # Scale each result independently. A single try/except around the
            # whole loop would, on a mid-loop failure, rescale arrays that had
            # already been multiplied in place.
            results = [self._scale_intensities(r, multiplier) for r in results]

        return results

    def param_count(self) -> int:
        """Return total number of parameters in the loaded model (if available).

        Raises
        ------
        AttributeError
            If the underlying extension does not expose ``param_count``.
        RuntimeError
            If the extension call fails; the original exception is chained
            as ``__cause__``.
        """
        if not hasattr(self._inner, "param_count"):
            raise AttributeError("underlying model does not expose 'param_count'")
        try:
            return self._inner.param_count()
        except Exception as e:
            raise RuntimeError(
                f"failed to get param_count from inner model: {e}"
            ) from e

    def summary(self) -> str:
        """Return a model summary string delegated to the Rust extension.

        The extension's ``summary_pretty`` is preferred, then its
        ``summary``. If neither exists or both fail, a compact fallback built
        from the architecture (or requested pretrained name) and, when
        available, the parameter count is returned.
        """
        for method_name in ("summary_pretty", "summary"):
            if hasattr(self._inner, method_name):
                try:
                    return getattr(self._inner, method_name)()
                except Exception:
                    # Best-effort: fall through to the next, simpler option.
                    continue

        # Fallback: compact repr using arch/requested name
        arch = getattr(self, "_arch", None) or getattr(self, "_requested_name", None)
        try:
            pc = self.param_count()
        except (AttributeError, RuntimeError):
            pc = None
        if pc is not None:
            return f"{arch} params={pc}"
        return f"{arch}"

    def predict_df(
        self,
        peptides: list[str],
        charges: int | list[int],
        nces: int | float | list[int] | list[float],
        instruments: str | list[Optional[str]] | None = None,
        multiplier: float = 10_000.0,
        exclude_zeros: bool = True,
        annotate_mz: bool = False,
        framework: str = "pandas",
    ):
        """Predict MS2 fragment intensities and return a long-format DataFrame.

        Each row represents one (peptide, ion_type, fragment_charge, ordinal)
        combination.

        Parameters
        ----------
        peptides:
            List of peptide sequences (inline modifications supported).
        charges:
            Precursor charge state per peptide. If a single integer is provided,
            it is broadcast to all peptides. If a list of charges is provided
            and its length differs from the number of peptides, a Cartesian
            product is performed (predicting each peptide at each charge state).
        nces:
            Normalized collision energy per peptide. Can be a single value
            (broadcast to all) or a list matching the expanded length.
        instruments:
            Instrument name per peptide (optional). Can be a single string
            (broadcast to all) or a list matching the expanded length.
        multiplier:
            Scalar to multiply predicted intensities by (default 10_000.0),
            e.g. to scale normalized outputs into typical intensity ranges.
        exclude_zeros:
            If ``True`` (default), every individual fragment row whose
            intensity is exactly zero is dropped.
        annotate_mz:
            If ``True``, append a ``precursor_mz`` column and a ``mz`` column
            with the theoretical m/z of each fragment ion. Neutral-loss ions
            (``b_nl``, ``y_nl``) receive ``NaN``, as do all fragments of a
            peptide whose m/z computation fails. Default ``False``.
        framework:
            ``'pandas'`` (default) or ``'polars'``.

        Returns
        -------
        pandas.DataFrame or polars.DataFrame
            Columns: ``peptide``, ``ion_type``, ``fragment_charge``,
            ``ordinal``, ``intensity``, and, when *annotate_mz* is set,
            ``precursor_mz`` and ``mz``.

        Example
        -------
        Illustrative output only; the values are not real predictions.

        >>> df = ms2_model.predict_df(
        ...     ["AGHCEWQMKYR"],
        ...     charges=[2], nces=[20], instruments=["QE"],
        ... )
        >>> df.head()
               peptide ion_type  fragment_charge  ordinal  intensity
        0  AGHCEWQMKYR        b                1        1      0.123
        1  AGHCEWQMKYR        b                2        1      0.045
        ...
        """
        # Expand here as well: the expanded peptide/charge lists drive the
        # row construction below. ``predict`` expands again, which is a no-op
        # on already aligned inputs.
        peptides, exp_charges, exp_nces, exp_inst = _expand_inputs(
            peptides, charges=charges, nces=nces, instruments=instruments
        )
        results = self.predict(
            peptides,
            exp_charges,
            exp_nces,
            instruments=exp_inst,
            multiplier=multiplier,  # type: ignore
        )

        nan = float("nan")
        # Ion types that count from the N-terminus and so use the b ordinals.
        b_ion_types = {"b", "b_nl"}

        pep_col: list[str] = []
        ion_type_col: list[str] = []
        frag_charge_col: list[int] = []
        ordinal_col: list[int] = []
        intensity_col: list[float] = []
        mz_col: list[float] = []
        precursor_mz_col: list[float] = []

        for pep, charge, res in zip(peptides, exp_charges, results):  # type: ignore
            intensities = res["intensities"]
            ion_types = res["ion_types"]
            frag_charges = res["ion_charges"]
            b_ords = res["b_ordinals"]
            y_ords = res["y_ordinals"]
            n_pos, n_types = intensities.shape

            # Pre-compute theoretical fragment m/z for this peptide (if needed).
            # Keys are (ion_type, fragment_charge, ordinal).
            frag_mz_lookup: dict[tuple[str, int, int], float] = {}
            precursor_mz = nan
            if annotate_mz:
                try:
                    max_frag_charge = max(int(fc) for fc in frag_charges)
                    precursor_mz = compute_precursor_mz(pep, charge)
                    frag_info = compute_fragment_mzs(pep, max_frag_charge)
                    frag_mz_lookup = {
                        (t, c, o): m
                        for t, c, o, m in zip(
                            frag_info["ion_types"],
                            frag_info["charges"],
                            frag_info["ordinals"],
                            frag_info["mzs"],
                        )
                    }
                except Exception:
                    # Best-effort annotation: keep the intensities and report
                    # NaN m/z for every fragment of this peptide.
                    frag_mz_lookup = {}

            for r in range(n_pos):
                for c in range(n_types):
                    t = ion_types[c]
                    ordinal = int(b_ords[r]) if t in b_ion_types else int(y_ords[r])
                    val = float(intensities[r, c])

                    # If requested, skip individual ion rows with zero intensity.
                    if exclude_zeros and val == 0.0:
                        continue

                    frag_charge = int(frag_charges[c])
                    pep_col.append(pep)
                    ion_type_col.append(t)
                    frag_charge_col.append(frag_charge)
                    ordinal_col.append(ordinal)
                    intensity_col.append(val)

                    if annotate_mz:
                        precursor_mz_col.append(precursor_mz)
                        mz_col.append(
                            self._fragment_mz(frag_mz_lookup, t, frag_charge, ordinal)
                        )

        data: dict = {
            "peptide": pep_col,
            "ion_type": ion_type_col,
            "fragment_charge": frag_charge_col,
            "ordinal": ordinal_col,
            "intensity": intensity_col,
        }
        if annotate_mz:
            data["precursor_mz"] = precursor_mz_col
            data["mz"] = mz_col

        return _make_df(data, framework)
# ---------------------------------------------------------------------------
# PropertyPrediction – unified RT + CCS + MS2 predictor
# ---------------------------------------------------------------------------
[docs]
class PropertyPrediction:
"""Unified peptide property predictor combining RT, CCS, and MS2 models.
Each model is **optional**. When a model is ``None`` its columns are
omitted from the output. By default the constructor loads the shipped
pretrained weights for all three models; pass ``predict_rt=False``,
``predict_ccs=False``, or ``predict_ms2=False`` to skip a model entirely.
Parameters
----------
rt_model:
An :class:`RTModel` instance, or ``None`` to skip RT prediction.
Ignored when *predict_rt* is ``False``.
ccs_model:
A :class:`CCSModel` instance, or ``None`` to skip CCS prediction.
Ignored when *predict_ccs* is ``False``.
ms2_model:
An :class:`MS2Model` instance, or ``None`` to skip MS2 prediction.
Ignored when *predict_ms2* is ``False``.
predict_rt:
Whether to include retention-time predictions. Default ``True``.
predict_ccs:
Whether to include CCS predictions. Default ``True``.
predict_ms2:
Whether to include MS2 fragment-intensity predictions. Default ``True``.
use_cuda:
Forwarded to ``from_pretrained`` when constructing default models.
Default ``False``.
Examples
--------
>>> import redeem_properties as rp
>>> prop = rp.PropertyPrediction() # all three pretrained models
>>> df = prop.predict_df(
... ["PEPTIDE", "AGHCEWQMKYR"],
... charges=[2, 2], nces=[20, 20], instruments=["QE", "QE"],
... )
>>> df.columns.tolist()
['peptide', 'charge', 'nce', 'instrument', 'rt', 'ccs',
'ion_type', 'fragment_charge', 'ordinal', 'intensity']
Only RT + CCS (skip MS2):
>>> prop = rp.PropertyPrediction(predict_ms2=False)
>>> df = prop.predict_df(["PEPTIDE"], charges=[2])
"""
def __init__(
self,
rt_model: Optional[RTModel] = None,
ccs_model: Optional[CCSModel] = None,
ms2_model: Optional[MS2Model] = None,
*,
predict_rt: bool = True,
predict_ccs: bool = True,
predict_ms2: bool = True,
use_cuda: bool = False,
) -> None:
# ------ RT ------
if predict_rt:
if rt_model is not None:
self.rt_model: Optional[RTModel] = rt_model
else:
try:
self.rt_model = RTModel.from_pretrained("rt", use_cuda=use_cuda)
except Exception:
self.rt_model = None
else:
self.rt_model = None
# ------ CCS ------
if predict_ccs:
if ccs_model is not None:
self.ccs_model: Optional[CCSModel] = ccs_model
else:
try:
self.ccs_model = CCSModel.from_pretrained("ccs", use_cuda=use_cuda)
except Exception:
self.ccs_model = None
else:
self.ccs_model = None
# ------ MS2 ------
if predict_ms2:
if ms2_model is not None:
self.ms2_model: Optional[MS2Model] = ms2_model
else:
try:
self.ms2_model = MS2Model.from_pretrained("ms2", use_cuda=use_cuda)
except Exception:
self.ms2_model = None
else:
self.ms2_model = None
# -----------------------------------------------------------------
def __repr__(self) -> str:
parts = []
if self.rt_model is not None:
rt_arch = getattr(self.rt_model, "_arch", None) or getattr(
self.rt_model, "_requested_name", None
)
rt_params = (
self.rt_model.param_count()
if hasattr(self.rt_model._inner, "param_count")
else "unknown"
)
rt_path = getattr(self.rt_model, "_model_path", None)
parts.append(f"\nrt={rt_arch!r} params={rt_params} path={rt_path!r}")
if self.ccs_model is not None:
ccs_arch = getattr(self.ccs_model, "_arch", None) or getattr(
self.ccs_model, "_requested_name", None
)
ccs_params = (
self.ccs_model.param_count()
if hasattr(self.ccs_model._inner, "param_count")
else "unknown"
)
ccs_path = getattr(self.ccs_model, "_model_path", None)
parts.append(f"\nccs={ccs_arch!r} params={ccs_params} path={ccs_path!r}")
if self.ms2_model is not None:
ms2_arch = getattr(self.ms2_model, "_arch", None) or getattr(
self.ms2_model, "_requested_name", None
)
ms2_params = (
self.ms2_model.param_count()
if hasattr(self.ms2_model._inner, "param_count")
else "unknown"
)
ms2_path = getattr(self.ms2_model, "_model_path", None)
parts.append(f"\nms2={ms2_arch!r} params={ms2_params} path={ms2_path!r}")
return f"<PropertyPrediction {' '.join(parts)}\n>"
def __str__(self) -> str:
return self.__repr__()
# -----------------------------------------------------------------
[docs]
def predict(
    self,
    peptides: list[str],
    charges: int | list[int] | None = None,
    nces: int | float | list[int] | list[float] | None = None,
    instruments: str | list[Optional[str]] | None = None,
    multiplier: float = 10_000.0,
) -> dict:
    """Run enabled models and return raw results in a dict.

    Inputs are aligned once and the same expanded peptide list is passed to
    every enabled model. All required inputs are validated before any model
    runs, so an incomplete call fails without spending time on inference.

    Parameters
    ----------
    peptides:
        List of peptide sequences (inline modifications supported).
    charges:
        Charge state per peptide (required for CCS and MS2). If a single
        integer is provided, it is broadcast to all peptides. If a list of
        charges is provided and its length differs from the number of peptides,
        a Cartesian product is performed.
    nces:
        Normalized collision energy per peptide (required for MS2). Can be
        a single value (broadcast to all) or a list matching the expanded length.
    instruments:
        Instrument name per peptide (optional, used by MS2). Can be a single
        string (broadcast to all) or a list matching the expanded length.
    multiplier:
        Scalar applied to MS2 predicted intensities (default 10 000).

    Returns
    -------
    dict
        Keys that may be present: ``"rt"`` (1-D ndarray), ``"ccs"``
        (list[dict]), ``"ms2"`` (list[dict]). A key is present only when the
        corresponding model is enabled.

    Raises
    ------
    ValueError
        If *charges* is missing while the CCS or MS2 model is enabled, or
        *nces* is missing while the MS2 model is enabled.
    """
    peptides, exp_charges, exp_nces, exp_inst = _expand_inputs(
        peptides, charges=charges, nces=nces, instruments=instruments
    )

    # Fail fast: check everything the enabled models need up front, in the
    # same order (CCS, then MS2) in which the models run below.
    if self.ccs_model is not None and exp_charges is None:
        raise ValueError("charges are required for CCS prediction")
    if self.ms2_model is not None:
        if exp_charges is None:
            raise ValueError("charges are required for MS2 prediction")
        if exp_nces is None:
            raise ValueError("nces are required for MS2 prediction")

    out: dict = {}

    if self.rt_model is not None:
        out["rt"] = self.rt_model.predict(peptides)

    if self.ccs_model is not None:
        out["ccs"] = self.ccs_model.predict(peptides, exp_charges)

    if self.ms2_model is not None:
        out["ms2"] = self.ms2_model.predict(
            peptides,
            exp_charges,
            exp_nces,
            instruments=exp_inst,
            multiplier=multiplier,
        )

    return out
# -----------------------------------------------------------------
[docs]
def predict_df(
    self,
    peptides: list[str],
    charges: int | list[int] | None = None,
    nces: int | float | list[int] | list[float] | None = None,
    instruments: str | list[Optional[str]] | None = None,
    multiplier: float = 10_000.0,
    exclude_zeros: bool = True,
    annotate_mz: bool = True,
    annotate_mobility: bool = False,
    framework: str = "pandas",
):
    """Predict all enabled properties and return a single long-format DataFrame.

    When MS2 is enabled every fragment row is emitted; the scalar RT and CCS
    values are broadcast (repeated) across those rows so that each row is
    fully self-contained.

    When MS2 is **disabled** the DataFrame contains one row per peptide
    with only the scalar columns that are enabled.

    m/z and ion-mobility annotation is best-effort: if one of the helper
    computations raises for a peptide, the affected cells are filled with
    ``NaN`` instead of aborting the whole call.

    Parameters
    ----------
    peptides:
        List of peptide sequences (inline modifications supported).
    charges:
        Charge state per peptide (required for CCS and MS2). If a single
        integer is provided, it is broadcast to all peptides. If a list of
        charges is provided and its length differs from the number of peptides,
        a Cartesian product is performed.
    nces:
        Normalized collision energy per peptide (required for MS2). Can be
        a single value (broadcast to all) or a list matching the expanded length.
    instruments:
        Instrument name per peptide (optional, used by MS2). Can be a single
        string (broadcast to all) or a list matching the expanded length.
    multiplier:
        Scalar applied to MS2 predicted intensities (default 10 000).
    exclude_zeros:
        If ``True``, individual zero-intensity fragment rows are dropped.
    annotate_mz:
        If ``True`` (default), compute and add m/z columns. When MS2 is
        enabled a ``precursor_mz`` column and a per-fragment ``mz``
        column are added. When MS2 is disabled only ``precursor_mz``
        is added. Requires *charges* to be provided.
    annotate_mobility:
        If ``True``, compute and append an ``ion_mobility`` column
        converted from the predicted CCS value. Requires *charges* and
        the CCS model to be enabled. Default ``False``. With MS2 enabled
        but CCS disabled the column is still emitted, filled with ``NaN``;
        with MS2 disabled it is omitted in that situation.
    framework:
        ``'pandas'`` (default) or ``'polars'``.

    Returns
    -------
    pandas.DataFrame or polars.DataFrame
        Possible columns (depending on which models are enabled):
        ``peptide``, ``charge``, ``nce``, ``instrument``,
        ``rt``, ``ccs``, ``ion_mobility``, ``precursor_mz``, ``ion_type``,
        ``fragment_charge``, ``ordinal``, ``intensity``, ``mz``.

    Raises
    ------
    ValueError
        If the CCS or MS2 model is enabled but no charges were given, or
        if the MS2 model is enabled but no NCEs were given.
    """
    peptides, exp_charges, exp_nces, exp_inst = _expand_inputs(
        peptides, charges=charges, nces=nces, instruments=instruments
    )
    nan = float("nan")

    # -- scalar predictions (RT / CCS) ---------------------------------
    rt_values = None
    if self.rt_model is not None:
        rt_values = self.rt_model.predict(peptides)  # 1-D ndarray

    ccs_values = None
    if self.ccs_model is not None:
        if exp_charges is None:
            raise ValueError("charges are required for CCS prediction")
        ccs_results = self.ccs_model.predict(peptides, exp_charges)
        ccs_values = [r["ccs"] for r in ccs_results]  # list[float]

    # -- MS2 fragment predictions --------------------------------------
    ms2_results = None
    if self.ms2_model is not None:
        if exp_charges is None:
            raise ValueError("charges are required for MS2 prediction")
        if exp_nces is None:
            raise ValueError("nces are required for MS2 prediction")
        ms2_results = self.ms2_model.predict(
            peptides,
            exp_charges,
            exp_nces,
            instruments=exp_inst,
            multiplier=multiplier,
        )

    # -- No MS2: one row per peptide with scalar columns only ----------
    if ms2_results is None:
        data: dict = {"peptide": list(peptides)}
        if exp_charges is not None:
            data["charge"] = list(exp_charges)
        if rt_values is not None:
            data["rt"] = [float(v) for v in rt_values]
        if ccs_values is not None:
            data["ccs"] = ccs_values

        if exp_charges is not None:
            want_mobility = annotate_mobility and ccs_values is not None
            if want_mobility or annotate_mz:
                # Compute each precursor m/z once and share it between the
                # mobility conversion and the precursor_mz column.
                failed = object()  # marks a peptide whose m/z could not be computed
                precursor_mzs: list = []
                for pep, ch in zip(peptides, exp_charges):
                    try:
                        precursor_mzs.append(compute_precursor_mz(pep, ch))
                    except Exception:
                        # Best-effort annotation: degrade to NaN below.
                        precursor_mzs.append(failed)

                if want_mobility:
                    mob_col: list[float] = []
                    for ch, ccs, mz in zip(exp_charges, ccs_values, precursor_mzs):
                        if mz is failed:
                            mob_col.append(nan)
                            continue
                        try:
                            mob_col.append(ccs_to_mobility(ccs, float(ch), mz))
                        except Exception:
                            mob_col.append(nan)
                    data["ion_mobility"] = mob_col

                if annotate_mz:
                    data["precursor_mz"] = [
                        nan if mz is failed else mz for mz in precursor_mzs
                    ]

        return _make_df(data, framework)

    # -- MS2 enabled: long format, one row per fragment ion ------------
    # Reaching this point implies exp_charges and exp_nces are not None
    # (both were validated before the MS2 model was called).
    b_ion_types = {"b", "b_nl"}
    want_mobility = annotate_mobility and ccs_values is not None

    pep_col: list[str] = []
    charge_col: list[int] = []
    nce_col: list[int | float] = []
    instrument_col: list[Optional[str]] = []
    rt_col: list[float] = []
    ccs_col: list[float] = []
    mobility_col: list[float] = []
    precursor_mz_col: list[float] = []
    ion_type_col: list[str] = []
    frag_charge_col: list[int] = []
    ordinal_col: list[int] = []
    intensity_col: list[float] = []
    mz_col: list[float] = []

    for idx, (pep, res) in enumerate(zip(peptides, ms2_results)):
        intensities = res["intensities"]
        ion_types = res["ion_types"]
        frag_charges = res["ion_charges"]
        b_ords = res["b_ordinals"]
        y_ords = res["y_ordinals"]
        n_pos, n_types = intensities.shape

        # Scalar values repeated on every fragment row of this peptide.
        charge = exp_charges[idx]
        nce = exp_nces[idx]
        instrument = exp_inst[idx] if exp_inst is not None else None
        rt = float(rt_values[idx]) if rt_values is not None else nan
        ccs = float(ccs_values[idx]) if ccs_values is not None else nan

        # Precursor m/z feeds both the precursor_mz column and the
        # CCS -> mobility conversion.
        precursor_mz = nan
        if annotate_mz or annotate_mobility:
            try:
                precursor_mz = compute_precursor_mz(pep, charge)
            except Exception:
                precursor_mz = nan  # best-effort annotation

        # (base ion type, fragment charge, ordinal) -> theoretical m/z.
        # Stays empty when disabled or when the computation fails, so
        # every lookup below falls back to NaN.
        frag_mz_lookup: dict[tuple[str, int, int], float] = {}
        if annotate_mz:
            # default=None: a result without fragment columns must not
            # crash on max() of an empty sequence.
            max_frag_charge = max(
                (int(fc) for fc in frag_charges), default=None
            )
            if max_frag_charge is not None:
                try:
                    frag_info = compute_fragment_mzs(pep, max_frag_charge)
                    frag_mz_lookup = {
                        (t, c, o): m
                        for t, c, o, m in zip(
                            frag_info["ion_types"],
                            frag_info["charges"],
                            frag_info["ordinals"],
                            frag_info["mzs"],
                        )
                    }
                except Exception:
                    frag_mz_lookup = {}

        ion_mobility = nan
        if want_mobility:
            try:
                ion_mobility = ccs_to_mobility(ccs, float(charge), precursor_mz)
            except Exception:
                ion_mobility = nan  # best-effort annotation

        # Per-column facts do not depend on the position index, so they
        # are resolved once per peptide instead of once per cell.
        columns = []
        for c in range(n_types):
            ion_type = ion_types[c]
            columns.append(
                (
                    c,
                    ion_type,
                    int(frag_charges[c]),
                    ion_type in b_ion_types,
                    # Neutral-loss columns are looked up under their base
                    # ion type in the m/z table.
                    ion_type.replace("_nl", ""),
                )
            )

        for r in range(n_pos):
            for c, ion_type, frag_charge, is_b, base_type in columns:
                ordinal = int(b_ords[r]) if is_b else int(y_ords[r])
                val = float(intensities[r, c])
                if exclude_zeros and val == 0.0:
                    continue

                pep_col.append(pep)
                charge_col.append(charge)
                nce_col.append(nce)
                instrument_col.append(instrument)
                rt_col.append(rt)
                ccs_col.append(ccs)
                if annotate_mobility:
                    mobility_col.append(ion_mobility)
                ion_type_col.append(ion_type)
                frag_charge_col.append(frag_charge)
                ordinal_col.append(ordinal)
                intensity_col.append(val)
                if annotate_mz:
                    precursor_mz_col.append(precursor_mz)
                    mz_col.append(
                        frag_mz_lookup.get((base_type, frag_charge, ordinal), nan)
                    )

    # Insertion order defines the column order of the resulting frame.
    data = {"peptide": pep_col, "charge": charge_col, "nce": nce_col}
    if exp_inst is not None:
        data["instrument"] = instrument_col
    if rt_values is not None:
        data["rt"] = rt_col
    if ccs_values is not None:
        data["ccs"] = ccs_col
    if annotate_mobility:
        # NOTE(review): emitted (all NaN) even when the CCS model is
        # disabled, unlike the no-MS2 path above; kept for compatibility.
        data["ion_mobility"] = mobility_col
    if annotate_mz:
        data["precursor_mz"] = precursor_mz_col
    data["ion_type"] = ion_type_col
    data["fragment_charge"] = frag_charge_col
    data["ordinal"] = ordinal_col
    data["intensity"] = intensity_col
    if annotate_mz:
        data["mz"] = mz_col

    return _make_df(data, framework)