Source code for alchemlyb.estimators.mbar_

from __future__ import annotations
from typing import Literal

import numpy as np
import pandas as pd
import pymbar
import numpy.typing as npt
from sklearn.base import BaseEstimator

from . import BAR
from .base import _EstimatorMixOut



[docs]
class MBAR(BaseEstimator, _EstimatorMixOut):
    r"""Multi-state Bennett acceptance ratio (MBAR).

    Parameters
    ----------

    maximum_iterations : int, optional
        Set to limit the maximum number of iterations performed.

    relative_tolerance : float, optional
        Set to determine the relative tolerance convergence criteria.

    initial_f_k : np.ndarray, float, shape=(K), optional or String `BAR`
        When `isinstance(initial_f_k, np.ndarray)`, `initial_f_k` will be used as
        initial guess for MBAR estimator. initial_f_k should be dimensionless
        free energies.
        When `initial_f_k` is ``None``, ``initial_f_k`` will be set to 0.
        When `initial_f_k` is set to "BAR", a BAR calculation will be done and
        the result is used as the initial guess (default).

        .. versionchanged:: 2.3.0
           The new default is now "BAR" as it provides a substantial speedup
           over the previous default `None`.

    method : str, optional, default="robust"
        The optimization routine to use.  This can be any of the methods
        available via :func:`scipy.optimize.minimize` or
        :func:`scipy.optimize.root`.

    n_bootstraps : int, optional
        Whether to use bootstrap to estimate uncertainty. `0` means use analytic error
        estimation. 50~200 is a reasonable range to do bootstrap.

    verbose : bool, optional
        Set to ``True`` if verbose debug output from :mod:`pymbar` is desired.

    Attributes
    ----------

    delta_f_ : DataFrame
        The estimated dimensionless free energy difference between each state.

    d_delta_f_ : DataFrame
        The estimated statistical uncertainty (one standard deviation) in
        dimensionless free energy differences.

    delta_h_ : DataFrame
        The estimated dimensionless enthalpy difference between each state.

    d_delta_h_ : DataFrame
        The estimated statistical uncertainty (one standard deviation) in
        dimensionless enthalpy differences.

    delta_sT_ : DataFrame, optional
        The estimated dimensionless entropy difference between each state.

    d_delta_sT_ : DataFrame
        The estimated statistical uncertainty (one standard deviation) in
        dimensionless entropy differences.

    theta_ : DataFrame
        The theta matrix.

    states_ : list
        Lambda states for which free energy differences were obtained.

    Notes
    -----
    See [Shirts2008]_ for details of the derivation and cite the
    paper when using MBAR in published work.

    See Also
    --------
    pymbar.mbar.MBAR


    .. versionchanged:: 1.0.0
       `delta_f_`, `d_delta_f_`, `states_` are view of the original object.
    .. versionchanged:: 2.0.0
        default value for `method` was changed from "hybr" to "robust"
    .. versionchanged:: 2.1.0
        `n_bootstraps` option added.
    .. versionchanged:: 2.4.0
        Handle initial estimate, initial_f_k, from bar in the instance
        that not all lambda states represented as column headers are
        represented in the indices of u_nk.
    .. versionchanged:: 2.5.0
        Added computation of enthalpy and entropy

    """


[docs]
    def __init__(
        self,
        maximum_iterations: int = 10000,
        relative_tolerance: float = 1.0e-7,
        initial_f_k: np.ndarray | Literal["BAR"] | None = "BAR",
        method: str = "robust",
        n_bootstraps: int = 0,
        verbose: bool = False,
    ) -> None:
        self.maximum_iterations = maximum_iterations
        self.relative_tolerance = relative_tolerance
        if isinstance(initial_f_k, str) and initial_f_k != "BAR":
            raise ValueError(
                f"Only `BAR` is supported as string input to `initial_f_k`. Got ({initial_f_k})."
            )
        else:
            self.initial_f_k = initial_f_k
        self.method = method
        self.verbose = verbose
        self.n_bootstraps = n_bootstraps

        # handle for pymbar.MBAR object
        self._mbar = None



[docs]
    def fit(self, u_nk: pd.DataFrame, compute_entropy_enthalpy: bool = False) -> "MBAR":
        """
        Compute overlap matrix of reduced potentials using multi-state
        Bennett acceptance ratio.

        Parameters
        ----------
        u_nk : DataFrame
            ``u_nk[n, k]`` is the reduced potential energy of uncorrelated
            configuration ``n`` evaluated at state ``k``.

        compute_entropy_enthalpy : bool, optional, default=False
            Compute entropy and enthalpy.

        """
        # sort by state so that rows from same state are in contiguous blocks
        u_nk = u_nk.sort_index(level=u_nk.index.names[1:])  # type: ignore[arg-type]

        groups = u_nk.groupby(level=u_nk.index.names[1:])
        N_k = [
            (
                len(groups.get_group(i if isinstance(i, tuple) else (i,)))  # type: ignore[unreachable]
                if i in groups.groups
                else 0
            )
            for i in u_nk.columns
        ]
        self._states_ = u_nk.columns.values.tolist()

        if isinstance(self.initial_f_k, str) and self.initial_f_k == "BAR":
            bar = BAR(
                maximum_iterations=self.maximum_iterations,
                relative_tolerance=self.relative_tolerance,
                verbose=self.verbose,
            )
            bar.fit(u_nk)
            initial_f_k = bar.delta_f_.iloc[0, :]  # type: ignore[union-attr]
            if len(bar.delta_f_.iloc[0, :]) != len(self._states_):  # type: ignore[union-attr]
                states = [
                    x
                    for i, x in enumerate(self._states_[:-1])
                    if N_k[i] > 0 and N_k[i + 1] > 0
                ]
                initial_f_k = pd.Series(
                    [
                        initial_f_k.loc[x] if x in states else np.nan
                        for x in self._states_
                    ],
                    index=self._states_,
                    dtype=float,
                )
        else:
            initial_f_k = self.initial_f_k

        self._mbar = pymbar.MBAR(
            u_nk.T,
            N_k,
            maximum_iterations=self.maximum_iterations,
            relative_tolerance=self.relative_tolerance,
            verbose=self.verbose,
            initial_f_k=initial_f_k,
            solver_protocol=self.method,
            n_bootstraps=self.n_bootstraps,
        )

        uncertainty_method = None if self.n_bootstraps == 0 else "bootstrap"
        out = self._mbar.compute_free_energy_differences(  # type: ignore[attr-defined]
            return_theta=True, uncertainty_method=uncertainty_method
        )
        if compute_entropy_enthalpy:
            out.update(
                self._mbar.compute_entropy_and_enthalpy(  # type: ignore[attr-defined]
                    uncertainty_method=uncertainty_method
                )
            )

        self._delta_f_ = pd.DataFrame(
            out["Delta_f"],
            columns=self._states_,
            index=self._states_,
        )
        self._d_delta_f_ = pd.DataFrame(
            out["dDelta_f"],
            columns=self._states_,
            index=self._states_,
        )
        self.theta_ = pd.DataFrame(
            out["Theta"],
            columns=self._states_,
            index=self._states_,
        )
        if compute_entropy_enthalpy:
            self._delta_h_ = pd.DataFrame(
                out["Delta_u"],
                columns=self._states_,
                index=self._states_,
            )
            self._d_delta_h_ = pd.DataFrame(
                out["dDelta_u"],
                columns=self._states_,
                index=self._states_,
            )
            self._delta_sT_ = pd.DataFrame(
                out["Delta_s"],
                columns=self._states_,
                index=self._states_,
            )
            self._d_delta_sT_ = pd.DataFrame(
                out["dDelta_s"],
                columns=self._states_,
                index=self._states_,
            )

        self._delta_f_.attrs = u_nk.attrs
        self._d_delta_f_.attrs = u_nk.attrs
        if compute_entropy_enthalpy:
            self._delta_h_.attrs = u_nk.attrs  # type: ignore[union-attr]
            self._d_delta_h_.attrs = u_nk.attrs  # type: ignore[union-attr]
            self._delta_sT_.attrs = u_nk.attrs  # type: ignore[union-attr]
            self._d_delta_sT_.attrs = u_nk.attrs  # type: ignore[union-attr]

        return self


    @property
    def overlap_matrix(self) -> npt.NDArray:
        r"""MBAR overlap matrix.

        The estimated state overlap matrix :math:`O_{ij}` is an estimate of the probability
        of observing a sample from state :math:`i` in state :math:`j`.

        The :attr:`overlap_matrix` is computed on-the-fly. Assign it to a variable if
        you plan to re-use it.

        See Also
        ---------
        pymbar.MBAR.compute_overlap
        """
        return self._mbar.compute_overlap()["matrix"]  # type: ignore[attr-defined, no-any-return]