import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
from .base import _EstimatorMixOut
[docs]
class TI(BaseEstimator, _EstimatorMixOut):
"""Thermodynamic integration (TI).
Parameters
----------
verbose : bool, optional
Set to True if verbose debug output is desired.
Attributes
----------
delta_f_ : DataFrame
The estimated dimensionless free energy difference between each state.
d_delta_f_ : DataFrame
The estimated statistical uncertainty (one standard deviation) in
dimensionless free energy differences.
states_ : list
Lambda states for which free energy differences were obtained.
dhdl : DataFrame
The estimated dhdl of each state.
.. versionchanged:: 1.0.0
`delta_f_`, `d_delta_f_`, `states_` are view of the original object.
"""
[docs]
def __init__(self, verbose: bool = False) -> None:
self.verbose = verbose
[docs]
def fit(self, dHdl: pd.DataFrame) -> "TI":
"""
Compute free energy differences between each state by integrating
dHdl across lambda values.
Parameters
----------
dHdl : DataFrame
dHdl[n,k] is the potential energy gradient with respect to lambda
for each configuration n and lambda k.
"""
# sort by state so that rows from same state are in contiguous blocks,
# and adjacent states are next to each other
dHdl = dHdl.sort_index(level=dHdl.index.names[1:]) # type: ignore[arg-type]
# obtain the mean and variance of the mean for each state
# variance calculation assumes no correlation between points
# used to calculate mean
means = dHdl.groupby(level=dHdl.index.names[1:]).mean()
variances = np.square(dHdl.groupby(level=dHdl.index.names[1:]).sem())
# get the lambda names
l_types = dHdl.index.names[1:]
# obtain vector of delta lambdas between each state
# Fix issue #148, where for pandas == 1.3.0
# dl = means.reset_index()[list(means.index.names[:])].diff().iloc[1:].values
dl = means.reset_index()[means.index.names[:]].diff().iloc[1:].values
# apply trapezoid rule to obtain DF between each adjacent state
deltas = (dl * (means.iloc[:-1].values + means.iloc[1:].values) / 2).sum(axis=1)
# build matrix of deltas between each state
adelta = np.zeros((len(deltas) + 1, len(deltas) + 1))
ad_delta = np.zeros_like(adelta)
for j in range(len(deltas)):
out = []
dout = []
for i in range(len(deltas) - j):
out.append(deltas[i] + deltas[i + 1 : i + j + 1].sum())
# Define additional zero lambda
a = [0.0] * len(l_types)
# Define dl series' with additional zero lambda on the left and right
dll = np.insert(dl[i : i + j + 1], 0, [a], axis=0)
dlr = np.append(dl[i : i + j + 1], [a], axis=0)
# Get a series of the form: x1, x1 + x2, ..., x(n-1) + x(n), x(n)
dllr = dll + dlr
# Append deviation of free energy difference between state i and i+j+1
dout.append(
(dllr**2 * variances.iloc[i : i + j + 2].values / 4) # type: ignore[attr-defined]
.sum(axis=1)
.sum()
)
adelta += np.diagflat(np.array(out), k=j + 1)
ad_delta += np.diagflat(np.array(dout), k=j + 1)
# yield standard delta_f_ free energies between each state
self._delta_f_ = pd.DataFrame(
adelta - adelta.T, columns=means.index.values, index=means.index.values
)
self.dhdl = means
# yield standard deviation d_delta_f_ between each state
self._d_delta_f_ = pd.DataFrame(
np.sqrt(ad_delta + ad_delta.T),
columns=variances.index.values, # type: ignore[attr-defined]
index=variances.index.values, # type: ignore[attr-defined]
)
self._states_ = means.index.values.tolist()
self._delta_f_.attrs = dHdl.attrs
self._d_delta_f_.attrs = dHdl.attrs
self.dhdl.attrs = dHdl.attrs
return self
[docs]
def separate_dhdl(self) -> list[pd.Series]:
"""
For transitions with multiple lambda, the attr:`dhdl` would return
a :class:`~pandas.DataFrame` which gives the dHdl for all the lambda
states, regardless of whether it is perturbed or not. This function
creates a list of :class:`pandas.Series` for each lambda, where each
:class:`pandas.Series` describes the potential energy gradient for the
lambdas state that is perturbed.
Returns
----------
dHdl_list : list
A list of :class:`pandas.Series` such that ``dHdl_list[k]`` is the
potential energy gradient with respect to lambda for each
configuration that lambda k is perturbed.
"""
if len(self.dhdl.index.names) == 1:
name = self.dhdl.columns[0]
return [
self.dhdl[name],
]
dhdl_list = []
# get the lambda names
l_types = self.dhdl.index.names
# obtain bool of changed lambdas between each state
# Fix issue #148, where for pandas == 1.3.0
# lambdas = self.dhdl.reset_index()[list(l_types)]
lambdas = self.dhdl.reset_index()[l_types]
diff = lambdas.diff().to_numpy(dtype="bool")
# diff will give the first row as NaN so need to fix that
diff[0, :] = diff[1, :]
# Make sure that the start point is set to true as well
diff[:-1, :] = diff[:-1, :] | diff[1:, :]
for i in range(len(l_types)):
if any(diff[:, i]):
new = self.dhdl.iloc[diff[:, i], i]
# drop all other index
for l_type in l_types:
if l_type != l_types[i]:
new = new.reset_index(l_type, drop=True)
new.attrs = self.dhdl.attrs
dhdl_list.append(new)
return dhdl_list