Source code for alchemlyb.estimators.ti_gaussian_quadrature_

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator
import numpy.typing as npt
from .. import concat
from .base import _EstimatorMixOut


[docs] class TI_GQ(BaseEstimator, _EstimatorMixOut): """Thermodynamic integration (TI) with gaussian quadrature estimation. Parameters ---------- verbose : bool, optional Set to True if verbose debug output is desired. Attributes ---------- delta_f_ : DataFrame The estimated cumulative free energy from one state to another. d_delta_f_ : DataFrame The estimated statistical uncertainty (one standard deviation) in dimensionless cumulative free energies. states_ : list Lambda states for which free energy estimation were obtained. dhdl : DataFrame The estimated dhdl of each state. .. versionadded:: 2.1.0 """ special_points = { 1: {"lambdas": [0.5], "weights": [1.0]}, 2: {"lambdas": [0.21132, 0.78867], "weights": [0.5, 0.5]}, 3: {"lambdas": [0.1127, 0.5, 0.88729], "weights": [0.27777, 0.44444, 0.27777]}, 4: { "lambdas": [0.06943, 0.33001, 0.66999, 0.93057], "weights": [0.17393, 0.32607, 0.32607, 0.17393], }, 5: { "lambdas": [0.04691, 0.23076, 0.5, 0.76923, 0.95308], "weights": [0.11846, 0.23931, 0.28444, 0.23931, 0.11846], }, 6: { "lambdas": [0.03377, 0.1694, 0.38069, 0.61931, 0.8306, 0.96623], "weights": [0.08566, 0.18038, 0.23396, 0.23396, 0.18038, 0.08566], }, 7: { "lambdas": [0.02544, 0.12923, 0.29707, 0.5, 0.70292, 0.87076, 0.97455], "weights": [0.06474, 0.13985, 0.19091, 0.20897, 0.19091, 0.13985, 0.06474], }, 8: { "lambdas": [ 0.01986, 0.10167, 0.23723, 0.40828, 0.59172, 0.76277, 0.89833, 0.98014, ], "weights": [ 0.05061, 0.11119, 0.15685, 0.18134, 0.18134, 0.15685, 0.11119, 0.05061, ], }, 9: { "lambdas": [ 0.01592, 0.08198, 0.19331, 0.33787, 0.5, 0.66213, 0.80669, 0.91802, 0.98408, ], "weights": [ 0.04064, 0.09032, 0.13031, 0.15617, 0.16512, 0.15617, 0.13031, 0.09032, 0.04064, ], }, 10: { "lambdas": [ 0.01305, 0.06747, 0.1603, 0.2833, 0.42556, 0.57444, 0.7167, 0.8397, 0.93253, 0.98695, ], "weights": [ 0.03334, 0.07473, 0.10954, 0.13463, 0.14776, 0.14776, 0.13463, 0.10954, 0.07473, 0.03334, ], }, 11: { "lambdas": [ 0.01089, 0.05647, 0.13492, 0.24045, 0.36523, 0.5, 0.63477, 0.75955, 0.86508, 0.94353, 0.98911, ], "weights": [ 0.02783, 0.06279, 0.09315, 0.1166, 0.1314, 0.13646, 0.1314, 0.1166, 0.09315, 0.06279, 0.02783, ], }, 12: { "lambdas": [ 0.00922, 0.04794, 0.11505, 0.20634, 0.31608, 0.43738, 0.56262, 0.68392, 0.79366, 0.88495, 0.95206, 0.99078, ], "weights": [ 0.02359, 0.05347, 0.08004, 0.10158, 0.11675, 0.12457, 0.12457, 0.11675, 0.10158, 0.08004, 0.05347, 0.02359, ], }, 13: { "lambdas": [ 0.00791, 0.0412, 0.09921, 0.17883, 0.27575, 0.38477, 0.5, 0.61523, 0.72425, 0.82117, 0.90079, 0.9588, 0.99209, ], "weights": [ 0.02024, 0.04606, 0.06944, 0.08907, 0.10391, 0.11314, 0.11628, 0.11314, 0.10391, 0.08907, 0.06944, 0.04606, 0.02024, ], }, 14: { "lambdas": [ 0.00686, 0.03578, 0.0864, 0.15635, 0.24238, 0.34044, 0.44597, 0.55403, 0.65956, 0.75762, 0.84365, 0.9136, 0.96422, 0.99314, ], "weights": [ 0.01756, 0.04008, 0.06076, 0.0786, 0.09277, 0.1026, 0.10763, 0.10763, 0.1026, 0.09277, 0.0786, 0.06076, 0.04008, 0.01756, ], }, 15: { "lambdas": [ 0.006, 0.03136, 0.0759, 0.13779, 0.21451, 0.30292, 0.3994, 0.5, 0.6006, 0.69708, 0.78549, 0.86221, 0.9241, 0.96864, 0.994, ], "weights": [ 0.01538, 0.03518, 0.05358, 0.06979, 0.08313, 0.09308, 0.09922, 0.10129, 0.09922, 0.09308, 0.08313, 0.06979, 0.05358, 0.03518, 0.01538, ], }, 16: { "lambdas": [ 0.0053, 0.02771, 0.06718, 0.1223, 0.19106, 0.27099, 0.3592, 0.45249, 0.54751, 0.6408, 0.72901, 0.80894, 0.8777, 0.93282, 0.97229, 0.9947, ], "weights": [ 0.01358, 0.03113, 0.04758, 0.06231, 0.0748, 0.08458, 0.0913, 0.09473, 0.09473, 0.0913, 0.08458, 0.0748, 0.06231, 0.04758, 0.03113, 0.01358, ], }, }
[docs] def __init__(self, verbose: bool = False) -> None: self.verbose = verbose
[docs] def fit(self, dHdl: pd.DataFrame) -> "TI_GQ": """ Compute cumulative free energy from one state to another by integrating dHdl across lambda values. Parameters ---------- dHdl : DataFrame dHdl[n,k] is the potential energy gradient with respect to lambda for each configuration n and lambda k. """ # sort by state so that rows from same state are in contiguous blocks, # and adjacent states are next to each other dHdl = dHdl.sort_index(level=dHdl.index.names[1:]) # type: ignore[arg-type] # obtain the mean and variance of the mean for each state # variance calculation assumes no correlation between points # used to calculate mean means = dHdl.groupby(level=dHdl.index.names[1:]).mean() variances = np.square(dHdl.groupby(level=dHdl.index.names[1:]).sem()) weights = [] # check if the lambdas in the simulations match the suggested values lambda_list, means_list, variances_list, index_list = ( self.separate_mean_variance(means, variances) # type: ignore[arg-type] ) for lambdas in lambda_list: num_lambdas = len(lambdas) if num_lambdas not in self.special_points: raise ValueError( f"TI_GQ only supports a set number of lambda windows ({list(self.special_points.keys())}) currently, \ but {num_lambdas} lambda windows are given." ) suggested_lambdas = self.special_points[num_lambdas]["lambdas"] if not np.allclose(lambdas, suggested_lambdas, rtol=0.1): raise ValueError( f"lambda values, {suggested_lambdas}, are expected, but {lambdas} are given. Please use trapezoidal rule instead." ) weights.extend(self.special_points[num_lambdas]["weights"]) # means_new and variances_new are similar to means and variances, but with only values relevant to each lambda type (for multi-lambda situation) means_new = concat(means_list) mean_values = means_new.to_numpy() variances_new = concat(variances_list) variance_values = variances_new.to_numpy() # apply gaussian quadrature multiplication at each lambda state deltas = weights * mean_values deltas = np.insert(deltas, 0, [0.0], axis=0) # type: ignore[assignment] deltas = np.append(deltas, [0.0], axis=0) # type: ignore[assignment] d_deltas_squared = np.square(weights) * variance_values d_deltas_squared = np.insert(d_deltas_squared, 0, [0.0], axis=0) d_deltas_squared = np.append(d_deltas_squared, [0.0], axis=0) # build matrix of deltas between each state adelta = np.zeros((len(deltas), len(deltas)), dtype=float) ad_delta = np.zeros_like(adelta) for j in range(len(deltas)): out = [] dout = [] for i in range(len(deltas) - j): # Append cumulative free energy value from state i to i+j out.append(deltas[i] + deltas[i + 1 : i + j + 1].sum()) # type: ignore[attr-defined] # Append cumulative squared deviation of free energy from state i to i+j dout.append( d_deltas_squared[i] + d_deltas_squared[i + 1 : i + j + 1].sum() ) adelta += np.diagflat(np.array(out), k=j) ad_delta += np.diagflat(np.array(dout), k=j) adelta = adelta - adelta.T ad_delta = (ad_delta + ad_delta.T) - 2 * np.diagflat(d_deltas_squared) # yield standard delta_f_ cumulative free energies from one state to another self._delta_f_ = pd.DataFrame(adelta, columns=index_list, index=index_list) # yield standard deviation d_delta_f_ between each state self._d_delta_f_ = pd.DataFrame( np.sqrt(ad_delta), columns=index_list, index=index_list, ) self.dhdl = means self.dhdl.attrs = dHdl.attrs self._states_ = means_new.index.values.tolist() self._delta_f_.attrs = dHdl.attrs self._d_delta_f_.attrs = dHdl.attrs return self
[docs] @staticmethod def separate_mean_variance( means: pd.DataFrame, variances: pd.DataFrame ) -> tuple[ list[pd.Index], list[pd.Series], list[pd.Series], list[float | tuple[float, ...]], ]: """ For transitions with multiple lambda, the :attr:`dhdl` would return a :class:`~pandas.DataFrame` which gives the dHdl for all the lambda states, regardless of whether it is perturbed or not. This function creates 3 lists of :class:`numpy.ndarray`, :class:`pandas.Series` and :class:`pandas.Series` for each lambda, where the lists describe the lambda values, potential energy gradient and variance values for the lambdas state that is perturbed. Parameters ---------- means: DataFrame means is the average potential energy gradient at each lambda. variances: DataFrame variances is variance of the potential energy gradient at each lambda. Returns ---------- lambda_list : list A list of :class:`numpy.array` such that ``lambda_list[k]`` is the lambda values with respect to each type of lambda. dhdl_list : list A list of :class:`pandas.Series` such that ``dHdl_list[k]`` is the potential energy gradient with respect to lambda for each configuration that lambda k is perturbed. variance_list : list A list of :class:`pandas.Series` such that ``variance_list[k]`` is the variance of the potential energy gradient with respect to lambda for each configuration that lambda k is perturbed. index_list : list A list of :class:`float` or :class:`tuple` such that each :class:`float` or :class:`tuple` is the index of the final `delta_f_` and `d_delta_f_` """ lambda_list: list[npt.NDArray] = [] dhdl_list: list[pd.Series] = [] variance_list: list[pd.Series] = [] index_list: list[float | tuple[float]] = [] # get the lambda names l_types = means.index.names # get the lambda vaules lambdas = means.reset_index()[means.index.names].values for i in range(len(l_types)): # obtain the lambda points between 0.0 and 1.0 l_masks = (0.0 < lambdas[:, i]) & (lambdas[:, i] < 1.0) if not l_masks.any(): continue new_means = means.iloc[l_masks, i] new_variances = variances.iloc[l_masks, i] index_list.extend(new_means.index) # for multi-lambda case, extract the relevant column for l_type in l_types: if l_type != l_types[i]: new_means = new_means.reset_index(l_type, drop=True) new_variances = new_variances.reset_index(l_type, drop=True) new_means.attrs = means.attrs new_variances.attrs = variances.attrs lambda_list.append(new_means.index) # type: ignore[arg-type] dhdl_list.append(new_means) variance_list.append(new_variances) # add two end states at all lambda zeros and ones if len(l_types) == 1: index_list = [0.0] + index_list + [1.0] else: index_list = ( [tuple([0.0] * len(l_types))] # type: ignore[assignment] + index_list + [tuple([1.0] * len(l_types))] ) return lambda_list, dhdl_list, variance_list, index_list # type: ignore[return-value]