Source code for alchemlyb.parsing.gomc

"""Parsers for extracting alchemical data from `GOMC <http://gomc.eng.wayne.edu/>`_ output files."""

import pandas as pd

from . import _init_attrs
from .util import anyopen
from ..postprocessors.units import R_kJmol

k_b = R_kJmol


[docs] @_init_attrs def extract_u_nk(filename: str, T: float) -> pd.DataFrame: """Return reduced potentials `u_nk` from a Hamiltonian differences dat file. Parameters ---------- filename : str Path to free energy file to extract data from. T : float Temperature in Kelvin at which the simulation was sampled. Returns ------- u_nk : :class:`pandas.DataFrame` Potential energy for each alchemical state (k) for each frame (n). .. versionchanged:: 0.5.0 The :mod:`scipy.constants` is used for parsers instead of the constants used by the corresponding MD engine. """ h_col_match = "DelE" pv_col_match = "PV" u_col_match = ["Total_En"] beta = 1 / (k_b * T) state, lambdas, statevec = _extract_state(filename) # extract a DataFrame from free energy file data df = _extract_dataframe(filename) times = df[df.columns[0]] # want to grab only dH columns DHcols = [col for col in df.columns if (h_col_match in col)] dH = df[DHcols] # GOMC also gives us pV directly; need this for reduced potential pv_cols = [col for col in df.columns if (pv_col_match in col)] pv = None if pv_cols: pv = df[pv_cols[0]] # GOMC also gives us total energy U directly; need this for reduced potential u_cols = [ col for col in df.columns if any(single_u_col_match in col for single_u_col_match in u_col_match) ] u = None if u_cols: u = df[u_cols[0]] u_k = dict() cols = list() for col in dH: u_col = eval(col.split("->")[1][:-1]) # type: ignore[attr-defined] # calculate reduced potential u_k = dH + pV + U u_k[u_col] = beta * dH[col].values if pv_cols: u_k[u_col] += beta * pv.values # type: ignore[union-attr] if u_cols: u_k[u_col] += beta * u.values # type: ignore[union-attr] cols.append(u_col) u_k = pd.DataFrame( u_k, columns=cols, index=pd.Index(times.values, name="time", dtype="Float64") ) # type: ignore[assignment] # Need to modify the lambda name cols = [lambda_value + "-lambda" for lambda_value in lambdas] # create columns for each lambda, indicating state each row sampled from for i, lambda_value in enumerate(cols): u_k[lambda_value] = statevec[i] # set up new multi-index newind = ["time"] + cols u_k = u_k.reset_index().set_index(newind) # type: ignore[attr-defined] u_k.name = "u_nk" return u_k # type: ignore[no-any-return]
[docs] @_init_attrs def extract_dHdl(filename: str, T: float) -> pd.DataFrame: """Return gradients `dH/dl` from a Hamiltonian differences free energy file. Parameters ---------- filename : str Path to free energy file to extract data from. T : float Temperature in Kelvin at which the simulation was sampled. Returns ------- dH/dl : :class:`pandas.Series` dH/dl as a function of step for this lambda window. .. versionchanged:: 0.5.0 The :mod:`scipy.constants` is used for parsers instead of the constants used by the corresponding MD engine. """ beta = 1 / (k_b * T) state, lambdas, statevec = _extract_state(filename) # extract a DataFrame from free energy data df = _extract_dataframe(filename) times = df[df.columns[0]] # want to grab only dH/dl columns dHcols = [] for lambda_value in lambdas: dHcols.extend([col for col in df.columns if (lambda_value in col)]) dHdl = df[dHcols] # make dimensionless dHdl *= beta dHdl = pd.DataFrame( dHdl.values, columns=lambdas, index=pd.Index(times.values, name="time", dtype="Float64"), ) # Need to modify the lambda name cols = [lambda_value + "-lambda" for lambda_value in lambdas] # create columns for each lambda, indicating state each row sampled from for i, lambda_value in enumerate(cols): dHdl[lambda_value] = statevec[i] # set up new multi-index newind = ["time"] + cols dHdl = dHdl.reset_index().set_index(newind) dHdl.name = "dH/dl" return dHdl
[docs] def extract(filename: str, T: float) -> dict[str, pd.DataFrame | None]: r"""Return reduced potentials `u_nk` and gradients `dH/dl` from a Hamiltonian differences free energy file. Parameters ---------- xvg : str Path to free energy file to extract data from. T : float Temperature in Kelvin the simulations sampled. filter : bool Filter out the lines that cannot be parsed. Such as rows with incorrect number of Columns and incorrectly formatted numbers (e.g. 123.45.67, nan or -). Returns ------- :class:`dict` A dictionary with keys of 'u_nk', which is a :class:`~pandas.DataFrame` of potential energy for each alchemical state (k) for each frame (n), and 'dHdl', which is a :class:`~pandas.Series` of dH/dl as a function of time for this lambda window. .. versionadded:: 1.0.0 """ return {"u_nk": extract_u_nk(filename, T), "dHdl": extract_dHdl(filename, T)}
def _extract_state(filename: str) -> tuple[int, list[str], list[float]]: """Extract information on state sampled, names of lambdas.""" state = None with anyopen(filename, "r") as f: # type: ignore[arg-type] for line in f: if ("#" in line) and ("State" in line): state = int(line.split("State")[1].split(":")[0]) # GOMC always print these two fields lambdas = ["Coulomb", "VDW"] statevec = eval(line.strip().split(" = ")[-1]) break return state, lambdas, statevec # type: ignore[return-value] def _extract_dataframe(filename: str) -> pd.DataFrame: """Extract a DataFrame from free energy data.""" dh_col_match = "dU/dL" h_col_match = "DelE" pv_col_match = "PV" u_col_match = "Total_En" xaxis = "time" with anyopen(filename, "r") as f: # type: ignore[arg-type] names = [] rows = [] for line in f: line = line.strip() if len(line) == 0: # avoid parsing empty line continue elif line.startswith("#T"): # this line has state information. No need to be parsed continue elif line.startswith("#Steps"): # read the headers elements = line.split() for i, element in enumerate(elements): if element.startswith(u_col_match): names.append(element) elif element.startswith(dh_col_match): names.append(element) elif element.startswith(h_col_match): names.append(element) elif element.startswith(pv_col_match): names.append(element) else: # parse line as floats row = map(float, line.split()) rows.append(row) cols = [xaxis] cols.extend(names) return pd.DataFrame(rows, columns=cols)