Source code for alchemlyb.parsing.gomc

"""Parsers for extracting alchemical data from `GOMC <http://gomc.eng.wayne.edu/>`_ output files."""

import pandas as pd

from . import _init_attrs
from .util import anyopen
from ..postprocessors.units import R_kJmol

k_b = R_kJmol



[docs]
@_init_attrs
def extract_u_nk(filename: str, T: float) -> pd.DataFrame:
    """Return reduced potentials `u_nk` from a Hamiltonian differences dat file.

    Parameters
    ----------
    filename : str
        Path to free energy file to extract data from.
    T : float
        Temperature in Kelvin at which the simulation was sampled.

    Returns
    -------
    u_nk : :class:`pandas.DataFrame`
        Potential energy for each alchemical state (k) for each frame (n).


    .. versionchanged:: 0.5.0
        The :mod:`scipy.constants` is used for parsers instead of
        the constants used by the corresponding MD engine.

    """

    h_col_match = "DelE"
    pv_col_match = "PV"
    u_col_match = ["Total_En"]
    beta = 1 / (k_b * T)

    state, lambdas, statevec = _extract_state(filename)

    # extract a DataFrame from free energy file data
    df = _extract_dataframe(filename)

    times = df[df.columns[0]]

    # want to grab only dH columns
    DHcols = [col for col in df.columns if (h_col_match in col)]
    dH = df[DHcols]

    # GOMC also gives us pV directly; need this for reduced potential
    pv_cols = [col for col in df.columns if (pv_col_match in col)]
    pv = None
    if pv_cols:
        pv = df[pv_cols[0]]

    # GOMC also gives us total energy U directly; need this for reduced potential
    u_cols = [
        col
        for col in df.columns
        if any(single_u_col_match in col for single_u_col_match in u_col_match)
    ]
    u = None
    if u_cols:
        u = df[u_cols[0]]

    u_k = dict()
    cols = list()
    for col in dH:
        u_col = eval(col.split("->")[1][:-1])  # type: ignore[attr-defined]
        # calculate reduced potential u_k = dH + pV + U
        u_k[u_col] = beta * dH[col].values
        if pv_cols:
            u_k[u_col] += beta * pv.values  # type: ignore[union-attr]
        if u_cols:
            u_k[u_col] += beta * u.values  # type: ignore[union-attr]
        cols.append(u_col)

    u_k = pd.DataFrame(
        u_k, columns=cols, index=pd.Index(times.values, name="time", dtype="Float64")
    )  # type: ignore[assignment]

    # Need to modify the lambda name
    cols = [lambda_value + "-lambda" for lambda_value in lambdas]
    # create columns for each lambda, indicating state each row sampled from
    for i, lambda_value in enumerate(cols):
        u_k[lambda_value] = statevec[i]

    # set up new multi-index
    newind = ["time"] + cols
    u_k = u_k.reset_index().set_index(newind)  # type: ignore[attr-defined]

    u_k.name = "u_nk"

    return u_k  # type: ignore[no-any-return]




[docs]
@_init_attrs
def extract_dHdl(filename: str, T: float) -> pd.DataFrame:
    """Return gradients `dH/dl` from a Hamiltonian differences free energy file.

    Parameters
    ----------
    filename : str
        Path to free energy file to extract data from.
    T : float
        Temperature in Kelvin at which the simulation was sampled.

    Returns
    -------
    dH/dl : :class:`pandas.Series`
        dH/dl as a function of step for this lambda window.


    .. versionchanged:: 0.5.0
        The :mod:`scipy.constants` is used for parsers instead of
        the constants used by the corresponding MD engine.

    """
    beta = 1 / (k_b * T)

    state, lambdas, statevec = _extract_state(filename)

    # extract a DataFrame from free energy data
    df = _extract_dataframe(filename)

    times = df[df.columns[0]]

    # want to grab only dH/dl columns
    dHcols = []
    for lambda_value in lambdas:
        dHcols.extend([col for col in df.columns if (lambda_value in col)])

    dHdl = df[dHcols]

    # make dimensionless
    dHdl *= beta

    dHdl = pd.DataFrame(
        dHdl.values,
        columns=lambdas,
        index=pd.Index(times.values, name="time", dtype="Float64"),
    )

    # Need to modify the lambda name
    cols = [lambda_value + "-lambda" for lambda_value in lambdas]
    # create columns for each lambda, indicating state each row sampled from
    for i, lambda_value in enumerate(cols):
        dHdl[lambda_value] = statevec[i]

    # set up new multi-index
    newind = ["time"] + cols
    dHdl = dHdl.reset_index().set_index(newind)

    dHdl.name = "dH/dl"

    return dHdl




[docs]
def extract(filename: str, T: float) -> dict[str, pd.DataFrame | None]:
    r"""Return reduced potentials `u_nk` and gradients `dH/dl`
    from a Hamiltonian differences free energy file.

    Parameters
    ----------
    xvg : str
        Path to free energy file to extract data from.
    T : float
        Temperature in Kelvin the simulations sampled.
    filter : bool
        Filter out the lines that cannot be parsed.
        Such as rows with incorrect number of Columns and incorrectly
        formatted numbers (e.g. 123.45.67, nan or -).

    Returns
    -------
    :class:`dict`
        A dictionary with keys of 'u_nk', which is a :class:`~pandas.DataFrame` of
        potential energy for each alchemical state (k) for each frame (n),
        and 'dHdl', which is a :class:`~pandas.Series` of dH/dl
        as a function of time for this lambda window.


    .. versionadded:: 1.0.0
    """

    return {"u_nk": extract_u_nk(filename, T), "dHdl": extract_dHdl(filename, T)}



def _extract_state(filename: str) -> tuple[int, list[str], list[float]]:
    """Extract information on state sampled, names of lambdas."""
    state = None
    with anyopen(filename, "r") as f:  # type: ignore[arg-type]
        for line in f:
            if ("#" in line) and ("State" in line):
                state = int(line.split("State")[1].split(":")[0])
                # GOMC always print these two fields
                lambdas = ["Coulomb", "VDW"]
                statevec = eval(line.strip().split(" = ")[-1])
                break

    return state, lambdas, statevec  # type: ignore[return-value]


def _extract_dataframe(filename: str) -> pd.DataFrame:
    """Extract a DataFrame from free energy data."""
    dh_col_match = "dU/dL"
    h_col_match = "DelE"
    pv_col_match = "PV"
    u_col_match = "Total_En"

    xaxis = "time"
    with anyopen(filename, "r") as f:  # type: ignore[arg-type]
        names = []
        rows = []
        for line in f:
            line = line.strip()
            if len(line) == 0:
                # avoid parsing empty line
                continue
            elif line.startswith("#T"):
                # this line has state information. No need to be parsed
                continue
            elif line.startswith("#Steps"):
                # read the headers
                elements = line.split()
                for i, element in enumerate(elements):
                    if element.startswith(u_col_match):
                        names.append(element)
                    elif element.startswith(dh_col_match):
                        names.append(element)
                    elif element.startswith(h_col_match):
                        names.append(element)
                    elif element.startswith(pv_col_match):
                        names.append(element)
            else:
                # parse line as floats
                row = map(float, line.split())
                rows.append(row)

    cols = [xaxis]
    cols.extend(names)

    return pd.DataFrame(rows, columns=cols)