Source code for alchemlyb.parsing.gmx

"""Parsers for extracting alchemical data from `Gromacs <http://www.gromacs.org/>`_ output files."""

import numpy as np
import pandas as pd
from typing import Any

from . import _init_attrs
from .util import anyopen
from ..postprocessors.units import R_kJmol

k_b = R_kJmol



[docs]
@_init_attrs
def extract_u_nk(xvg: str, T: float, filter: bool = True) -> pd.DataFrame:
    r"""Return reduced potentials `u_nk` from a Hamiltonian differences XVG file.

    Parameters
    ----------
    xvg : str
        Path to XVG file to extract data from.
    T : float
        Temperature in Kelvin the simulations sampled.
    filter : bool
        Filter out the lines that cannot be parsed.
        Such as rows with incorrect number of Columns and incorrectly
        formatted numbers (e.g. 123.45.67, nan or -).

    Returns
    -------
    u_nk : :class:`pandas.DataFrame`
        Potential energy for each alchemical state (k) for each frame (n).


    Note
    -----
    Previous versions of alchemlyb (<0.5.0) used the `GROMACS value of the
    molar gas constant
    <https://manual.gromacs.org/documentation/2019/reference-manual/definitions.html>`_
    of :math:`R = 8.3144621 \times 10^{−3}\,
    \text{kJ}\cdot\text{mol}^{-1}\cdot\text{K}^{-1}` instead of the scipy value
    :data:`scipy.constants.R` in :mod:`scipy.constants` (see
    :mod:`alchemlyb.postprocessors.units`).  The relative difference between
    the two values is :math:`6 \times 10^{-8}`.

    Therefore, results in :math:`kT` for GROMACS data will differ between
    alchemlyb ≥0.5.0 and previous versions; the relative difference is on the
    order of :math:`10^{-7}` for typical cases.


    .. versionchanged:: 0.5.0
        The :mod:`scipy.constants` is used for parsers instead of
        the constants used by the corresponding MD engine.
        This leads to slightly different results for GROMACS input compared to
        previous versions of alchemlyb.

    .. versionchanged:: 0.7.0
        The keyword filter is implemented to ignore the line that cannot be
        parsed and is turned on by default.

    """

    h_col_match = r"\xD\f{}H \xl\f{}"
    pv_col_match = "pV"
    u_col_match = ["Total Energy", "Potential Energy"]
    beta = 1 / (k_b * T)

    state, lambdas, statevec = _extract_state(xvg)

    # extract a DataFrame from XVG data
    df = _extract_dataframe(xvg, filter=filter)

    times = df[df.columns[0]]

    # want to grab only dH columns
    DHcols = [col for col in df.columns if (h_col_match in col)]
    dH = df[DHcols]

    # gromacs also gives us pV directly; need this for reduced potential
    pv_cols = [col for col in df.columns if (pv_col_match in col)]
    pv = None
    if pv_cols:
        pv = df[pv_cols[0]]

    # gromacs also gives us total/potential energy U directly; need this for reduced potential
    u_cols = [
        col
        for col in df.columns
        if any(single_u_col_match in col for single_u_col_match in u_col_match)
    ]
    u = None
    if u_cols:
        u = df[u_cols[0]]

    u_k = dict()
    cols = list()
    for col in dH:
        u_col = eval(col.split("to")[1])  # type: ignore[attr-defined]
        # calculate reduced potential u_k = dH + pV + U
        u_k[u_col] = beta * dH[col].values
        if pv_cols:
            u_k[u_col] += beta * pv.values  # type: ignore[union-attr]
        if u_cols:
            u_k[u_col] += beta * u.values  # type: ignore[union-attr]
        cols.append(u_col)

    u_k = pd.DataFrame(
        u_k, columns=cols, index=pd.Index(times.values, name="time", dtype="Float64")
    )  # type: ignore[assignment]

    # create columns for each lambda, indicating state each row sampled from
    # if state is None run as expanded ensemble data or REX
    if state is None:
        # if thermodynamic state is specified map thermodynamic
        # state data to lambda values, else (for REX)
        # define state based on the legend
        if "Thermodynamic state" in df:
            ts_index = df.columns.get_loc("Thermodynamic state")
            thermo_state = df[df.columns[ts_index]]
            for i, lambda_value in enumerate(lambdas):
                v = []
                for t in thermo_state:
                    v.append(statevec[int(t)][i])  # type: ignore[index, call-overload]
                u_k[lambda_value] = v
        else:
            state_legend = _extract_legend(xvg)
            for i, state_legend_item in enumerate(state_legend):
                u_k[state_legend_item] = state_legend[state_legend_item]
    else:
        for i, lambda_value in enumerate(lambdas):
            try:
                u_k[lambda_value] = statevec[i]
            except TypeError:
                u_k[lambda_value] = statevec

    # set up new multi-index
    newind = ["time"] + lambdas
    u_k = u_k.reset_index().set_index(newind)  # type: ignore[attr-defined]

    u_k.name = "u_nk"

    return u_k  # type: ignore[no-any-return]




[docs]
@_init_attrs
def extract_dHdl(xvg: str, T: float, filter: bool = True) -> pd.DataFrame:
    r"""Return gradients `dH/dl` from a Hamiltonian differences XVG file.

    Parameters
    ----------
    xvg : str
        Path to XVG file to extract data from.
    T : float
        Temperature in Kelvin the simulations sampled.
    filter : bool
        Filter out the lines that cannot be parsed.
        Such as rows with incorrect number of Columns and incorrectly
        formatted numbers (e.g. 123.45.67, nan or -).

    Returns
    -------
    dH/dl : :class:`pandas.Series`
        dH/dl as a function of time for this lambda window.

    Note
    -----
    Previous versions of alchemlyb (<0.5.0) used the `GROMACS value of the
    molar gas constant
    <https://manual.gromacs.org/documentation/2019/reference-manual/definitions.html>`_
    of :math:`R = 8.3144621 \times 10^{−3}\,
    \text{kJ}\cdot\text{mol}^{-1}\cdot\text{K}^{-1}` instead of the scipy value
    :data:`scipy.constants.R` in :mod:`scipy.constants` (see
    :mod:`alchemlyb.postprocessors.units`).  The relative difference between
    the two values is :math:`6 \times 10^{-8}`.

    Therefore, results in :math:`kT` for GROMACS data will differ between
    alchemlyb ≥0.5.0 and previous versions; the relative difference is on the
    order of :math:`10^{-7}` for typical cases.


    .. versionchanged:: 0.5.0
        The :mod:`scipy.constants` is used for parsers instead of
        the constants used by the corresponding MD engine.
        This leads to slightly different results for GROMACS input compared to
        previous versions of alchemlyb.

    .. versionchanged:: 0.7.0
        The keyword filter is implemented to ignore the line that cannot be
        parsed and is turned on by default.

    """
    beta = 1 / (k_b * T)

    headers = _get_headers(xvg)
    state, lambdas, statevec = _extract_state(xvg, headers)

    # extract a DataFrame from XVG data
    df = _extract_dataframe(xvg, headers, filter=filter)

    times = df[df.columns[0]]

    # want to grab only dH/dl columns
    dHcols = []
    for lambda_value in lambdas:
        dHcols.extend([col for col in df.columns if (lambda_value in col)])

    dHdl = df[dHcols]

    # make dimensionless
    dHdl = beta * dHdl

    # rename columns to not include the word 'lambda', since we use this for
    # index below
    cols = [lambda_value.split("-")[0] for lambda_value in lambdas]

    dHdl = pd.DataFrame(
        dHdl.values,
        columns=cols,
        index=pd.Index(times.values, name="time", dtype="Float64"),
    )

    # create columns for each lambda, indicating state each row sampled from
    # if state is None run as expanded ensemble data or REX
    if state is None:
        # if thermodynamic state is specified map thermodynamic
        # state data to lambda values, else (for REX)
        # define state based on the legend
        if "Thermodynamic state" in df:
            ts_index = df.columns.get_loc("Thermodynamic state")
            thermo_state = df[df.columns[ts_index]]
            for i, lambda_value in enumerate(lambdas):
                v = []
                for t in thermo_state:
                    v.append(statevec[int(t)][i])  # type: ignore[index, call-overload]
                dHdl[lambda_value] = v
        else:
            state_legend = _extract_legend(xvg)
            for i, state_legend_item in enumerate(state_legend):
                dHdl[state_legend_item] = state_legend[state_legend_item]
    else:
        for i, lambda_value in enumerate(lambdas):
            try:
                dHdl[lambda_value] = statevec[i]
            except TypeError:
                dHdl[lambda_value] = statevec

    # set up new multi-index
    newind = ["time"] + lambdas
    dHdl = dHdl.reset_index().set_index(newind)

    dHdl.name = "dH/dl"  # type: ignore[attr-defined]

    return dHdl




[docs]
def extract(xvg: str, T: float, filter: bool = True) -> dict[str, pd.DataFrame | None]:
    r"""Return reduced potentials `u_nk` and gradients `dH/dl`
    from a Hamiltonian differences XVG file.

    Parameters
    ----------
    xvg : str
        Path to XVG file to extract data from.
    T : float
        Temperature in Kelvin the simulations sampled.
    filter : bool
        Filter out the lines that cannot be parsed.
        Such as rows with incorrect number of Columns and incorrectly
        formatted numbers (e.g. 123.45.67, nan or -).

    Returns
    -------
    :class:`dict`
        A dictionary with keys of 'u_nk', which is a :class:`~pandas.DataFrame` of
        potential energy for each alchemical state (k) for each frame (n),
        and 'dHdl', which is a :class:`~pandas.Series` of dH/dl
        as a function of time for this lambda window.


    .. versionadded:: 1.0.0
    """

    return {"u_nk": extract_u_nk(xvg, T, filter), "dHdl": extract_dHdl(xvg, T, filter)}



def _extract_state(
    xvg: str, headers: None | dict[str, Any] = None
) -> tuple[None | int, list[str], list[float]]:
    """Extract information on state sampled, names of lambdas.

    Parameters
    ----------
    xvg : str
        Path to XVG file to extract data from.
    headers: dict
       headers dictionary to search header information, reduced I/O by
       reusing if it is already parsed, e.g. _extract_state and
       _extract_dataframe in order need one-time header parsing

    """
    state = None
    if headers is None:
        headers = _get_headers(xvg)
    subtitle = _get_value_by_key(headers, "subtitle")
    if subtitle and "state" in subtitle:
        state = int(subtitle.split("state")[1].split(":")[0])
        lambdas = [word.strip(")(,") for word in subtitle.split() if "lambda" in word]
        statevec = eval(subtitle.strip().split(" = ")[-1].strip('"'))

    # if expanded ensemble data is used the state variable will never be assigned
    # parsing expanded ensemble data
    if state is None:
        lambdas = []
        statevec = []
        for line in headers["_raw_lines"]:
            if ("legend" in line) and ("lambda" in line):
                lambdas.append(
                    [word.strip(")(,") for word in line.split() if "lambda" in word][0]
                )
            if ("legend" in line) and (" to " in line):
                statevec.append(
                    (
                        [
                            float(i)
                            for i in line.strip()
                            .split(" to ")[-1]
                            .strip('"()')
                            .split(",")
                        ]
                    )
                )

    return state, lambdas, statevec


def _extract_legend(xvg: str) -> dict[str, float]:
    """Extract information on state sampled for REX simulations."""
    state_legend = {}
    with anyopen(xvg, "r") as f:  # type: ignore[arg-type]
        for line in f:
            if ("legend" in line) and ("lambda" in line):
                state_legend[line.split()[4]] = float(line.split()[6].strip('"'))

    return state_legend


def _extract_dataframe(
    xvg: str, headers: None | dict[str, Any] = None, filter: bool = True
) -> pd.DataFrame:
    """Extract a DataFrame from XVG data using Pandas `read_csv()`.

    pd.read_csv() shows the same behavior building pandas Dataframe with better
    performance (approx. 2 to 4 times speed up). See Issue #81.

    Parameters
    ----------
    xvg: str
       Path to XVG file to extract data from.
    headers: dict
       headers dictionary to search header information, reduced I/O by
       reusing if it is already parsed. Direct access by key name
    filter : bool
        Filter out the lines that cannot be parsed.
        Such as rows with incorrect number of Columns and incorrectly
        formatted numbers (e.g. 123.45.67, nan or -).

    """
    if headers is None:
        headers = _get_headers(xvg)

    xaxis = _get_value_by_key(headers, "xaxis", "label")
    names = [
        _get_value_by_key(headers, "s{}".format(x), "legend")
        for x in range(len(headers))
        if "s{}".format(x) in headers
    ]
    cols = [xaxis] + names

    # march through column names, mark duplicates when found
    cols = [
        col + "{}[duplicated]".format(i) if col in cols[:i] else col  # type: ignore[operator]
        for i, col in enumerate(cols)
    ]

    header_cnt = len(headers["_raw_lines"])
    if not filter:
        # assumes clean files
        df = pd.read_csv(
            xvg,
            sep=r"\s+",
            header=None,
            skiprows=header_cnt,
            na_filter=True,
            memory_map=True,
            names=cols,
            dtype=np.float64,
            float_precision="high",
        )
    else:
        df = pd.read_csv(
            xvg,
            sep=r"\s+",
            header=None,
            skiprows=header_cnt,
            memory_map=True,
            on_bad_lines="skip",
        )
        # If names=cols is passed to read_csv, rows with more than the
        # designated columns will be truncated and used instead of discarded.
        df.rename(columns={i: name for i, name in enumerate(cols)}, inplace=True)
        # If dtype=np.float64 and float_precision='high' are passed to read_csv,
        # 12.345.56 and - cannot be read.
        df = df.apply(pd.to_numeric, errors="coerce")
        # drop duplicate
        df.dropna(inplace=True)

    # drop duplicated columns (see PR #86 https://github.com/alchemistry/alchemlyb/pull/86/)
    df = df[df.columns[~df.columns.str.endswith("[duplicated]")]]

    return df


def _parse_header(line: str, headers: dict[str, Any] = {}, depth: int = 2) -> None:
    """Build python dictionary for single line header

    Update python dictionary to ``headers`` by reading ``line`` separated by
    whitespace. If ``depth`` is given, at most ``depth`` nested key value store
    is added. `_val` key is reserved which contain remaining words from
    ``line``.

    Note
    ----
    No return value but 'headers' dictionary will be updated.

    Parameters
    ----------

    line: str
        header line to parse
    headers: dict
        headers dictionary to update, pass by reference
    depth: int
        depth of nested key and value store

    Examples
    --------
    "x y z" line turns into { 'x': { 'y': {'_val': 'z' }}}

    >>> headers={}
    >>> _parse_header('@ s0 legend "Potential Energy (kJ/mol)"', headers)
    >>> headers
    {'s0': {'legend': {'_val': 'Potential Energy (kJ/mol)'}}}

    """
    # Remove a first character, i.e. @
    s = line[1:].split(None, 1)
    next_t = headers[s[0]] = {}
    for i in range(1, depth):
        # ord('"') == 34
        # no further parsing for quoted value
        if len(s) > 1 and s[1][0] != '"':
            s = s[1].split(None, 1)
            next_t[s[0]] = {}
            next_t = next_t[s[0]]
        else:
            break

    next_t["_val"] = "".join(s[1:]).rstrip().strip('"')


def _get_headers(xvg: str) -> dict[str, Any]:
    """Build python dictionary from header lines

    Build nested key and value store by reading header ('@') lines from a file.
    Direct access to value provides reduced time complexity O(1).
    `_raw_lines` key is reserved to keep the original text.

    Example
    -------

    Given a xvg file containinig header lines like:

        ...
       @    title "dH/d\\xl\\f{} and \\xD\\f{}H"
       @    xaxis  label "Time (ps)"
       @    yaxis  label "dH/d\\xl\\f{} and \\xD\\f{}H (kJ/mol [\\xl\\f{}]\\S-1\\N)"
       @TYPE xy
       @ subtitle "T = 310 (K) \\xl\\f{} state 38: (coul-lambda, vdw-lambda) = (0.9500, 0.0000)"
       @ view 0.15, 0.15, 0.75, 0.85
       @ legend on
       @ legend box on
       @ legend loctype view
       @ legend 0.78, 0.8
       @ legend length 2
       @ s0 legend "Potential Energy (kJ/mol)"
       @ s1 legend "dH/d\\xl\\f{} coul-lambda = 0.9500"
       @ s2 legend "dH/d\\xl\\f{} vdw-lambda = 0.0000"
       ...

    >>> _get_headers(xvg)
    {'TYPE': {'xy': {'_val': ''}},
      'subtitle': {'_val': 'T = 310 (K) \\xl\\f{} state 38: (coul-lambda, vdw-lambda) = (0.9500, 0.0000)'},
      'title': {'_val': 'dH/d\\xl\\f{} and \\xD\\f{}H'},
      'view': {'0.15,': {'_val': '0.15, 0.75, 0.85'}},
      'xaxis': {'label': {'_val': 'Time (ps)'}},
      'yaxis': {'label': {'_val': 'dH/d\\xl\\f{} and \\xD\\f{}H (kJ/mol [\\xl\\f{}]\\S-1\\N)'}},
      ...(omitted)...
      '_raw_lines': ['@    title "dH/d\\xl\\f{} and \\xD\\f{}H"',
                    '@    xaxis  label "Time (ps)"',
                    '@    yaxis  label "dH/d\\xl\\f{} and \\xD\\f{}H (kJ/mol [\\xl\\f{}]\\S-1\\N)"',
                    '@TYPE xy',
                    '@ subtitle "T = 310 (K) \\xl\\f{} state 38: (coul-lambda, vdw-lambda) = (0.9500, 0.0000)"',
                    '@ view 0.15, 0.15, 0.75, 0.85',
                    '@ legend on',
                    '@ legend box on',
                    '@ legend loctype view',
                    '@ legend 0.78, 0.8',
                    '@ legend length 2',
                    '@ s0 legend "Potential Energy (kJ/mol)"',
                    '@ s1 legend "dH/d\\xl\\f{} coul-lambda = 0.9500"',
                    '@ s2 legend "dH/d\\xl\\f{} vdw-lambda = 0.0000"'],
      }

    Returns
    -------
    headers: dict

    """
    with anyopen(xvg, "r") as f:  # type: ignore[arg-type]
        headers: dict[str, Any] = {"_raw_lines": []}
        for line in f:
            line = line.strip()
            if len(line) == 0:
                continue
            if line.startswith("@"):
                _parse_header(line, headers)
                headers["_raw_lines"].append(line)
            elif line.startswith("#"):
                headers["_raw_lines"].append(line)
                continue
            # assuming to start a body section
            else:
                break

    return headers


def _get_value_by_key(
    headers: dict[str, Any], key1: str, key2: None | str = None
) -> None | str:
    """Return value by two-level keys where the second key is optional

    Example
    -------

    >>> headers
    {'s0': {'legend': {'_val': 'Potential Energy (kJ/mol)'}},
            'subtitle': {'_val': 'T = 310 (K) \\xl\\f{} state 38: (coul-lambda,
                vdw-lambda) = (0.9500, 0.0000)'}}
    >>> _get_value_by_key(header, 's0','legend')
    'Potential Energy (kJ/mol)'
    >>> _get_value_by_key(header, 'subtitle')
    'T = 310 (K) \\xl\\f{} state 38: (coul-lambda, vdw-lambda) = (0.9500, 0.0000)'

    """
    val = None
    if key1 in headers:
        if key2 is not None and key2 in headers[key1]:
            val = headers[key1][key2]["_val"]
        else:
            val = headers[key1]["_val"]

    return val