Source code for alchemlyb.parsing.gmx
"""Parsers for extracting alchemical data from `Gromacs <http://www.gromacs.org/>`_ output files."""
import numpy as np
import pandas as pd
from typing import Any
from . import _init_attrs
from .util import anyopen
from ..postprocessors.units import R_kJmol
k_b = R_kJmol
[docs]
@_init_attrs
def extract_u_nk(xvg: str, T: float, filter: bool = True) -> pd.DataFrame:
r"""Return reduced potentials `u_nk` from a Hamiltonian differences XVG file.
Parameters
----------
xvg : str
Path to XVG file to extract data from.
T : float
Temperature in Kelvin the simulations sampled.
filter : bool
Filter out the lines that cannot be parsed.
Such as rows with incorrect number of Columns and incorrectly
formatted numbers (e.g. 123.45.67, nan or -).
Returns
-------
u_nk : :class:`pandas.DataFrame`
Potential energy for each alchemical state (k) for each frame (n).
Note
-----
Previous versions of alchemlyb (<0.5.0) used the `GROMACS value of the
molar gas constant
<https://manual.gromacs.org/documentation/2019/reference-manual/definitions.html>`_
of :math:`R = 8.3144621 \times 10^{−3}\,
\text{kJ}\cdot\text{mol}^{-1}\cdot\text{K}^{-1}` instead of the scipy value
:data:`scipy.constants.R` in :mod:`scipy.constants` (see
:mod:`alchemlyb.postprocessors.units`). The relative difference between
the two values is :math:`6 \times 10^{-8}`.
Therefore, results in :math:`kT` for GROMACS data will differ between
alchemlyb ≥0.5.0 and previous versions; the relative difference is on the
order of :math:`10^{-7}` for typical cases.
.. versionchanged:: 0.5.0
The :mod:`scipy.constants` is used for parsers instead of
the constants used by the corresponding MD engine.
This leads to slightly different results for GROMACS input compared to
previous versions of alchemlyb.
.. versionchanged:: 0.7.0
The keyword filter is implemented to ignore the line that cannot be
parsed and is turned on by default.
"""
h_col_match = r"\xD\f{}H \xl\f{}"
pv_col_match = "pV"
u_col_match = ["Total Energy", "Potential Energy"]
beta = 1 / (k_b * T)
state, lambdas, statevec = _extract_state(xvg)
# extract a DataFrame from XVG data
df = _extract_dataframe(xvg, filter=filter)
times = df[df.columns[0]]
# want to grab only dH columns
DHcols = [col for col in df.columns if (h_col_match in col)]
dH = df[DHcols]
# gromacs also gives us pV directly; need this for reduced potential
pv_cols = [col for col in df.columns if (pv_col_match in col)]
pv = None
if pv_cols:
pv = df[pv_cols[0]]
# gromacs also gives us total/potential energy U directly; need this for reduced potential
u_cols = [
col
for col in df.columns
if any(single_u_col_match in col for single_u_col_match in u_col_match)
]
u = None
if u_cols:
u = df[u_cols[0]]
u_k = dict()
cols = list()
for col in dH:
u_col = eval(col.split("to")[1]) # type: ignore[attr-defined]
# calculate reduced potential u_k = dH + pV + U
u_k[u_col] = beta * dH[col].values
if pv_cols:
u_k[u_col] += beta * pv.values # type: ignore[union-attr]
if u_cols:
u_k[u_col] += beta * u.values # type: ignore[union-attr]
cols.append(u_col)
u_k = pd.DataFrame(
u_k, columns=cols, index=pd.Index(times.values, name="time", dtype="Float64")
) # type: ignore[assignment]
# create columns for each lambda, indicating state each row sampled from
# if state is None run as expanded ensemble data or REX
if state is None:
# if thermodynamic state is specified map thermodynamic
# state data to lambda values, else (for REX)
# define state based on the legend
if "Thermodynamic state" in df:
ts_index = df.columns.get_loc("Thermodynamic state")
thermo_state = df[df.columns[ts_index]]
for i, lambda_value in enumerate(lambdas):
v = []
for t in thermo_state:
v.append(statevec[int(t)][i]) # type: ignore[index, call-overload]
u_k[lambda_value] = v
else:
state_legend = _extract_legend(xvg)
for i, state_legend_item in enumerate(state_legend):
u_k[state_legend_item] = state_legend[state_legend_item]
else:
for i, lambda_value in enumerate(lambdas):
try:
u_k[lambda_value] = statevec[i]
except TypeError:
u_k[lambda_value] = statevec
# set up new multi-index
newind = ["time"] + lambdas
u_k = u_k.reset_index().set_index(newind) # type: ignore[attr-defined]
u_k.name = "u_nk"
return u_k # type: ignore[no-any-return]
[docs]
@_init_attrs
def extract_dHdl(xvg: str, T: float, filter: bool = True) -> pd.DataFrame:
r"""Return gradients `dH/dl` from a Hamiltonian differences XVG file.
Parameters
----------
xvg : str
Path to XVG file to extract data from.
T : float
Temperature in Kelvin the simulations sampled.
filter : bool
Filter out the lines that cannot be parsed.
Such as rows with incorrect number of Columns and incorrectly
formatted numbers (e.g. 123.45.67, nan or -).
Returns
-------
dH/dl : :class:`pandas.Series`
dH/dl as a function of time for this lambda window.
Note
-----
Previous versions of alchemlyb (<0.5.0) used the `GROMACS value of the
molar gas constant
<https://manual.gromacs.org/documentation/2019/reference-manual/definitions.html>`_
of :math:`R = 8.3144621 \times 10^{−3}\,
\text{kJ}\cdot\text{mol}^{-1}\cdot\text{K}^{-1}` instead of the scipy value
:data:`scipy.constants.R` in :mod:`scipy.constants` (see
:mod:`alchemlyb.postprocessors.units`). The relative difference between
the two values is :math:`6 \times 10^{-8}`.
Therefore, results in :math:`kT` for GROMACS data will differ between
alchemlyb ≥0.5.0 and previous versions; the relative difference is on the
order of :math:`10^{-7}` for typical cases.
.. versionchanged:: 0.5.0
The :mod:`scipy.constants` is used for parsers instead of
the constants used by the corresponding MD engine.
This leads to slightly different results for GROMACS input compared to
previous versions of alchemlyb.
.. versionchanged:: 0.7.0
The keyword filter is implemented to ignore the line that cannot be
parsed and is turned on by default.
"""
beta = 1 / (k_b * T)
headers = _get_headers(xvg)
state, lambdas, statevec = _extract_state(xvg, headers)
# extract a DataFrame from XVG data
df = _extract_dataframe(xvg, headers, filter=filter)
times = df[df.columns[0]]
# want to grab only dH/dl columns
dHcols = []
for lambda_value in lambdas:
dHcols.extend([col for col in df.columns if (lambda_value in col)])
dHdl = df[dHcols]
# make dimensionless
dHdl = beta * dHdl
# rename columns to not include the word 'lambda', since we use this for
# index below
cols = [lambda_value.split("-")[0] for lambda_value in lambdas]
dHdl = pd.DataFrame(
dHdl.values,
columns=cols,
index=pd.Index(times.values, name="time", dtype="Float64"),
)
# create columns for each lambda, indicating state each row sampled from
# if state is None run as expanded ensemble data or REX
if state is None:
# if thermodynamic state is specified map thermodynamic
# state data to lambda values, else (for REX)
# define state based on the legend
if "Thermodynamic state" in df:
ts_index = df.columns.get_loc("Thermodynamic state")
thermo_state = df[df.columns[ts_index]]
for i, lambda_value in enumerate(lambdas):
v = []
for t in thermo_state:
v.append(statevec[int(t)][i]) # type: ignore[index, call-overload]
dHdl[lambda_value] = v
else:
state_legend = _extract_legend(xvg)
for i, state_legend_item in enumerate(state_legend):
dHdl[state_legend_item] = state_legend[state_legend_item]
else:
for i, lambda_value in enumerate(lambdas):
try:
dHdl[lambda_value] = statevec[i]
except TypeError:
dHdl[lambda_value] = statevec
# set up new multi-index
newind = ["time"] + lambdas
dHdl = dHdl.reset_index().set_index(newind)
dHdl.name = "dH/dl" # type: ignore[attr-defined]
return dHdl
[docs]
def extract(xvg: str, T: float, filter: bool = True) -> dict[str, pd.DataFrame | None]:
r"""Return reduced potentials `u_nk` and gradients `dH/dl`
from a Hamiltonian differences XVG file.
Parameters
----------
xvg : str
Path to XVG file to extract data from.
T : float
Temperature in Kelvin the simulations sampled.
filter : bool
Filter out the lines that cannot be parsed.
Such as rows with incorrect number of Columns and incorrectly
formatted numbers (e.g. 123.45.67, nan or -).
Returns
-------
:class:`dict`
A dictionary with keys of 'u_nk', which is a :class:`~pandas.DataFrame` of
potential energy for each alchemical state (k) for each frame (n),
and 'dHdl', which is a :class:`~pandas.Series` of dH/dl
as a function of time for this lambda window.
.. versionadded:: 1.0.0
"""
return {"u_nk": extract_u_nk(xvg, T, filter), "dHdl": extract_dHdl(xvg, T, filter)}
def _extract_state(
xvg: str, headers: None | dict[str, Any] = None
) -> tuple[None | int, list[str], list[float]]:
"""Extract information on state sampled, names of lambdas.
Parameters
----------
xvg : str
Path to XVG file to extract data from.
headers: dict
headers dictionary to search header information, reduced I/O by
reusing if it is already parsed, e.g. _extract_state and
_extract_dataframe in order need one-time header parsing
"""
state = None
if headers is None:
headers = _get_headers(xvg)
subtitle = _get_value_by_key(headers, "subtitle")
if subtitle and "state" in subtitle:
state = int(subtitle.split("state")[1].split(":")[0])
lambdas = [word.strip(")(,") for word in subtitle.split() if "lambda" in word]
statevec = eval(subtitle.strip().split(" = ")[-1].strip('"'))
# if expanded ensemble data is used the state variable will never be assigned
# parsing expanded ensemble data
if state is None:
lambdas = []
statevec = []
for line in headers["_raw_lines"]:
if ("legend" in line) and ("lambda" in line):
lambdas.append(
[word.strip(")(,") for word in line.split() if "lambda" in word][0]
)
if ("legend" in line) and (" to " in line):
statevec.append(
(
[
float(i)
for i in line.strip()
.split(" to ")[-1]
.strip('"()')
.split(",")
]
)
)
return state, lambdas, statevec
def _extract_legend(xvg: str) -> dict[str, float]:
"""Extract information on state sampled for REX simulations."""
state_legend = {}
with anyopen(xvg, "r") as f: # type: ignore[arg-type]
for line in f:
if ("legend" in line) and ("lambda" in line):
state_legend[line.split()[4]] = float(line.split()[6].strip('"'))
return state_legend
def _extract_dataframe(
xvg: str, headers: None | dict[str, Any] = None, filter: bool = True
) -> pd.DataFrame:
"""Extract a DataFrame from XVG data using Pandas `read_csv()`.
pd.read_csv() shows the same behavior building pandas Dataframe with better
performance (approx. 2 to 4 times speed up). See Issue #81.
Parameters
----------
xvg: str
Path to XVG file to extract data from.
headers: dict
headers dictionary to search header information, reduced I/O by
reusing if it is already parsed. Direct access by key name
filter : bool
Filter out the lines that cannot be parsed.
Such as rows with incorrect number of Columns and incorrectly
formatted numbers (e.g. 123.45.67, nan or -).
"""
if headers is None:
headers = _get_headers(xvg)
xaxis = _get_value_by_key(headers, "xaxis", "label")
names = [
_get_value_by_key(headers, "s{}".format(x), "legend")
for x in range(len(headers))
if "s{}".format(x) in headers
]
cols = [xaxis] + names
# march through column names, mark duplicates when found
cols = [
col + "{}[duplicated]".format(i) if col in cols[:i] else col # type: ignore[operator]
for i, col in enumerate(cols)
]
header_cnt = len(headers["_raw_lines"])
if not filter:
# assumes clean files
df = pd.read_csv(
xvg,
sep=r"\s+",
header=None,
skiprows=header_cnt,
na_filter=True,
memory_map=True,
names=cols,
dtype=np.float64,
float_precision="high",
)
else:
df = pd.read_csv(
xvg,
sep=r"\s+",
header=None,
skiprows=header_cnt,
memory_map=True,
on_bad_lines="skip",
)
# If names=cols is passed to read_csv, rows with more than the
# designated columns will be truncated and used instead of discarded.
df.rename(columns={i: name for i, name in enumerate(cols)}, inplace=True)
# If dtype=np.float64 and float_precision='high' are passed to read_csv,
# 12.345.56 and - cannot be read.
df = df.apply(pd.to_numeric, errors="coerce")
# drop duplicate
df.dropna(inplace=True)
# drop duplicated columns (see PR #86 https://github.com/alchemistry/alchemlyb/pull/86/)
df = df[df.columns[~df.columns.str.endswith("[duplicated]")]]
return df
def _parse_header(line: str, headers: dict[str, Any] = {}, depth: int = 2) -> None:
"""Build python dictionary for single line header
Update python dictionary to ``headers`` by reading ``line`` separated by
whitespace. If ``depth`` is given, at most ``depth`` nested key value store
is added. `_val` key is reserved which contain remaining words from
``line``.
Note
----
No return value but 'headers' dictionary will be updated.
Parameters
----------
line: str
header line to parse
headers: dict
headers dictionary to update, pass by reference
depth: int
depth of nested key and value store
Examples
--------
"x y z" line turns into { 'x': { 'y': {'_val': 'z' }}}
>>> headers={}
>>> _parse_header('@ s0 legend "Potential Energy (kJ/mol)"', headers)
>>> headers
{'s0': {'legend': {'_val': 'Potential Energy (kJ/mol)'}}}
"""
# Remove a first character, i.e. @
s = line[1:].split(None, 1)
next_t = headers[s[0]] = {}
for i in range(1, depth):
# ord('"') == 34
# no further parsing for quoted value
if len(s) > 1 and s[1][0] != '"':
s = s[1].split(None, 1)
next_t[s[0]] = {}
next_t = next_t[s[0]]
else:
break
next_t["_val"] = "".join(s[1:]).rstrip().strip('"')
def _get_headers(xvg: str) -> dict[str, Any]:
"""Build python dictionary from header lines
Build nested key and value store by reading header ('@') lines from a file.
Direct access to value provides reduced time complexity O(1).
`_raw_lines` key is reserved to keep the original text.
Example
-------
Given a xvg file containinig header lines like:
...
@ title "dH/d\\xl\\f{} and \\xD\\f{}H"
@ xaxis label "Time (ps)"
@ yaxis label "dH/d\\xl\\f{} and \\xD\\f{}H (kJ/mol [\\xl\\f{}]\\S-1\\N)"
@TYPE xy
@ subtitle "T = 310 (K) \\xl\\f{} state 38: (coul-lambda, vdw-lambda) = (0.9500, 0.0000)"
@ view 0.15, 0.15, 0.75, 0.85
@ legend on
@ legend box on
@ legend loctype view
@ legend 0.78, 0.8
@ legend length 2
@ s0 legend "Potential Energy (kJ/mol)"
@ s1 legend "dH/d\\xl\\f{} coul-lambda = 0.9500"
@ s2 legend "dH/d\\xl\\f{} vdw-lambda = 0.0000"
...
>>> _get_headers(xvg)
{'TYPE': {'xy': {'_val': ''}},
'subtitle': {'_val': 'T = 310 (K) \\xl\\f{} state 38: (coul-lambda, vdw-lambda) = (0.9500, 0.0000)'},
'title': {'_val': 'dH/d\\xl\\f{} and \\xD\\f{}H'},
'view': {'0.15,': {'_val': '0.15, 0.75, 0.85'}},
'xaxis': {'label': {'_val': 'Time (ps)'}},
'yaxis': {'label': {'_val': 'dH/d\\xl\\f{} and \\xD\\f{}H (kJ/mol [\\xl\\f{}]\\S-1\\N)'}},
...(omitted)...
'_raw_lines': ['@ title "dH/d\\xl\\f{} and \\xD\\f{}H"',
'@ xaxis label "Time (ps)"',
'@ yaxis label "dH/d\\xl\\f{} and \\xD\\f{}H (kJ/mol [\\xl\\f{}]\\S-1\\N)"',
'@TYPE xy',
'@ subtitle "T = 310 (K) \\xl\\f{} state 38: (coul-lambda, vdw-lambda) = (0.9500, 0.0000)"',
'@ view 0.15, 0.15, 0.75, 0.85',
'@ legend on',
'@ legend box on',
'@ legend loctype view',
'@ legend 0.78, 0.8',
'@ legend length 2',
'@ s0 legend "Potential Energy (kJ/mol)"',
'@ s1 legend "dH/d\\xl\\f{} coul-lambda = 0.9500"',
'@ s2 legend "dH/d\\xl\\f{} vdw-lambda = 0.0000"'],
}
Returns
-------
headers: dict
"""
with anyopen(xvg, "r") as f: # type: ignore[arg-type]
headers: dict[str, Any] = {"_raw_lines": []}
for line in f:
line = line.strip()
if len(line) == 0:
continue
if line.startswith("@"):
_parse_header(line, headers)
headers["_raw_lines"].append(line)
elif line.startswith("#"):
headers["_raw_lines"].append(line)
continue
# assuming to start a body section
else:
break
return headers
def _get_value_by_key(
headers: dict[str, Any], key1: str, key2: None | str = None
) -> None | str:
"""Return value by two-level keys where the second key is optional
Example
-------
>>> headers
{'s0': {'legend': {'_val': 'Potential Energy (kJ/mol)'}},
'subtitle': {'_val': 'T = 310 (K) \\xl\\f{} state 38: (coul-lambda,
vdw-lambda) = (0.9500, 0.0000)'}}
>>> _get_value_by_key(header, 's0','legend')
'Potential Energy (kJ/mol)'
>>> _get_value_by_key(header, 'subtitle')
'T = 310 (K) \\xl\\f{} state 38: (coul-lambda, vdw-lambda) = (0.9500, 0.0000)'
"""
val = None
if key1 in headers:
if key2 is not None and key2 in headers[key1]:
val = headers[key1][key2]["_val"]
else:
val = headers[key1]["_val"]
return val