# Source code for alchemlyb.parsing.parquet
import pandas as pd
from loguru import logger
from . import _init_attrs
def _read_parquet_with_metadata(path: str, T: float) -> pd.DataFrame:
    """
    Load a dataframe from a parquet file and reconcile its temperature
    metadata with the temperature supplied by the caller.

    If the loaded dataframe carries a ``"temperature"`` entry in
    :attr:`pandas.DataFrame.attrs`, it must be equal to `T`. If the entry is
    absent, a warning is logged and ``attrs["temperature"]`` is set to `T`
    and ``attrs["energy_unit"]`` to ``"kT"``.

    Parameters
    ----------
    path : str
        Path to parquet file to extract dataframe from.
    T : float
        Temperature in Kelvin of the simulations.

    Returns
    -------
    :class:`pandas.DataFrame`
        The dataframe read from `path`, with temperature metadata present.

    Raises
    ------
    ValueError
        If the temperature stored in the dataframe differs from `T`.
    """
    frame = pd.read_parquet(path)
    metadata = frame.attrs

    # Metadata survived serialisation: only verify it against the input.
    if "temperature" in metadata:
        stored_temperature = metadata["temperature"]
        if stored_temperature != T:
            raise ValueError(
                f"Temperature in the input ({T}) doesn't match the temperature "
                f"in the dataframe ({stored_temperature})."
            )
        return frame

    # No metadata in the file: fall back to the caller-supplied temperature.
    logger.warning(
        f"No temperature metadata found in {path}. "
        "Serialise the Dataframe with pandas>=2.1 to preserve the metadata."
    )
    metadata["temperature"] = T
    metadata["energy_unit"] = "kT"
    return frame
[docs]
def extract_u_nk(path: str, T: float) -> pd.DataFrame:
    r"""Return reduced potentials `u_nk` (unit: kT) from a pandas parquet file.

    The parquet file should be serialised from the dataframe output
    from any parser with command
    (``u_nk_df.to_parquet(path=path, index=True)``).

    Parameters
    ----------
    path : str
        Path to parquet file to extract dataframe from.
    T : float
        Temperature in Kelvin of the simulations.

    Returns
    -------
    u_nk : :class:`pandas.DataFrame`
        Potential energy for each alchemical state (k) for each frame (n).

    Raises
    ------
    ValueError
        If the dataframe carries temperature metadata that differs from `T`.

    Note
    ----
    pyarrow serializers would handle the float or string column name fine but will
    convert multi-lambda column name from `(0.0, 0.0)` to `"('0.0', '0.0')"`.
    This parser will restore the correct column name.

    If the temperature metadata in :attr:`pandas.DataFrame.attrs` is missing
    from the file, a warning is logged and the temperature is assigned from
    `T`; if it is present, it is checked against `T`.

    .. versionadded:: 2.1.0
    """
    u_nk = _read_parquet_with_metadata(path, T)
    columns = list(u_nk.columns)

    # Only stringified tuples need restoring. The truthiness check and
    # ``startswith`` keep a frame without columns, or with an empty-string
    # first label, from raising IndexError.
    if columns and isinstance(columns[0], str) and columns[0].startswith("("):
        u_nk.columns = [
            # Drop the surrounding parentheses and any quoting, then convert
            # each comma-separated field back to a float.
            tuple(
                map(float, column[1:-1].replace('"', "").replace("'", "").split(","))
            )
            for column in columns
        ]
    return u_nk
[docs]
@_init_attrs
def extract_dHdl(path: str, T: float) -> pd.DataFrame:
    r"""Return gradients `dH/dl` (unit: kT) from a pandas parquet file.

    The parquet file is expected to have been written from the dataframe
    produced by any parser, using
    (``dHdl_df.to_parquet(path=path, index=True)``).

    Parameters
    ----------
    path : str
        Path to parquet file to extract dataframe from.
    T : float
        Temperature in Kelvin the simulations sampled.

    Returns
    -------
    dH/dl : :class:`pandas.DataFrame`
        dH/dl as a function of time for this lambda window.

    Note
    ----
    The temperature metadata in :attr:`pandas.DataFrame.attrs` is checked
    against `T` when the file provides it, and assigned from `T` otherwise.

    .. versionadded:: 2.1.0
    """
    # Reading and the temperature metadata handling are delegated to the
    # shared helper.
    dHdl = _read_parquet_with_metadata(path, T)
    return dHdl