import os
import subprocess
from ase.units import kcal, mol, Hartree, Bohr
from ase.geometry import get_distances
from ase.data import atomic_numbers, atomic_masses
from ase.data import covalent_radii, chemical_symbols
from os import path
import numpy as np
from typing import Optional
from ..storage.storage_base import Storage
from ..potential.potential_base import Potential
from ..potential.chimes import ChIMES
from ..workflow.workflow_base import Workflow
from ase import Atoms
from .trainer_base import Trainer
from ..utils.data_standard import (
ENERGY_KEY,
FORCES_KEY,
STRESS_KEY,
SELECTION_MASK_KEY,
)
[docs]
class ChIMESTrainer(Trainer):
"""
Train and deploy a potential using ChIMES
The trainer class is responsible for handling the loading/assignment of
training data, as well as the actual process of training a potential.
This trainer is intended to be used with ChIMES model trained with ASE
training data. WARNING: the fit directory location will be overwritten
during any call to the train functions.
"""
[docs]
def __init__(
self,
exe_chimes_fit_1: str,
exe_chimes_fit_2: str,
fit_directory: Optional[str] = '_ChIMES_FIT',
**kwargs,
) -> None:
"""
Initialize the ChIMESTrainer.
:param exe_chimes_fit_1: Path to the first ChIMES fitting executable -
/build/chimes_lsq (executable)
:type exe_chimes_fit_1: str
:param exe_chimes_fit_2: Path to the second ChIMES fitting executable
- src/chimes_lsq.py (python script)
:type exe_chimes_fit_2: str
:param fit_directory: Directory for fitting outputs. WARNING: this
directory location will be overwritten during any call to a
training function
:type fit_directory: Optional[str]
:param kwargs: Additional keyword arguments for the base Trainer.
:type kwargs: dict
"""
super().__init__(**kwargs)
self.exe_chimes_fit_1 = path.abspath(exe_chimes_fit_1)
self.exe_chimes_fit_2 = path.abspath(exe_chimes_fit_2)
self.fit_directory = fit_directory
# arguments to reinitialize an instance of the trainer
self.trainer_init_args = {
'exe_chimes_fit_1': self.exe_chimes_fit_1,
'exe_chimes_fit_2': self.exe_chimes_fit_2,
'fit_directory': self.fit_directory,
}
[docs]
def checkpoint_trainer(self) -> None:
"""
checkpoint the trainer module into the checkpoint file
save necessary internal variables into a dict with key checkpoint_name
and write to the (json) checkpoint file for restart capabilities
"""
pass
[docs]
def restart_trainer(self) -> None:
"""
restart the trainer module from the checkpoint file
check if the checkpoint_file has an entry matching the checkpoint_name
and set internal variables accordingly if so
"""
pass
def _get_training_data(
self,
dataset_handle: str,
storage: Storage,
) -> list[Atoms]:
"""
Get the training data configurations
Retrieve the dataset specified by dataset_handle from the passed
storage module.
:param dataset_handle: the identifier of the dataset to extract from
the storage module
:type dataset_handle: str
:param storage: storage instance where the training data is saved
:type storage: Storage
:returns: training data of configurations
:rtype: ASE Dataset
"""
self.logger.info('Reading training data from storage')
training_set = storage.get_data(dataset_handle)
for c in training_set:
try:
c.info[ENERGY_KEY] = c.get_potential_energy()
except Exception:
pass
try:
c.info[STRESS_KEY] = c.get_stress()
except Exception:
pass
try:
c.set_array(FORCES_KEY, c.get_forces())
except Exception:
pass
try:
c.info[SELECTION_MASK_KEY] = c.get_array(SELECTION_MASK_KEY)
except Exception:
pass
return training_set
def _write_training_script(
self,
save_path: str,
dataset_list: list[str],
potential: Potential,
storage: Storage,
eweight: float = 1.0,
fweight: float = 1.0,
vweight: float = 1.0,
per_atom_weights: bool = False,
upload_to_kimkit=True,
) -> str:
"""
write a script to run the trainer outside of memory
this is a helper function for generating a script, training_script.py,
which can be executed via a workflow or offline
:param save_path: path where the training script will be written
:type save_path: str
:param dataset_list: list of dataset handles which should be used for
the training procedure
:type dataset_list: list of str
:param potential: Potential instance to be trained, expect its
pre-trained state to be written to save_path/potential_to_train.pkl
:type potential: Potential
:param storage: an instance of the storage class, which contains the
datasets in dataset_list
:type storage: Storage
:param eweight: weight of energy data in the loss function
:type eweight: float
:param fweight: weight of the force data in the loss function
:type fweight: float
:param vweight: weight of the stress data in the loss function
:type vweight: float
:param per_atom_weights: True to read from dataset |default| ``False``
:type per_atom_weights: boolean
:param upload_to_kimkit: Whether to upload to kimkit after training
|default| ``True``.
:type upload_to_kimkit: bool
:returns: the name of the execution script
:rtype: str
"""
full_save_path = path.abspath(save_path)
import_lines = ('from orchestrator.utils.setup_input import '
'init_and_validate_module_type\n'
'from numpy import loadtxt, array, zeros\n')
trainer_dict = {
'trainer_type': self.factory_token,
'trainer_args': self.trainer_init_args
}
init_trainer = ('trainer = init_and_validate_module_type("trainer", '
f'{trainer_dict}, single_input_dict=True)')
storage_dict = {
'storage_type': storage.factory_token,
'storage_args': storage.storage_init_args
}
init_storage = ('storage = init_and_validate_module_type("storage", '
f'{storage_dict}, single_input_dict=True)')
potential_dict = {
'potential_type': potential.factory_token,
'potential_args': potential.trainer_args
}
init_potential = ('potential = init_and_validate_module_type('
f'"potential", {potential_dict}, '
'single_input_dict=True)\n')
load_potential = "potential.build_potential()"
# Currently uses the workflow from trainer, not submit_train's input
construct_and_train = (
f'chimes, errors = trainer.train(path_type="{full_save_path}",'
'potential=potential,'
'storage=storage,'
f'dataset_list={dataset_list},'
f'eweight={eweight},'
f'fweight={fweight},'
f'vweight={vweight},'
f'per_atom_weights={per_atom_weights},'
'write_training_script=False,'
f'upload_to_kimkit={upload_to_kimkit})')
script = '\n'.join([
import_lines,
init_trainer,
init_storage,
init_potential,
load_potential,
construct_and_train,
])
with open(f'{save_path}/training_script.py', 'w') as fout:
fout.write(script)
return 'training_script.py'
def _chimes_write_masses(self) -> None:
"""
Write atomic masses for all elements to a LAMMPS-compatible file.
"""
# Elements from H (Z=1) to Og (Z=118)
symbols = chemical_symbols[1:119]
masses = atomic_masses[1:119]
nlen = len(symbols)
comment = (
"""# The KIM API Simulator Model Interface (SMI) allows a uniform
# interface to any simulator model regardless of type with the
# "kim interactions" command followed by the mapping of species to numeric
# LAMMPS atom types, e.g. if your atom types 1 and 2 are C, and 3 is Si,
# "kim interactions C C Si"
# The atom types string (e.g. "C C Si") is passed to the LAMMPS commands in
# smspec.edn through the template map key "atom-type-sym-list".
# See https://kim-api.readthedocs.io/en/latest/implementation.html#kim_api_smi
# Usually, this can be used with the pair_coeff command, but because ChIMES
# assigns atom types based on mass, we use LAMMPS scripting to assign masses
# by saving "atom-type-sym-list" as the LAMMPS variable kim_atom_type_sym_list
# in smspec.edn, and invoking this LAMMPS script. Repeated "mass" commands
# should not be an issue if the user wishes to define or redefine the masses
# later.
variable atom_sym_i index ${kim_atom_type_sym_list}
variable atom_type_i loop 10000
label loopi
""")
file_path = 'masses.lammps'
with open(file_path, 'w') as file:
file.write(comment)
# first element is Hydrogen
atom_symbol = symbols[0]
atom_mass = masses[0]
text = (f' if "${{atom_sym_i}} == {atom_symbol}" then '
f'"mass ${{atom_type_i}} {atom_mass:.6f}" &\n')
file.write(text)
for i in range(1, nlen - 1):
atom_symbol = symbols[i]
atom_mass = masses[i]
text = (f' elif "${{atom_sym_i}} == {atom_symbol}" '
f'"mass ${{atom_type_i}} {atom_mass:.6f}" &\n')
file.write(text)
# last element
atom_symbol = symbols[-1]
atom_mass = masses[-1]
text = (f' elif "${{atom_sym_i}} == {atom_symbol}" '
f'"mass ${{atom_type_i}} {atom_mass:.6f}" \n')
file.write(text)
# some last lines
text = """ next atom_type_i
next atom_sym_i
jump SELF loopi
variable atom_type_i delete
"""
file.write(text)
def _chimes_write_xyzf(
self,
atomlist: list[str],
xyz: np.ndarray,
cell_xyz: np.ndarray,
fxyz: np.ndarray,
energy: float,
stress: np.ndarray,
weights: list[float],
weight_mask: np.ndarray,
) -> None:
"""
Write fitting data to an xyz file for ChIMES LSQ.
This is called on the data from a single atomic configuration
:param atomlist: List of atomic symbols.
:type atomlist: list[str]
:param xyz: Atomic positions array.
:type xyz: np.ndarray
:param cell_xyz: Cell matrix.
:type cell_xyz: np.ndarray
:param fxyz: Forces array.
:type fxyz: np.ndarray
:param energy: Configuration energy.
:type energy: float
:param stress: Stress tensor.
:type stress: np.ndarray
:param weights: List of weights [eweight, fweight, vweight].
:type weights: list[float]
:param weight_mask: Per-atom weight mask.
:type weight_mask: np.ndarray
"""
eweight = weights[0]
fweight = weights[1]
vweight = weights[2]
f2 = open('training_ChIMES.xyzf', 'a')
f3 = open('weights.dat', 'a')
f4 = open('label.txt', 'a')
natom = len(atomlist)
f2.write("%1d\n" % (natom))
# cell parameters
f2.write("NON_ORTHO ")
for i in range(3):
for j in range(3):
f2.write("%9.4f" % (cell_xyz[i, j]))
if (len(stress) > 0):
# Voigt notation for stress tensor:
# xx, yy, zz
for i in range(3):
f2.write("%12.4f" % (stress[i]))
# stress off-diagonal xy, xz, yz
f2.write("%12.4f" % (stress[5]))
f2.write("%12.4f" % (stress[4]))
f2.write("%12.4f" % (stress[3]))
# energy
f2.write("%20.4f" % (energy))
f2.write("\n")
# xyz, fxyz
for i in range(natom):
f2.write("%s" % (atomlist[i]))
for j in range(3):
f2.write("%15.9f" % (xyz[i, j]))
for j in range(3):
f2.write("%15.9f" % (fxyz[i, j]))
f2.write("\n")
# weights of forces
for j in range(3):
f3.write("%15.9f\n" % (fweight * weight_mask[i]))
f4.write("forces\n")
if (len(stress) > 0):
# weights of stress
for j in range(9):
f3.write("%15.9f\n" % (vweight))
f4.write("stress\n")
# weights of energy
for j in range(3):
f3.write("%15.9f\n" % (eweight / natom))
f4.write("energy\n")
f2.close()
f3.close()
f4.close()
def _chimes_write_data(
self,
atoms: Atoms,
eweight: float,
fweight: float,
vweight: float,
per_atom_weights: bool,
) -> np.ndarray:
"""
Organize and write fitting data for ChIMES from an ASE Atoms object.
:param atoms: ASE Atoms object.
:type atoms: Atoms
:param eweight: Energy weight.
:type eweight: float
:param fweight: Force weight.
:type fweight: float
:param vweight: Stress weight.
:type vweight: float
:param per_atom_weights: Use per-atom weights if True.
:type per_atom_weights: bool
:return: Array of unique atom symbols in the configuration.
:rtype: np.ndarray
"""
cell_xyz = atoms.cell.array
atomlist = list(atoms.symbols)
xyz = atoms.get_positions()
energy = atoms.info[ENERGY_KEY]
forces = atoms.arrays[FORCES_KEY]
try:
stress = atoms.info[STRESS_KEY]
stress = np.array(stress)
except Exception:
stress = np.array([])
if per_atom_weights:
try:
weight_mask = atoms.get_array(SELECTION_MASK_KEY)
except KeyError:
raise RuntimeError('per atom weights set to true but no '
'selection mask available in the Atoms!')
else:
weight_mask = np.ones(len(atoms))
# convert units
# ChIMES Orchestrator (metal LAMMPS)
# Energy kcal/mol eV
# Forces Ha/Bohr eV/Angstrom
# Stress GPa bar
kcal_per_mol = kcal / mol
ha_per_bohr = Hartree / Bohr
gpa = 10000.0
energy *= 1.0 / kcal_per_mol
forces *= 1.0 / ha_per_bohr
if (len(stress) > 0):
stress *= 1.0 / gpa
weights = [eweight, fweight, vweight]
self._chimes_write_xyzf(atomlist, xyz, cell_xyz, forces, energy,
stress, weights, weight_mask)
return np.unique(atomlist, return_counts=False)
def _chimes_apair(self, atom_list: list[str], atom_type: str) -> list[str]:
"""
Generate all alphabetically sorted pairs with a given atom type.
:param atom_list: List of atom symbols, i.e. ['C', 'H', 'N', 'O']
:type atom_list: list[str]
:param atom_type: The atom symbol to pair with others, i.e. 'N'
:type atom_type: str
:return: List of sorted atom pairs as strings, i.e. ['CN', 'HN', 'NN',
'NO']
:rtype: list[str]
"""
natom = len(atom_list)
pair = []
for i in range(natom):
tmp_0 = [atom_type, atom_list[i]]
tmp_0.sort()
pair.append(tmp_0[0] + tmp_0[1])
return pair
def _chimes_rmin_calc(
self,
list_dist: list[float],
list_pair: list[str],
atypes: list[str],
) -> list[float]:
"""
Calculate minimum interatomic distances for all pairs.
:param list_dist: List of distances, i.e. [1.1, 1.3, 1.2, 1.5]
:type list_dist: list[float]
:param list_pair: List of pair labels, i.e. ['CC', 'CH', 'HH', 'CN']
:type list_pair: list[str]
:param atypes: Array of sorted unique atom types, i.e. ['C', 'H', 'N',
'O']
:type atypes: list[str]
:return: List of minimum distances for each pair type (['CC', 'HH',
'NN', 'OO', 'CH', 'CN', 'CO', 'HN', 'HO', 'NO'])
:rtype: list[float]
"""
ntype = len(atypes)
rmins = []
# pairs with one atom type
for i in range(ntype):
tpair = atypes[i] + atypes[i]
iloc = [j for j in range(len(list_pair)) if list_pair[j] == tpair]
if len(iloc) == 0:
# atom pair is not found, minimum distance is 100 Angstrom
rmin = 100.0
else:
# take the minimum distance
rmin = np.min(list_dist[iloc])
rmins.append(rmin)
# pairs with two atom types
for i in range(ntype):
for k in range(i + 1, ntype):
tpair = atypes[i] + atypes[k]
iloc = [
j for j in range(len(list_pair)) if list_pair[j] == tpair
]
if len(iloc) == 0:
# atom pair is not found, minimum distance is 100 Angstrom
rmin = 100.0
else:
# take the minimum distance
rmin = np.min(list_dist[iloc])
rmins.append(rmin)
return rmins
def _chimes_read_xyzf(
self,
file_xyz: str,
atom_types: list[str],
) -> tuple[int, int, np.ndarray]:
"""
Parse a ChIMES xyzf file to determine configuration and pair statistics
:param file_xyz: Path to xyzf file that contains forces, energy, and
stresses.
:type file_xyz: str
:param atom_types: list of sorted atom types.
:type atom_types: list[str]
:return: Tuple of (number of configurations, number of condensed
phase, rmin array of the element pairs).
:rtype: tuple[int, int, np.ndarray]
"""
f = open(file_xyz, "rt")
nconf = 0
ncondensed = 0
ntype = len(atom_types)
npair = ntype * (ntype + 1) // 2
rmin_1 = [100.0] * npair
amatrix = np.array([])
while True:
tmp = f.readline()
line = tmp.strip()
if line == '':
break
natom = int(tmp)
cell_xyz = np.zeros(shape=(3, 3))
tmp = f.readline().split()
if tmp[0] == "NON_ORTHO":
if len(tmp) == 11:
# format: ["NON_ORTHO", cell[0,:], cell[1,:], cell[2,:],
# energy]
cell_xyz[0, :] = [float(x) for x in tmp[1:4]]
cell_xyz[1, :] = [float(x) for x in tmp[4:7]]
cell_xyz[2, :] = [float(x) for x in tmp[7:10]]
elif len(tmp) == 17:
# format: ["NON_ORTHO", cell[0,:], cell[1,:], cell[2,:],
# sigma_xx/yy/zz/xy/yz/zx, energy]
cell_xyz[0, :] = [float(x) for x in tmp[1:4]]
cell_xyz[1, :] = [float(x) for x in tmp[4:7]]
cell_xyz[2, :] = [float(x) for x in tmp[7:10]]
ncondensed += 1
else:
print("error, unknown option")
exit()
else:
print("keyword NON_ORTHO isn't found!")
exit()
atomlist = []
xyz = np.zeros(shape=(natom, 3))
for k in range(natom):
tmp = f.readline().split()
atomlist.append(tmp[0])
xyz[k, 0] = float(tmp[1])
xyz[k, 1] = float(tmp[2])
xyz[k, 2] = float(tmp[3])
atoms = Atoms(symbols=atomlist,
positions=xyz,
cell=cell_xyz,
pbc=True)
cell = atoms.get_cell()
nconf += 1
pair = []
dist = np.array([])
for i in range(natom):
tmp_arr = get_distances(atoms.positions[i],
atoms.positions,
pbc=True,
cell=cell)
tmp_dist = tmp_arr[1][0]
tmp_dist[tmp_dist < 0.01] = 100.0
tmp_pair = self._chimes_apair(atomlist, atomlist[i])
pair.extend(tmp_pair)
dist = np.append(dist, tmp_dist)
rmin_2 = self._chimes_rmin_calc(dist, pair, atom_types)
amatrix = np.append(amatrix, rmin_2)
rmin_1 = np.minimum(rmin_1, rmin_2)
if (nconf * npair != len(amatrix)):
print("the size of minimum distance matrix is wrong.")
exit()
amatrix = amatrix.reshape((nconf, npair))
np.savetxt('rmin.dat', amatrix, fmt='%.6f')
f.close
np.savetxt('rmin_all.dat', rmin_1, fmt='%.6f')
return nconf, ncondensed, rmin_1
def _chimes_write_input(
self,
file_xyz: str,
atom_types: list[str],
nconf: int,
ncondensed: int,
rmin: np.ndarray,
polynomial_orders: list[int],
cutoff_distances: list[float],
) -> None:
"""
Write ChIMES input file for fitting.
:param file_xyz: Path to xyzf file.
:type file_xyz: str
:param atom_types: List of atom types.
:type atom_types: list[str]
:param nconf: Number of configurations.
:type nconf: int
:param ncondensed: Number of condensed phase configs.
:type ncondensed: int
:param rmin: Minimum pair distances.
:type rmin: np.ndarray
:param polynomial_orders: Polynomial orders for ChIMES.
:type polynomial_orders: list[int]
:param cutoff_distances: Cutoff distances for ChIMES.
:type cutoff_distances: list[float]
"""
f2 = open('fm_setup.in', "w")
f2.write("\n")
f2.write("####### CONTROL VARIABLES #######\n")
f2.write("\n")
f2.write("# TRJFILE #\n")
f2.write("%s\n" % file_xyz)
f2.write("# WRAPTRJ #\n")
f2.write("true\n")
f2.write("# NFRAMES #\n")
f2.write("%d\n" % nconf)
f2.write("# NLAYERS #\n")
f2.write("1\n")
f2.write("# FITSTRS #\n")
if ncondensed > 0:
f2.write("FIRSTALL %d\n" % ncondensed)
else:
f2.write("false\n")
f2.write("# FITENER #\n")
f2.write("true\n")
f2.write("# FITCOUL #\n")
f2.write("false\n")
f2.write("# FITPOVR #\n")
f2.write("false\n")
f2.write("# PAIRTYP #\n")
f2.write("CHEBYSHEV ")
for i in range(len(polynomial_orders)):
f2.write("%d " % (polynomial_orders[i]))
f2.write("\n")
f2.write("# CHBTYPE #\n")
f2.write("MORSE\n")
# We will probably need this for large training data
# f2.write("# SPLITFI #\n")
# f2.write("true\n")
# f2.write("# SKIP_FRAMES #\n")
# f2.write("1\n")
f2.write("\n")
f2.write("####### TOPOLOGY VARIABLES #######\n")
f2.write("\n")
f2.write("# NATMTYP # \n")
f2.write("%d\n" % len(atom_types))
f2.write("\n")
f2.write("# TYPEIDX # ")
f2.write("# ATM_TYP # ")
f2.write("# ATMCHRG # ")
f2.write("# ATMMASS # ")
f2.write("\n")
for i in range(len(atom_types)):
atomic_number = atomic_numbers[atom_types[i]]
atomic_mass = atomic_masses[atomic_number]
f2.write("%4d %4s %4d %8.3f\n" %
(i + 1, atom_types[i], 0, atomic_mass))
f2.write("\n")
f2.write("# PAIRIDX # ")
f2.write("# ATM_TY1 # ")
f2.write("# ATM_TY1 # ")
f2.write("# S_MINIM # ")
f2.write("# S_MAXIM # ")
f2.write("# S_DELTA # ")
f2.write("# MORSE_LAMBDA # ")
f2.write("# USEOVRP # ")
f2.write("# NIJBINS # ")
f2.write("# NIKBINS # ")
f2.write("# NJKBINS # ")
f2.write("\n")
ncount = 0
for i in range(len(atom_types)):
ncount += 1
atomic_number_1 = atomic_numbers[atom_types[i]]
atomic_number_2 = atomic_numbers[atom_types[i]]
atomic_radius_1 = covalent_radii[atomic_number_1]
atomic_radius_2 = covalent_radii[atomic_number_2]
bond_length = atomic_radius_1 + atomic_radius_2
f2.write("%4d " % (ncount))
f2.write("%4s " % (atom_types[i]))
f2.write("%4s " % (atom_types[i]))
f2.write("%7.3f " % (rmin[ncount - 1]))
f2.write("%7.3f " % (cutoff_distances[0]))
f2.write("%7.3f " % (0.1))
f2.write("%7.3f " % (bond_length))
f2.write("%s " % ('false'))
f2.write(" %d %d %d\n" % (0, 0, 0))
for i in range(len(atom_types)):
for j in range(i + 1, len(atom_types)):
ncount += 1
atomic_number_1 = atomic_numbers[atom_types[i]]
atomic_number_2 = atomic_numbers[atom_types[j]]
atomic_radius_1 = covalent_radii[atomic_number_1]
atomic_radius_2 = covalent_radii[atomic_number_2]
bond_length = atomic_radius_1 + atomic_radius_2
f2.write("%4d " % (ncount))
f2.write("%4s " % (atom_types[i]))
f2.write("%4s " % (atom_types[j]))
f2.write("%7.3f " % (rmin[ncount - 1]))
f2.write("%7.3f " % (cutoff_distances[0]))
f2.write("%7.3f " % (0.1))
f2.write("%7.3f " % (bond_length))
f2.write("%s " % ('false'))
f2.write("%d %d %d\n" % (0, 0, 0))
f2.write("\n")
f2.write("SPECIAL 3B S_MAXIM: ALL %7.3f\n" % cutoff_distances[1])
f2.write("SPECIAL 4B S_MAXIM: ALL %7.3f\n" % cutoff_distances[2])
f2.write("\n")
f2.write("# FCUTTYP #\n")
f2.write("TERSOFF 0.95\n")
f2.write("\n")
f2.write("# ENDFILE #\n")
f2.close()
def _chimes_perform_fit(self) -> None:
"""
Run the ChIMES fitting executables to perform parameter optimization.
"""
f = open("fm_setup.out", "w")
f2 = open("ChIMES_params.txt", "w")
subprocess.run([self.exe_chimes_fit_1, "fm_setup.in"], stdout=f)
subprocess.run(
[
"python",
self.exe_chimes_fit_2,
"--alpha=0.01",
"--weights=weights.dat",
"--algorithm=lassolars",
],
stdout=f2,
)
def _chimes_fit_vs_reference(
self,
file_ref: Optional[str] = 'b.txt',
file_fit: Optional[str] = 'forces.txt',
file_label: Optional[str] = 'label.txt',
) -> float:
"""
Compare ChIMES fit results to reference data and compute RMSE.
:param file_ref: Reference data file. Typically b.txt
:type file_ref: str
:param file_fit: Fitted data file. Typically forces.txt
:type file_fit: str
:param file_label: Label file. Typically label.txt
:type file_label: str
:return: Root mean squared error between reference and fit.
:rtype: float
"""
ref = np.loadtxt(file_ref)
fit = np.loadtxt(file_fit)
label = np.loadtxt(file_label, dtype=str)
iloc_forces = [j for j in range(len(label)) if "forces" in label[j]]
ref_forces = ref[iloc_forces]
fit_forces = fit[iloc_forces]
combined_array = np.column_stack((ref_forces, fit_forces))
np.savetxt('data_compare_force.dat',
combined_array,
fmt='%15.3f',
delimiter=' ')
iloc_energy = [j for j in range(len(label)) if "energy" in label[j]]
ref_energy = ref[iloc_energy]
fit_energy = fit[iloc_energy]
combined_array = np.column_stack((ref_energy, fit_energy))
np.savetxt('data_compare_energy.dat',
combined_array,
fmt='%15.3f',
delimiter=' ')
iloc_stress = [j for j in range(len(label)) if "stress" in label[j]]
ref_stress = ref[iloc_stress]
fit_stress = fit[iloc_stress]
combined_array = np.column_stack((ref_stress, fit_stress))
np.savetxt('data_compare_stress.dat',
combined_array,
fmt='%15.3f',
delimiter=' ')
return np.sqrt(np.mean((ref - fit)**2))
[docs]
def train(
self,
path_type: str,
potential: Potential,
storage: Storage,
dataset_list: list[str],
workflow: Optional[Workflow] = None,
eweight: Optional[float] = 1.0,
fweight: Optional[float] = 1.0,
vweight: Optional[float] = 1.0,
per_atom_weights: Optional[bool] = False,
write_training_script: Optional[bool] = True,
upload_to_kimkit: Optional[bool] = True,
) -> tuple[ChIMES, float]:
"""
Train a ChIMES potential
This is the main method of the trainer class, and uses the parameters
supplied in the ChIMES settings file to perform the potential training
in the fit_directory locaiton specified at instantiation.
:param path_type: specifier for the workflow path, to differentiate
training runs; currently unused in this function
:type path_type: str
:param potential: class object containing ChIMES instance
:type potential: ChIMESPotential instance
:param storage: Storage instance to pull data from
:type storage: Storage
:param dataset_list: List of dataset handles to train with
:type dataset_list: list[str]
:param workflow: the workflow for managing path definition and job
submission, if none are supplied, will use the default workflow
defined in this class |default| ``None``
:type workflow: Workflow
:param eweight: weight of energy data in the loss function
:type eweight: float
:param fweight: weight of the force data in the loss function
:type fweight: float
:param vweight: weight of the stress data in the loss function
:type vweight: float
:param per_atom_weights: True to read from dataset |default| ``False``
:type per_atom_weights: boolean
:param write_training_script: True to write a training script in the
working trainer directory |default| ``True``
:type write_training_script: bool
:param upload_to_kimkit: Upload to kimkit after training. |default|
``True``
:type upload_to_kimkit: bool
:return: Tuple of (trained ChIMES model, error metric).
:rtype: tuple[ChIMES, float]
"""
if dataset_list is None or storage is None:
raise ValueError('A storage object and list of dataset handles'
' are required!')
if not isinstance(per_atom_weights, bool):
raise ValueError('per_atom_weights must be bool for ChIMES!')
# reset parameter_path for new training
potential.parameter_path = None
if write_training_script:
# for normal training we need to make a path to save to
if workflow is None:
workflow = self.default_wf
save_path = workflow.make_path(self.__class__.__name__, path_type)
else:
save_path = path_type
if not isinstance(dataset_list, list):
dataset_list = [dataset_list]
combined_dataset = []
for dataset_handle in dataset_list:
configs = self._get_training_data(dataset_handle, storage)
combined_dataset.extend(configs)
# Write ASE object to file xyzf, which is one input file for ChIMES LSQ
collect_atom_types = []
for atoms in combined_dataset:
collect_atom_types.extend(
self._chimes_write_data(
atoms,
eweight,
fweight,
vweight,
per_atom_weights,
))
# Read the xyzf file and compute rmins for all pairs,
# as well as the total number of configurations
# and the number of condensed structures
file_xyz = "training_ChIMES.xyzf"
atom_types = np.unique(collect_atom_types, return_counts=False)
nconf, ncondensed, rmins = self._chimes_read_xyzf(file_xyz, atom_types)
chimes = potential.model
_x = chimes.polynomial_orders
polynomial_orders = [int(i) for i in _x.split()]
_x = chimes.cutoff_distances
cutoff_distances = [float(i) for i in _x.split()]
if len(polynomial_orders) != 3 or len(cutoff_distances) != 3:
raise ValueError(
'lengths of polynomial_orders and cutoff_distances'
' must be 3!')
# Write file fm_setup.in, the other input for ChIMES LSQ
self._chimes_write_input(file_xyz, atom_types, nconf, ncondensed,
rmins, polynomial_orders, cutoff_distances)
current_directory = os.getcwd()
subprocess.run(['rm', '-rf', self.fit_directory])
subprocess.run(["mkdir", self.fit_directory])
files_to_move = [
"training_ChIMES.xyzf", "fm_setup.in", "rmin.dat", "rmin_all.dat",
"weights.dat", "label.txt"
]
for file_to_move in files_to_move:
subprocess.run(["mv", file_to_move, self.fit_directory])
os.chdir(self.fit_directory)
self._chimes_perform_fit()
rmse = self._chimes_fit_vs_reference(file_ref='b.txt',
file_fit='force.txt',
file_label='label.txt')
os.chdir(current_directory)
potential.model = chimes
# Finally output the model files
_ = self._save_model(
save_path,
potential,
potential_name='chimes_potential',
loss=rmse,
create_path=False,
workflow=workflow,
)
if upload_to_kimkit:
training_files = [f'{path_type}/training_script.py']
# if include_weights_file is True:
# training_files.append(f'{path_type}/weights.txt')
potential.save_potential_files(work_dir=save_path,
training_files=training_files,
import_to_kimkit=True,
write_to_tmp_dir=False)
return chimes, rmse
[docs]
def submit_train(
self,
path_type: str,
potential: Potential,
storage: Storage,
dataset_list: list[str],
workflow: Workflow,
job_details: dict,
eweight: Optional[float] = 1.0,
fweight: Optional[float] = 1.0,
vweight: Optional[float] = 1.0,
per_atom_weights: Optional[bool] = False,
upload_to_kimkit: Optional[bool] = True,
) -> int:
"""
Asychronously train the potential based on the trainer details
This is a main method of the trainer class, and uses the parameters
supplied at instantiation to perform the potential training by
minimizing a loss function. While :meth:`train` works synchronously,
this method submits training to a job scheduler. Unless fit_directory
is set as an absolute path, it will be a local version in the working
directory generated by the Workflow.
:param path_type: specifier for the workflow path, to differentiate
training runs
:type path_type: str
:param potential: potential to be trained. The actual model itself is
set as an attribute of the Potential object
:type potential: Potential
:param storage: Storage instance to pull data from
:type storage: Storage
:param dataset_list: List of dataset handles to train with
:type dataset_list: list[str]
:param workflow: the workflow for managing path definition and job
submission, if none are supplied, will use the default workflow
defined in this class
:type workflow: Workflow
:param job_details: job parameters such as walltime or # of nodes
:type job_details: dict
:param eweight: weight of energy data in the loss function
:type eweight: float
:param fweight: weight of the force data in the loss function
:type fweight: float
:param vweight: weight of the stress data in the loss function
:type vweight: float
:param per_atom_weights: True to read from dataset |default| ``False``
:type per_atom_weights: boolean
:param upload_to_kimkit: Upload to kimkit after training |default|
``True``
:type upload_to_kimkit: bool
:returns: calculation ID of the submitted job
:rtype: int
"""
if dataset_list is None or storage is None:
raise ValueError('A storage object and list of dataset handles'
' are required!')
if not isinstance(per_atom_weights, bool):
raise ValueError('per_atom_weights must be bool for ChIMES!')
# reset parameter_path for new training
potential.parameter_path = None
potential.trainer_args['parameter_path'] = None
if not isinstance(dataset_list, list):
dataset_list = [dataset_list]
save_path = workflow.make_path(self.__class__.__name__, f'{path_type}')
script_filename = self._write_training_script(
save_path,
dataset_list,
potential,
storage,
eweight,
fweight,
vweight,
per_atom_weights=per_atom_weights,
upload_to_kimkit=upload_to_kimkit,
)
job_details['custom_preamble'] = 'python'
calc_id = workflow.submit_job(
script_filename,
save_path,
job_details=job_details,
)
return calc_id
def _save_model(
self,
path_type: str,
potential: Potential,
potential_name: Optional[str] = 'chimes_potential',
loss: Optional[float] = None,
create_path: Optional[bool] = True,
workflow: Optional[Workflow] = None,
) -> str:
"""
Deploy a ChIMES model. Write error metric and LAMMPS input files
Provide the path to the ChIMES parameter file
:param path_type: specifier for the workflow path, to differentiate
training runs and where the model will be saved
:type path_type: str
:param potential: potential to be saved
:type potential: ChIMESPotential
:param potential_name: name to save the potential as
|default| 'chimes_potential'
:type potential_name: str
:param loss: ChIMES error object; this can but probably should not
be supplied by the user
:type loss: ChIMES error
:param create_path: if the function needs to create a new path, or if
path_type should be used as the full path |default| ``True``
:type create_path: boolean
:param workflow: the workflow for managing path definition, if none are
supplied, will use the default workflow defined in this class
|default| ``None``
:type workflow: Workflow
:returns: path where the model is saved (inclusive)
:rtype: str
"""
if workflow is None:
workflow = self.default_wf
if create_path:
save_path = workflow.make_path(self.__class__.__name__, path_type)
else:
save_path = path_type
if potential.parameter_path is not None:
return potential.parameter_path
else: # first save after a local train()
self.logger.info(f'Saving model state in {save_path}')
potential.parameter_path = f'{save_path}/{potential_name}'
file_parameter = f'{self.fit_directory}/ChIMES_params.txt'
# Write the masses to masses.lammps for later use in LAMMPS
# with ChIMES via KIM_API.
self._chimes_write_masses()
subprocess.run(["mv", "masses.lammps", f'{save_path}/'])
subprocess.run(
["mv", file_parameter, f'{save_path}/{potential_name}'])
return f'{save_path}/{potential_name}'
[docs]
def load_from_submitted_training(
self,
calc_id: int,
potential: Potential,
workflow: Workflow,
) -> None:
"""
reload a potential that was trained via a submitted job
:param calc_id: calculation ID of the submitted training job
:type calc_id: int
:param potential: :class:`~.ChIMESPotential`
class object that will be updated with the model saved to disk
after the training job.
:type potential: ChIMESPotential
:param workflow: the workflow for managing path definition and job
submission
:type workflow: Workflow
"""
workflow.block_until_completed(calc_id)
if potential.name is not None:
potential_name = potential.name
else:
potential_name = "chimes_potential"
parameter_path = workflow.get_job_path(calc_id) + '/' + potential_name
potential.parameter_path = parameter_path
self.logger.info(f'Loading potential from: {parameter_path}')