Source code for orchestrator.trainer.chimes

import os
import subprocess
from ase.units import kcal, mol, Hartree, Bohr
from ase.geometry import get_distances
from ase.data import atomic_numbers, atomic_masses
from ase.data import covalent_radii, chemical_symbols
from os import path
import numpy as np
from typing import Optional
from ..storage.storage_base import Storage
from ..potential.potential_base import Potential
from ..potential.chimes import ChIMES
from ..workflow.workflow_base import Workflow
from ase import Atoms
from .trainer_base import Trainer
from ..utils.data_standard import (
    ENERGY_KEY,
    FORCES_KEY,
    STRESS_KEY,
    SELECTION_MASK_KEY,
)


class ChIMESTrainer(Trainer):
    """
    Train and deploy a potential using ChIMES

    The trainer class is responsible for handling the loading/assignment of
    training data, as well as the actual process of training a potential.
    This trainer is intended to be used with ChIMES model trained with ASE
    training data.

    WARNING: the fit directory location will be overwritten during any call
    to the train functions.
    """

    def __init__(
        self,
        exe_chimes_fit_1: str,
        exe_chimes_fit_2: str,
        fit_directory: Optional[str] = '_ChIMES_FIT',
        **kwargs,
    ) -> None:
        """
        Set up the trainer and remember how to re-create it.

        :param exe_chimes_fit_1: Path to the first ChIMES fitting executable
            - /build/chimes_lsq (executable)
        :type exe_chimes_fit_1: str
        :param exe_chimes_fit_2: Path to the second ChIMES fitting executable
            - src/chimes_lsq.py (python script)
        :type exe_chimes_fit_2: str
        :param fit_directory: Directory for fitting outputs. WARNING: this
            directory location will be overwritten during any call to a
            training function
        :type fit_directory: Optional[str]
        :param kwargs: Additional keyword arguments forwarded to the base
            Trainer.
        :type kwargs: dict
        """
        super().__init__(**kwargs)

        # Both executables are stored as absolute paths; the fit directory
        # is stored exactly as given.
        lsq_binary = path.abspath(exe_chimes_fit_1)
        lsq_script = path.abspath(exe_chimes_fit_2)

        self.exe_chimes_fit_1 = lsq_binary
        self.exe_chimes_fit_2 = lsq_script
        self.fit_directory = fit_directory

        # arguments needed to reinitialize an equivalent trainer instance
        self.trainer_init_args = dict(
            exe_chimes_fit_1=lsq_binary,
            exe_chimes_fit_2=lsq_script,
            fit_directory=fit_directory,
        )
[docs] def checkpoint_trainer(self) -> None: """ checkpoint the trainer module into the checkpoint file save necessary internal variables into a dict with key checkpoint_name and write to the (json) checkpoint file for restart capabilities """ pass
[docs] def restart_trainer(self) -> None: """ restart the trainer module from the checkpoint file check if the checkpoint_file has an entry matching the checkpoint_name and set internal variables accordingly if so """ pass
def _get_training_data( self, dataset_handle: str, storage: Storage, ) -> list[Atoms]: """ Get the training data configurations Retrieve the dataset specified by dataset_handle from the passed storage module. :param dataset_handle: the identifier of the dataset to extract from the storage module :type dataset_handle: str :param storage: storage instance where the training data is saved :type storage: Storage :returns: training data of configurations :rtype: ASE Dataset """ self.logger.info('Reading training data from storage') training_set = storage.get_data(dataset_handle) for c in training_set: try: c.info[ENERGY_KEY] = c.get_potential_energy() except Exception: pass try: c.info[STRESS_KEY] = c.get_stress() except Exception: pass try: c.set_array(FORCES_KEY, c.get_forces()) except Exception: pass try: c.info[SELECTION_MASK_KEY] = c.get_array(SELECTION_MASK_KEY) except Exception: pass return training_set def _write_training_script( self, save_path: str, dataset_list: list[str], potential: Potential, storage: Storage, eweight: float = 1.0, fweight: float = 1.0, vweight: float = 1.0, per_atom_weights: bool = False, upload_to_kimkit=True, ) -> str: """ write a script to run the trainer outside of memory this is a helper function for generating a script, training_script.py, which can be executed via a workflow or offline :param save_path: path where the training script will be written :type save_path: str :param dataset_list: list of dataset handles which should be used for the training procedure :type dataset_list: list of str :param potential: Potential instance to be trained, expect its pre-trained state to be written to save_path/potential_to_train.pkl :type potential: Potential :param storage: an instance of the storage class, which contains the datasets in dataset_list :type storage: Storage :param eweight: weight of energy data in the loss function :type eweight: float :param fweight: weight of the force data in the loss function :type fweight: float :param 
vweight: weight of the stress data in the loss function :type vweight: float :param per_atom_weights: True to read from dataset |default| ``False`` :type per_atom_weights: boolean :param upload_to_kimkit: Whether to upload to kimkit after training |default| ``True``. :type upload_to_kimkit: bool :returns: the name of the execution script :rtype: str """ full_save_path = path.abspath(save_path) import_lines = ('from orchestrator.utils.setup_input import ' 'init_and_validate_module_type\n' 'from numpy import loadtxt, array, zeros\n') trainer_dict = { 'trainer_type': self.factory_token, 'trainer_args': self.trainer_init_args } init_trainer = ('trainer = init_and_validate_module_type("trainer", ' f'{trainer_dict}, single_input_dict=True)') storage_dict = { 'storage_type': storage.factory_token, 'storage_args': storage.storage_init_args } init_storage = ('storage = init_and_validate_module_type("storage", ' f'{storage_dict}, single_input_dict=True)') potential_dict = { 'potential_type': potential.factory_token, 'potential_args': potential.trainer_args } init_potential = ('potential = init_and_validate_module_type(' f'"potential", {potential_dict}, ' 'single_input_dict=True)\n') load_potential = "potential.build_potential()" # Currently uses the workflow from trainer, not submit_train's input construct_and_train = ( f'chimes, errors = trainer.train(path_type="{full_save_path}",' 'potential=potential,' 'storage=storage,' f'dataset_list={dataset_list},' f'eweight={eweight},' f'fweight={fweight},' f'vweight={vweight},' f'per_atom_weights={per_atom_weights},' 'write_training_script=False,' f'upload_to_kimkit={upload_to_kimkit})') script = '\n'.join([ import_lines, init_trainer, init_storage, init_potential, load_potential, construct_and_train, ]) with open(f'{save_path}/training_script.py', 'w') as fout: fout.write(script) return 'training_script.py' def _chimes_write_masses(self) -> None: """ Write atomic masses for all elements to a LAMMPS-compatible file. 
""" # Elements from H (Z=1) to Og (Z=118) symbols = chemical_symbols[1:119] masses = atomic_masses[1:119] nlen = len(symbols) comment = ( """# The KIM API Simulator Model Interface (SMI) allows a uniform # interface to any simulator model regardless of type with the # "kim interactions" command followed by the mapping of species to numeric # LAMMPS atom types, e.g. if your atom types 1 and 2 are C, and 3 is Si, # "kim interactions C C Si" # The atom types string (e.g. "C C Si") is passed to the LAMMPS commands in # smspec.edn through the template map key "atom-type-sym-list". # See https://kim-api.readthedocs.io/en/latest/implementation.html#kim_api_smi # Usually, this can be used with the pair_coeff command, but because ChIMES # assigns atom types based on mass, we use LAMMPS scripting to assign masses # by saving "atom-type-sym-list" as the LAMMPS variable kim_atom_type_sym_list # in smspec.edn, and invoking this LAMMPS script. Repeated "mass" commands # should not be an issue if the user wishes to define or redefine the masses # later. 
variable atom_sym_i index ${kim_atom_type_sym_list} variable atom_type_i loop 10000 label loopi """) file_path = 'masses.lammps' with open(file_path, 'w') as file: file.write(comment) # first element is Hydrogen atom_symbol = symbols[0] atom_mass = masses[0] text = (f' if "${{atom_sym_i}} == {atom_symbol}" then ' f'"mass ${{atom_type_i}} {atom_mass:.6f}" &\n') file.write(text) for i in range(1, nlen - 1): atom_symbol = symbols[i] atom_mass = masses[i] text = (f' elif "${{atom_sym_i}} == {atom_symbol}" ' f'"mass ${{atom_type_i}} {atom_mass:.6f}" &\n') file.write(text) # last element atom_symbol = symbols[-1] atom_mass = masses[-1] text = (f' elif "${{atom_sym_i}} == {atom_symbol}" ' f'"mass ${{atom_type_i}} {atom_mass:.6f}" \n') file.write(text) # some last lines text = """ next atom_type_i next atom_sym_i jump SELF loopi variable atom_type_i delete """ file.write(text) def _chimes_write_xyzf( self, atomlist: list[str], xyz: np.ndarray, cell_xyz: np.ndarray, fxyz: np.ndarray, energy: float, stress: np.ndarray, weights: list[float], weight_mask: np.ndarray, ) -> None: """ Write fitting data to an xyz file for ChIMES LSQ. This is called on the data from a single atomic configuration :param atomlist: List of atomic symbols. :type atomlist: list[str] :param xyz: Atomic positions array. :type xyz: np.ndarray :param cell_xyz: Cell matrix. :type cell_xyz: np.ndarray :param fxyz: Forces array. :type fxyz: np.ndarray :param energy: Configuration energy. :type energy: float :param stress: Stress tensor. :type stress: np.ndarray :param weights: List of weights [eweight, fweight, vweight]. :type weights: list[float] :param weight_mask: Per-atom weight mask. 
:type weight_mask: np.ndarray """ eweight = weights[0] fweight = weights[1] vweight = weights[2] f2 = open('training_ChIMES.xyzf', 'a') f3 = open('weights.dat', 'a') f4 = open('label.txt', 'a') natom = len(atomlist) f2.write("%1d\n" % (natom)) # cell parameters f2.write("NON_ORTHO ") for i in range(3): for j in range(3): f2.write("%9.4f" % (cell_xyz[i, j])) if (len(stress) > 0): # Voigt notation for stress tensor: # xx, yy, zz for i in range(3): f2.write("%12.4f" % (stress[i])) # stress off-diagonal xy, xz, yz f2.write("%12.4f" % (stress[5])) f2.write("%12.4f" % (stress[4])) f2.write("%12.4f" % (stress[3])) # energy f2.write("%20.4f" % (energy)) f2.write("\n") # xyz, fxyz for i in range(natom): f2.write("%s" % (atomlist[i])) for j in range(3): f2.write("%15.9f" % (xyz[i, j])) for j in range(3): f2.write("%15.9f" % (fxyz[i, j])) f2.write("\n") # weights of forces for j in range(3): f3.write("%15.9f\n" % (fweight * weight_mask[i])) f4.write("forces\n") if (len(stress) > 0): # weights of stress for j in range(9): f3.write("%15.9f\n" % (vweight)) f4.write("stress\n") # weights of energy for j in range(3): f3.write("%15.9f\n" % (eweight / natom)) f4.write("energy\n") f2.close() f3.close() f4.close() def _chimes_write_data( self, atoms: Atoms, eweight: float, fweight: float, vweight: float, per_atom_weights: bool, ) -> np.ndarray: """ Organize and write fitting data for ChIMES from an ASE Atoms object. :param atoms: ASE Atoms object. :type atoms: Atoms :param eweight: Energy weight. :type eweight: float :param fweight: Force weight. :type fweight: float :param vweight: Stress weight. :type vweight: float :param per_atom_weights: Use per-atom weights if True. :type per_atom_weights: bool :return: Array of unique atom symbols in the configuration. 
:rtype: np.ndarray """ cell_xyz = atoms.cell.array atomlist = list(atoms.symbols) xyz = atoms.get_positions() energy = atoms.info[ENERGY_KEY] forces = atoms.arrays[FORCES_KEY] try: stress = atoms.info[STRESS_KEY] stress = np.array(stress) except Exception: stress = np.array([]) if per_atom_weights: try: weight_mask = atoms.get_array(SELECTION_MASK_KEY) except KeyError: raise RuntimeError('per atom weights set to true but no ' 'selection mask available in the Atoms!') else: weight_mask = np.ones(len(atoms)) # convert units # ChIMES Orchestrator (metal LAMMPS) # Energy kcal/mol eV # Forces Ha/Bohr eV/Angstrom # Stress GPa bar kcal_per_mol = kcal / mol ha_per_bohr = Hartree / Bohr gpa = 10000.0 energy *= 1.0 / kcal_per_mol forces *= 1.0 / ha_per_bohr if (len(stress) > 0): stress *= 1.0 / gpa weights = [eweight, fweight, vweight] self._chimes_write_xyzf(atomlist, xyz, cell_xyz, forces, energy, stress, weights, weight_mask) return np.unique(atomlist, return_counts=False) def _chimes_apair(self, atom_list: list[str], atom_type: str) -> list[str]: """ Generate all alphabetically sorted pairs with a given atom type. :param atom_list: List of atom symbols, i.e. ['C', 'H', 'N', 'O'] :type atom_list: list[str] :param atom_type: The atom symbol to pair with others, i.e. 'N' :type atom_type: str :return: List of sorted atom pairs as strings, i.e. ['CN', 'HN', 'NN', 'NO'] :rtype: list[str] """ natom = len(atom_list) pair = [] for i in range(natom): tmp_0 = [atom_type, atom_list[i]] tmp_0.sort() pair.append(tmp_0[0] + tmp_0[1]) return pair def _chimes_rmin_calc( self, list_dist: list[float], list_pair: list[str], atypes: list[str], ) -> list[float]: """ Calculate minimum interatomic distances for all pairs. :param list_dist: List of distances, i.e. [1.1, 1.3, 1.2, 1.5] :type list_dist: list[float] :param list_pair: List of pair labels, i.e. ['CC', 'CH', 'HH', 'CN'] :type list_pair: list[str] :param atypes: Array of sorted unique atom types, i.e. 
['C', 'H', 'N', 'O'] :type atypes: list[str] :return: List of minimum distances for each pair type (['CC', 'HH', 'NN', 'OO', 'CH', 'CN', 'CO', 'HN', 'HO', 'NO']) :rtype: list[float] """ ntype = len(atypes) rmins = [] # pairs with one atom type for i in range(ntype): tpair = atypes[i] + atypes[i] iloc = [j for j in range(len(list_pair)) if list_pair[j] == tpair] if len(iloc) == 0: # atom pair is not found, minimum distance is 100 Angstrom rmin = 100.0 else: # take the minimum distance rmin = np.min(list_dist[iloc]) rmins.append(rmin) # pairs with two atom types for i in range(ntype): for k in range(i + 1, ntype): tpair = atypes[i] + atypes[k] iloc = [ j for j in range(len(list_pair)) if list_pair[j] == tpair ] if len(iloc) == 0: # atom pair is not found, minimum distance is 100 Angstrom rmin = 100.0 else: # take the minimum distance rmin = np.min(list_dist[iloc]) rmins.append(rmin) return rmins def _chimes_read_xyzf( self, file_xyz: str, atom_types: list[str], ) -> tuple[int, int, np.ndarray]: """ Parse a ChIMES xyzf file to determine configuration and pair statistics :param file_xyz: Path to xyzf file that contains forces, energy, and stresses. :type file_xyz: str :param atom_types: list of sorted atom types. :type atom_types: list[str] :return: Tuple of (number of configurations, number of condensed phase, rmin array of the element pairs). 
:rtype: tuple[int, int, np.ndarray] """ f = open(file_xyz, "rt") nconf = 0 ncondensed = 0 ntype = len(atom_types) npair = ntype * (ntype + 1) // 2 rmin_1 = [100.0] * npair amatrix = np.array([]) while True: tmp = f.readline() line = tmp.strip() if line == '': break natom = int(tmp) cell_xyz = np.zeros(shape=(3, 3)) tmp = f.readline().split() if tmp[0] == "NON_ORTHO": if len(tmp) == 11: # format: ["NON_ORTHO", cell[0,:], cell[1,:], cell[2,:], # energy] cell_xyz[0, :] = [float(x) for x in tmp[1:4]] cell_xyz[1, :] = [float(x) for x in tmp[4:7]] cell_xyz[2, :] = [float(x) for x in tmp[7:10]] elif len(tmp) == 17: # format: ["NON_ORTHO", cell[0,:], cell[1,:], cell[2,:], # sigma_xx/yy/zz/xy/yz/zx, energy] cell_xyz[0, :] = [float(x) for x in tmp[1:4]] cell_xyz[1, :] = [float(x) for x in tmp[4:7]] cell_xyz[2, :] = [float(x) for x in tmp[7:10]] ncondensed += 1 else: print("error, unknown option") exit() else: print("keyword NON_ORTHO isn't found!") exit() atomlist = [] xyz = np.zeros(shape=(natom, 3)) for k in range(natom): tmp = f.readline().split() atomlist.append(tmp[0]) xyz[k, 0] = float(tmp[1]) xyz[k, 1] = float(tmp[2]) xyz[k, 2] = float(tmp[3]) atoms = Atoms(symbols=atomlist, positions=xyz, cell=cell_xyz, pbc=True) cell = atoms.get_cell() nconf += 1 pair = [] dist = np.array([]) for i in range(natom): tmp_arr = get_distances(atoms.positions[i], atoms.positions, pbc=True, cell=cell) tmp_dist = tmp_arr[1][0] tmp_dist[tmp_dist < 0.01] = 100.0 tmp_pair = self._chimes_apair(atomlist, atomlist[i]) pair.extend(tmp_pair) dist = np.append(dist, tmp_dist) rmin_2 = self._chimes_rmin_calc(dist, pair, atom_types) amatrix = np.append(amatrix, rmin_2) rmin_1 = np.minimum(rmin_1, rmin_2) if (nconf * npair != len(amatrix)): print("the size of minimum distance matrix is wrong.") exit() amatrix = amatrix.reshape((nconf, npair)) np.savetxt('rmin.dat', amatrix, fmt='%.6f') f.close np.savetxt('rmin_all.dat', rmin_1, fmt='%.6f') return nconf, ncondensed, rmin_1 def _chimes_write_input( self, 
file_xyz: str, atom_types: list[str], nconf: int, ncondensed: int, rmin: np.ndarray, polynomial_orders: list[int], cutoff_distances: list[float], ) -> None: """ Write ChIMES input file for fitting. :param file_xyz: Path to xyzf file. :type file_xyz: str :param atom_types: List of atom types. :type atom_types: list[str] :param nconf: Number of configurations. :type nconf: int :param ncondensed: Number of condensed phase configs. :type ncondensed: int :param rmin: Minimum pair distances. :type rmin: np.ndarray :param polynomial_orders: Polynomial orders for ChIMES. :type polynomial_orders: list[int] :param cutoff_distances: Cutoff distances for ChIMES. :type cutoff_distances: list[float] """ f2 = open('fm_setup.in', "w") f2.write("\n") f2.write("####### CONTROL VARIABLES #######\n") f2.write("\n") f2.write("# TRJFILE #\n") f2.write("%s\n" % file_xyz) f2.write("# WRAPTRJ #\n") f2.write("true\n") f2.write("# NFRAMES #\n") f2.write("%d\n" % nconf) f2.write("# NLAYERS #\n") f2.write("1\n") f2.write("# FITSTRS #\n") if ncondensed > 0: f2.write("FIRSTALL %d\n" % ncondensed) else: f2.write("false\n") f2.write("# FITENER #\n") f2.write("true\n") f2.write("# FITCOUL #\n") f2.write("false\n") f2.write("# FITPOVR #\n") f2.write("false\n") f2.write("# PAIRTYP #\n") f2.write("CHEBYSHEV ") for i in range(len(polynomial_orders)): f2.write("%d " % (polynomial_orders[i])) f2.write("\n") f2.write("# CHBTYPE #\n") f2.write("MORSE\n") # We will probably need this for large training data # f2.write("# SPLITFI #\n") # f2.write("true\n") # f2.write("# SKIP_FRAMES #\n") # f2.write("1\n") f2.write("\n") f2.write("####### TOPOLOGY VARIABLES #######\n") f2.write("\n") f2.write("# NATMTYP # \n") f2.write("%d\n" % len(atom_types)) f2.write("\n") f2.write("# TYPEIDX # ") f2.write("# ATM_TYP # ") f2.write("# ATMCHRG # ") f2.write("# ATMMASS # ") f2.write("\n") for i in range(len(atom_types)): atomic_number = atomic_numbers[atom_types[i]] atomic_mass = atomic_masses[atomic_number] f2.write("%4d %4s 
%4d %8.3f\n" % (i + 1, atom_types[i], 0, atomic_mass)) f2.write("\n") f2.write("# PAIRIDX # ") f2.write("# ATM_TY1 # ") f2.write("# ATM_TY1 # ") f2.write("# S_MINIM # ") f2.write("# S_MAXIM # ") f2.write("# S_DELTA # ") f2.write("# MORSE_LAMBDA # ") f2.write("# USEOVRP # ") f2.write("# NIJBINS # ") f2.write("# NIKBINS # ") f2.write("# NJKBINS # ") f2.write("\n") ncount = 0 for i in range(len(atom_types)): ncount += 1 atomic_number_1 = atomic_numbers[atom_types[i]] atomic_number_2 = atomic_numbers[atom_types[i]] atomic_radius_1 = covalent_radii[atomic_number_1] atomic_radius_2 = covalent_radii[atomic_number_2] bond_length = atomic_radius_1 + atomic_radius_2 f2.write("%4d " % (ncount)) f2.write("%4s " % (atom_types[i])) f2.write("%4s " % (atom_types[i])) f2.write("%7.3f " % (rmin[ncount - 1])) f2.write("%7.3f " % (cutoff_distances[0])) f2.write("%7.3f " % (0.1)) f2.write("%7.3f " % (bond_length)) f2.write("%s " % ('false')) f2.write(" %d %d %d\n" % (0, 0, 0)) for i in range(len(atom_types)): for j in range(i + 1, len(atom_types)): ncount += 1 atomic_number_1 = atomic_numbers[atom_types[i]] atomic_number_2 = atomic_numbers[atom_types[j]] atomic_radius_1 = covalent_radii[atomic_number_1] atomic_radius_2 = covalent_radii[atomic_number_2] bond_length = atomic_radius_1 + atomic_radius_2 f2.write("%4d " % (ncount)) f2.write("%4s " % (atom_types[i])) f2.write("%4s " % (atom_types[j])) f2.write("%7.3f " % (rmin[ncount - 1])) f2.write("%7.3f " % (cutoff_distances[0])) f2.write("%7.3f " % (0.1)) f2.write("%7.3f " % (bond_length)) f2.write("%s " % ('false')) f2.write("%d %d %d\n" % (0, 0, 0)) f2.write("\n") f2.write("SPECIAL 3B S_MAXIM: ALL %7.3f\n" % cutoff_distances[1]) f2.write("SPECIAL 4B S_MAXIM: ALL %7.3f\n" % cutoff_distances[2]) f2.write("\n") f2.write("# FCUTTYP #\n") f2.write("TERSOFF 0.95\n") f2.write("\n") f2.write("# ENDFILE #\n") f2.close() def _chimes_perform_fit(self) -> None: """ Run the ChIMES fitting executables to perform parameter optimization. 
""" f = open("fm_setup.out", "w") f2 = open("ChIMES_params.txt", "w") subprocess.run([self.exe_chimes_fit_1, "fm_setup.in"], stdout=f) subprocess.run( [ "python", self.exe_chimes_fit_2, "--alpha=0.01", "--weights=weights.dat", "--algorithm=lassolars", ], stdout=f2, ) def _chimes_fit_vs_reference( self, file_ref: Optional[str] = 'b.txt', file_fit: Optional[str] = 'forces.txt', file_label: Optional[str] = 'label.txt', ) -> float: """ Compare ChIMES fit results to reference data and compute RMSE. :param file_ref: Reference data file. Typically b.txt :type file_ref: str :param file_fit: Fitted data file. Typically forces.txt :type file_fit: str :param file_label: Label file. Typically label.txt :type file_label: str :return: Root mean squared error between reference and fit. :rtype: float """ ref = np.loadtxt(file_ref) fit = np.loadtxt(file_fit) label = np.loadtxt(file_label, dtype=str) iloc_forces = [j for j in range(len(label)) if "forces" in label[j]] ref_forces = ref[iloc_forces] fit_forces = fit[iloc_forces] combined_array = np.column_stack((ref_forces, fit_forces)) np.savetxt('data_compare_force.dat', combined_array, fmt='%15.3f', delimiter=' ') iloc_energy = [j for j in range(len(label)) if "energy" in label[j]] ref_energy = ref[iloc_energy] fit_energy = fit[iloc_energy] combined_array = np.column_stack((ref_energy, fit_energy)) np.savetxt('data_compare_energy.dat', combined_array, fmt='%15.3f', delimiter=' ') iloc_stress = [j for j in range(len(label)) if "stress" in label[j]] ref_stress = ref[iloc_stress] fit_stress = fit[iloc_stress] combined_array = np.column_stack((ref_stress, fit_stress)) np.savetxt('data_compare_stress.dat', combined_array, fmt='%15.3f', delimiter=' ') return np.sqrt(np.mean((ref - fit)**2))
[docs] def train( self, path_type: str, potential: Potential, storage: Storage, dataset_list: list[str], workflow: Optional[Workflow] = None, eweight: Optional[float] = 1.0, fweight: Optional[float] = 1.0, vweight: Optional[float] = 1.0, per_atom_weights: Optional[bool] = False, write_training_script: Optional[bool] = True, upload_to_kimkit: Optional[bool] = True, ) -> tuple[ChIMES, float]: """ Train a ChIMES potential This is the main method of the trainer class, and uses the parameters supplied in the ChIMES settings file to perform the potential training in the fit_directory locaiton specified at instantiation. :param path_type: specifier for the workflow path, to differentiate training runs; currently unused in this function :type path_type: str :param potential: class object containing ChIMES instance :type potential: ChIMESPotential instance :param storage: Storage instance to pull data from :type storage: Storage :param dataset_list: List of dataset handles to train with :type dataset_list: list[str] :param workflow: the workflow for managing path definition and job submission, if none are supplied, will use the default workflow defined in this class |default| ``None`` :type workflow: Workflow :param eweight: weight of energy data in the loss function :type eweight: float :param fweight: weight of the force data in the loss function :type fweight: float :param vweight: weight of the stress data in the loss function :type vweight: float :param per_atom_weights: True to read from dataset |default| ``False`` :type per_atom_weights: boolean :param write_training_script: True to write a training script in the working trainer directory |default| ``True`` :type write_training_script: bool :param upload_to_kimkit: Upload to kimkit after training. |default| ``True`` :type upload_to_kimkit: bool :return: Tuple of (trained ChIMES model, error metric). 
:rtype: tuple[ChIMES, float] """ if dataset_list is None or storage is None: raise ValueError('A storage object and list of dataset handles' ' are required!') if not isinstance(per_atom_weights, bool): raise ValueError('per_atom_weights must be bool for ChIMES!') # reset parameter_path for new training potential.parameter_path = None if write_training_script: # for normal training we need to make a path to save to if workflow is None: workflow = self.default_wf save_path = workflow.make_path(self.__class__.__name__, path_type) else: save_path = path_type if not isinstance(dataset_list, list): dataset_list = [dataset_list] combined_dataset = [] for dataset_handle in dataset_list: configs = self._get_training_data(dataset_handle, storage) combined_dataset.extend(configs) # Write ASE object to file xyzf, which is one input file for ChIMES LSQ collect_atom_types = [] for atoms in combined_dataset: collect_atom_types.extend( self._chimes_write_data( atoms, eweight, fweight, vweight, per_atom_weights, )) # Read the xyzf file and compute rmins for all pairs, # as well as the total number of configurations # and the number of condensed structures file_xyz = "training_ChIMES.xyzf" atom_types = np.unique(collect_atom_types, return_counts=False) nconf, ncondensed, rmins = self._chimes_read_xyzf(file_xyz, atom_types) chimes = potential.model _x = chimes.polynomial_orders polynomial_orders = [int(i) for i in _x.split()] _x = chimes.cutoff_distances cutoff_distances = [float(i) for i in _x.split()] if len(polynomial_orders) != 3 or len(cutoff_distances) != 3: raise ValueError( 'lengths of polynomial_orders and cutoff_distances' ' must be 3!') # Write file fm_setup.in, the other input for ChIMES LSQ self._chimes_write_input(file_xyz, atom_types, nconf, ncondensed, rmins, polynomial_orders, cutoff_distances) current_directory = os.getcwd() subprocess.run(['rm', '-rf', self.fit_directory]) subprocess.run(["mkdir", self.fit_directory]) files_to_move = [ "training_ChIMES.xyzf", 
"fm_setup.in", "rmin.dat", "rmin_all.dat", "weights.dat", "label.txt" ] for file_to_move in files_to_move: subprocess.run(["mv", file_to_move, self.fit_directory]) os.chdir(self.fit_directory) self._chimes_perform_fit() rmse = self._chimes_fit_vs_reference(file_ref='b.txt', file_fit='force.txt', file_label='label.txt') os.chdir(current_directory) potential.model = chimes # Finally output the model files _ = self._save_model( save_path, potential, potential_name='chimes_potential', loss=rmse, create_path=False, workflow=workflow, ) if upload_to_kimkit: training_files = [f'{path_type}/training_script.py'] # if include_weights_file is True: # training_files.append(f'{path_type}/weights.txt') potential.save_potential_files(work_dir=save_path, training_files=training_files, import_to_kimkit=True, write_to_tmp_dir=False) return chimes, rmse
[docs] def submit_train( self, path_type: str, potential: Potential, storage: Storage, dataset_list: list[str], workflow: Workflow, job_details: dict, eweight: Optional[float] = 1.0, fweight: Optional[float] = 1.0, vweight: Optional[float] = 1.0, per_atom_weights: Optional[bool] = False, upload_to_kimkit: Optional[bool] = True, ) -> int: """ Asychronously train the potential based on the trainer details This is a main method of the trainer class, and uses the parameters supplied at instantiation to perform the potential training by minimizing a loss function. While :meth:`train` works synchronously, this method submits training to a job scheduler. Unless fit_directory is set as an absolute path, it will be a local version in the working directory generated by the Workflow. :param path_type: specifier for the workflow path, to differentiate training runs :type path_type: str :param potential: potential to be trained. The actual model itself is set as an attribute of the Potential object :type potential: Potential :param storage: Storage instance to pull data from :type storage: Storage :param dataset_list: List of dataset handles to train with :type dataset_list: list[str] :param workflow: the workflow for managing path definition and job submission, if none are supplied, will use the default workflow defined in this class :type workflow: Workflow :param job_details: job parameters such as walltime or # of nodes :type job_details: dict :param eweight: weight of energy data in the loss function :type eweight: float :param fweight: weight of the force data in the loss function :type fweight: float :param vweight: weight of the stress data in the loss function :type vweight: float :param per_atom_weights: True to read from dataset |default| ``False`` :type per_atom_weights: boolean :param upload_to_kimkit: Upload to kimkit after training |default| ``True`` :type upload_to_kimkit: bool :returns: calculation ID of the submitted job :rtype: int """ if dataset_list is None 
or storage is None: raise ValueError('A storage object and list of dataset handles' ' are required!') if not isinstance(per_atom_weights, bool): raise ValueError('per_atom_weights must be bool for ChIMES!') # reset parameter_path for new training potential.parameter_path = None potential.trainer_args['parameter_path'] = None if not isinstance(dataset_list, list): dataset_list = [dataset_list] save_path = workflow.make_path(self.__class__.__name__, f'{path_type}') script_filename = self._write_training_script( save_path, dataset_list, potential, storage, eweight, fweight, vweight, per_atom_weights=per_atom_weights, upload_to_kimkit=upload_to_kimkit, ) job_details['custom_preamble'] = 'python' calc_id = workflow.submit_job( script_filename, save_path, job_details=job_details, ) return calc_id
def _save_model( self, path_type: str, potential: Potential, potential_name: Optional[str] = 'chimes_potential', loss: Optional[float] = None, create_path: Optional[bool] = True, workflow: Optional[Workflow] = None, ) -> str: """ Deploy a ChIMES model. Write error metric and LAMMPS input files Provide the path to the ChIMES parameter file :param path_type: specifier for the workflow path, to differentiate training runs and where the model will be saved :type path_type: str :param potential: potential to be saved :type potential: ChIMESPotential :param potential_name: name to save the potential as |default| 'chimes_potential' :type potential_name: str :param loss: ChIMES error object; this can but probably should not be supplied by the user :type loss: ChIMES error :param create_path: if the function needs to create a new path, or if path_type should be used as the full path |default| ``True`` :type create_path: boolean :param workflow: the workflow for managing path definition, if none are supplied, will use the default workflow defined in this class |default| ``None`` :type workflow: Workflow :returns: path where the model is saved (inclusive) :rtype: str """ if workflow is None: workflow = self.default_wf if create_path: save_path = workflow.make_path(self.__class__.__name__, path_type) else: save_path = path_type if potential.parameter_path is not None: return potential.parameter_path else: # first save after a local train() self.logger.info(f'Saving model state in {save_path}') potential.parameter_path = f'{save_path}/{potential_name}' file_parameter = f'{self.fit_directory}/ChIMES_params.txt' # Write the masses to masses.lammps for later use in LAMMPS # with ChIMES via KIM_API. self._chimes_write_masses() subprocess.run(["mv", "masses.lammps", f'{save_path}/']) subprocess.run( ["mv", file_parameter, f'{save_path}/{potential_name}']) return f'{save_path}/{potential_name}'
[docs] def load_from_submitted_training( self, calc_id: int, potential: Potential, workflow: Workflow, ) -> None: """ reload a potential that was trained via a submitted job :param calc_id: calculation ID of the submitted training job :type calc_id: int :param potential: :class:`~.ChIMESPotential` class object that will be updated with the model saved to disk after the training job. :type potential: ChIMESPotential :param workflow: the workflow for managing path definition and job submission :type workflow: Workflow """ workflow.block_until_completed(calc_id) if potential.name is not None: potential_name = potential.name else: potential_name = "chimes_potential" parameter_path = workflow.get_job_path(calc_id) + '/' + potential_name potential.parameter_path = parameter_path self.logger.info(f'Loading potential from: {parameter_path}')