# Source code for orchestrator.storage.storage_base
from abc import ABC, abstractmethod
from ase import Atoms
from uuid import uuid4
from typing import Optional
from ..utils.recorder import Recorder
# [docs] (Sphinx viewcode link marker left over from the HTML page)
class Storage(Recorder, ABC):
    """
    Abstract base class for data storage

    The Storage class deals with all functionalities associated to data
    storage inside Orchestrator. Its functions include the initialization of
    the database, and data additions, updates, and queries. The Orchestrator
    uses a list of ASE Atoms as the internal data representation. A given
    database (Storage instance) can include multiple datasets (collections of
    configurations and properties) and generally persists in time.

    :param kwargs: keyword initialization parameters (the module
        documentation names database_name and database_path). This base
        class accepts them but does not read them; concrete subclasses are
        expected to consume them.
    :type kwargs: dict
    """

    def __init__(self, **kwargs) -> None:
        """
        Set variables and initialize the recorder

        :param kwargs: keyword initialization parameters (the module
            documentation names database_name and database_path). They are
            not used by this base initializer.
        :type kwargs: dict
        """
        super().__init__()
        # this should be set by children
        self.STORAGE_ID_KEY = 'storage_id'

    def generate_dataset_name(
        self,
        root: str,
        specifier: str,
        counter: Optional[int] = None,
        check_uniqueness: Optional[bool] = True,
    ) -> str:
        """
        generate a detailed (mostly) human-readable dataset name

        The dataset name will be in the form: root_specifier[_counter] and if
        check_uniqueness is true, root_specifier[_counter]_unique_hash.

        :param root: root of the dataset name, this should be consistent
            across similar runs (i.e. a campaign name)
        :type root: str
        :param specifier: this argument gives more fine control of the
            dataset name, allowing differentiation within a given root
        :type specifier: str
        :param counter: iteration number of the present root and specifier
            combination. This can be used for versioning of datasets.
            |default| ``None``
        :type counter: int
        :param check_uniqueness: attaches a random hash to the dataset name
            if true, and ensures that the resulting dataset name is unique
            within the storage module. |default| ``True``
        :type check_uniqueness: boolean
        :returns: the dataset name
        :rtype: str
        """
        base_string = f'{root}_{specifier}'
        # compare against None so that a counter of 0 is still appended
        if counter is not None:
            base_string += f'_{counter}'

        if not check_uniqueness:
            return base_string

        # Draw random suffixes until the storage backend reports the
        # candidate as unused. The loop has no retry limit; it relies on
        # check_if_dataset_name_unique eventually returning a true value.
        while True:
            # zero-padded, 10 digit decimal rendering of the ``time_low``
            # field of a freshly generated random UUID
            # there may be a more robust way to do this?
            random_id = f'{uuid4().time_low:010}'
            full_name = f'{base_string}_{random_id}'
            if self.check_if_dataset_name_unique(full_name):
                return full_name

    @abstractmethod
    def check_if_dataset_name_unique(self, dataset_name: str) -> bool:
        """
        check if the provided dataset_name is unique in the database

        :param dataset_name: name to check (human readable)
        :type dataset_name: str
        :returns: true if the dataset is not present in the database, false
            if it does exist
        :rtype: boolean
        """

    @abstractmethod
    def add_data(
        self,
        dataset_handle: str,
        data: list[Atoms],
        dataset_metadata: Optional[dict] = None,
    ) -> str:
        """
        Add new configurations (and associated properties) to the database

        This method is used to add to an existing dataset with new
        configurations. The new configurations may or may not have other
        properties associated with them.

        :param dataset_handle: name or ID of dataset
        :type dataset_handle: str or int
        :param data: list of ASE.Atoms objects containing the configurations
            and associated properties to add to the database. Note that
            configuration-specific metadata should be stored under the
            `atoms.info[METADATA_KEY]` field.
        :type data: list
        :param dataset_metadata: A dictionary of metadata specific to the
            dataset as a whole. |default| ``None``
        :type dataset_metadata: dict
        :returns: handle for the dataset which includes the new additions
        :rtype: str or int
        """

    @abstractmethod
    def new_dataset(
        self,
        dataset_handle: str,
        data: list[Atoms],
        dataset_metadata: Optional[dict] = None,
    ) -> str:
        """
        Create a new dataset with the provided data and metadata

        The new dataset will have a human readable name specified by
        dataset_handle and will ingest the data and metadata provided.

        :param dataset_handle: name of the dataset to be created
        :type dataset_handle: str
        :param data: list of ASE.Atoms objects containing the configurations
            and associated properties to add to the database. Note that
            configuration-specific metadata should be stored under the
            `atoms.info[METADATA_KEY]` field.
        :type data: list
        :param dataset_metadata: A dictionary of metadata specific to the
            dataset as a whole. |default| ``None``
        :type dataset_metadata: dict
        :returns: unique handle for the dataset, i.e. its ID
        :rtype: str or int
        """

    @abstractmethod
    def update_data(
        self,
        dataset_handle: str,
        data: list[Atoms],
        metadata: Optional[dict] = None,
    ) -> str:
        """
        Update an existing dataset - overwriting or adding new properties

        This method operates on existing configurations and/or properties.
        Data are provided as a list of ASE Atoms carrying properties that
        should either be added to the configuration as a new property or
        overwrite existing properties within the database.

        :param dataset_handle: name or ID of dataset
        :type dataset_handle: str or int
        :param data: list of ASE.Atoms objects containing the configurations
            and associated properties to add to the database. Note that
            configuration-specific metadata should be stored under the
            `atoms.info[METADATA_KEY]` field.
        :type data: list
        :param metadata: A dictionary of metadata specific to the dataset as
            a whole. |default| ``None``
        :type metadata: dict
        :returns: unique handle for the dataset
        :rtype: str
        """

    @abstractmethod
    def get_data(
        self,
        dataset_handle: str,
        query_options: Optional[dict] = None,
    ) -> list[Atoms]:
        """
        Extract data from Storage

        Return the dataset specified by dataset_handle as a list of ASE
        Atoms. Further options for parameterizing the extraction can be
        provided by the query_options dictionary.

        :param dataset_handle: name or ID of dataset
        :type dataset_handle: str or int
        :param query_options: dict of options for data extraction and return
            |default| ``None``
        :type query_options: dict
        :returns: requested data as a list of ASE Atoms
        :rtype: list
        """

    @abstractmethod
    def delete_dataset(self, dataset_handle: str):
        """
        Remove the dataset specified by dataset_handle from the database

        :param dataset_handle: name or ID of dataset
        :type dataset_handle: str
        """

    @abstractmethod
    def list_data(self, dataset_handle: Optional[str] = None):
        """
        Utility function to query the database

        Prints an overview of the database contents if no dataset_handle is
        provided, otherwise provides information about the specific dataset
        contents.

        :param dataset_handle: name or ID of dataset |default| ``None``
        :type dataset_handle: str or int
        """