Source code for orchestrator.computer.score.quests

import numpy as np
from ase import Atoms
from typing import Optional, Union, Any
from .score_base import DatasetScore, AtomCenteredScore, ScoreQuantity
from orchestrator.utils.data_standard import (METADATA_KEY, SELECTION_MASK_KEY,
                                              PLACEHOLDER_ARRAY_KEY)

from quests.entropy import (perfect_entropy, diversity, delta_entropy,
                            approx_delta_entropy, DEFAULT_BANDWIDTH,
                            DEFAULT_BATCH, DEFAULT_UQ_NBRS, DEFAULT_GRAPH_NBRS)



[docs]
class QUESTSEfficiencyScore(DatasetScore):
    """
    An information-based method of quantifying dataset diversity.

    This module wraps the `quests` package, which performs a kernel density
    estimate of the distribution of points in a descriptor space to obtain a
    non-parametric estimate of the information entropy of a dataset. This
    estimate can then be used to identify the "efficiency" of the dataset (the
    lack of redundancy).
    """

    OUTPUT_KEY = 'quests_efficiency'
    supported_score_quantities = [ScoreQuantity.EFFICIENCY]


[docs]
    def __init__(self, **kwargs):

        super().__init__()

        # QUESTS does not have any init_args
        self._init_args = {}
        self._metadata = {}



[docs]
    def compute(
        self,
        dataset: list[Atoms],
        score_quantity: int,
        apply_mask: bool = False,
        descriptors_key: str = 'descriptors',
        bandwidth: float = DEFAULT_BANDWIDTH,
        batch_size: Optional[int] = DEFAULT_BATCH,
        **kwargs,
    ) -> Union[float, np.ndarray]:
        """
        Computes the efficiency of the dataset.

        The efficiency is a measure of how little oversampling the dataset has.
        If the efficiency is near 1, the dataset has very little redundancy.

        :param dataset: a list of ASE Atoms objects.
        :type dataset: list

        :param score_quantity: the type of score value to compute
        :type score_quantity: int

        :param apply_mask: if True, apply the environment selection mask; can
            only be used if mask already exists for all configurations.
        :type apply_mask: bool

        :param descriptors_key: the key to use for extracting the descriptors
            from an ASE.Atoms object
        :type descriptors_key: str

        :param bandwidth: the bandwidth used by the Gaussian kernel for KDE.
        :type bandwidth: float

        :param batch_size: the maximum batch size to consider when performing a
            distance calculation.
        :type batch_size: int

        :returns: the efficiency of the dataset
        :rtype: float
        """

        if isinstance(score_quantity, str):
            score_quantity = ScoreQuantity[
                score_quantity]  # Enum conversion uses []

        if score_quantity not in self.supported_score_quantities:
            raise RuntimeError(
                f"Requested compute value '{score_quantity}' is "
                "not supported by '{self.__class__.__name__}'."
                " Supported quantities are "
                "'{self.supported_score_quantities}'")

        if isinstance(dataset, Atoms):
            dataset = [dataset]

        x = np.concatenate(
            [atoms.arrays[descriptors_key] for atoms in dataset])

        # Try to apply a selection mask
        if apply_mask:
            try:
                selection_mask = np.concatenate(
                    [atoms.arrays[SELECTION_MASK_KEY] for atoms in dataset])
                selection_mask = selection_mask.astype(bool)

                x = x[selection_mask]
            except KeyError:  # no mask exists for one of the configs
                raise RuntimeError("`apply_mask=True` was provided, but masks"
                                   " could not be found for all atoms.")

        entropy = perfect_entropy(x, h=bandwidth, batch_size=batch_size)
        max_entropy = np.log(x.shape[0])

        efficiency = entropy / max_entropy

        self._metadata = {'bandwidth': bandwidth}

        return np.array([efficiency])



[docs]
    def get_colabfit_property_definition(
        self,
        score_quantity: int,
    ) -> dict[str, Any]:

        if isinstance(score_quantity, str):
            score_quantity = ScoreQuantity[
                score_quantity]  # Enum conversion uses []

        if score_quantity not in self.supported_score_quantities:
            raise RuntimeError(
                f"Requested compute value '{score_quantity}' is "
                "not supported by '{self.__class__.__name__}'."
                " Supported quantities are "
                "'{self.supported_score_quantities}'")

        return {
            'property-id': 'tag:staff@noreply.colabfit.org,2024-12-09:'
            f'property/{self.OUTPUT_KEY.replace("_", "-")}',

            "property-name": self.OUTPUT_KEY.replace("_", "-"),

            "property-title": "QUESTS data efficiency",

            "property-description": "The efficiency is the ratio of "
            "H/maxH, which measures the dataset entropy relative to the "
            "theoretical maximum entropy of a non-overlapping dataset of the "
            "same size. " "Will be in the range [0, 1].",

            "score": {  # example: 0.90
                "type": "float",
                "has-unit": True,  # depends on log base; usually 'nats'
                "extent": [],
                "required": True,
                "description": "The efficiency H/maxH",
            },

            "bandwidth": {  # example: 0.015
                "type": "float",
                "has-unit": False,
                "extent": [],
                "required": True,
                "description": "The bandwidth used by the Gaussian kernel"
                " for KDE"
            },

            # NOTE: batch_size is not part of the definition because it is
            # only used for computational efficiency
        }





[docs]
class QUESTSDiversityScore(DatasetScore):
    """
    An information-based method of quantifying dataset diversity.

    This module wraps the `quests` package, which performs a kernel density
    estimate of the distribution of points in a descriptor space to obtain a
    non-parametric estimate of the information entropy of a dataset. This
    estimate can then be used to identify the "diversity" of a dataset.
    """

    OUTPUT_KEY = 'quests_diversity'
    supported_score_quantities = [ScoreQuantity.DIVERSITY]


[docs]
    def __init__(self, **kwargs):

        super().__init__()

        # QUESTS does not have any init_args
        self._init_args = {}
        self._metadata = {}



[docs]
    def compute(
        self,
        dataset: list[Atoms],
        score_quantity: int,
        apply_mask: bool = False,
        descriptors_key: str = 'descriptors',
        bandwidth: float = DEFAULT_BANDWIDTH,
        batch_size: Optional[int] = DEFAULT_BATCH,
        **kwargs,
    ) -> Union[float, np.ndarray]:
        """
        Computes the diversity of the dataset.

        The diversity is a measure of how well the dataset covers all regions
        of the configuration space that it spans.

        :param dataset: a list of ASE Atoms objects.
        :type dataset: list

        :param score_quantity: the type of score value to compute
        :type score_quantity: int

        :param apply_mask: if True, apply the environment selection mask; can
            only be used if mask already exists for all configurations.
        :type apply_mask: bool

        :param descriptors_key: the key to use for extracting the descriptors
            from an ASE.Atoms object
        :type descriptors_key: str

        :param bandwidth: the bandwidth used by the Gaussian kernel for KDE.
        :type bandwidth: float

        :param batch_size: the maximum batch size to consider when performing a
            distance calculation.
        :type batch_size: int

        :returns: returns the diversity of the dataset
        :rtype: float
        """
        if isinstance(score_quantity, str):
            score_quantity = ScoreQuantity[
                score_quantity]  # Enum conversion uses []

        if score_quantity not in self.supported_score_quantities:
            raise RuntimeError(
                f"Requested compute value '{score_quantity}' is "
                "not supported by '{self.__class__.__name__}'."
                " Supported quantities are "
                "'{self.supported_score_quantities}'")

        if isinstance(dataset, Atoms):
            dataset = [dataset]

        x = np.concatenate(
            [atoms.arrays[descriptors_key] for atoms in dataset])

        # Try to apply a selection mask
        if apply_mask:
            try:
                selection_mask = np.concatenate(
                    [atoms.arrays[SELECTION_MASK_KEY] for atoms in dataset])
                selection_mask = selection_mask.astype(bool)

                x = x[selection_mask]
            except KeyError:  # no mask exists for one of the configs
                raise RuntimeError("`apply_mask=True` was provided, but masks"
                                   " could not be found for all atoms.")

        return np.array(diversity(x, h=bandwidth, batch_size=batch_size))



[docs]
    def get_colabfit_property_definition(
        self,
        score_quantity: int,
    ) -> dict[str, Any]:

        if isinstance(score_quantity, str):
            score_quantity = ScoreQuantity[
                score_quantity]  # Enum conversion uses []

        if score_quantity not in self.supported_score_quantities:
            raise RuntimeError(
                f"Requested compute value '{score_quantity}' is "
                "not supported by '{self.__class__.__name__}'."
                " Supported quantities are "
                "'{self.supported_score_quantities}'")

        return {
            'property-id': 'tag:staff@noreply.colabfit.org,2024-12-09:'
            f'property/{self.OUTPUT_KEY.replace("_", "-")}',

            "property-name": self.OUTPUT_KEY.replace("_", "-"),

            "property-title": "QUESTS data diversity",

            "property-description": "The diversity is an estimate of how"
            " well the dataset covers the configuration space.",

            "score": {  # example: 0.90
                "type": "float",
                "has-unit": True,  # depends on log base; usually 'nats'
                "extent": [],
                "required": True,
                "description": "The diversity of the dataset",
            },

            "bandwidth": {  # example: 0.015
                "type": "float",
                "has-unit": False,
                "extent": [],
                "required": True,
                "description": "The bandwidth used by the Gaussian kernel"
                " for KDE"
            },

            # NOTE: batch_size is not part of the definition because it is
            # only used for computational efficiency
        }





[docs]
class QUESTSDeltaEntropyScore(AtomCenteredScore):

    OUTPUT_KEY = 'quests_delta_entropy'
    supported_score_quantities = [
        ScoreQuantity.DELTA_ENTROPY,  # deltaH
    ]


[docs]
    def compute(
        self,
        atoms: Atoms,
        score_quantity: int,
        reference_set: np.ndarray,
        descriptors_key: str = 'descriptors',
        approx: bool = False,
        bandwidth: float = DEFAULT_BANDWIDTH,
        num_nearest_neighbors: int = DEFAULT_UQ_NBRS,
        graph_neighbors: int = DEFAULT_GRAPH_NBRS,
        **kwargs,
    ) -> Union[float, np.ndarray]:
        """
        Calls compute_batch with a single-configuration list.

        :param atoms: a single ASE Atoms objects.
        :type list_of_atoms: Atoms

        :param score_quantity: the type of score value to compute
        :type score_quantity: int

        :param reference_set: an (N, D) matrix with the descriptors of the
            reference.
        :type reference_set: np.ndarray

        :param descriptors_key: the key to use for extracting the descriptors
            from an ASE.Atoms object
        :type descriptors_key: str

        :param approx: if True, uses an approximate nearest neighbor search to
            compute the delta entropy values. Recommended for large data sizes.
        :type approx: bool

        :param bandwidth: the bandwidth used by the Gaussian kernel for KDE.
        :type bandwidth: float

        :param num_nearest_neighbors: number of nearest-neighbors to take into
            account when computing the approximate dH.
        :type num_nearest_neighbors: int

        :param graph_neighbors: a parameter used by pynndescent for performing
            the approximate nearest neighbor search.
        :type graph_neighbors: int

        :returns: returns the delta_entropy
        :rtype: float or np.ndarray
        """
        if isinstance(atoms, Atoms):
            return self.compute_batch(
                [atoms],
                score_quantity,
                descriptors_key=descriptors_key,
                approx=approx,
                bandwidth=bandwidth,
                num_nearest_neighbors=num_nearest_neighbors,
                graph_neighbors=graph_neighbors,
                reference_set=reference_set,
            )[0]
        elif isinstance(atoms, list):
            raise RuntimeError(
                ".compute_batch() should be used to compute this score for"
                " multiple atoms objects")
        else:
            raise RuntimeError(
                f"Invalid input type of '{type(atoms)}' passed to .compute()")



[docs]
    def compute_batch(
        self,
        list_of_atoms: Union[list[Atoms], Atoms],
        score_quantity: int,
        reference_set: np.ndarray,
        descriptors_key: str = 'descriptors',
        approx: bool = False,
        bandwidth: float = DEFAULT_BANDWIDTH,
        batch_size: Optional[int] = DEFAULT_BATCH,
        num_nearest_neighbors: int = DEFAULT_UQ_NBRS,
        graph_neighbors: int = DEFAULT_GRAPH_NBRS,
        **kwargs,
    ) -> Union[float, np.ndarray]:
        """
        Calls compute_batch with a single-configuration list.

        :param list_of_atoms: a list of ASE Atoms objects. If a
            list of ASE Atoms is provided, then 'descriptors_key' should be
            provided in `args` to allow extraction of descriptors.
        :type list_of_atoms: list

        :param score_quantity: the type of score value to compute
        :type score_quantity: int

        :param descriptors_key: the key to use for extracting the descriptors
            from an ASE.Atoms object
        :type descriptors_key: str

        :param approx: if True, uses an approximate nearest neighbor search to
            compute the delta entropy values. Recommended for large data sizes.
        :type approx: bool

        :param bandwidth: the bandwidth used by the Gaussian kernel for KDE.
        :type bandwidth: float

        :param batch_size: the maximum batch size to consider when performing a
            distance calculation.
        :type batch_size: int

        :param num_nearest_neighbors: number of nearest-neighbors to take into
            account when computing the approximate dH.
        :type num_nearest_neighbors: int

        :param graph_neighbors: a parameter used by pynndescent for performing
            the approximate nearest neighbor search.
        :type graph_neighbors: int

        :param reference_set: an (N, D) matrix with the descriptors of the
            reference.
        :type reference_set: np.ndarray

        :returns: the delta_entropy scores for each atom
        :rtype: float or np.ndarray
        """
        if isinstance(score_quantity, str):
            score_quantity = ScoreQuantity[
                score_quantity]  # Enum conversion uses []

        if score_quantity not in self.supported_score_quantities:
            raise RuntimeError(
                f"Requested compute value '{score_quantity}' is "
                "not supported by '{self.__class__.__name__}'."
                " Supported quantities are "
                "'{self.supported_score_quantities}'")

        shapes = [len(_) for _ in list_of_atoms]

        descriptors = np.concatenate(
            [atoms.arrays[descriptors_key] for atoms in list_of_atoms])

        if isinstance(reference_set, str):
            reference_set_name = reference_set  # for saving in _metadata
            reference_set = np.load(reference_set)
        else:
            reference_set_name = PLACEHOLDER_ARRAY_KEY

        if approx:
            results = approx_delta_entropy(descriptors,
                                           reference_set,
                                           h=bandwidth,
                                           n=num_nearest_neighbors,
                                           graph_neighbors=graph_neighbors)
        else:
            results = delta_entropy(descriptors,
                                    reference_set,
                                    h=bandwidth,
                                    batch_size=batch_size)

        self._metadata = {
            'reference_set': reference_set_name,
            'bandwidth': bandwidth,
            'approx': approx,
            'num_nearest_neighbors': num_nearest_neighbors,
            'graph_neighbors': graph_neighbors,
        }

        for atoms in list_of_atoms:
            # NOTE: these are being attached here because ColabFit can't handle
            # nested keys during property extraction. e.g. extracting
            # "cut_name" from
            # atoms.info[METADATA_KEY][self.OUTPUT_KEY]['cut_name']
            for k, v in self._metadata.items():
                atoms.info[f'{self.OUTPUT_KEY}_{k}'] = v

            if METADATA_KEY not in atoms.info:
                atoms.info[METADATA_KEY] = {}

            atoms.info[METADATA_KEY][self.OUTPUT_KEY] = self._metadata

        return np.array_split(results, np.cumsum(shapes)[:-1])



[docs]
    def get_colabfit_property_definition(
        self,
        score_quantity: int,
    ) -> dict[str, Any]:

        if isinstance(score_quantity, str):
            score_quantity = ScoreQuantity[
                score_quantity]  # Enum conversion uses []

        if score_quantity not in self.supported_score_quantities:
            raise RuntimeError(
                f"Requested compute value '{score_quantity}' is "
                "not supported by '{self.__class__.__name__}'."
                " Supported quantities are "
                "'{self.supported_score_quantities}'")

        return {
            'property-id': 'tag:staff@noreply.colabfit.org,2024-12-09:'
            f'property/{self.OUTPUT_KEY.replace("_", "-")}',

            "property-name": self.OUTPUT_KEY.replace("_", "-"),

            "property-title": "QUESTS differential entropy, dH",

            "property-description": "The estimated increase that a point"
            " will have to the entropy of a reference set",

            "score": {  # example: (N, 1) array of floats
                "type": "float",
                "has-unit": True,  # depends on log base; usually 'nats'
                "extent": [":"],
                "required": True,
                "description": "The estimated differential entropies for each"
                " atom.",
            },

            "reference-set": {  # example: "/path/to/array"
                "type": "string",
                "has-unit": False,
                "extent": [],
                "required": True,
                "description": "The location of the reference descriptors."
            },

            "bandwidth": {  # example: 0.015
                "type": "float",
                "has-unit": False,
                "extent": [],
                "required": True,
                "description": "The bandwidth used by the Gaussian kernel"
                " for KDE"
            },

            "num-nearest-neighbors": {  # example: 10
                "type": "int",
                "has-unit": False,
                "extent": [],
                "required": True,
                "description": "Number of nearest-neighbors to take into"
                " account when computing the approximate dH"
            },

            "graph-neighbors": {  # example: 10
                "type": "int",
                "has-unit": False,
                "extent": [],
                "required": True,
                "description": "A parameter used by pynndescent for "
                "performing the approximate nearest neighbor search"
            },
        }



[docs]
    def get_colabfit_property_map(
        self,
        score_quantity: int,
    ) -> dict[str, Any]:

        if isinstance(score_quantity, str):
            score_quantity = ScoreQuantity[
                score_quantity]  # Enum conversion uses []

        if score_quantity not in self.supported_score_quantities:
            raise RuntimeError(
                f"Requested compute value '{score_quantity}' is "
                "not supported by '{self.__class__.__name__}'."
                " Supported quantities are "
                "'{self.supported_score_quantities}'")

        return {
            'score': {
                'field': self.OUTPUT_KEY + '_score',
                'units': 'nats'
            },
            'bandwidth': {
                'field': self.OUTPUT_KEY + '_bandwidth',
                'units': None
            },
            'num-nearest-neighbors': {
                'field': self.OUTPUT_KEY + '_num_nearest_neighbors',
                'units': None
            },
        }