Source code for uncertainpy.data

from __future__ import absolute_import, division, print_function, unicode_literals

import six
import os
import collections

import numpy as np

from .utils.utility import contains_nan, is_regular
from .utils.logger import setup_module_logger, get_logger
from ._version import __version__


class DataFeature(collections.MutableMapping):
    """
    Store the results of each statistical metric calculated from the
    uncertainty quantification and sensitivity analysis for a single
    model/feature.

    The statistical metrics can be retrieved as attributes. Additionally,
    DataFeature implements all standard dictionary methods, such as
    ``items``, ``values``, and ``contains``. This means it can be indexed
    as a regular dictionary, with the statistical metric names as keys,
    and returns the values for that statistical metric.

    Parameters
    ----------
    name : str
        Name of the model/feature.
    evaluations : {None, array_like}, optional
        Feature or model result. Default is None.
    time : {None, array_like}, optional
        Time evaluations for feature or model. Default is None.
    mean : {None, array_like}, optional
        Mean of the feature or model results. Default is None.
    variance : {None, array_like}, optional
        Variance of the feature or model results. Default is None.
    percentile_5 : {None, array_like}, optional
        5th percentile of the feature or model results. Default is None.
    percentile_95 : {None, array_like}, optional
        95th percentile of the feature or model results. Default is None.
    sobol_first : {None, array_like}, optional
        First order sensitivity of the feature or model results.
        Default is None.
    sobol_first_average : {None, array_like}, optional
        Average of the first order sensitivity of the feature or model
        results. Default is None.
    sobol_total : {None, array_like}, optional
        Total effect sensitivity of the feature or model results.
        Default is None.
    sobol_total_average : {None, array_like}, optional
        Average of the total effect sensitivity of the feature or model
        results. Default is None.
    labels : list, optional
        A list of labels for plotting, ``[x-axis, y-axis, z-axis]``.
        Default is ``[]``.

    Attributes
    ----------
    name : str
        Name of the model/feature.
    evaluations : {None, array_like}
        Feature or model output.
    time : {None, array_like}
        Time values for feature or model.
    mean : {None, array_like}
        Mean of the feature or model results.
    variance : {None, array_like}
        Variance of the feature or model results.
    percentile_5 : {None, array_like}
        5th percentile of the feature or model results.
    percentile_95 : {None, array_like}
        95th percentile of the feature or model results.
    sobol_first : {None, array_like}
        First order Sobol indices (sensitivity) of the feature or model
        results.
    sobol_first_average : {None, array_like}
        Average of the first order Sobol indices of the feature or model
        results.
    sobol_total : {None, array_like}
        Total order Sobol indices (sensitivity) of the feature or model
        results.
    sobol_total_average : {None, array_like}
        Average of the total order Sobol indices of the feature or model
        results.
    labels : list
        A list of labels for plotting, ``[x-axis, y-axis, z-axis]``.

    Notes
    -----
    The statistical metrics calculated in Uncertainpy are:

    * ``evaluations`` - the results from the model/feature evaluations.
    * ``time`` - the time of the model/feature.
    * ``mean`` - the mean of the model/feature.
    * ``variance`` - the variance of the model/feature.
    * ``percentile_5`` - the 5th percentile of the model/feature.
    * ``percentile_95`` - the 95th percentile of the model/feature.
    * ``sobol_first`` - the first order Sobol indices (sensitivity) of the
      model/feature.
    * ``sobol_first_average`` - the average of the first order Sobol
      indices (sensitivity) of the model/feature.
    * ``sobol_total`` - the total order Sobol indices (sensitivity) of the
      model/feature.
    * ``sobol_total_average`` - the average of the total order Sobol
      indices (sensitivity) of the model/feature.
    """
    def __init__(self,
                 name,
                 evaluations=None,
                 time=None,
                 mean=None,
                 variance=None,
                 percentile_5=None,
                 percentile_95=None,
                 sobol_first=None,
                 sobol_first_average=None,
                 sobol_total=None,
                 sobol_total_average=None,
                 labels=[]):
        self.name = name
        self.evaluations = evaluations
        self.time = time
        self.mean = mean
        self.variance = variance
        self.percentile_5 = percentile_5
        self.percentile_95 = percentile_95
        self.sobol_first = sobol_first
        self.sobol_first_average = sobol_first_average
        self.sobol_total = sobol_total
        self.sobol_total_average = sobol_total_average
        self.labels = labels

        self._statistical_metrics = ["evaluations", "time", "mean",
                                     "variance", "percentile_5",
                                     "percentile_95", "sobol_first",
                                     "sobol_first_average", "sobol_total",
                                     "sobol_total_average"]

        self._information = ["name", "labels"]
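
    # A minimal usage sketch (hypothetical feature name and values, not
    # from a real uncertainty quantification): a DataFeature can be read
    # and written both as attributes and as dictionary items.
    #
    #     feature = DataFeature("nr_spikes",
    #                           mean=[1.2, 1.5],
    #                           variance=[0.1, 0.2])
    #     feature["mean"]            # same as feature.mean
    #     feature["sobol_first"] = [0.6, 0.4]
    #     feature.get_metrics()      # ["mean", "sobol_first", "variance"]
    #     "variance" in feature      # True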
    def __getitem__(self, statistical_metric):
        """
        Get the data for `statistical_metric`.

        Parameters
        ----------
        statistical_metric : str
            Name of the statistical metric.

        Returns
        -------
        {array_like, None}
            The data for `statistical_metric`.
        """
        return getattr(self, statistical_metric)
    def get_metrics(self):
        """
        Get the names of all statistical metrics that contain data
        (not None).

        Returns
        -------
        list
            List of the names of all statistical metrics that contain data.
        """
        statistical_metrics = []
        for statistical_metric in dir(self):
            if not statistical_metric.startswith("_") \
                    and not callable(self[statistical_metric]) \
                    and self[statistical_metric] is not None \
                    and statistical_metric not in self._information:
                statistical_metrics.append(statistical_metric)

        return statistical_metrics
    def __setitem__(self, statistical_metric, data):
        """
        Set the data for the statistical metric.

        Parameters
        ----------
        statistical_metric : str
            Name of the statistical metric.
        data : {array_like, None}
            The data for the statistical metric.
        """
        setattr(self, statistical_metric, data)
    def __iter__(self):
        """
        Iterate over each statistical metric with data.

        Yields
        ------
        str
            Name of the statistical metric.
        """
        for statistical_metric in self.get_metrics():
            yield statistical_metric
    def __delitem__(self, statistical_metric):
        """
        Delete data for `statistical_metric` (set to None).

        Parameters
        ----------
        statistical_metric : str
            Name of the statistical metric.
        """
        setattr(self, statistical_metric, None)
    def __len__(self):
        """
        Get the number of statistical metrics with data.

        Returns
        -------
        int
            The number of statistical metrics with data.
        """
        return len(self.get_metrics())
    def __contains__(self, statistical_metric):
        """
        Check if `statistical_metric` exists and contains data (not None).

        Parameters
        ----------
        statistical_metric : str
            Name of the statistical metric.

        Returns
        -------
        bool
            True if `statistical_metric` exists and contains data
            (not None).
        """
        return statistical_metric in self.get_metrics() \
            and self[statistical_metric] is not None

    def __str__(self):
        """
        Convert all data to a readable string.

        Returns
        -------
        str
            A human readable string of all statistical metrics.
        """
        output_str = ""
        for statistical_metric in self:
            output_str += "=== {statistical_metric} ===\n".format(statistical_metric=statistical_metric)
            output_str += "{data}\n\n".format(data=self[statistical_metric])

        return output_str.strip()

    # TODO: add test for a single evaluations list
    def ndim(self):
        """
        Get the number of dimensions of the data of the feature/model.

        Returns
        -------
        {int, None}
            The number of dimensions of the data. Returns None if there
            are no evaluations, or all evaluations contain numpy.nan.
        """
        if self.evaluations is not None:
            for evaluation in self.evaluations:
                # The first evaluation without nan determines the
                # dimensionality
                if not contains_nan(evaluation):
                    return np.ndim(evaluation)

        return None
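
# A short sketch of how ``ndim`` behaves (hypothetical data): the first
# evaluation that does not contain nan determines the dimensionality.
#
#     feature = DataFeature("example", evaluations=[[1, 2, 3], [4, 5, 6]])
#     feature.ndim()    # 1, since each evaluation is a 1D array
#
#     empty = DataFeature("empty")
#     empty.ndim()      # None, since there are no evaluations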
class Data(collections.MutableMapping):
    """
    Store the results of each statistical metric calculated from the
    uncertainty quantification and sensitivity analysis for each
    model/feature.

    Has all standard dictionary methods, such as ``items``, ``values``,
    and ``contains``, implemented. Can be indexed as a regular dictionary
    with model/feature names as keys, and returns a DataFeature object
    that contains the data for all statistical metrics for that
    model/feature. Additionally, it contains information on how the
    calculations were performed.

    Parameters
    ----------
    filename : str, optional
        Name of the file to load data from. If None, no data is loaded.
        Default is None.
    backend : {"auto", "hdf5", "exdir"}, optional
        The file format used to save and load data to/from file. "auto"
        assumes the filename ends with either ".h5" for HDF5 files or
        ".exdir" for Exdir files. If the file extension is unknown, it
        defaults to saving as a HDF5 file. "hdf5" saves and loads files
        from HDF5 files. "exdir" saves and loads files from Exdir files.
        Default is "auto".
    logger_level : {"info", "debug", "warning", "error", "critical", None}, optional
        Set the threshold for the logging level. Logging messages less
        severe than this level are ignored. If None, no logging to file is
        performed. Default logger level is "info".

    Attributes
    ----------
    uncertain_parameters : list
        A list of the uncertain parameters in the uncertainty
        quantification.
    model_name : str
        Name of the model.
    incomplete : list
        List of all models/features that have missing model/feature
        evaluations.
    error : list
        List of all models/features that were irregular, but not set to be
        interpolated.
    method : str
        A string that describes the method used to perform the uncertainty
        quantification.
    data : dictionary
        A dictionary with a DataFeature for each model/feature.
    data_information : list
        List of attributes containing additional information.

    Notes
    -----
    The statistical metrics calculated for each feature and model in
    Uncertainpy are:

    * ``evaluations`` - the results from the model/feature evaluations.
    * ``time`` - the time of the model/feature.
    * ``mean`` - the mean of the model/feature.
    * ``variance`` - the variance of the model/feature.
    * ``percentile_5`` - the 5th percentile of the model/feature.
    * ``percentile_95`` - the 95th percentile of the model/feature.
    * ``sobol_first`` - the first order Sobol indices (sensitivity) of the
      model/feature.
    * ``sobol_first_average`` - the average of the first order Sobol
      indices (sensitivity) of the model/feature.
    * ``sobol_total`` - the total order Sobol indices (sensitivity) of the
      model/feature.
    * ``sobol_total_average`` - the average of the total order Sobol
      indices (sensitivity) of the model/feature.

    Raises
    ------
    ValueError
        If an unsupported backend is chosen.

    See also
    --------
    uncertainpy.DataFeature
    """
    def __init__(self,
                 filename=None,
                 backend="auto",
                 logger_level="info"):
        self.data_information = ["uncertain_parameters", "model_name",
                                 "incomplete", "method", "version", "seed",
                                 "model_ignore", "error"]

        if backend not in ["auto", "hdf5", "exdir"]:
            raise ValueError("backend {} not supported. Supported backends are: auto, hdf5, and exdir".format(backend))

        setup_module_logger(class_instance=self, level=logger_level)

        self.uncertain_parameters = []
        self.model_name = ""
        self.incomplete = []
        self.error = []
        self.data = {}
        self.method = ""
        self.model_ignore = False
        self._seed = ""
        self.backend = backend
        self.version = __version__

        if filename is not None:
            self.load(filename)

    @property
    def seed(self):
        """
        Seed used in the calculations.

        Parameters
        ----------
        new_seed : {None, int}
            Seed used in the calculations. If None, converted to "".

        Returns
        -------
        seed : {int, str}
            Seed used in the calculations.
        """
        return self._seed

    @seed.setter
    def seed(self, new_seed):
        if new_seed is None:
            self._seed = ""
        else:
            self._seed = new_seed
    def __str__(self):
        """
        Convert all data to a readable string.

        Returns
        -------
        str
            A human readable string of all stored data.
        """
        def border(msg):
            count = len(msg) + 6
            line = "="*(count + 2)

            string = """
{line}
|   {msg}   |
{line}\n\n""".format(line=line, msg=msg)
            return string

        output_str = border("Information")

        for info in self.data_information:
            current_info = getattr(self, info)
            output_str += "{info}: {current_info}\n".format(info=info,
                                                            current_info=current_info)

        for feature in self:
            output_str += border(feature)

            output_str += "=== labels ===\n"
            output_str += "{data}\n\n".format(data=self[feature].labels)

            output_str += str(self[feature]) + "\n"

        return output_str.strip()
    def clear(self):
        """
        Clear all data.
        """
        self.uncertain_parameters = []
        self.model_name = ""
        self.incomplete = []
        self.error = []
        self.data = {}
        self.method = ""
        self._seed = ""
        self.model_ignore = False
        self.version = __version__
    def ndim(self, feature):
        """
        Get the number of dimensions of a `feature`.

        Parameters
        ----------
        feature : str
            Name of the model or a feature.

        Returns
        -------
        {int, None}
            The number of dimensions of the model/feature result. Returns
            None if the feature has no evaluations, or only contains nan.
        """
        return self[feature].ndim()
    def get_labels(self, feature):
        """
        Get labels for a `feature`. If no labels are defined, returns a
        list with the correct number of empty strings.

        Parameters
        ----------
        feature : str
            Name of the model or a feature.

        Returns
        -------
        list
            A list of labels for plotting, ``[x-axis, y-axis, z-axis]``.
            If no labels are defined (labels = []), returns a list with
            the correct number of empty strings.
        """
        if self[feature].labels != []:
            return self[feature].labels

        # Fall back to the model labels when the feature has the same
        # dimensionality as the model
        elif self[self.model_name].labels != [] \
                and self[self.model_name].ndim() == self[feature].ndim():
            return self[self.model_name].labels

        else:
            return [""]*(self[feature].ndim() + 1)
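
    # A minimal sketch of the fallback behavior (hypothetical names,
    # assuming a Data object ``data`` with model "model" and a feature
    # "my_feature" of the same dimensionality):
    #
    #     data["model"].labels = ["time (ms)", "voltage (mV)"]
    #     data.get_labels("model")       # ["time (ms)", "voltage (mV)"]
    #     # "my_feature" has no labels of its own, so it borrows the
    #     # model labels:
    #     data.get_labels("my_feature")  # ["time (ms)", "voltage (mV)"]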
    def __getitem__(self, feature):
        """
        Get the DataFeature containing the data for `feature`.

        Parameters
        ----------
        feature : str
            Name of feature/model.

        Returns
        -------
        DataFeature
            The DataFeature containing the data for `feature`.
        """
        return self.data[feature]
    def __setitem__(self, feature, data):
        """
        Set `data` for `feature`. `data` must be a DataFeature object.

        Parameters
        ----------
        feature : str
            Name of feature/model.
        data : DataFeature
            DataFeature with the data for `feature`.

        Raises
        ------
        ValueError
            If `data` is not a DataFeature.
        """
        if not isinstance(data, DataFeature):
            raise ValueError("data must be of type DataFeature")

        self.data[feature] = data
    def __iter__(self):
        """
        Iterate over each feature/model that has not errored.

        Yields
        ------
        str
            Name of feature/model.
        """
        for d in self.data:
            if d not in self.error:
                yield d
    def __delitem__(self, feature):
        """
        Delete data for `feature`.

        Parameters
        ----------
        feature : str
            Name of feature.
        """
        del self.data[feature]
    def __len__(self):
        """
        Get the number of models/features that have not errored.

        Returns
        -------
        int
            The number of models/features that have not errored.
        """
        return len(self.data) - len(self.error)
    def add_features(self, features):
        """
        Add features (which contain no data).

        Parameters
        ----------
        features : {str, list}
            Name of feature to add, or list of features to add.
        """
        if isinstance(features, six.string_types):
            features = [features]

        for feature in features:
            self.data[feature] = DataFeature(feature)
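
    # A small usage sketch (hypothetical feature names): features are
    # added empty and their statistical metrics filled in afterwards.
    #
    #     data = Data()
    #     data.add_features(["nr_spikes", "spike_rate"])
    #     data["nr_spikes"]["mean"] = [1.5, 2.5]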
# TODO expand the save function to also save parameters and model information
    def save(self, filename):
        """
        Save data to a HDF5 or Exdir file with name `filename`.

        Parameters
        ----------
        filename : str
            Name of the file to save data to.

        Raises
        ------
        ImportError
            If h5py is not installed.
        ImportError
            If Exdir is not installed.
        """
        logger = get_logger(self)

        if self.backend == "auto":
            if filename.endswith(".h5"):
                current_backend = "hdf5"
            elif filename.endswith(".exdir"):
                current_backend = "exdir"
            else:
                logger.warning("Unknown file extension, defaulting to save {} as a HDF5 file.".format(filename))
                current_backend = "hdf5"
        else:
            current_backend = self.backend

        if current_backend == "hdf5":
            try:
                import h5py as backend
            except ImportError:
                raise ImportError("The HDF5 backend requires: h5py")

        elif current_backend == "exdir":
            try:
                import exdir.core as backend
            except ImportError:
                raise ImportError("The Exdir backend requires: exdir")

        def add_group(group, values, name="evaluation"):
            """Recursively store irregular (nested) values as numbered
            datasets and subgroups."""
            iteration = 0
            padding = len(str(len(values) + 1))
            for value in values:
                try:
                    group.create_dataset(name + "_{0:0{1}d}".format(iteration, padding),
                                         data=value)
                except (TypeError, ValueError):
                    # The value is itself irregular, store it in a subgroup
                    new_group = group.create_group(name + "_{0:0{1}d}".format(iteration, padding))

                    if not name.startswith("sub_"):
                        new_name = "sub_" + name
                    else:
                        new_name = name

                    add_group(new_group, value, name=new_name)

                iteration += 1

        f = backend.File(filename, "w")

        f.attrs["uncertain parameters"] = [parameter.encode("utf8") for parameter in self.uncertain_parameters]
        f.attrs["model name"] = self.model_name
        f.attrs["incomplete results"] = [incomplete.encode("utf8") for incomplete in self.incomplete]
        f.attrs["error"] = [irregular.encode("utf8") for irregular in self.error]
        f.attrs["method"] = self.method
        f.attrs["version"] = self.version
        f.attrs["seed"] = self.seed
        f.attrs["model ignore"] = self.model_ignore

        for feature in self.data:
            group = f.create_group(feature)

            for statistical_metric in self[feature]:
                if statistical_metric in ["evaluations", "time"]:
                    # Irregular results can not be stored as a single
                    # dataset, and must be stored element by element
                    if is_regular(self[feature][statistical_metric]):
                        group.create_dataset(statistical_metric,
                                             data=self[feature][statistical_metric])
                    else:
                        evaluations_group = group.create_group(statistical_metric)
                        add_group(evaluations_group,
                                  self[feature][statistical_metric],
                                  name=statistical_metric)
                else:
                    group.create_dataset(statistical_metric,
                                         data=self[feature][statistical_metric])

            group.create_dataset("labels",
                                 data=np.array([label.encode("utf8") for label in self[feature].labels]))

        f.close()
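
    # A minimal save sketch (hypothetical filename): with backend="auto",
    # the backend is picked from the file extension.
    #
    #     data = Data()
    #     data.model_name = "model"
    #     data.add_features("model")
    #     data.save("uq_results.h5")    # saved with the HDF5 backend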
    def load(self, filename):
        """
        Load data from a HDF5 or Exdir file with name `filename`.

        Parameters
        ----------
        filename : str
            Name of the file to load data from.

        Raises
        ------
        ImportError
            If h5py is not installed.
        ImportError
            If Exdir is not installed.
        """
        logger = get_logger(self)

        if self.backend == "auto":
            if filename.endswith(".h5"):
                current_backend = "hdf5"
            elif filename.endswith(".exdir"):
                current_backend = "exdir"
            else:
                logger.warning("Unknown file extension, defaulting to load {} from a HDF5 file.".format(filename))
                current_backend = "hdf5"
        else:
            current_backend = self.backend

        if current_backend == "hdf5":
            try:
                import h5py as backend
            except ImportError:
                raise ImportError("The HDF5 backend requires: h5py")

        elif current_backend == "exdir":
            try:
                import exdir.core as backend
            except ImportError:
                raise ImportError("The Exdir backend requires: exdir")

        # TODO add this check when changing to python 3
        # if not os.path.isfile(self.filename):
        #     raise FileNotFoundError("{} file not found".format(self.filename))

        self.clear()

        def append_evaluations(evaluations, group):
            """Recursively rebuild nested (irregular) evaluations from
            numbered datasets and subgroups."""
            sub_evaluations = []
            for item in group:
                value = group[item]
                if isinstance(value, backend.Dataset):
                    sub_evaluations.append(value[()])
                elif isinstance(value, backend.Group):
                    # Recurse into the subgroup (not the current group,
                    # which would recurse forever)
                    append_evaluations(sub_evaluations, value)

            evaluations.append(sub_evaluations)

        f = backend.File(filename, "r")

        if "uncertain parameters" in f.attrs:
            try:
                self.uncertain_parameters = [parameter.decode("utf8") for parameter in f.attrs["uncertain parameters"]]
            except (UnicodeDecodeError, AttributeError):
                self.uncertain_parameters = [parameter for parameter in f.attrs["uncertain parameters"]]

        if "model name" in f.attrs:
            self.model_name = str(f.attrs["model name"])

        if "incomplete results" in f.attrs:
            try:
                self.incomplete = [incomplete.decode("utf8") for incomplete in f.attrs["incomplete results"]]
            except (UnicodeDecodeError, AttributeError):
                self.incomplete = [incomplete for incomplete in f.attrs["incomplete results"]]

        if "error" in f.attrs:
            try:
                self.error = [irregular.decode("utf8") for irregular in f.attrs["error"]]
            except (UnicodeDecodeError, AttributeError):
                self.error = [irregular for irregular in f.attrs["error"]]

        if "method" in f.attrs:
            self.method = str(f.attrs["method"])

        if "version" in f.attrs:
            self.version = str(f.attrs["version"])

        if "seed" in f.attrs:
            self.seed = f.attrs["seed"]

        if "model ignore" in f.attrs:
            self.model_ignore = f.attrs["model ignore"]

        for feature in f:
            self.add_features(str(feature))

            for statistical_metric in f[feature]:
                if statistical_metric in ["evaluations", "time"]:
                    values = f[feature][statistical_metric]

                    if isinstance(values, backend.Dataset):
                        evaluations = values[()]
                    else:
                        # Irregular results are stored as nested groups
                        evaluations = []
                        for item in f[feature][statistical_metric]:
                            value = f[feature][statistical_metric][item]
                            if isinstance(value, backend.Dataset):
                                evaluations.append(value[()])
                            elif isinstance(value, backend.Group):
                                append_evaluations(evaluations, value)

                    self[feature][statistical_metric] = evaluations

                elif statistical_metric == "labels":
                    self[feature][statistical_metric] = [label.decode("utf8") for label in f[feature][statistical_metric][()]]
                else:
                    self[feature][statistical_metric] = f[feature][statistical_metric][()]

        f.close()
    def remove_only_invalid_features(self):
        """
        Remove all features that only have invalid results (NaN).
        """
        feature_list = list(self.data.keys())
        for feature in feature_list:
            all_nan = True
            for U in self[feature].evaluations:
                if not np.all(np.isnan(U)):
                    all_nan = False

            if all_nan:
                logger = get_logger(self)
                logger.warning("Feature: {} does not yield results for any parameter combinations".format(feature))

                del self[feature]
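

# A minimal round-trip sketch (hypothetical names and values, for
# illustration only): create a Data object, fill in a model result, save
# it to HDF5, and load it back. The filename "example.h5" and the
# parameter names are assumptions. Requires h5py; run as a module
# (python -m uncertainpy.data) so the relative imports resolve.
if __name__ == "__main__":
    data = Data()
    data.model_name = "model"
    data.uncertain_parameters = ["a", "b"]
    data.add_features("model")
    data["model"].time = np.arange(10)
    data["model"].mean = np.linspace(0, 1, 10)
    data["model"].labels = ["time (ms)", "voltage (mV)"]
    data.save("example.h5")

    # Loading via the constructor calls load() on the given filename
    loaded = Data("example.h5")
    print(loaded)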