# Source code for hiplot.experiment

# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import csv
import uuid
import json
import codecs
import warnings
from abc import ABCMeta, abstractmethod
from enum import Enum
from collections import defaultdict
from pathlib import Path
import typing as tp

if tp.TYPE_CHECKING:
    import pandas as pd
    from .streamlit_helpers import ExperimentStreamlitComponent
    import optuna

# Writable text sink accepted by ``Experiment.to_html`` / ``Experiment.to_csv``:
# either a text-mode file object or a ``codecs`` stream writer.
TextWriterIO = tp.Union[tp.IO[str], codecs.StreamWriter]

# Scalar types allowed as values in ``Datapoint.values``.
DisplayableType = tp.Union[bool, int, float, str]


class ExperimentValidationError(Exception):
    """
    Raised when an experiment, a datapoint, a column definition or a colormap fails validation.
    """


class ExperimentValidationCircularRef(ExperimentValidationError):
    """
    Raised by :meth:`Experiment.validate` when following ``from_uid`` links loops back
    onto a datapoint that was already visited.
    """


class ExperimentValidationMissingParent(ExperimentValidationError):
    """
    Raised by :meth:`Experiment.validate` when a datapoint's ``from_uid`` does not match
    the ``uid`` of any datapoint in the experiment.
    """


class _DictSerializable:
    """
    All classes that are transmitted to Javascript must subclass this
    """

    def _asdict(self) -> tp.Dict[str, tp.Any]:
        return self.__dict__


class ValueType(Enum):
    """
    Defines how we render a column (scaling, and color scheme)
    """
    CATEGORICAL = 'categorical'                 #: Categorical value
    NUMERIC = 'numeric'                         #: Numeric value on a linear scale. Supports integers, floats, NaNs and inf
    NUMERIC_LOG = 'numericlog'                  #: Same as :attr:`hiplot.ValueType.NUMERIC`, displayed on a logarithmic scale.
    NUMERIC_PERCENTILE = 'numericpercentile'    #: Same as :attr:`hiplot.ValueType.NUMERIC`, displayed on a percentile scale.
    TIMESTAMP = 'timestamp'                     #: Timestamps in seconds (only integers)


class Displays:
    """
    Names of the available displays.

    See :meth:`Experiment.display_data` and :ref:`frontendRenderingSettings`
    """
    PARALLEL_PLOT = 'PARALLEL_PLOT'             #: Parallel plot data
    XY = 'XY'                                   #: XY scatter/line plot data
    TABLE = 'TABLE'                             #: Rows table data
    DISTRIBUTION = 'DISTRIBUTION'               #: Distribution plot data


def validate_colormap(cm: tp.Optional[str]) -> None:
    """
    Checks a colormap specification of the form ``name`` or ``name#modifier1,modifier2``.

    ``None`` is accepted. Otherwise ``name`` must start with ``interpolate`` or ``scheme``,
    and every modifier must be a known one.

    :param cm: Colormap specification, or ``None``
    :raises ExperimentValidationError: if the name or one of the modifiers is invalid
    """
    known_modifiers = ["inverse"]
    if cm is None:
        return
    # Everything after the first '#' is a comma-separated list of modifiers
    name, hash_sep, raw_modifiers = cm.partition('#')
    # We don't want `d3.interpolateTurbo` but just `interpolateTurbo`
    if not name.startswith(("interpolate", "scheme")):
        raise ExperimentValidationError(f"""Invalid colormap `{name}`.
Valid colormaps can be found in https://github.com/d3/d3-scale-chromatic. Their name starts with `interpolate` or `scheme`.
Examples include `interpolateSpectral`, `interpolateViridis`, `interpolateSinebow`, `schemeYlOrRd`
""")
    if not hash_sep:
        return
    for modifier in raw_modifiers.split(","):
        if modifier in known_modifiers:
            continue
        raise ExperimentValidationError(f"""Invalid colormap modifier `{modifier}`.
Valid colormaps modifiers: {','.join(known_modifiers)}
""")


class ValueDef(_DictSerializable):
    """
    Provides a custom type, color, etc.. for a column.

    :ivar type: See :attr:`hiplot.ValueType` for possible values
    :ivar colors: Categorical scales only: mapping from value to HTML color
        (:code:`rgb(R, G, B)`, :code:`hsl(H, S, L)` or :code:`#RRGGBB`)
    :ivar colormap: Numerical scales only: `D3 scale <https://github.com/d3/d3-scale-chromatic>`_ to use
        (default scale is `interpolateTurbo <https://github.com/d3/d3-scale-chromatic#interpolateTurbo>`_).
        For example :code:`"interpolateSinebow"`.
        To inverse the colormap, append `#inverse` to the name (eg :code:`"interpolateSinebow#inverse"`)
    :ivar label_css: Space-separated bootstrap CSS classes to apply on the label when supported
    :ivar label_html: HTML code used to render the column name
    :ivar force_value_min: Lower bound set by :meth:`force_range` (``None`` if not forced)
    :ivar force_value_max: Upper bound set by :meth:`force_range` (``None`` if not forced)

    See :attr:`hiplot.Experiment.parameters_definition`
    """

    def __init__(
            self,
            value_type: tp.Optional[ValueType] = None,
            colors: tp.Optional[tp.Dict[tp.Any, str]] = None,
            colormap: tp.Optional[str] = None,
            label_css: tp.Optional[str] = None
    ) -> None:
        self.type = value_type
        self.colors = colors
        self.colormap = colormap
        self.label_css = label_css
        self.force_value_min: tp.Optional[float] = None
        self.force_value_max: tp.Optional[float] = None
        self.label_html: tp.Optional[str] = None

    def force_range(self, minimum: float, maximum: float) -> "ValueDef":
        """
        Enforces the range of the column.

        :param minimum: Value stored in ``force_value_min``
        :param maximum: Value stored in ``force_value_max``
        :returns: ``self``, so that calls can be chained
        """
        self.force_value_min = minimum
        self.force_value_max = maximum
        return self

    def validate(self) -> None:
        """
        Checks the colors and the colormap of this definition.

        :raises ExperimentValidationError: if a color does not start with ``rgb(``, ``hsl(`` or ``#``,
            or if the colormap is rejected by :func:`validate_colormap`
        """
        if self.colors is not None:
            for k, v in self.colors.items():
                if not v.startswith(("rgb(", "hsl(", "#")):
                    raise ExperimentValidationError(
                        f'Invalid color {v} for value {k}. Expected color to start with "rgb(", "hsl(", or "#"'
                    )
        validate_colormap(self.colormap)

    def _asdict(self) -> tp.Dict[str, tp.Any]:
        # The enum is not JSON-serializable: send its string value instead
        return {
            "type": self.type.value if self.type is not None else None,
            "colors": self.colors,
            "colormap": self.colormap,
            "force_value_min": self.force_value_min,
            "force_value_max": self.force_value_max,
            "label_css": self.label_css,
            "label_html": self.label_html,
        }


class Datapoint(_DictSerializable):
    """
    A datapoint represents a single measurement of metrics - for instance a model checkpoint that is evaluated.
    It can have a parent if it originates from another one (offspring).


    :ivar uid: A unique identifier for this datapoint
    :ivar values: A dictionary with arbitrary metrics/values
    :ivar from_uid: The uid of the parent :class:`Datapoint` (optional)

    :Example:

    .. code-block:: python

        import hiplot as hip
        dp1 = hip.Datapoint(uid="parent", values={"loss": 0.0})
        dp2 = hip.Datapoint(uid="child", from_uid="parent", values={
            "loss": 1.0,
            "another_metric": 0.0  # Different datapoints can have different metrics
        })
        hip.Experiment(datapoints=[dp1, dp2]).display()  # Render in an ipython notebook
    """

    def __init__(self, values: tp.Dict[str, DisplayableType], *, uid: tp.Optional[str] = None, from_uid: tp.Optional[str] = None) -> None:
        # A random UUID is generated when the caller does not provide an identifier
        self.uid = uid if uid is not None else str(uuid.uuid4())
        self.values = values
        self.from_uid = from_uid

    def __repr__(self) -> str:
        # Datapoints are interpolated into validation error messages: make them identifiable there
        return f"{type(self).__name__}(uid={self.uid!r}, from_uid={self.from_uid!r}, values={self.values!r})"

    def validate(self) -> None:
        """
        Makes sure this object is valid - throws an :class:`hiplot.ExperimentValidationError` exception otherwise.

        ``uid`` and ``from_uid`` are reserved names and can not be used as keys in ``values``.
        """
        for reserved_kw in ("uid", "from_uid"):
            if reserved_kw in self.values:
                raise ExperimentValidationError(f'Datapoint {self.uid} contains a value for "{reserved_kw}"')


def _is_running_ipython() -> bool:
    try:
        get_ipython()  # type: ignore
        return True
    except NameError:
        return False


# pylint: disable=too-many-instance-attributes
class Experiment(_DictSerializable):
    """
    Object that can be rendered by HiPlot. It essentially contains a list of metrics, but also some options on how to render it.

    See :meth:`Experiment.display` to display an :class:`Experiment` in an ipython notebook.

    :ivar datapoints: All the measurements we have. One datapoint corresponds to one line in the parallel plot and to one line in the table.
    :ivar parameters_definition: Characteristics of the columns (ordering, type, etc...)
    :ivar colormap: Colormap to use
    :ivar colorby: Default column to color by
    :ivar weightcolumn: If rows have different weights, use this column as the weight (default to 1 if not specified)
    :ivar enabledDisplays: Ordered displays to enable (by default all are enabled)

    :Example:

    .. code-block:: python

        import hiplot as hip
        data = [{'param': 1, 'loss': 10, 'hidden_field': 'value1', 'c': 'red'},
            {'param': 2, 'loss': 5, 'hidden_field': 'value2', 'c': 'black'}]
        exp = hip.Experiment.from_iterable(data)

    """

    def __init__(self,
                 datapoints: tp.Optional[tp.List[Datapoint]] = None,
                 parameters_definition: tp.Optional[tp.Dict[str, ValueDef]] = None,
                 colormap: tp.Optional[str] = None,
                 ) -> None:
        self.datapoints = datapoints if datapoints is not None else []
        # A defaultdict lets callers write `exp.parameters_definition["col"].type = ...` without creating the entry first
        self.parameters_definition = parameters_definition if parameters_definition is not None else defaultdict(ValueDef)
        self.colormap = colormap if colormap is not None else "interpolateTurbo"
        self.colorby: tp.Optional[str] = None
        self.weightcolumn: tp.Optional[str] = None
        self.enabledDisplays: tp.List[str] = [Displays.PARALLEL_PLOT, Displays.XY, Displays.DISTRIBUTION, Displays.TABLE]
        self._display_data: tp.Dict[str, tp.Dict[str, tp.Any]] = {}
        self._compress: bool = False

    def validate(self) -> "Experiment":
        """
        Makes sure that this object is valid. Raises a :class:`hiplot.ExperimentValidationError` otherwise.
        Experiments with circular references, non-existent parents, or without datapoints are invalid.

        :returns: ``self``, so that calls can be chained
        """
        # `seen` holds uids whose whole parent chain has already been checked
        seen: tp.Set[str] = set()
        dp_lookup: tp.Dict[str, Datapoint] = {dp.uid: dp for dp in self.datapoints}
        for p in self.datapoints:
            if p.uid not in seen:
                # uids visited while walking up from `p`: meeting one of them again means a cycle
                seen_now: tp.Set[str] = {p.uid}
                dp = p
                while dp.from_uid is not None and dp.from_uid not in seen:
                    if dp.from_uid in seen_now:
                        raise ExperimentValidationCircularRef(f"Circular reference in {p} parents ({len(seen_now)}-th parent)")
                    seen_now.add(dp.from_uid)
                    if dp.from_uid not in dp_lookup:
                        raise ExperimentValidationMissingParent(f"Datapoint ({dp.uid}) parent ({dp.from_uid}) not found")
                    dp = dp_lookup[dp.from_uid]
                seen |= seen_now
            p.validate()
        if not self.datapoints:
            raise ExperimentValidationError('Not a single datapoint')
        validate_colormap(self.colormap)
        return self

    def display(self, force_full_width: bool = False, store_state_key: tp.Optional[str] = None, **kwargs: tp.Any) -> "ExperimentDisplayed":
        """
        Displays an experiment in an ipython notebook.

        :param force_full_width: allows to force to have 100% width on Jupyter Notebooks only.
        :param store_state_key: a string identifier for the HiPlot instance.
            If not ``None``, HiPlot will store dynamic modifications (removing/reordering columns...)
            in the URL, and restore them when calling ``display`` with the same value for ``store_state_key`` - see :ref:`tutoNotebookState`
        :returns: An :class:`ExperimentDisplayed` object that can be used to interact with the visualization
            - only implemented for Jupyter notebook.
            See :ref:`tutonotebookdisplayedexperiment`
        :raises RuntimeError: when not called from an ipython context
        """
        from .streamlit_helpers import _StreamlitHelpers  # pylint: disable=cyclic-import
        if not _is_running_ipython():
            if _StreamlitHelpers.is_running_within_streamlit():
                raise RuntimeError(r"""`experiment.display` can only be called with ipython.
It appears that you are trying to create a HiPlot visualization in Streamlit: you should use `display_st`""")
            raise RuntimeError(r"""`display` can only be called on an ipython context. Are you in a notebook?
- To render an experiment to HTML, use `experiment.to_html(file_name)` or `html_page = experiment.to_html()`
- To render an experiment in Streamlit, use `experiment.display_st`""")
        from .ipython import display_exp  # pylint: disable=cyclic-import
        self.validate()
        return display_exp(self, force_full_width=force_full_width, store_state_url=store_state_key, **kwargs)

    # pylint: disable=function-redefined
    @tp.overload
    def display_st(self, *, ret: str, key: tp.Optional[str] = None) -> tp.Any:
        pass

    @tp.overload
    def display_st(self, *, ret: tp.List[str], key: tp.Optional[str] = None) -> tp.List[tp.Any]:
        pass

    @tp.overload
    def display_st(self, *, key: tp.Optional[str] = None) -> None:
        pass

    def display_st(self, *, ret: tp.Union[str, tp.List[str], None] = None, key: tp.Optional[str] = None) -> tp.Any:
        """
        Displays an experiment in a Streamlit app - see :ref:`tutoStreamlit`

        This function can be pretty slow, see :ref:`tutoStreamlitCache` to learn how to make it faster.

        :param key: Unique key for the streamlit component. It is strongly recommended to give some unique string.
        :param ret: Specify what HiPlot should return.
        :returns: Return value depends on ``ret``

        :Example:

        .. code-block:: python

            exp.display_st(key="hiplot1")
            brush_extents = exp.display_st(key="hiplot2", ret="brush_extents")
            brush_extents, selected_uids = exp.display_st(key="hiplot3", ret=["brush_extents", "selected_uids"])

        """
        return self.to_streamlit(ret=ret, key=key).display()

    # pylint: enable=function-redefined

    def to_streamlit(self, key: tp.Optional[str] = None, ret: tp.Union[str, tp.List[str], None] = None) -> "ExperimentStreamlitComponent":
        """
        Streamlit only:
        creates a copy of the Experiment that you can cache,
        which only exposes the `display_st` method - see :ref:`tutoStreamlitCache`

        :param key: Unique key for the streamlit component.
        :param ret: Specify what HiPlot should return.
        :returns: A `component` object that can be rendered with `component.display()`
        :raises RuntimeError: when not called from a streamlit script

        :Example:

        .. code-block:: python

            import streamlit as st
            import hiplot as hip

            @st.cache
            def get_experiment():
                # Create your hiplot experiment as usual
                big_exp = hip.Experiment.from_iterable(...)
                # ... and cache the component
                return big_exp.to_streamlit(key="hipl", ret=["brush_extents", "selected_uids"])

            exp = get_experiment() # This will be cached the second time
            brush_extents, selected_uids = exp.display()

        """

        from . import streamlit_helpers  # pylint: disable=cyclic-import

        if not streamlit_helpers._StreamlitHelpers.is_running_within_streamlit():
            if _is_running_ipython():
                raise RuntimeError(r"""`experiment.to_streamlit` can only be called in a streamlit script.
It appears that you are trying to create a HiPlot visualization in ipython: you should use `display` instead of `to_streamlit`""")
            raise RuntimeError(r"""`experiment.to_streamlit` can only be called in a streamlit script.
To render an experiment to HTML, use `experiment.to_html(file_name)` or `html_page = experiment.to_html()`""")
        return streamlit_helpers.ExperimentStreamlitComponent(self, key=key, ret=ret)

    def to_html(self, file: tp.Optional[tp.Union[Path, str, TextWriterIO]] = None, **kwargs: tp.Any) -> str:
        """
        Returns the content of a standalone .html file that displays this experiment
        without any dependency to HiPlot server or static files.

        :param file: Path/handle to a file to write (optional)
        :param kwargs: Extra options forwarded to the page renderer
        :returns: A standalone HTML code to display this Experiment.
        """
        from .render import make_experiment_standalone_page, html_inlinize

        self.validate()
        html = make_experiment_standalone_page(options={
            **kwargs,
            'experiment': self._asdict()
        })
        html = html_inlinize(html)
        if file is not None:
            if isinstance(file, (Path, str)):
                Path(file).write_text(html, encoding="utf-8")
            else:
                file.write(html)
        return html

    def to_csv(self, file: tp.Union[Path, str, TextWriterIO]) -> None:
        """
        Dumps this Experiment as a .csv file.
        Information about display_data, parameters definition will be lost.

        :param file: Path/handle to a file to write
        """
        if isinstance(file, (Path, str)):
            # newline="" is required by the csv module: the writer emits its own line terminators,
            # and newline translation would otherwise produce "\r\r\n" on Windows
            with Path(file).open("w", encoding="utf-8", newline="") as csvfile:
                self._to_csv(csvfile)
        else:
            self._to_csv(file)

    def _to_csv(self, fh: TextWriterIO) -> None:
        # Datapoints can have different metrics: the header is the union of all the keys
        fieldnames: tp.Set[str] = {name for dp in self.datapoints for name in dp.values}
        writer = csv.DictWriter(fh, fieldnames=["uid", "from_uid", *sorted(fieldnames)])
        writer.writeheader()
        for dp in self.datapoints:
            writer.writerow({
                **dp.values,
                "uid": dp.uid,
                "from_uid": dp.from_uid,
            })

    def _asdict(self) -> tp.Dict[str, tp.Any]:
        data: tp.Dict[str, tp.Any] = {
            "parameters_definition": {k: v._asdict() for k, v in self.parameters_definition.items()},
            "colormap": self.colormap,
            "colorby": self.colorby,
            "weightcolumn": self.weightcolumn,
            "display_data": self._display_data,
            "enabled_displays": self.enabledDisplays,
        }
        # Datapoints are sent under a different key depending on whether they are compressed
        if self._compress:
            from .compress import compress
            data["datapoints_compressed"] = compress(self.datapoints)
        else:
            data["datapoints"] = [d._asdict() for d in self.datapoints]
        return data

    def remove_missing_parents(self) -> "Experiment":
        """
        Sets :attr:`hiplot.Datapoint.from_uid` to None when set to a non-existing Datapoint.

        :returns: ``self``, so that calls can be chained
        """
        existing_dp: tp.Set[str] = {dp.uid for dp in self.datapoints}
        for dp in self.datapoints:
            if dp.from_uid not in existing_dp:
                dp.from_uid = None
        return self

    def display_data(self, plugin: str) -> tp.Dict[str, tp.Any]:
        """
        Retrieve data dictionary for a plugin, which can be modified.

        :param plugin: Name of the plugin
        :returns: The dictionary stored for this plugin (created empty on first access)

        :Example:

        .. code-block:: python

            exp.display_data(hip.Displays.XY).update({
                "axis_x": "time",
                "axis_y": "loss"
            })

        """
        return self._display_data.setdefault(plugin, {})

    @staticmethod
    def from_iterable(it: tp.Iterable[tp.Dict[str, tp.Any]]) -> "Experiment":
        """
        Creates a HiPlot experiment from an iterable/list of dictionaries.
        This is the easiest way to generate an `hiplot.Experiment` object.

        The ``uid`` and ``from_uid`` keys of each dictionary, if any, are used as the datapoint's
        identifier and parent identifier (both converted to strings); an empty ``from_uid`` means
        "no parent". When ``uid`` is missing, the position of the row is used instead.

        :param it: A list (or iterable) of dictionaries

        :Example:

        >>> import hiplot as hip
        >>> hip.Experiment.from_iterable([{"p": "a"}, {"p": "b"}])
        <hiplot.experiment.Experiment object at 0x7f0f2e13c590>

        """
        datapoints: tp.List[Datapoint] = []
        for k, row in enumerate(it):
            from_uid = row.get("from_uid")
            datapoints.append(Datapoint(
                uid=str(row.get("uid", k)),
                # `uid` is always stringified: do the same for the parent reference,
                # otherwise a non-string `from_uid` (e.g. an int) can never match its parent's uid
                from_uid=str(from_uid) if from_uid is not None and from_uid != '' else None,
                values={mk: mv for mk, mv in row.items() if mk not in ("uid", "from_uid")},
            ))
        return Experiment(datapoints=datapoints)

    @staticmethod
    def from_csv(file: tp.Union[Path, str, tp.IO[str]]) -> "Experiment":
        """
        Creates a HiPlot experiment from a CSV file.

        :param file: CSV file path, or an already opened text file
        """
        if isinstance(file, (Path, str)):
            # newline="" lets the csv module handle newlines embedded in quoted fields by itself
            with Path(file).open(encoding="utf-8", newline="") as csvfile:
                return Experiment.from_iterable(csv.DictReader(csvfile))
        else:
            return Experiment.from_iterable(csv.DictReader(file))

    @staticmethod
    def from_dataframe(dataframe: "pd.DataFrame") -> "Experiment":  # String annotation: pandas is only imported for type checking
        """
        Creates a HiPlot experiment from a pandas DataFrame.
        The DataFrame given by the caller is not modified.

        :param dataframe: Pandas DataFrame
        """
        # Check if from_uid and uid is both in columns, and if there are any NaN values to handle
        if {'from_uid', 'uid'}.issubset(dataframe.columns) and dataframe['from_uid'].isnull().values.any():
            # NaN values forces integer columns to become float, if uid is integer and from_uid is float, it crashes.
            # The line below changes uid to match from_uid type (either float or string), since NaN cannot be integer.
            # `assign` returns a new DataFrame (columns keep their position), so the caller's object is left untouched.
            dataframe = dataframe.assign(uid=dataframe['uid'].astype(dataframe['from_uid'].dtype))
            dataframe = dataframe.fillna({'from_uid': '', 'uid': ''})

            # Replaces their dtypes accordingly to str, which is handled better with lesser errors with no change to functionality
            dataframe = dataframe.assign(
                uid=dataframe['uid'].astype(str),
                from_uid=dataframe['from_uid'].astype(str),
            )

        experiment = Experiment.from_iterable(dataframe.to_dict(orient='records'))

        # Restore columns order
        experiment.display_data(Displays.PARALLEL_PLOT)['order'] = list(dataframe.columns)
        experiment.display_data(Displays.TABLE)['order'] = list(dataframe.columns)

        return experiment

    @staticmethod
    def from_optuna(study: "optuna.study.Study") -> "Experiment":  # String annotation: optuna is only imported when this is called
        """
        Creates a HiPlot experiment from an Optuna Study.
        Only trials in the ``COMPLETE`` state that have objective values are kept.

        :param study: Optuna Study
        """

        # Create a list of dictionary objects using study trials
        # All parameters are taken using params.copy()
        # pylint: disable=redefined-outer-name
        import optuna

        hyper_opt_data = []
        for each_trial in study.get_trials(states=(optuna.trial.TrialState.COMPLETE, )):
            trial_params = {}
            # This checks if the trial was fully completed
            # the value will be None if the trial was interrupted halfway (e.g. via KeyboardInterrupt)
            if not each_trial.values:
                continue
            num_objectives = len(each_trial.values)

            if num_objectives == 1:
                # name = value, as it could be RMSE / accuracy, or any value that the user selects for tuning
                trial_params["value"] = each_trial.value
            else:
                # Multi-objective study: one column per objective
                for objective_id, value in enumerate(each_trial.values):
                    trial_params[f"value_{objective_id}"] = value

            trial_params["uid"] = each_trial.number
            trial_params.update(each_trial.params.copy())
            hyper_opt_data.append(trial_params)
        experiment = Experiment.from_iterable(hyper_opt_data)

        return experiment

    @staticmethod
    def merge(xp_dict: tp.Dict[str, "Experiment"]) -> "Experiment":
        """
        Merge several experiments into a single one

        Datapoint uids (and parent uids) are prefixed with the key of their experiment in ``xp_dict``,
        and that key is also stored in an additional ``exp`` value on each datapoint.

        :param xp_dict: Experiments to merge, indexed by the name used as prefix
        """
        xp = Experiment(datapoints=[])
        assert xp.parameters_definition is not None  # for mypy
        for k, subxp in xp_dict.items():
            assert subxp is not None, k
            xp.datapoints += [
                Datapoint(
                    uid=f"{k}_{d.uid}", from_uid=f"{k}_{d.from_uid}" if d.from_uid is not None else None, values={**d.values, "exp": k}
                )
                for d in subxp.datapoints
            ]
            if subxp.parameters_definition is not None:
                xp.parameters_definition.update(subxp.parameters_definition)
            for d, v in subxp._display_data.items():
                xp.display_data(d).update(v)
        return xp


class ExperimentFetcherDoesntApply(Exception):
    """
    NOTE(review): presumably raised by an ``ExperimentFetcher`` to signal that it does not
    handle the string it was given - nothing in this module raises it, confirm against callers.
    """


# Signature of an experiment fetcher: a callable taking one string and returning an Experiment.
# NOTE(review): the string is presumably an experiment URI - confirm against callers.
ExperimentFetcher = tp.Callable[[str], Experiment]


class ExperimentDisplayed(metaclass=ABCMeta):
    """
    Class that allows to communicate with a displayed HiPlot visualization in a Jupyter notebook.
    Read more in :ref:`tutoNotebookDisplayedExperiment`

    This is an abstract interface: subclasses must implement both methods.
    """

    @abstractmethod
    def get_selected(self) -> tp.List[Datapoint]:
        """
        Returns a list of currently rendered datapoints in the parallel plot
        """

    @abstractmethod
    def get_brush_extents(self) -> tp.Dict[str, tp.Dict[str, tp.Any]]:
        """
        Returns a dictionary, where keys corresponds to columns currently brushed in parallel plot,
        and values contain information about the current brush.
        """
# Table Of Contents
#
# Source code for hiplot.experiment