Source code for hiplot.experiment

# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import csv
import uuid
import json
import codecs
import warnings
from abc import ABCMeta, abstractmethod
from enum import Enum
from collections import defaultdict
from pathlib import Path
import typing as tp

if tp.TYPE_CHECKING:
    import pandas as pd
    from .streamlit_helpers import ExperimentStreamlitComponent
    import optuna

# Writable text handle accepted by the export helpers (`Experiment.to_html` / `Experiment.to_csv`):
# either a text-mode IO object or a `codecs.StreamWriter`.
TextWriterIO = tp.Union[tp.IO[str], codecs.StreamWriter]

# Types allowed for the entries of `Datapoint.values`.
DisplayableType = tp.Union[bool, int, float, str]


class ExperimentValidationError(Exception):
    """
    Base class of the errors raised when an experiment, a datapoint
    or a column definition fails validation.
    """


class ExperimentValidationCircularRef(ExperimentValidationError):
    """
    Raised by :meth:`Experiment.validate` when following the ``from_uid``
    links of a datapoint leads back to a datapoint already visited.
    """


class ExperimentValidationMissingParent(ExperimentValidationError):
    """
    Raised by :meth:`Experiment.validate` when a datapoint's ``from_uid``
    does not match the uid of any datapoint of the experiment.
    """


class _DictSerializable:
    """
    All classes that are transmitted to Javascript must subclass this
    """

    def _asdict(self) -> tp.Dict[str, tp.Any]:
        # Default serialization: the instance attribute dictionary itself (not a copy)
        return vars(self)
class ValueType(Enum):
    """
    Defines how we render a column (scaling, and color scheme)
    """

    #: Categorical value
    CATEGORICAL = 'categorical'
    #: Numeric value on a linear scale. Supports integers, floats, NaNs and inf
    NUMERIC = 'numeric'
    #: Same as :attr:`hiplot.ValueType.NUMERIC`, displayed on a logarithmic scale.
    NUMERIC_LOG = 'numericlog'
    #: Same as :attr:`hiplot.ValueType.NUMERIC`, displayed on a percentile scale.
    NUMERIC_PERCENTILE = 'numericpercentile'
    #: Timestamps in seconds (only integers)
    TIMESTAMP = 'timestamp'
class Displays:
    """
    Names of the available displays.

    See :meth:`Experiment.display_data` and :ref:`frontendRenderingSettings`
    """

    #: Parallel plot data
    PARALLEL_PLOT = 'PARALLEL_PLOT'
    #: XY scatter/line plot data
    XY = 'XY'
    #: Rows table data
    TABLE = 'TABLE'
    #: Distribution plot data
    DISTRIBUTION = 'DISTRIBUTION'
def validate_colormap(cm: tp.Optional[str]) -> None:
    """
    Checks a colormap specification of the form ``name`` or ``name#modifier1,modifier2``.

    :param cm: Colormap to check. ``None`` is accepted and means "nothing to validate".
    :raises ExperimentValidationError: if the name does not start with ``interpolate``
        or ``scheme``, or if one of the modifiers is not supported.
    """
    VALID_MODIFIERS = ["inverse"]
    if cm is None:
        return

    # Everything after the first '#' is a comma-separated list of modifiers
    name, separator, modifiers = cm.partition('#')

    # We don't want `d3.interpolateTurbo` but just `interpolateTurbo`
    if not (name.startswith("interpolate") or name.startswith("scheme")):
        raise ExperimentValidationError(f"""Invalid colormap `{name}`. Valid colormaps can be found in https://github.com/d3/d3-scale-chromatic. Their name starts with `interpolate` or `scheme`. Examples include `interpolateSpectral`, `interpolateViridis`, `interpolateSinebow`, `schemeYlOrRd` """)

    if not separator:
        return
    for modifier in modifiers.split(","):
        if modifier not in VALID_MODIFIERS:
            raise ExperimentValidationError(f"""Invalid colormap modifier `{modifier}`. Valid colormaps modifiers: {','.join(VALID_MODIFIERS)} """)
class ValueDef(_DictSerializable):
    """
    Provides a custom type, color, etc.. for a column.

    :ivar type: See :attr:`hiplot.ValueType` for possible values
    :ivar colors: Categorical scales only: mapping from value to HTML color
        (a string starting with :code:`rgb(`, :code:`hsl(` or :code:`#`)
    :ivar colormap: Numerical scales only: `D3 scale <https://github.com/d3/d3-scale-chromatic>`_ to use
        (default scale is `interpolateTurbo <https://github.com/d3/d3-scale-chromatic#interpolateTurbo>`_).
        For example :code:`"interpolateSinebow"`.
        To inverse the colormap, append `#inverse` to the name (eg :code:`"interpolateSinebow#inverse"`)
    :ivar label_css: Space-separated bootstrap CSS classes to apply on the label when supported
    :ivar label_html: HTML code used to render the column name

    See :attr:`hiplot.Experiment.parameters_definition`
    """

    def __init__(
        self,
        value_type: tp.Optional[ValueType] = None,
        colors: tp.Optional[tp.Dict[tp.Any, str]] = None,
        colormap: tp.Optional[str] = None,
        label_css: tp.Optional[str] = None
    ) -> None:
        self.type = value_type
        self.colors = colors
        self.colormap = colormap
        self.label_css = label_css
        # Not settable from the constructor: see `force_range` for the bounds,
        # `label_html` is assigned directly on the instance.
        self.force_value_min: tp.Optional[float] = None
        self.force_value_max: tp.Optional[float] = None
        self.label_html: tp.Optional[str] = None

    def force_range(self, minimum: float, maximum: float) -> "ValueDef":
        """
        Enforces the range of the column.

        :param minimum: Stored as ``force_value_min``
        :param maximum: Stored as ``force_value_max``
        :returns: This object, so that calls can be chained
        """
        self.force_value_min, self.force_value_max = minimum, maximum
        return self

    def validate(self) -> None:
        """
        Checks the colors and the colormap of this definition.

        :raises ExperimentValidationError: if a color does not start with one of the
            accepted prefixes, or if the colormap is rejected by :func:`validate_colormap`.
        """
        accepted_prefixes = ("rgb(", "hsl(", "#")
        if self.colors is not None:
            for value, color in self.colors.items():
                if not color.startswith(accepted_prefixes):
                    raise ExperimentValidationError(
                        f'Invalid color {color} for value {value}. Expected color to start with "rgb(", "hsl(", or "#"'
                    )
        validate_colormap(self.colormap)

    def _asdict(self) -> tp.Dict[str, tp.Any]:
        # The enum is sent as its string value; a missing type is sent as None
        type_name = None if self.type is None else self.type.value
        return {
            "type": type_name,
            "colors": self.colors,
            "colormap": self.colormap,
            "force_value_min": self.force_value_min,
            "force_value_max": self.force_value_max,
            "label_css": self.label_css,
            "label_html": self.label_html,
        }
class Datapoint(_DictSerializable):
    """
    A datapoint represents a single measurement of metrics - for instance a model checkpoint that is evaluated.
    It can have a parent if it originates from another one (offspring).

    :ivar uid: A unique identifier for this datapoint
    :ivar values: A dictionary with arbitrary metrics/values
    :ivar from_uid: The uid of the parent :class:`Datapoint` (optional)

    :Example:

    .. code-block:: python

        import hiplot as hip
        dp1 = hip.Datapoint(uid="parent", values={"loss": 0.0})
        dp2 = hip.Datapoint(uid="child", from_uid="parent", values={
            "loss": 1.0,
            "another_metric": 0.0  # Different datapoints can have different metrics
        })
        hip.Experiment(datapoints=[dp1, dp2]).display()  # Render in an ipython notebook
    """

    def __init__(self,
                 values: tp.Dict[str, DisplayableType],
                 *,
                 uid: tp.Optional[str] = None,
                 from_uid: tp.Optional[str] = None) -> None:
        # A random UUID4 string is generated when the caller does not provide a uid
        if uid is None:
            uid = str(uuid.uuid4())
        self.uid = uid
        self.values = values
        self.from_uid = from_uid

    def validate(self) -> None:
        """
        Makes sure this object is valid - throws an :class:`hiplot.ExperimentValidationError` exception otherwise.

        ``uid`` and ``from_uid`` are reserved names and can not be used as keys of ``values``.
        """
        for reserved_kw in ("uid", "from_uid"):
            if reserved_kw in self.values:
                raise ExperimentValidationError(f'Datapoint {self.uid} contains a value for "{reserved_kw}"')
def _is_running_ipython() -> bool:
    """
    Tells whether the ``get_ipython`` name is available (and callable without a
    ``NameError``), which is used as the signal that we run inside ipython.
    """
    try:
        get_ipython()  # type: ignore
    except NameError:
        return False
    else:
        return True


# pylint: disable=too-many-instance-attributes
class Experiment(_DictSerializable):
    """
    Object that can be rendered by HiPlot. It essentially contains a list of metrics, but also some options on how to render it.

    See :meth:`Experiment.display` to display an :class:`Experiment` in an ipython notebook.

    :ivar datapoints: All the measurements we have. One datapoint corresponds to one line in the parallel plot and to one line in the table.
    :ivar parameters_definition: Characteristics of the columns (ordering, type, etc...)
    :ivar colormap: Colormap to use
    :ivar colorby: Default column to color by
    :ivar weightcolumn: If rows have different weights, use this column as the weight (default to 1 if not specified)
    :ivar enabledDisplays: Ordered displays to enable (by default all are enabled)

    :Example:

    .. code-block:: python

        import hiplot as hip
        data = [{'param': 1, 'loss': 10, 'hidden_field': 'value1', 'c': 'red'},
            {'param': 2, 'loss': 5, 'hidden_field': 'value2', 'c': 'black'}]
        exp = hip.Experiment.from_iterable(data)
    """

    def __init__(self,
                 datapoints: tp.Optional[tp.List[Datapoint]] = None,
                 parameters_definition: tp.Optional[tp.Dict[str, ValueDef]] = None,
                 colormap: tp.Optional[str] = None,
                 ) -> None:
        self.datapoints = datapoints if datapoints is not None else []
        # A defaultdict lets callers write `exp.parameters_definition["col"].type = ...`
        # without having to create the ValueDef first
        self.parameters_definition = parameters_definition if parameters_definition is not None else defaultdict(ValueDef)
        self.colormap = colormap if colormap is not None else "interpolateTurbo"
        self.colorby: tp.Optional[str] = None
        self.weightcolumn: tp.Optional[str] = None
        self.enabledDisplays: tp.List[str] = [Displays.PARALLEL_PLOT, Displays.XY, Displays.DISTRIBUTION, Displays.TABLE]
        self._display_data: tp.Dict[str, tp.Dict[str, tp.Any]] = {}
        self._compress: bool = False

    def validate(self) -> "Experiment":
        """
        Makes sure that this object is valid. Raises a :class:`hiplot.ExperimentValidationError` otherwise.
        Experiments with circular references, non-existent parents, or without datapoints are invalid.

        :returns: This experiment
        :raises ExperimentValidationCircularRef: if the ``from_uid`` links form a cycle
        :raises ExperimentValidationMissingParent: if a ``from_uid`` matches no datapoint
        :raises ExperimentValidationError: if there is no datapoint, or if a datapoint
            or the colormap is invalid
        """
        # uids whose whole chain of parents has already been checked
        seen: tp.Set[str] = set()
        dp_lookup: tp.Dict[str, Datapoint] = {dp.uid: dp for dp in self.datapoints}
        for p in self.datapoints:
            if p.uid not in seen:
                # uids met while walking up from `p`: meeting one twice means a cycle
                seen_now: tp.Set[str] = {p.uid}
                dp = p
                while dp.from_uid is not None and dp.from_uid not in seen:
                    if dp.from_uid in seen_now:
                        raise ExperimentValidationCircularRef(f"Circular reference in {p} parents ({len(seen_now)}-th parent)")
                    seen_now.add(dp.from_uid)
                    if dp.from_uid not in dp_lookup:
                        raise ExperimentValidationMissingParent(f"Datapoint ({dp.uid}) parent ({dp.from_uid}) not found")
                    dp = dp_lookup[dp.from_uid]
                seen |= seen_now
            p.validate()
        if not self.datapoints:
            raise ExperimentValidationError('Not a single datapoint')
        validate_colormap(self.colormap)
        return self

    def display(self, force_full_width: bool = False, store_state_key: tp.Optional[str] = None, **kwargs: tp.Any) -> "ExperimentDisplayed":
        """
        Displays an experiment in an ipython notebook.

        :param force_full_width: allows to force to have 100% width on Jupyter Notebooks only.
        :param store_state_key: a string identifier for the HiPlot instance. If not ``None``, HiPlot will store dynamic modifications
            (removing/reordering columns...) in the URL, and restore them when calling ``display`` with the same value for ``store_state_key`` - see :ref:`tutoNotebookState`
        :returns: An :class:`ExperimentDisplayed` object that can be used to interact with the visualization - only implemented for Jupyter notebook.
            See :ref:`tutonotebookdisplayedexperiment`
        :raises RuntimeError: when not called from ipython
        """
        from .streamlit_helpers import _StreamlitHelpers  # pylint: disable=cyclic-import
        if not _is_running_ipython():
            if _StreamlitHelpers.is_running_within_streamlit():
                raise RuntimeError(r"""`experiment.display` can only be called with ipython. It appears that you are trying to create a HiPlot visualization in Streamlit: you should use `display_st`""")
            raise RuntimeError(r"""`display` can only be called on an ipython context. Are you in a notebook? - To render an experiment to HTML, use `experiment.to_html(file_name)` or `html_page = experiment.to_html()` - To render an experiment in Streamlit, use `experiment.display_st`""")
        from .ipython import display_exp  # pylint: disable=cyclic-import
        self.validate()
        return display_exp(self, force_full_width=force_full_width, store_state_url=store_state_key, **kwargs)

    # pylint: disable=function-redefined
    @tp.overload
    def display_st(self, *, ret: str, key: tp.Optional[str] = None) -> tp.Any:
        pass

    @tp.overload
    def display_st(self, *, ret: tp.List[str], key: tp.Optional[str] = None) -> tp.List[tp.Any]:
        pass

    @tp.overload
    def display_st(self, *, key: tp.Optional[str] = None) -> None:
        pass

    def display_st(self, *, ret: tp.Union[str, tp.List[str], None] = None, key: tp.Optional[str] = None) -> tp.Any:
        """
        Displays an experiment in a Streamlit app - see :ref:`tutoStreamlit`

        This function can be pretty slow, see :ref:`tutoStreamlitCache`
        to learn how to make it faster.

        :param key: Unique key for the streamlit component. It is strongly recommended to give some unique string.
        :param ret: Specify what HiPlot should return.
        :returns: Return value depends on ``ret``

        :Example:

        .. code-block:: python

            exp.display_st(key="hiplot1")
            brush_extents = exp.display_st(key="hiplot2", ret="brush_extents")
            brush_extents, selected_uids = exp.display_st(key="hiplot3", ret=["brush_extents", "selected_uids"])
        """
        return self.to_streamlit(ret=ret, key=key).display()
    # pylint: enable=function-redefined

    def to_streamlit(self, key: tp.Optional[str] = None, ret: tp.Union[str, tp.List[str], None] = None) -> "ExperimentStreamlitComponent":
        """
        Streamlit only: creates a copy of the Experiment that you can cache, which only exposes the `display_st` method - see :ref:`tutoStreamlitCache`

        :param key: Unique key for the streamlit component.
        :param ret: Specify what HiPlot should return.
        :returns: A `component` object that can be rendered with `component.display()`
        :raises RuntimeError: when not called from a streamlit script

        :Example:

        .. code-block:: python

            import streamlit as st
            import hiplot as hip

            @st.cache
            def get_experiment():
                # Create your hiplot experiment as usual
                big_exp = hip.Experiment.from_iterable(...)
                # ... and cache the component
                return big_exp.to_streamlit(key="hipl", ret=["brush_extents", "selected_uids"])

            exp = get_experiment()  # This will be cached the second time
            brush_extents, selected_uids = exp.display()
        """
        from . import streamlit_helpers  # pylint: disable=cyclic-import
        if not streamlit_helpers._StreamlitHelpers.is_running_within_streamlit():
            if _is_running_ipython():
                raise RuntimeError(r"""`experiment.to_streamlit` can only be called in a streamlit script. It appears that you are trying to create a HiPlot visualization in ipython: you should use `display` instead of `to_streamlit`""")
            raise RuntimeError(r"""`experiment.to_streamlit` can only be called in a streamlit script. To render an experiment to HTML, use `experiment.to_html(file_name)` or `html_page = experiment.to_html()`""")
        return streamlit_helpers.ExperimentStreamlitComponent(self, key=key, ret=ret)

    def to_html(self, file: tp.Optional[tp.Union[Path, str, TextWriterIO]] = None, **kwargs: tp.Any) -> str:
        """
        Returns the content of a standalone .html file that displays this experiment
        without any dependency to HiPlot server or static files.

        :param file: Path/handle to a file to write (optional)
        :param kwargs: Extra options forwarded to the page renderer. An ``experiment``
            entry would be overridden by this experiment.
        :returns: A standalone HTML code to display this Experiment.
        """
        from .render import make_experiment_standalone_page, html_inlinize
        self.validate()
        html = make_experiment_standalone_page(options={
            **kwargs,
            'experiment': self._asdict()
        })
        html = html_inlinize(html)
        if file is not None:
            if isinstance(file, (Path, str)):
                Path(file).write_text(html, encoding="utf-8")
            else:
                file.write(html)
        return html

    def to_csv(self, file: tp.Union[Path, str, TextWriterIO]) -> None:
        """
        Dumps this Experiment as a .csv file.
        Information about display_data, parameters definition will be lost.

        :param file: Path/handle to a file to write
        """
        if isinstance(file, (Path, str)):
            # newline="" is required by the csv module: the writer emits its own line
            # terminators, and newline translation would turn "\r\n" into "\r\r\n" on Windows
            with Path(file).open("w", encoding="utf-8", newline="") as csvfile:
                return self._to_csv(csvfile)
        else:
            return self._to_csv(file)

    def _to_csv(self, fh: TextWriterIO) -> None:
        # Datapoints can have different metrics: the header is the union of all of them
        fieldnames: tp.Set[str] = set()
        for dp in self.datapoints:
            fieldnames.update(dp.values)
        writer = csv.DictWriter(fh, fieldnames=["uid", "from_uid"] + sorted(fieldnames))
        writer.writeheader()
        for dp in self.datapoints:
            writer.writerow({
                **dp.values,
                "uid": dp.uid,
                "from_uid": dp.from_uid,
            })

    def _asdict(self) -> tp.Dict[str, tp.Any]:
        data: tp.Dict[str, tp.Any] = {
            "parameters_definition": {k: v._asdict() for k, v in self.parameters_definition.items()},
            "colormap": self.colormap,
            "colorby": self.colorby,
            "weightcolumn": self.weightcolumn,
            "display_data": self._display_data,
            "enabled_displays": self.enabledDisplays,
        }
        # Datapoints are sent under a different key depending on whether they are compressed
        if self._compress:
            from .compress import compress
            data["datapoints_compressed"] = compress(self.datapoints)
        else:
            data["datapoints"] = [d._asdict() for d in self.datapoints]
        return data

    def remove_missing_parents(self) -> "Experiment":
        """
        Sets :attr:`hiplot.Datapoint.from_uid` to None when set to a non-existing Datapoint.

        :returns: This experiment (modified in place)
        """
        existing_dp: tp.Set[str] = {dp.uid for dp in self.datapoints}
        for dp in self.datapoints:
            if dp.from_uid not in existing_dp:
                dp.from_uid = None
        return self

    def display_data(self, plugin: str) -> tp.Dict[str, tp.Any]:
        """
        Retrieve data dictionary for a plugin, which can be modified.
        An empty dictionary is created (and stored) the first time a plugin is requested.

        :param plugin: Name of the plugin

        :Example:

        .. code-block:: python

            exp.display_data(hip.Displays.XY).update({
                "axis_x": "time",
                "axis_y": "loss"
            })
        """
        return self._display_data.setdefault(plugin, {})

    @staticmethod
    def from_iterable(it: tp.Iterable[tp.Dict[str, tp.Any]]) -> "Experiment":
        """
        Creates a HiPlot experiment from an iterable/list of dictionaries.
        This is the easiest way to generate an `hiplot.Experiment` object.

        The ``uid`` entry of a row (or the row index when absent) is converted to a string
        and used as the datapoint uid. An empty-string ``from_uid`` means "no parent".
        All other entries become the datapoint values.

        :param it: A list (or iterable) of dictionaries

        :Example:

        >>> import hiplot as hip
        >>> hip.Experiment.from_iterable([{"p": "a"}, {"p": "b"}])
        <hiplot.experiment.Experiment object at 0x7f0f2e13c590>
        """
        return Experiment(
            datapoints=[
                Datapoint(
                    uid=str(row.get("uid", k)),
                    from_uid=row.get("from_uid") if row.get("from_uid") != '' else None,
                    values={mk: mv for mk, mv in row.items() if mk not in ["uid", "from_uid"]})
                for k, row in enumerate(it)
            ]
        )

    @staticmethod
    def from_csv(file: tp.Union[Path, str, tp.IO[str]]) -> "Experiment":
        """
        Creates a HiPlot experiment from a CSV file.

        :param file: CSV file path, or text handle to read from
        """
        if isinstance(file, (Path, str)):
            # newline="" is required by the csv module so that newlines embedded
            # in quoted fields are interpreted correctly
            with Path(file).open(encoding="utf-8", newline="") as csvfile:
                return Experiment.from_iterable(csv.DictReader(csvfile))
        else:
            return Experiment.from_iterable(csv.DictReader(file))

    @staticmethod
    def from_dataframe(dataframe: "pd.DataFrame") -> "Experiment":  # No type hint to avoid having pandas as an additional dependency
        """
        Creates a HiPlot experiment from a pandas DataFrame.
        The DataFrame given by the caller is left unmodified.

        :param dataframe: Pandas DataFrame
        """
        # Check if from_uid and uid is both in columns
        if {'from_uid', 'uid'}.issubset(dataframe.columns):
            # Check if there are any NaN values to handle
            if dataframe['from_uid'].isnull().values.any():
                # NaN values forces integer columns to become float, if uid is integer and from_uid is float, it crashes.
                # The line below changes uid to match from_uid type (either float or string), since NaN cannot be integer.
                # `assign` returns a new DataFrame, so the caller's object is never written to.
                dataframe = dataframe.assign(uid=dataframe['uid'].astype(dataframe['from_uid'].dtypes))
                dataframe = dataframe.fillna({'from_uid': '', 'uid': ''})
            # Replaces their dtypes accordingly to str, which is handled better with lesser errors with no change to functionality
            dataframe = dataframe.assign(
                uid=dataframe['uid'].astype(str),
                from_uid=dataframe['from_uid'].astype(str),
            )
        experiment = Experiment.from_iterable(dataframe.to_dict(orient='records'))
        # Restore columns order
        experiment.display_data(Displays.PARALLEL_PLOT)['order'] = list(dataframe.columns)
        experiment.display_data(Displays.TABLE)['order'] = list(dataframe.columns)
        return experiment

    @staticmethod
    def from_optuna(study: "optuna.study.Study") -> "Experiment":  # No type hint to avoid having optuna as an additional dependency
        """
        Creates a HiPlot experiment from a Optuna Study.

        Only trials in the ``COMPLETE`` state are used. The objective is stored as ``value``
        (or ``value_0``, ``value_1``... when there are several objectives), the trial number
        is used as ``uid``, and the trial parameters are added as-is.

        :param study: Optuna Study
        """
        # Create a list of dictionary objects using study trials
        # All parameters are taken using params.copy()

        # pylint: disable=redefined-outer-name
        import optuna

        hyper_opt_data = []
        for each_trial in study.get_trials(states=(optuna.trial.TrialState.COMPLETE, )):
            trial_params = {}
            # This checks if the trial was fully completed
            # the value will be None if the trial was interrupted halfway (e.g. via KeyboardInterrupt)
            if not each_trial.values:
                continue
            num_objectives = len(each_trial.values)

            if num_objectives == 1:
                # name = value, as it could be RMSE / accuracy, or any value that the user selects for tuning
                trial_params["value"] = each_trial.value
            else:
                for objective_id, value in enumerate(each_trial.values):
                    trial_params[f"value_{objective_id}"] = value
            trial_params["uid"] = each_trial.number
            trial_params.update(each_trial.params.copy())
            hyper_opt_data.append(trial_params)
        experiment = Experiment.from_iterable(hyper_opt_data)
        return experiment

    @staticmethod
    def merge(xp_dict: tp.Dict[str, "Experiment"]) -> "Experiment":
        """
        Merge several experiments into a single one

        Datapoint uids (and parent uids) are prefixed with the key of their experiment
        in ``xp_dict``, and this key is also stored in an additional ``exp`` value.
        Parameter definitions and display data are merged with ``dict.update``, so on
        conflict the last experiment wins.

        :param xp_dict: Experiments to merge, indexed by the name to give them
        """
        xp = Experiment(datapoints=[])
        assert xp.parameters_definition is not None  # for mypy
        for k, subxp in xp_dict.items():
            assert subxp is not None, k
            xp.datapoints += [
                Datapoint(
                    uid=f"{k}_{d.uid}",
                    from_uid=f"{k}_{d.from_uid}" if d.from_uid is not None else None,
                    values={**d.values, "exp": k}
                )
                for d in subxp.datapoints
            ]
            if subxp.parameters_definition is not None:
                xp.parameters_definition.update(subxp.parameters_definition)
            for d, v in subxp._display_data.items():
                xp.display_data(d).update(v)
        return xp
class ExperimentFetcherDoesntApply(Exception):
    """
    Exception related to experiment fetchers.

    NOTE(review): presumably raised by a fetcher that does not handle the
    string it was given - confirm against the fetchers' callers.
    """
# Signature of an experiment fetcher: a callable that takes one string and returns an Experiment.
# NOTE(review): the meaning of the string argument is not visible here - confirm against callers.
ExperimentFetcher = tp.Callable[[str], Experiment]
class ExperimentDisplayed(metaclass=ABCMeta):
    """
    Abstract interface to communicate with a displayed HiPlot
    visualization in a Jupyter notebook.

    Read more in :ref:`tutoNotebookDisplayedExperiment`
    """

    @abstractmethod
    def get_selected(self) -> tp.List[Datapoint]:
        """
        Returns a list of currently rendered datapoints in the parallel plot
        """

    @abstractmethod
    def get_brush_extents(self) -> tp.Dict[str, tp.Dict[str, tp.Any]]:
        """
        Returns a dictionary, where keys correspond to columns currently brushed in parallel plot,
        and values contain information about the current brush.
        """