#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
This module contains some of the key data structures in the Kats library,
including :class:`TimeSeriesData`, :class:`TimeSeriesChangePoint`, and
:class:`TimeSeriesIterator`.
:class:`TimeSeriesChangePoint` is the return type of many of the Kats detection
algorithms.
:class:`TimeSeriesData` is the fundamental data structure in the Kats library,
giving users access to a host of forecasting, detection, and utility
algorithms right at the user's fingertips.
"""
from __future__ import annotations
import copy
import datetime
import logging
from enum import Enum, auto, unique
from typing import List, Optional, Union, cast
import dateutil
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.api.types import is_datetime64_any_dtype as is_datetime, is_numeric_dtype
from pandas.testing import assert_frame_equal, assert_series_equal
from pandas.tseries.frequencies import to_offset
# Constants
DEFAULT_TIME_NAME = "time"  # Default name for the time column in TimeSeriesData
DEFAULT_VALUE_NAME = "value"  # Default name for the value column in TimeSeriesData
PREFIX_OP_1 = "_kats.1"  # Internal prefix used when merging two TimeSeriesData objects
PREFIX_OP_2 = (
    "_kats.2"  # Second internal prefix used when merging two TimeSeriesData objects
)
INTERPOLATION_METHODS = {
    "linear",
    "bfill",
    "ffill",
}  # Set of interpolation methods accepted by TimeSeriesData.interpolate()
def _log_error(msg: str) -> ValueError:
logging.error(msg)
return ValueError(msg)
class TimeSeriesChangePoint:
    """Object returned by detector classes.

    Attributes:
        start_time: Start time of the change.
        end_time: End time of the change.
        confidence: The confidence of the change point.
    """

    def __init__(self, start_time, end_time, confidence: float) -> None:
        self._start_time = start_time
        self._end_time = end_time
        self._confidence = confidence

    @property
    def start_time(self):
        """Start time of the change."""
        return self._start_time

    @property
    def end_time(self):
        """End time of the change."""
        return self._end_time

    @property
    def confidence(self) -> float:
        """The confidence of the change point."""
        return self._confidence

    def __repr__(self) -> str:
        return (
            f"TimeSeriesChangePoint(start_time: {self._start_time}, end_time: "
            f"{self._end_time}, confidence: {self._confidence})"
        )

    # __str__ produced the exact same text as __repr__; alias it instead of
    # duplicating the format string.
    __str__ = __repr__
class TimeSeriesData:
"""The fundamental Kats data structure to store a time series.
In order to access much of the functionality in the Kats library, users
must initialize the :class:`TimeSeriesData` class with their data first.
Initialization. :class:`TimeSeriesData` can be initialized from the
following data sources:
- `pandas.DataFrame`
- `pandas.Series`
- `pandas.DatetimeIndex`
Typical usage example for initialization:
>>> import pandas as pd
>>> df = pd.read_csv("/kats/data/air_passengers.csv")
>>> ts = TimeSeriesData(df=df, time_col_name="ds")
Initialization arguments (all optional, but must choose one way to
initialize e.g. `pandas.DataFrame`):
- df: A `pandas.DataFrame` storing the time series (default None).
- sort_by_time: A boolean indicating whether the :class:`TimeSeriesData`
should be sorted by time (default True).
- time: a `pandas.Series` or `pandas.DatetimeIndex` storing the time
values (default None).
- value: A pandas.Series or pandas.DataFrame storing the series value(s)
(default None).
- time_col_name: A string representing the name of the time column (
default "time")
- date_format: A string specifying the format of the date/time in the
time column. Useful for faster parsing, and required if
`pandas.to_datetime()` cannot parse the column otherwise (default None).
- use_unix_time: A boolean indicating if the time is represented as
unix time (default False).
- unix_time_units: A string indicating the units of the unix time -- only
used if `use_unix_time=True` (default "ns").
- tz: A string representing the timezone of the time values (default None).
- tz_ambiguous: A string representing how to handle ambiguous timezones
(default "raise").
- tz_nonexistent: A string representing how to handle nonexistent timezone
values (default "raise").
Raises:
ValueError: Invalid params passed when trying to create the
:class:`TimeSeriesData`.
Operations. Many operations that you can do with `pandas.DataFrame` objects
are also applicable to :class:`TimeSeriesData`. For example:
>>> ts[0:2] # Slicing
>>> ts_1 == ts_2 # Equality
>>> ts_1.extend(ts_2) # Extend
>>> ts.plot(cols=["y"]) # Visualize
Utility Functions. Many utility functions for converting
:class:`TimeSeriesData` objects to other common data structures exist.
For example:
>>> ts.to_dataframe() # Convert to pandas.DataFrame
>>> ts.to_array() # Convert to numpy.ndarray
Attributes:
time: A `pandas.Series` object storing the time values of the time
series.
value: A `pandas.Series` (if univariate) or `pandas.DataFrame` (if
multivariate) object storing the values of each field in the time
series.
min: A float or `pandas.Series` representing the min value(s) of the
time series.
max: A float or `pandas.Series` representing the max value(s) of the
time series.
"""
_min: float = np.nan
_max: float = np.nan
def __init__( # noqa C901
self,
df: Optional[pd.DataFrame] = None,
sort_by_time: bool = True,
time: Union[pd.Series, pd.DatetimeIndex, None] = None,
value: Union[pd.Series, pd.DataFrame, None] = None,
time_col_name: str = DEFAULT_TIME_NAME,
date_format: Optional[str] = None,
use_unix_time: bool = False,
unix_time_units: str = "ns",
tz: Optional[str] = None,
tz_ambiguous: Union[str, np.ndarray] = "raise",
tz_nonexistent: str = "raise",
) -> None:
"""Initializes :class:`TimeSeriesData` class with arguments provided."""
self.time_col_name = time_col_name
# If DataFrame is passed
if df is not None:
if not isinstance(df, pd.DataFrame):
msg = (
"Argument df needs to be a pandas.DataFrame but is of type "
f"{type(df)}."
)
raise _log_error(msg)
# If empty DataFrame is passed then create an empty object
if df.empty:
self._time = pd.Series([], name=time_col_name, dtype=float)
self._value = pd.Series([], name=DEFAULT_VALUE_NAME, dtype=float)
logging.warning("Initializing empty TimeSeriesData object")
# Otherwise initialize TimeSeriesData from DataFrame
else:
# Ensuring time column is present in DataFrame
if self.time_col_name not in df.columns:
msg = f"Time column {self.time_col_name} not in DataFrame"
raise _log_error(msg)
# Parsing time column into correct format
df = df.copy()
df.reset_index(inplace=True, drop=True)
df[self.time_col_name] = self._set_time_format(
series=df[self.time_col_name],
date_format=date_format,
use_unix_time=use_unix_time,
unix_time_units=unix_time_units,
tz=tz,
tz_ambiguous=tz_ambiguous,
tz_nonexistent=tz_nonexistent,
)
# Sorting by time if necessary
if sort_by_time:
df.sort_values(self.time_col_name, inplace=True)
df.reset_index(inplace=True, drop=True)
else:
logging.warning(
(
"Please make sure the time series is sorted by time or "
"set 'sort_by_time' as True."
)
)
self._time = df[self.time_col_name]
self._value = df[[x for x in df.columns if x != self.time_col_name]]
self._set_univariate_values_to_series()
# If separate objects are passed
elif time is not None and value is not None:
if not (
(
isinstance(time, pd.core.series.Series)
or isinstance(time, pd.DatetimeIndex)
)
and (
isinstance(value, pd.core.series.Series)
or isinstance(value, pd.DataFrame)
)
):
msg = (
f"Invalid types: time is {type(time)} when it must be a "
+ "pandas.Series or pandas.DatetimeIndex and value is "
+ f"{type(value)} when it must be a pandas.DataFrame or "
+ "pandas.Series"
)
raise _log_error(msg)
if isinstance(time, pd.DatetimeIndex):
self._time = pd.Series(time)
else:
self._time = cast(pd.Series, time.reset_index(drop=True))
self._value = value.reset_index(drop=True)
self._set_univariate_values_to_series()
# Set time col name
if time.name:
self.time_col_name = time.name
# Checking for emptiness
if self.time.empty and self.value.empty:
logging.warning("Initializing empty TimeSeriesData object")
self.time = pd.Series([], name=time_col_name)
if isinstance(value, pd.DataFrame):
self.value = pd.Series([], name=DEFAULT_VALUE_NAME)
else:
self.value = pd.Series(
[], name=value.name if value.name else DEFAULT_VALUE_NAME
)
# Raise exception if only one of time and value is empty
elif self.time.empty or self.value.empty:
msg = "One of time or value is empty while the other is not"
raise _log_error(msg)
# If time values are passed then standardizing format
else:
self.time = cast(
pd.Series,
self._set_time_format(
self.time,
date_format=date_format,
use_unix_time=use_unix_time,
unix_time_units=unix_time_units,
tz=tz,
tz_ambiguous=tz_ambiguous,
tz_nonexistent=tz_nonexistent,
).reset_index(drop=True),
)
# If None is passed
elif not time and not value:
self._time = pd.Series([], name=time_col_name)
self._value = pd.Series([], name=DEFAULT_VALUE_NAME)
logging.warning("Initializing empty TimeSeriesData object")
# Error if only one of time or value is None
else:
msg = "One of time or value is empty while the other is not"
raise _log_error(msg)
# Validate values
if not self.value.empty and not (
(
isinstance(self.value, pd.core.series.Series)
and is_numeric_dtype(self.value)
)
or (
isinstance(self.value, pd.DataFrame)
and all(is_numeric_dtype(self.value[col]) for col in self.value)
)
):
msg = f"Time series data is type {self.value.dtype} but must be numeric"
raise _log_error(msg)
self._calc_min_max_values()
@property
def time(self) -> pd.Series:
"""Returns the time values of the series.
Returns:
A `pandas.Series` representing the time values of the time series.
"""
return self._time
@time.setter
def time(self, time_values: pd.Series) -> None:
"""Sets the time values of the :class:`TimeSeriesData`.
Args:
time_values. A `pandas.Series` with the updated time values.
"""
self._time = time_values
@property
def value(self) -> Union[pd.Series, pd.DataFrame]:
"""Returns the value(s) of the series.
Returns:
A `pandas.Series` or `pandas.DataFrame` representing the value(s) of the
time series.
"""
return self._value
@value.setter
def value(self, values: Union[pd.Series, pd.DataFrame]) -> None:
"""Sets the value(s) of the :class:`TimeSeriesData.`
Args:
values: A `pandas.Series` or `pandas.DataFrame` with the updated
values(s).
"""
self._value = values
# updates for min/max values are necessary once values are updated
self._calc_min_max_values()
@property
def min(self) -> Union[pd.Series, float]:
"""Returns the min value(s) of the series.
Returns:
A `pandas.Series` or float representing the min value(s) of the
time series.
"""
return self._min
@property
def max(self) -> Union[pd.Series, float]:
"""Returns the max value(s) of the series.
Returns:
A `pandas.Series` or float representing the max value(s) of the
time series.
"""
return self._max
def __eq__(self, other: object) -> bool:
# Currently "__eq__" only works with other TimeSeriesData objects.
if not isinstance(other, TimeSeriesData):
return NotImplemented
# Check if time values are equal
try:
assert_series_equal(self.time, other.time, check_dtype=False)
except AssertionError:
return False
# If both objects are univariate
if isinstance(self.value, pd.Series) and isinstance(other.value, pd.Series):
# Check if value Series are equal
try:
assert_series_equal(self.value, other.value, check_dtype=False)
except AssertionError:
return False
# If both objects are multivariate
elif isinstance(self.value, pd.DataFrame) and isinstance(
other.value, pd.DataFrame
):
# Check if value DataFrames are equal (ignore column order)
try:
assert_frame_equal(
self.value.sort_index(axis=1),
other.value.sort_index(axis=1),
check_names=True,
check_dtype=False,
)
except AssertionError:
return False
# Otherwise one TimeSeriesData is univariate and the other is multivariate
else:
return False
return True
def __ne__(self, other: object) -> bool:
return not self.__eq__(other)
    def __sub__(self, other: object) -> TimeSeriesData:
        # Element-wise subtraction; operands are outer-joined on time values.
        return self._perform_op(other, OperationsEnum.SUB)

    def __truediv__(self, other: object) -> TimeSeriesData:
        # Element-wise true division; operands are outer-joined on time values.
        return self._perform_op(other, OperationsEnum.DIV)

    def __add__(self, other: object) -> TimeSeriesData:
        # Element-wise addition; operands are outer-joined on time values.
        return self._perform_op(other, OperationsEnum.ADD)

    def __mul__(self, other: object) -> TimeSeriesData:
        # Element-wise multiplication; operands are outer-joined on time values.
        return self._perform_op(other, OperationsEnum.MUL)
def __len__(self) -> int:
return len(self.value)
def __getitem__(self, sliced) -> TimeSeriesData:
return TimeSeriesData(
time=self.time[sliced],
value=self.value[sliced],
time_col_name=self.time_col_name,
)
    def __repr__(self) -> str:
        # Delegate to the DataFrame representation for a familiar tabular view.
        return self.to_dataframe().__repr__()

    def _repr_html_(self) -> str:
        # HTML rendering hook used by Jupyter notebooks; delegates to pandas.
        return self.to_dataframe()._repr_html_()
def _set_univariate_values_to_series(self):
# This hack is required since downstream models are expecting value of
# type Series in case of univariate time series
if isinstance(self.value, pd.DataFrame) and self.value.shape[1] == 1:
self.value = self.value.iloc[:, 0]
[docs] def is_empty(self) -> bool:
"""Checks if the :class:`TimeSeriesData` is empty.
Returns:
False if :class:`TimeSeriesData` does not have any datapoints.
Otherwise return True.
"""
return self.value.empty and self.time.empty
    def _set_time_format(
        self,
        series: pd.Series,
        date_format: Optional[str],
        use_unix_time: Optional[bool],
        unix_time_units: Optional[str],
        tz: Optional[str] = None,
        tz_ambiguous: Union[str, np.ndarray] = "raise",
        tz_nonexistent: str = "raise",
    ) -> pd.core.series.Series:
        """Parses time format when initializing :class:`TimeSeriesData`.

        Args:
            series: Raw time column to convert to pandas datetimes.
            date_format: Optional explicit format string for parsing.
            use_unix_time: Interpret the values as unix timestamps.
            unix_time_units: Units of the unix timestamps (e.g. "s", "ns").
            tz: Optional timezone to convert/localize to.
            tz_ambiguous: Passed to `tz_localize` for ambiguous times.
            tz_nonexistent: Passed to `tz_localize` for nonexistent times.

        Returns:
            A `pandas.Series` of datetime values.

        Raises:
            ValueError: The column could not be parsed.
        """
        # Already datetime-typed columns are returned unchanged (see else below).
        if not is_datetime(series):
            # If we should use unix time
            if use_unix_time:
                try:
                    if tz:
                        # Parse as UTC first, then convert to the target timezone.
                        return (
                            pd.to_datetime(
                                series.values, unit=unix_time_units, utc=True
                            )
                            .tz_convert(tz)
                            .to_series()
                            .reset_index(drop=True)
                        )
                    else:
                        return pd.to_datetime(series, unit=unix_time_units)
                except ValueError:
                    logging.error("Failed to parse unix time")
                    logging.debug(
                        "Could not parse time column "
                        + f"{list(series)} using unix units "
                        + f"{unix_time_units}"
                    )
                    raise ValueError("Unable to parse unix time")
            # Otherwise try to parse string
            else:
                try:
                    if tz:
                        # Parse naive datetimes, then localize to the timezone.
                        return (
                            pd.to_datetime(series.values, format=date_format)
                            .tz_localize(
                                tz, ambiguous=tz_ambiguous, nonexistent=tz_nonexistent
                            )
                            .to_series()
                            .reset_index(drop=True)
                        )
                    else:
                        return pd.to_datetime(series, format=date_format)
                except ValueError:
                    logging.error("Failed to parse time")
                    logging.debug(
                        "Could not parse time column "
                        + f"{list(series)} automatically "
                        + "or by using specified format "
                        + f"{date_format}"
                    )
                    raise ValueError("Unable to parse time with format specified")
        else:
            return series
    def extend(self, other: object, validate: bool = True) -> None:
        """
        Extends :class:`TimeSeriesData` with another :class:`TimeSeriesData`
        object, in place.

        Args:
            other: The other :class:`TimeSeriesData` object (currently
                only other :class:`TimeSeriesData` objects are supported).
            validate (optional): A boolean representing if the new
                :class:`TimeSeriesData` should be validated (default True).

        Raises:
            TypeError: The object passed was not an instance of
                :class:`TimeSeriesData`.
            ValueError: `validate` is True and the combined series does not
                have a constant time frequency.
        """
        if not isinstance(other, TimeSeriesData):
            raise TypeError("extend must take another TimeSeriesData object")
        # Concatenate times
        self.time = pd.concat([self.time, other.time], ignore_index=True).reset_index(
            drop=True
        )
        # Convert values to DataFrame if needed so both sides concat uniformly
        cur_value = self.value
        other_value = other.value
        if isinstance(self.value, pd.Series):
            cur_value = pd.DataFrame(cur_value)
        if isinstance(other.value, pd.Series):
            other_value = pd.DataFrame(other_value)
        # Concatenate values
        self.value = pd.concat([cur_value, other_value], ignore_index=True).reset_index(
            drop=True
        )
        # Merge value back to Series if required
        self._set_univariate_values_to_series()
        # Validate that frequency is constant if required
        if validate:
            self.validate_data(validate_frequency=True, validate_dimension=False)
[docs] def time_to_index(self) -> pd.DatetimeIndex:
"""
Utility function converting the time in the :class:`TimeSeriesData`
object to a `pandas.DatetimeIndex`.
Returns:
A `pandas.DatetimeIndex` representation of the time values of the series.
"""
return pd.DatetimeIndex(self.time)
[docs] def validate_data(self, validate_frequency: bool, validate_dimension: bool) -> None:
"""
Validates the time series for correctness (on both frequency and
dimension).
Args:
validate_frequency: A boolean indicating whether the
:class:`TimeSeriesData` should be validated for constant frequency.
validate_dimension: A boolean indicating whether the
:class:`TimeSeriesData` should be validated for having both the
same number of timesteps and values.
Raises:
ValueError: The frequency and/or dimensions were invalid.
"""
# check the time frequency is constant
if validate_frequency and pd.infer_freq(self.time_to_index()) is None:
raise ValueError("Only constant frequency is supported for time!")
if validate_dimension and len(self.time) != self.value.shape[0]:
raise ValueError("time and value has different length (dimension)!")
def _calc_min_max_values(self):
# Get maximum and minimum values
if not self.value.empty:
if isinstance(self.value, pd.core.series.Series):
self._min = np.nanmin(self.value.values)
self._max = np.nanmax(self.value.values)
else:
self._min = self.value.min(skipna=True)
self._max = self.value.max(skipna=True)
else:
self._min = np.nan
self._max = np.nan
[docs] def is_data_missing(self) -> bool:
"""
Checks if data is missing from the time series.
This is very similar to :meth:`validate_data()` but will not raise an
error.
Returns:
True when data is missing from the time series. Otherwise False.
"""
# pd.infer_freq needs at least 3 time points.
# here we tackle the case less than 3 time points
if len(self.time) < 3:
return False
if pd.infer_freq(self.time_to_index()) is None:
return True
else:
return False
[docs] def freq_to_timedelta(self):
"""
Returns a `pandas.Timedelta` representation of the
:class:`TimeSeriesdata` frequency.
Returns:
A `pandas.Timedelta` object representing the frequency of the
:class:`TimeSeriesData`.
"""
return pd.Timedelta(to_offset(pd.infer_freq(self.time_to_index())))
    def tz(
        self,
    ) -> Union[datetime.tzinfo, dateutil.tz.tz.tzfile, None]:
        """
        Returns the timezone of the :class:`TimeSeriesData`.

        Returns:
            A timezone aware object representing the timezone of the
            :class:`TimeSeriesData`. Returns None when there is no timezone
            present.

        For more info, see:
        https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DatetimeIndex.tz.html.
        """
        # The timezone lives on the DatetimeIndex built from the time values.
        return self.time_to_index().tz
[docs] def is_univariate(self):
"""Returns whether the :class:`TimeSeriesData` is univariate.
Returns:
True if the :class:`TimeSeriesData` is univariate. False otherwise.
"""
return len(self.value.shape) == 1
[docs] def to_dataframe(self, standard_time_col_name: bool = False) -> pd.DataFrame:
"""
Converts the :class:`TimeSeriesData` object into a `pandas.DataFrame`.
Args:
standard_time_col (optional): True if the DataFrame's time column name
should be "time". To keep the same time column name as the current
:class:`TimeSeriesData` object, leave as False (default False).
"""
time_col_name = (
DEFAULT_TIME_NAME if standard_time_col_name else self.time_col_name
)
output_df = pd.DataFrame(dict(zip((time_col_name,), (self.time,))))
if isinstance(self.value, pd.Series):
if self.value.name is not None:
output_df[self.value.name] = self.value
else:
output_df[DEFAULT_VALUE_NAME] = self.value
elif isinstance(self.value, pd.DataFrame):
output_df = pd.concat([output_df, self.value], axis=1).reset_index(
drop=True
)
else:
raise ValueError(f"Wrong value type: {type(self.value)}")
return output_df
[docs] def to_array(self) -> np.ndarray:
"""Converts the :class:`TimeSeriesData` object to a `numpy.ndarray`.
Returns:
A `numpy.ndarray` representation of the time series.
"""
return self.to_dataframe().to_numpy()
def _get_binary_op_other_arg(self, other: object) -> TimeSeriesData:
if isinstance(other, float) or isinstance(other, int):
if isinstance(self.value, pd.Series):
return TimeSeriesData(
pd.DataFrame(
dict(
zip(
(DEFAULT_TIME_NAME, self.value.name),
(self.time, pd.Series(other, index=self.time.index)),
)
)
)
)
else:
# TODO: implement multivariate time series operation with constant
raise NotImplementedError("Operation on multivariate")
if not isinstance(other, TimeSeriesData):
raise TypeError("Binary op must take another TimeSeriesData object")
if not self.time.equals(other.time):
raise ValueError("BBinary op must take a TimeSeriesData with same time")
return other
    def _perform_op(self, other: object, op_type: "OperationsEnum") -> TimeSeriesData:
        """Apply the binary operation *op_type* between this series and *other*.

        The operands are outer-joined on their time values; value columns
        present in only one operand come back as NaN in the result.
        """
        # Extract DataFrames with same time column name for joining
        self_df = self.to_dataframe(standard_time_col_name=True)
        other_df = self._get_binary_op_other_arg(other).to_dataframe(
            standard_time_col_name=True
        )
        # Join DataFrames on time column; clashing value columns get the
        # internal _kats.1/_kats.2 suffixes so they can be paired up below.
        combo_df = pd.merge(
            self_df,
            other_df,
            on=DEFAULT_TIME_NAME,
            how="outer",
            suffixes=(PREFIX_OP_1, PREFIX_OP_2),
        )
        # Map the final column name to the sub column names
        col_map = {}
        for col_name in list(combo_df.columns):
            if PREFIX_OP_1 in col_name:
                prefix = col_name.split(PREFIX_OP_1)[0]
                col_map[prefix] = col_map.get(prefix, []) + [col_name]
            elif PREFIX_OP_2 in col_name:
                prefix = col_name.split(PREFIX_OP_2)[0]
                col_map[prefix] = col_map.get(prefix, []) + [col_name]
        for col_name in list(col_map.keys()):
            # Perform operation on two columns and merge back to one column
            col_1, col_2 = col_map[col_name]
            if op_type == OperationsEnum.ADD:
                combo_df[col_name] = combo_df[col_1] + combo_df[col_2]
            elif op_type == OperationsEnum.SUB:
                combo_df[col_name] = combo_df[col_1] - combo_df[col_2]
            elif op_type == OperationsEnum.DIV:
                combo_df[col_name] = combo_df[col_1] / combo_df[col_2]
            elif op_type == OperationsEnum.MUL:
                combo_df[col_name] = combo_df[col_1] * combo_df[col_2]
            else:
                raise ValueError("Unsupported Operations Type")
            combo_df.drop([col_1, col_2], axis=1, inplace=True)
        # Set columns only present in one of the objects to None (NaN)
        final_col_list = set([DEFAULT_TIME_NAME] + list(col_map.keys()))
        for col_name in list(combo_df.columns):
            if col_name not in final_col_list:
                combo_df[col_name] = np.nan
        # Change time col name back if needed
        if self.time_col_name != DEFAULT_TIME_NAME:
            combo_df[self.time_col_name] = combo_df[DEFAULT_TIME_NAME]
            combo_df.drop(DEFAULT_TIME_NAME, axis=1, inplace=True)
        return TimeSeriesData(df=combo_df, time_col_name=self.time_col_name)
[docs] def infer_freq_robust(self) -> pd.Timedelta:
"""
This method is a more robust way to infer the frequency of the time
series in the presence of missing data. It looks at the diff of the
time series, and decides the frequency by majority voting.
Returns:
A `pandas.Timedelta` object representing the frequency of the series.
Raises:
ValueError: The :class:`TimeSeriesData` has less than 2 data points.
"""
df = self.to_dataframe()
if df.shape[0] <= 1:
raise ValueError("Cannot find frequency for less than two data points")
freq_counts = (
df[self.time_col_name].diff().value_counts().sort_values(ascending=False)
)
frequency = freq_counts.index[0]
return frequency
    def interpolate(
        self,
        freq: Optional[Union[str, pd.Timedelta]] = None,
        method: str = "linear",
        remove_duplicate_time: bool = False,
    ) -> TimeSeriesData:
        """
        Interpolate missing dates if `time` doesn't have constant frequency.

        The following options are available:

        - linear
        - backward fill ("bfill")
        - forward fill ("ffill")

        See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.interpolate.html
        for more detail on these options.

        Args:
            freq: A string representing the pre-defined freq of the time
                series; inferred via :meth:`infer_freq_robust` when not given.
            method: A string representing the method to impute the missing time
                and data. See the above options (default "linear").
            remove_duplicate_time: A boolean to auto-remove any duplicate time
                values, needed because interpolation indexes on time
                (default False).

        Returns:
            A new :class:`TimeSeriesData` object with interpolated data.

        Raises:
            ValueError: `method` is unsupported, or a value column cannot be
                cast to float for linear interpolation.
        """
        if not freq:
            freq = self.infer_freq_robust()
        # convert to pandas.DataFrame so that we can leverage the built-in methods
        df = self.to_dataframe()
        # Linear interpolation fails if a column has an int type - convert to float
        if method == "linear":
            for col in list(df):
                if col != self.time_col_name:
                    try:
                        df[col] = df[col].astype(float)
                    except ValueError:
                        raise ValueError(
                            f"Column {col} is invalid type: {df[col].dtype}"
                        )
        df.set_index(self.time_col_name, inplace=True)
        # Removing duplicate time index values if needed
        if remove_duplicate_time:
            df = df[~df.index.duplicated()]
        if method == "linear":
            df = df.resample(freq).interpolate(method="linear")
        elif method == "ffill":
            df = df.resample(freq).ffill()
        elif method == "bfill":
            df = df.resample(freq).bfill()
        else:
            # method is not supported
            raise ValueError(f"the given method is not supported: {method}")
        df = df.reset_index().rename(columns={"index": self.time_col_name})
        return TimeSeriesData(df, time_col_name=self.time_col_name)
    def plot(self, cols: List[str]) -> None:
        """Plots the time series.

        Args:
            cols: List of variable names (strings) to plot against time.

        Raises:
            ValueError: A requested column is not present in the series.
        """
        # Nothing to draw for an empty series.
        if self.is_empty():
            return
        # Make sure columns are valid
        df = self.to_dataframe()
        all_cols = list(df.columns)
        all_cols.remove(self.time_col_name)
        if not set(cols).issubset(all_cols):
            logging.error(f"Columns to plot: {cols} are not all in the timeseries")
            raise ValueError("Invalid columns passed")
        # Plot
        logging.info("Plotting time series")
        fig = plt.figure(facecolor="w", figsize=(10, 6))
        ax = fig.add_subplot(111)
        for col in cols:
            # Draw each requested column in black against the time axis.
            ax.plot(
                df[self.time_col_name].to_numpy(),
                df[col].to_numpy(),
                "k",
            )
        ax.grid(True, which="major", c="gray", ls="-", lw=1, alpha=0.2)
        fig.tight_layout()
        # NOTE(review): this draws the same columns a second time via pandas on
        # the axes already plotted above — presumably intentional for styling,
        # but worth confirming it is not accidental double-plotting.
        # pyre-ignore[29]: `pd.core.accessor.CachedAccessor` is not a function.
        self.to_dataframe().plot(x=self.time_col_name, y=cols, ax=ax)
class TimeSeriesIterator:
    """Column-wise iterator over a :class:`TimeSeriesData`.

    Each iteration yields one value column as a `pandas.DataFrame` indexed by
    the series' time values, with the single column named "y".
    """

    def __init__(self, ts: TimeSeriesData) -> None:
        self.ts = copy.deepcopy(ts)
        # NOTE(review): the DataFrame wraps the *original* ts.value rather than
        # the deepcopy made on the previous line — confirm whether sharing the
        # caller's values is intended.
        self.ts.value = pd.DataFrame(ts.value)
        # Index of the next value column to yield.
        self.start = 0

    def __iter__(self):
        # NOTE(review): `self.a` holds only the first column and is never read
        # by __next__; it appears to be kept for external consumers.
        self.a = pd.DataFrame(
            list(self.ts.value.iloc[:, 0]), index=list(self.ts.time), columns=["y"]
        )
        return self

    def __next__(self):
        if self.start < self.ts.value.shape[1]:
            # Yield column `start` as a one-column frame indexed by time.
            x = pd.DataFrame(
                list(self.ts.value.iloc[:, self.start]),
                index=list(self.ts.time),
                columns=["y"],
            )
            self.start += 1
            return x
        else:
            raise StopIteration
class TSIterator:
    """Iterates through the values of a single timeseries.

    Produces a timeseries with a single point, in case of an
    univariate time series, or a timeseries with an array indicating
    the values at the given location, for a multivariate time series.

    Attributes:
        ts: The input timeseries.
    """

    def __init__(self, ts: TimeSeriesData) -> None:
        self.ts = ts
        self.curr = 0

    def __iter__(self):
        return self

    def __next__(self) -> TimeSeriesData:
        if self.curr >= len(self.ts.time):
            raise StopIteration
        position = self.curr
        self.curr += 1
        point_time = pd.Series(self.ts.time[position])
        if self.ts.is_univariate():
            # Univariate: a single scalar value wrapped in a Series.
            point_value = pd.Series(self.ts.value.iloc[position])
        else:
            # Multivariate: one row of values as a DataFrame.
            point_value = pd.DataFrame(self.ts.value.iloc[position])
        return TimeSeriesData(time=point_time, value=point_value)
class Params:
    """No-op base class for model parameter containers."""

    def __init__(self) -> None:
        pass

    def validate_params(self) -> None:
        """Validate parameter values; the base implementation does nothing."""
        pass
@unique
class ModelEnum(Enum):
    """
    This enum lists the options of models to be set for default search space in
    hyper-parameter tuning.
    """

    ARIMA = auto()
    SARIMA = auto()
    PROPHET = auto()
    HOLTWINTERS = auto()
    LINEAR = auto()
    QUADRATIC = auto()
@unique
class SearchMethodEnum(Enum):
    """
    This enum lists the options of search algorithms to be used in
    hyper-parameter tuning.
    """

    GRID_SEARCH = auto()
    RANDOM_SEARCH_UNIFORM = auto()
    RANDOM_SEARCH_SOBOL = auto()
    BAYES_OPT = auto()
@unique
class OperationsEnum(Enum):
    """
    This enum lists all the mathematical operations that can be performed on
    :class:`TimeSeriesData` objects.
    """

    ADD = auto()
    SUB = auto()
    DIV = auto()
    MUL = auto()
# Public API of this module.
__all__ = [
    'ModelEnum',
    'OperationsEnum',
    'Params',
    'SearchMethodEnum',
    'TimeSeriesChangePoint',
    'TimeSeriesData',
    'TimeSeriesIterator',
    'TSIterator',
]