Source code for kats.utils.emp_confidence_int

#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

"""The Empirical Confidence (Prediction) Interval

This is an empirical way to estimate the prediction interval for any forecasting
model. The high-level idea is to estimate the empirical error distribution of a
specific forecasting model, and to fit a linear regression model of the standard
error (S.E.) on the time horizon, under the assumption that a longer horizon
yields a larger S.E.
"""

import logging
from typing import List, Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from kats.consts import Params, TimeSeriesData
from kats.utils.backtesters import BackTesterRollingWindow
from scipy import stats


ALL_ERRORS = ["mape", "smape", "mae", "mase", "mse", "rmse"]


class EmpConfidenceInt:
    """Class for the empirical confidence (prediction) interval.

    The steps are as follows:

    1. Run k-fold CV for a given model and data; each fold forecasts h (horizon) steps ahead.
    2. For each horizon, calculate the std of the k error terms (S.E.).
    3. Fit a linear model: S.E. ~ horizon.
    4. Estimate the S.E. for each horizon of the true future.
    5. Lower/Upper = point estimate -/+ z_score * S.E.

    Attributes:
        error_methods: list of strings indicating which errors to calculate;
            we currently support "mape", "smape", "mae", "mase", "mse", "rmse".
        data: the time series data in `TimeSeriesData` format.
        params: the Kats model parameter object.
        train_percentage: percentage of data used for training.
        test_percentage: percentage of data used for testing.
        sliding_steps: number of rolling steps to take (number of folds).
        model_class: the Kats model class.
        multi: flag to use multiprocessing; the default is True.
        confidence_level: the confidence level for the prediction interval.
    """

    freq: Optional[str] = None
    dates: Optional[pd.DatetimeIndex] = None
    predicted: Optional[pd.DataFrame] = None
    coefs: Optional[np.ndarray] = None
    df: Optional[pd.DataFrame] = None

    def __init__(
        self,
        error_methods: List[str],
        data: TimeSeriesData,
        params: Params,
        train_percentage: float,
        test_percentage: float,
        sliding_steps: int,
        model_class,
        multi: bool = True,
        confidence_level: float = 0.8,
        **kwargs
    ):
        self.error_methods = error_methods
        self.data = data
        self.params = params

        logging.info("Initializing train/test percentages")
        if train_percentage <= 0:
            logging.error("Non-positive training percentage")
            raise ValueError("Invalid training percentage")
        elif train_percentage > 100:
            logging.error("Too large training percentage")
            raise ValueError("Invalid training percentage")
        self.train_percentage = train_percentage
        if test_percentage <= 0:
            logging.error("Non-positive test percentage")
            raise ValueError("Invalid test percentage")
        elif test_percentage > 100:
            logging.error("Too large test percentage")
            raise ValueError("Invalid test percentage")
        self.test_percentage = test_percentage
        if sliding_steps < 0:
            logging.error("Negative sliding steps")
            raise ValueError("Invalid sliding steps")
        self.sliding_steps = sliding_steps

        self.model_class = model_class
        self.multi = multi
        self.confidence_level = confidence_level
        logging.debug(
            "Initialized Empirical CI calculation with parameters: "
            "error_methods:{error_methods},"
            "data:{data},"
            "params:{params},"
            "train_percentage:{train_percentage},"
            "test_percentage:{test_percentage},"
            "sliding_steps:{sliding_steps},"
            "model_class:{model_class},"
            "multi:{multi},"
            "confidence_level:{confidence_level}".format(
                error_methods=error_methods,
                data=data,
                params=params,
                train_percentage=train_percentage,
                test_percentage=test_percentage,
                sliding_steps=sliding_steps,
                model_class=model_class,
                multi=multi,
                confidence_level=confidence_level,
            )
        )
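    # Example construction (a minimal sketch; `ProphetModel`/`ProphetParams`
    # come from kats.models.prophet, and the split and fold counts below are
    # illustrative choices, not defaults):
    #
    #     from kats.models.prophet import ProphetModel, ProphetParams
    #
    #     ci = EmpConfidenceInt(
    #         error_methods=["mape"],
    #         data=ts,  # a TimeSeriesData instance
    #         params=ProphetParams(),
    #         train_percentage=50,
    #         test_percentage=25,
    #         sliding_steps=3,
    #         model_class=ProphetModel,
    #         confidence_level=0.8,
    #     )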
    def run_cv(self):
        """Run the cross-validation process.

        Run CV with the given model and data, and collect the errors for each
        horizon, as follows:

            | horizon | error |
            |    1    | 12.3  |
            ...

        Then calculate the std for each horizon:

            | horizon | std |
            |    1    | 1.2 |

        Args:
            None

        Returns:
            None
        """

        logging.info("Creating backtester object.")
        backtester = BackTesterRollingWindow(
            self.error_methods,
            self.data,
            self.params,
            self.train_percentage,
            self.test_percentage,
            self.sliding_steps,
            self.model_class,
            self.multi,
        )

        logging.info("Running backtesting.")
        backtester.run_backtest()
        # raw_errors holds one list of per-horizon errors per fold; after the
        # transpose, rows are horizons, so std(axis=1) is the S.E. per horizon.
        self.SE = pd.DataFrame(backtester.raw_errors).transpose().std(axis=1)
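    # Shape sketch for self.SE (values illustrative): with 3 folds and a
    # 2-step horizon, backtester.raw_errors could be
    # [[1.0, 2.0], [1.2, 2.4], [0.8, 1.6]]; the transpose makes rows horizons,
    # and std(axis=1) collapses the folds, giving pd.Series([0.2, 0.4]),
    # one S.E. per horizon.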
    def get_lr(self):
        """Fit the linear regression model.

        Fit the linear regression model std ~ horizon and store its
        coefficients in `self.coefs`.

        Args:
            None

        Returns:
            None
        """

        X = pd.DataFrame(
            {
                "horizon": [x + 1 for x in range(len(self.SE))],
                "const": [1] * len(self.SE),
            }
        )
        y = self.SE
        logging.info("Fit the OLS model and get the coefs.")
        # rcond=None opts in to the machine-precision default and avoids the
        # FutureWarning raised when lstsq is called without an explicit rcond.
        self.coefs = np.linalg.lstsq(X, y, rcond=None)[0]
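    # Continuing the sketch above: the solve fits S.E.(h) = coefs[0] * h +
    # coefs[1], and for the illustrative points (1, 0.2) and (2, 0.4) it
    # returns slope 0.2 and intercept 0.0 exactly.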
    def get_eci(self, steps: int, **kwargs):
        """Get the empirical prediction interval.

        Args:
            steps: the length of the forecasting horizon.

        Returns:
            The DataFrame of forecasted values with prediction intervals.
        """

        self.freq = kwargs.get("freq", "D")

        # Get the forecast dates, dropping the last observed date so the
        # correct number of periods is returned.
        last_date = self.data.time.max()
        dates = pd.date_range(start=last_date, periods=steps + 1, freq=self.freq)
        self.dates = dates[dates != last_date]

        self.run_cv()
        self.get_lr()

        # Re-fit the model on all of the data and forecast.
        m = self.model_class(self.data, self.params)
        m.fit()
        self.predicted = predicted = m.predict(steps, freq=self.freq)

        # Margin of error: z-score times the fitted S.E. for each horizon.
        horizons = np.array([x + 1 for x in range(steps)])
        coefs = self.coefs
        assert coefs is not None
        me = stats.norm.ppf(self.confidence_level) * (horizons * coefs[0] + coefs[1])
        self.df = pd.DataFrame(
            {
                "time": predicted.time,
                "fcst": predicted.fcst,
                "fcst_lower": predicted.fcst - me,
                "fcst_upper": predicted.fcst + me,
            }
        )
        return self.df
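    # End-to-end sketch (continuing the construction example above; the
    # `steps` and `freq` values are illustrative):
    #
    #     result = ci.get_eci(steps=30, freq="D")
    #     result.columns  # time, fcst, fcst_lower, fcst_upper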
    def diagnose(self):
        """Diagnose the linear model fit for the S.E.

        Plot the OLS fit: S.E. ~ horizon.

        Args:
            None

        Returns:
            None
        """

        fig, ax = plt.subplots(figsize=(10, 6))
        x = [x + 1 for x in range(len(self.SE))]
        ax.scatter(x, self.SE, label="Empirical S.E.", color="r")
        predictions = self.coefs[0] * np.array(x) + self.coefs[1]
        ax.plot(x, predictions, "b", label="Fitted")
        ax.legend(loc=2)
        ax.set_xlabel("Horizon")
        ax.set_ylabel("SE")
        ax.set_title("Diagnosis plot for SE ~ Horizon")
        fig.tight_layout()
    def plot(self):
        """Plot the model fit with the new uncertainty intervals.

        Args:
            None

        Returns:
            None
        """

        logging.info("Generating chart for forecast result with emp. conf. intervals.")
        fig = plt.figure(facecolor="w", figsize=(10, 6))
        ax = fig.add_subplot(111)

        ax.plot(pd.to_datetime(self.data.time), self.data.value, "k")
        fcst_dates = self.dates.to_pydatetime()
        ax.plot(fcst_dates, self.df.fcst, ls="-", c="#4267B2")
        ax.fill_between(
            fcst_dates,
            self.df.fcst_lower,
            self.df.fcst_upper,
            color="#4267B2",
            alpha=0.2,
        )

        # If the model provides a default CI of its own, plot it as well.
        if self.predicted.shape[1] == 4:
            ax.fill_between(
                fcst_dates,
                self.predicted.fcst_lower,
                self.predicted.fcst_upper,
                color="r",
                alpha=0.2,
            )

        ax.grid(True, which="major", c="gray", ls="-", lw=1, alpha=0.2)
        ax.set_xlabel(xlabel="time")
        ax.set_ylabel(ylabel="y")
        fig.tight_layout()
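
# Both plotting helpers assume get_eci() has already been called, since they
# read self.SE, self.coefs, self.dates, self.df and self.predicted:
#
#     ci.diagnose()  # scatter of the empirical S.E. against the fitted line
#     ci.plot()      # forecast with the empirical prediction interval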