# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Libraries
# ==============================================================================
import dataiku
from dataiku import pandasutils as pdu
import pandas as pd, numpy as np
from pmdarima.arima import auto_arima
import warnings

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: MARKDOWN
# ## Model

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Read the DSS dataset into a pandas dataframe indexed by its 'date' column
# ==============================================================================
data = dataiku.Dataset("historical_ts_data_normalised").get_dataframe().set_index('date')

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Distinct values of the 'category' column, and how many there are
# ==============================================================================
subcategories = pd.unique(data["category"])
subcategories_nb = len(subcategories)
print("number of subcategories :", subcategories_nb)
print("names of subcategories :", subcategories)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Build the horizon labels "H1" .. "H<horizon_nb>"
# ==============================================================================
# horizon_nb is read from the DSS custom variables and cast to int.
horizon_nb = int(dataiku.get_custom_variables()["horizon_nb"])
horizon_list = ["H" + str(step) for step in range(1, horizon_nb + 1)]

print(horizon_list)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Min datapoints used in timeseries
# ==============================================================================
# Number of rows of a category's history at which the forecasting loop starts
# (the loop iterates from min_datapoints up to the length of the series).
# The original note gives 3 as the minimum needed value; deriving it from
# horizon_nb (e.g. horizon_nb - 2) or exposing it as a parameter was considered
# but is not enabled.

min_datapoints = 3

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Store not forecasted data
# ==============================================================================
# The forecasting loop starts at the `min_datapoints`-th row of each category,
# so the first `min_datapoints` rows of every category never receive a forecast.
# They are taken per category: slicing the first
# min_datapoints * subcategories_nb rows of the date-sorted frame is only
# correct when every category has a row on each of the earliest dates.
# GroupBy.head keeps the rows in the (date-sorted) order of the input frame.
not_forecasted_data = data.sort_values(by=["date"])
not_forecasted_data = not_forecasted_data.groupby("category", sort=False).head(min_datapoints)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Empty table that will receive one (model description, category) row per category
# ==============================================================================
model_parameter_df = pd.DataFrame(
    columns=["model", "category"],
)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Build auto arima models for each category over the horizon number
# ==============================================================================
# For every category the ARIMA order is searched once on the full history with
# auto_arima. The series is then walked forward: for each end position j
# (starting at min_datapoints) the model is re-fitted on the rows [k:j] and
# asked for the next `horizon_nb` periods. When a fit fails, the oldest rows
# are dropped one at a time (k = 0, 1, ...) until a fit succeeds.
warnings.filterwarnings('ignore') # ignore warnings (applies to the whole process)

forecasts_frames = [] # one frame per (category, reference date); concatenated once at the end

for i, subcategory in enumerate(subcategories):
    print(subcategory)
    # keep this category's actual_value column only, sorted by date
    data_input = data[data['category'] == subcategory][['actual_value']]
    data_input = data_input.sort_values(by=["date"])
    # identify the ARIMA parameters (p and q capped at 2) and fit the model
    model = auto_arima(data_input.actual_value, max_p=2, max_q=2)
    model_parameter_df.loc[i] = [str(model), subcategory] # keep track of models parameters
    for j in range(min_datapoints, len(data_input) + 1):
        # Reset per iteration so a failed fit can never reuse the forecasts of
        # the previous reference date (or hit an undefined name on the first one).
        forecasts = None
        last_error = None
        for k in range(j):
            try:
                forecasts = model.fit_predict(data_input[k:j], n_periods=horizon_nb)
                break
            except Exception as e:
                # best effort: remember the failure and retry on a shorter window
                last_error = e
        if forecasts is None:
            # every training window ending at row j failed: skip this reference point
            print("no forecast for", subcategory, "- all training windows ending at row", j, "failed:", last_error)
            continue
        forecasts_df = pd.DataFrame(forecasts, columns=["forecasts"]) # store forecasts in dataset
        forecasts_df['horizon'] = horizon_list # add horizons flag
        forecasts_df['category'] = str(subcategory) # add subcategories flag
        # reference date = date (index value) of the last row used for training
        forecasts_df['ref_date'] = data_input.index[j - 1]
        forecasts_frames.append(forecasts_df)

# A single concat avoids re-copying the accumulated results on every iteration;
# with nothing to concatenate the result is an empty dataframe, as before.
forecasts_results = pd.concat(forecasts_frames) if forecasts_frames else pd.DataFrame()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Results
# ==============================================================================
# Bare expression: shows the forecasts dataframe as cell output in a notebook;
# it has no effect when the file is executed as a plain script.
forecasts_results

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Models parameters
# ==============================================================================
# Bare expression: shows the model parameters dataframe as cell output in a
# notebook; it has no effect when the file is executed as a plain script.
model_parameter_df

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Move the index into a regular column
# ==============================================================================
# An unnamed index comes out of reset_index as a column called 'index', which
# is then renamed to 'date'.
forecasts_results = forecasts_results.reset_index().rename(columns={'index': 'date'})
forecasts_results

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write the forecasts to the "forecasted_ts_features" output dataset
# ==============================================================================
forecasted_ts_features = dataiku.Dataset("forecasted_ts_features") # output dataset handle
forecasted_ts_features_df = forecasts_results # same dataframe object, written as is
forecasted_ts_features.write_with_schema(forecasted_ts_features_df)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: MARKDOWN
# ## Model parameters

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write model parameters as an output
# ==============================================================================
# model_parameter_df holds one row per category: the string form of the fitted
# model and the category it was fitted on.
arima_models_parameters = dataiku.Dataset("arima_models_parameters")
arima_models_parameters.write_with_schema(model_parameter_df)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: MARKDOWN
# ## Not forecasted data

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Move the index into a regular column and fix the output column order
# ==============================================================================
# An unnamed index comes out of reset_index as a column called 'index', which
# is then renamed to 'date'.
not_forecasted_data = not_forecasted_data.reset_index().rename(columns={'index': 'date'})
not_forecasted_data = not_forecasted_data[[
    "date",
    "category",
    "actual_value_non_normalised",
    "manual_forecast",
    "actual_value",
]]

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Drop manual_forecast column (disabled: the drop below is commented out)
# ==============================================================================
#not_forecasted_data = not_forecasted_data.drop(columns=['manual_forecast'])

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write not forecasted data as an output
# ==============================================================================
# Writes the not_forecasted_data dataframe to the "not_ts_forecasted_data"
# DSS dataset.
not_ts_forecasted_data = dataiku.Dataset("not_ts_forecasted_data")
not_ts_forecasted_data.write_with_schema(not_forecasted_data)