# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
from real_estate_pricing.flow.constants import PARIS_DISTRICTS

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
from sklearn.metrics import mean_absolute_error

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
def compute_mean_absolute_percentage_error(true_values, predictions):
    """Return the mean absolute percentage error (MAPE) as a fraction.

    Each pair contributes ``|value - prediction| / |value|`` and the result is
    the arithmetic mean of those ratios (``0.1`` means 10 %).

    Args:
        true_values: Iterable of observed numeric values.
        predictions: Iterable of predicted numeric values, paired positionally
            with ``true_values``. If the lengths differ, the surplus items of
            the longer iterable are ignored (``zip`` semantics).

    Returns:
        The mean of the per-pair ratios, or NaN when there are no pairs.
        A true value of zero makes its ratio ``inf`` (or NaN when the
        prediction is also zero), which propagates to the result.
    """
    # Pair through zip so any iterables are accepted and mismatched lengths
    # truncate; the reshape keeps an (n, 2) layout even when n == 0.
    pairs = np.array(list(zip(true_values, predictions)), dtype=float).reshape(-1, 2)
    if pairs.shape[0] == 0:
        return np.nan
    actual, predicted = pairs[:, 0], pairs[:, 1]
    # Divide by |actual| so that negative observations still yield a
    # non-negative error.
    return np.mean(np.abs(actual - predicted) / np.abs(actual))

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Recipe input: open the time-series dataset handle, then load its full
# content into a pandas DataFrame.
square_meter_price_time_series = dataiku.Dataset(
    "square_meter_price_monthly_time_series_windows"
)
square_meter_price_time_series_df = square_meter_price_time_series.get_dataframe()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Fraction of each district's date-ordered measures used to fit the trend;
# the remaining, most recent measures are held out for evaluation.
TRAIN_TEST_SPLIT_RATIO = 0.8
district_dataframes = []
district_metrics = []
for district in PARIS_DISTRICTS:
    # Keep only this district's rows; copy so the columns added below never
    # touch the input dataframe.
    district_df = square_meter_price_time_series_df[
        square_meter_price_time_series_df["census_district"] == district
    ].copy()

    n_measures = len(district_df)
    train_size = int(TRAIN_TEST_SPLIT_RATIO * n_measures)
    if train_size < 1:
        # Fail early with context: an empty district would otherwise break on
        # the polygon lookup below, and an empty train set cannot be fitted.
        raise ValueError(
            "District {!r} has {} measure(s), which leaves an empty train set "
            "with TRAIN_TEST_SPLIT_RATIO={}.".format(
                district, n_measures, TRAIN_TEST_SPLIT_RATIO
            )
        )

    # The first row's polygon (in input order) stands for the whole district
    # -- presumably it is constant within a district; verify upstream.
    district_polygon = district_df["district_polygon"].iloc[0]

    district_df = district_df.sort_values(by="date", ascending=True)

    # Defining train and test sets :
    # rows are numbered 0..n-1 in date order; the first `train_size` rows
    # form the train set, the rest the test set.
    district_df["date_number"] = range(n_measures)
    day_numbers = district_df["date_number"].values
    is_train = day_numbers < train_size
    district_df["measure_set"] = np.where(is_train, "train", "test")

    # Creating train and test data variables :
    prices = district_df["living_surface_square_meter_price_avg"].values
    train_days, train_prices = day_numbers[is_train], prices[is_train]
    test_days, test_prices = day_numbers[~is_train], prices[~is_train]

    # Fitting a degree-1 polynomial (ordinary least squares) on the train set;
    # np.polyfit returns coefficients from highest degree to lowest.
    slope, intercept = np.polyfit(train_days, train_prices, deg=1)
    district_df["square_meter_price_slope"] = slope
    district_df["square_meter_price_intercept"] = intercept

    # Applying slope and intercept learned on train, to both sets :
    train_predictions = intercept + slope * train_days
    test_predictions = intercept + slope * test_days

    # Computing metrics (mean absolute error and mean absolute percentage error) :
    district_metrics.append({
        "census_district": district,
        "mae_train": mean_absolute_error(train_prices, train_predictions),
        "district_polygon": district_polygon,
        "mae_test": mean_absolute_error(test_prices, test_predictions),
        "mape_train": compute_mean_absolute_percentage_error(train_prices, train_predictions),
        "mape_test": compute_mean_absolute_percentage_error(test_prices, test_predictions),
    })

    # Merging predictions : train rows come first in the sorted frame, so the
    # concatenation lines up with the rows positionally.
    district_df["prediction"] = np.concatenate([train_predictions, test_predictions])

    district_dataframes.append(district_df)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Tabulate the per-district evaluation metrics (one row per district).
district_metrics_df = pd.DataFrame(district_metrics)
# Bare expression: displays the table when run as a notebook cell.
district_metrics_df

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Stack the per-district frames into a single forecasts table.
square_meter_price_forecasts_df = pd.concat(district_dataframes)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Row-level errors: signed (actual - predicted), absolute, and absolute
# relative to the actual price.
actual_prices = square_meter_price_forecasts_df["living_surface_square_meter_price_avg"]
residuals = actual_prices - square_meter_price_forecasts_df["prediction"]
square_meter_price_forecasts_df["prediction_error"] = residuals
square_meter_price_forecasts_df["prediction_absolute_error"] = residuals.abs()
square_meter_price_forecasts_df["prediction_absolute_percentage_error"] = (
    residuals / actual_prices
).abs()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Recipe outputs: open both dataset handles, then write the row-level
# forecasts followed by the per-district metrics (schema taken from each
# dataframe).
square_meter_price_forecasts = dataiku.Dataset("square_meter_price_forecasts")
square_meter_price_forecast_metrics = dataiku.Dataset("square_meter_price_forecast_metrics")

square_meter_price_forecasts.write_with_schema(square_meter_price_forecasts_df)
square_meter_price_forecast_metrics.write_with_schema(district_metrics_df)