# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
from dku_utils import (get_managed_folder_info,
                       get_managed_folder_id_with_folder_name,
                       write_pickle_in_dss_folder)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
from pandas_utils import nest_dataframe_data_to_key_column

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
from real_estate_pricing.geographic_handling.relationship.inclusion import PolygonsIndexer
from real_estate_pricing.geographic_handling.formating.multipolygons import read_multipolygon_string_as_list

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Recipe input: census records, loaded whole as a pandas DataFrame.
census_data_prepared = dataiku.Dataset("census_data_with_district_polygons")
census_data_prepared_df = census_data_prepared.get_dataframe()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Recipe input: district-level census data, joined onto the indexed rows further down.
census_data_by_districts_prepared = dataiku.Dataset("census_data_by_districts_prepared")
census_data_by_districts_prepared_df = census_data_by_districts_prepared.get_dataframe()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# The current project key is read from the DSS custom variables.
custom_variables = dataiku.get_custom_variables()
project_key = custom_variables["projectKey"]

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Managed folder that receives every pickle written by this recipe.
census_data_indexed_folder_id = get_managed_folder_id_with_folder_name(
    project_key,
    "census_data_indexing",
)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# np.unique returns the distinct census years in sorted order; the list is
# pickled under the name "census_years" in the managed folder.
distinct_census_years = np.unique(census_data_prepared_df["census_year"])
census_years = list(distinct_census_years)
write_pickle_in_dss_folder(census_years, "census_years", census_data_indexed_folder_id)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Column order of the indexed output: the per-year polygon index comes first,
# followed by every column of the prepared census data in its original order.
# No placeholder `census_data_indexed_df` is created here: that frame is built
# by concatenating the per-year frames once they have all been processed.
census_data_prepared_columns = list(census_data_prepared_df.columns)
census_data_indexed_columns = ["census_polygon_index"] + census_data_prepared_columns

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Parsing options forwarded unchanged to read_multipolygon_string_as_list
# for every census polygon.
reverse_coordinates = True
census_polygons_format = "wkt"
try_to_union_all_polygons = True

# One frame per census year; they are concatenated after the loop.
census_data_indexed_dfs = []
for year in census_years:
    print("Handling census data from year '{}'.".format(year))
    filtered_census_df = census_data_prepared_df[census_data_prepared_df["census_year"]==year].copy()
    # The polygon index is the row position within the year, i.e. the position
    # of the row's polygon in the lists handed to PolygonsIndexer below.
    filtered_census_df["census_polygon_index"] = range(len(filtered_census_df))

    # Each call returns a (polygon, polygon type) pair; parse them all in row
    # order, then split the pairs into the two parallel lists.
    parsed_census_polygons = [
        read_multipolygon_string_as_list(census_polygon,
                                         census_polygons_format,
                                         reverse_coordinates,
                                         try_to_union_all_polygons)
        for census_polygon in filtered_census_df["census_polygon"]
    ]
    list_polygons_collection = [list_polygon for list_polygon, _ in parsed_census_polygons]
    list_polygons_types = [list_polygon_type for _, list_polygon_type in parsed_census_polygons]

    # One indexer is pickled per year, named after that year.
    census_polygons_indexer = PolygonsIndexer(list_polygons_collection, list_polygons_types)
    write_pickle_in_dss_folder(census_polygons_indexer,
                               "census_data_indexer_year_{}".format(year),
                               census_data_indexed_folder_id)
    census_data_indexed_dfs.append(filtered_census_df)
census_data_indexed_df = pd.concat(census_data_indexed_dfs)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Reorder the columns so that "census_polygon_index" comes first (the order is
# fixed by census_data_indexed_columns), then cast the census year to int.
census_data_indexed_df = census_data_indexed_df.loc[:, census_data_indexed_columns]
census_data_indexed_df = census_data_indexed_df.astype({"census_year": int})

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Remove the district polygon column in place before the join, so it is not
# carried into the indexed output.
census_data_by_districts_prepared_df.drop(labels=["district_polygon"], axis=1, inplace=True)

# Left join on "census_district": every indexed census row is kept, with the
# district-level columns attached where the district matches.
census_data_indexed_df = pd.merge(
    census_data_indexed_df,
    census_data_by_districts_prepared_df,
    how="left",
    on="census_district",
)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Only the rows of the most recent census year feed the metadata dictionary.
census_last_year = np.max(census_years)
is_last_census_year = census_data_indexed_df["census_year"] == census_last_year
census_df_to_index = census_data_indexed_df.loc[is_last_census_year].copy()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Nest the last-year rows under their "census_polygon_index" key.
dict_census_metadata = nest_dataframe_data_to_key_column(
    census_df_to_index,
    "census_polygon_index",
)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Pickle the metadata into the same managed folder as the per-year indexers.
write_pickle_in_dss_folder(
    dict_census_metadata,
    "dict_census_metadata",
    census_data_indexed_folder_id,
)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Recipe output: the indexed census data for all years, written with its schema.
census_data_indexed = dataiku.Dataset("census_data_indexed")
census_data_indexed.write_with_schema(census_data_indexed_df)