# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import logging

import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
from dku_utils.folders.pickles.folder_pickles import write_pickle_in_managed_folder
from dku_utils.core import get_current_project_and_variables

# Fetch the current project and its variables through the dku_utils helper.
# `project` is passed to write_pickle_in_managed_folder when the documents are
# written out; `variables` is not used in the visible part of this recipe.
project, variables = get_current_project_and_variables()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Read recipe inputs
document_meta = dataiku.Dataset("input_metadata")
document_meta_df = document_meta.get_dataframe()
# Join key for the OCR dataset: everything before the FIRST dot of file_name.
# NOTE(review): a name such as "report.v2.pdf" becomes "report"; confirm this
# matches how the OCR dataset's `file` column is produced before changing it.
document_meta_df['file_name_without_extension'] = [file_name.split('.')[0] for file_name in document_meta_df['file_name']]

# The two datasets below are optional inputs. When one cannot be read we fall
# back to an EMPTY frame that still carries the columns the later cells rely on
# (the merge keys and `text`); a column-less DataFrame would make pd.merge raise
# KeyError on the join key and leave `text_x` / `text_y` undefined.
try:
    document_database = dataiku.Dataset("document_database")
    document_database_df = document_database.get_dataframe()
except Exception:
    # Best-effort: keep going without this input, but record why it was skipped.
    logging.warning("Could not read dataset 'document_database'; continuing without it", exc_info=True)
    document_database_df = pd.DataFrame(columns=['file_name', 'text'])

try:
    ocr_dataset = dataiku.Dataset("ocr_dataset")
    ocr_dataset_df = ocr_dataset.get_dataframe()
except Exception:
    # Best-effort: keep going without this input, but record why it was skipped.
    logging.warning("Could not read dataset 'ocr_dataset'; continuing without it", exc_info=True)
    ocr_dataset_df = pd.DataFrame(columns=['file', 'text'])

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Left-join the metadata with the document database on file_name so every
# metadata row is kept even when the database has no entry for it.
all_documents_df = pd.merge(document_meta_df, document_database_df, how='left', on='file_name')

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Left-join the OCR rows, matching the extension-less file name against `file`.
# Both sides carry a `text` column here, so pandas suffixes them as
# `text_x` (left side) and `text_y` (OCR side).
all_documents_df = pd.merge(all_documents_df, ocr_dataset_df, how='left', left_on='file_name_without_extension', right_on='file')

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Prefer the left-side text and fall back to the OCR text where it is missing.
# A single fillna replaces the former chained assignment
# (df['text'][mask] = ...), which is not guaranteed to write into the frame.
all_documents_df['text'] = all_documents_df['text_x'].fillna(all_documents_df['text_y'])

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Drop the helper/join columns now that `text` has been consolidated.
all_documents_df.drop(['text_x', 'text_y', 'file', 'file_name_without_extension'], axis=1, inplace=True)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Turn every merged row into a plain dict and hand it to the pickle helper,
# together with the project, the "all_documents" identifier and the row's
# file_name.
row_dicts = (row.to_dict() for _, row in all_documents_df.iterrows())
for record in row_dicts:
    write_pickle_in_managed_folder(project, "all_documents", record, record['file_name'])