# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
import fitz
import os
import uuid

# Read recipe inputs: the two managed folders and their metadata.
searchable = dataiku.Folder("1Jon6Y5u")
searchable_info = searchable.get_info()

error = dataiku.Folder("epzn88Ed")
error_info = error.get_info()

# Three independent, empty frames sharing the same initial column layout.
_document_columns = ['file_name', 'document_type', 'text']
documents_processed_df, documents_errored_df, documents_nonsearchable_df = (
    pd.DataFrame(columns=_document_columns) for _ in range(3)
)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Paths listed by the searchable folder, and their base names (directory part dropped).
paths = searchable.list_paths_in_partition()
filenames = list(map(os.path.basename, paths))

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
def _extract_document_text(file_path):
    """Return the text of every page of the document at *file_path*, concatenated."""
    doc = fitz.open(file_path)
    try:
        # Newer PyMuPDF releases expose ``Page.get_text`` and deprecate the
        # camelCase ``Page.getText``; prefer the new name, fall back to the
        # old one so the code runs on either version.
        return "".join(
            (getattr(page, "get_text", None) or page.getText)()
            for page in doc
        )
    finally:
        # Always release the document handle, even if extraction fails.
        doc.close()


def process_searchable(documents_processed_df, searchable):
    """Fill ``documents_processed_df`` in place with the text of each file.

    For every path listed by ``searchable.list_paths_in_partition()`` one row
    is written at integer label 0, 1, 2, ... with:

    * ``file_name``     -- base name of the path,
    * ``document_type`` -- the constant ``"searchable_pdf"``,
    * ``text``          -- concatenated text of all pages,
    * ``uid``           -- the file name up to (not including) its first
      ``"."``; the whole name when it contains no dot.

    The ``uid`` column is created on first write if the frame lacks it.

    Args:
        documents_processed_df: pandas DataFrame that is mutated in place.
        searchable: folder-like object providing ``list_paths_in_partition()``
            and ``file_path(name)``.
            NOTE(review): Dataiku's ``Folder.file_path`` is understood to work
            only for folders stored on the local filesystem -- confirm.

    Returns:
        None. The result is the mutation of ``documents_processed_df``.
    """
    paths = searchable.list_paths_in_partition()
    for row, path in enumerate(paths):
        file = os.path.basename(path)
        # Extract first so that a failing file leaves no partial row behind.
        text = _extract_document_text(searchable.file_path(file))
        documents_processed_df.loc[row, 'file_name'] = file
        documents_processed_df.loc[row, 'document_type'] = "searchable_pdf"
        documents_processed_df.loc[row, 'text'] = text
        # partition() keeps the whole name when there is no dot, whereas
        # slicing with find() == -1 would silently drop the last character.
        documents_processed_df.loc[row, 'uid'] = file.partition(".")[0]
    return None


def process_error(documents_errored_df, error):
    """Fill ``documents_errored_df`` in place with one row per errored file.

    For every path listed by ``error.list_paths_in_partition()`` one row is
    written at integer label 0, 1, 2, ... with:

    * ``file_name``     -- base name of the path,
    * ``document_type`` -- the constant ``"error"``,
    * ``text``          -- the constant ``"N/A"``,
    * ``uid``           -- the file name up to (not including) its first
      ``"."``; the whole name when it contains no dot.

    The ``uid`` column is created on first write if the frame lacks it.

    Args:
        documents_errored_df: pandas DataFrame that is mutated in place.
        error: folder-like object providing ``list_paths_in_partition()``.

    Returns:
        None. The result is the mutation of ``documents_errored_df``.
    """
    paths = error.list_paths_in_partition()
    for row, path in enumerate(paths):
        file = os.path.basename(path)
        documents_errored_df.loc[row, 'file_name'] = file
        documents_errored_df.loc[row, 'document_type'] = "error"
        documents_errored_df.loc[row, 'text'] = "N/A"
        # partition() keeps the whole name when there is no dot, whereas
        # slicing with find() == -1 would silently drop the last character.
        documents_errored_df.loc[row, 'uid'] = file.partition(".")[0]
    return None

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Populate both frames in place from their respective folders.
process_searchable(documents_processed_df, searchable)
process_error(documents_errored_df, error)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent and, like append, keeps each frame's own index labels.
concat_df = pd.concat([documents_processed_df, documents_errored_df])

# Write recipe outputs
documents_processed = dataiku.Dataset("document_database")
documents_processed.write_with_schema(concat_df)
