# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
from dku_utils.core import get_current_project_and_variables
from dku_utils.folders.pickles.folder_pickles import read_pickle_from_managed_folder

# Resolve the current project handle and its variables through the project helper.
project, variables = get_current_project_and_variables()

# Read recipe inputs
# Handle on the managed folder with id "WSwcnOo2"; its file paths are listed
# further down and each one is loaded as a pickled document.
category = dataiku.Folder("WSwcnOo2")

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
def get_count(environmental, social, governance):
    """Return the number of keys in each of the three category dictionaries.

    Args:
        environmental: Mapping of keywords for the environmental category.
        social: Mapping of keywords for the social category.
        governance: Mapping of keywords for the governance category.

    Returns:
        A 3-tuple ``(environmental_count, social_count, governance_count)``.
        If any argument does not expose a usable ``keys()`` (for example it is
        ``None``), ``(None, None, None)`` is returned instead.
    """
    try:
        return len(environmental.keys()), len(social.keys()), len(governance.keys())
    except (AttributeError, TypeError):
        # Best-effort: a missing or malformed category yields an all-None row
        # rather than aborting; unrelated errors (e.g. KeyboardInterrupt) are
        # no longer swallowed.
        return None, None, None

def get_counts(all_dict):
    """Map every key of ``all_dict`` to the number of keys in its nested dict.

    Args:
        all_dict: Dictionary whose values are themselves dictionaries.

    Returns:
        A new dictionary with the same keys as ``all_dict`` and, as values,
        the key count of the corresponding nested dictionary.
    """
    counts = {}
    for name in all_dict.keys():
        nested = all_dict[name]
        counts[name] = len(nested.keys())
    return counts
    
def get_average(environmental, social, governance):
    """Return the rounded mean of ``val[2]`` for each of the three categories.

    Each argument is a dictionary whose values are iterables of indexable
    records; the element at index 2 of every record is averaged. The mean is
    taken over ALL records of ALL keys in a category (previously only the
    records of the last key were used, and an empty category silently reused
    the previous category's values).

    NOTE: a later single-argument ``get_average`` in this file rebinds the
    name, so this three-argument version is not reachable by name once the
    whole file has been executed.

    Args:
        environmental: Keyword -> records mapping for the environmental category.
        social: Keyword -> records mapping for the social category.
        governance: Keyword -> records mapping for the governance category.

    Returns:
        A 3-tuple of means rounded to two decimals, in argument order, or
        ``(None, None, None)`` if any category is malformed or has no records.
    """
    try:
        avg_sent = []
        for cat_dict in (environmental, social, governance):
            # Flatten the records of every keyword before averaging.
            sentiments = [val[2] for values in cat_dict.values() for val in values]
            avg_sent.append(round(sum(sentiments) / len(sentiments), 2))
        return avg_sent[0], avg_sent[1], avg_sent[2]
    except (AttributeError, TypeError, IndexError, KeyError, ZeroDivisionError):
        # Best-effort contract kept from the original: any unusable category
        # makes the whole result None, but unrelated errors now propagate.
        return None, None, None

def get_average(cat_dict):
    """Return the mean of ``val[2]`` over every record in ``cat_dict``.

    ``cat_dict`` maps keys to iterables of indexable records; the element at
    index 2 of each record is averaged across ALL keys. (Previously the
    accumulator was reset for each key, so only the last key's records were
    averaged.)

    Args:
        cat_dict: Dictionary whose values are iterables of indexable records.

    Returns:
        The mean rounded to two decimals, or ``None`` when ``cat_dict``
        contains no records at all (previously this case raised).
    """
    sentiments = [val[2] for values in cat_dict.values() for val in values]
    if not sentiments:
        # Nothing to average: report a missing value instead of crashing.
        return None
    return round(sum(sentiments) / len(sentiments), 2)

def get_averages(all_dict):
    """Apply ``get_average`` to every nested dictionary of ``all_dict``.

    Args:
        all_dict: Dictionary whose values are passed one by one to
            ``get_average``.

    Returns:
        A new dictionary with the same keys as ``all_dict`` and the
        corresponding ``get_average`` result as values.
    """
    averages = {}
    for name in all_dict.keys():
        averages[name] = get_average(all_dict[name])
    return averages

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# List the file paths available in the input folder; each path is loaded as a
# pickled document in the loop that follows.
paths = category.list_paths_in_partition()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Build one output row per (document, category) pair. Rows are collected in a
# plain list and turned into a DataFrame once at the end: DataFrame.append was
# quadratic when called in a loop and no longer exists in pandas >= 2.0.
count_occurence_rows = []

for path in paths:
    document = read_pickle_from_managed_folder(project, 'category', path)
    category_dict = document['key_word_category_dict']
    count_dict = get_counts(category_dict)
    sentiment_dict = get_averages(category_dict)
    # The loop variable is deliberately NOT called `category`, so the
    # module-level folder handle of that name is not overwritten.
    for category_name in category_dict:
        count_occurence_rows.append({
            'file_name': document['file_name'],
            'date': document['date'],
            'name': document['name'],
            'Category': category_name,
            'Category_Count': count_dict[category_name],
            'Category_Sentiment': sentiment_dict[category_name],
        })

# Explicit column list keeps the column order stable and gives the frame its
# columns even when no document produced a row.
count_occurence_df = pd.DataFrame(
    count_occurence_rows,
    columns=['file_name', 'date', 'name',
             'Category', 'Category_Count', 'Category_Sentiment'],
)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs
# Write the accumulated per-document, per-category rows to the
# "count_occurences" dataset via write_with_schema.
count_occurence = dataiku.Dataset("count_occurences")
count_occurence.write_with_schema(count_occurence_df)