# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
import warnings
import os
import datamol as dm
from dotenv import load_dotenv

# Import the predeveloped python functions from the project Libraries
# ==================================================================================================================
from molecular_property_prediction import molecular_featurizer, get_MACCS_keys, molecular_descriptors, rule_of_five
# Read the project-level custom variables once, up front, so that a missing variable fails the recipe
# before any data is loaded or featurized.
# ==================================================================================================================
custom_variables = dataiku.get_custom_variables()
target_accession = custom_variables['accession_protein_code']
transformer_type = custom_variables["transformer_type"]

# Load environment variables and silence warnings / RDKit logging
# ==================================================================================================================
load_dotenv()
warnings.simplefilter('ignore')
os.environ['PYTHONWARNINGS'] = 'ignore'
os.environ["TOKENIZERS_PARALLELISM"] = 'false'
dm.disable_rdkit_log()

# Read recipe inputs
# ==================================================================================================================
compounds_to_score = dataiku.Dataset("test_data_distinct")
compounds_to_score_df = compounds_to_score.get_dataframe()
# Rows with a missing value in ANY column are discarded. The index is rebuilt right away: dropna() leaves
# gaps in the index labels, and the index-based join further down would otherwise pair feature rows with
# the wrong compounds (or with nothing at all).
compounds_to_score_df = compounds_to_score_df.dropna().reset_index(drop=True)

# Set the right parameters for featurization
# ==================================================================================================================
# No extra dropna() is needed here: the frame above no longer contains missing values.
new_smiles = compounds_to_score_df['canonical_smiles']

# Featurize the new canonical SMILES with the same transformer_type used for the regression model
# ==================================================================================================================
prefix = transformer_type + '_'
# NOTE(review): assumes molecular_featurizer returns one feature row per input SMILES, in input order
# - confirm against the project library.
molecular_features_vector = molecular_featurizer(new_smiles, transformer_type)
new_features_df = pd.DataFrame(molecular_features_vector).add_prefix(prefix)
# DataFrame.join aligns on index labels; both sides carry matching 0..n-1 labels thanks to the reset above.
output_dataset = compounds_to_score_df.join(new_features_df)

# Remove any records with empty values
# ==================================================================================================================
# A null first feature column marks a compound for which no feature row was joined.
has_features = output_dataset[prefix + '0'].notnull()
# Rebuild the index again (this also yields an independent frame rather than a filtered slice), so the
# descriptor columns assigned below line up row by row whether they arrive as an array or as a DataFrame.
output_dataset = output_dataset[has_features].reset_index(drop=True)
new_compounds_list = output_dataset['canonical_smiles']

# Get molecular descriptors from RDKit
# ==================================================================================================================
descriptors_names = ['MolWt', 'MolLogP', 'NumHAcceptors', 'NumHDonors', 'NumRotatableBonds', 'NumHeteroatoms',
                     'NumAromaticRings', 'RingCount', 'HeavyAtomCount', 'TPSA']
# NOTE(review): molecular_descriptors is expected to return, per compound, the SMILES followed by the
# requested descriptors and a trailing QED value - confirm against the project library.
molecular_desc = molecular_descriptors(new_compounds_list, descriptors_names)
columns = ["canon_smiles"] + descriptors_names + ['QED']
output_dataset[columns] = molecular_desc
# The SMILES column returned with the descriptors is not kept in the output.
output_dataset = output_dataset.drop(['canon_smiles'], axis=1)

# Compute Lipinski's rule
# ==================================================================================================================
output_dataset['Lipinskis_rule'] = output_dataset.apply(
    lambda row: rule_of_five(row['MolWt'], row['MolLogP'], row['NumHAcceptors'], row['NumHDonors']),
    axis=1,
)
# Tag every scored compound with the target protein accession code taken from the project variables.
output_dataset['target_protein'] = target_accession

# Write recipe outputs
# ==================================================================================================================
molecules_to_score = dataiku.Dataset("molecules_to_score")
molecules_to_score.write_with_schema(output_dataset)
