# -*- coding: utf-8 -*-
import dataiku
from dotenv import load_dotenv
import warnings
import datamol as dm
import os
import pandas as pd, numpy as np


# Import the predeveloped python functions from the project Libraries
# ==================================================================================================================
from molecular_property_prediction import pIC50_bioactivity, molecular_featurizer, get_MACCS_keys, molecular_descriptors, tsne_function, rule_of_five

# Runtime configuration
# ==================================================================================================================
# Load variables from a .env file (if any) before the settings below are applied.
load_dotenv()

# Ignore Python warnings in this process and export the same preference, together with the
# tokenizer parallelism switch, through the process environment.
warnings.simplefilter('ignore')
os.environ.update({
    'PYTHONWARNINGS': 'ignore',
    "TOKENIZERS_PARALLELISM": 'false',
})

# Turn off RDKit logging through datamol.
dm.disable_rdkit_log()

# Read recipe inputs
# ==================================================================================================================
molecules_bioactivity = dataiku.Dataset("molecule_bioactivity")
molecules_bioactivity_df = molecules_bioactivity.get_dataframe()

# Keep only the molecules that have a SMILES string
# ==================================================================================================================
# Everything computed further down (features, t-SNE coordinates, descriptors) is derived from
# the SMILES values and joined back onto this dataframe by index. Dropping missing values from
# the SMILES series alone would leave gaps in its index, while frames built from the computed
# values get a default 0..n-1 index, so the joins would attach results to the wrong molecules.
# Filtering the dataframe itself and renumbering it keeps every join aligned row by row.
molecules_bioactivity_df = (
    molecules_bioactivity_df
    .dropna(subset=['canonical_smiles'])
    .reset_index(drop=True)
)

# Set the right parameters for featurization
# ==================================================================================================================
smiles = molecules_bioactivity_df['canonical_smiles']
smiles_list = smiles.to_list()

# ## Featurize the canonical smiles expression with a transformer that vectorizes the canonical structures into numerical features
'''
1. The user should specify the transformer type ('MACCS', 'ecfp', 'mordred' or 'ChemBERTa') in the project variables
2. Call the featurizer function with the 'canonical_smiles' values and the transformer_type as parameters
'''
# Project variables
# ==================================================================================================================
# The featurizer is chosen through the "transformer_type" project variable; its name also
# becomes the prefix of the generated feature columns.
transformer_type = dataiku.get_custom_variables()["transformer_type"]
prefix = transformer_type + '_'
molecular_features = molecular_featurizer(smiles, transformer_type)

# Replace any NaN values with 0 to avoid modeling errors
# ==================================================================================================================
# Each feature vector is patched in place.
for feature_vector in molecular_features:
    nan_mask = np.isnan(feature_vector)
    feature_vector[nan_mask] = 0

# Create a dataframe with the vector of molecular features and join back to the original dataset
# ==================================================================================================================
# One column per feature, named "<transformer_type>_<position>".
features_df = pd.DataFrame(molecular_features).add_prefix(prefix)
output_dataset = molecules_bioactivity_df.join(features_df)

# ## Use TSNE (t-Distributed Stochastic Neighbor Embedding) for dimensionality reduction for visualising and clustering molecules in 2D
# ==================================================================================================================
# tsne_function receives the number of feature vectors and the vectors stacked into one array.
n_molecules = len(molecular_features)
feature_matrix = np.array(molecular_features)
tsne_array = tsne_function(n_molecules, feature_matrix)

# NOTE(review): despite its name, tsne_array has to be something DataFrame.join accepts
# (a DataFrame or a named Series) -- confirm against tsne_function.
tsne_dataframe = molecules_bioactivity_df.join(tsne_array)

# ## Construct a table with molecular descriptor features
# ==================================================================================================================
descriptors_names = [
    'MolWt',
    'MolLogP',
    'NumHAcceptors',
    'NumHDonors',
    'NumRotatableBonds',
    'NumHeteroatoms',
    'NumAromaticRings',
    'RingCount',
    'HeavyAtomCount',
    'TPSA',
]
molecular_desc = molecular_descriptors(smiles_list, descriptors_names)

# The descriptor table is labelled with the SMILES column first, then the requested
# descriptors, then 'QED'.
columns = ["canonical_smiles", *descriptors_names, 'QED']
descriptors_dataframe = pd.DataFrame(molecular_desc, columns=columns)

# The input dataframe already has 'canonical_smiles', so that column is left out of the join.
descriptor_values = descriptors_dataframe.drop(columns='canonical_smiles')
descriptors_df_output = molecules_bioactivity_df.join(descriptor_values)


# ## Apply Lipinskis rule of five
# ==================================================================================================================
def _lipinski_flag(row):
    """Return rule_of_five's result for one row of descriptor values."""
    return rule_of_five(
        row['MolWt'],
        row['MolLogP'],
        row['NumHAcceptors'],
        row['NumHDonors'],
    )


# Evaluated row by row; the result is stored in the 'Lipinskis_rule' column.
descriptors_df_output['Lipinskis_rule'] = descriptors_df_output.apply(_lipinski_flag, axis=1)

# Output datasets
# ==================================================================================================================
columns_to_drop = ['canonical_smiles', 'standard_value', 'bioactivity_class', 'inactive_IC50', 'active_IC50']

# Training table: the descriptor table (SMILES included) next to the feature dataset with the
# columns listed above removed.
model_inputs = output_dataset.drop(columns=columns_to_drop)
train_dataset = descriptors_dataframe.join(model_inputs)

# Properties table: 'pIC50' and 'molecule_id' are removed from the t-SNE frame as well before
# its remaining columns are appended to the descriptor output.
columns_to_drop.extend(['pIC50', 'molecule_id'])
tsne_columns = tsne_dataframe.drop(columns=columns_to_drop)
descriptors_df_output = descriptors_df_output.join(tsne_columns)

# Write recipe outputs
# ==================================================================================================================
# Descriptor / Lipinski / t-SNE table.
molecular_properties = dataiku.Dataset("molecular_properties")
molecular_properties.write_with_schema(descriptors_df_output)

# Training table. The handle gets its own name so that it no longer overwrites
# `molecular_features`, which holds the feature vectors computed earlier in the recipe.
train_dataset_output = dataiku.Dataset("train_dataset")
train_dataset_output.write_with_schema(train_dataset)
