# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
import dataiku.insights

# Read recipe inputs
# ==================================================================================================================
# Dataset sinlge_molecule renamed to single_molecule by georgia.kouyialis@dataiku.com on 2025-02-05 14:38:55

# Handles on the two input datasets of this recipe
scored_molecules = dataiku.Dataset("single_molecule")
molecular_features = dataiku.Dataset("train_dataset")

# Load both inputs fully into pandas DataFrames
scored_molecules_df = scored_molecules.get_dataframe()
molecular_features_df = molecular_features.get_dataframe()

# Import the predeveloped python functions from the project Libraries
# ==================================================================================================================
from molecular_similarity import calculate_tanimoto_similarity, compute_fingerprints, molecule_graph

# Select the novel molecule with the highest predicted pIC50 and the 200 studied molecules with the highest
# measured pIC50; similarity is computed between these two selections further down
# ==================================================================================================================
new_molecule_columns = ['molecule_id', 'canonical_smiles', 'pIC50_prediction']
studied_molecule_columns = ['canonical_smiles', 'molecule_id']

best_predicted_rows = scored_molecules_df.nlargest(1, 'pIC50_prediction')
top_scored_new_molecules = best_predicted_rows[new_molecule_columns]

most_active_rows = molecular_features_df.nlargest(200, 'pIC50')
top_scored_studied_molecules = most_active_rows[studied_molecule_columns]

# Calculate molecule fingerprints with vectors for graph input
# (df1 holds the studied molecules, df2 the new molecule)
# ==================================================================================================================
df1 = compute_fingerprints(top_scored_studied_molecules)
df2 = compute_fingerprints(top_scored_new_molecules)

# Apply the Tanimoto algorithm to compute similarities between the new molecules and the ones that have been
# studied for the target protein. The Tanimoto coefficient is the ratio of the number of features common to
# both molecules to the total number of features
# ==================================================================================================================
similarity_columns = [
    "new_molecules",
    "new_molecule_id",
    "pIC50_prediction",
    "studied_molecules",
    "studied_molecule_id",
    "similarity_score",
    "molecular_vectors",
]

# Materialise the new-molecule rows once instead of re-running iterrows() for every studied molecule
new_molecule_rows = [row for _, row in df2.iterrows()]

# Collect one record per (studied molecule, new molecule) pair and build the frame in a single call.
# DataFrame.append was removed in pandas 2.0, and it copied the whole frame on every call, which made the
# original loop quadratic in the number of pairs.
similarity_records = []
for _, row1 in df1.iterrows():
    for row2 in new_molecule_rows:
        similarity_records.append({
            "new_molecules": row2["canonical_smiles"],
            "new_molecule_id": row2["molecule_id"],
            "pIC50_prediction": row2["pIC50_prediction"],
            "studied_molecules": row1["canonical_smiles"],
            "studied_molecule_id": row1["molecule_id"],
            "similarity_score": calculate_tanimoto_similarity(row1["fingerprint"], row2["fingerprint"]),
            "molecular_vectors": row1["mol_vectors"],
        })

# Passing `columns` keeps the expected schema (and column order) even when there are no pairs at all
similarity_dataset = pd.DataFrame(similarity_records, columns=similarity_columns)

# Remove rows that are exact duplicates of an earlier row
# ==================================================================================================================
similarity_dataset = similarity_dataset.drop_duplicates()

# Rank the studied molecules of each new molecule by similarity score (rank 1 = most similar; dense ranking,
# so equal scores share a rank). The top-6 selection itself is applied later, when the plots are generated.
# ==================================================================================================================
scores_per_new_molecule = similarity_dataset.groupby("new_molecules")["similarity_score"]
similarity_dataset["rank"] = scores_per_new_molecule.rank(method="dense", ascending=False)

# Write recipe outputs: the full ranked similarity table goes to the "molecular_similarity" dataset
# ==================================================================================================================
molecular_similarity = dataiku.Dataset("molecular_similarity")
molecular_similarity.write_with_schema(similarity_dataset)

# Generate, for each selected molecule, the plot that visualises its 6 most similar studied molecules
# ==================================================================================================================
# Keep only the best-ranked rows. A stable sort (mergesort) is used so that rows sharing a rank keep their
# original relative order; with dense ranking ties are possible, and an unstable sort could change which rows
# survive the 6-row cut below from one run to the next.
similarity_dataset = (
    similarity_dataset[similarity_dataset['rank'] <= 6]
    .reset_index(drop=True)
    .sort_values('rank', kind='mergesort')
)

# Distinct new molecules in order of first appearance. A set does not guarantee iteration order, which made
# the numbered insight ids below non-reproducible when more than one molecule is selected.
selected_mols = similarity_dataset['new_molecules'].unique()

# Defined up front so the final display expression does not raise NameError when nothing was selected
figure_mol = None
for i, mol in enumerate(selected_mols, start=1):
    # At most 6 rows per molecule, best rank first
    most_similar_mols = similarity_dataset[similarity_dataset['new_molecules'] == mol].head(6)

    # The prediction belongs to the new molecule, so it is the same on every row; read it from the first one
    score_mols = most_similar_mols['pIC50_prediction'].iloc[0]
    legends_mols = most_similar_mols['studied_molecule_id'].tolist()
    similarity_score_mols = most_similar_mols['similarity_score'].tolist()
    vectors_mols = most_similar_mols['molecular_vectors'].tolist()

    figure_mol = molecule_graph(vectors_mols, mol, score_mols, legends_mols, similarity_score_mols)
    dataiku.insights.save_figure(id=str(i) + "Molecular_Similarity", figure=figure_mol)

# Display the last generated figure when the code is run interactively (e.g. in a notebook)
figure_mol
