# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu

# Read recipe inputs
# reference_process: its 'variant_rank' column lists the ranks of the variants
# used as references further down in this recipe.
reference_process = dataiku.Dataset("reference_process")
reference_process_df = reference_process.get_dataframe()
# workflow_with_variants: read below through its 'case' and 'variant_rank' columns.
workflow_with_variants = dataiku.Dataset("workflow_with_variants")
workflow_with_variants_df = workflow_with_variants.get_dataframe()
# variants_id: maps a 'variant_rank' to its 'variant', a comma-separated string
# that is split into a list of items before being compared.
variants_id = dataiku.Dataset("variants_id")
variants_id_df = variants_id.get_dataframe()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
def global_alignment_score(seq1, seq2, match=1, mismatch=-1, gap=-1):
    """Return the global (Needleman-Wunsch) alignment score of two sequences.

    Args:
        seq1: First sequence (string, list, ...). Items are compared with ``==``.
        seq2: Second sequence.
        match: Score added when two aligned items are equal.
        mismatch: Score added when two aligned items differ.
        gap: Score added for every item aligned against a gap.

    Returns:
        The best achievable total score over all global alignments. Aligning a
        sequence against an empty one scores ``len(sequence) * gap``; two empty
        sequences score 0.
    """
    # Each cell only depends on the previous row and on the cell to its left,
    # so two rows are enough; the full (len(seq1) + 1) x (len(seq2) + 1)
    # matrix never needs to be materialised.
    previous_row = [j * gap for j in range(len(seq2) + 1)]

    for i, item1 in enumerate(seq1, start=1):
        # First column: the first i items of seq1 aligned against gaps only.
        current_row = [i * gap]
        for j, item2 in enumerate(seq2, start=1):
            substitution = match if item1 == item2 else mismatch
            current_row.append(max(previous_row[j - 1] + substitution,  # align both items
                                   previous_row[j] + gap,               # gap in seq2
                                   current_row[j - 1] + gap))           # gap in seq1
        previous_row = current_row

    # The last cell of the last row is the bottom-right cell of the full matrix.
    return previous_row[-1]

def normalized_score(seq1, seq2, match=1, mismatch=-1, gap=-1):
    """Return the global alignment score divided by the longer sequence's length.

    With the default scores (match=1, mismatch=-1, gap=-1) the result lies in
    [-1, 1]: 1 for identical sequences, -1 when nothing can be matched. The
    divisor is a length, not a score, so other ``match`` values are not
    rescaled to that range.

    Args:
        seq1: First sequence.
        seq2: Second sequence.
        match: Score for two equal aligned items.
        mismatch: Score for two different aligned items.
        gap: Score for an item aligned against a gap.

    Returns:
        The normalized score as a float. Two empty sequences are treated as
        identical and return 1.0.
    """
    longest_length = max(len(seq1), len(seq2))

    if longest_length == 0:
        # Both sequences are empty: dividing by the length would raise
        # ZeroDivisionError, and two empty sequences are trivially identical.
        return 1.0

    alignment_score = global_alignment_score(seq1, seq2, match, mismatch, gap)

    return alignment_score / longest_length

def calculate_similarity(sequence1, sequence2):
    """Return the similarity of two sequences.

    Delegates to ``normalized_score`` using its default match, mismatch and
    gap scores.
    """
    similarity = normalized_score(sequence1, sequence2)
    return similarity

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Item lists of the reference variants, in the same order as the rows of
# reference_process_df.
reference_variants = []
known_ranks = variants_id_df['variant_rank']

for reference_id in reference_process_df['variant_rank']:
    # First variants_id row carrying this rank; its 'variant' is a
    # comma-separated string.
    matching_variants = variants_id_df.loc[known_ranks == reference_id, 'variant']
    reference_variants.append(matching_variants.iloc[0].split(','))

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# For every variant rank, the list of its similarity to each reference variant
# (one entry per row of reference_process_df, in that order). Despite the name,
# higher values mean "closer to the reference".
variant_distances = dict()

# Resolve rank -> variant string once instead of re-filtering variants_id_df
# for every (variant, reference) pair. setdefault keeps the first row seen for
# a rank, which is the row the previous `.iloc[0]` lookup selected.
variant_sequences = {}
for rank, variant in zip(variants_id_df['variant_rank'], variants_id_df['variant']):
    variant_sequences.setdefault(rank, variant)

for variant_id in variants_id_df['variant_rank']:
    variant_distances[variant_id] = []
    # Split lazily and at most once per variant: the sequence does not depend
    # on the reference it is compared with.
    seq1 = None
    for i, reference_id in enumerate(reference_process_df['variant_rank']):
        if reference_id == variant_id:
            # A reference compared with itself is a perfect match.
            variant_distances[variant_id].append(1)
        else:
            if seq1 is None:
                seq1 = variant_sequences[variant_id].split(',')
            seq2 = reference_variants[i]
            variant_distances[variant_id].append(calculate_similarity(seq1, seq2))

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Keep, for each variant, its best similarity over all reference variants.
distance_to_reference = {
    rank: max(similarities)
    for rank, similarities in variant_distances.items()
}

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Split the mapping into two parallel tuples: the ranks and their best score.
trace_ids, distances = zip(*distance_to_reference.items())

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
distance_to_reference_df = pd.DataFrame(
    {'variant_rank': trace_ids, 'conformance': distances}
)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# One row per case, holding the last (non-null) variant_rank seen for that case.
workflow_variant_id = (
    workflow_with_variants_df
    .groupby('case')['variant_rank']
    .last()
    .reset_index()
)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Left join: every case is kept, even when its variant has no score.
conformance_checks_df = pd.merge(
    workflow_variant_id,
    distance_to_reference_df,
    how='left',
    on='variant_rank',
)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Rescale the score with (x + 1) / 2, which maps [-1, 1] onto [0, 1].
conformance_checks_df['conformance'] = conformance_checks_df['conformance'].add(1).div(2)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Flag a trace as conforming only when its rescaled score is exactly 1.
conformance_checks_df['trace_is_conform'] = [
    int(score == 1) for score in conformance_checks_df['conformance']
]

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs
# Convert every case identifier to its str() form before writing
# (presumably so the output column is stored as text - confirm against the
# expected schema of "conformance_checks").
conformance_checks_df['case'] = conformance_checks_df['case'].apply(str)
conformance_checks = dataiku.Dataset("conformance_checks")
# Writes the dataframe to the output dataset, setting the dataset schema from it.
conformance_checks.write_with_schema(conformance_checks_df)