# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
from process_mining.variants import get_variants

# Recipe input: load the whole "workflow_clean" dataset into a pandas DataFrame.
workflow_clean = dataiku.Dataset("workflow_clean")
workflow_clean_df = workflow_clean.get_dataframe()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Keep only the rows whose 'activity' is neither 'START' nor 'END'.
workflow_clean_df = workflow_clean_df.loc[
    lambda events: ~events['activity'].isin(['START', 'END'])
]

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Order the remaining rows by the 'sorting' column (ascending) before the
# variants are derived from them.
workflow_clean_df = workflow_clean_df.sort_values(by='sorting')

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Build the variants table from the filtered, sorted rows. Each value in its
# 'activity' column is indexed and joined below, so it is presumably an ordered
# sequence of activity names -- confirm against get_variants.
variants = get_variants(workflow_clean_df)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# First element of each activity sequence.
variants['activity_start'] = variants['activity'].map(lambda trace: trace[0])

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Last element of each activity sequence.
variants['activity_end'] = variants['activity'].map(lambda trace: trace[-1])

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Collapse each sequence into a single comma-separated string.
variants['activity'] = variants['activity'].map(','.join)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Relabel the columns by position. This only works when the frame has exactly
# four columns at this point.
# NOTE(review): assumes get_variants returned [case id, 'activity'] in that
# order, so the joined activity string ends up under 'variant' -- confirm.
variants.columns = [
    'case',
    'variant',
    'activity_start',
    'activity_end',
]

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Recipe output: the "workflow_variants" dataset.
workflow_variants = dataiku.Dataset("workflow_variants")

# Convert every 'case' value to its string form before writing.
variants = variants.assign(case=variants['case'].apply(str))

# Write the rows together with the schema taken from the DataFrame.
workflow_variants.write_with_schema(variants)