# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
#%config Completer.use_jedi = False

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
from pandas.api.types import is_numeric_dtype

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Read recipe inputs
workflow = dataiku.Dataset("workflow_parsed")
workflow_df = workflow.get_dataframe()

# Project variables of the current project hold the recipe configuration.
project = dataiku.api_client().get_project(dataiku.default_project_key())
variables = project.get_variables()

# column names
# Every setting below is read from the 'standard' section of the variables.
_standard_variables = variables['standard']

case_column = _standard_variables['case']
activity_column = _standard_variables['activity']
timestamp_column = _standard_variables['timestamp']

# Optional columns: each flag tells whether the matching column is used.
use_end_timestamp = _standard_variables['use_end_timestamp']
end_timestamp_column = _standard_variables['end_timestamp']
use_sorting = _standard_variables['use_sorting']
sorting_column = _standard_variables['sorting']

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Map the user-configured column names onto the fixed names used by the rest
# of this recipe: 'case', 'activity', 'timestamp', 'end_timestamp', 'sorting'.
workflow_df.rename(columns={case_column: 'case',
                            activity_column: 'activity',
                            timestamp_column: 'timestamp'},
                   inplace=True)

if use_end_timestamp:
    workflow_df.rename(columns={end_timestamp_column: 'end_timestamp'},
                       inplace=True)
    # Rows without an end timestamp fall back to their start timestamp.
    # The result is assigned back to the frame: calling fillna(inplace=True)
    # on the selected column acts on an intermediate object and is not
    # guaranteed to update workflow_df (it no longer does under pandas
    # copy-on-write).
    workflow_df['end_timestamp'] = workflow_df['end_timestamp'].fillna(workflow_df['timestamp'])
else:
    # No end timestamp configured: every event ends when it starts.
    workflow_df['end_timestamp'] = workflow_df['timestamp']

if use_sorting:
    workflow_df.rename(columns={sorting_column: 'sorting'},
                       inplace=True)
else:
    # No sorting column configured: order events by their timestamp.
    workflow_df['sorting'] = workflow_df['timestamp']

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# From here on the frame is handled under the name filtered_workflow.
# This is an alias of workflow_df, not a copy.
filtered_workflow = workflow_df

# Parse the start timestamps as timezone-aware UTC datetimes.
# NOTE(review): infer_datetime_format is deprecated as of pandas 2.0 (strict
# format inference is the default there); kept so parsing on older pandas
# versions is unchanged.
filtered_workflow['timestamp'] = pd.to_datetime(filtered_workflow['timestamp'], infer_datetime_format=True, utc=True)

# Start time as seconds since the Unix epoch. The column is already parsed, so
# it is cast directly; 'int64' is spelled out because plain `int` is only
# 32 bits wide on some platforms, which cannot hold the integer timestamps.
# Assumes the datetimes are stored with nanosecond resolution (hence 10**9).
filtered_workflow['epoch_timestamp'] = filtered_workflow['timestamp'].astype('int64') / 10**9
if not use_sorting:
    # Without a configured sorting column, events are ordered by epoch seconds.
    filtered_workflow['sorting'] = filtered_workflow['epoch_timestamp']

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Parse the end timestamps as timezone-aware UTC datetimes.
# NOTE(review): infer_datetime_format is deprecated as of pandas 2.0 (strict
# format inference is the default there); kept so parsing on older pandas
# versions is unchanged.
filtered_workflow['end_timestamp'] = pd.to_datetime(filtered_workflow['end_timestamp'], infer_datetime_format=True, utc=True)

# End time as seconds since the Unix epoch. The column is already parsed, so
# it is cast directly; 'int64' is spelled out because plain `int` is only
# 32 bits wide on some platforms, which cannot hold the integer timestamps.
# Assumes the datetimes are stored with nanosecond resolution (hence 10**9).
filtered_workflow['epoch_end_timestamp'] = filtered_workflow['end_timestamp'].astype('int64') / 10**9

# Per-case aggregations: earliest start, latest end, and both extremes of the
# sorting key. The key order matters, because the aggregated columns are
# renamed by position afterwards.
aggs = {'timestamp': 'min',
        'epoch_timestamp': 'min',
        'end_timestamp': 'max',
        'epoch_end_timestamp': 'max',
        'sorting': ['min', 'max']}

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# One row per case holding the aggregates requested in `aggs`.
start_ends = (
    filtered_workflow
    .groupby('case')
    .agg(aggs)
    .reset_index()
)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# The aggregation produces a two-level column header; replace it with flat
# names, listed in the same order as the entries of `aggs`.
start_ends.columns = [
    'case',
    'min', 'epoch_min',
    'max', 'epoch_max',
    'sorting_min', 'sorting_max',
]

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
def _boundary_events(bounds, which, label, shift):
    """Build one synthetic boundary event per case.

    Args:
        bounds: per-case aggregate frame with the columns 'case', '<which>',
            'epoch_<which>' and 'sorting_<which>'.
        which: 'min' or 'max', selecting which aggregate columns to use.
        label: value written to the 'activity' column of every row.
        shift: offset added to the timestamp (in seconds), to the epoch
            timestamp and to the sorting key; negative places the event
            before the selected aggregate, positive after it.

    Returns:
        A new frame with the columns 'case', 'timestamp', 'epoch_timestamp',
        'sorting', 'activity', 'end_timestamp', 'epoch_end_timestamp'.
    """
    # Explicit copy: the columns added below must not be written onto a slice
    # of `bounds` (that is what triggers pandas' SettingWithCopyWarning).
    events = bounds[['case', which, 'epoch_' + which, 'sorting_' + which]].copy()
    events['activity'] = label
    events.columns = ['case', 'timestamp', 'epoch_timestamp', 'sorting', 'activity']
    events['timestamp'] = events['timestamp'] + pd.Timedelta(seconds=shift)
    events['sorting'] = events['sorting'] + shift
    events['epoch_timestamp'] = events['epoch_timestamp'] + shift
    # Boundary events have no duration: they end when they start.
    events['end_timestamp'] = events['timestamp']
    events['epoch_end_timestamp'] = events['epoch_timestamp']
    return events


# A 'START' event one second / one sorting unit before each case's minimum.
starts = _boundary_events(start_ends, 'min', 'START', -1)

# An 'END' event one second / one sorting unit after each case's maximum.
ends = _boundary_events(start_ends, 'max', 'END', 1)


# Stack the synthetic START rows, the real events and the synthetic END rows.
filtered_workflow = pd.concat([starts, filtered_workflow, ends])

# Duration of each event in seconds (end epoch minus start epoch).
filtered_workflow['process_time'] = filtered_workflow['epoch_end_timestamp'].sub(
    filtered_workflow['epoch_timestamp']
)

# Order events inside each case by the sorting key, then measure the gap to
# the previous event of the same case. The first event of a case has no
# predecessor; its missing gap is filled with the non-zero placeholder 10000.
filtered_workflow = filtered_workflow.sort_values(by=['case', 'sorting'])
sorting_gaps = filtered_workflow.groupby(['case'])['sorting'].diff()
filtered_workflow['diff_sorting'] = sorting_gaps.fillna(10000)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Cases containing at least one pair of consecutive events with the same
# sorting value (a gap of exactly 0).
has_zero_gap = filtered_workflow['diff_sorting'] == 0
concurrent_activity_cases = filtered_workflow.loc[has_zero_gap, 'case'].unique()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Split the events in two: all rows of the flagged cases are set aside, the
# remaining rows are kept. The helper gap column is dropped from the kept rows
# only.
in_concurrent_case = filtered_workflow['case'].isin(concurrent_activity_cases)
filtered_out_workflow = filtered_workflow[in_concurrent_case]
filtered_workflow = filtered_workflow[~in_concurrent_case].drop(columns=['diff_sorting'])

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs
# The case identifier is converted to a string in the kept events before
# they are written.
filtered_workflow['case'] = filtered_workflow['case'].apply(str)
workflow_clean = dataiku.Dataset("workflow_clean")
workflow_clean.write_with_schema(filtered_workflow)

# All rows belonging to the cases that were set aside.
workflow_filtered_out = dataiku.Dataset("dropped_cases")
workflow_filtered_out.write_with_schema(filtered_out_workflow)

# Single-row dataset holding the number of distinct cases that were set aside.
dropped_cases_number = dataiku.Dataset("dropped_cases_number")
distinct_dropped_cases = filtered_out_workflow['case'].unique()
dropped_cases_number.write_with_schema(
    pd.DataFrame(data={'nb_cases': [len(distinct_dropped_cases)]})
)