import dataiku
from dataiku import pandasutils as pdu
import pandas as pd

from dku_utils.projects.recipes.recipe_commons import switch_recipe_engine

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Read the "test_output" dataset as a Pandas dataframe in memory.
# Note: get_dataframe() is called without any sampling/limit argument, so no
# row cap is requested here. Other sampling options are available.
dataset_test_output = dataiku.Dataset("test_output")
df = dataset_test_output.get_dataframe()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Get some simple descriptive statistics.
# The return value is not stored, so this call does not influence the recipe
# output written below.
pdu.audit(df)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Restrict the dataframe to the four "*future_clv_cluster" columns whose value
# distributions are compared.
cluster_columns = [
    "future_clv_cluster",
    "prediction_future_clv_cluster",
    "ml_regression_future_clv_cluster",
    "lifetime_regression_future_clv_cluster",
]
clusters = df[cluster_columns]

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# For every selected column, count the occurrences of each distinct value.
# The result is indexed by the union of the values seen across the columns; a
# value that never occurs in a given column shows up as NaN in that column.
pandas_dataframe = clusters.apply(pd.Series.value_counts)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Recipe outputs
# reset_index() turns the index (the distinct values) into a regular column so
# that it is written to the output dataset together with the counts.
cluster_counts = dataiku.Dataset("cluster_counts")
cluster_counts.write_with_schema(pandas_dataframe.reset_index())