# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
from commons.dku_utils.datasets.dataset_commons import get_dataset_schema
from commons.dku_utils.core import get_current_project_and_variables

# Recipe inputs: open a handle on each input dataset first, then load both
# into memory through get_dataframe().
applications_joined = dataiku.Dataset("applications_train")
top_variables_all = dataiku.Dataset("applications_statistical_filter")

applications_joined_df = applications_joined.get_dataframe()
top_variables_all_df = top_variables_all.get_dataframe()

# Current project handle and its variables, as returned by the shared helper.
(project, variables) = get_current_project_and_variables()

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Names listed in the 'feature' column of the statistical-filter dataset.
kept_columns = list(top_variables_all_df['feature'])

# 'id' and 'credit_event' are always carried through. De-duplicate while
# preserving first-seen order, so a feature list that repeats a name (or that
# already contains one of the two mandatory columns) cannot produce duplicated
# columns in the frame while the schema below lists each name only once.
selected_columns = list(pd.Index(['id', 'credit_event'] + kept_columns).unique())

# Raises KeyError if a listed feature is absent from the input dataframe.
applications_filtered_df = applications_joined_df[selected_columns]

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
input_schema = get_dataset_schema(project, 'applications_train')

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Build the lookup set once instead of re-creating and scanning a list for
# every schema column.
selected_column_names = set(selected_columns)
output_schema = [column for column in input_schema if column['name'] in selected_column_names]

# Every selected column needs a schema entry; otherwise the frame and the
# explicit output schema would disagree when written with infer_schema=False.
schema_column_names = [column['name'] for column in output_schema]
columns_without_schema = selected_column_names.difference(schema_column_names)
if columns_without_schema:
    raise ValueError(
        "Columns missing from the 'applications_train' schema: %s"
        % sorted(columns_without_schema, key=str)
    )

# The filtered schema keeps the input dataset's column order, whereas the frame
# was built in selection order. Align the frame on the schema so both agree
# whether the writer matches columns by position or by name.
applications_filtered_df = applications_filtered_df[schema_column_names]

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Recipe output: publish the explicit schema first, then the rows.
applications_filtered = dataiku.Dataset("applications_initial_filter")
applications_filtered.write_schema(output_schema)

# infer_schema=False: the schema written above is used rather than one derived
# from the dataframe.
# NOTE(review): dropAndCreate may only take effect together with
# infer_schema=True - confirm against the Dataiku API docs.
write_options = {"infer_schema": False, "dropAndCreate": True}
applications_filtered.write_dataframe(applications_filtered_df, **write_options)