# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu


# Rename table applied when the input carries "clientEvent."-prefixed columns.
# Each nested source column is mapped to the flat column name written out by
# this recipe.
_EVENT_SERVER_USAGE_PREFIX = 'clientEvent.computeResourceUsage.'

# Fields found under clientEvent.computeResourceUsage, in mapping order.
# Unless listed in the overrides below, the flat name is the field path with
# every '.' replaced by '_' (e.g. 'context.jobId' -> 'context_jobId').
_EVENT_SERVER_USAGE_FIELDS = (
    'context.activityId',
    'context.analysisId',
    'context.authIdentifier',
    'context.jobId',
    'context.jupyterNotebookId',
    'context.mlTaskId',
    'context.projectKey',
    'context.recipeName',
    'context.type',
    'context.webappId',
    'context.datasetName',
    'endTime',
    'id',
    'localProcess.commandName',
    'localProcess.cpuCurrent',
    'localProcess.cpuTotalMS',
    'localProcess.vmRSSMB',
    'localProcess.vmRSSTotalMBS',
    'singleK8SJob.executionId',
    'sqlConnection.connection',
    'sqlQuery.connection',
    'sqlQuery.query',
    'startTime',
    'llmUsage.connection',
    'llmUsage.llmType',
    'llmUsage.usageType',
    'llmUsage.totalQueries',
    'llmUsage.cacheHitQueries',
    'llmUsage.cacheMissQueries',
    'llmUsage.totalPromptTokens',
    'llmUsage.totalCompletionTokens',
    'llmUsage.totalComputationTimeMS',
    'llmUsage.estimatedCostUSD',
    'type',
)

# Fields whose flat name does not follow the '.' -> '_' rule.
_EVENT_SERVER_USAGE_OVERRIDES = {'id': 'contextId'}

mapping_eventServer = {'clientEvent.clusterId': 'clusterId'}
mapping_eventServer.update(
    (
        _EVENT_SERVER_USAGE_PREFIX + usage_field,
        _EVENT_SERVER_USAGE_OVERRIDES.get(usage_field, usage_field.replace('.', '_')),
    )
    for usage_field in _EVENT_SERVER_USAGE_FIELDS
)
# Columns outside computeResourceUsage; note that 'podsStatus.pods' keeps its dot.
mapping_eventServer.update({
    'clientEvent.dssNodeId': 'dssNodeId',
    'clientEvent.dssNodeName': 'dssNodeName',
    'clientEvent.msgType': 'msgType',
    'clientEvent.podsStatus.pods': 'podsStatus.pods',
    'serverTimestamp': 'serverTimestamp',
})

# Rename table applied when the input carries "message."-prefixed columns.
# Each nested source column is mapped to the flat column name written out by
# this recipe; the top-level 'timestamp' column becomes 'serverTimestamp'.
_LOG4J_USAGE_PREFIX = 'message.computeResourceUsage.'

# Fields found under message.computeResourceUsage, in mapping order.
# Unless listed in the overrides below, the flat name is the field path with
# every '.' replaced by '_' (e.g. 'context.jobId' -> 'context_jobId').
_LOG4J_USAGE_FIELDS = (
    'context.activityId',
    'context.analysisId',
    'context.authIdentifier',
    'context.jobId',
    'context.jupyterNotebookId',
    'context.mlTaskId',
    'context.projectKey',
    'context.recipeName',
    'context.type',
    'context.webappId',
    'context.datasetName',
    'endTime',
    'id',
    'localProcess.commandName',
    'localProcess.cpuCurrent',
    'localProcess.cpuTotalMS',
    'localProcess.vmRSSMB',
    'localProcess.vmRSSTotalMBS',
    'singleK8SJob.executionId',
    'sqlConnection.connection',
    'sqlQuery.connection',
    'sqlQuery.query',
    'llmUsage.connection',
    'llmUsage.llmType',
    'llmUsage.usageType',
    'llmUsage.totalQueries',
    'llmUsage.cacheHitQueries',
    'llmUsage.cacheMissQueries',
    'llmUsage.totalPromptTokens',
    'llmUsage.totalCompletionTokens',
    'llmUsage.totalComputationTimeMS',
    'llmUsage.estimatedCostUSD',
    'startTime',
    'type',
)

# Fields whose flat name does not follow the '.' -> '_' rule.
_LOG4J_USAGE_OVERRIDES = {'id': 'contextId'}

mapping_log4j = {'message.clusterId': 'clusterId'}
mapping_log4j.update(
    (
        _LOG4J_USAGE_PREFIX + usage_field,
        _LOG4J_USAGE_OVERRIDES.get(usage_field, usage_field.replace('.', '_')),
    )
    for usage_field in _LOG4J_USAGE_FIELDS
)
# Columns outside computeResourceUsage; note that 'podsStatus.pods' keeps its dot.
mapping_log4j.update({
    'message.msgType': 'msgType',
    'message.podsStatus.pods': 'podsStatus.pods',
    'timestamp': 'serverTimestamp',
})

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE


# Read recipe inputs
compute_resource_usage_logs = dataiku.Dataset("compute_resource_usage_logs")
# NOTE(review): infer_with_pandas=False should make column types follow the
# dataset schema rather than pandas' own inference -- confirm against the
# Dataiku API documentation for the DSS version in use.
compute_resource_usage_logs_df = compute_resource_usage_logs.get_dataframe(infer_with_pandas=False)
columns = compute_resource_usage_logs_df.columns

# Normalise the column names to the flat layout the rest of the recipe works on.
# The presence of a prefixed msgType column tells which rename table applies.
if "clientEvent.msgType" in columns:
    compute_resource_usage_logs_df = compute_resource_usage_logs_df.rename(columns=mapping_eventServer)
elif "message.msgType" in columns:
    compute_resource_usage_logs_df = compute_resource_usage_logs_df.rename(columns=mapping_log4j)
elif "msgType" not in columns:
    # Neither known layout, and no already-flat msgType column either: the
    # msgType filter further down cannot work, so fail here with a clear
    # message instead of an AttributeError on the missing column.
    raise ValueError(
        "compute_resource_usage_logs has none of the columns 'clientEvent.msgType', "
        "'message.msgType' or 'msgType'; cannot identify the log layout. "
        "Columns found: %s" % list(columns)
    )


# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Keep only the rows whose msgType is one of the compute-resource-usage or
# Kubernetes cluster status message types listed here; all other rows are dropped.
_KEPT_MSG_TYPES = [
    "compute-resource-usage-start",
    "compute-resource-usage-complete",
    "compute-resource-usage-update",
    "kubernetes-cluster-usage-status",
]
_is_kept_row = compute_resource_usage_logs_df["msgType"].isin(_KEPT_MSG_TYPES)
compute_resource_usage_logs_df = compute_resource_usage_logs_df[_is_kept_row]

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Write recipe outputs
logs_clean = dataiku.Dataset("logs_clean")

# The output is the renamed and filtered frame itself; this binds a second
# name to the same DataFrame object, it does not copy it.
logs_clean_df = compute_resource_usage_logs_df
logs_clean.write_with_schema(logs_clean_df)
