# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
import os
import json
from datetime import datetime

import dataiku
from dataikuapi.dss.modelevaluationstore import DSSModelEvaluationStore

import numpy as np
import mlflow
from mlflow.metrics import genai, make_metric
from mlflow.metrics.base import MetricValue, standard_aggregations
import evaluate
import tiktoken

# Tokenizer that tiktoken associates with gpt-3.5-turbo; used below to count tokens.
enc = tiktoken.encoding_for_model("gpt-3.5-turbo")


def get_num_tokens(s):
    """Return the number of tokens `s` is encoded into by the module-level `enc`."""
    token_ids = enc.encode(s)
    return len(token_ids)


# Identifiers of the Dataiku objects used by this notebook, and the model
# URI handed to the mlflow GenAI metrics.
EVALUATION_STORE_ID = "a3nnPHvq"
EXPERIMENT_TRACKING_FOLDER_ID = "UZu8GFBh"
MODEL = "openai:/gpt-3.5-turbo"

# A single API client is reused for the secret lookup and the project handle.
client = dataiku.api_client()

# Copy the "openai_key" user secret (when present) into OPENAI_API_KEY.
auth_info = client.get_auth_info(with_secrets=True)
for secret in auth_info["secrets"]:
    if secret["key"] == "openai_key":
        os.environ["OPENAI_API_KEY"] = secret["value"]
        break

# Point mlflow at the managed folder used for experiment tracking and select
# the experiment that the evaluation runs are logged under.
EXPERIMENT_NAME = "question_answering"
project = client.get_default_project()
mlflow_handle = project.setup_mlflow(
    project.get_managed_folder(EXPERIMENT_TRACKING_FOLDER_ID)
)
mlflow.set_experiment(EXPERIMENT_NAME)

df = dataiku.Dataset("answers_stacked").get_dataframe()

# "context" holds a JSON-encoded list of strings; render it as a bulleted
# text block. Whole columns are assigned at once (instead of cell by cell
# with `df.at`) so the derived columns exist even for an empty dataset and
# the token counts keep an integer dtype.
df["context"] = df["context"].apply(
    lambda raw: "- " + "\n\n- ".join(json.loads(raw))
)
df["num_tokens_context"] = df["context"].apply(get_num_tokens)
df["num_tokens_answer"] = df["generated_answer"].apply(get_num_tokens)

# Columns computed here whose per-approach mean is reported next to the
# mlflow metrics.
already_computed_metrics = ["num_tokens_answer", "num_tokens_context"]

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# BERTScore implementation loaded through the `evaluate` library.
bertscore = evaluate.load("bertscore", keep_in_memory=True)


def bert_score_fn(predictions, targets, metrics):
    """Compute per-row BERTScore F1 values and wrap them for mlflow.

    `predictions` are scored against `targets` (passed as references) with
    the "distilbert-base-uncased" model. `metrics` is accepted but not used.

    Returns a `MetricValue` holding the per-row F1 scores together with
    their `standard_aggregations`.
    """
    bert_output = bertscore.compute(
        predictions=predictions,
        references=targets,
        model_type="distilbert-base-uncased",
    )
    f1_scores = bert_output["f1"]
    aggregates = standard_aggregations(f1_scores)
    return MetricValue(scores=f1_scores, aggregate_results=aggregates)


# Custom mlflow metric backed by `bert_score_fn`; higher values are better.
bert_score = make_metric(
    name="BERT score",
    eval_fn=bert_score_fn,
    greater_is_better=True,
)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# The evaluation store does not depend on the approach, so resolve it once
# instead of on every iteration.
mes = (
    dataiku.api_client()
    .get_default_project()
    .get_model_evaluation_store(EVALUATION_STORE_ID)
)

# Evaluate each approach in its own mlflow run, then publish its mean metrics
# as a custom model evaluation in the Dataiku evaluation store.
for approach in set(df.approach):
    # Rows belonging to the current approach; reused for mlflow and for the
    # token-count averages below.
    approach_df = df[df.approach == approach]

    with mlflow.start_run() as run:
        results = mlflow.evaluate(
            data=approach_df,
            targets="reference_answer",
            predictions="generated_answer",
            extra_metrics=[
                bert_score,
                genai.answer_correctness(model=MODEL),
                genai.answer_relevance(model=MODEL),
                genai.relevance(model=MODEL),
                genai.faithfulness(model=MODEL),
            ],
            custom_artifacts=[lambda x, y, z: {approach: {"name": approach}}],
            evaluators="default",
            evaluator_config={"col_mapping": {"inputs": "question"}},
        )

    scores = []

    # Only the "/mean" aggregates are recorded. Previously the append ran for
    # every metric key, so a non-mean aggregate was stored under the code of
    # the preceding mean metric (or raised NameError if it came first).
    for metric, value in results.metrics.items():
        if "/mean" not in metric:
            continue
        scores.append(
            DSSModelEvaluationStore.MetricDefinition(
                code=metric.split("/mean")[0],
                value=value,
                name=metric,
                description=metric,
            )
        )

    # Add the per-approach mean of the token counts computed on the dataframe.
    for metric in already_computed_metrics:
        scores.append(
            DSSModelEvaluationStore.MetricDefinition(
                code=metric,
                value=np.mean(approach_df[metric]),
                name=metric,
                description=metric,
            )
        )

    # Label the evaluation with the (local, naive) time at which it was stored.
    eval_timestamp = datetime.now().isoformat()
    label_date = DSSModelEvaluationStore.LabelDefinition(
        "evaluation:date", eval_timestamp
    )
    mes.add_custom_model_evaluation(scores, name=approach, labels=[label_date])
