# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
from sklearn.cluster import DBSCAN

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Read recipe inputs
# Only the dataset handle is created here. The rows themselves are loaded in
# the next cell (where the 'target' column is dropped), so loading them here
# as well would read the whole dataset twice for no benefit.
facilities_distinct_prepared = dataiku.Dataset("facilities_distinct_prepared")


# Write recipe outputs
embed_cluster = dataiku.Dataset("facilities_id_lookup")
# The schema is written up front because the rows are streamed to the output
# through a writer later on. Every output column is declared as a string.
schema = [
    {'name': 'Location_facility', 'type': 'string'},
    {'name': 'Location_facility_normalized', 'type': 'string'},
    {'name': 'GeoZip', 'type': 'string'},
    {'name': 'Cluster', 'type': 'string'},
    {'name': 'Facility_ID', 'type': 'string'},
]
embed_cluster.write_schema(schema)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Load the input rows. 'target' is not one of the output columns, so it is
# dropped right away.
facilities_distinct_prepared_df = facilities_distinct_prepared.get_dataframe().drop(columns=['target'])
# Clustering is run separately for each distinct GeoZip value.
geo_id = facilities_distinct_prepared_df['GeoZip'].unique()

# Cosine-distance DBSCAN: two samples are enough to form a cluster.
# DBSCAN exposes no random_state parameter, so no seed is configured here.
dbscan = DBSCAN(min_samples=2, eps=0.025, metric="cosine")

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch

# Tokenizer and encoder are both loaded from the same pretrained checkpoint.
_checkpoint = "DataikuNLP/paraphrase-multilingual-MiniLM-L12-v2"
tokenizer = AutoTokenizer.from_pretrained(_checkpoint)
model = AutoModel.from_pretrained(_checkpoint)

def embed_text(df, columns, batch_size=32, device='cuda' if torch.cuda.is_available() else 'cpu'):
    """Embed the text of the given DataFrame columns with the module-level model.

    Each column is tokenized and run through the model in batches of
    ``batch_size`` rows. The token states of every row are averaged into a
    single vector, counting only real (non-padding) tokens.

    Args:
        df: DataFrame holding the text columns.
        columns: Iterable of column names in ``df`` to embed.
        batch_size: Number of rows sent through the model per forward pass.
        device: Torch device used for the forward pass.

    Returns:
        dict mapping each column name to a CPU tensor with one row per row of
        ``df`` and one column per hidden dimension of the model.
    """
    # The inputs are moved to `device` below, so the model has to live on the
    # same device; otherwise the forward pass fails on GPU hosts.
    model.to(device)

    embeddings = {}
    for column in columns:
        texts = list(df[column])
        pooled_batches = []

        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                # Tokenize one batch at a time so padding is only as long as
                # the longest text of that batch.
                encoded = tokenizer(texts[start:start + batch_size], padding=True, truncation=True,
                                    return_tensors='pt', max_length=128)
                encoded = {key: value.to(device) for key, value in encoded.items()}

                hidden = model(**encoded).last_hidden_state

                # Mean pooling weighted by the attention mask: padding
                # positions must not contribute, otherwise the embedding of a
                # text would depend on how long its neighbours are.
                mask = encoded['attention_mask'].unsqueeze(-1).to(hidden.dtype)
                summed = (hidden * mask).sum(dim=1)
                token_counts = mask.sum(dim=1).clamp(min=1e-9)

                # Hand results back on the CPU so callers can pass them to
                # NumPy / scikit-learn regardless of the compute device.
                pooled_batches.append((summed / token_counts).cpu())

        if pooled_batches:
            embeddings[column] = torch.cat(pooled_batches, dim=0)
        else:
            # No rows: return an empty matrix instead of failing in torch.cat.
            embeddings[column] = torch.empty((0, model.config.hidden_size))

    return embeddings

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
def get_cluster(facility_df, geopoint, clustering_model):
    """Cluster the facilities of one GeoZip by name embedding and assign IDs.

    Args:
        facility_df: DataFrame with at least the 'GeoZip' and
            'Location_facility_normalized' columns.
        geopoint: GeoZip value whose rows are processed.
        clustering_model: Estimator exposing ``fit_predict`` (the recipe
            passes a DBSCAN instance).

    Returns:
        tuple ``(values, n_ids)``: the rows of the GeoZip as a NumPy array,
        with 'Cluster' and 'Facility_ID' appended as the last two columns,
        and the number of distinct Facility_ID values among them.
    """
    df = facility_df[facility_df['GeoZip'] == geopoint]

    if len(df) > 1:
        site_name_embed = embed_text(df, ['Location_facility_normalized'])['Location_facility_normalized']
        # scikit-learn works on NumPy arrays; a torch tensor that lives on a
        # GPU (or tracks gradients) cannot be converted implicitly.
        if hasattr(site_name_embed, 'detach'):
            site_name_embed = site_name_embed.detach().cpu().numpy()

        cluster_labels = clustering_model.fit_predict(site_name_embed)
        df = df.assign(Cluster=cluster_labels)
    else:
        # Zero or one facility: nothing to cluster, mark it as unclustered.
        df = df.assign(Cluster=-1)

    df = format_id(df)
    # NOTE(review): the array carries no column names; the caller labels the
    # columns positionally, so it relies on the column order of facility_df.
    return np.array(df), df['Facility_ID'].nunique(dropna=False)


def format_id(df):
    """Turn numeric cluster labels into string labels and build Facility_ID.

    Rows whose 'Cluster' is not -1 get their label converted to a string and
    zero-padded to five characters. Rows labelled -1 are each given their own
    label instead: a running number per 'GeoZip', starting at 10000, in row
    order. 'Facility_ID' is then '<GeoZip>-id-<Cluster>'.

    Returns a new DataFrame with the clustered rows first, followed by the
    -1 rows; the original index values are kept.
    """
    clustered = df[df['Cluster'].ne(-1)]
    noise = df[df['Cluster'].eq(-1)]

    # Clustered rows: label -> zero-padded string.
    padded_labels = clustered['Cluster'].astype('str').str.zfill(5)
    clustered = clustered.assign(Cluster=padded_labels)

    # Unclustered rows: one unique label per row within its GeoZip.
    running_number = noise.groupby('GeoZip')['Location_facility_normalized'].cumcount()
    noise = noise.assign(Cluster=(running_number + 10000).astype('str'))

    combined = pd.concat([clustered, noise])
    facility_ids = combined['GeoZip'] + '-id-' + combined['Cluster'].astype('str')
    return combined.assign(Facility_ID=facility_ids)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Cluster one GeoZip at a time and stream each result to the output dataset.
output_columns = ['Location_facility', 'Location_facility_normalized', 'GeoZip', 'Cluster', 'Facility_ID']
with embed_cluster.get_writer() as writer:
    for geo in geo_id:
        rows, n_facility_ids = get_cluster(facilities_distinct_prepared_df, geo, dbscan)
        # get_cluster returns a bare array, so the column names are restored here.
        df = pd.DataFrame(rows, columns=output_columns)
        writer.write_dataframe(df)
        print(f'{geo} has labels: ', n_facility_ids)
        print(f'wrote {len(df)} rows')

