# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# -*- coding: utf-8 -*-
from dataiku import pandasutils as pdu
import dataiku
import pandas as pd, numpy as np
import re
import unicodedata


# from clinical_sites_intelligence.facility_harmonisation import preprocess_text

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# Recipe input: handle on the "facilities_goecode" dataset.
facilities_goecode = dataiku.Dataset("facilities_goecode")

# Recipe output.
# Dataset facilities_geocode_filtered renamed to facilities_geocode_map by liheng.fu@dataiku.com on 2024-03-13 12:48:38
facilities_geocode_filtered = dataiku.Dataset("facilities_geocode_map")

# Names of the output columns, in the order they are written.
output_schema = [
    'NCTId',
    'Location_zip',
    'Location_country',
    'Location_city',
    'Location_state',
    'Location_geoPoint',
    'Location_facility',
    'Location_contacts',
    'Location_status',
    'countryCode',
    'postalCode',
    'placeName',
    'adminName1',
    'adminCode1',
    'adminName2',
    'adminCode2',
    'adminName3',
    'adminCode3',
    'latitude',
    'longitude',
    'accuracy',
    'Country',
    'Location_city_norm',
    'placeName_norm',
    'Location_facility_normalized',
    'label',
]

# Every output column is declared with the 'string' type.
schema = [{'name': column, 'type': 'string'} for column in output_schema]
facilities_geocode_filtered.write_schema(schema)

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
def remove_parathesis(text):
    """Drop every parenthesised group together with the whitespace before it.

    A group runs from an opening "(" to the first ")" that follows it, so
    nested parentheses are only partially removed (the outer closing ")"
    is left behind).

    Args:
        text (str): The input text.

    Returns:
        str: The text without the matched groups.
    """
    parenthesised_group = re.compile(r'\s*\([^)]*\)')
    return parenthesised_group.sub('', text)


def remove_site_id(text):
    """Replace site-identifier fragments with a single space.

    A fragment is one or more non-word characters followed by "ID",
    "Site Number" or "Site" (case-insensitive), optional non-word
    characters, and then one or more digits. Because the leading non-word
    characters are required, an identifier at the very start of the text
    is left untouched.

    Args:
        text (str): The input text.

    Returns:
        str: The text with each matched fragment replaced by " ".
    """
    site_identifier = re.compile(r'(?i)\W+ID\W*\d+|\W+Site Number\W*\d+|\W+Site\W*\d+')
    return site_identifier.sub(' ', text)


def remove_special_characters(text, characters):
    """Replace each of the given characters in ``text``.

    A semicolon is replaced by ", "; every other listed character is
    replaced by a single space. Characters not listed are left as they are.

    Args:
        text (str): The input text.
        characters (iterable of str): Single characters to replace.

    Returns:
        str: The text with the listed characters substituted.
    """
    replacements = {
        symbol: (", " if symbol == ";" else " ")
        for symbol in characters
    }
    return text.translate(str.maketrans(replacements))


def clean_up_site_name(text):
    """Clean a facility/site name in three successive passes.

    The passes run in this order: parenthesised groups are removed, site
    identifier fragments are replaced, and finally the separator characters
    ``- / \\ & : ;`` are substituted.

    Args:
        text (str): The raw site name.

    Returns:
        str: The cleaned site name.
    """
    separators = ['-', '/', '\\', '&', ':', ';']
    without_site_ids = remove_site_id(remove_parathesis(text))
    return remove_special_characters(without_site_ids, separators)


def preprocess_text(text, to_unicode=True, to_lowercase=True):
    """
    Normalize text by optionally reducing it to plain ASCII and lowercasing it.

    Args:
        text (str): The input text.
        to_unicode (bool): If True, decompose the text with NFKD and drop every
                           character that cannot be encoded as ASCII. This strips
                           diacritics and discards remaining non-ASCII symbols;
                           it is not a full transliteration.
        to_lowercase (bool): If True, lowercase the text (applied after the
                             ASCII reduction).

    Returns:
        str: The processed text.

    Raises:
        AssertionError: If ``text`` is not a ``str``.
    """
    assert isinstance(text, str), f"{text} is {type(text)} type"

    result = text
    if to_unicode:
        # NFKD splits accented characters into a base character plus combining
        # marks; the ASCII round-trip then discards the marks and anything else
        # outside the ASCII range.
        decomposed = unicodedata.normalize('NFKD', result)
        result = decomposed.encode('ascii', 'ignore').decode('ascii')

    return result.lower() if to_lowercase else result

# -------------------------------------------------------------------------------- NOTEBOOK-CELL: CODE
# In some countries, a zip code maps to several geopoints (one-to-many).
# Most of these one-to-many relationships can be reduced to one-to-one by
# matching the facility's city name against the geocoded place name.
chunksize = 3000


def _map_non_null(series, func):
    """Apply ``func`` to every non-null element of ``series``; nulls pass through unchanged."""
    return series.apply(lambda value: func(value) if pd.notnull(value) else value)


def _city_match_label(row):
    """Label a row by whether its normalized city matches the geocoded place name.

    Returns:
        True when the normalized city is contained in the normalized place name,
        "UNMAPPED" when either name is null and ``postalCode`` is null as well,
        and False in every other case.
    """
    city = row['Location_city_norm']
    place = row['placeName_norm']
    if pd.notnull(city) and pd.notnull(place):
        # Substring containment, not equality: the city only has to appear
        # somewhere inside the place name.
        return city in place
    return "UNMAPPED" if pd.isnull(row['postalCode']) else False


with facilities_geocode_filtered.get_writer() as writer:
    for partial_df in facilities_goecode.iter_dataframes(chunksize=chunksize):
        # normalize city name
        partial_df = partial_df.assign(
            Location_city_norm=_map_non_null(partial_df['Location_city'], preprocess_text))
        partial_df = partial_df.assign(
            placeName_norm=_map_non_null(partial_df['placeName'], preprocess_text))

        # normalize site name
        partial_df = partial_df.assign(
            Location_facility_normalized=_map_non_null(partial_df['Location_facility'], clean_up_site_name))

        # NOTE(review): label mixes booleans with the string "UNMAPPED" while the
        # output schema declares the column as 'string' -- confirm the written
        # values are what downstream consumers expect.
        partial_df = partial_df.assign(label=partial_df.apply(_city_match_label, axis=1))

        # conform to output schema (column order; columns absent from the chunk
        # are added as nulls by reindex)
        conformed_df = partial_df.reindex(columns=output_schema)
        writer.write_dataframe(conformed_df)
        # Report the rows actually written: the final chunk can be shorter
        # than `chunksize`.
        print(f"wrote {len(conformed_df)} rows")