import dataclasses
import json
import logging
from json import JSONDecodeError
from typing import Optional, List, Dict, Any

import pandas as pd

from dataiku.llm.types import SchemaColumn

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Key read from the row and written unchanged to the metadata when the row contains it.
DKU_MULTIMODAL_CONTENT = "DKU_MULTIMODAL_CONTENT"
# Key read from the row and written unchanged to the metadata when the row contains it.
DKU_DOCUMENT_INFO = "DKU_DOCUMENT_INFO"
# Metadata key under which the value of the configured security tokens column is stored.
DKU_SECURITY_TOKENS_META = "DKU_SECURITY_TOKENS"


@dataclasses.dataclass(frozen=True)
class MetadataGenerator:
    """
    Builds the metadata dict of a row: the configured metadata columns plus the dataiku internal entries.
    """

    # Schema columns copied into the metadata when present on the row. Columns whose "meaning" is
    # "BagOfWordsMeaning" are converted to a list of strings when their value is (or parses to) a list.
    metadata_columns: List[SchemaColumn]
    # Optional row column copied as-is under its own name (doc id for RecordManager indexing).
    source_id_column: Optional[str] = None
    # Optional row column whose value is stored under DKU_SECURITY_TOKENS_META when it is not missing.
    security_tokens_column: Optional[str] = None

    def to_metadata(self, row: Dict[str, Any]) -> Dict[str, Any]:
        """
        Completes the specified metadata with the dataiku internal information, used at retrieval time.

        :param row: mapping of column name to value for a single record.
        :return: a new dict made of, in increasing order of precedence on key collision: the metadata
            columns found on the row, the source id, the multimodal content, the security tokens and
            the document info.
        :raises KeyError: if ``source_id_column`` is set but is not a key of ``row``.
        """
        sources = {}
        for metadata_column in self.metadata_columns:
            column_name = metadata_column.get("name")
            # Being extra safe with key access, only get the sources if they are present on the row
            if column_name not in row:
                continue
            if metadata_column.get("meaning") == "BagOfWordsMeaning":
                sources[column_name] = self._bag_of_words_value(row, column_name)
            else:
                sources[column_name] = row[column_name]

        source_id = {}
        if self.source_id_column is not None:
            # Deliberately a plain lookup: a configured but absent source id column raises KeyError.
            source_id[self.source_id_column] = row[self.source_id_column]

        multimodal_info = {}
        if DKU_MULTIMODAL_CONTENT in row:
            multimodal_info[DKU_MULTIMODAL_CONTENT] = row[DKU_MULTIMODAL_CONTENT]

        document_info = {}
        if DKU_DOCUMENT_INFO in row:
            document_info[DKU_DOCUMENT_INFO] = row[DKU_DOCUMENT_INFO]

        security_tokens_info = {}
        if self.security_tokens_column is not None:
            security_tokens = row.get(self.security_tokens_column, None)
            if not self._is_missing(security_tokens):
                security_tokens_info[DKU_SECURITY_TOKENS_META] = security_tokens

        return {
            **sources,  # Original metadata
            **source_id,  # Optional column to reference doc id for RecordManager indexing
            **multimodal_info,  # Optional column containing multimodal info
            **security_tokens_info,  # Optional security tokens, skipped when missing
            **document_info,  # Settings to identify the metadata info at loading time
        }

    @staticmethod
    def _bag_of_words_value(row: Dict[str, Any], column_name: str) -> Any:
        """
        Returns the bag-of-words value of ``column_name`` as a list of strings.

        A list value is used directly, anything else is parsed as json and must yield a list.
        When that fails, a warning is logged and the raw value is returned unchanged.
        """
        raw_value = row[column_name]
        try:
            if isinstance(raw_value, list):
                words = raw_value
            else:
                words = json.loads(raw_value)
                if not isinstance(words, list):  # Parsed json is not a list
                    raise TypeError("Parsed json is not a list")
            return [str(word) for word in words]
        except (JSONDecodeError, TypeError):
            # TypeError also covers values json.loads cannot take as input (e.g. None or a float)
            logger.warning(f"Row {row} bag-of-words embedding failed. {raw_value} is not a valid json array. Falling back to string metadata embedding")
            return raw_value

    @staticmethod
    def _is_missing(value: Any) -> bool:
        """
        Tells whether ``value`` is a missing scalar (None, NaN, NaT, pd.NA...).

        pd.isna answers element-wise for list-like input, and the truth value of the resulting array
        is ambiguous, so containers are never considered missing here.
        """
        return bool(pd.api.types.is_scalar(value) and pd.isna(value))

