import os
import re
import shutil
from typing import Any, Optional

import pandas as pd
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VectorStore

from dataiku.core.vector_stores.dku_vector_store import DkuLocalVectorStore, logger
from dataiku.core.vector_stores.vector_store_document_filter import ChromaDBVectorStoreDocumentFilter
from dataiku.llm.types import RetrievableKnowledge

# File name of the Chroma SQLite database inside a store folder; this is the
# file that ChromaVectorStore.clear_files removes and get_file_size counts.
CHROMA_DB_NAME = "chroma.sqlite3"

class ChromaVectorStore(DkuLocalVectorStore):
    """Local vector store backed by a persistent ChromaDB database.

    The on-disk files managed by this class are the ``CHROMA_DB_NAME`` SQLite
    file plus the sub-folders whose name is a UUID, all located directly in
    the store folder.
    """

    def __init__(self, kb: RetrievableKnowledge, exec_folder: str):
        """Initialise the store on the fixed ``"langchain"`` collection.

        :param kb: the retrievable knowledge settings, forwarded to the parent class
        :param exec_folder: folder in which the Chroma files are persisted
        :raises ModuleNotFoundError: if chromadb is installed but the sqlite3
            work-around cannot be applied (see run_the_ugly_chromadb_monkeypatch)
        """
        super().__init__(kb, exec_folder, collection_name="langchain")
        # Has to run before chromadb gets imported, which is why get_db imports it locally
        self.run_the_ugly_chromadb_monkeypatch()
        self.document_filter = ChromaDBVectorStoreDocumentFilter(self.metadata_column_type_and_meaning)

    def get_db(self, embeddings: Embeddings, allow_creation: bool = False, **kwargs: Any) -> VectorStore:
        """Open the LangChain Chroma store persisted in ``self.exec_folder``.

        :param embeddings: embedding function handed to the LangChain wrapper
        :param allow_creation: currently not consulted, the collection is
            always created when missing (see the todo below)
        :param kwargs: accepted for signature compatibility, not used here
        :return: the LangChain Chroma vector store
        """
        # import locally to let the monkeypatch run before
        try:
            from langchain_chroma import Chroma as LangchainChroma # type: ignore
        except ImportError:
            from langchain_community.vectorstores import Chroma as LangchainChroma  # type: ignore
        import chromadb

        # See: https://github.com/langchain-ai/langchain/issues/14872#issuecomment-2138666781
        # And: https://github.com/chroma-core/chroma/issues/5868
        # And: https://github.com/chroma-core/chroma/issues/1976
        # And: https://github.com/langchain-ai/langchain/issues/26884
        # Removing this causes errors on linux
        chromadb.api.shared_system_client.SharedSystemClient.clear_system_cache()

        client_settings = chromadb.config.Settings(
            anonymized_telemetry=False,
            persist_directory=self.exec_folder,
            is_persistent=True,
        )
        db = LangchainChroma(collection_name=self.collection_name, embedding_function=embeddings, client_settings=client_settings)
        db._client.get_or_create_collection(name=self.collection_name)  # todo should check for allow_creation first
        return db

    def clear_files(self, folder_path: str) -> None:
        """Delete the Chroma files found directly inside ``folder_path``.

        Removes the SQLite database file if present, then every sub-folder
        whose name is a UUID. Anything else in the folder is left untouched.

        :param folder_path: folder to clean
        """
        db_file_path = os.path.join(folder_path, CHROMA_DB_NAME)
        if os.path.isfile(db_file_path):
            os.remove(db_file_path)

        for entry in os.listdir(folder_path):
            entry_path = os.path.join(folder_path, entry)
            # Only directories are removed (same rule as get_file_size): rmtree
            # would raise on a regular file that happens to have a UUID name
            if self._is_filename_uuid(entry) and os.path.isdir(entry_path):
                shutil.rmtree(entry_path)

    def get_file_size(self) -> int:
        """Return the total size, in bytes, of the Chroma files in ``self.exec_folder``.

        Counts the SQLite database file (if present) plus every regular file
        found recursively under the sub-folders whose name is a UUID.
        """
        size = 0

        db_file_path = os.path.join(self.exec_folder, CHROMA_DB_NAME)
        if os.path.isfile(db_file_path):
            size += os.path.getsize(db_file_path)

        for entry in os.listdir(self.exec_folder):
            entry_path = os.path.join(self.exec_folder, entry)
            # only the folders with a uuid name are taken into account
            if not (os.path.isdir(entry_path) and self._is_filename_uuid(entry)):
                continue
            for dir_name, _, folder_files in os.walk(entry_path):
                for folder_file in folder_files:
                    folder_file_path = os.path.join(dir_name, folder_file)
                    if os.path.isfile(folder_file_path):
                        size += os.path.getsize(folder_file_path)

        return size

    def _process_metadata_entry(self, new_meta, key, val, storage_type, meaning):
        """Write one metadata entry into ``new_meta``.

        For the ``BagOfWordsMeaning`` meaning, a list value is flattened into
        one boolean entry per element, keyed ``DKU_BOW_<key>__<element>``;
        a non-list value is ignored. Every other meaning is delegated to the
        parent implementation.
        """
        if meaning == "BagOfWordsMeaning":
            if isinstance(val, list):
                for v in val:
                    new_meta[f"DKU_BOW_{key}__{v}"] = True
        else:
            super()._process_metadata_entry(new_meta, key, val, storage_type, meaning)

    def get_document_metadata(self, document: Document, column: str) -> Optional[Any]:
        """Read the value of the metadata ``column`` from ``document``.

        For a ``BagOfWordsMeaning`` column, the list of elements is rebuilt
        from the ``DKU_BOW_<column>__<element>`` keys written by
        ``_process_metadata_entry``. For any other column the raw metadata
        value is returned, or None when the column is absent.

        NOTE(review): a bag-of-words column whose name extended with ``__`` is
        a prefix of another column name (e.g. ``a`` and ``a__b``) also matches
        the keys of that other column; this comes from the key format itself.
        """
        _, meaning = self.metadata_column_type_and_meaning.get(column, (None, None))
        if meaning == "BagOfWordsMeaning":
            prefix = f"DKU_BOW_{column}__"
            # Strip the prefix by position rather than with split(): an element
            # that itself contains the prefix text must be returned whole
            return [
                m[len(prefix):]
                for m in document.metadata
                if m.startswith(prefix)
            ]
        else:
            return document.metadata.get(column, None)

    @staticmethod
    def _is_filename_uuid(filename: str) -> bool:
        """Tell whether ``filename`` is exactly a lowercase, hyphenated UUID.

        The whole name has to match: a name that merely starts with a UUID
        (e.g. ``<uuid>-backup``) is not accepted, so that unrelated entries
        are neither deleted by clear_files nor counted by get_file_size.
        """
        return re.fullmatch(r"[0-9a-f]{8}-([0-9a-f]{4}-){3}[0-9a-f]{12}", filename) is not None

    @staticmethod
    def run_the_ugly_chromadb_monkeypatch() -> None:
        """Replace ``sqlite3`` by ``pysqlite3`` when the bundled SQLite is older than 3.35.

        Does nothing when chromadb is not installed or when the SQLite version
        is recent enough. Otherwise the ``sqlite3`` entry of ``sys.modules`` is
        swapped for ``pysqlite3`` and chromadb is imported to check the result.

        :raises ModuleNotFoundError: if pysqlite3 is missing, if the swap
            fails, or if chromadb still cannot be imported afterwards
        """
        logger.info("maybe-monkeypatching Chroma")
        from importlib import util

        chromadb_spec = util.find_spec("chromadb")

        if chromadb_spec is None:
            logger.info("Chroma is not installed, ignoring monkeypatch")
            return

        import sqlite3
        import sys

        if sqlite3.sqlite_version_info >= (3, 35, 0):
            return

        logger.warning("SQLite3 is too old (<3.35), Chroma would not load, trying to replace sqlite3.")
        try:
            __import__('pysqlite3')
        except ModuleNotFoundError as e:
            logger.exception("Failed to import pysqlite3 to replace sqlite3")
            raise ModuleNotFoundError("Could not import Chroma directly due to SQLite version issue, and pysqlite3-binary is not installed, cannot work-around. Consider installing pysqlite3-binary package") from e

        try:
            # From now on, any `import sqlite3` resolves to pysqlite3
            sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
        except Exception as e:
            logger.exception("Failed to replace sqlite3 by pysqlite3")
            raise ModuleNotFoundError("Could not import Chroma directly due to SQLite version issue, and applying work-around failed") from e

        try:
            # Check that it's working now
            logger.info("monkeypatched, retrying")
            import chromadb
        except Exception as e:
            logger.exception("chromadb monkeypatch did not work")
            raise ModuleNotFoundError("Could not import Chroma directly due to SQLite version issue, and applying work-around failed") from e