import base64
import logging
import os
from functools import lru_cache
from io import BytesIO
from pathlib import Path
from typing import Optional, List, Union, Tuple

import math
from PIL import Image
from docling.datamodel.pipeline_options import TesseractOcrOptions, EasyOcrOptions, OcrOptions
from langcodes import Language, LanguageTagError
from pypdfium2 import PdfDocument

logger = logging.getLogger("ocr")


# easyocr and tesseract are both slow to import, so only import them when needed.
# Also, the OCR settings are part of the kernel hash, so one server should handle only easyOCR or only tesseract requests.

def get_ocr_config(lang: str, ocr_engine: str, is_gpu_available) -> OcrOptions:
    """
    Build the OCR options for the requested engine.

    :param lang: comma-separated language codes; blank entries are dropped and surrounding whitespace is stripped.
    :param ocr_engine: one of 'TESSERACT', 'EASYOCR' or 'AUTO'. In AUTO mode Tesseract is tried first and EasyOCR is the fallback.
    :param is_gpu_available: forwarded to the EasyOCR installation check.
    :return: the options returned by the selected engine's installation check.
    :raises ValueError: if ``ocr_engine`` is not one of the supported values.
    :raises ImportError: in AUTO mode, if neither engine check returns usable options.
    """
    stripped_codes = (piece.strip() for piece in lang.split(","))
    requested_languages = [code for code in stripped_codes if code]

    # Explicit engine choices: the installation checks raise on failure.
    if ocr_engine == "TESSERACT":
        return check_tesseract_installation(requested_languages, True)
    if ocr_engine == "EASYOCR":
        return check_easy_ocr_installation(requested_languages, is_gpu_available, True)
    if ocr_engine != "AUTO":
        raise ValueError("Invalid OCR engine specified. Choose either 'TESSERACT', 'EASYOCR', or 'AUTO'.")

    # AUTO mode: the checks only warn, so that the next engine can be tried.
    options = check_tesseract_installation(requested_languages, False)
    if options is not None:
        logger.info("Using Tesseract OCR engine")
        return options

    options = check_easy_ocr_installation(requested_languages, is_gpu_available, False)
    if options is None:
        raise ImportError(
            "No valid OCR engine found. Check the logs and please ensure that either Tesseract or EasyOCR is installed and configured correctly.")
    logger.info("Using EasyOCR engine")
    return options


def check_tesseract_installation(lang: list[str], raises_on_error: bool) -> Optional[TesseractOcrOptions]:
    """
    Check that tesserocr can be imported and has language models, and build the Tesseract OCR options.

    :param lang: requested language tags; converted with ``lang_tags_to_ocr_langs(..., for_tesseract=True)``.
    :param raises_on_error: when True a failed check raises, when False it is logged as a warning and None is returned.
    :return: ``TesseractOcrOptions`` for the converted languages, or None if a check failed and ``raises_on_error`` is False.
    :raises ImportError: (only if ``raises_on_error``) tesserocr cannot be imported, or it reports no language models.
    :raises ValueError: (only if ``raises_on_error``) none of the requested languages is valid.
    """
    # Set the default language files location, required for containerized execution. This is done before tesserocr is imported.
    if not os.getenv("TESSDATA_PREFIX"):
        models_dir = os.getenv("DOCUMENT_EXTRACTION_MODELS")
        # Only derive the prefix when the models directory is known: os.path.join(None, ...) would raise a TypeError.
        if models_dir:
            os.environ["TESSDATA_PREFIX"] = os.path.join(models_dir, "tessdata/")

    try:
        import tesserocr
        tesseract_version = tesserocr.tesseract_version()
    except ImportError as e:
        # We install tesserocr in the document extraction internal env but users can switch to a different code env.
        import_message = f"Unable to use tesserocr. Check the code environment to ensure 'tesserocr' is correctly installed {e}"
        if raises_on_error:
            raise ImportError(import_message) from e
        logger.warning(import_message)
        return None

    tesseract_ocr_lang = lang_tags_to_ocr_langs(lang=lang, for_tesseract=True)
    if not tesseract_ocr_lang:
        language_message = "No valid languages were found as input, add more language codes, or try with 'auto'"
        if raises_on_error:
            raise ValueError(language_message)
        logger.warning(language_message)
        return None

    # Check if tesseract is correctly installed. tesserocr.get_languages should return at least one language. It will try to load the language files located
    # in the path defined by the TESSDATA_PREFIX environment variable.
    _, tesserocr_available_languages = tesserocr.get_languages()
    if not tesserocr_available_languages:
        config_message = ("tesserocr is not correctly configured. No language models have been detected. Please ensure that the TESSDATA_PREFIX envvar points "
                          "to tesseract languages dir.")
        if raises_on_error:
            raise ImportError(config_message)
        logger.warning(config_message)
        return None

    logger.info("Tesseract version %s is configured. Available languages are: %s", tesseract_version, tesserocr_available_languages)
    # 'auto' is not a language model, so it must not be reported as missing. Note that set("auto") would be {'a', 'u', 't', 'o'}.
    missing_lang = set(tesseract_ocr_lang) - {"auto"} - set(tesserocr_available_languages)
    if missing_lang:
        logger.warning(f"tesserocr is not correctly configured. Some languages were explicitly requested but not found on the system: {missing_lang}. "
                       f"Available languages are {tesserocr_available_languages}")
    return TesseractOcrOptions(lang=tesseract_ocr_lang)


def check_easy_ocr_installation(lang: list[str], is_gpu_available, raises_on_error: bool) -> Optional[EasyOcrOptions]:
    """
    Check the EasyOCR installation and build the matching ``EasyOcrOptions``.

    :param lang: requested language tags; converted with ``lang_tags_to_ocr_langs(..., for_tesseract=False)``.
    :param is_gpu_available: when falsy, a warning is logged that the extraction will run on CPU.
    :param raises_on_error: when True a failed check raises, when False it is logged as a warning and None is returned.
    :return: None if easyocr is not importable or no language is valid (and ``raises_on_error`` is False). Otherwise options pointing at the model
        directory under DOCUMENT_EXTRACTION_MODELS when it contains ``.pth`` files, or options with ``download_enabled=True``.
    :raises ImportError: (only if ``raises_on_error``) easyocr cannot be imported.
    :raises ValueError: (only if ``raises_on_error``) none of the requested languages is valid.
    """
    try:
        import easyocr  # noqa: F401 - imported only to verify that the package is available
    except ImportError as e:
        # Users can switch to a different code env where easyocr may not be installed.
        import_failure = f"Unable to use EasyOCR. Check the code environment to ensure 'easyocr' is correctly installed {e}"
        if not raises_on_error:
            logger.warning(import_failure)
            return None
        raise ImportError(import_failure)

    ocr_languages = lang_tags_to_ocr_langs(lang=lang, for_tesseract=False)
    if not ocr_languages:
        no_language = "No valid languages were found as input, try adding more language codes"
        if not raises_on_error:
            logger.warning(no_language)
            return None
        raise ValueError(no_language)

    if not is_gpu_available:
        logger.warning("EasyOCR extraction will run on CPU and will be slow")
    logger.info("EasyOCR extraction requires ocr models. Checking if those models are already in the code env resources folder.")

    models_root = os.environ.get("DOCUMENT_EXTRACTION_MODELS")
    if models_root is None:
        return EasyOcrOptions(lang=ocr_languages, download_enabled=True)

    model_dir = Path(models_root).expanduser() / "EasyOCR" / "model"
    logger.info(f"Searching for EasyOCR models in {model_dir}")
    # The models are considered present as soon as the directory holds at least one regular .pth file.
    has_models = model_dir.is_dir() and any(
        entry.is_file() and entry.suffix.lower() == ".pth" for entry in model_dir.iterdir()
    )
    if not has_models:
        logger.info("No EasyOCR model files were found in the code env resources folder. Will use the easyOCR cache or download the models")
        return EasyOcrOptions(lang=ocr_languages, download_enabled=True)

    logger.info("EasyOCR models were found in the resources folder of the code env. Will use them for OCR")
    return EasyOcrOptions(lang=ocr_languages, model_storage_directory=model_dir.as_posix())


def _parse_valid_language(language_str: str, rejection_message: str):
    """Parse ``language_str`` with langcodes; return the ``Language`` when valid, else log ``rejection_message`` and return None."""
    try:
        # Language.get can either throw a LanguageTagError if the input is not a language code (not 1-8 alphanumeric characters) or return a not valid
        # Language for ex 'rs_cyrillic'
        language = Language.get(language_str)
    except LanguageTagError as e:
        # The error is passed through a placeholder: an extra logging argument without one makes the record fail to format.
        logger.warning("%s (%s)", rejection_message, e)
        return None
    if not language.is_valid():
        logger.warning(rejection_message)
        return None
    return language


def lang_tags_to_ocr_langs(lang: list[str], for_tesseract: bool) -> list[str]:
    """
    Tesseract and easyOCR do not use the same language codes. Tesseract will need 'eng' for English, which leads to
    `ValueError: ({'eng'}, 'is not supported')` with easyOCR. Here we map/standardize the language list for one engine or the other.
    Some valid inputs for easyOCR are neither BCP47 nor ISO639 ('rs_cyrillic'), so those values are also explicitly accepted.
    Tesseract is expecting ISO639-3 (3 letters code).
    All other invalid inputs are removed from the list, with a warning.

    :param lang: The list of language codes. Codes should be BCP47 tags, which cover many use cases such as:
        en          English
        en-US       English as used in the United States
        zh-Hant-TW  Traditional Chinese as used in Taiwan
        sl-rozaj    Slovenian in the Resian dialect
        de-CH-1901  German as used in Switzerland, 1901 orthography
        fr-x-custom
    :param for_tesseract: Set to True to generate a valid code list for tesseract, False for easyOCR
    :return list[str]: Valid code list for tesseract or easyocr
    """
    if for_tesseract:
        tesseract_languages = []
        # For tesseract, ISO639-3 (3 letters code) is expected
        for language_str in lang:
            if language_str == "auto":
                # auto is a valid entry only for tesseract
                tesseract_languages.append(language_str)
                continue
            language = _parse_valid_language(
                language_str,
                f"The following language: {language_str} is not a valid ISO639 language code. It will be removed from the language list")
            if language is not None:
                tesseract_languages.append(language.to_alpha3())
        logger.info(f"For tesseract, the following language list will be used: {tesseract_languages}")
        return tesseract_languages

    # Imported only on the easyOCR path so that the tesseract path keeps working when easyocr is not installed.
    from easyocr.config import all_lang_list as easyocr_all_lang_list
    easyocr_known_codes = set(easyocr_all_lang_list)
    easy_ocr_languages = []
    for language_str in lang:
        if language_str in easyocr_known_codes:
            # Some language code might not be ISO639, for example 'rs_cyrillic' which is Serbian (cyrillic). But it's a valid EasyOCR code, so we add it
            # like this.
            easy_ocr_languages.append(language_str)
            continue
        language = _parse_valid_language(
            language_str,
            f"The following language: {language_str} is not a valid easyOCR language nor a valid ISO639 language code. It will be removed from the language list")
        if language is not None:
            # Language.language is the valid ISO639-1 two letters code
            # This will ensure 'eng' is mapped to 'en' for example
            easy_ocr_languages.append(language.language)
    logger.info(f"For easyOCR, the following language list will be used: {easy_ocr_languages}")
    return easy_ocr_languages


def _as_pil_image(image: Union[str, Image.Image]) -> Image.Image:
    """Return ``image`` unchanged when it is already a PIL image, otherwise decode it as base64-encoded image data."""
    # isinstance (not an exact type check) so that Image.Image subclasses, such as the objects returned by Image.open, are accepted as-is.
    if isinstance(image, Image.Image):
        return image
    return Image.open(BytesIO(base64.b64decode(image)))


def process_images_with_ocr(ocr_options: OcrOptions, images_ref: List[Tuple[Union[str, int], Union[str, Image.Image]]]) -> dict:
    """
    Run OCR on a list of images and return the extracted text for each of them.

    :param ocr_options: options for the OCR engine, either EasyOcrOptions or TesseractOcrOptions
    :param images_ref: A list of tuples, each tuple contains a reference to the image (str or int) and either a base64 encoded image (str) or a PIL image.
    :return: a dictionary where the keys are the references and the values are the extracted text from the images.
    :raises ValueError: if ``ocr_options.kind`` matches neither the EasyOCR nor the Tesseract options kind.
    """
    res = {}
    if ocr_options.kind == EasyOcrOptions.kind:
        import numpy as np
        model_storage_directory = getattr(ocr_options, "model_storage_directory", None)
        # Convert to tuple to make it hashable (the reader factory is cached on its arguments)
        reader = get_easyocr_reader(tuple(ocr_options.lang), model_storage_directory)
        for ref, image in images_ref:
            image_np = np.array(_as_pil_image(image))
            # detail=0 is joined as a list of strings here, one per line of output.
            res[ref] = "\n".join(reader.readtext(image_np, detail=0))
    elif ocr_options.kind == TesseractOcrOptions.kind:
        import tesserocr
        # NOTE(review): ocr_options.lang is not forwarded to tesserocr here, so its default language applies - confirm this is intended.
        for ref, image in images_ref:
            res[ref] = tesserocr.image_to_text(_as_pil_image(image))
    else:
        raise ValueError("No valid OCR engine is available")
    return res


def pdf_to_pil_images_iterator(pdf_bytes, memory_limit_per_document):
    """
    Yield one rendered PIL image per page of the given PDF.

    Each page is rendered at scale 2 (144 DPI) unless the estimated size of the rendered bitmap would exceed the memory limit, in which case the
    scale is lowered for that page.

    :param pdf_bytes: the PDF content, passed as-is to ``PdfDocument``.
    :param memory_limit_per_document: memory budget, multiplied by 1e6 and compared with the estimated byte size of each rendered page.
    """
    bytes_per_pixel = 4  # Assuming 4 bytes per pixel (ARGB)
    base_scale = 2
    base_dpi = 72 * base_scale

    document = PdfDocument(pdf_bytes)
    for page in document:
        # Estimated memory of the rendered page is width * scale * height * scale * bytes_per_pixel,
        # so the largest admissible scale is the square root of budget / (bytes_per_pixel * width * height).
        width, height = page.get_size()
        scale_cap = math.sqrt(
            memory_limit_per_document * 1e6 / (bytes_per_pixel * width * height)
        )

        if base_scale > scale_cap:
            logger.info("Reducing rendering DPI from %s to %s to limit memory usage.", base_dpi, (72 * scale_cap))
            render_scale = scale_cap
        else:
            render_scale = base_scale

        yield page.render(scale=render_scale).to_pil()


def convert_image_to_greyscale_bytes(img):
    """
    Return the result of converting ``img`` to greyscale (mode "L").

    Despite the function name, the converted image object is returned as-is: nothing is serialized to bytes and no DPI is applied here.
    """
    greyscale_image = img.convert("L")
    return greyscale_image


@lru_cache(maxsize=5)
def get_easyocr_reader(lang, model_storage_directory):
    """
    Return an ``easyocr.Reader`` for the given languages and model directory.

    Instantiating a reader takes some time, so the result is cached (up to 5 distinct argument combinations). Both arguments must therefore be
    hashable, which is why ``lang`` is expected as a tuple rather than a list.
    """
    # easyocr is slow to import, so it is only imported when a reader is actually needed.
    from easyocr import Reader
    return Reader(lang_list=list(lang), model_storage_directory=model_storage_directory)
