import base64
import io
import tempfile
from contextlib import closing
from typing import Dict, List, Optional, Tuple, Union

import pypdfium2 as pdfium  # type: ignore
from common.backend.constants import PROMPT_SEPARATOR_LENGTH
from common.backend.models.base import MediaSummary, UploadChainTypes, UploadFileError
from common.backend.utils.dataiku_api import dataiku_api
from common.backend.utils.llm_utils import get_llm_capabilities
from common.backend.utils.picture_utils import resize_with_ratio
from common.backend.utils.sql_timing import log_query_time
from common.backend.utils.upload_utils import get_checked_config, save_extracted_json
from common.llm_assist.logging import logger
from common.solutions.chains.summary.doc_as_image_summary_chain import DocAsImageSummaryChain
from common.solutions.chains.summary.text_extraction_summary_chain import TextExtractionSummaryChain
from langchain_community.document_loaders import PyPDFium2Loader
from langchain_core.documents.base import Document

# Web app configuration taken from the Dataiku API wrapper once, at import time.
# NOTE(review): not referenced by the functions visible in this section — confirm it is still needed.
webapp_config: Dict[str, str] = dataiku_api.webapp_config
# LLM capabilities resolved once at import time; extract_pdf_text reads the
# "multi_modal" entry to decide whether a document may be processed as images.
llm_caps = get_llm_capabilities()


@log_query_time
def extract_pdf_text(file_data: bytes) -> Tuple[str, bool]:
    """Extract the text of every page of a PDF and decide how it should be summarized.

    Args:
        file_data: Raw bytes of the PDF file.

    Returns:
        A tuple ``(extracted_text, is_doc_as_image)`` where ``extracted_text`` is the
        concatenation of all pages, each preceded by a separator line carrying the page
        number, and ``is_doc_as_image`` is True when the LLM reports the ``multi_modal``
        capability, the ``allow_doc_as_image`` setting is enabled and the number of pages
        does not exceed the ``docs_per_page_as_image`` setting.

    Raises:
        Exception: With ``UploadFileError.PARSING_ERROR.value`` as message when loading or
            formatting the document fails; the original error is attached as ``__cause__``.
    """
    multimodal_enabled = bool(llm_caps.get("multi_modal"))
    allow_doc_as_image = bool(get_checked_config("allow_doc_as_image"))
    docs_per_page_as_image = int(get_checked_config("docs_per_page_as_image"))
    try:
        # PyPDFium2Loader is given a file path, so the bytes are spilled to a temporary
        # file that only needs to live for the duration of the load.
        with tempfile.NamedTemporaryFile(delete=True) as temp_file:
            temp_file.write(file_data)
            temp_file.flush()
            loader = PyPDFium2Loader(temp_file.name)
            document: List[Document] = loader.load()

        is_doc_as_image = len(document) <= docs_per_page_as_image and multimodal_enabled and allow_doc_as_image

        separator = "-" * PROMPT_SEPARATOR_LENGTH
        # Join once instead of growing a string page by page. The literal below
        # (including its leading newline and indentation) is kept unchanged so the
        # emitted text stays identical.
        extracted_text = "".join(
            f"""
        {separator} page: {page.metadata.get('page', 'Unknown')} {separator}
        {page.page_content}
        """
            for page in document
        )
        return extracted_text, is_doc_as_image
    except Exception as e:
        logger.exception(f"Error in extract_pdf_text: {e}")
        # Callers only see the generic parsing error; chain the cause for debugging.
        raise Exception(UploadFileError.PARSING_ERROR.value) from e


@log_query_time
def first_page_to_preview(file_data: bytes) -> str:
    """Render the first page of a PDF as a base64-encoded PNG preview.

    Args:
        file_data: Raw bytes of the PDF file.

    Returns:
        The base64 (UTF-8 decoded) representation of the first page, rendered with
        pdfium, resized through ``resize_with_ratio`` and saved as PNG.

    Raises:
        Exception: With ``UploadFileError.PARSING_ERROR.value`` as message when the
            document cannot be opened, has no first page, or cannot be rendered or
            encoded; the original error is attached as ``__cause__``.
    """
    try:
        # Release the pdfium document and page handles as soon as the preview has been
        # encoded instead of waiting for garbage collection.
        with closing(pdfium.PdfDocument(file_data)) as pdf_document, closing(pdf_document[0]) as page:
            pil_image = page.render().to_pil()
            resized_image = resize_with_ratio(pil_image)
            buffered = io.BytesIO()
            resized_image.save(buffered, format="PNG")
        return base64.b64encode(buffered.getvalue()).decode("utf-8")
    except Exception as e:
        # Catch broadly, like extract_pdf_text does: pdfium load/render errors and the
        # IndexError of an empty document are not IOError subclasses, yet they are
        # parsing failures all the same.
        logger.exception(f"Unable to parse document to image: {e}")
        raise Exception(UploadFileError.PARSING_ERROR.value) from e


@log_query_time
def extract_pdf_summary(
    file_path: str, file_data: bytes, original_file_name: str, language: Optional[str], begin_time: int
) -> MediaSummary:
    """Build the media summary of an uploaded PDF and persist its extracted metadata.

    The first page is rendered as a preview, the text is extracted, and the document is
    summarized either by ``DocAsImageSummaryChain`` (when ``extract_pdf_text`` flags it as
    document-as-image) or by ``TextExtractionSummaryChain``. The summary, enriched with
    the file path, preview, full extracted text and ``begin_time``, is handed to
    ``save_extracted_json``; the returned path is stored under ``metadata_path``.

    Args:
        file_path: Path of the uploaded file; forwarded to the image chain and to
            ``save_extracted_json``.
        file_data: Raw bytes of the PDF file.
        original_file_name: File name forwarded to the summary chains.
        language: Optional language forwarded to the summary chains.
        begin_time: Value stored as-is under the ``begin_time`` key.

    Returns:
        The media summary without the ``full_extracted_text`` entry (that entry is only
        present in the payload passed to ``save_extracted_json``).

    Raises:
        Exception: With ``UploadFileError.PARSING_ERROR.value`` as message when the
            selected chain returns no summary; errors from the preview and text
            extraction helpers propagate unchanged.
    """
    # Preview first, then text extraction: same call order as before.
    preview_b64: str = first_page_to_preview(file_data)
    extracted_text, is_doc_as_image = extract_pdf_text(file_data)

    summary: Optional[MediaSummary]
    if is_doc_as_image:
        summary = DocAsImageSummaryChain(file_data, original_file_name, language, file_path).get_summary()
    else:
        summary = TextExtractionSummaryChain(extracted_text, original_file_name, language).get_summary()
    if summary is None:
        raise Exception(UploadFileError.PARSING_ERROR.value)

    if is_doc_as_image:
        chain_type = UploadChainTypes.DOCUMENT_AS_IMAGE
    elif summary.get("summary") is None:
        # The text chain returned no "summary" entry: tagged as a long document.
        chain_type = UploadChainTypes.LONG_DOCUMENT
    else:
        chain_type = UploadChainTypes.SHORT_DOCUMENT
    summary["chain_type"] = chain_type.value

    summary = {
        **summary,
        "file_path": file_path,
        "preview": f"data:image/png;base64,{preview_b64}",
        "full_extracted_text": extracted_text,
        "begin_time": begin_time,
    }
    saved_metadata_path: str = save_extracted_json(file_path, summary)
    summary["metadata_path"] = saved_metadata_path
    # The full text is only needed in the persisted JSON, not in the returned summary.
    summary.pop("full_extracted_text")
    return summary
