import asyncio
import base64
import io
import logging
import os
import traceback
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, asdict
from pathlib import Path
from typing import List, Optional, Tuple

import torch
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat, ConversionStatus
from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_EGRET_MEDIUM, DOCLING_LAYOUT_HERON, DOCLING_LAYOUT_V2, LayoutModelConfig
from docling.datamodel.pipeline_options import TableFormerMode, AcceleratorOptions, AcceleratorDevice, EasyOcrOptions, \
    PaginatedPipelineOptions, ThreadedPdfPipelineOptions
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption, WordFormatOption, PowerpointFormatOption, HTMLFormatOption
from docling.models.document_picture_classifier import DocumentPictureClassifier
from docling.models.table_structure_model import TableStructureModel
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.threaded_standard_pdf_pipeline import ThreadedStandardPdfPipeline
from docling_core.types import DoclingDocument
from docling_core.types.doc import DocItemLabel, ProvenanceItem, ImageRef, TextItem, PictureItem, GroupItem, TableItem, GroupLabel, ListItem, \
    PictureClassificationClass
from docling_core.types.doc.document import PictureClassificationPrediction
from docling_core.types.io import DocumentStream
from pydantic import AnyUrl

from dataiku.base.batcher import Batcher
from dataiku.doctor.utils.gpu_execution import TorchGpuCapability
from dataiku.llm.docextraction import build_message_log_for_document, AbstractDocumentNode, ImageNode, TextNode, TableNode, SectionNode, SlideNode, RootNode
from dataiku.llm.docextraction.ocr import get_ocr_config, process_images_with_ocr

logger = logging.getLogger(__name__)


@dataclass
class DoclingRequest:
    """
    One document extraction request, as queued on the batcher.

    Field order is part of the constructor contract (instances can be built positionally).
    """
    file_name: str  # Name of the document; its extension selects the extraction path
    document_content: str  # Base64-encoded content of the document
    do_ocr: bool  # Whether OCR must be applied
    ocr_engine: str  # Requested OCR engine
    lang: List[str]  # OCR languages; this is a list of language codes (possibly empty), not a single string
    vlm_annotate: bool  # Whether images are meant to be annotated by a VLM afterwards
    save_imgs: bool  # Whether extracted images must be kept (their base64 is then needed)


@dataclass
class DoclingResponse:
    """Outcome of one extraction request: success flag, payload and optional error message."""
    ok: bool
    resp: dict
    error: Optional[str]

    def to_dict(self):
        """Return this response as a plain dict, converted recursively by ``dataclasses.asdict``."""
        as_plain_dict = asdict(self)
        return as_plain_dict


class DoclingExtractorPipeline:
    """
    Batched document extraction pipeline built on docling.

    Requests are queued on a batcher, grouped into batches and converted in a thread pool. The attributes below hold the class-level defaults;
    most of them are overridden or filled in by ``__init__``.
    """
    # Batcher that groups incoming requests and hands each batch to the extraction code
    batcher: Batcher[DoclingRequest, dict]
    # Image extensions, handled with OCR only (without the docling converter)
    image_formats: List[str] = ["png", "jpg", "jpeg"]
    supported_formats: List[str] = ["pdf", "docx", "pptx", "html"] + image_formats
    # Default number of documents per batch, overridden by the "documentBatchSize" kernel setting
    document_batch_size: int = 4
    # The mode helpers of this class test for "FAST" and "ACCURATE"; any other value (such as "AUTO") takes their default branch
    config_optimization_mode: str = "AUTO"
    # Accelerator selected by detect_accelerator_device()
    device: AcceleratorDevice
    table_structure_options_mode: TableFormerMode
    layout_model: LayoutModelConfig
    # Root folder of locally available docling models, falsy when they are not used
    artifacts_path: Optional[Path]
    # PDF backend class handed to the docling converter
    pdf_backend: PdfDocumentBackend = DoclingParseV4DocumentBackend
    do_image_classification: bool = False

    def __init__(self, kernel_settings):
        """
        Configure the pipeline from the kernel settings and create the request batcher.

        :param kernel_settings: dict-like settings. Recognized keys: "documentBatchSize", "pageBatchSize", "configOptimizationMode", "pdfBackend" and
                                "enableImageClassificationFiltering". Missing keys keep the class-level defaults.
        """
        self.executor = ThreadPoolExecutor()

        if "documentBatchSize" in kernel_settings:
            self.document_batch_size = kernel_settings.get("documentBatchSize")
            settings.perf.doc_batch_size = self.document_batch_size
        if "pageBatchSize" in kernel_settings:
            settings.perf.page_batch_size = kernel_settings.get("pageBatchSize")
        if "configOptimizationMode" in kernel_settings:
            self.config_optimization_mode = kernel_settings.get("configOptimizationMode")
        if "pdfBackend" in kernel_settings:
            # Tolerate an explicit null/empty value instead of failing on None.lower(). Anything but "pypdfium" keeps the current backend.
            requested_backend = str(kernel_settings.get("pdfBackend") or "").lower()
            if requested_backend == "pypdfium":
                self.pdf_backend = PyPdfiumDocumentBackend
        self.do_image_classification = kernel_settings.get("enableImageClassificationFiltering", False)

        self.device = self.detect_accelerator_device()

        self.table_structure_options_mode = self.get_docling_pdf_table_mode(self.config_optimization_mode)
        logger.info("Table structure mode used: %s", self.table_structure_options_mode)

        # PDF extraction with docling requires those IBM models tableformer and layout:  https://huggingface.co/ds4sd/docling-models
        # We check whether the models are available within the resources folder. If so we use them for extraction. If not we continue the regular
        # execution (hf cache check + download if not present)
        self.layout_model, self.artifacts_path = self.find_docling_pdf_models_in_resources(self.table_structure_options_mode,
                                                                                           do_image_classification=self.do_image_classification)
        logger.info("Artifact path: %s", self.artifacts_path)
        # Layout Model
        logger.info("Layer model used: %s", self.layout_model.name)

        def batch_group_key(request: DoclingRequest) -> int:
            # A batch is converted with a single OCR configuration built from its first request, so every setting that shapes this
            # configuration (OCR engine included) has to be part of the grouping key. Otherwise requests asking for different engines
            # could land in the same batch and be processed with the wrong one.
            return hash((os.path.splitext(request.file_name)[1], request.do_ocr, request.ocr_engine, tuple(request.lang)))

        self.batcher = Batcher[DoclingRequest, dict](
            batch_size=self.document_batch_size,
            timeout=1,
            process_batch=self._process_batch_async,
            group_by=batch_group_key
        )
        logger.info("Docling extractor pipeline started with settings: pdf_backend=%s, configOptimizationMode=%s, doc_batch_size=%d, page_batch_size=%d",
                    self.pdf_backend, self.config_optimization_mode, settings.perf.doc_batch_size, settings.perf.page_batch_size)

    def _run_batch_sync(self, requests: List[DoclingRequest]) -> List[dict]:
        """Log the batch size, then run the blocking extraction of the whole batch and return its results."""
        batch_size = len(requests)
        logger.info("Processing a batch of %s document extraction requests" % batch_size)
        results = self.docling_batch_structured_extract(requests)
        return results

    async def _process_batch_async(self, requests: List[DoclingRequest]) -> List[dict]:
        """Run the blocking batch extraction on the pipeline executor and await its results."""
        running_loop = asyncio.get_running_loop()
        pending = running_loop.run_in_executor(self.executor, self._run_batch_sync, requests)
        return await pending

    async def process_document(self, process_document_command):
        """
        Turn a process-document command into a DoclingRequest, queue it on the batcher and await its result.

        :param process_document_command: dict-like command. "fileName" and "documentContent" are mandatory; "ocrSettings", "imageHandlingMode"
                                         and "saveImgs" are optional.
        :return: whatever the batcher returns for this request.
        """
        ocr_settings = process_document_command.get("ocrSettings", {})
        file_name = process_document_command["fileName"]
        content = process_document_command["documentContent"]
        image_handling_mode = process_document_command.get("imageHandlingMode", "IGNORE")
        request = DoclingRequest(
            file_name,
            content,
            image_handling_mode == "OCR",
            ocr_settings.get("ocrEngine", "AUTO"),
            ocr_settings.get("ocrLanguages", []),
            image_handling_mode == "VLM_ANNOTATE",
            process_document_command.get("saveImgs", False),
        )
        return await self.batcher.process(request)

    def find_docling_pdf_models_in_resources(self, table_mode: TableFormerMode, do_image_classification: bool) -> Tuple[
        LayoutModelConfig, Optional[Path]]:
        """
        Look in the resources folder (``DOCUMENT_EXTRACTION_MODELS`` environment variable) whether the docling PDF models are available.

        Only the root artifacts folder is returned (layout and table paths cannot be set separately), so the local folder is used only when
        both the layout and the tableformer models are found. When image classification is requested but only its model is missing,
        ``self.do_image_classification`` is switched off and the local folder is still used.

        :param table_mode: tableformer mode; its value is the name of the sub-folder expected to hold the table model.
        :param do_image_classification: whether the image classification model must be looked up as well.
        :return: a ``(layout_model, artifacts_path)`` tuple. ``artifacts_path`` is None when the local models cannot be used (variable not set or
                 a model is missing); ``layout_model`` is then the model selected from the optimization mode and the device.
        """
        original_layout_model = self.get_docling_pdf_layout_model(self.config_optimization_mode, self.device != AcceleratorDevice.CPU)

        models_root = os.environ.get("DOCUMENT_EXTRACTION_MODELS")
        if models_root is None:
            # Always return a 2-tuple: the result is unpacked by the caller, a bare None would raise a TypeError there.
            return original_layout_model, None

        artifact_path = Path(models_root).expanduser()
        logger.info(u"Pdf extraction requires layout and tableformer model from docling. Checking if those models are already in the ressources folder.")

        layout_model = original_layout_model
        layout_model_path = (artifact_path / layout_model.model_repo_folder / layout_model.model_path)
        logger.info(u"Searching for layout model in {}".format(layout_model_path))
        layout_model_found = False
        table_model_found = False
        if not self.model_exists(layout_model_path):
            logger.info(u"Layout model not found. Searching for alternative models available in {}".format(artifact_path))

            heron_model_path = (artifact_path / DOCLING_LAYOUT_HERON.model_repo_folder / DOCLING_LAYOUT_HERON.model_path)
            egret_medium_path = (artifact_path / DOCLING_LAYOUT_EGRET_MEDIUM.model_repo_folder / DOCLING_LAYOUT_EGRET_MEDIUM.model_path)
            old_layout_path = (artifact_path / DOCLING_LAYOUT_V2.model_repo_folder / DOCLING_LAYOUT_V2.model_path)

            # check the other possible models (the path logged is the one where the alternative was actually found)
            if layout_model == DOCLING_LAYOUT_EGRET_MEDIUM and self.model_exists(heron_model_path):
                layout_model = DOCLING_LAYOUT_HERON
                layout_model_found = True
                logger.info(u"Alternative layout model found {} in {}".format(layout_model, heron_model_path))
            elif layout_model == DOCLING_LAYOUT_HERON and self.model_exists(egret_medium_path):
                layout_model = DOCLING_LAYOUT_EGRET_MEDIUM
                layout_model_found = True
                logger.info(u"Alternative layout model found {} in {}".format(layout_model, egret_medium_path))
            elif self.model_exists(old_layout_path):  # Fallback on legacy docling layout model
                logger.info(u"Alternative layout model not found. Using legacy docling layer model found in {}".format(old_layout_path))
                layout_model = DOCLING_LAYOUT_V2
                layout_model_found = True
        else:
            logger.info(u"Layout model found in {}".format(layout_model_path))
            layout_model_found = True

        tableformer_path = (artifact_path / TableStructureModel._model_repo_folder / TableStructureModel._model_path / table_mode.value)
        logger.info(u"Searching for tableformer model in {}".format(tableformer_path))
        if self.model_exists(tableformer_path):
            logger.info(u"Table model found in {}".format(tableformer_path))
            table_model_found = True
        else:
            logger.info(u"No table model found in {}".format(tableformer_path))

        # Image classification model:
        if do_image_classification:
            image_classification_model_path = (artifact_path / DocumentPictureClassifier._model_repo_folder)
            logger.info(u"Searching for image classification model in {}".format(image_classification_model_path))
            if self.model_exists(image_classification_model_path):
                logger.info(u"Image classification model found in {}".format(image_classification_model_path))
                classification_model_found = True
            else:
                logger.info(u"No image classification model found in {}".format(image_classification_model_path))
                classification_model_found = False
            if layout_model_found and table_model_found and classification_model_found:
                logger.info(
                    "Layout, table and image classification Docling models for pdf extraction were found in the resources folder of the code env. They will be used for extraction")
                return layout_model, artifact_path
            elif layout_model_found and table_model_found and not classification_model_found:
                logger.info(
                    "Layout and table Docling models for pdf extraction were found in the resources folder of the code env. But the image classification model was not found. "
                    "The image classification will be disabled")
                # Side effect on the instance: classification is turned off rather than giving up on the local models.
                self.do_image_classification = False
                return layout_model, artifact_path
            else:
                logger.info(
                    "Cannot find layout, table and image classification Docling models in the resources folder of the code env. They will be downloaded from HF.")
                return original_layout_model, None

        if layout_model_found and table_model_found:
            logger.info(
                "Layout and table Docling models for pdf extraction were found in the resources folder of the code env. They will be used for extraction")
            return layout_model, artifact_path
        else:  # we cannot set separately the layout and table path but only set the root artifact_path. If one of them is not present, extraction will fail.
            logger.info("Cannot find both layout and table Docling models in the resources folder of the code env. They will be downloaded from HF.")
            return original_layout_model, None

    @staticmethod
    def model_exists(path: Path) -> bool:
        """
        Tell whether ``path`` is a directory directly containing at least one file whose name ends with "safetensors".

        A missing path, or a path pointing to a regular file, yields False (listing a regular file would raise NotADirectoryError).
        """
        return path.is_dir() and any(f.endswith("safetensors") for f in os.listdir(path))

    @staticmethod
    def get_docling_pdf_table_mode(config_optimization_mode) -> TableFormerMode:
        """Map the optimization mode to a tableformer mode: "ACCURATE" selects the accurate mode, every other value selects the fast one."""
        # FAST is both the explicit "FAST" choice and the default (CPU and GPU alike)
        accurate_requested = config_optimization_mode == "ACCURATE"
        return TableFormerMode.ACCURATE if accurate_requested else TableFormerMode.FAST

    @staticmethod
    def get_docling_pdf_layout_model(config_optimization_mode, is_gpu_available) -> LayoutModelConfig:
        """
        Pick the layout model from the optimization mode: "FAST" selects egret medium, "ACCURATE" selects heron. For any other value,
        heron is selected when a GPU is available and egret medium otherwise.
        """
        if config_optimization_mode == "FAST":
            return DOCLING_LAYOUT_EGRET_MEDIUM
        if config_optimization_mode == "ACCURATE" or is_gpu_available:
            return DOCLING_LAYOUT_HERON
        return DOCLING_LAYOUT_EGRET_MEDIUM

    @staticmethod
    def detect_accelerator_device() -> AcceleratorDevice:
        """Return the accelerator to use: CUDA when a GPU is reported available, else MPS when built and available, else CPU."""
        cuda_usable = TorchGpuCapability.is_gpu_available()
        mps_usable = torch.backends.mps.is_built() and torch.backends.mps.is_available()

        # CUDA takes precedence over MPS, CPU is the fallback
        if cuda_usable:
            device = AcceleratorDevice.CUDA
        elif mps_usable:
            device = AcceleratorDevice.MPS
        else:
            device = AcceleratorDevice.CPU

        if device != AcceleratorDevice.CPU:
            logger.info(f"Detected the following accelerator option: {device.value}. Will be used for extraction")
        else:
            logger.info("No accelerator nor GPU was detected in the environment")
        return device

    def docling_batch_structured_extract(self, requests: List[DoclingRequest]) -> List[dict]:
        """
        Extract a batch of documents and return one serialized DoclingResponse per request.

        requests is a list of documents with the same extension. They also share in common the same settings for OCR. The settings of each document should
        lead to the same pipeline options. For PDFs, we build specific pipeline options for each `requests` object (it's not long). For png/jpeg/jpg we
        don't use the docling converter, OCR is applied directly on the images. For docx, pptx and html, simple pipeline options are built.

        :param requests: the batch to process. The extension and the OCR language/engine are read from the first request only.
        :return: a list of ``DoclingResponse.to_dict()`` results (empty when `requests` is empty).
        :raises ValueError: for an image batch when no OCR options are available.
        """
        if not requests:
            return []

        accelerator_options = AcceleratorOptions(device=self.device)

        # The whole batch is dispatched on the extension of its first document
        extension = os.path.splitext(requests[0].file_name)[1].lower().lstrip(".")

        # A flag is enabled for the whole batch as soon as one request asks for it
        needs_ocr = any(req.do_ocr for req in requests)
        vlm_annotate = any(req.vlm_annotate for req in requests)
        save_images = any(req.save_imgs for req in requests)
        apply_additional_ocr_on_generated_images = False  # For docx/pptx, docling does not apply OCR on generated images, so we need to do it manually if needed.

        # To save images in a managed folder, we need their base64. We only don't need them if the image handling mode is set to ignore.
        with_inline_images = save_images or vlm_annotate or needs_ocr

        if needs_ocr:
            # Language and engine come from the first request of the batch; the last argument tells whether an accelerator is used
            ocr_options = get_ocr_config(requests[0].lang, requests[0].ocr_engine, accelerator_options.device != AcceleratorDevice.CPU)
        else:
            ocr_options = None

        # Handle image formats without docling:
        if extension in self.image_formats:
            res = []
            if ocr_options is None:
                raise ValueError("OCR options must be provided for image extraction")
            # Images are identified by their index in the batch; the results are looked up by this same index below
            text_results = process_images_with_ocr(ocr_options, [(idx, request.document_content) for idx, request in enumerate(requests)])
            for idx, request in enumerate(requests):
                if idx in text_results:
                    res.append(DoclingResponse(True, ImageNode(node_id=request.file_name,
                                                               children=[],
                                                               label="image",
                                                               highest_confidence_class_name=None,
                                                               highest_confidence_class_confidence=None,
                                                               content=text_results[idx],
                                                               level=0).to_dict(), None).to_dict())
                else:
                    # something went wrong with the OCR processing
                    res.append(DoclingResponse(False, {}, f"Error processing image {request.file_name} with OCR").to_dict())
            return res
        # handle PDFs
        elif extension == "pdf":
            if needs_ocr:
                if ocr_options is None:
                    logger.warning("The provided OCR engine is not available, deactivating OCR for extraction")
                else:
                    ocr_options.force_full_page_ocr = False  # To apply OCR on each image if pdf is not fully scanned. This will ensure we get good results on embedded images.
            pipeline_options = ThreadedPdfPipelineOptions(do_table_structure=True, do_ocr=(needs_ocr and ocr_options is not None),
                                                          ocr_options=(
                                                              ocr_options if ocr_options is not None else EasyOcrOptions()))  # Can't set ocr_options to None

            pipeline_options.table_structure_options.mode = self.table_structure_options_mode
            pipeline_options.table_structure_options.do_cell_matching = True

            # Only point docling to the local models when they were found at initialization
            if self.artifacts_path:
                pipeline_options.artifacts_path = self.artifacts_path

            pipeline_options.layout_options.model_spec = self.layout_model
            pipeline_options.accelerator_options = accelerator_options

            if with_inline_images:
                # The default resolution for images inside PDFs is 72dpi, we set it to 144dpi.
                # See https://github.com/docling-project/docling/blob/v2.61.1/docling/pipeline/standard_pdf_pipeline.py#L636-L638
                pipeline_options.images_scale = 2.0
                pipeline_options.generate_picture_images = True

            doc_converter = DocumentConverter(format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options,
                    pipeline_cls=ThreadedStandardPdfPipeline, backend=self.pdf_backend)
            })
        # Any other documents (docx, pptx, html)
        else:
            pipeline_options = PaginatedPipelineOptions(accelerator_options=accelerator_options)
            if with_inline_images:
                # Note that for docx and pptx, picture images will be generated even if pipeline_options.generate_picture_images = False
                pipeline_options.generate_picture_images = True

            # Docling does not support OCR on docx, pptx, html, so we need to extract the images and apply "raw" OCR on them.
            if needs_ocr:
                if ocr_options is None:
                    logger.warning("The provided OCR engine is not available, deactivating OCR for extraction")
                else:
                    pipeline_options.generate_picture_images = True
                    apply_additional_ocr_on_generated_images = True

            doc_converter = DocumentConverter(format_options={
                InputFormat.DOCX: WordFormatOption(
                    pipeline_options=pipeline_options,
                    pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
                ),
                InputFormat.PPTX: PowerpointFormatOption(
                    pipeline_options=pipeline_options,
                    pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
                ),
                InputFormat.HTML: HTMLFormatOption(
                    pipeline_options=pipeline_options,
                    pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
                )
            })

        # Common to the pdf and docx/pptx/html branches (the image branch has already returned)
        pipeline_options.do_picture_classification = self.do_image_classification
        logger.info(build_message_log_for_document([request.file_name for request in requests], "Image classification is " +
                                                   ("enabled" if pipeline_options.do_picture_classification else "disabled")))
        # Do the docling conversion and map output to tree structure
        document_to_process = []
        for document in requests:
            # Documents are received base64-encoded and handed to docling as in-memory streams
            byte_data = base64.b64decode(document.document_content)
            document_to_process.append(DocumentStream(name=document.file_name, stream=io.BytesIO(byte_data)))

        res = []
        # raises_on_error=False: a failure is read from the status of each conversion result below. Note that idx is not used in the loop body.
        for idx, conversion_result in enumerate(doc_converter.convert_all(document_to_process, raises_on_error=False)):
            try:
                if conversion_result.status not in {
                    ConversionStatus.SUCCESS,
                    ConversionStatus.PARTIAL_SUCCESS,
                }:
                    logger.error(build_message_log_for_document(conversion_result.document.name,
                                                                f"An error occurred during docling processing : {conversion_result.errors}"))
                    res.append(
                        DoclingResponse(False, {}, f"Conversion failed for: {conversion_result.document.name} with : {conversion_result.errors}").to_dict())
                else:
                    res.append(
                        DoclingResponse(True, build_tree_with_outline(conversion_result.document, needs_ocr,
                                                                      apply_additional_ocr_on_generated_images, ocr_options,
                                                                      vlm_annotate, with_inline_images=with_inline_images).to_dict(), None).to_dict())
                    logger.info(build_message_log_for_document(conversion_result.document.name, "Done processing document with docling"))
            except Exception as e:
                # An error on one document becomes an error response carrying the traceback, the loop goes on with the next result
                logger.exception("An error occurred during docling processing")
                res.append(DoclingResponse(False, {}, ''.join(traceback.format_exception(type(e), e, e.__traceback__))).to_dict())
        return res


def build_tree_with_outline(document: DoclingDocument, needs_ocr, apply_additional_ocr_on_generated_images=False,
                            ocr_options=None, vlm_annotate=False, with_inline_images=False) -> RootNode:
    """
    Given a docling document, build a tree that represents its structure. The nodes can be sections, slides, texts, images or tables.

    :param document: result of Docling conversion
    :param needs_ocr: Whether OCR should be applied on the document.
    :param apply_additional_ocr_on_generated_images: For docx/pptx, docling does not apply OCR on images, if needed, do it manually on the generated images.
    :param ocr_options: OCR options if additional OCR needs to be applied.
    :param vlm_annotate: Whether images will be annotated by a LLM in the java. If so we need to look if images have captions.
    :param with_inline_images: Whether the base64 of the images will be added to image nodes.
    :return: the root node of the tree
    """
    root = RootNode(document.body.self_ref, [], "root", "", 0)
    # We use a stack to keep track of the current title hierarchy. If a deeper section is encountered, it is added to the stack. This will ensure deeper text
    # have the whole outline. If a higher section is encountered, it is removed from the stack because it means we entered into a new section and the current
    # outline must be updated.
    stack: List[AbstractDocumentNode] = [root]
    # current_text holds the current text nodes that have not been yet added to the tree. Docling creates multiple paragraphs that we want to merge.
    # If we encounter a new section, an image, or a table, we need to merge the previous paragraphs and add them to the tree. To do so we use the append_pending_nodes_to_tree function.
    current_text: List[TextNode] = []

    # Running index of the pictures met so far, passed to handle_image_item
    image_idx = 0

    # We apply OCR on docx/pptx images because docling does not do it (only on PDFs). We do it as a batch here because easyocr.Reader is long to initialize
    images_text = {}
    if apply_additional_ocr_on_generated_images and ocr_options is not None:
        images_text = apply_ocr_on_images_for_non_pdf_documents(document, ocr_options)
    # Captions for images are returned as part of ImageNode when doing vlm annotation, we keep track of them to avoid duplicates. Because document.iterate_items() will also return them
    # as text, and we don't want them to be considered as separate text nodes.
    captions_cref_list: List[str] = []
    for node_item, level in document.iterate_items(with_groups=True):
        if not stack:
            # Defensive branch: start again from the root.
            # NOTE(review): confirm that DoclingDocument exposes `source_file_path`, otherwise building this message would itself fail.
            logger.error(
                build_message_log_for_document(document.source_file_path, "Stack is empty while building document tree for document. This should not happen."))
            stack.append(root)
        if hasattr(node_item, "label"):
            # To build the tree structure we rely on the labeled nodes of docling. Docling identifies titles/section headers and groups (chapters, sections, slides) that we use to build the outline.
            # Groups do not have any content, but we still identify some as sections because they represent a logical section in the document for pptx files
            # In pptx documents, we need to rely on groups to recover the slide structure.
            if node_item.label == DocItemLabel.TITLE or node_item.label == DocItemLabel.SECTION_HEADER:
                current_section = SectionNode(node_item.self_ref, [], node_item.label, node_item.text if hasattr(node_item, "text") else "", level,
                                              [prov.page_no for prov in node_item.prov] if hasattr(node_item, "prov") else [])
                handle_section_item(stack, current_section, level, current_text)

            # Exact type comparison: subclasses of GroupItem are not handled by this branch
            elif type(node_item) == GroupItem and node_item.label in [GroupLabel.CHAPTER, GroupLabel.SLIDE]:
                # For now docling identifies slide as chapter see https://github.com/docling-project/docling/blob/main/docling/backend/mspowerpoint_backend.py#L331-L335
                # We should pay attention to any changes for next docling bump, for the moment chapter is only used in mspowerpoint_backend.py
                current_section = SlideNode(node_item.self_ref, [], node_item.label, increment_slide(node_item.name) if hasattr(node_item, "name") else "",
                                            level, [])
                handle_section_item(stack, current_section, level, current_text)


            elif isinstance(node_item, ListItem) and node_item.text:
                if current_text and current_text[-1].label != DocItemLabel.LIST_ITEM:
                    # We have a new list item but the current_text is not a list item, so we need to flush the current_text to the tree first. And then start a new list item list.
                    append_pending_nodes_to_tree(stack, current_text)
                current_text.append(TextNode(node_item.self_ref, [], DocItemLabel.LIST_ITEM, node_item.text, level,
                                             [prov.page_no for prov in node_item.prov] if hasattr(node_item, "prov") else []))

            elif isinstance(node_item, PictureItem):
                # Pending text is flushed first so that it stays before the image in the tree
                append_pending_nodes_to_tree(stack, current_text)
                handle_image_item(document, node_item, needs_ocr, vlm_annotate, with_inline_images, apply_additional_ocr_on_generated_images, images_text,
                                  captions_cref_list, stack, image_idx, level)
                image_idx += 1
            elif isinstance(node_item, TableItem):
                # Pending text is flushed first, then the table is attached (as markdown) to the current section
                append_pending_nodes_to_tree(stack, current_text)
                stack[-1].children.append(TableNode(node_item.self_ref, [], node_item.label, node_item.export_to_markdown(document), level,
                                                    [prov.page_no for prov in node_item.prov] if hasattr(node_item, "prov") else []))
            else:
                # Make sure we don't add empty paragraphs
                if hasattr(node_item, "text") and node_item.text and node_item.self_ref not in captions_cref_list:
                    if current_text and current_text[-1].label == DocItemLabel.LIST_ITEM:
                        # We had a list item before, so we need to flush it to the tree first.
                        append_pending_nodes_to_tree(stack, current_text)
                    current_text.append(TextNode(node_item.self_ref, [], node_item.label, node_item.text, level,
                                                 [prov.page_no for prov in node_item.prov] if hasattr(node_item, "prov") else []))
    # Add last current_text
    if stack and current_text:
        append_pending_nodes_to_tree(stack, current_text)
    return root


def apply_ocr_on_images_for_non_pdf_documents(document: DoclingDocument, ocr_options) -> dict:
    """
    Apply OCR on the pictures of the document that carry their image as a data URI.

    :param document: docling document whose pictures are scanned.
    :param ocr_options: OCR options forwarded to process_images_with_ocr.
    :return: the result of process_images_with_ocr for the (picture reference, payload after the first comma of the data URI) pairs.
    """

    def carries_inline_image(image) -> bool:
        # Check that docling correctly included the image as a data URI in the document
        return (
            isinstance(image, ImageRef)
            and isinstance(image.uri, AnyUrl)
            and image.uri.scheme == "data"
            and image.uri.path is not None
        )

    ocr_inputs = [
        (pic.self_ref, pic.image.uri.path.split(",")[1])
        for pic in document.pictures
        if carries_inline_image(pic.image)
    ]
    return process_images_with_ocr(ocr_options, ocr_inputs)


def handle_section_item(stack: List[AbstractDocumentNode], current_section: SectionNode, level: int, current_text: List[TextNode]):
    """
    Attach a new section to the tree and make it the current (deepest) entry of the hierarchy stack.

    :param stack: current hierarchy, from the shallowest entry (bottom) to the deepest one (top). Updated in place.
    :param current_section: the section to insert.
    :param level: level of the section. Level 0 is the root, level 6 is the max depth for Markdown.
    :param current_text: pending text nodes, flushed to the tree before the section is inserted.
    """
    append_pending_nodes_to_tree(stack, current_text)
    # The stack carries the current hierarchy. If the current section is of a lower level (means higher in the hierarchy), then
    # we need to remove the deeper levels from the stack.
    # The bottom entry is never removed: with an emptied stack the new section would be pushed without any parent, and the section
    # together with everything nested under it would be missing from the tree.
    while len(stack) > 1 and stack[-1].level is not None and stack[-1].level >= level:
        stack.pop()
    if stack:
        stack[-1].children.append(current_section)
    stack.append(current_section)


def handle_image_item(document, image_item: PictureItem, needs_ocr, vlm_annotate, with_inline_images, apply_additional_ocr_on_generated_images, images_text,
                      captions_cref_list, stack, image_idx, level):
    """
    Convert an image item from docling conversion into an ImageNode and append it to the children of the node on top of the stack.

    Depending on the settings, the node carries the OCR text of the image, or its base64 representation and caption for VLM annotation (later in Java).
    :param document: docling document, used to resolve the references (children, captions) held by the image
    :param image_item: current image item
    :param needs_ocr: whether OCR is needed. When set, it takes precedence over ``vlm_annotate``
    :param vlm_annotate: whether VLM annotation is needed. Captions are only looked up in that mode, and only when a base64 payload is available
    :param with_inline_images: whether the base64 should be returned
    :param apply_additional_ocr_on_generated_images: Docling only supports OCR on PDFs. For docx/pptx, docling does not apply OCR on images, if needed, it is done
           manually on the generated images and the text is read from ``images_text`` instead of the children of the image
    :param images_text: A dictionary of image cref to OCR text for images that needed additional OCR
    :param captions_cref_list: crefs of the captions returned as part of an ImageNode when doing vlm annotation. The list is extended in place: it is used to avoid
           duplicates, because document.iterate_items() will also return those captions
    :param stack: stack of current document hierarchy, the new node is appended to the children of its last element
    :param image_idx: index used to build the node id (``image_<image_idx>``)
    :param level: level stored on the created node
    :return: None, ``stack[-1].children`` and ``captions_cref_list`` are updated in place
    """
    # Inline payload: only available when docling stored the picture as a ``data:`` URI
    picture_ref = image_item.image
    is_inline_data_uri = (
        isinstance(picture_ref, ImageRef)
        and isinstance(picture_ref.uri, AnyUrl)
        and picture_ref.uri.scheme == "data"
        and picture_ref.uri.path is not None
    )
    image_base64 = picture_ref.uri.path.split(",")[1] if is_inline_data_uri and with_inline_images else None

    # Classification of the picture, when docling computed one
    if not image_item.meta and image_item.annotations:
        # In docling 2.61.1, annotations are deprecated but meta field is not yet filled... We migrate them to avoid any future issues when annotations are removed.
        image_item._migrate_annotations_to_meta()
    main_prediction: Optional[PictureClassificationPrediction] = None
    if image_item.meta and image_item.meta.classification:
        main_prediction = image_item.meta.classification.get_main_prediction()
    if main_prediction is None:
        predicted_class_name, predicted_class_confidence = None, None
    else:
        predicted_class_name = main_prediction.class_name
        predicted_class_confidence = main_prediction.confidence

    content = ""
    caption = None
    if needs_ocr:
        if apply_additional_ocr_on_generated_images:
            # Retrieve the OCR text for the image that was computed upfront, if any
            content = images_text.get(image_item.self_ref, "")
        elif image_item.children:
            # OCR was applied on the document, so the text nodes inside the image are resolved and concatenated in a single text
            text_fragments = []
            for child in image_item.children:
                try:
                    child_node = child.resolve(document)
                    if isinstance(child_node, TextItem) and child_node.text:
                        text_fragments.append(child_node.text + " ")
                except Exception as e:
                    logger.error(f"Error resolving child node {child.cref} in image {image_item.self_ref}: {e}")
            content = "".join(text_fragments)
    elif vlm_annotate and image_base64 is not None and image_item.captions:
        # The caption enriches the image node. It will also be returned on the iterate_items, so its cref is recorded in order not to add it twice
        logger.info("trying to find caption for image:")
        caption = find_captions_for_image(document, image_item)
        captions_cref_list.extend([caption_ref.cref for caption_ref in image_item.captions])

    siblings = stack[-1].children
    siblings.append(
        ImageNode(
            node_id="image_" + str(image_idx),
            children=[],
            label=image_item.label,
            highest_confidence_class_name=predicted_class_name,
            highest_confidence_class_confidence=predicted_class_confidence,
            content=content,
            level=level,
            page_provenance=[prov.page_no for prov in image_item.prov] if hasattr(image_item, "prov") else [],
            image_base64=image_base64,
            image=image_item.image if hasattr(image_item, "image") else None,
            caption=caption,
        )
    )


def find_captions_for_image(document: DoclingDocument, image_node: PictureItem) -> (Optional[str]):
    """
    Build the caption of an image from the caption references docling attached to it.

    Each reference in ``image_node.captions`` is resolved against the document; the text of every resolved node that has one is
    concatenated, each followed by a single space. References that fail to resolve are logged and skipped.
    :param document: docling document used to resolve the caption references
    :param image_node: image item whose captions are looked up
    :return: the concatenated caption text, or None when the image has no caption or no caption text could be collected
    """
    if not image_node.captions:
        return None

    caption_fragments = []
    for child in image_node.captions:
        try:
            child_node = child.resolve(document)
            if hasattr(child_node, "text") and child_node.text:
                caption_fragments.append(child_node.text + " ")
        except Exception as e:
            logger.error(f"Error resolving child node {child.cref} in image {image_node.self_ref}: {e}")
    # An empty caption is reported as None
    return "".join(caption_fragments) or None


def append_pending_nodes_to_tree(stack: List[AbstractDocumentNode], current_text: List[TextNode]):
    """
    Flush the pending text nodes: merge them into a single node appended to the children of the node on top of the stack.

    :param stack: stack of current document hierarchy; nothing is attached when it is empty
    :param current_text: pending text nodes. The list is always emptied in place, even when nothing could be attached
    """
    can_attach = bool(stack) and bool(current_text)
    if can_attach:
        merged_node = merge_paragraphs(current_text)
        stack[-1].children.append(merged_node)
    # Reset the buffer for the next run of text nodes
    current_text.clear()


def merge_paragraphs(current_text: List[TextNode]) -> TextNode:
    """
    Merge consecutive text nodes into a single TextNode labelled ``merged_text``.

    :param current_text: text nodes to merge, in document order
    :return: a new TextNode whose id is the node ids joined with ``-``, whose content is the contents joined with a newline, and whose
             children and page provenance are the concatenation of those of the merged nodes. Its level is the level of the first node,
             or None when the list is empty
    """
    merged_id = "-".join(node.node_id for node in current_text)
    merged_children = [child for node in current_text for child in node.children]
    merged_content = "\n".join(node.content for node in current_text)
    merged_provenance = [page_no for node in current_text for page_no in node.page_provenance]
    merged_level = current_text[0].level if current_text else None
    return TextNode(merged_id, merged_children, "merged_text", merged_content, merged_level, merged_provenance)


def increment_slide(slide_name: str) -> str:
    """
    Shift a docling slide name from a 0-based to a 1-based index.

    Docling slide index start from 0 which introduces a mismatch with the pageRange, so ``slide-xx`` is incremented to start at ``slide-1``.
    :param slide_name: slide name provided by docling, expected to look like ``<prefix>-<index>``
    :return: ``<prefix>-<index + 1>`` when the name is made of exactly two ``-`` separated parts and the second one is a decimal number,
             otherwise the name unchanged
    """
    split_name = slide_name.split('-')
    # isdecimal() (and not isdigit()) is the guard matching what int() can parse: isdigit() also accepts characters such as
    # superscripts ("²") or circled digits ("①") on which int() raises a ValueError.
    if len(split_name) == 2 and split_name[1].isdecimal():
        prefix, idx = split_name
        return f"{prefix}-{int(idx) + 1}"
    # If the slide name does not follow the expected pattern, just return the name provided by docling
    return slide_name
