from enum import Enum
from typing import TYPE_CHECKING, Dict, List, Optional, Union

from docling_core.types.doc import (
    BoundingBox,
    DocItemLabel,
    NodeItem,
    PictureDataType,
    Size,
    TableCell,
)
from docling_core.types.io import (  # DO NOT REMOVE; explicitly exposed from this location
    DocumentStream,
)
from PIL.Image import Image
from pydantic import BaseModel, ConfigDict

if TYPE_CHECKING:
    # Imported for type annotations only; never executed at runtime.
    from docling.backend.pdf_backend import PdfPageBackend

# NOTE: classes below are documented with `#` comments rather than docstrings
# on purpose: pydantic copies class/enum docstrings into the generated JSON
# schema ("description"), so adding docstrings would alter runtime output.


# Status values attached to a conversion. Members are also plain strings.
class ConversionStatus(str, Enum):
    PENDING = "pending"
    STARTED = "started"
    FAILURE = "failure"
    SUCCESS = "success"
    PARTIAL_SUCCESS = "partial_success"
    SKIPPED = "skipped"


class InputFormat(str, Enum):
    """A document format supported by document backend parsers."""

    DOCX = "docx"
    PPTX = "pptx"
    HTML = "html"
    XML_PUBMED = "xml_pubmed"
    IMAGE = "image"
    PDF = "pdf"
    ASCIIDOC = "asciidoc"
    MD = "md"
    CSV = "csv"
    XLSX = "xlsx"
    XML_USPTO = "xml_uspto"
    JSON_DOCLING = "json_docling"


# Identifiers of the available output formats.
class OutputFormat(str, Enum):
    MARKDOWN = "md"
    JSON = "json"
    HTML = "html"
    TEXT = "text"
    DOCTAGS = "doctags"


# File extensions (without the leading dot) listed for each input format.
# An extension may appear under several formats: "xml" is listed for both
# XML_PUBMED and XML_USPTO, so an extension alone does not always identify
# a single format.
FormatToExtensions: Dict[InputFormat, List[str]] = {
    InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
    InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
    InputFormat.PDF: ["pdf"],
    InputFormat.MD: ["md"],
    InputFormat.HTML: ["html", "htm", "xhtml"],
    InputFormat.XML_PUBMED: ["xml", "nxml"],
    InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
    InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
    InputFormat.CSV: ["csv"],
    InputFormat.XLSX: ["xlsx"],
    InputFormat.XML_USPTO: ["xml", "txt"],
    InputFormat.JSON_DOCLING: ["json"],
}

# MIME types listed for each input format. As with extensions, one MIME type
# may be listed under several formats (e.g. "application/xml").
FormatToMimeType: Dict[InputFormat, List[str]] = {
    InputFormat.DOCX: [
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
    ],
    InputFormat.PPTX: [
        "application/vnd.openxmlformats-officedocument.presentationml.template",
        "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    ],
    InputFormat.HTML: ["text/html", "application/xhtml+xml"],
    InputFormat.XML_PUBMED: ["application/xml"],
    InputFormat.IMAGE: [
        "image/png",
        "image/jpeg",
        "image/tiff",
        "image/gif",
        "image/bmp",
    ],
    InputFormat.PDF: ["application/pdf"],
    InputFormat.ASCIIDOC: ["text/asciidoc"],
    InputFormat.MD: ["text/markdown", "text/x-markdown"],
    InputFormat.CSV: ["text/csv"],
    InputFormat.XLSX: [
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    ],
    InputFormat.XML_USPTO: ["application/xml", "text/plain"],
    InputFormat.JSON_DOCLING: ["application/json"],
}

# Reverse lookup of FormatToMimeType: each MIME type maps to every format
# that lists it, in FormatToMimeType declaration order. A MIME type shared by
# several formats is visited more than once, but each visit computes the same
# list, so the duplicate assignment is harmless.
MimeTypeToFormat: dict[str, list[InputFormat]] = {
    mime: [fmt for fmt in FormatToMimeType if mime in FormatToMimeType[fmt]]
    for value in FormatToMimeType.values()
    for mime in value
}


# Kind of document input: a path or a stream.
class DocInputType(str, Enum):
    PATH = "path"
    STREAM = "stream"


# Component categories referenced by ErrorItem.component_type.
class DoclingComponentType(str, Enum):
    DOCUMENT_BACKEND = "document_backend"
    MODEL = "model"
    DOC_ASSEMBLER = "doc_assembler"
    USER_INPUT = "user_input"


# A single error record: the component category, the module name and the
# error message.
class ErrorItem(BaseModel):
    component_type: DoclingComponentType
    module_name: str
    error_message: str


# A piece of text together with its bounding box and an integer id.
class Cell(BaseModel):
    id: int
    text: str
    bbox: BoundingBox


# A Cell that additionally carries a confidence value.
class OcrCell(Cell):
    confidence: float


# A labelled bounding box grouping cells; clusters may nest via `children`.
class Cluster(BaseModel):
    id: int
    label: DocItemLabel
    bbox: BoundingBox
    confidence: float = 1.0
    cells: List[Cell] = []
    children: List["Cluster"] = []  # Add child cluster support


# Common fields of every page element; specialised by the subclasses below.
class BasePageElement(BaseModel):
    label: DocItemLabel
    id: int
    page_no: int
    cluster: Cluster
    text: Optional[str] = None


# Container for the clusters of a layout prediction.
class LayoutPrediction(BaseModel):
    clusters: List[Cluster] = []


class ContainerElement(
    BasePageElement
):  # Used for Form and Key-Value-Regions, only for typing.
    pass


# Page element describing a table: its OTSL sequence, dimensions and cells.
class Table(BasePageElement):
    otsl_seq: List[str]
    num_rows: int = 0
    num_cols: int = 0
    table_cells: List[TableCell]


# Tables keyed by an integer.
# NOTE(review): the key is presumably the cluster/element id; confirm with
# the code that populates table_map.
class TableStructurePrediction(BaseModel):
    table_map: Dict[int, Table] = {}


# Page element whose `text` is required (it is optional on the base class).
class TextElement(BasePageElement):
    text: str


# Page element describing a figure, with optional classification details.
class FigureElement(BasePageElement):
    annotations: List[PictureDataType] = []
    provenance: Optional[str] = None
    predicted_class: Optional[str] = None
    confidence: Optional[float] = None


# Figure elements keyed by an integer, plus a figure count.
class FigureClassificationPrediction(BaseModel):
    figure_count: int = 0
    figure_map: Dict[int, FigureElement] = {}


# Equation text elements keyed by an integer, plus an equation count.
class EquationPrediction(BaseModel):
    equation_count: int = 0
    equation_map: Dict[int, TextElement] = {}


# Bundle of the optional per-page predictions; each one defaults to None.
class PagePredictions(BaseModel):
    layout: Optional[LayoutPrediction] = None
    tablestructure: Optional[TableStructurePrediction] = None
    figures_classification: Optional[FigureClassificationPrediction] = None
    equations_prediction: Optional[EquationPrediction] = None


# Any concrete element type that can appear in an AssembledUnit.
PageElement = Union[TextElement, Table, FigureElement, ContainerElement]


# Three lists of page elements: all elements, body and headers.
class AssembledUnit(BaseModel):
    elements: List[PageElement] = []
    body: List[PageElement] = []
    headers: List[PageElement] = []


# Pairs a document node item with an image.
class ItemAndImageEnrichmentElement(BaseModel):
    # PIL's Image is not a pydantic-native type, so it must be allowed here.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    item: NodeItem
    image: Image


# A single page: its cells, predictions, assembled elements, and lazily
# rendered images served from an optional backend.
class Page(BaseModel):
    # Required because PIL's Image is used in the private image cache.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    page_no: int
    # page_hash: Optional[str] = None
    size: Optional[Size] = None
    cells: List[Cell] = []
    predictions: PagePredictions = PagePredictions()
    assembled: Optional[AssembledUnit] = None

    _backend: Optional["PdfPageBackend"] = (
        None  # Internal PDF backend. By default it is cleared during assembling.
    )
    _default_image_scale: float = 1.0  # Default image scale for external usage.
    _image_cache: Dict[float, Image] = (
        {}
    )  # Cache of images in different scales. By default it is cleared during assembling.

    def get_image(
        self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
    ) -> Optional[Image]:
        """Return the page image at ``scale``, optionally cropped to ``cropbox``.

        Full-page renders are cached per scale in ``_image_cache``; cropped
        results are never cached.

        Args:
            scale: Scale factor passed to the backend and used as cache key.
            cropbox: Region to extract. ``None`` requests the full page.

        Returns:
            The requested image. When no backend is attached, the cached
            full-page image for ``scale`` is returned, or ``None`` if that
            scale was never cached.
        """
        if self._backend is None:
            # Without a backend nothing new can be rendered; serve the cache.
            # NOTE(review): `cropbox` is ignored on this path, so a caller
            # asking for a crop receives the full cached page. Confirm this
            # is intended before changing it.
            return self._image_cache.get(scale, None)

        if scale not in self._image_cache:
            if cropbox is None:
                self._image_cache[scale] = self._backend.get_page_image(scale=scale)
            else:
                # No full-page render cached at this scale: let the backend
                # produce only the cropped region, without caching it.
                return self._backend.get_page_image(scale=scale, cropbox=cropbox)

        if cropbox is None:
            return self._image_cache[scale]
        else:
            # A full-page render exists at this scale: crop it locally.
            page_im = self._image_cache[scale]
            # The page height is needed to convert the crop box to a
            # top-left origin before scaling it to image coordinates.
            assert self.size is not None
            return page_im.crop(
                cropbox.to_top_left_origin(page_height=self.size.height)
                .scaled(scale=scale)
                .as_tuple()
            )

    @property
    def image(self) -> Optional[Image]:
        """Return the full-page image at the default image scale."""
        return self.get_image(scale=self._default_image_scale)