Docling/docling/datamodel/base_models.py
Peter W. J. Staar c0ba88edf1
feat(cli): add option for html with split-page mode (#1355)
* updated the cli to output html in split-page mode

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* add pin for new docling-core with html split argument

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* relock with fixed html export in docling-core

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update test results

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update more tests

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update example

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update lock with docling-core fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update test results

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add again chunking extras

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
2025-04-14 08:41:50 +02:00

298 lines
7.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from enum import Enum
from typing import TYPE_CHECKING, Dict, List, Optional, Union
from docling_core.types.doc import (
BoundingBox,
DocItemLabel,
NodeItem,
PictureDataType,
Size,
TableCell,
)
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
from docling_core.types.io import ( # DO NOT REMOVE; explicitly exposed from this location
DocumentStream,
)
from PIL.Image import Image
from pydantic import BaseModel, ConfigDict
if TYPE_CHECKING:
from docling.backend.pdf_backend import PdfPageBackend
# Status values attached to a conversion. Because of the ``str`` mixin every
# member is itself a string and compares equal to its plain value
# (e.g. ``ConversionStatus.SUCCESS == "success"``).
class ConversionStatus(str, Enum):
    PENDING = "pending"
    STARTED = "started"
    FAILURE = "failure"
    SUCCESS = "success"
    PARTIAL_SUCCESS = "partial_success"
    SKIPPED = "skipped"
class InputFormat(str, Enum):
    """A document format supported by document backend parsers."""

    # Members are strings (``str`` mixin); the declaration order below is the
    # iteration order of the enum.
    DOCX = "docx"
    PPTX = "pptx"
    HTML = "html"
    IMAGE = "image"
    PDF = "pdf"
    ASCIIDOC = "asciidoc"
    MD = "md"
    CSV = "csv"
    XLSX = "xlsx"
    XML_USPTO = "xml_uspto"
    XML_JATS = "xml_jats"
    JSON_DOCLING = "json_docling"
# Export format identifiers. Members are strings (``str`` mixin), so they
# compare equal to their plain values such as "md" or "html_split_page".
class OutputFormat(str, Enum):
    MARKDOWN = "md"
    JSON = "json"
    HTML = "html"
    HTML_SPLIT_PAGE = "html_split_page"
    TEXT = "text"
    DOCTAGS = "doctags"
# File extensions (lower-case, without the leading dot) listed per input format.
# An extension is not always unique to one format: "xml" is listed under both
# XML_JATS and XML_USPTO.
FormatToExtensions: Dict[InputFormat, List[str]] = {
    InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
    InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
    InputFormat.PDF: ["pdf"],
    InputFormat.MD: ["md"],
    InputFormat.HTML: ["html", "htm", "xhtml"],
    InputFormat.XML_JATS: ["xml", "nxml"],
    InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
    InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
    InputFormat.CSV: ["csv"],
    InputFormat.XLSX: ["xlsx"],
    InputFormat.XML_USPTO: ["xml", "txt"],
    InputFormat.JSON_DOCLING: ["json"],
}
# MIME types listed per input format. A MIME type may be listed under more
# than one format: "application/xml" appears for both XML_JATS and XML_USPTO.
FormatToMimeType: Dict[InputFormat, List[str]] = {
    InputFormat.DOCX: [
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
    ],
    InputFormat.PPTX: [
        "application/vnd.openxmlformats-officedocument.presentationml.template",
        "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
    ],
    InputFormat.HTML: ["text/html", "application/xhtml+xml"],
    InputFormat.XML_JATS: ["application/xml"],
    InputFormat.IMAGE: [
        "image/png",
        "image/jpeg",
        "image/tiff",
        "image/gif",
        "image/bmp",
    ],
    InputFormat.PDF: ["application/pdf"],
    InputFormat.ASCIIDOC: ["text/asciidoc"],
    InputFormat.MD: ["text/markdown", "text/x-markdown"],
    InputFormat.CSV: ["text/csv"],
    InputFormat.XLSX: [
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    ],
    InputFormat.XML_USPTO: ["application/xml", "text/plain"],
    InputFormat.JSON_DOCLING: ["application/json"],
}
# Reverse lookup of FormatToMimeType: every MIME type maps to the list of all
# formats that declare it, in FormatToMimeType's key order. A MIME type that is
# declared by several formats is simply recomputed (with the same result) each
# time it is encountered.
MimeTypeToFormat: dict[str, list[InputFormat]] = {
    mime: [
        fmt
        for fmt, accepted in FormatToMimeType.items()
        if mime in accepted
    ]
    for mime_list in FormatToMimeType.values()
    for mime in mime_list
}
# The two kinds of document input named here: a path or a stream.
# Members are strings (``str`` mixin) and compare equal to their plain values.
class DocInputType(str, Enum):
    PATH = "path"
    STREAM = "stream"
# Component categories; used in this module as the type of
# ``ErrorItem.component_type``. Members are strings (``str`` mixin).
class DoclingComponentType(str, Enum):
    DOCUMENT_BACKEND = "document_backend"
    MODEL = "model"
    DOC_ASSEMBLER = "doc_assembler"
    USER_INPUT = "user_input"
# Pydantic model describing a single error: the component category it is
# attributed to, a module name, and the message text. All fields are required.
class ErrorItem(BaseModel):
    component_type: DoclingComponentType
    module_name: str
    error_message: str
# class Cell(BaseModel):
# id: int
# text: str
# bbox: BoundingBox
# A labelled bounding box with a confidence value, the text cells attached to
# it, and (optionally) nested child clusters.
class Cluster(BaseModel):
    id: int
    label: DocItemLabel
    bbox: BoundingBox
    confidence: float = 1.0
    cells: List[TextCell] = []
    # Nested clusters; the string forward reference lets the model refer to itself.
    children: List["Cluster"] = []
# Fields common to the page element models of this module (TextElement, Table,
# FigureElement and ContainerElement all subclass it).
class BasePageElement(BaseModel):
    label: DocItemLabel
    id: int
    page_no: int
    cluster: Cluster
    # Optional here; TextElement redeclares it as a required ``str``.
    text: Optional[str] = None
# Layout prediction result: a list of clusters (empty by default).
class LayoutPrediction(BaseModel):
    clusters: List[Cluster] = []
# VLM prediction result: a single text field, empty string by default.
class VlmPrediction(BaseModel):
    text: str = ""
# Used for Form and Key-Value-Regions, only for typing.
# Adds no fields of its own on top of BasePageElement.
class ContainerElement(BasePageElement):
    pass
# Page element for a table: the BasePageElement fields plus the table's
# structure sequence, its dimensions and its cells.
class Table(BasePageElement):
    # Sequence of string tokens (presumably OTSL structure tokens -- TODO confirm).
    otsl_seq: List[str]
    num_rows: int = 0
    num_cols: int = 0
    table_cells: List[TableCell]
# Table structure prediction result: Table objects keyed by an int
# (presumably the id of the table element/cluster -- TODO confirm).
class TableStructurePrediction(BaseModel):
    table_map: Dict[int, Table] = {}
# Page element whose text is mandatory: narrows BasePageElement.text from
# ``Optional[str] = None`` to a required ``str``.
class TextElement(BasePageElement):
    text: str
# Page element for a figure: the BasePageElement fields plus annotations and
# optional provenance / predicted class / confidence values.
class FigureElement(BasePageElement):
    annotations: List[PictureDataType] = []
    provenance: Optional[str] = None
    predicted_class: Optional[str] = None
    confidence: Optional[float] = None
# Figure classification result: a count plus FigureElement objects keyed by an
# int (presumably the element id -- TODO confirm).
class FigureClassificationPrediction(BaseModel):
    figure_count: int = 0
    figure_map: Dict[int, FigureElement] = {}
# Equation prediction result: a count plus TextElement objects keyed by an int
# (presumably the element id -- TODO confirm).
class EquationPrediction(BaseModel):
    equation_count: int = 0
    equation_map: Dict[int, TextElement] = {}
# Bundle of the per-page prediction results. Every slot is optional and
# defaults to None.
class PagePredictions(BaseModel):
    layout: Optional[LayoutPrediction] = None
    tablestructure: Optional[TableStructurePrediction] = None
    figures_classification: Optional[FigureClassificationPrediction] = None
    equations_prediction: Optional[EquationPrediction] = None
    vlm_response: Optional[VlmPrediction] = None
# Type alias: any of the concrete page element models defined in this module.
PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
# Three lists of page elements -- all elements, body and headers -- each empty
# by default.
class AssembledUnit(BaseModel):
    elements: List[PageElement] = []
    body: List[PageElement] = []
    headers: List[PageElement] = []
# Pairs a document node item with an image.
class ItemAndImageEnrichmentElement(BaseModel):
    # PIL's Image is not a pydantic type, so arbitrary types must be allowed.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    item: NodeItem
    image: Image
# One page of a document: its number, size, text cells, parsed representation,
# predictions and assembled unit, plus private state used to render and cache
# page images.
class Page(BaseModel):
    # PIL's Image (used by the private image cache) is not a pydantic type.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    page_no: int
    # page_hash: Optional[str] = None
    size: Optional[Size] = None
    cells: List[TextCell] = []
    parsed_page: Optional[SegmentedPdfPage] = None
    predictions: PagePredictions = PagePredictions()
    assembled: Optional[AssembledUnit] = None

    # Internal PDF backend. By default it is cleared during assembling.
    _backend: Optional["PdfPageBackend"] = None
    # Default image scale for external usage.
    _default_image_scale: float = 1.0
    # Cache of images in different scales. By default it is cleared during assembling.
    _image_cache: Dict[float, Image] = {}

    def get_image(
        self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
    ) -> Optional[Image]:
        """Return the page image at ``scale``, optionally cropped to ``cropbox``.

        Args:
            scale: Scale factor; also the key used in the private image cache.
            cropbox: Optional region to crop to. ``None`` means the full page.

        Returns:
            The requested image, or ``None`` when no backend is attached and
            nothing is cached for ``scale``.

        Raises:
            AssertionError: If a crop of a cached image is requested while
                ``self.size`` is ``None``.
        """
        if self._backend is None:
            # Without a backend only the cache can answer.
            # NOTE(review): ``cropbox`` is ignored on this path -- the cached
            # full-page image (or None) is returned as-is.
            return self._image_cache.get(scale, None)

        if scale not in self._image_cache:
            if cropbox is not None:
                # Cache miss with a crop: let the backend render just the
                # crop; the result is deliberately not cached.
                return self._backend.get_page_image(scale=scale, cropbox=cropbox)
            self._image_cache[scale] = self._backend.get_page_image(scale=scale)

        cached_image = self._image_cache[scale]
        if cropbox is None:
            return cached_image

        # Crop the cached full-page image: convert the box to a top-left
        # origin, scale it to the image's resolution, and pass it as a tuple.
        assert self.size is not None
        crop_region = (
            cropbox.to_top_left_origin(page_height=self.size.height)
            .scaled(scale=scale)
            .as_tuple()
        )
        return cached_image.crop(crop_region)

    @property
    def image(self) -> Optional[Image]:
        """Page image at the default scale (``_default_image_scale``)."""
        return self.get_image(scale=self._default_image_scale)
## OpenAI API Request / Response Models ##
# A single chat message: a role and its text content, both required strings.
class OpenAiChatMessage(BaseModel):
    role: str
    content: str
# One entry of a response's ``choices`` list: its index, the message, and the
# finish reason string.
class OpenAiResponseChoice(BaseModel):
    index: int
    message: OpenAiChatMessage
    finish_reason: str
# Token counters of a response: prompt, completion and total.
class OpenAiResponseUsage(BaseModel):
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int
# Top-level response model: id, optional model name, choices, creation value
# and usage counters.
class OpenAiApiResponse(BaseModel):
    # An empty tuple switches off pydantic's protected-namespace check for
    # this model.
    model_config = ConfigDict(protected_namespaces=())

    id: str
    model: Optional[str] = None  # returned by openai
    choices: List[OpenAiResponseChoice]
    created: int
    usage: OpenAiResponseUsage