
* add coverage calculation and push Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * new codecov version and usage of token Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * enable ruff formatter instead of black and isort Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * apply ruff lint fixes Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * apply ruff unsafe fixes Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add removed imports Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * runs 1 on linter issues Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * finalize linter fixes Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * Update pyproject.toml Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> --------- Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
300 lines
7.7 KiB
Python
300 lines
7.7 KiB
Python
from enum import Enum
|
|
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
|
|
|
from docling_core.types.doc import (
|
|
BoundingBox,
|
|
DocItemLabel,
|
|
NodeItem,
|
|
PictureDataType,
|
|
Size,
|
|
TableCell,
|
|
)
|
|
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
|
|
|
# DO NOT REMOVE; explicitly exposed from this location
|
|
from docling_core.types.io import (
|
|
DocumentStream,
|
|
)
|
|
from PIL.Image import Image
|
|
from pydantic import BaseModel, ConfigDict
|
|
|
|
if TYPE_CHECKING:
|
|
from docling.backend.pdf_backend import PdfPageBackend
|
|
|
|
|
|
class ConversionStatus(str, Enum):
|
|
PENDING = "pending"
|
|
STARTED = "started"
|
|
FAILURE = "failure"
|
|
SUCCESS = "success"
|
|
PARTIAL_SUCCESS = "partial_success"
|
|
SKIPPED = "skipped"
|
|
|
|
|
|
class InputFormat(str, Enum):
|
|
"""A document format supported by document backend parsers."""
|
|
|
|
DOCX = "docx"
|
|
PPTX = "pptx"
|
|
HTML = "html"
|
|
IMAGE = "image"
|
|
PDF = "pdf"
|
|
ASCIIDOC = "asciidoc"
|
|
MD = "md"
|
|
CSV = "csv"
|
|
XLSX = "xlsx"
|
|
XML_USPTO = "xml_uspto"
|
|
XML_JATS = "xml_jats"
|
|
JSON_DOCLING = "json_docling"
|
|
|
|
|
|
class OutputFormat(str, Enum):
|
|
MARKDOWN = "md"
|
|
JSON = "json"
|
|
HTML = "html"
|
|
HTML_SPLIT_PAGE = "html_split_page"
|
|
TEXT = "text"
|
|
DOCTAGS = "doctags"
|
|
|
|
|
|
FormatToExtensions: Dict[InputFormat, List[str]] = {
|
|
InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
|
|
InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
|
|
InputFormat.PDF: ["pdf"],
|
|
InputFormat.MD: ["md"],
|
|
InputFormat.HTML: ["html", "htm", "xhtml"],
|
|
InputFormat.XML_JATS: ["xml", "nxml"],
|
|
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
|
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
|
InputFormat.CSV: ["csv"],
|
|
InputFormat.XLSX: ["xlsx"],
|
|
InputFormat.XML_USPTO: ["xml", "txt"],
|
|
InputFormat.JSON_DOCLING: ["json"],
|
|
}
|
|
|
|
FormatToMimeType: Dict[InputFormat, List[str]] = {
|
|
InputFormat.DOCX: [
|
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
|
"application/vnd.openxmlformats-officedocument.wordprocessingml.template",
|
|
],
|
|
InputFormat.PPTX: [
|
|
"application/vnd.openxmlformats-officedocument.presentationml.template",
|
|
"application/vnd.openxmlformats-officedocument.presentationml.slideshow",
|
|
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
|
],
|
|
InputFormat.HTML: ["text/html", "application/xhtml+xml"],
|
|
InputFormat.XML_JATS: ["application/xml"],
|
|
InputFormat.IMAGE: [
|
|
"image/png",
|
|
"image/jpeg",
|
|
"image/tiff",
|
|
"image/gif",
|
|
"image/bmp",
|
|
],
|
|
InputFormat.PDF: ["application/pdf"],
|
|
InputFormat.ASCIIDOC: ["text/asciidoc"],
|
|
InputFormat.MD: ["text/markdown", "text/x-markdown"],
|
|
InputFormat.CSV: ["text/csv"],
|
|
InputFormat.XLSX: [
|
|
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
|
],
|
|
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
|
|
InputFormat.JSON_DOCLING: ["application/json"],
|
|
}
|
|
|
|
MimeTypeToFormat: dict[str, list[InputFormat]] = {
|
|
mime: [fmt for fmt in FormatToMimeType if mime in FormatToMimeType[fmt]]
|
|
for value in FormatToMimeType.values()
|
|
for mime in value
|
|
}
|
|
|
|
|
|
class DocInputType(str, Enum):
|
|
PATH = "path"
|
|
STREAM = "stream"
|
|
|
|
|
|
class DoclingComponentType(str, Enum):
|
|
DOCUMENT_BACKEND = "document_backend"
|
|
MODEL = "model"
|
|
DOC_ASSEMBLER = "doc_assembler"
|
|
USER_INPUT = "user_input"
|
|
|
|
|
|
class ErrorItem(BaseModel):
|
|
component_type: DoclingComponentType
|
|
module_name: str
|
|
error_message: str
|
|
|
|
|
|
# class Cell(BaseModel):
|
|
# id: int
|
|
# text: str
|
|
# bbox: BoundingBox
|
|
|
|
|
|
class Cluster(BaseModel):
|
|
id: int
|
|
label: DocItemLabel
|
|
bbox: BoundingBox
|
|
confidence: float = 1.0
|
|
cells: List[TextCell] = []
|
|
children: List["Cluster"] = [] # Add child cluster support
|
|
|
|
|
|
class BasePageElement(BaseModel):
|
|
label: DocItemLabel
|
|
id: int
|
|
page_no: int
|
|
cluster: Cluster
|
|
text: Optional[str] = None
|
|
|
|
|
|
class LayoutPrediction(BaseModel):
|
|
clusters: List[Cluster] = []
|
|
|
|
|
|
class VlmPrediction(BaseModel):
|
|
text: str = ""
|
|
|
|
|
|
class ContainerElement(
|
|
BasePageElement
|
|
): # Used for Form and Key-Value-Regions, only for typing.
|
|
pass
|
|
|
|
|
|
class Table(BasePageElement):
|
|
otsl_seq: List[str]
|
|
num_rows: int = 0
|
|
num_cols: int = 0
|
|
table_cells: List[TableCell]
|
|
|
|
|
|
class TableStructurePrediction(BaseModel):
|
|
table_map: Dict[int, Table] = {}
|
|
|
|
|
|
class TextElement(BasePageElement):
|
|
text: str
|
|
|
|
|
|
class FigureElement(BasePageElement):
|
|
annotations: List[PictureDataType] = []
|
|
provenance: Optional[str] = None
|
|
predicted_class: Optional[str] = None
|
|
confidence: Optional[float] = None
|
|
|
|
|
|
class FigureClassificationPrediction(BaseModel):
|
|
figure_count: int = 0
|
|
figure_map: Dict[int, FigureElement] = {}
|
|
|
|
|
|
class EquationPrediction(BaseModel):
|
|
equation_count: int = 0
|
|
equation_map: Dict[int, TextElement] = {}
|
|
|
|
|
|
class PagePredictions(BaseModel):
|
|
layout: Optional[LayoutPrediction] = None
|
|
tablestructure: Optional[TableStructurePrediction] = None
|
|
figures_classification: Optional[FigureClassificationPrediction] = None
|
|
equations_prediction: Optional[EquationPrediction] = None
|
|
vlm_response: Optional[VlmPrediction] = None
|
|
|
|
|
|
PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
|
|
|
|
|
|
class AssembledUnit(BaseModel):
|
|
elements: List[PageElement] = []
|
|
body: List[PageElement] = []
|
|
headers: List[PageElement] = []
|
|
|
|
|
|
class ItemAndImageEnrichmentElement(BaseModel):
|
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
|
|
item: NodeItem
|
|
image: Image
|
|
|
|
|
|
class Page(BaseModel):
|
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
|
|
page_no: int
|
|
# page_hash: Optional[str] = None
|
|
size: Optional[Size] = None
|
|
cells: List[TextCell] = []
|
|
parsed_page: Optional[SegmentedPdfPage] = None
|
|
predictions: PagePredictions = PagePredictions()
|
|
assembled: Optional[AssembledUnit] = None
|
|
|
|
_backend: Optional["PdfPageBackend"] = (
|
|
None # Internal PDF backend. By default it is cleared during assembling.
|
|
)
|
|
_default_image_scale: float = 1.0 # Default image scale for external usage.
|
|
_image_cache: Dict[
|
|
float, Image
|
|
] = {} # Cache of images in different scales. By default it is cleared during assembling.
|
|
|
|
def get_image(
|
|
self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
|
|
) -> Optional[Image]:
|
|
if self._backend is None:
|
|
return self._image_cache.get(scale, None)
|
|
|
|
if scale not in self._image_cache:
|
|
if cropbox is None:
|
|
self._image_cache[scale] = self._backend.get_page_image(scale=scale)
|
|
else:
|
|
return self._backend.get_page_image(scale=scale, cropbox=cropbox)
|
|
|
|
if cropbox is None:
|
|
return self._image_cache[scale]
|
|
else:
|
|
page_im = self._image_cache[scale]
|
|
assert self.size is not None
|
|
return page_im.crop(
|
|
cropbox.to_top_left_origin(page_height=self.size.height)
|
|
.scaled(scale=scale)
|
|
.as_tuple()
|
|
)
|
|
|
|
@property
|
|
def image(self) -> Optional[Image]:
|
|
return self.get_image(scale=self._default_image_scale)
|
|
|
|
|
|
## OpenAI API Request / Response Models ##
|
|
|
|
|
|
class OpenAiChatMessage(BaseModel):
|
|
role: str
|
|
content: str
|
|
|
|
|
|
class OpenAiResponseChoice(BaseModel):
|
|
index: int
|
|
message: OpenAiChatMessage
|
|
finish_reason: str
|
|
|
|
|
|
class OpenAiResponseUsage(BaseModel):
|
|
prompt_tokens: int
|
|
completion_tokens: int
|
|
total_tokens: int
|
|
|
|
|
|
class OpenAiApiResponse(BaseModel):
|
|
model_config = ConfigDict(
|
|
protected_namespaces=(),
|
|
)
|
|
|
|
id: str
|
|
model: Optional[str] = None # returned by openai
|
|
choices: List[OpenAiResponseChoice]
|
|
created: int
|
|
usage: OpenAiResponseUsage
|