
* Skeleton for SmolDocling model and VLM Pipeline Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * wip smolDocling inference and vlm pipeline Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * WIP, first working code for inference of SmolDocling, and vlm pipeline assembly code, example included. Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Fixes to preserve page image and demo export to html Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Enabled figure support in vlm_pipeline Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Fix for table span compute in vlm_pipeline Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Properly propagating image data per page, together with predicted tags in VLM pipeline. This enables correct figure extraction and page numbers in provenances Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Cleaned up logs, added pages to vlm_pipeline, basic timing per page measurement in smol_docling models Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Replaced hardcoded otsl tokens with the ones from docling-core tokens.py enum Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Added tokens/sec measurement, improved example Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Added capability for vlm_pipeline to grab text from preconfigured backend Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Exposed "force_backend_text" as pipeline parameter Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Flipped keep_backend to True for vlm_pipeline assembly to work Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Updated vlm pipeline assembly and smol docling model code to support updated doctags Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Fixing doctags starting tag, that broke elements on first line during assembly Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Introduced SmolDoclingOptions to configure model parameters (such as query and artifacts path) via client code, 
see example in minimal_smol_docling. Provisioning for other potential vlm all-in-one models. Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Moved artifacts_path for SmolDocling into vlm_options instead of global pipeline option Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * New assembly code for latest model revision, updated prompt and parsing of doctags, updated logging Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Updated example of Smol Docling usage Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Added captions for the images for SmolDocling assembly code, improved provenance definition for all elements Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Update minimal smoldocling example Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix repo id Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Cleaned up unnecessary logging Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * More elegant solution in removing the input prompt Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * removed minimal_smol_docling example from CI checks Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Removed special html code wrapping when exporting to docling document, cleaned up comments Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Addressing PR comments, added enabled property to SmolDocling, and related VLM pipeline option, few other minor things Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Moved keep_backend = True to vlm pipeline Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * removed pipeline_options.generate_table_images from vlm_pipeline (deprecated in the pipelines) Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Added example on how to get original predicted doctags in minimal_smol_docling Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * removing changes from base_pipeline Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Replaced remaining strings to appropriate enums Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Updated 
poetry.lock Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * re-built poetry.lock Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Generalize and refactor VLM pipeline and models Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Rename example Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Move imports Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Expose control over using flash_attention_2 Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix VLM example exclusion in CI Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add back device_map and accelerate Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Make drawing code resilient against bad bboxes Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * chore: clean up code and comments Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * chore: more cleanup Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * chore: fix leftover .to(device) Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * fix: add proper table provenance Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
384 lines
12 KiB
Python
384 lines
12 KiB
Python
import logging
|
|
import os
|
|
import re
|
|
import warnings
|
|
from enum import Enum
|
|
from pathlib import Path
|
|
from typing import Annotated, Any, Dict, List, Literal, Optional, Union
|
|
|
|
from pydantic import (
|
|
AnyUrl,
|
|
BaseModel,
|
|
ConfigDict,
|
|
Field,
|
|
field_validator,
|
|
model_validator,
|
|
validator,
|
|
)
|
|
from pydantic_settings import (
|
|
BaseSettings,
|
|
PydanticBaseSettingsSource,
|
|
SettingsConfigDict,
|
|
)
|
|
from typing_extensions import deprecated
|
|
|
|
_log = logging.getLogger(__name__)
|
|
|
|
|
|
class AcceleratorDevice(str, Enum):
    """Named devices on which model inference can run.

    The ``str`` mix-in makes every member a plain string as well, so a
    member compares equal to its lowercase value (e.g. ``"cpu"``).
    """

    AUTO = "auto"
    CPU = "cpu"
    CUDA = "cuda"
    MPS = "mps"
|
|
|
|
|
|
class AcceleratorOptions(BaseSettings):
    """Settings for the inference device and the number of threads.

    The settings are configured with the ``DOCLING_`` environment prefix
    (see ``model_config``). In addition, ``OMP_NUM_THREADS`` is honoured as
    an alternative source for ``num_threads``; see
    ``check_alternative_envvars``.
    """

    model_config = SettingsConfigDict(
        env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
    )

    num_threads: int = 4
    device: Union[str, AcceleratorDevice] = "auto"
    cuda_use_flash_attention2: bool = False

    @field_validator("device")
    @classmethod
    def validate_device(cls, value):
        """Accept only a known device name or an indexed CUDA device.

        Args:
            value: The already type-validated ``device`` value.

        Returns:
            ``value`` unchanged when it is one of the ``AcceleratorDevice``
            values or has the form ``cuda`` / ``cuda:N``.

        Raises:
            ValueError: For any other value.
        """
        # "auto", "cpu", "cuda", "mps", or "cuda:N"
        # fullmatch (not match + "$") so that a trailing newline such as
        # "cuda:0\n" is rejected instead of silently accepted.
        if value in {d.value for d in AcceleratorDevice} or re.fullmatch(
            r"cuda(:\d+)?", value
        ):
            return value
        raise ValueError(
            "Invalid device option. Use 'auto', 'cpu', 'mps', 'cuda', or 'cuda:N'."
        )

    @model_validator(mode="before")
    @classmethod
    def check_alternative_envvars(cls, data: Any) -> Any:
        """
        Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
        The alternative envvar is used only if it is valid and the regular envvar is not set.

        Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
        the same functionality. In case the alias envvar is set and the user tries to override the
        parameter in settings initialization, Pydantic treats the parameter provided in __init__()
        as an extra input instead of simply overwriting the envvar value for that parameter.

        Args:
            data: The raw input handed to the model before field validation.
                Only ``dict`` inputs are inspected; anything else is passed
                through untouched.

        Returns:
            ``data``, with ``num_threads`` filled in from ``OMP_NUM_THREADS``
            when applicable.
        """
        if isinstance(data, dict):
            input_num_threads = data.get("num_threads")
            # Check if to set the num_threads from the alternative envvar
            if input_num_threads is None:
                docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
                omp_num_threads = os.getenv("OMP_NUM_THREADS")
                if docling_num_threads is None and omp_num_threads is not None:
                    try:
                        data["num_threads"] = int(omp_num_threads)
                    except ValueError:
                        # Best effort: keep the default and report the bad value.
                        _log.error(
                            "Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
                            omp_num_threads,
                        )
        return data
|
|
|
|
|
|
class TableFormerMode(str, Enum):
    """Operating modes selectable for the TableFormer model.

    Members are plain strings as well (``str`` mix-in).
    """

    FAST = "fast"
    ACCURATE = "accurate"
|
|
|
|
|
|
class TableStructureOptions(BaseModel):
    """Settings for table structure extraction."""

    # True:  match predictions back to PDF cells. Can break table output if
    #        PDF cells are merged across table columns.
    # False: let the table structure model define the text cells and ignore
    #        the PDF cells.
    do_cell_matching: bool = True
    mode: TableFormerMode = TableFormerMode.FAST
|
|
|
|
|
|
class OcrOptions(BaseModel):
    """Fields common to all OCR engine option classes.

    ``kind`` and ``lang`` have no default here; the engine-specific
    subclasses in this module supply them.
    """

    kind: str
    lang: List[str]
    # If enabled, a full page OCR is always applied.
    force_full_page_ocr: bool = False
    # Percentage of the area for a bitmap to be processed with OCR.
    # NOTE(review): the value looks like a fraction (0.05), not a percent
    # figure - confirm against the code that consumes it.
    bitmap_area_threshold: float = 0.05
|
|
|
|
|
|
class RapidOcrOptions(OcrOptions):
    """Configuration for the RapidOCR engine.

    Unknown fields are rejected (``extra="forbid"``).
    """

    kind: Literal["rapidocr"] = "rapidocr"

    # English and Chinese are the most commonly used models and have been
    # tested with RapidOCR. However, language as a parameter is not supported
    # by rapidocr yet, hence changing this option doesn't affect anything.
    # For more details on supported languages by RapidOCR visit
    # https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/
    lang: List[str] = ["english", "chinese"]

    # For more details on the following options visit
    # https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/
    text_score: float = 0.5  # same default as rapidocr

    use_det: Optional[bool] = None  # same default as rapidocr
    use_cls: Optional[bool] = None  # same default as rapidocr
    use_rec: Optional[bool] = None  # same default as rapidocr

    # Device selection is currently disabled; kept for reference:
    # class Device(Enum):
    #     CPU = "CPU"
    #     CUDA = "CUDA"
    #     DIRECTML = "DIRECTML"
    #     AUTO = "AUTO"

    # device: Device = Device.AUTO  # Default value is AUTO

    print_verbose: bool = False  # same default as rapidocr

    det_model_path: Optional[str] = None  # same default as rapidocr
    cls_model_path: Optional[str] = None  # same default as rapidocr
    rec_model_path: Optional[str] = None  # same default as rapidocr
    rec_keys_path: Optional[str] = None  # same default as rapidocr

    model_config = ConfigDict(extra="forbid")
|
|
|
|
|
|
class EasyOcrOptions(OcrOptions):
    """Configuration for the EasyOCR engine.

    Unknown fields are rejected (``extra="forbid"``).
    """

    kind: Literal["easyocr"] = "easyocr"
    lang: List[str] = ["fr", "de", "es", "en"]

    use_gpu: Optional[bool] = None

    confidence_threshold: float = 0.5

    model_storage_directory: Optional[str] = None
    recog_network: Optional[str] = "standard"
    download_enabled: bool = True

    # NOTE(review): protected_namespaces is presumably emptied so that the
    # `model_storage_directory` field does not clash with pydantic's
    # protected "model_" prefix - confirm.
    model_config = ConfigDict(extra="forbid", protected_namespaces=())
|
|
|
|
|
|
class TesseractCliOcrOptions(OcrOptions):
    """Configuration for the TesseractCli engine.

    Unknown fields are rejected (``extra="forbid"``).
    """

    kind: Literal["tesseract"] = "tesseract"
    lang: List[str] = ["fra", "deu", "spa", "eng"]
    tesseract_cmd: str = "tesseract"
    path: Optional[str] = None

    model_config = ConfigDict(extra="forbid")
|
|
|
|
|
|
class TesseractOcrOptions(OcrOptions):
    """Configuration for the Tesseract engine.

    Unknown fields are rejected (``extra="forbid"``).
    """

    kind: Literal["tesserocr"] = "tesserocr"
    lang: List[str] = ["fra", "deu", "spa", "eng"]
    path: Optional[str] = None

    model_config = ConfigDict(extra="forbid")
|
|
|
|
|
|
class OcrMacOptions(OcrOptions):
    """Configuration for the Mac OCR engine.

    Unknown fields are rejected (``extra="forbid"``).
    """

    kind: Literal["ocrmac"] = "ocrmac"
    lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
    recognition: str = "accurate"
    framework: str = "vision"

    model_config = ConfigDict(extra="forbid")
|
|
|
|
|
|
class PictureDescriptionBaseOptions(BaseModel):
    """Fields common to the picture description option classes.

    ``kind`` has no default here; the subclasses in this module pin it to a
    literal value.
    """

    kind: str
    batch_size: int = 8
    scale: float = 2

    # Percentage of the area for a bitmap to be processed with the models.
    # NOTE(review): the value looks like a fraction (0.2), not a percent
    # figure - confirm against the code that consumes it.
    bitmap_area_threshold: float = 0.2
|
|
|
|
|
|
class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
    """Picture description options of kind ``"api"``.

    Holds the endpoint URL, extra headers and parameters, a timeout and the
    prompt text.
    """

    kind: Literal["api"] = "api"

    url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions")
    headers: Dict[str, str] = {}
    params: Dict[str, Any] = {}
    timeout: float = 20

    prompt: str = "Describe this image in a few sentences."
    provenance: str = ""
|
|
|
|
|
|
class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
    """Picture description options of kind ``"vlm"``.

    ``repo_id`` is required; the prompt and generation config have defaults.
    """

    kind: Literal["vlm"] = "vlm"

    repo_id: str
    prompt: str = "Describe this image in a few sentences."
    # Config from here https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig
    generation_config: Dict[str, Any] = {"max_new_tokens": 200, "do_sample": False}

    @property
    def repo_cache_folder(self) -> str:
        """Return ``repo_id`` with every ``/`` replaced by ``--``."""
        return "--".join(self.repo_id.split("/"))
|
|
|
|
|
|
# Ready-made picture description presets.

# SmolVLM preset; uses the default prompt of PictureDescriptionVlmOptions.
smolvlm_picture_description = PictureDescriptionVlmOptions(
    repo_id="HuggingFaceTB/SmolVLM-256M-Instruct",
)
# phi_picture_description = PictureDescriptionVlmOptions(repo_id="microsoft/Phi-3-vision-128k-instruct")

# Granite Vision preset with its own prompt.
granite_picture_description = PictureDescriptionVlmOptions(
    repo_id="ibm-granite/granite-vision-3.1-2b-preview",
    prompt="What is shown in this image?",
)
|
|
|
|
|
|
class BaseVlmOptions(BaseModel):
    """Base fields for VLM options: a ``kind`` tag and the ``prompt`` text.

    Both fields are required at this level.
    """

    kind: str
    prompt: str
|
|
|
|
|
|
class ResponseFormat(str, Enum):
    """Response formats that can be set on VLM options.

    Members are plain strings as well (``str`` mix-in).
    """

    DOCTAGS = "doctags"
    MARKDOWN = "markdown"
|
|
|
|
|
|
class HuggingFaceVlmOptions(BaseVlmOptions):
    """VLM options of kind ``"hf_model_options"``.

    ``repo_id`` and ``response_format`` are required (as is the inherited
    ``prompt``); the quantization-related fields have defaults.
    """

    kind: Literal["hf_model_options"] = "hf_model_options"

    repo_id: str
    load_in_8bit: bool = True
    llm_int8_threshold: float = 6.0
    quantized: bool = False

    response_format: ResponseFormat

    @property
    def repo_cache_folder(self) -> str:
        """Return ``repo_id`` with every ``/`` replaced by ``--``."""
        return "--".join(self.repo_id.split("/"))
|
|
|
|
|
|
# Ready-made VLM conversion presets.

# SmolDocling preset producing DocTags.
smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
    repo_id="ds4sd/SmolDocling-256M-preview",
    prompt="Convert this page to docling.",
    response_format=ResponseFormat.DOCTAGS,
)

# Granite Vision preset producing Markdown.
granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
    repo_id="ibm-granite/granite-vision-3.1-2b-preview",
    # prompt="OCR the full page to markdown.",
    prompt="OCR this image.",
    response_format=ResponseFormat.MARKDOWN,
)
|
|
|
|
|
|
# Define an enum for the backend options
|
|
class PdfBackend(str, Enum):
    """Identifiers of the valid PDF backends.

    Members are plain strings as well (``str`` mix-in).
    """

    PYPDFIUM2 = "pypdfium2"
    DLPARSE_V1 = "dlparse_v1"
    DLPARSE_V2 = "dlparse_v2"
|
|
|
|
|
|
# Define an enum for the ocr engines
|
|
class OcrEngine(str, Enum):
    """Identifiers of the valid OCR engines.

    Members are plain strings as well (``str`` mix-in).
    """

    EASYOCR = "easyocr"
    TESSERACT_CLI = "tesseract_cli"
    TESSERACT = "tesseract"
    OCRMAC = "ocrmac"
    RAPIDOCR = "rapidocr"
|
|
|
|
|
|
class PipelineOptions(BaseModel):
    """Options common to all pipelines."""

    # This default will be set to False on a future version of docling.
    create_legacy_output: bool = True
    document_timeout: Optional[float] = None
    # The default AcceleratorOptions instance is built once, when this class
    # body is executed.
    accelerator_options: AcceleratorOptions = AcceleratorOptions()
    enable_remote_services: bool = False
|
|
|
|
|
|
class PaginatedPipelineOptions(PipelineOptions):
    """Pipeline options extended with image scale and image generation flags."""

    images_scale: float = 1.0
    generate_page_images: bool = False
    generate_picture_images: bool = False
|
|
|
|
|
|
class VlmPipelineOptions(PaginatedPipelineOptions):
    """Options for the VLM pipeline.

    Defaults to the SmolDocling conversion preset and turns page image
    generation on.
    """

    artifacts_path: Optional[Union[Path, str]] = None

    # Overrides the inherited default (False).
    generate_page_images: bool = True
    # (To be used with vlms, or other generative models)
    # If True, text from backend will be used instead of generated text.
    force_backend_text: bool = False
    vlm_options: Union[HuggingFaceVlmOptions] = smoldocling_vlm_conversion_options
|
|
|
|
|
|
class PdfPipelineOptions(PaginatedPipelineOptions):
    """Configuration of the PDF pipeline."""

    artifacts_path: Optional[Union[Path, str]] = None

    # Feature switches.
    # True: perform table structure extraction
    do_table_structure: bool = True
    # True: perform OCR, replace programmatic PDF text
    do_ocr: bool = True
    # True: perform code OCR
    do_code_enrichment: bool = False
    # True: perform formula OCR, return Latex code
    do_formula_enrichment: bool = False
    # True: classify pictures in documents
    do_picture_classification: bool = False
    # True: run describe pictures in documents
    do_picture_description: bool = False
    # (To be used with vlms, or other generative models)
    # If True, text from backend will be used instead of generated text.
    force_backend_text: bool = False

    table_structure_options: TableStructureOptions = TableStructureOptions()
    # The concrete options class is selected through its `kind` field.
    ocr_options: Union[
        EasyOcrOptions,
        TesseractCliOcrOptions,
        TesseractOcrOptions,
        OcrMacOptions,
        RapidOcrOptions,
    ] = Field(EasyOcrOptions(), discriminator="kind")
    # The concrete options class is selected through its `kind` field.
    picture_description_options: Annotated[
        Union[PictureDescriptionApiOptions, PictureDescriptionVlmOptions],
        Field(discriminator="kind"),
    ] = smolvlm_picture_description

    # Declared again here with the same defaults as the parent class.
    images_scale: float = 1.0
    generate_page_images: bool = False
    generate_picture_images: bool = False
    generate_table_images: bool = Field(
        default=False,
        deprecated=(
            "Field `generate_table_images` is deprecated. "
            "To obtain table images, set `PdfPipelineOptions.generate_page_images = True` "
            "before conversion and then use the `TableItem.get_image` function."
        ),
    )
|