feat: Introduce support for GPU Accelerators (#593)
* Upgraded Layout Postprocessing, sending old code back to ERZ Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Implement hierarchical cluster layout processing Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Pass nested cluster processing through full pipeline Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Pass nested clusters through GLM as payload Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Move to_docling_document from ds-glm to this repo Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Clean up imports again Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * feat(Accelerator): Introduce options to control the num_threads and device from API, envvars, CLI. - Introduce the AcceleratorOptions, AcceleratorDevice and use them to set the device where the models run. - Introduce the accelerator_utils with a function to decide the device and resolve the AUTO setting. - Refactor how the docling-ibm-models are called to match the new init signature of models. - Translate the accelerator options to the specific inputs for third-party models. - Extend the docling CLI with parameters to set the num_threads and device. - Add new unit tests. - Write a new example showing how to use the accelerator options. * fix: Improve the pydantic objects in the pipeline_options and imports. Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * fix: TableStructureModel: Refactor the artifacts path to use the new structure for fast/accurate model Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * Updated test ground-truth Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Updated test ground-truth (again), bugfix for empty layout Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * fix: Do proper check to set the device in EasyOCR, RapidOCR. 
Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * Rollback changes from main Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update test gt Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Remove unused debug settings Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Review fixes Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Nail the accelerator defaults for MPS Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> Co-authored-by: Christoph Auer <cau@zurich.ibm.com> Co-authored-by: Christoph Auer <60343111+cau-git@users.noreply.github.com>
This commit is contained in:
parent
365a1e7b98
commit
19fad9261c
@ -26,6 +26,8 @@ from docling.datamodel.base_models import (
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
AcceleratorOptions,
|
||||
EasyOcrOptions,
|
||||
OcrEngine,
|
||||
OcrMacOptions,
|
||||
@ -257,6 +259,10 @@ def convert(
|
||||
help="The timeout for processing each document, in seconds.",
|
||||
),
|
||||
] = None,
|
||||
num_threads: Annotated[int, typer.Option(..., help="Number of threads")] = 4,
|
||||
device: Annotated[
|
||||
AcceleratorDevice, typer.Option(..., help="Accelerator device")
|
||||
] = AcceleratorDevice.AUTO,
|
||||
):
|
||||
if verbose == 0:
|
||||
logging.basicConfig(level=logging.WARNING)
|
||||
@ -336,7 +342,9 @@ def convert(
|
||||
if ocr_lang_list is not None:
|
||||
ocr_options.lang = ocr_lang_list
|
||||
|
||||
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
|
||||
pipeline_options = PdfPipelineOptions(
|
||||
accelerator_options=accelerator_options,
|
||||
do_ocr=ocr,
|
||||
ocr_options=ocr_options,
|
||||
do_table_structure=True,
|
||||
|
@ -1,8 +1,66 @@
|
||||
import logging
|
||||
import os
|
||||
import warnings
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import List, Literal, Optional, Union
|
||||
from typing import Annotated, Any, Dict, List, Literal, Optional, Tuple, Type, Union
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, Field
|
||||
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
|
||||
from pydantic_settings import (
|
||||
BaseSettings,
|
||||
PydanticBaseSettingsSource,
|
||||
SettingsConfigDict,
|
||||
)
|
||||
from typing_extensions import deprecated
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AcceleratorDevice(str, Enum):
    """Devices to run model inference"""

    # Members are plain strings (str mixin), so they compare equal to their
    # lowercase values.
    AUTO = "auto"  # let the device-resolution code choose
    CPU = "cpu"
    CUDA = "cuda"
    MPS = "mps"
|
||||
|
||||
|
||||
# Settings object holding the inference thread count and the target device.
# Besides constructor arguments, values are read from environment variables
# carrying the "DOCLING_" prefix (see model_config below).
class AcceleratorOptions(BaseSettings):
    model_config = SettingsConfigDict(
        populate_by_name=True,
        env_nested_delimiter="_",
        env_prefix="DOCLING_",
    )

    num_threads: int = 4
    device: AcceleratorDevice = AcceleratorDevice.AUTO

    @model_validator(mode="before")
    @classmethod
    def check_alternative_envvars(cls, data: Any) -> Any:
        r"""
        Fall back to the envvar OMP_NUM_THREADS for the value of num_threads.

        The fallback applies only when no num_threads value is present in the
        input, DOCLING_NUM_THREADS is not set, and OMP_NUM_THREADS parses as an
        integer. A malformed OMP_NUM_THREADS is logged and ignored.

        Why not a pydantic "alias": if the alias envvar is set and the caller also
        passes the parameter to __init__(), Pydantic treats the __init__() value
        as an extra input instead of letting it override the envvar value.
        """
        # Non-dict payloads are passed through untouched.
        if not isinstance(data, dict):
            return data

        # A value is already present: nothing to fall back to.
        if data.get("num_threads") is not None:
            return data

        primary = os.getenv("DOCLING_NUM_THREADS")
        fallback = os.getenv("OMP_NUM_THREADS")
        if primary is not None or fallback is None:
            return data

        try:
            data["num_threads"] = int(fallback)
        except ValueError:
            _log.error(
                "Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
                fallback,
            )
        return data
|
||||
|
||||
|
||||
class TableFormerMode(str, Enum):
|
||||
@ -78,9 +136,11 @@ class EasyOcrOptions(OcrOptions):
|
||||
|
||||
kind: Literal["easyocr"] = "easyocr"
|
||||
lang: List[str] = ["fr", "de", "es", "en"]
|
||||
use_gpu: bool = True # same default as easyocr.Reader
|
||||
|
||||
use_gpu: Optional[bool] = None
|
||||
|
||||
model_storage_directory: Optional[str] = None
|
||||
download_enabled: bool = True # same default as easyocr.Reader
|
||||
download_enabled: bool = True
|
||||
|
||||
model_config = ConfigDict(
|
||||
extra="forbid",
|
||||
@ -153,6 +213,7 @@ class PipelineOptions(BaseModel):
|
||||
True # This default will be set to False on a future version of docling
|
||||
)
|
||||
document_timeout: Optional[float] = None
|
||||
accelerator_options: AcceleratorOptions = AcceleratorOptions()
|
||||
|
||||
|
||||
class PdfPipelineOptions(PipelineOptions):
|
||||
|
@ -1,4 +1,5 @@
|
||||
import logging
|
||||
import warnings
|
||||
from typing import Iterable
|
||||
|
||||
import numpy
|
||||
@ -7,16 +8,26 @@ from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import Cell, OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import EasyOcrOptions
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
AcceleratorOptions,
|
||||
EasyOcrOptions,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.utils.accelerator_utils import decide_device
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class EasyOcrModel(BaseOcrModel):
|
||||
def __init__(self, enabled: bool, options: EasyOcrOptions):
|
||||
def __init__(
|
||||
self,
|
||||
enabled: bool,
|
||||
options: EasyOcrOptions,
|
||||
accelerator_options: AcceleratorOptions,
|
||||
):
|
||||
super().__init__(enabled=enabled, options=options)
|
||||
self.options: EasyOcrOptions
|
||||
|
||||
@ -31,11 +42,32 @@ class EasyOcrModel(BaseOcrModel):
|
||||
"Alternatively, Docling has support for other OCR engines. See the documentation."
|
||||
)
|
||||
|
||||
if self.options.use_gpu is None:
|
||||
device = decide_device(accelerator_options.device)
|
||||
# Enable easyocr GPU if running on CUDA, MPS
|
||||
use_gpu = any(
|
||||
[
|
||||
device.startswith(x)
|
||||
for x in [
|
||||
AcceleratorDevice.CUDA.value,
|
||||
AcceleratorDevice.MPS.value,
|
||||
]
|
||||
]
|
||||
)
|
||||
else:
|
||||
warnings.warn(
|
||||
"Deprecated field. Better to set the `accelerator_options.device` in `pipeline_options`. "
|
||||
"When `use_gpu and accelerator_options.device == AcceleratorDevice.CUDA` the GPU is used "
|
||||
"to run EasyOCR. Otherwise, EasyOCR runs in CPU."
|
||||
)
|
||||
use_gpu = self.options.use_gpu
|
||||
|
||||
self.reader = easyocr.Reader(
|
||||
lang_list=self.options.lang,
|
||||
gpu=self.options.use_gpu,
|
||||
gpu=use_gpu,
|
||||
model_storage_directory=self.options.model_storage_directory,
|
||||
download_enabled=self.options.download_enabled,
|
||||
verbose=False,
|
||||
)
|
||||
|
||||
def __call__(
|
||||
|
@ -9,6 +9,7 @@ from docling_core.types.doc import CoordOrigin, DocItemLabel
|
||||
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
||||
from PIL import ImageDraw
|
||||
|
||||
import docling.utils.layout_utils as lu
|
||||
from docling.datamodel.base_models import (
|
||||
BoundingBox,
|
||||
Cell,
|
||||
@ -17,9 +18,10 @@ from docling.datamodel.base_models import (
|
||||
Page,
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import AcceleratorDevice, AcceleratorOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.utils import layout_utils as lu
|
||||
from docling.utils.accelerator_utils import decide_device
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
@ -46,8 +48,16 @@ class LayoutModel(BasePageModel):
|
||||
FIGURE_LABEL = DocItemLabel.PICTURE
|
||||
FORMULA_LABEL = DocItemLabel.FORMULA
|
||||
|
||||
def __init__(self, artifacts_path: Path):
|
||||
self.layout_predictor = LayoutPredictor(artifacts_path) # TODO temporary
|
||||
def __init__(self, artifacts_path: Path, accelerator_options: AcceleratorOptions):
|
||||
device = decide_device(accelerator_options.device)
|
||||
|
||||
self.layout_predictor = LayoutPredictor(
|
||||
artifact_path=str(artifacts_path),
|
||||
device=device,
|
||||
num_threads=accelerator_options.num_threads,
|
||||
base_threshold=0.6,
|
||||
blacklist_classes={"Form", "Key-Value Region"},
|
||||
)
|
||||
|
||||
def postprocess(self, clusters_in: List[Cluster], cells: List[Cell], page_height):
|
||||
MIN_INTERSECTION = 0.2
|
||||
|
@ -6,16 +6,26 @@ from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import RapidOcrOptions
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
AcceleratorOptions,
|
||||
RapidOcrOptions,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.utils.accelerator_utils import decide_device
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RapidOcrModel(BaseOcrModel):
|
||||
def __init__(self, enabled: bool, options: RapidOcrOptions):
|
||||
def __init__(
|
||||
self,
|
||||
enabled: bool,
|
||||
options: RapidOcrOptions,
|
||||
accelerator_options: AcceleratorOptions,
|
||||
):
|
||||
super().__init__(enabled=enabled, options=options)
|
||||
self.options: RapidOcrOptions
|
||||
|
||||
@ -30,52 +40,21 @@ class RapidOcrModel(BaseOcrModel):
|
||||
"Alternatively, Docling has support for other OCR engines. See the documentation."
|
||||
)
|
||||
|
||||
# This configuration option will be revamped while introducing device settings for all models.
|
||||
# For the moment we will default to auto and let onnx-runtime pick the best.
|
||||
cls_use_cuda = True
|
||||
rec_use_cuda = True
|
||||
det_use_cuda = True
|
||||
det_use_dml = True
|
||||
cls_use_dml = True
|
||||
rec_use_dml = True
|
||||
|
||||
# # Same as Defaults in RapidOCR
|
||||
# cls_use_cuda = False
|
||||
# rec_use_cuda = False
|
||||
# det_use_cuda = False
|
||||
# det_use_dml = False
|
||||
# cls_use_dml = False
|
||||
# rec_use_dml = False
|
||||
|
||||
# # If we set everything to true onnx-runtime would automatically choose the fastest accelerator
|
||||
# if self.options.device == self.options.Device.AUTO:
|
||||
# cls_use_cuda = True
|
||||
# rec_use_cuda = True
|
||||
# det_use_cuda = True
|
||||
# det_use_dml = True
|
||||
# cls_use_dml = True
|
||||
# rec_use_dml = True
|
||||
|
||||
# # If we set use_cuda to true onnx would use the cuda device available in runtime if no cuda device is available it would run on CPU.
|
||||
# elif self.options.device == self.options.Device.CUDA:
|
||||
# cls_use_cuda = True
|
||||
# rec_use_cuda = True
|
||||
# det_use_cuda = True
|
||||
|
||||
# # If we set use_dml to true onnx would use the dml device available in runtime if no dml device is available it would work on CPU.
|
||||
# elif self.options.device == self.options.Device.DIRECTML:
|
||||
# det_use_dml = True
|
||||
# cls_use_dml = True
|
||||
# rec_use_dml = True
|
||||
# Decide the accelerator devices
|
||||
device = decide_device(accelerator_options.device)
|
||||
use_cuda = str(AcceleratorDevice.CUDA.value).lower() in device
|
||||
use_dml = accelerator_options.device == AcceleratorDevice.AUTO
|
||||
intra_op_num_threads = accelerator_options.num_threads
|
||||
|
||||
self.reader = RapidOCR(
|
||||
text_score=self.options.text_score,
|
||||
cls_use_cuda=cls_use_cuda,
|
||||
rec_use_cuda=rec_use_cuda,
|
||||
det_use_cuda=det_use_cuda,
|
||||
det_use_dml=det_use_dml,
|
||||
cls_use_dml=cls_use_dml,
|
||||
rec_use_dml=rec_use_dml,
|
||||
cls_use_cuda=use_cuda,
|
||||
rec_use_cuda=use_cuda,
|
||||
det_use_cuda=use_cuda,
|
||||
det_use_dml=use_dml,
|
||||
cls_use_dml=use_dml,
|
||||
rec_use_dml=use_dml,
|
||||
intra_op_num_threads=intra_op_num_threads,
|
||||
print_verbose=self.options.print_verbose,
|
||||
det_model_path=self.options.det_model_path,
|
||||
cls_model_path=self.options.cls_model_path,
|
||||
|
@ -9,15 +9,25 @@ from PIL import ImageDraw
|
||||
|
||||
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import TableFormerMode, TableStructureOptions
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
AcceleratorOptions,
|
||||
TableFormerMode,
|
||||
TableStructureOptions,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.utils.accelerator_utils import decide_device
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
|
||||
class TableStructureModel(BasePageModel):
|
||||
def __init__(
|
||||
self, enabled: bool, artifacts_path: Path, options: TableStructureOptions
|
||||
self,
|
||||
enabled: bool,
|
||||
artifacts_path: Path,
|
||||
options: TableStructureOptions,
|
||||
accelerator_options: AcceleratorOptions,
|
||||
):
|
||||
self.options = options
|
||||
self.do_cell_matching = self.options.do_cell_matching
|
||||
@ -26,16 +36,26 @@ class TableStructureModel(BasePageModel):
|
||||
self.enabled = enabled
|
||||
if self.enabled:
|
||||
if self.mode == TableFormerMode.ACCURATE:
|
||||
artifacts_path = artifacts_path / "fat"
|
||||
artifacts_path = artifacts_path / "accurate"
|
||||
else:
|
||||
artifacts_path = artifacts_path / "fast"
|
||||
|
||||
# Third Party
|
||||
import docling_ibm_models.tableformer.common as c
|
||||
|
||||
device = decide_device(accelerator_options.device)
|
||||
|
||||
# Disable MPS here, until we know why it makes things slower.
|
||||
if device == AcceleratorDevice.MPS.value:
|
||||
device = AcceleratorDevice.CPU.value
|
||||
|
||||
self.tm_config = c.read_config(f"{artifacts_path}/tm_config.json")
|
||||
self.tm_config["model"]["save_dir"] = artifacts_path
|
||||
self.tm_model_type = self.tm_config["model"]["type"]
|
||||
|
||||
self.tf_predictor = TFPredictor(self.tm_config)
|
||||
self.tf_predictor = TFPredictor(
|
||||
self.tm_config, device, accelerator_options.num_threads
|
||||
)
|
||||
self.scale = 2.0 # Scale up table input images to 144 dpi
|
||||
|
||||
def draw_table_and_cells(
|
||||
|
@ -38,7 +38,7 @@ _log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class StandardPdfPipeline(PaginatedPipeline):
|
||||
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt"
|
||||
_layout_model_path = "model_artifacts/layout"
|
||||
_table_model_path = "model_artifacts/tableformer"
|
||||
|
||||
def __init__(self, pipeline_options: PdfPipelineOptions):
|
||||
@ -75,7 +75,8 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
# Layout model
|
||||
LayoutModel(
|
||||
artifacts_path=self.artifacts_path
|
||||
/ StandardPdfPipeline._layout_model_path
|
||||
/ StandardPdfPipeline._layout_model_path,
|
||||
accelerator_options=pipeline_options.accelerator_options,
|
||||
),
|
||||
# Table structure model
|
||||
TableStructureModel(
|
||||
@ -83,6 +84,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
artifacts_path=self.artifacts_path
|
||||
/ StandardPdfPipeline._table_model_path,
|
||||
options=pipeline_options.table_structure_options,
|
||||
accelerator_options=pipeline_options.accelerator_options,
|
||||
),
|
||||
# Page assemble
|
||||
PageAssembleModel(options=PageAssembleOptions(keep_images=keep_images)),
|
||||
@ -104,7 +106,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
repo_id="ds4sd/docling-models",
|
||||
force_download=force,
|
||||
local_dir=local_dir,
|
||||
revision="v2.0.1",
|
||||
revision="v2.1.0",
|
||||
)
|
||||
|
||||
return Path(download_path)
|
||||
@ -114,6 +116,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
return EasyOcrModel(
|
||||
enabled=self.pipeline_options.do_ocr,
|
||||
options=self.pipeline_options.ocr_options,
|
||||
accelerator_options=self.pipeline_options.accelerator_options,
|
||||
)
|
||||
elif isinstance(self.pipeline_options.ocr_options, TesseractCliOcrOptions):
|
||||
return TesseractOcrCliModel(
|
||||
@ -129,6 +132,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
return RapidOcrModel(
|
||||
enabled=self.pipeline_options.do_ocr,
|
||||
options=self.pipeline_options.ocr_options,
|
||||
accelerator_options=self.pipeline_options.accelerator_options,
|
||||
)
|
||||
elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
|
||||
if "darwin" != sys.platform:
|
||||
|
42
docling/utils/accelerator_utils.py
Normal file
42
docling/utils/accelerator_utils.py
Normal file
@ -0,0 +1,42 @@
|
||||
import logging
|
||||
|
||||
import torch
|
||||
|
||||
from docling.datamodel.pipeline_options import AcceleratorDevice
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def decide_device(accelerator_device: AcceleratorDevice) -> str:
    r"""
    Map the requested accelerator onto a device string usable on this machine.

    Rules:
    1. AUTO: prefer CUDA, then MPS, otherwise stay on the CPU.
    2. CUDA / MPS requested explicitly: use it when available, otherwise log a
       warning and fall back to the CPU.
    3. Anything else resolves to the CPU.

    Returns "cpu", "mps" or "cuda:<index>" (the index is currently always 0).
    """
    cuda_index = 0

    # A backend counts only if torch was built with it AND it is usable now.
    cuda_ok = torch.backends.cuda.is_built() and torch.cuda.is_available()
    mps_ok = torch.backends.mps.is_built() and torch.backends.mps.is_available()

    device = "cpu"
    if accelerator_device == AcceleratorDevice.AUTO:
        if cuda_ok:
            device = f"cuda:{cuda_index}"
        elif mps_ok:
            device = "mps"
    elif accelerator_device == AcceleratorDevice.CUDA:
        if cuda_ok:
            device = f"cuda:{cuda_index}"
        else:
            _log.warning("CUDA is not available in the system. Fall back to 'CPU'")
    elif accelerator_device == AcceleratorDevice.MPS:
        if mps_ok:
            device = "mps"
        else:
            _log.warning("MPS is not available in the system. Fall back to 'CPU'")

    _log.info("Accelerator device: '%s'", device)
    return device
|
@ -74,6 +74,10 @@ def main():
|
||||
pipeline_options.do_ocr = True
|
||||
pipeline_options.do_table_structure = True
|
||||
pipeline_options.table_structure_options.do_cell_matching = True
|
||||
pipeline_options.ocr_options.lang = ["es"]
|
||||
pipeline_options.accelerator_options = AcceleratorOptions(
|
||||
num_threads=4, device=Device.AUTO
|
||||
)
|
||||
|
||||
doc_converter = DocumentConverter(
|
||||
format_options={
|
||||
|
63
docs/examples/run_with_accelerator.py
Normal file
63
docs/examples/run_with_accelerator.py
Normal file
@ -0,0 +1,63 @@
|
||||
from pathlib import Path
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
AcceleratorOptions,
|
||||
PdfPipelineOptions,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
|
||||
def main():
    """Convert a sample PDF with explicit accelerator settings and print timings."""
    input_doc = Path("./tests/data/2206.01062.pdf")

    # Explicitly set the accelerator. The other devices to try here are
    # AcceleratorDevice.AUTO, AcceleratorDevice.MPS and AcceleratorDevice.CUDA.
    accelerator_options = AcceleratorOptions(
        num_threads=8,
        device=AcceleratorDevice.CPU,
    )

    pipeline_options = PdfPipelineOptions()
    pipeline_options.accelerator_options = accelerator_options
    pipeline_options.do_ocr = True
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})

    # Turn on profiling so the conversion result carries the timing data
    settings.debug.profile_pipeline_timings = True

    # Run the conversion
    conversion_result = converter.convert(input_doc)
    doc = conversion_result.document

    # Total pipeline times recorded for the document
    doc_conversion_secs = conversion_result.timings["pipeline_total"].times

    md = doc.export_to_markdown()
    print(md)
    print(f"Conversion secs: {doc_conversion_secs}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
@ -75,6 +75,7 @@ nav:
|
||||
- "Table export": examples/export_tables.py
|
||||
- "Multimodal export": examples/export_multimodal.py
|
||||
- "Force full page OCR": examples/full_page_ocr.py
|
||||
- "Accelerator options": examples/run_with_accelerator.py
|
||||
- Chunking:
|
||||
- "Hybrid chunking": examples/hybrid_chunking.ipynb
|
||||
- RAG / QA:
|
||||
|
14
poetry.lock
generated
14
poetry.lock
generated
@ -914,13 +914,13 @@ chunking = ["semchunk (>=2.2.0,<3.0.0)", "transformers (>=4.34.0,<5.0.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "docling-ibm-models"
|
||||
version = "2.0.7"
|
||||
version = "3.1.0"
|
||||
description = "This package contains the AI models used by the Docling PDF conversion package"
|
||||
optional = false
|
||||
python-versions = "<4.0,>=3.9"
|
||||
files = [
|
||||
{file = "docling_ibm_models-2.0.7-py3-none-any.whl", hash = "sha256:bf362add22e9c526ac56c04bce412d7bb1c331b44a73204abba0b1d90a500c78"},
|
||||
{file = "docling_ibm_models-2.0.7.tar.gz", hash = "sha256:e1372c4f2517d522125fb02a820558f01914926f532bcd0534f1028a25d63667"},
|
||||
{file = "docling_ibm_models-3.1.0-py3-none-any.whl", hash = "sha256:a381a45dff16fdb2246b99c15a2e3d6ba880c573d48a1d6477d3ffb36bab807f"},
|
||||
{file = "docling_ibm_models-3.1.0.tar.gz", hash = "sha256:65d734ffa490edc4e2301d296b6e893afa536c63b7daae7bbda781bd15b3431e"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@ -929,9 +929,11 @@ jsonlines = ">=3.1.0,<4.0.0"
|
||||
numpy = ">=1.24.4,<3.0.0"
|
||||
opencv-python-headless = ">=4.6.0.66,<5.0.0.0"
|
||||
Pillow = ">=10.0.0,<11.0.0"
|
||||
safetensors = {version = ">=0.4.3,<1", extras = ["torch"]}
|
||||
torch = ">=2.2.2,<3.0.0"
|
||||
torchvision = ">=0,<1"
|
||||
tqdm = ">=4.64.0,<5.0.0"
|
||||
transformers = ">=4.42.0,<5.0.0"
|
||||
|
||||
[[package]]
|
||||
name = "docling-parse"
|
||||
@ -5978,6 +5980,10 @@ files = [
|
||||
{file = "safetensors-0.4.5.tar.gz", hash = "sha256:d73de19682deabb02524b3d5d1f8b3aaba94c72f1bbfc7911b9b9d5d391c0310"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
numpy = {version = ">=1.21.6", optional = true, markers = "extra == \"numpy\""}
|
||||
torch = {version = ">=1.10", optional = true, markers = "extra == \"torch\""}
|
||||
|
||||
[package.extras]
|
||||
all = ["safetensors[jax]", "safetensors[numpy]", "safetensors[paddlepaddle]", "safetensors[pinned-tf]", "safetensors[quality]", "safetensors[testing]", "safetensors[torch]"]
|
||||
dev = ["safetensors[all]"]
|
||||
@ -7602,4 +7608,4 @@ tesserocr = ["tesserocr"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "3e66a54bd0433581e4909003124e2b79b42bdd1fb90d17c037f3294aeff56aa9"
|
||||
content-hash = "5271637a86ae221be362a288546c9fee3e3e25e5b323c997464c032c284716bd"
|
||||
|
@ -27,8 +27,9 @@ packages = [{include = "docling"}]
|
||||
python = "^3.9"
|
||||
docling-core = { version = "^2.9.0", extras = ["chunking"] }
|
||||
pydantic = "^2.0.0"
|
||||
docling-ibm-models = "^2.0.6"
|
||||
docling-ibm-models = "^3.1.0"
|
||||
deepsearch-glm = "^1.0.0"
|
||||
docling-parse = "^3.0.0"
|
||||
filetype = "^1.2.0"
|
||||
pypdfium2 = "^4.30.0"
|
||||
pydantic-settings = "^2.3.0"
|
||||
@ -36,7 +37,6 @@ huggingface_hub = ">=0.23,<1"
|
||||
requests = "^2.32.3"
|
||||
easyocr = "^1.7"
|
||||
tesserocr = { version = "^2.7.1", optional = true }
|
||||
docling-parse = "^3.0.0"
|
||||
certifi = ">=2024.7.4"
|
||||
rtree = "^1.3.0"
|
||||
scipy = "^1.6.0"
|
||||
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -1 +1 @@
|
||||
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test.pdf", "filename-prov": null, "document-hash": "73f23122e9edbdb0a115b448e03c8064a0ea8bdc21d02917ce220cf032454f31", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "8c5c5b766c1bdb92242142ca37260089b02380f9c57729703350f646cdf4771e", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [70.90211486816406, 689.2166748046875, 504.87200927734375, 765.0995483398438], "page": 1, "span": [0, 94], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
|
||||
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test.pdf", "filename-prov": null, "document-hash": "80f38f5b87a84870681556176a9622186fd200dd32c5557be9e0c0af05b8bc61", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "14d896dc8bcb7ee7c08c0347eb6be8dcb92a3782501992f1ea14d2e58077d4e3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [69.6796646118164, 689.012451171875, 504.87200927734375, 765.0995483398438], "page": 1, "span": [0, 94], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
|
@ -1 +1 @@
|
||||
[{"page_no": 0, "size": {"width": 595.201171875, "height": 841.9216918945312}, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896755, "r": 503.64955224479564, "b": 97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 70.90211866351085, "t": 102.66666671251767, "r": 504.8720079864275, "b": 124.83139551297336, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 73.10852522817731, "t": 130.0013615789096, "r": 153.04479435252625, "b": 152.70503335218427, "coord_origin": "TOPLEFT"}}], "predictions": {"layout": {"clusters": [{"id": 0, "label": "text", "bbox": {"l": 70.90211866351085, "t": 76.82212829589844, "r": 504.8720079864275, "b": 152.70503335218427, "coord_origin": "TOPLEFT"}, "confidence": 0.9715733528137207, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896755, "r": 503.64955224479564, "b": 97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 70.90211866351085, "t": 102.66666671251767, "r": 504.8720079864275, "b": 124.83139551297336, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 73.10852522817731, "t": 130.0013615789096, "r": 153.04479435252625, "b": 152.70503335218427, "coord_origin": "TOPLEFT"}}]}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null}, "assembled": {"elements": [{"label": "text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "text", "bbox": {"l": 70.90211866351085, "t": 76.82212829589844, "r": 504.8720079864275, "b": 152.70503335218427, "coord_origin": "TOPLEFT"}, "confidence": 0.9715733528137207, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896755, "r": 503.64955224479564, 
"b": 97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 70.90211866351085, "t": 102.66666671251767, "r": 504.8720079864275, "b": 124.83139551297336, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 73.10852522817731, "t": 130.0013615789096, "r": 153.04479435252625, "b": 152.70503335218427, "coord_origin": "TOPLEFT"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "body": [{"label": "text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "text", "bbox": {"l": 70.90211866351085, "t": 76.82212829589844, "r": 504.8720079864275, "b": 152.70503335218427, "coord_origin": "TOPLEFT"}, "confidence": 0.9715733528137207, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896755, "r": 503.64955224479564, "b": 97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 70.90211866351085, "t": 102.66666671251767, "r": 504.8720079864275, "b": 124.83139551297336, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 73.10852522817731, "t": 130.0013615789096, "r": 153.04479435252625, "b": 152.70503335218427, "coord_origin": "TOPLEFT"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "headers": []}}]
|
||||
[{"page_no": 0, "size": {"width": 595.201171875, "height": 841.9216918945312}, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896755, "r": 503.64955224479564, "b": 97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.6796630536824, "t": 104.00000011573798, "r": 504.8720051760782, "b": 124.83139494707746, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 71.84193505100733, "t": 129.79712523204603, "r": 153.088934155825, "b": 152.90926970226087, "coord_origin": "TOPLEFT"}}], "predictions": {"layout": {"clusters": [{"id": 0, "label": "text", "bbox": {"l": 69.6796630536824, "t": 76.82213592529297, "r": 504.8720051760782, "b": 152.90926970226087, "coord_origin": "TOPLEFT"}, "confidence": 0.9715732336044312, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896755, "r": 503.64955224479564, "b": 97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.6796630536824, "t": 104.00000011573798, "r": 504.8720051760782, "b": 124.83139494707746, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 71.84193505100733, "t": 129.79712523204603, "r": 153.088934155825, "b": 152.90926970226087, "coord_origin": "TOPLEFT"}}]}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null}, "assembled": {"elements": [{"label": "text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "text", "bbox": {"l": 69.6796630536824, "t": 76.82213592529297, "r": 504.8720051760782, "b": 152.90926970226087, "coord_origin": "TOPLEFT"}, "confidence": 0.9715732336044312, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896755, "r": 503.64955224479564, "b": 
97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.6796630536824, "t": 104.00000011573798, "r": 504.8720051760782, "b": 124.83139494707746, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 71.84193505100733, "t": 129.79712523204603, "r": 153.088934155825, "b": 152.90926970226087, "coord_origin": "TOPLEFT"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "body": [{"label": "text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "text", "bbox": {"l": 69.6796630536824, "t": 76.82213592529297, "r": 504.8720051760782, "b": 152.90926970226087, "coord_origin": "TOPLEFT"}, "confidence": 0.9715732336044312, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896755, "r": 503.64955224479564, "b": 97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.6796630536824, "t": 104.00000011573798, "r": 504.8720051760782, "b": 124.83139494707746, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 71.84193505100733, "t": 129.79712523204603, "r": 153.088934155825, "b": 152.90926970226087, "coord_origin": "TOPLEFT"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "headers": []}}]
|
@ -1 +1 @@
|
||||
{"schema_name": "DoclingDocument", "version": "1.0.0", "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 70.90211486816406, "t": 765.0995483398438, "r": 504.87200927734375, "b": 689.2166748046875, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "pictures": [], "tables": [], "key_value_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
|
||||
{"schema_name": "DoclingDocument", "version": "1.0.0", "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 69.6796646118164, "t": 765.0995483398438, "r": 504.87200927734375, "b": 689.012451171875, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "pictures": [], "tables": [], "key_value_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
|
@ -1 +1 @@
|
||||
[{"page_no": 0, "size": {"width": 595.201171875, "height": 841.9216918945312}, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896755, "r": 503.64955224479564, "b": 97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 70.90211866351085, "t": 102.66666671251767, "r": 504.8720079864275, "b": 124.83139551297336, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 73.10852522817731, "t": 130.0013615789096, "r": 153.04479435252625, "b": 152.70503335218427, "coord_origin": "TOPLEFT"}}], "predictions": {"layout": {"clusters": [{"id": 0, "label": "text", "bbox": {"l": 70.90211866351085, "t": 76.82212829589844, "r": 504.8720079864275, "b": 152.70503335218427, "coord_origin": "TOPLEFT"}, "confidence": 0.9715733528137207, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896755, "r": 503.64955224479564, "b": 97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 70.90211866351085, "t": 102.66666671251767, "r": 504.8720079864275, "b": 124.83139551297336, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 73.10852522817731, "t": 130.0013615789096, "r": 153.04479435252625, "b": 152.70503335218427, "coord_origin": "TOPLEFT"}}]}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null}, "assembled": {"elements": [{"label": "text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "text", "bbox": {"l": 70.90211866351085, "t": 76.82212829589844, "r": 504.8720079864275, "b": 152.70503335218427, "coord_origin": "TOPLEFT"}, "confidence": 0.9715733528137207, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896755, "r": 503.64955224479564, 
"b": 97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 70.90211866351085, "t": 102.66666671251767, "r": 504.8720079864275, "b": 124.83139551297336, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 73.10852522817731, "t": 130.0013615789096, "r": 153.04479435252625, "b": 152.70503335218427, "coord_origin": "TOPLEFT"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "body": [{"label": "text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "text", "bbox": {"l": 70.90211866351085, "t": 76.82212829589844, "r": 504.8720079864275, "b": 152.70503335218427, "coord_origin": "TOPLEFT"}, "confidence": 0.9715733528137207, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896755, "r": 503.64955224479564, "b": 97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 70.90211866351085, "t": 102.66666671251767, "r": 504.8720079864275, "b": 124.83139551297336, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 73.10852522817731, "t": 130.0013615789096, "r": 153.04479435252625, "b": 152.70503335218427, "coord_origin": "TOPLEFT"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "headers": []}}]
|
||||
[{"page_no": 0, "size": {"width": 595.201171875, "height": 841.9216918945312}, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896755, "r": 503.64955224479564, "b": 97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.6796630536824, "t": 104.00000011573798, "r": 504.8720051760782, "b": 124.83139494707746, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 71.84193505100733, "t": 129.79712523204603, "r": 153.088934155825, "b": 152.90926970226087, "coord_origin": "TOPLEFT"}}], "predictions": {"layout": {"clusters": [{"id": 0, "label": "text", "bbox": {"l": 69.6796630536824, "t": 76.82213592529297, "r": 504.8720051760782, "b": 152.90926970226087, "coord_origin": "TOPLEFT"}, "confidence": 0.9715732336044312, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896755, "r": 503.64955224479564, "b": 97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.6796630536824, "t": 104.00000011573798, "r": 504.8720051760782, "b": 124.83139494707746, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 71.84193505100733, "t": 129.79712523204603, "r": 153.088934155825, "b": 152.90926970226087, "coord_origin": "TOPLEFT"}}]}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null}, "assembled": {"elements": [{"label": "text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "text", "bbox": {"l": 69.6796630536824, "t": 76.82213592529297, "r": 504.8720051760782, "b": 152.90926970226087, "coord_origin": "TOPLEFT"}, "confidence": 0.9715732336044312, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896755, "r": 503.64955224479564, "b": 
97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.6796630536824, "t": 104.00000011573798, "r": 504.8720051760782, "b": 124.83139494707746, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 71.84193505100733, "t": 129.79712523204603, "r": 153.088934155825, "b": 152.90926970226087, "coord_origin": "TOPLEFT"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "body": [{"label": "text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "text", "bbox": {"l": 69.6796630536824, "t": 76.82213592529297, "r": 504.8720051760782, "b": 152.90926970226087, "coord_origin": "TOPLEFT"}, "confidence": 0.9715732336044312, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 73.34702132031646, "t": 76.99999977896755, "r": 503.64955224479564, "b": 97.99999977896755, "coord_origin": "TOPLEFT"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.6796630536824, "t": 104.00000011573798, "r": 504.8720051760782, "b": 124.83139494707746, "coord_origin": "TOPLEFT"}}, {"id": 2, "text": "package", "bbox": {"l": 71.84193505100733, "t": 129.79712523204603, "r": 153.088934155825, "b": 152.90926970226087, "coord_origin": "TOPLEFT"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "headers": []}}]
|
@ -1,3 +1,4 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
@ -5,7 +6,12 @@ import pytest
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
AcceleratorOptions,
|
||||
PdfPipelineOptions,
|
||||
TableFormerMode,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
|
||||
@ -35,6 +41,61 @@ def get_converters_with_table_options():
|
||||
yield converter
|
||||
|
||||
|
||||
def test_accelerator_options():
    """Check how AcceleratorOptions resolves num_threads and device.

    Covers, in order: built-in defaults, explicit constructor arguments,
    values picked up from environment variables, constructor arguments
    taking precedence over environment variables, rejection of an invalid
    device name, and the fallback to defaults when the alternative
    thread-count variable is not a valid number.

    The test mutates ``os.environ``; the variables it touches are
    snapshotted up front and restored in a ``finally`` so that neither a
    passing nor a failing run leaks settings into other tests.
    """
    env_keys = ("OMP_NUM_THREADS", "DOCLING_NUM_THREADS", "DOCLING_DEVICE")
    saved_env = {key: os.environ.get(key) for key in env_keys}

    try:
        # Start from a clean slate so the default checks below do not depend
        # on whatever the surrounding shell / CI job happens to export.
        for key in env_keys:
            os.environ.pop(key, None)

        # Check the default options
        ao = AcceleratorOptions()
        assert ao.num_threads == 4, "Wrong default num_threads"
        assert ao.device == AcceleratorDevice.AUTO, "Wrong default device"

        # Use API
        ao2 = AcceleratorOptions(num_threads=2, device=AcceleratorDevice.MPS)
        ao3 = AcceleratorOptions(num_threads=3, device=AcceleratorDevice.CUDA)
        assert ao2.num_threads == 2
        assert ao2.device == AcceleratorDevice.MPS
        assert ao3.num_threads == 3
        assert ao3.device == AcceleratorDevice.CUDA

        # Use envvars (regular + alternative) and default values
        os.environ["OMP_NUM_THREADS"] = "1"
        # Re-running __init__ on the existing instance makes it re-read the
        # environment, exactly as the original test did.
        ao.__init__()
        assert ao.num_threads == 1
        assert ao.device == AcceleratorDevice.AUTO
        os.environ["DOCLING_DEVICE"] = "cpu"
        ao.__init__()
        assert ao.device == AcceleratorDevice.CPU
        assert ao.num_threads == 1

        # Use envvars and override in init
        os.environ["DOCLING_DEVICE"] = "cpu"
        ao4 = AcceleratorOptions(num_threads=5, device=AcceleratorDevice.MPS)
        assert ao4.num_threads == 5
        assert ao4.device == AcceleratorDevice.MPS

        # Use regular and alternative envvar
        # (OMP_NUM_THREADS is still "1" here; DOCLING_NUM_THREADS must win.)
        os.environ["DOCLING_NUM_THREADS"] = "2"
        ao5 = AcceleratorOptions()
        assert ao5.num_threads == 2
        assert ao5.device == AcceleratorDevice.CPU

        # Use wrong values
        os.environ["DOCLING_DEVICE"] = "wrong"
        try:
            ao5.__init__()
        except Exception as ex:
            # The concrete exception type is defined by AcceleratorOptions;
            # the test only requires that *some* error is raised.
            print(ex)
        else:
            raise AssertionError(
                "AcceleratorOptions accepted an invalid DOCLING_DEVICE value"
            )

        # Use misformatted alternative envvar
        del os.environ["DOCLING_NUM_THREADS"]
        del os.environ["DOCLING_DEVICE"]
        os.environ["OMP_NUM_THREADS"] = "wrong"
        ao6 = AcceleratorOptions()
        assert ao6.num_threads == 4
        assert ao6.device == AcceleratorDevice.AUTO
    finally:
        # Put every touched variable back exactly as it was found.
        for key, value in saved_env.items():
            if value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = value
|
||||
|
||||
|
||||
def test_e2e_conversions(test_doc_path):
|
||||
for converter in get_converters_with_table_options():
|
||||
print(f"converting {test_doc_path}")
|
||||
|
Loading…
Reference in New Issue
Block a user