perf: prevent temp file leftovers, reuse core type (#487)

* chore: reuse DocumentStream from docling-core

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>

* update docling-core version

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>

* [skip ci] document import line

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>

* fix: use new resolve_source_to_x functions to avoid tempfile leftovers (#490)

use new resolve_source_to_x functions

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Author: Panos Vagenas, 2024-12-03 10:40:28 +01:00, committed by GitHub
parent d3f84b2457
commit 051789d017
5 changed files with 103 additions and 97 deletions
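
The gist of the change, as a minimal sketch (assuming docling-core >= 2.6.1 as pinned below; the URL is only a placeholder, not part of this commit): remote sources are now materialized into a caller-owned working directory, so cleanup happens when the directory context exits instead of leaving stray files in the system temp location, and DocumentStream is taken from docling-core rather than being redefined inside docling.

import tempfile
from pathlib import Path

from docling_core.types.io import DocumentStream  # single definition, reused by docling
from docling_core.utils.file import resolve_source_to_path

with tempfile.TemporaryDirectory() as tempdir:
    # The source (URL or path string) is fetched/copied into tempdir rather than
    # into an unmanaged temporary file, so nothing is left behind afterwards.
    path = resolve_source_to_path(
        source="https://example.com/sample.pdf",  # placeholder input
        workdir=Path(tempdir),
    )
    print(path, path.exists())
# tempdir and the resolved file are gone at this point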

View File

@@ -2,6 +2,7 @@ import importlib
 import json
 import logging
 import re
+import tempfile
 import time
 import warnings
 from enum import Enum
@@ -9,7 +10,7 @@ from pathlib import Path
 from typing import Annotated, Dict, Iterable, List, Optional, Type
 
 import typer
-from docling_core.utils.file import resolve_file_source
+from docling_core.utils.file import resolve_source_to_path
 
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
@@ -256,95 +257,98 @@ def convert(
     if from_formats is None:
         from_formats = [e for e in InputFormat]
 
-    input_doc_paths: List[Path] = []
-    for src in input_sources:
-        source = resolve_file_source(source=src)
-        if not source.exists():
-            err_console.print(
-                f"[red]Error: The input file {source} does not exist.[/red]"
-            )
-            raise typer.Abort()
-        elif source.is_dir():
-            for fmt in from_formats:
-                for ext in FormatToExtensions[fmt]:
-                    input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
-                    input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
-        else:
-            input_doc_paths.append(source)
-
-    if to_formats is None:
-        to_formats = [OutputFormat.MARKDOWN]
-
-    export_json = OutputFormat.JSON in to_formats
-    export_md = OutputFormat.MARKDOWN in to_formats
-    export_txt = OutputFormat.TEXT in to_formats
-    export_doctags = OutputFormat.DOCTAGS in to_formats
-
-    if ocr_engine == OcrEngine.EASYOCR:
-        ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
-    elif ocr_engine == OcrEngine.TESSERACT_CLI:
-        ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
-    elif ocr_engine == OcrEngine.TESSERACT:
-        ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
-    elif ocr_engine == OcrEngine.OCRMAC:
-        ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
-    elif ocr_engine == OcrEngine.RAPIDOCR:
-        ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
-    else:
-        raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
-
-    ocr_lang_list = _split_list(ocr_lang)
-    if ocr_lang_list is not None:
-        ocr_options.lang = ocr_lang_list
-
-    pipeline_options = PdfPipelineOptions(
-        do_ocr=ocr,
-        ocr_options=ocr_options,
-        do_table_structure=True,
-    )
-    pipeline_options.table_structure_options.do_cell_matching = True  # do_cell_matching
-    pipeline_options.table_structure_options.mode = table_mode
-
-    if artifacts_path is not None:
-        pipeline_options.artifacts_path = artifacts_path
-
-    if pdf_backend == PdfBackend.DLPARSE_V1:
-        backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
-    elif pdf_backend == PdfBackend.DLPARSE_V2:
-        backend = DoclingParseV2DocumentBackend
-    elif pdf_backend == PdfBackend.PYPDFIUM2:
-        backend = PyPdfiumDocumentBackend
-    else:
-        raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
-
-    format_options: Dict[InputFormat, FormatOption] = {
-        InputFormat.PDF: PdfFormatOption(
-            pipeline_options=pipeline_options,
-            backend=backend,  # pdf_backend
-        )
-    }
-    doc_converter = DocumentConverter(
-        allowed_formats=from_formats,
-        format_options=format_options,
-    )
-
-    start_time = time.time()
-
-    conv_results = doc_converter.convert_all(
-        input_doc_paths, raises_on_error=abort_on_error
-    )
-
-    output.mkdir(parents=True, exist_ok=True)
-    export_documents(
-        conv_results,
-        output_dir=output,
-        export_json=export_json,
-        export_md=export_md,
-        export_txt=export_txt,
-        export_doctags=export_doctags,
-    )
-
-    end_time = time.time() - start_time
+    with tempfile.TemporaryDirectory() as tempdir:
+        input_doc_paths: List[Path] = []
+        for src in input_sources:
+            source = resolve_source_to_path(source=src, workdir=Path(tempdir))
+            if not source.exists():
+                err_console.print(
+                    f"[red]Error: The input file {source} does not exist.[/red]"
+                )
+                raise typer.Abort()
+            elif source.is_dir():
+                for fmt in from_formats:
+                    for ext in FormatToExtensions[fmt]:
+                        input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
+                        input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
+            else:
+                input_doc_paths.append(source)
+
+        if to_formats is None:
+            to_formats = [OutputFormat.MARKDOWN]
+
+        export_json = OutputFormat.JSON in to_formats
+        export_md = OutputFormat.MARKDOWN in to_formats
+        export_txt = OutputFormat.TEXT in to_formats
+        export_doctags = OutputFormat.DOCTAGS in to_formats
+
+        if ocr_engine == OcrEngine.EASYOCR:
+            ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
+        elif ocr_engine == OcrEngine.TESSERACT_CLI:
+            ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
+        elif ocr_engine == OcrEngine.TESSERACT:
+            ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
+        elif ocr_engine == OcrEngine.OCRMAC:
+            ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
+        elif ocr_engine == OcrEngine.RAPIDOCR:
+            ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
+        else:
+            raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
+
+        ocr_lang_list = _split_list(ocr_lang)
+        if ocr_lang_list is not None:
+            ocr_options.lang = ocr_lang_list
+
+        pipeline_options = PdfPipelineOptions(
+            do_ocr=ocr,
+            ocr_options=ocr_options,
+            do_table_structure=True,
+        )
+        pipeline_options.table_structure_options.do_cell_matching = (
+            True  # do_cell_matching
+        )
+        pipeline_options.table_structure_options.mode = table_mode
+
+        if artifacts_path is not None:
+            pipeline_options.artifacts_path = artifacts_path
+
+        if pdf_backend == PdfBackend.DLPARSE_V1:
+            backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
+        elif pdf_backend == PdfBackend.DLPARSE_V2:
+            backend = DoclingParseV2DocumentBackend
+        elif pdf_backend == PdfBackend.PYPDFIUM2:
+            backend = PyPdfiumDocumentBackend
+        else:
+            raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
+
+        format_options: Dict[InputFormat, FormatOption] = {
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_options=pipeline_options,
+                backend=backend,  # pdf_backend
+            )
+        }
+        doc_converter = DocumentConverter(
+            allowed_formats=from_formats,
+            format_options=format_options,
+        )
+
+        start_time = time.time()
+
+        conv_results = doc_converter.convert_all(
+            input_doc_paths, raises_on_error=abort_on_error
+        )
+
+        output.mkdir(parents=True, exist_ok=True)
+        export_documents(
+            conv_results,
+            output_dir=output,
+            export_json=export_json,
+            export_md=export_md,
+            export_txt=export_txt,
+            export_doctags=export_doctags,
+        )
+
+        end_time = time.time() - start_time
 
     _log.info(f"All documents were converted in {end_time:.2f} seconds.")
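
A side note on the scoping above: the conversion itself now runs inside the TemporaryDirectory block because any source resolved into tempdir stops existing once the context exits. A tiny self-contained illustration of that behavior (standard library only; the file name is made up):

import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tempdir:
    resolved = Path(tempdir) / "downloaded.pdf"  # stand-in for a resolved remote source
    resolved.write_bytes(b"%PDF-1.4\n%placeholder")
    assert resolved.exists()  # safe to convert while the block is open

assert not resolved.exists()  # cleaned up automatically; converting here would fail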

View File

@@ -1,5 +1,4 @@
 from enum import Enum, auto
-from io import BytesIO
 from typing import TYPE_CHECKING, Dict, List, Optional, Union
 
 from docling_core.types.doc import (
@@ -9,6 +8,9 @@ from docling_core.types.doc import (
     Size,
     TableCell,
 )
+from docling_core.types.io import (  # DO NOT REMOVE; explicitly exposed from this location
+    DocumentStream,
+)
 from PIL.Image import Image
 from pydantic import BaseModel, ConfigDict
 
@@ -207,10 +209,3 @@
     @property
     def image(self) -> Optional[Image]:
         return self.get_image(scale=self._default_image_scale)
-
-
-class DocumentStream(BaseModel):
-    model_config = ConfigDict(arbitrary_types_allowed=True)
-
-    name: str
-    stream: BytesIO
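
The re-export above is what keeps existing imports working: code that did from docling.datamodel.base_models import DocumentStream still resolves, but now to the docling-core class (a name plus a BytesIO stream). A quick compatibility check one could run, assuming docling and docling-core are installed at the versions pinned in this commit:

from io import BytesIO

from docling.datamodel.base_models import DocumentStream as LegacyImport
from docling_core.types.io import DocumentStream as CoreImport

assert LegacyImport is CoreImport  # one class, exposed from two locations

ds = CoreImport(name="in-memory.pdf", stream=BytesIO(b"%PDF-1.4\n%placeholder"))
print(ds.name)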

View File

@@ -32,7 +32,7 @@ from docling_core.types.legacy_doc.document import (
 )
 from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
 from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
-from docling_core.utils.file import resolve_file_source
+from docling_core.utils.file import resolve_source_to_stream
 from pydantic import BaseModel
 from typing_extensions import deprecated
 
@@ -459,7 +459,7 @@ class _DocumentConversionInput(BaseModel):
         self, format_options: Dict[InputFormat, "FormatOption"]
     ) -> Iterable[InputDocument]:
         for item in self.path_or_stream_iterator:
-            obj = resolve_file_source(item) if isinstance(item, str) else item
+            obj = resolve_source_to_stream(item) if isinstance(item, str) else item
             format = self._guess_format(obj)
             if format not in format_options.keys():
                 _log.info(
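
For the converter path above, plain-string sources are resolved fully in memory. The expectation here (an assumption, inferred from how the result is passed straight to _guess_format) is that resolve_source_to_stream returns a DocumentStream, so no temporary file is written at all:

from docling_core.utils.file import resolve_source_to_stream

# Hypothetical URL; any string source (URL or local path) goes through the same call.
obj = resolve_source_to_stream("https://example.com/sample.pdf")
print(type(obj).__name__, obj.name)  # expected: DocumentStream with a name derived from the source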

poetry.lock (generated)
View File

@@ -896,13 +896,13 @@ files = [
 
 [[package]]
 name = "docling-core"
-version = "2.5.1"
+version = "2.6.1"
 description = "A python library to define and validate data types in Docling."
 optional = false
 python-versions = "<4.0,>=3.9"
 files = [
-    {file = "docling_core-2.5.1-py3-none-any.whl", hash = "sha256:e4a5626520714c25a5ec2f9f7495407e730485257c2272e8467faae7357435bf"},
-    {file = "docling_core-2.5.1.tar.gz", hash = "sha256:e9a7c7c46f869b13747436a2ce42df3632af655e1a3af574dfcd114e71dcbb75"},
+    {file = "docling_core-2.6.1-py3-none-any.whl", hash = "sha256:8e7a5bc0ce13289567738481949fed3ab580f2d8cea7525b246159233d81b26b"},
+    {file = "docling_core-2.6.1.tar.gz", hash = "sha256:c8af45e0873611120cc24757d567d37e053a54e2ce060b7b5b44efd0d73f75e5"},
 ]
 
 [package.dependencies]
[package.dependencies] [package.dependencies]
@@ -913,6 +913,7 @@ pillow = ">=10.3.0,<11.0.0"
 pydantic = ">=2.6.0,<2.10"
 pyyaml = ">=5.1,<7.0.0"
 tabulate = ">=0.9.0,<0.10.0"
+typing-extensions = ">=4.12.2,<5.0.0"
 
 [[package]]
 name = "docling-ibm-models"
@@ -3200,6 +3201,7 @@ files = [
     {file = "nh3-0.2.19-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:00810cd5275f5c3f44b9eb0e521d1a841ee2f8023622de39ffc7d88bd533d8e0"},
     {file = "nh3-0.2.19-cp38-abi3-win32.whl", hash = "sha256:7e98621856b0a911c21faa5eef8f8ea3e691526c2433f9afc2be713cb6fbdb48"},
     {file = "nh3-0.2.19-cp38-abi3-win_amd64.whl", hash = "sha256:75c7cafb840f24430b009f7368945cb5ca88b2b54bb384ebfba495f16bc9c121"},
+    {file = "nh3-0.2.19.tar.gz", hash = "sha256:790056b54c068ff8dceb443eaefb696b84beff58cca6c07afd754d17692a4804"},
 ]
 
 [[package]]
@@ -6028,6 +6030,11 @@ files = [
     {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"},
     {file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"},
     {file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"},
+    {file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"},
+    {file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"},
+    {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"},
+    {file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"},
+    {file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"},
     {file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"},
     {file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"},
     {file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"},
@@ -7646,4 +7653,4 @@ tesserocr = ["tesserocr"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "2e7c27ffe32d556a66cc1008a7147a90c17f63b01d2a6cde3e7b941ba7e268d7"
+content-hash = "ee3b3d938295f0057567c10fb808a0d95ed2fe9a32f459d489b4b29aacf710c8"

View File

@@ -26,7 +26,7 @@ packages = [{include = "docling"}]
 ######################
 python = "^3.9"
 pydantic = ">=2.0.0,<2.10"
-docling-core = "^2.5.1"
+docling-core = "^2.6.1"
 docling-ibm-models = "^2.0.6"
 deepsearch-glm = "^0.26.1"
 filetype = "^1.2.0"
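
The caret constraint above expands to >=2.6.1,<3.0.0. A small sanity check against an installed environment (this uses the third-party packaging library, which is an assumption on my part and not something this commit adds):

from importlib.metadata import version
from packaging.specifiers import SpecifierSet

required = SpecifierSet(">=2.6.1,<3.0.0")  # what docling-core = "^2.6.1" means
installed = version("docling-core")
print(installed, installed in required)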