perf: prevent temp file leftovers, reuse core type (#487)

* chore: reuse DocumentStream from docling-core

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>

* update docling-core version

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>

* [skip ci] document import line

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>

* fix: use new resolve_source_to_x functions to avoid tempfile leftovers (#490)

use new resolve_source_to_x functions

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
This commit is contained in:
Panos Vagenas 2024-12-03 10:40:28 +01:00 committed by GitHub
parent d3f84b2457
commit 051789d017
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 103 additions and 97 deletions

View File

@ -2,6 +2,7 @@ import importlib
import json
import logging
import re
import tempfile
import time
import warnings
from enum import Enum
@ -9,7 +10,7 @@ from pathlib import Path
from typing import Annotated, Dict, Iterable, List, Optional, Type
import typer
from docling_core.utils.file import resolve_file_source
from docling_core.utils.file import resolve_source_to_path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
@ -256,9 +257,10 @@ def convert(
if from_formats is None:
from_formats = [e for e in InputFormat]
with tempfile.TemporaryDirectory() as tempdir:
input_doc_paths: List[Path] = []
for src in input_sources:
source = resolve_file_source(source=src)
source = resolve_source_to_path(source=src, workdir=Path(tempdir))
if not source.exists():
err_console.print(
f"[red]Error: The input file {source} does not exist.[/red]"
@ -302,7 +304,9 @@ def convert(
ocr_options=ocr_options,
do_table_structure=True,
)
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
pipeline_options.table_structure_options.do_cell_matching = (
True # do_cell_matching
)
pipeline_options.table_structure_options.mode = table_mode
if artifacts_path is not None:

View File

@ -1,5 +1,4 @@
from enum import Enum, auto
from io import BytesIO
from typing import TYPE_CHECKING, Dict, List, Optional, Union
from docling_core.types.doc import (
@ -9,6 +8,9 @@ from docling_core.types.doc import (
Size,
TableCell,
)
from docling_core.types.io import ( # DO NOT REMOVE; explicitly exposed from this location
DocumentStream,
)
from PIL.Image import Image
from pydantic import BaseModel, ConfigDict
@ -207,10 +209,3 @@ class Page(BaseModel):
@property
def image(self) -> Optional[Image]:
return self.get_image(scale=self._default_image_scale)
class DocumentStream(BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)
name: str
stream: BytesIO

View File

@ -32,7 +32,7 @@ from docling_core.types.legacy_doc.document import (
)
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
from docling_core.utils.file import resolve_file_source
from docling_core.utils.file import resolve_source_to_stream
from pydantic import BaseModel
from typing_extensions import deprecated
@ -459,7 +459,7 @@ class _DocumentConversionInput(BaseModel):
self, format_options: Dict[InputFormat, "FormatOption"]
) -> Iterable[InputDocument]:
for item in self.path_or_stream_iterator:
obj = resolve_file_source(item) if isinstance(item, str) else item
obj = resolve_source_to_stream(item) if isinstance(item, str) else item
format = self._guess_format(obj)
if format not in format_options.keys():
_log.info(

15
poetry.lock generated
View File

@ -896,13 +896,13 @@ files = [
[[package]]
name = "docling-core"
version = "2.5.1"
version = "2.6.1"
description = "A python library to define and validate data types in Docling."
optional = false
python-versions = "<4.0,>=3.9"
files = [
{file = "docling_core-2.5.1-py3-none-any.whl", hash = "sha256:e4a5626520714c25a5ec2f9f7495407e730485257c2272e8467faae7357435bf"},
{file = "docling_core-2.5.1.tar.gz", hash = "sha256:e9a7c7c46f869b13747436a2ce42df3632af655e1a3af574dfcd114e71dcbb75"},
{file = "docling_core-2.6.1-py3-none-any.whl", hash = "sha256:8e7a5bc0ce13289567738481949fed3ab580f2d8cea7525b246159233d81b26b"},
{file = "docling_core-2.6.1.tar.gz", hash = "sha256:c8af45e0873611120cc24757d567d37e053a54e2ce060b7b5b44efd0d73f75e5"},
]
[package.dependencies]
@ -913,6 +913,7 @@ pillow = ">=10.3.0,<11.0.0"
pydantic = ">=2.6.0,<2.10"
pyyaml = ">=5.1,<7.0.0"
tabulate = ">=0.9.0,<0.10.0"
typing-extensions = ">=4.12.2,<5.0.0"
[[package]]
name = "docling-ibm-models"
@ -3200,6 +3201,7 @@ files = [
{file = "nh3-0.2.19-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:00810cd5275f5c3f44b9eb0e521d1a841ee2f8023622de39ffc7d88bd533d8e0"},
{file = "nh3-0.2.19-cp38-abi3-win32.whl", hash = "sha256:7e98621856b0a911c21faa5eef8f8ea3e691526c2433f9afc2be713cb6fbdb48"},
{file = "nh3-0.2.19-cp38-abi3-win_amd64.whl", hash = "sha256:75c7cafb840f24430b009f7368945cb5ca88b2b54bb384ebfba495f16bc9c121"},
{file = "nh3-0.2.19.tar.gz", hash = "sha256:790056b54c068ff8dceb443eaefb696b84beff58cca6c07afd754d17692a4804"},
]
[[package]]
@ -6028,6 +6030,11 @@ files = [
{file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"},
{file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"},
{file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"},
{file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"},
{file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"},
{file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"},
{file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"},
{file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"},
{file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"},
{file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"},
{file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"},
@ -7646,4 +7653,4 @@ tesserocr = ["tesserocr"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "2e7c27ffe32d556a66cc1008a7147a90c17f63b01d2a6cde3e7b941ba7e268d7"
content-hash = "ee3b3d938295f0057567c10fb808a0d95ed2fe9a32f459d489b4b29aacf710c8"

View File

@ -26,7 +26,7 @@ packages = [{include = "docling"}]
######################
python = "^3.9"
pydantic = ">=2.0.0,<2.10"
docling-core = "^2.5.1"
docling-core = "^2.6.1"
docling-ibm-models = "^2.0.6"
deepsearch-glm = "^0.26.1"
filetype = "^1.2.0"