diff --git a/Dockerfile b/Dockerfile
index 3fb8172..5b295ee 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -16,6 +16,9 @@ COPY examples/minimal.py /root/minimal.py
 RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
 RUN python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
 
+# On container environments, always set a thread budget to avoid undesired thread congestion.
+ENV OMP_NUM_THREADS=4
+
 # On container shell:
 # > cd /root/
 # > python minimal.py
diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py
index 3082b6c..09ec502 100644
--- a/docling/backend/docling_parse_backend.py
+++ b/docling/backend/docling_parse_backend.py
@@ -1,4 +1,6 @@
+import logging
 import random
+import time
 from io import BytesIO
 from pathlib import Path
 from typing import Iterable, List, Optional, Union
@@ -11,6 +13,8 @@ from pypdfium2 import PdfPage
 from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
 from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
 
+_log = logging.getLogger(__name__)
+
 
 class DoclingParsePageBackend(PdfPageBackend):
     def __init__(self, page_obj: PdfPage, docling_page_obj):
@@ -151,11 +155,21 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
         self._pdoc = pdfium.PdfDocument(path_or_stream)
         # Parsing cells with docling_parser call
         parser = pdf_parser()
+
+        start_pb_time = time.perf_counter()
+
         if isinstance(path_or_stream, BytesIO):
             self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
         else:
             self._parser_doc = parser.find_cells(str(path_or_stream))
 
+        elapsed_pb_time = time.perf_counter() - start_pb_time
+        # A BytesIO stream has no ``name`` attribute; fall back to a placeholder.
+        doc_name = getattr(path_or_stream, "name", "<stream>")
+        _log.info(
+            f"Time to parse {doc_name} with docling-parse: time={elapsed_pb_time:.3f}"
+        )
+
     def page_count(self) -> int:
         return len(self._parser_doc["pages"])
 
diff --git a/examples/batch_convert.py b/examples/batch_convert.py
index e54860e..d3d0e28 100644
--- a/examples/batch_convert.py
+++ b/examples/batch_convert.py
@@ -48,6 +48,8 @@ def main():
         Path("./test/data/2206.01062.pdf"),
         Path("./test/data/2203.01017v2.pdf"),
         Path("./test/data/2305.03393v1.pdf"),
+        Path("./test/data/redp5110.pdf"),
+        Path("./test/data/redp5695.pdf"),
     ]
 
     doc_converter = DocumentConverter()
diff --git a/test/data/redp5110.pdf b/test/data/redp5110.pdf
new file mode 100644
index 0000000..f4ff9a5
Binary files /dev/null and b/test/data/redp5110.pdf differ
diff --git a/test/data/redp5695.pdf b/test/data/redp5695.pdf
new file mode 100644
index 0000000..7ea9482
Binary files /dev/null and b/test/data/redp5695.pdf differ