Add redbooks to test data, small additions (#35)

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-08-20 12:36:00 +02:00 committed by GitHub
parent a13114bafd
commit c253dd743a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 17 additions and 0 deletions

View File

@@ -16,6 +16,9 @@ COPY examples/minimal.py /root/minimal.py
 RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
 RUN python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
+# On container environments, always set a thread budget to avoid undesired thread congestion.
+ENV OMP_NUM_THREADS=4
 # On container shell:
 # > cd /root/
 # > python minimal.py

View File

@@ -1,4 +1,6 @@
+import logging
 import random
+import time
 from io import BytesIO
 from pathlib import Path
 from typing import Iterable, List, Optional, Union
@@ -11,6 +13,8 @@ from pypdfium2 import PdfPage
 from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
 from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
+_log = logging.getLogger(__name__)
 class DoclingParsePageBackend(PdfPageBackend):
 def __init__(self, page_obj: PdfPage, docling_page_obj):
@@ -151,11 +155,19 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
 self._pdoc = pdfium.PdfDocument(path_or_stream)
 # Parsing cells with docling_parser call
 parser = pdf_parser()
+start_pb_time = time.time()
 if isinstance(path_or_stream, BytesIO):
 self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
 else:
 self._parser_doc = parser.find_cells(str(path_or_stream))
+end_pb_time = time.time() - start_pb_time
+_log.info(
+f"Time to parse {path_or_stream.name} with docling-parse: time={end_pb_time:.3f}"
+)
 def page_count(self) -> int:
 return len(self._parser_doc["pages"])

View File

@@ -48,6 +48,8 @@ def main():
 Path("./test/data/2206.01062.pdf"),
 Path("./test/data/2203.01017v2.pdf"),
 Path("./test/data/2305.03393v1.pdf"),
+Path("./test/data/redp5110.pdf"),
+Path("./test/data/redp5695.pdf"),
 ]
 doc_converter = DocumentConverter()

BIN
test/data/redp5110.pdf Normal file

Binary file not shown.

BIN
test/data/redp5695.pdf Normal file

Binary file not shown.