Add redbooks to test data, small additions (#35)

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-08-20 12:36:00 +02:00 committed by GitHub
parent a13114bafd
commit c253dd743a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 17 additions and 0 deletions

View File

@ -16,6 +16,9 @@ COPY examples/minimal.py /root/minimal.py
RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
RUN python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
# In container environments, always set a thread budget to avoid undesired thread congestion.
ENV OMP_NUM_THREADS=4
# In the container shell:
# > cd /root/
# > python minimal.py

View File

@ -1,4 +1,6 @@
import logging
import random
import time
from io import BytesIO
from pathlib import Path
from typing import Iterable, List, Optional, Union
@ -11,6 +13,8 @@ from pypdfium2 import PdfPage
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
_log = logging.getLogger(__name__)
class DoclingParsePageBackend(PdfPageBackend):
def __init__(self, page_obj: PdfPage, docling_page_obj):
@ -151,11 +155,19 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
self._pdoc = pdfium.PdfDocument(path_or_stream)
# Parsing cells with docling_parser call
parser = pdf_parser()
start_pb_time = time.time()
if isinstance(path_or_stream, BytesIO):
self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
else:
self._parser_doc = parser.find_cells(str(path_or_stream))
end_pb_time = time.time() - start_pb_time
_log.info(
f"Time to parse {path_or_stream.name} with docling-parse: time={end_pb_time:.3f}"
)
def page_count(self) -> int:
return len(self._parser_doc["pages"])

View File

@ -48,6 +48,8 @@ def main():
Path("./test/data/2206.01062.pdf"),
Path("./test/data/2203.01017v2.pdf"),
Path("./test/data/2305.03393v1.pdf"),
Path("./test/data/redp5110.pdf"),
Path("./test/data/redp5695.pdf"),
]
doc_converter = DocumentConverter()

BIN
test/data/redp5110.pdf Normal file

Binary file not shown.

BIN
test/data/redp5695.pdf Normal file

Binary file not shown.