Add redbooks to test data, small additions (#35)
Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
a13114bafd
commit
c253dd743a
@ -16,6 +16,9 @@ COPY examples/minimal.py /root/minimal.py
|
||||
RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
|
||||
RUN python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
|
||||
|
||||
# In container environments, always set a thread budget to avoid undesired thread congestion.
|
||||
ENV OMP_NUM_THREADS=4
|
||||
|
||||
# On container shell:
|
||||
# > cd /root/
|
||||
# > python minimal.py
|
||||
|
@ -1,4 +1,6 @@
|
||||
import logging
|
||||
import random
|
||||
import time
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List, Optional, Union
|
||||
@ -11,6 +13,8 @@ from pypdfium2 import PdfPage
|
||||
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DoclingParsePageBackend(PdfPageBackend):
|
||||
def __init__(self, page_obj: PdfPage, docling_page_obj):
|
||||
@ -151,11 +155,19 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
|
||||
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
||||
# Parsing cells with docling_parser call
|
||||
parser = pdf_parser()
|
||||
|
||||
start_pb_time = time.time()
|
||||
|
||||
if isinstance(path_or_stream, BytesIO):
|
||||
self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
|
||||
else:
|
||||
self._parser_doc = parser.find_cells(str(path_or_stream))
|
||||
|
||||
end_pb_time = time.time() - start_pb_time
|
||||
_log.info(
|
||||
f"Time to parse {path_or_stream.name} with docling-parse: time={end_pb_time:.3f}"
|
||||
)
|
||||
|
||||
def page_count(self) -> int:
    """Return the number of entries stored under ``"pages"`` in the parsed document."""
    parsed_pages = self._parser_doc["pages"]
    return len(parsed_pages)
|
||||
|
||||
|
@ -48,6 +48,8 @@ def main():
|
||||
Path("./test/data/2206.01062.pdf"),
|
||||
Path("./test/data/2203.01017v2.pdf"),
|
||||
Path("./test/data/2305.03393v1.pdf"),
|
||||
Path("./test/data/redp5110.pdf"),
|
||||
Path("./test/data/redp5695.pdf"),
|
||||
]
|
||||
|
||||
doc_converter = DocumentConverter()
|
||||
|
BIN
test/data/redp5110.pdf
Normal file
BIN
test/data/redp5110.pdf
Normal file
Binary file not shown.
BIN
test/data/redp5695.pdf
Normal file
BIN
test/data/redp5695.pdf
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user