Add redbooks to test data, small additions (#35)
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
a13114bafd
commit
c253dd743a
@ -16,6 +16,9 @@ COPY examples/minimal.py /root/minimal.py
|
|||||||
RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
|
RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
|
||||||
RUN python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
|
RUN python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'
|
||||||
|
|
||||||
|
# In container environments, always set a thread budget to avoid undesired thread congestion.
|
||||||
|
ENV OMP_NUM_THREADS=4
|
||||||
|
|
||||||
# On container shell:
|
# On container shell:
|
||||||
# > cd /root/
|
# > cd /root/
|
||||||
# > python minimal.py
|
# > python minimal.py
|
||||||
|
@ -1,4 +1,6 @@
|
|||||||
|
import logging
|
||||||
import random
|
import random
|
||||||
|
import time
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, List, Optional, Union
|
from typing import Iterable, List, Optional, Union
|
||||||
@ -11,6 +13,8 @@ from pypdfium2 import PdfPage
|
|||||||
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
|
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
|
||||||
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
|
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class DoclingParsePageBackend(PdfPageBackend):
|
class DoclingParsePageBackend(PdfPageBackend):
|
||||||
def __init__(self, page_obj: PdfPage, docling_page_obj):
|
def __init__(self, page_obj: PdfPage, docling_page_obj):
|
||||||
@ -151,11 +155,19 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
|
|||||||
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
||||||
# Parsing cells with docling_parser call
|
# Parsing cells with docling_parser call
|
||||||
parser = pdf_parser()
|
parser = pdf_parser()
|
||||||
|
|
||||||
|
start_pb_time = time.time()
|
||||||
|
|
||||||
if isinstance(path_or_stream, BytesIO):
|
if isinstance(path_or_stream, BytesIO):
|
||||||
self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
|
self._parser_doc = parser.find_cells_from_bytesio(path_or_stream)
|
||||||
else:
|
else:
|
||||||
self._parser_doc = parser.find_cells(str(path_or_stream))
|
self._parser_doc = parser.find_cells(str(path_or_stream))
|
||||||
|
|
||||||
|
end_pb_time = time.time() - start_pb_time
|
||||||
|
_log.info(
|
||||||
|
f"Time to parse {path_or_stream.name} with docling-parse: time={end_pb_time:.3f}"
|
||||||
|
)
|
||||||
|
|
||||||
def page_count(self) -> int:
|
def page_count(self) -> int:
|
||||||
return len(self._parser_doc["pages"])
|
return len(self._parser_doc["pages"])
|
||||||
|
|
||||||
|
@ -48,6 +48,8 @@ def main():
|
|||||||
Path("./test/data/2206.01062.pdf"),
|
Path("./test/data/2206.01062.pdf"),
|
||||||
Path("./test/data/2203.01017v2.pdf"),
|
Path("./test/data/2203.01017v2.pdf"),
|
||||||
Path("./test/data/2305.03393v1.pdf"),
|
Path("./test/data/2305.03393v1.pdf"),
|
||||||
|
Path("./test/data/redp5110.pdf"),
|
||||||
|
Path("./test/data/redp5695.pdf"),
|
||||||
]
|
]
|
||||||
|
|
||||||
doc_converter = DocumentConverter()
|
doc_converter = DocumentConverter()
|
||||||
|
BIN
test/data/redp5110.pdf
Normal file
BIN
test/data/redp5110.pdf
Normal file
Binary file not shown.
BIN
test/data/redp5695.pdf
Normal file
BIN
test/data/redp5695.pdf
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user