feat: Upgrade docling-parse PDF backend and interface to use page-by-page parsing (#44)

* Use docling-parse page-by-page Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Propagate document_hash to PDF backends, use docling-parse 1.0.0 Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Upgrade lockfile Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * repin after more packages on pypi Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
2024-08-22 13:49:37 +02:00
parent f7c50c8b0e
commit a8c6b29a67
8 changed files with 73 additions and 51 deletions
@@ -215,8 +215,8 @@ class PyPdfiumPageBackend(PdfPageBackend):


 class PyPdfiumDocumentBackend(PdfDocumentBackend):
-    def __init__(self, path_or_stream: Union[BytesIO, Path]):
-        super().__init__(path_or_stream)
+    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
+        super().__init__(path_or_stream, document_hash)
        self._pdoc = pdfium.PdfDocument(path_or_stream)

    def page_count(self) -> int:
@@ -229,5 +229,6 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
        return self.page_count() > 0

    def unload(self):
+        super().unload()
        self._pdoc.close()
        self._pdoc = None