feat: Upgrade docling-parse PDF backend and interface to use page-by-page parsing (#44)
* Use docling-parse page-by-page
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Propagate document_hash to PDF backends, use docling-parse 1.0.0
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Upgrade lockfile
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* repin after more packages on pypi
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
---------
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
@@ -215,8 +215,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
||||
|
||||
|
||||
class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
||||
def __init__(self, path_or_stream: Union[BytesIO, Path]):
|
||||
super().__init__(path_or_stream)
|
||||
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
|
||||
super().__init__(path_or_stream, document_hash)
|
||||
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
||||
|
||||
def page_count(self) -> int:
|
||||
@@ -229,5 +229,6 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
||||
return self.page_count() > 0
|
||||
|
||||
def unload(self):
|
||||
super().unload()
|
||||
self._pdoc.close()
|
||||
self._pdoc = None
|
||||
|
||||
Reference in New Issue
Block a user