feat: Page-level error reporting from PDF backend, introduce PARTIAL_SUCCESS status (#47)

* Add safety checks for pages that fail to parse

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Introduce page-level error checks

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Bump to docling-parse 1.1.1

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Introduce page-level error checks

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2024-08-23 16:18:41 +02:00
committed by GitHub
parent 3226b20779
commit a294b7e64a
7 changed files with 92 additions and 30 deletions

View File

@@ -19,22 +19,23 @@ class DoclingParsePageBackend(PdfPageBackend):
def __init__(
self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage
):
super().__init__(page_obj)
self._ppage = page_obj
parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
self._dpage = None
self.broken_page = "pages" not in parsed_page
if not self.broken_page:
self.valid = "pages" in parsed_page
if self.valid:
self._dpage = parsed_page["pages"][0]
else:
raise RuntimeError(
f"Page {page_no} of document {document_hash} could not be parsed."
_log.info(
f"An error occured when loading page {page_no} of document {document_hash}."
)
def is_valid(self) -> bool:
return self.valid
def get_text_in_rect(self, bbox: BoundingBox) -> str:
if self.broken_page:
if not self.valid:
return ""
# Find intersecting cells on the page
text_piece = ""
@@ -70,7 +71,7 @@ class DoclingParsePageBackend(PdfPageBackend):
cells = []
cell_counter = 0
if self.broken_page:
if not self.valid:
return cells
page_size = self.get_size()
@@ -201,7 +202,9 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
success = self.parser.load_document(document_hash, str(path_or_stream))
if not success:
raise RuntimeError("docling-parse could not load this document.")
raise RuntimeError(
f"docling-parse could not load document {document_hash}."
)
def page_count(self) -> int:
return len(self._pdoc) # To be replaced with docling-parse API