From 8808463cecd7ff3a92bd99d2e3d65fd248672c9e Mon Sep 17 00:00:00 2001
From: Christoph Auer <60343111+cau-git@users.noreply.github.com>
Date: Fri, 23 Aug 2024 13:51:42 +0200
Subject: [PATCH] fix: Better raise exception when a page fails to parse (#46)

* Put safety-checks for failed parse of pages

Signed-off-by: Christoph Auer

* Bump to docling-parse 1.1.1

Signed-off-by: Christoph Auer

* Raise from page backend if page is not correctly parsed

Signed-off-by: Christoph Auer

---------

Signed-off-by: Christoph Auer
---
 docling/backend/docling_parse_backend.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py
index 905d365..8ccc0c8 100644
--- a/docling/backend/docling_parse_backend.py
+++ b/docling/backend/docling_parse_backend.py
@@ -28,6 +28,10 @@ class DoclingParsePageBackend(PdfPageBackend):
         self.broken_page = "pages" not in parsed_page
         if not self.broken_page:
             self._dpage = parsed_page["pages"][0]
+        else:
+            raise RuntimeError(
+                f"Page {page_no} of document {document_hash} could not be parsed."
+            )
 
     def get_text_in_rect(self, bbox: BoundingBox) -> str:
         if self.broken_page: