feat: Page-level error reporting from PDF backend, introduce PARTIAL_SUCCESS status (#47)

* Add safety checks for pages that fail to parse

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Introduce page-level error checks

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Bump to docling-parse 1.1.1

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Introduce page-level error checks

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-08-23 16:18:41 +02:00 committed by GitHub
parent 3226b20779
commit a294b7e64a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 92 additions and 30 deletions

View File

@ -7,8 +7,6 @@ from PIL import Image
class PdfPageBackend(ABC): class PdfPageBackend(ABC):
def __init__(self, page_obj: Any) -> object:
pass
@abstractmethod @abstractmethod
def get_text_in_rect(self, bbox: "BoundingBox") -> str: def get_text_in_rect(self, bbox: "BoundingBox") -> str:
@ -32,6 +30,10 @@ class PdfPageBackend(ABC):
def get_size(self) -> "PageSize": def get_size(self) -> "PageSize":
pass pass
@abstractmethod
def is_valid(self) -> bool:
pass
@abstractmethod @abstractmethod
def unload(self): def unload(self):
pass pass

View File

@ -19,22 +19,23 @@ class DoclingParsePageBackend(PdfPageBackend):
def __init__( def __init__(
self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage
): ):
super().__init__(page_obj)
self._ppage = page_obj self._ppage = page_obj
parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no) parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
self._dpage = None self._dpage = None
self.broken_page = "pages" not in parsed_page self.valid = "pages" in parsed_page
if not self.broken_page: if self.valid:
self._dpage = parsed_page["pages"][0] self._dpage = parsed_page["pages"][0]
else: else:
raise RuntimeError( _log.info(
f"Page {page_no} of document {document_hash} could not be parsed." f"An error occured when loading page {page_no} of document {document_hash}."
) )
def is_valid(self) -> bool:
return self.valid
def get_text_in_rect(self, bbox: BoundingBox) -> str: def get_text_in_rect(self, bbox: BoundingBox) -> str:
if self.broken_page: if not self.valid:
return "" return ""
# Find intersecting cells on the page # Find intersecting cells on the page
text_piece = "" text_piece = ""
@ -70,7 +71,7 @@ class DoclingParsePageBackend(PdfPageBackend):
cells = [] cells = []
cell_counter = 0 cell_counter = 0
if self.broken_page: if not self.valid:
return cells return cells
page_size = self.get_size() page_size = self.get_size()
@ -201,7 +202,9 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
success = self.parser.load_document(document_hash, str(path_or_stream)) success = self.parser.load_document(document_hash, str(path_or_stream))
if not success: if not success:
raise RuntimeError("docling-parse could not load this document.") raise RuntimeError(
f"docling-parse could not load document {document_hash}."
)
def page_count(self) -> int: def page_count(self) -> int:
return len(self._pdoc) # To be replaced with docling-parse API return len(self._pdoc) # To be replaced with docling-parse API

View File

@ -1,3 +1,4 @@
import logging
import random import random
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
@ -7,17 +8,32 @@ import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c import pypdfium2.raw as pdfium_c
from PIL import Image, ImageDraw from PIL import Image, ImageDraw
from pypdfium2 import PdfPage from pypdfium2 import PdfPage
from pypdfium2._helpers.misc import PdfiumError
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
_log = logging.getLogger(__name__)
class PyPdfiumPageBackend(PdfPageBackend): class PyPdfiumPageBackend(PdfPageBackend):
def __init__(self, page_obj: PdfPage): def __init__(
super().__init__(page_obj) self, pdfium_doc: pdfium.PdfDocument, document_hash: str, page_no: int
self._ppage = page_obj ):
self.valid = True # No better way to tell from pypdfium.
try:
self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
except PdfiumError as e:
_log.info(
f"An exception occured when loading page {page_no} of document {document_hash}.",
exc_info=True,
)
self.valid = False
self.text_page = None self.text_page = None
def is_valid(self) -> bool:
return self.valid
def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]: def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
AREA_THRESHOLD = 32 * 32 AREA_THRESHOLD = 32 * 32
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]): for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
@ -217,13 +233,18 @@ class PyPdfiumPageBackend(PdfPageBackend):
class PyPdfiumDocumentBackend(PdfDocumentBackend): class PyPdfiumDocumentBackend(PdfDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str): def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
super().__init__(path_or_stream, document_hash) super().__init__(path_or_stream, document_hash)
self._pdoc = pdfium.PdfDocument(path_or_stream) try:
self._pdoc = pdfium.PdfDocument(path_or_stream)
except PdfiumError as e:
raise RuntimeError(
f"pypdfium could not load document {document_hash}"
) from e
def page_count(self) -> int: def page_count(self) -> int:
return len(self._pdoc) return len(self._pdoc)
def load_page(self, page_no: int) -> PyPdfiumPageBackend: def load_page(self, page_no: int) -> PyPdfiumPageBackend:
return PyPdfiumPageBackend(self._pdoc[page_no]) return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no)
def is_valid(self) -> bool: def is_valid(self) -> bool:
return self.page_count() > 0 return self.page_count() > 0

View File

@ -16,7 +16,7 @@ class ConversionStatus(str, Enum):
STARTED = auto() STARTED = auto()
FAILURE = auto() FAILURE = auto()
SUCCESS = auto() SUCCESS = auto()
SUCCESS_WITH_ERRORS = auto() PARTIAL_SUCCESS = auto()
class DocInputType(str, Enum): class DocInputType(str, Enum):
@ -29,6 +29,18 @@ class CoordOrigin(str, Enum):
BOTTOMLEFT = auto() BOTTOMLEFT = auto()
class DoclingComponentType(str, Enum):
PDF_BACKEND = auto()
MODEL = auto()
DOC_ASSEMBLER = auto()
class ErrorItem(BaseModel):
component_type: DoclingComponentType
module_name: str
error_message: str
class PageSize(BaseModel): class PageSize(BaseModel):
width: float = 0.0 width: float = 0.0
height: float = 0.0 height: float = 0.0

View File

@ -19,6 +19,7 @@ from docling.datamodel.base_models import (
AssembledUnit, AssembledUnit,
ConversionStatus, ConversionStatus,
DocumentStream, DocumentStream,
ErrorItem,
FigureElement, FigureElement,
Page, Page,
PageElement, PageElement,
@ -118,7 +119,7 @@ class ConvertedDocument(BaseModel):
input: InputDocument input: InputDocument
status: ConversionStatus = ConversionStatus.PENDING # failure, success status: ConversionStatus = ConversionStatus.PENDING # failure, success
errors: List[Dict] = [] # structure to keep errors errors: List[ErrorItem] = [] # structure to keep errors
pages: List[Page] = [] pages: List[Page] = []
assembled: Optional[AssembledUnit] = None assembled: Optional[AssembledUnit] = None

View File

@ -16,6 +16,8 @@ from docling.datamodel.base_models import (
AssembledUnit, AssembledUnit,
AssembleOptions, AssembleOptions,
ConversionStatus, ConversionStatus,
DoclingComponentType,
ErrorItem,
Page, Page,
PipelineOptions, PipelineOptions,
) )
@ -157,7 +159,6 @@ class DocumentConverter:
for page_batch in chunkify( for page_batch in chunkify(
converted_doc.pages, settings.perf.page_batch_size converted_doc.pages, settings.perf.page_batch_size
): ):
start_pb_time = time.time() start_pb_time = time.time()
# Pipeline # Pipeline
@ -205,12 +206,27 @@ class DocumentConverter:
converted_doc.pages = all_assembled_pages converted_doc.pages = all_assembled_pages
self.assemble_doc(converted_doc) self.assemble_doc(converted_doc)
converted_doc.status = ConversionStatus.SUCCESS status = ConversionStatus.SUCCESS
for page in converted_doc.pages:
if not page._backend.is_valid():
converted_doc.errors.append(
ErrorItem(
component_type=DoclingComponentType.PDF_BACKEND,
module_name=type(page._backend).__name__,
error_message=f"Page {page.page_no} failed to parse.",
)
)
status = ConversionStatus.PARTIAL_SUCCESS
converted_doc.status = status
except Exception as e: except Exception as e:
converted_doc.status = ConversionStatus.FAILURE converted_doc.status = ConversionStatus.FAILURE
trace = "\n".join(traceback.format_exception(e)) trace = "\n".join(traceback.format_exception(e))
_log.info(f"Encountered an error during conversion: {trace}") _log.info(
f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
f"{trace}"
)
end_doc_time = time.time() - start_doc_time end_doc_time = time.time() - start_doc_time
_log.info( _log.info(
@ -230,7 +246,9 @@ class DocumentConverter:
# Generate the page image and store it in the page object # Generate the page image and store it in the page object
def populate_page_images(self, doc: InputDocument, page: Page) -> Page: def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
# default scale # default scale
page.get_image(scale=1.0) page.get_image(
scale=1.0
) # puts the page image on the image cache at default scale
# user requested scales # user requested scales
if self.assemble_options.images_scale is not None: if self.assemble_options.images_scale is not None:

View File

@ -1,15 +1,10 @@
import json import json
import logging import logging
import time import time
from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Iterable from typing import Iterable
from docling.datamodel.base_models import ( from docling.datamodel.base_models import ConversionStatus, PipelineOptions
ConversionStatus,
DocumentStream,
PipelineOptions,
)
from docling.datamodel.document import ConvertedDocument, DocumentConversionInput from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter
@ -24,6 +19,7 @@ def export_documents(
success_count = 0 success_count = 0
failure_count = 0 failure_count = 0
partial_success_count = 0
for doc in converted_docs: for doc in converted_docs:
if doc.status == ConversionStatus.SUCCESS: if doc.status == ConversionStatus.SUCCESS:
@ -37,12 +33,21 @@ def export_documents(
# Export Markdown format: # Export Markdown format:
with (output_dir / f"{doc_filename}.md").open("w") as fp: with (output_dir / f"{doc_filename}.md").open("w") as fp:
fp.write(doc.render_as_markdown()) fp.write(doc.render_as_markdown())
elif doc.status == ConversionStatus.PARTIAL_SUCCESS:
_log.info(
f"Document {doc.input.file} was partially converted with the following errors:"
)
for item in doc.errors:
_log.info(f"\t{item.error_message}")
partial_success_count += 1
else: else:
_log.info(f"Document {doc.input.file} failed to convert.") _log.info(f"Document {doc.input.file} failed to convert.")
failure_count += 1 failure_count += 1
_log.info( _log.info(
f"Processed {success_count + failure_count} docs, of which {failure_count} failed" f"Processed {success_count + partial_success_count + failure_count} docs, "
f"of which {failure_count} failed "
f"and {partial_success_count} were partially converted."
) )
@ -61,7 +66,7 @@ def main():
# docs = [DocumentStream(filename="my_doc.pdf", stream=buf)] # docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
# input = DocumentConversionInput.from_streams(docs) # input = DocumentConversionInput.from_streams(docs)
doc_converter = DocumentConverter(pipeline_options=PipelineOptions(do_ocr=False)) doc_converter = DocumentConverter()
input = DocumentConversionInput.from_paths(input_doc_paths) input = DocumentConversionInput.from_paths(input_doc_paths)