Patch summary (commentary only; patch tools skip text before the first "diff --git" header).

Adds an abstract is_valid() method to PdfPageBackend and implements it in both page
backends. DoclingParsePageBackend now logs and marks the page invalid instead of raising
RuntimeError; PyPdfiumPageBackend loads the page itself and catches PdfiumError.
ConversionStatus.SUCCESS_WITH_ERRORS is renamed to PARTIAL_SUCCESS, the ErrorItem and
DoclingComponentType models are introduced, and DocumentConverter records one ErrorItem
per invalid page. The batch_convert example reports partially converted documents.

NOTE(review): line breaks, indentation and blank lines were restored from a
whitespace-collapsed copy. They agree with every hunk header's line counts, but exact
whitespace should be confirmed against the repository before applying.
NOTE(review): in the PyPdfiumPageBackend hunk, self._ppage is never assigned when
PdfiumError is caught; confirm callers check is_valid() before touching the page.
NOTE(review): the docling_parse_backend hunk calls _log.info but no _log definition is
visible in this patch; presumably it exists outside the hunks, verify.

diff --git a/docling/backend/abstract_backend.py b/docling/backend/abstract_backend.py
index 36f6119..7bb53fc 100644
--- a/docling/backend/abstract_backend.py
+++ b/docling/backend/abstract_backend.py
@@ -7,8 +7,6 @@ from PIL import Image
 
 
 class PdfPageBackend(ABC):
-    def __init__(self, page_obj: Any) -> object:
-        pass
 
     @abstractmethod
     def get_text_in_rect(self, bbox: "BoundingBox") -> str:
@@ -32,6 +30,10 @@ class PdfPageBackend(ABC):
     def get_size(self) -> "PageSize":
         pass
 
+    @abstractmethod
+    def is_valid(self) -> bool:
+        pass
+
     @abstractmethod
     def unload(self):
         pass
diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py
index 8ccc0c8..aeaf473 100644
--- a/docling/backend/docling_parse_backend.py
+++ b/docling/backend/docling_parse_backend.py
@@ -19,22 +19,23 @@ class DoclingParsePageBackend(PdfPageBackend):
     def __init__(
         self, parser: pdf_parser, document_hash: str, page_no: int, page_obj: PdfPage
     ):
-        super().__init__(page_obj)
         self._ppage = page_obj
-
         parsed_page = parser.parse_pdf_from_key_on_page(document_hash, page_no)
 
         self._dpage = None
-        self.broken_page = "pages" not in parsed_page
-        if not self.broken_page:
+        self.valid = "pages" in parsed_page
+        if self.valid:
             self._dpage = parsed_page["pages"][0]
         else:
-            raise RuntimeError(
-                f"Page {page_no} of document {document_hash} could not be parsed."
+            _log.info(
+                f"An error occured when loading page {page_no} of document {document_hash}."
             )
 
+    def is_valid(self) -> bool:
+        return self.valid
+
     def get_text_in_rect(self, bbox: BoundingBox) -> str:
-        if self.broken_page:
+        if not self.valid:
             return ""
         # Find intersecting cells on the page
         text_piece = ""
@@ -70,7 +71,7 @@ class DoclingParsePageBackend(PdfPageBackend):
         cells = []
         cell_counter = 0
 
-        if self.broken_page:
+        if not self.valid:
             return cells
 
         page_size = self.get_size()
@@ -201,7 +202,9 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
         success = self.parser.load_document(document_hash, str(path_or_stream))
 
         if not success:
-            raise RuntimeError("docling-parse could not load this document.")
+            raise RuntimeError(
+                f"docling-parse could not load document {document_hash}."
+            )
 
     def page_count(self) -> int:
         return len(self._pdoc)  # To be replaced with docling-parse API
diff --git a/docling/backend/pypdfium2_backend.py b/docling/backend/pypdfium2_backend.py
index 56758b1..b7ec824 100644
--- a/docling/backend/pypdfium2_backend.py
+++ b/docling/backend/pypdfium2_backend.py
@@ -1,3 +1,4 @@
+import logging
 import random
 from io import BytesIO
 from pathlib import Path
@@ -7,17 +8,32 @@ import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage
+from pypdfium2._helpers.misc import PdfiumError
 
 from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
 from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
 
+_log = logging.getLogger(__name__)
+
 
 class PyPdfiumPageBackend(PdfPageBackend):
-    def __init__(self, page_obj: PdfPage):
-        super().__init__(page_obj)
-        self._ppage = page_obj
+    def __init__(
+        self, pdfium_doc: pdfium.PdfDocument, document_hash: str, page_no: int
+    ):
+        self.valid = True  # No better way to tell from pypdfium.
+        try:
+            self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
+        except PdfiumError as e:
+            _log.info(
+                f"An exception occured when loading page {page_no} of document {document_hash}.",
+                exc_info=True,
+            )
+            self.valid = False
         self.text_page = None
 
+    def is_valid(self) -> bool:
+        return self.valid
+
     def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
         AREA_THRESHOLD = 32 * 32
         for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
@@ -217,13 +233,18 @@ class PyPdfiumPageBackend(PdfPageBackend):
 class PyPdfiumDocumentBackend(PdfDocumentBackend):
     def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
         super().__init__(path_or_stream, document_hash)
-        self._pdoc = pdfium.PdfDocument(path_or_stream)
+        try:
+            self._pdoc = pdfium.PdfDocument(path_or_stream)
+        except PdfiumError as e:
+            raise RuntimeError(
+                f"pypdfium could not load document {document_hash}"
+            ) from e
 
     def page_count(self) -> int:
         return len(self._pdoc)
 
     def load_page(self, page_no: int) -> PyPdfiumPageBackend:
-        return PyPdfiumPageBackend(self._pdoc[page_no])
+        return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no)
 
     def is_valid(self) -> bool:
         return self.page_count() > 0
diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
index c579cb5..2705c9d 100644
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -16,7 +16,7 @@ class ConversionStatus(str, Enum):
     STARTED = auto()
     FAILURE = auto()
     SUCCESS = auto()
-    SUCCESS_WITH_ERRORS = auto()
+    PARTIAL_SUCCESS = auto()
 
 
 class DocInputType(str, Enum):
@@ -29,6 +29,18 @@ class CoordOrigin(str, Enum):
     BOTTOMLEFT = auto()
 
 
+class DoclingComponentType(str, Enum):
+    PDF_BACKEND = auto()
+    MODEL = auto()
+    DOC_ASSEMBLER = auto()
+
+
+class ErrorItem(BaseModel):
+    component_type: DoclingComponentType
+    module_name: str
+    error_message: str
+
+
 class PageSize(BaseModel):
     width: float = 0.0
     height: float = 0.0
diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
index 5726b76..57d40c3 100644
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -19,6 +19,7 @@ from docling.datamodel.base_models import (
     AssembledUnit,
     ConversionStatus,
     DocumentStream,
+    ErrorItem,
     FigureElement,
     Page,
     PageElement,
@@ -118,7 +119,7 @@ class ConvertedDocument(BaseModel):
     input: InputDocument
 
     status: ConversionStatus = ConversionStatus.PENDING  # failure, success
-    errors: List[Dict] = []  # structure to keep errors
+    errors: List[ErrorItem] = []  # structure to keep errors
 
     pages: List[Page] = []
     assembled: Optional[AssembledUnit] = None
diff --git a/docling/document_converter.py b/docling/document_converter.py
index 8b1b0e1..8a71a57 100644
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -16,6 +16,8 @@ from docling.datamodel.base_models import (
     AssembledUnit,
     AssembleOptions,
     ConversionStatus,
+    DoclingComponentType,
+    ErrorItem,
     Page,
     PipelineOptions,
 )
@@ -157,7 +159,6 @@ class DocumentConverter:
             for page_batch in chunkify(
                 converted_doc.pages, settings.perf.page_batch_size
             ):
-
                 start_pb_time = time.time()
 
                 # Pipeline
@@ -205,12 +206,27 @@ class DocumentConverter:
             converted_doc.pages = all_assembled_pages
             self.assemble_doc(converted_doc)
 
-            converted_doc.status = ConversionStatus.SUCCESS
+            status = ConversionStatus.SUCCESS
+            for page in converted_doc.pages:
+                if not page._backend.is_valid():
+                    converted_doc.errors.append(
+                        ErrorItem(
+                            component_type=DoclingComponentType.PDF_BACKEND,
+                            module_name=type(page._backend).__name__,
+                            error_message=f"Page {page.page_no} failed to parse.",
+                        )
+                    )
+                    status = ConversionStatus.PARTIAL_SUCCESS
+
+            converted_doc.status = status
 
         except Exception as e:
             converted_doc.status = ConversionStatus.FAILURE
             trace = "\n".join(traceback.format_exception(e))
-            _log.info(f"Encountered an error during conversion: {trace}")
+            _log.info(
+                f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
+                f"{trace}"
+            )
 
         end_doc_time = time.time() - start_doc_time
         _log.info(
@@ -230,7 +246,9 @@ class DocumentConverter:
     # Generate the page image and store it in the page object
     def populate_page_images(self, doc: InputDocument, page: Page) -> Page:
         # default scale
-        page.get_image(scale=1.0)
+        page.get_image(
+            scale=1.0
+        )  # puts the page image on the image cache at default scale
 
         # user requested scales
         if self.assemble_options.images_scale is not None:
diff --git a/examples/batch_convert.py b/examples/batch_convert.py
index 76bbdcd..f1a5c8b 100644
--- a/examples/batch_convert.py
+++ b/examples/batch_convert.py
@@ -1,15 +1,10 @@
 import json
 import logging
 import time
-from io import BytesIO
 from pathlib import Path
 from typing import Iterable
 
-from docling.datamodel.base_models import (
-    ConversionStatus,
-    DocumentStream,
-    PipelineOptions,
-)
+from docling.datamodel.base_models import ConversionStatus, PipelineOptions
 from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
 from docling.document_converter import DocumentConverter
 
@@ -24,6 +19,7 @@ def export_documents(
 
     success_count = 0
     failure_count = 0
+    partial_success_count = 0
 
     for doc in converted_docs:
         if doc.status == ConversionStatus.SUCCESS:
@@ -37,12 +33,21 @@ def export_documents(
             # Export Markdown format:
             with (output_dir / f"{doc_filename}.md").open("w") as fp:
                 fp.write(doc.render_as_markdown())
+        elif doc.status == ConversionStatus.PARTIAL_SUCCESS:
+            _log.info(
+                f"Document {doc.input.file} was partially converted with the following errors:"
+            )
+            for item in doc.errors:
+                _log.info(f"\t{item.error_message}")
+            partial_success_count += 1
         else:
             _log.info(f"Document {doc.input.file} failed to convert.")
             failure_count += 1
 
     _log.info(
-        f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
+        f"Processed {success_count + partial_success_count + failure_count} docs, "
+        f"of which {failure_count} failed "
+        f"and {partial_success_count} were partially converted."
     )
 
 
@@ -61,7 +66,7 @@ def main():
     # docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
     # input = DocumentConversionInput.from_streams(docs)
 
-    doc_converter = DocumentConverter(pipeline_options=PipelineOptions(do_ocr=False))
+    doc_converter = DocumentConverter()
 
     input = DocumentConversionInput.from_paths(input_doc_paths)
 