diff --git a/docling/models/readingorder_model.py b/docling/models/readingorder_model.py index ba1bb45..9c38756 100644 --- a/docling/models/readingorder_model.py +++ b/docling/models/readingorder_model.py @@ -124,7 +124,7 @@ class ReadingOrderModel: page_no = page.page_no + 1 size = page.size - assert size is not None + assert size is not None, "Page size is not initialized." out_doc.add_page(page_no=page_no, size=size) diff --git a/docling/pipeline/base_pipeline.py b/docling/pipeline/base_pipeline.py index 29475d6..2b16810 100644 --- a/docling/pipeline/base_pipeline.py +++ b/docling/pipeline/base_pipeline.py @@ -193,6 +193,17 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name. ) raise e + # Filter out uninitialized pages (those with size=None) that may remain + # after timeout or processing failures to prevent assertion errors downstream + initial_page_count = len(conv_res.pages) + conv_res.pages = [page for page in conv_res.pages if page.size is not None] + + if len(conv_res.pages) < initial_page_count: + _log.info( + f"Filtered out {initial_page_count - len(conv_res.pages)} uninitialized pages " + f"due to timeout or processing failures" + ) + return conv_res def _unload(self, conv_res: ConversionResult) -> ConversionResult: