From ec6cf6f7e8050db30c14f0625d6d5c6bbfeb6aeb Mon Sep 17 00:00:00 2001 From: Christoph Auer <60343111+cau-git@users.noreply.github.com> Date: Fri, 4 Jul 2025 15:36:13 +0200 Subject: [PATCH] feat: Introduce LayoutOptions to control layout postprocessing behaviour (#1870) Signed-off-by: Christoph Auer --- docling/datamodel/pipeline_options.py | 8 ++++++++ docling/models/layout_model.py | 10 ++++++++-- docling/pipeline/standard_pdf_pipeline.py | 1 + docling/utils/layout_postprocessor.py | 9 +++++++-- 4 files changed, 24 insertions(+), 4 deletions(-) diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 11e085b..fcf091e 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -1,4 +1,5 @@ import logging +from datetime import datetime from enum import Enum from pathlib import Path from typing import Any, ClassVar, Dict, List, Literal, Optional, Union @@ -265,6 +266,12 @@ class VlmPipelineOptions(PaginatedPipelineOptions): ) +class LayoutOptions(BaseModel): + """Options for layout processing.""" + + create_orphan_clusters: bool = True # Whether to create clusters for orphaned cells + + class AsrPipelineOptions(PipelineOptions): asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY artifacts_path: Optional[Union[Path, str]] = None @@ -289,6 +296,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions): picture_description_options: PictureDescriptionBaseOptions = ( smolvlm_picture_description ) + layout_options: LayoutOptions = LayoutOptions() images_scale: float = 1.0 generate_page_images: bool = False diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py index da75bb8..44e7286 100644 --- a/docling/models/layout_model.py +++ b/docling/models/layout_model.py @@ -12,6 +12,7 @@ from PIL import Image from docling.datamodel.accelerator_options import AcceleratorOptions from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page from docling.datamodel.document import ConversionResult +from docling.datamodel.pipeline_options import LayoutOptions from docling.datamodel.settings import settings from docling.models.base_model import BasePageModel from docling.models.utils.hf_model_download import download_hf_model @@ -48,10 +49,15 @@ class LayoutModel(BasePageModel): CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION] def __init__( - self, artifacts_path: Optional[Path], accelerator_options: AcceleratorOptions + self, + artifacts_path: Optional[Path], + accelerator_options: AcceleratorOptions, + options: LayoutOptions, ): from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor + self.options = options + device = decide_device(accelerator_options.device) if artifacts_path is None: @@ -177,7 +183,7 @@ class LayoutModel(BasePageModel): # Apply postprocessing processed_clusters, processed_cells = LayoutPostprocessor( - page, clusters + page, clusters, self.options ).postprocess() # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index ad4f36d..8861174 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -80,6 +80,7 @@ class StandardPdfPipeline(PaginatedPipeline): LayoutModel( artifacts_path=artifacts_path, accelerator_options=pipeline_options.accelerator_options, + options=pipeline_options.layout_options, ), # Table structure model TableStructureModel( diff --git a/docling/utils/layout_postprocessor.py b/docling/utils/layout_postprocessor.py index 3db1cf8..a98b3aa 100644 --- a/docling/utils/layout_postprocessor.py +++ b/docling/utils/layout_postprocessor.py @@ -9,6 +9,7 @@ from docling_core.types.doc.page import TextCell from rtree import index from docling.datamodel.base_models import BoundingBox, Cluster, Page +from docling.datamodel.pipeline_options import LayoutOptions _log = logging.getLogger(__name__) @@ -194,12 +195,16 @@ class LayoutPostprocessor: DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER, } - def __init__(self, page: Page, clusters: List[Cluster]) -> None: + def __init__( + self, page: Page, clusters: List[Cluster], options: LayoutOptions + ) -> None: """Initialize processor with page and clusters.""" + self.cells = page.cells self.page = page self.page_size = page.size self.all_clusters = clusters + self.options = options self.regular_clusters = [ c for c in clusters if c.label not in self.SPECIAL_TYPES ] @@ -267,7 +272,7 @@ class LayoutPostprocessor: # Handle orphaned cells unassigned = self._find_unassigned_cells(clusters) - if unassigned: + if unassigned and self.options.create_orphan_clusters: next_id = max((c.id for c in self.all_clusters), default=0) + 1 orphan_clusters = [] for i, cell in enumerate(unassigned):