feat: Introduce LayoutOptions to control layout postprocessing behaviour (#1870)

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-07-04 15:36:13 +02:00 · 2025-07-04 15:36:13 +02:00 · ec6cf6f7e8
commit ec6cf6f7e8
parent 598c9c53d4
4 changed files with 24 additions and 4 deletions
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@ -1,4 +1,5 @@
 import logging
+from datetime import datetime
 from enum import Enum
 from pathlib import Path
 from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
@ -265,6 +266,12 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
    )


+class LayoutOptions(BaseModel):
+    """Options for layout postprocessing, consumed by LayoutPostprocessor."""
+
+    create_orphan_clusters: bool = True  # If False, no clusters for unassigned cells
+
+
 class AsrPipelineOptions(PipelineOptions):
    asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
    artifacts_path: Optional[Union[Path, str]] = None
@ -289,6 +296,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
    picture_description_options: PictureDescriptionBaseOptions = (
        smolvlm_picture_description
    )
+    layout_options: LayoutOptions = LayoutOptions()

    images_scale: float = 1.0
    generate_page_images: bool = False
--- a/docling/models/layout_model.py
+++ b/docling/models/layout_model.py
@ -12,6 +12,7 @@ from PIL import Image
 from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
 from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import LayoutOptions
 from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
 from docling.models.utils.hf_model_download import download_hf_model
@ -48,10 +49,15 @@ class LayoutModel(BasePageModel):
    CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]

    def __init__(
-        self, artifacts_path: Optional[Path], accelerator_options: AcceleratorOptions
+        self,
+        artifacts_path: Optional[Path],
+        accelerator_options: AcceleratorOptions,
+        options: LayoutOptions,
    ):
        from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor

+        self.options = options
+
        device = decide_device(accelerator_options.device)

        if artifacts_path is None:
@ -177,7 +183,7 @@ class LayoutModel(BasePageModel):
                    # Apply postprocessing

                    processed_clusters, processed_cells = LayoutPostprocessor(
-                        page, clusters
+                        page, clusters, self.options
                    ).postprocess()
                    # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally

--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@ -80,6 +80,7 @@ class StandardPdfPipeline(PaginatedPipeline):
            LayoutModel(
                artifacts_path=artifacts_path,
                accelerator_options=pipeline_options.accelerator_options,
+                options=pipeline_options.layout_options,
            ),
            # Table structure model
            TableStructureModel(
--- a/docling/utils/layout_postprocessor.py
+++ b/docling/utils/layout_postprocessor.py
@ -9,6 +9,7 @@ from docling_core.types.doc.page import TextCell
 from rtree import index

 from docling.datamodel.base_models import BoundingBox, Cluster, Page
+from docling.datamodel.pipeline_options import LayoutOptions

 _log = logging.getLogger(__name__)

@ -194,12 +195,16 @@ class LayoutPostprocessor:
        DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
    }

-    def __init__(self, page: Page, clusters: List[Cluster]) -> None:
+    def __init__(
+        self, page: Page, clusters: List[Cluster], options: LayoutOptions
+    ) -> None:
        """Initialize processor with page and clusters."""
+
        self.cells = page.cells
        self.page = page
        self.page_size = page.size
        self.all_clusters = clusters
+        self.options = options
        self.regular_clusters = [
            c for c in clusters if c.label not in self.SPECIAL_TYPES
        ]
@ -267,7 +272,7 @@ class LayoutPostprocessor:

        # Handle orphaned cells
        unassigned = self._find_unassigned_cells(clusters)
-        if unassigned:
+        if unassigned and self.options.create_orphan_clusters:
            next_id = max((c.id for c in self.all_clusters), default=0) + 1
            orphan_clusters = []
            for i, cell in enumerate(unassigned):