feat: Introduce LayoutOptions to control layout postprocessing behaviour (#1870)

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2025-07-04 15:36:13 +02:00 committed by GitHub
parent 598c9c53d4
commit ec6cf6f7e8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 24 additions and 4 deletions

View File

@ -1,4 +1,5 @@
import logging
from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
@ -265,6 +266,12 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
)
class LayoutOptions(BaseModel):
    """Options for layout processing.

    Currently exposes a single switch that controls one postprocessing
    step: whether clusters are created for cells that were not assigned
    to any cluster.
    """

    # Whether to create clusters for orphaned cells (cells left unassigned
    # to any cluster). Defaults to True; when set to False the layout
    # postprocessor skips the orphan-cluster creation step.
    create_orphan_clusters: bool = True
class AsrPipelineOptions(PipelineOptions):
asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
artifacts_path: Optional[Union[Path, str]] = None
@ -289,6 +296,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
picture_description_options: PictureDescriptionBaseOptions = (
smolvlm_picture_description
)
layout_options: LayoutOptions = LayoutOptions()
images_scale: float = 1.0
generate_page_images: bool = False

View File

@ -12,6 +12,7 @@ from PIL import Image
from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import LayoutOptions
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.models.utils.hf_model_download import download_hf_model
@ -48,10 +49,15 @@ class LayoutModel(BasePageModel):
CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION]
def __init__(
self, artifacts_path: Optional[Path], accelerator_options: AcceleratorOptions
self,
artifacts_path: Optional[Path],
accelerator_options: AcceleratorOptions,
options: LayoutOptions,
):
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
self.options = options
device = decide_device(accelerator_options.device)
if artifacts_path is None:
@ -177,7 +183,7 @@ class LayoutModel(BasePageModel):
# Apply postprocessing
processed_clusters, processed_cells = LayoutPostprocessor(
page, clusters
page, clusters, self.options
).postprocess()
# Note: LayoutPostprocessor updates page.cells and page.parsed_page internally

View File

@ -80,6 +80,7 @@ class StandardPdfPipeline(PaginatedPipeline):
LayoutModel(
artifacts_path=artifacts_path,
accelerator_options=pipeline_options.accelerator_options,
options=pipeline_options.layout_options,
),
# Table structure model
TableStructureModel(

View File

@ -9,6 +9,7 @@ from docling_core.types.doc.page import TextCell
from rtree import index
from docling.datamodel.base_models import BoundingBox, Cluster, Page
from docling.datamodel.pipeline_options import LayoutOptions
_log = logging.getLogger(__name__)
@ -194,12 +195,16 @@ class LayoutPostprocessor:
DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
}
def __init__(self, page: Page, clusters: List[Cluster]) -> None:
def __init__(
self, page: Page, clusters: List[Cluster], options: LayoutOptions
) -> None:
"""Initialize processor with page and clusters."""
self.cells = page.cells
self.page = page
self.page_size = page.size
self.all_clusters = clusters
self.options = options
self.regular_clusters = [
c for c in clusters if c.label not in self.SPECIAL_TYPES
]
@ -267,7 +272,7 @@ class LayoutPostprocessor:
# Handle orphaned cells
unassigned = self._find_unassigned_cells(clusters)
if unassigned:
if unassigned and self.options.create_orphan_clusters:
next_id = max((c.id for c in self.all_clusters), default=0) + 1
orphan_clusters = []
for i, cell in enumerate(unassigned):