feat(OCR): Introduce the OcrOptions.force_full_page_ocr parameter that forces a full page OCR scanning (#290)

- When the OCR is forced, any existing PDF cells are rejected.
- Introduce the force-ocr cmd parameter in docling CLI.
- Update unit tests.
- Add the full_page_ocr.py example in mkdocs.

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
Nikos Livathinos
2024-11-12 09:46:14 +01:00
committed by GitHub
parent 81c8243a8b
commit c6b3763ecb
10 changed files with 100 additions and 62 deletions

View File

@@ -5,7 +5,7 @@ import numpy
import torch
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.base_models import Cell, OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import EasyOcrOptions
from docling.datamodel.settings import settings
@@ -88,12 +88,8 @@ class EasyOcrModel(BaseOcrModel):
]
all_ocr_cells.extend(cells)
## Remove OCR cells which overlap with programmatic cells.
filtered_ocr_cells = self.filter_ocr_cells(
all_ocr_cells, page.cells
)
page.cells.extend(filtered_ocr_cells)
# Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
# DEBUG code:
if settings.debug.visualize_ocr: