feat(OCR): Introduce the OcrOptions.force_full_page_ocr parameter that forces a full page OCR scanning (#290)

- When the OCR is forced, any existing PDF cells are rejected. - Introduce the force-ocr cmd parameter in docling CLI. - Update unit tests. - Add the full_page_ocr.py example in mkdocs. Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
2024-11-12 09:46:14 +01:00
parent 81c8243a8b
commit c6b3763ecb
10 changed files with 100 additions and 62 deletions
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -153,6 +153,13 @@ def convert(
            ..., help="If enabled, the bitmap content will be processed using OCR."
        ),
    ] = True,
    force_ocr: Annotated[
        bool,
        typer.Option(
            ...,
            help="Replace any existing text with OCR generated text over the full content.",
        ),
    ] = False,
    ocr_engine: Annotated[
        OcrEngine, typer.Option(..., help="The OCR engine to use.")
    ] = OcrEngine.EASYOCR,
@@ -219,11 +226,11 @@ def convert(
    match ocr_engine:
        case OcrEngine.EASYOCR:
-            ocr_options: OcrOptions = EasyOcrOptions()
+            ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
        case OcrEngine.TESSERACT_CLI:
-            ocr_options = TesseractCliOcrOptions()
+            ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
        case OcrEngine.TESSERACT:
-            ocr_options = TesseractOcrOptions()
+            ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
        case _:
            raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -22,6 +22,7 @@ class TableStructureOptions(BaseModel):
 class OcrOptions(BaseModel):
    kind: str
    force_full_page_ocr: bool = False  # If enabled a full page OCR is always applied
    bitmap_area_threshold: float = (
        0.05  # percentage of the area for a bitmap to processed with OCR
    )
--- a/docling/models/base_ocr_model.py
+++ b/docling/models/base_ocr_model.py
@@ -10,7 +10,7 @@ from PIL import Image, ImageDraw
 from rtree import index
 from scipy.ndimage import find_objects, label
-from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.base_models import Cell, OcrCell, Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import OcrOptions
 from docling.datamodel.settings import settings
@@ -73,7 +73,9 @@ class BaseOcrModel(BasePageModel):
        coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
        # return full-page rectangle if sufficiently covered with bitmaps
-        if coverage > max(BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold):
+        if self.options.force_full_page_ocr or coverage > max(
            BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold
        ):
            return [
                BoundingBox(
                    l=0,
@@ -96,7 +98,7 @@ class BaseOcrModel(BasePageModel):
            return ocr_rects
    # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
-    def filter_ocr_cells(self, ocr_cells, programmatic_cells):
+    def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
        # Create R-tree index for programmatic cells
        p = index.Property()
        p.dimension = 2
@@ -117,6 +119,23 @@ class BaseOcrModel(BasePageModel):
        ]
        return filtered_ocr_cells
    def post_process_cells(self, ocr_cells, programmatic_cells):
        r"""
        Combine the OCR cells with the programmatic cells into the final list of cells.

        With ``options.force_full_page_ocr`` enabled, the programmatic cells are
        ignored and every OCR cell is copied (id, text, bbox) into a new ``Cell``.
        Otherwise the OCR cells that survive ``_filter_ocr_cells`` are appended to
        ``programmatic_cells`` in place, and that same list object is returned.
        """
        if not self.options.force_full_page_ocr:
            # Keep the programmatic cells and add only the OCR cells that do not
            # intersect with them. Note: this extends the caller's list in place.
            programmatic_cells.extend(
                self._filter_ocr_cells(ocr_cells, programmatic_cells)
            )
            return programmatic_cells

        # A full page OCR is forced: the result consists of the OCR cells only.
        return [
            Cell(id=ocr_cell.id, text=ocr_cell.text, bbox=ocr_cell.bbox)
            for ocr_cell in ocr_cells
        ]
    def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
        image = copy.deepcopy(page.image)
        draw = ImageDraw.Draw(image, "RGBA")
--- a/docling/models/easyocr_model.py
+++ b/docling/models/easyocr_model.py
@@ -5,7 +5,7 @@ import numpy
 import torch
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.base_models import Cell, OcrCell, Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import EasyOcrOptions
 from docling.datamodel.settings import settings
@@ -88,12 +88,8 @@ class EasyOcrModel(BaseOcrModel):
                        ]
                        all_ocr_cells.extend(cells)
-                    ## Remove OCR cells which overlap with programmatic cells.
+                    # Post-process the cells
-                    filtered_ocr_cells = self.filter_ocr_cells(
+                    page.cells = self.post_process_cells(all_ocr_cells, page.cells)
                        all_ocr_cells, page.cells
                    )
                    page.cells.extend(filtered_ocr_cells)
                # DEBUG code:
                if settings.debug.visualize_ocr:
--- a/docling/models/tesseract_ocr_cli_model.py
+++ b/docling/models/tesseract_ocr_cli_model.py
@@ -7,7 +7,7 @@ from typing import Iterable, Optional, Tuple
 import pandas as pd
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.base_models import Cell, OcrCell, Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TesseractCliOcrOptions
 from docling.datamodel.settings import settings
@@ -170,12 +170,8 @@ class TesseractOcrCliModel(BaseOcrModel):
                            )
                            all_ocr_cells.append(cell)
-                    ## Remove OCR cells which overlap with programmatic cells.
+                    # Post-process the cells
-                    filtered_ocr_cells = self.filter_ocr_cells(
+                    page.cells = self.post_process_cells(all_ocr_cells, page.cells)
                        all_ocr_cells, page.cells
                    )
                    page.cells.extend(filtered_ocr_cells)
                # DEBUG code:
                if settings.debug.visualize_ocr:
--- a/docling/models/tesseract_ocr_model.py
+++ b/docling/models/tesseract_ocr_model.py
@@ -3,7 +3,7 @@ from typing import Iterable
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling.datamodel.base_models import OcrCell, Page
+from docling.datamodel.base_models import Cell, OcrCell, Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import TesseractOcrOptions
 from docling.datamodel.settings import settings
@@ -140,12 +140,8 @@ class TesseractOcrModel(BaseOcrModel):
                        # del high_res_image
                        all_ocr_cells.extend(cells)
-                    ## Remove OCR cells which overlap with programmatic cells.
+                    # Post-process the cells
-                    filtered_ocr_cells = self.filter_ocr_cells(
+                    page.cells = self.post_process_cells(all_ocr_cells, page.cells)
                        all_ocr_cells, page.cells
                    )
                    page.cells.extend(filtered_ocr_cells)
                # DEBUG code:
                if settings.debug.visualize_ocr:
--- a/docs/examples/full_page_ocr.py
+++ b/docs/examples/full_page_ocr.py
@@ -0,0 +1,42 @@
 from pathlib import Path
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
    PdfPipelineOptions,
    TesseractCliOcrOptions,
    TesseractOcrOptions,
 )
 from docling.document_converter import DocumentConverter, PdfFormatOption
 def main():
    """Convert a sample PDF with full page OCR forced and print it as Markdown."""
    source_pdf = Path("./tests/data/2206.01062.pdf")

    # Any of the OCR options can be used: EasyOcrOptions, TesseractOcrOptions,
    # TesseractCliOcrOptions. Setting force_full_page_ocr=True is what requests
    # OCR over the whole page.
    # full_page_ocr = EasyOcrOptions(force_full_page_ocr=True)
    # full_page_ocr = TesseractOcrOptions(force_full_page_ocr=True)
    full_page_ocr = TesseractCliOcrOptions(force_full_page_ocr=True)

    # PDF pipeline: OCR and table structure (with cell matching) enabled.
    options = PdfPipelineOptions()
    options.do_ocr = True
    options.do_table_structure = True
    options.table_structure_options.do_cell_matching = True
    options.ocr_options = full_page_ocr

    pdf_format = PdfFormatOption(pipeline_options=options)
    converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})

    result = converter.convert(source_pdf)
    print(result.document.export_to_markdown())
 if __name__ == "__main__":
    main()
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -71,6 +71,7 @@ nav:
      - "Figure enrichment": examples/develop_picture_enrichment.py
      - "Table export": examples/export_tables.py
      - "Multimodal export": examples/export_multimodal.py
      - "Force full page OCR": examples/full_page_ocr.py
    - RAG / QA:
      - "RAG with LlamaIndex 🦙": examples/rag_llamaindex.ipynb
      - "RAG with LangChain 🦜🔗": examples/rag_langchain.ipynb
--- a/tests/test_e2e_ocr_conversion.py
+++ b/tests/test_e2e_ocr_conversion.py
@@ -15,34 +15,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
 from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
-GENERATE = False
+GENERATE_V1 = False
-
+GENERATE_V2 = False
 # Debug
 def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str):
    r"""
    Debug helper: dump a conversion result into files located next to the input PDF.

    Four files named ``<pdf stem>[.<engine>]<extension>`` are written:
    ``.json`` (legacy document as dict), ``.pages.json`` (model dump of every page),
    ``.doctags.txt`` (legacy document as doctags) and ``.md`` (legacy document as
    markdown). The ``.<engine>`` part is left out when ``engine`` is None.
    """
    import json
    import os

    out_dir = pdf_path.parent
    engine_tag = f".{engine}" if engine is not None else ""

    def _target(extension):
        # Path of the output file with the given extension, next to the PDF
        return os.path.join(out_dir, f"{pdf_path.stem}{engine_tag}{extension}")

    # Legacy document as a JSON dictionary
    json_target = _target(".json")
    with open(json_target, "w") as out:
        json.dump(doc_result.legacy_document.export_to_dict(), out)

    # Per-page model dumps
    pages_target = _target(".pages.json")
    page_dumps = [page.model_dump() for page in doc_result.pages]
    with open(pages_target, "w") as out:
        json.dump(page_dumps, out)

    # Legacy document as doctags
    doctags_target = _target(".doctags.txt")
    with open(doctags_target, "w") as out:
        out.write(doc_result.legacy_document.export_to_doctags())

    # Legacy document as markdown
    md_target = _target(".md")
    with open(md_target, "w") as out:
        out.write(doc_result.legacy_document.export_to_markdown())
 def get_pdf_paths():
@@ -74,13 +48,15 @@ def get_converter(ocr_options: OcrOptions):
 def test_e2e_conversions():
    pdf_paths = get_pdf_paths()
    engines: List[OcrOptions] = [
        EasyOcrOptions(),
        TesseractOcrOptions(),
        TesseractCliOcrOptions(),
        EasyOcrOptions(force_full_page_ocr=True),
        TesseractOcrOptions(force_full_page_ocr=True),
        TesseractCliOcrOptions(force_full_page_ocr=True),
    ]
    for ocr_options in engines:
@@ -91,20 +67,16 @@ def test_e2e_conversions():
            doc_result: ConversionResult = converter.convert(pdf_path)
            # Save conversions
            # save_output(pdf_path, doc_result, None)
            # Debug
            verify_conversion_result_v1(
                input_path=pdf_path,
                doc_result=doc_result,
-                generate=GENERATE,
+                generate=GENERATE_V1,
                fuzzy=True,
            )
            verify_conversion_result_v2(
                input_path=pdf_path,
                doc_result=doc_result,
-                generate=GENERATE,
+                generate=GENERATE_V2,
                fuzzy=True,
            )
--- a/tests/verify_utils.py
+++ b/tests/verify_utils.py
@@ -256,15 +256,19 @@ def verify_conversion_result_v1(
    dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt")
    if generate:  # only used when re-generating truth
        pages_path.parent.mkdir(parents=True, exist_ok=True)
        with open(pages_path, "w") as fw:
            fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))
        json_path.parent.mkdir(parents=True, exist_ok=True)
        with open(json_path, "w") as fw:
            fw.write(json.dumps(doc_pred, default=pydantic_encoder))
        md_path.parent.mkdir(parents=True, exist_ok=True)
        with open(md_path, "w") as fw:
            fw.write(doc_pred_md)
        dt_path.parent.mkdir(parents=True, exist_ok=True)
        with open(dt_path, "w") as fw:
            fw.write(doc_pred_dt)
    else:  # default branch in test
@@ -328,15 +332,19 @@ def verify_conversion_result_v2(
    dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt")
    if generate:  # only used when re-generating truth
        pages_path.parent.mkdir(parents=True, exist_ok=True)
        with open(pages_path, "w") as fw:
            fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))
        json_path.parent.mkdir(parents=True, exist_ok=True)
        with open(json_path, "w") as fw:
            fw.write(json.dumps(doc_pred, default=pydantic_encoder))
        md_path.parent.mkdir(parents=True, exist_ok=True)
        with open(md_path, "w") as fw:
            fw.write(doc_pred_md)
        dt_path.parent.mkdir(parents=True, exist_ok=True)
        with open(dt_path, "w") as fw:
            fw.write(doc_pred_dt)
    else:  # default branch in test