feat(OCR): Introduce the OcrOptions.force_full_page_ocr parameter that forces a full page OCR scanning (#290)
- When the OCR is forced, any existing PDF cells are rejected. - Introduce the force-ocr cmd parameter in docling CLI. - Update unit tests. - Add the full_page_ocr.py example in mkdocs. Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
parent
81c8243a8b
commit
c6b3763ecb
@ -153,6 +153,13 @@ def convert(
|
|||||||
..., help="If enabled, the bitmap content will be processed using OCR."
|
..., help="If enabled, the bitmap content will be processed using OCR."
|
||||||
),
|
),
|
||||||
] = True,
|
] = True,
|
||||||
|
force_ocr: Annotated[
|
||||||
|
bool,
|
||||||
|
typer.Option(
|
||||||
|
...,
|
||||||
|
help="Replace any existing text with OCR generated text over the full content.",
|
||||||
|
),
|
||||||
|
] = False,
|
||||||
ocr_engine: Annotated[
|
ocr_engine: Annotated[
|
||||||
OcrEngine, typer.Option(..., help="The OCR engine to use.")
|
OcrEngine, typer.Option(..., help="The OCR engine to use.")
|
||||||
] = OcrEngine.EASYOCR,
|
] = OcrEngine.EASYOCR,
|
||||||
@ -219,11 +226,11 @@ def convert(
|
|||||||
|
|
||||||
match ocr_engine:
|
match ocr_engine:
|
||||||
case OcrEngine.EASYOCR:
|
case OcrEngine.EASYOCR:
|
||||||
ocr_options: OcrOptions = EasyOcrOptions()
|
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
|
||||||
case OcrEngine.TESSERACT_CLI:
|
case OcrEngine.TESSERACT_CLI:
|
||||||
ocr_options = TesseractCliOcrOptions()
|
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
|
||||||
case OcrEngine.TESSERACT:
|
case OcrEngine.TESSERACT:
|
||||||
ocr_options = TesseractOcrOptions()
|
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
|
||||||
case _:
|
case _:
|
||||||
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
||||||
|
|
||||||
|
@ -22,6 +22,7 @@ class TableStructureOptions(BaseModel):
|
|||||||
|
|
||||||
class OcrOptions(BaseModel):
|
class OcrOptions(BaseModel):
|
||||||
kind: str
|
kind: str
|
||||||
|
force_full_page_ocr: bool = False # If enabled, a full-page OCR is always applied
|
||||||
bitmap_area_threshold: float = (
|
bitmap_area_threshold: float = (
|
||||||
0.05 # percentage of the area for a bitmap to be processed with OCR
|
0.05 # percentage of the area for a bitmap to be processed with OCR
|
||||||
)
|
)
|
||||||
|
@ -10,7 +10,7 @@ from PIL import Image, ImageDraw
|
|||||||
from rtree import index
|
from rtree import index
|
||||||
from scipy.ndimage import find_objects, label
|
from scipy.ndimage import find_objects, label
|
||||||
|
|
||||||
from docling.datamodel.base_models import OcrCell, Page
|
from docling.datamodel.base_models import Cell, OcrCell, Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import OcrOptions
|
from docling.datamodel.pipeline_options import OcrOptions
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
@ -73,7 +73,9 @@ class BaseOcrModel(BasePageModel):
|
|||||||
coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
|
coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)
|
||||||
|
|
||||||
# return full-page rectangle if sufficiently covered with bitmaps
|
# return full-page rectangle if sufficiently covered with bitmaps
|
||||||
if coverage > max(BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold):
|
if self.options.force_full_page_ocr or coverage > max(
|
||||||
|
BITMAP_COVERAGE_TRESHOLD, self.options.bitmap_area_threshold
|
||||||
|
):
|
||||||
return [
|
return [
|
||||||
BoundingBox(
|
BoundingBox(
|
||||||
l=0,
|
l=0,
|
||||||
@ -96,7 +98,7 @@ class BaseOcrModel(BasePageModel):
|
|||||||
return ocr_rects
|
return ocr_rects
|
||||||
|
|
||||||
# Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
|
# Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
|
||||||
def filter_ocr_cells(self, ocr_cells, programmatic_cells):
|
def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
|
||||||
# Create R-tree index for programmatic cells
|
# Create R-tree index for programmatic cells
|
||||||
p = index.Property()
|
p = index.Property()
|
||||||
p.dimension = 2
|
p.dimension = 2
|
||||||
@ -117,6 +119,23 @@ class BaseOcrModel(BasePageModel):
|
|||||||
]
|
]
|
||||||
return filtered_ocr_cells
|
return filtered_ocr_cells
|
||||||
|
|
||||||
|
def post_process_cells(self, ocr_cells, programmatic_cells):
|
||||||
|
r"""
|
||||||
|
Post-process the ocr and programmatic cells and return the final list of cells
|
||||||
|
"""
|
||||||
|
if self.options.force_full_page_ocr:
|
||||||
|
# If a full page OCR is forced, use only the OCR cells
|
||||||
|
cells = [
|
||||||
|
Cell(id=c_ocr.id, text=c_ocr.text, bbox=c_ocr.bbox)
|
||||||
|
for c_ocr in ocr_cells
|
||||||
|
]
|
||||||
|
return cells
|
||||||
|
|
||||||
|
## Remove OCR cells which overlap with programmatic cells.
|
||||||
|
filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, programmatic_cells)
|
||||||
|
programmatic_cells.extend(filtered_ocr_cells)
|
||||||
|
return programmatic_cells
|
||||||
|
|
||||||
def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
|
def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
|
||||||
image = copy.deepcopy(page.image)
|
image = copy.deepcopy(page.image)
|
||||||
draw = ImageDraw.Draw(image, "RGBA")
|
draw = ImageDraw.Draw(image, "RGBA")
|
||||||
|
@ -5,7 +5,7 @@ import numpy
|
|||||||
import torch
|
import torch
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
|
|
||||||
from docling.datamodel.base_models import OcrCell, Page
|
from docling.datamodel.base_models import Cell, OcrCell, Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import EasyOcrOptions
|
from docling.datamodel.pipeline_options import EasyOcrOptions
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
@ -88,12 +88,8 @@ class EasyOcrModel(BaseOcrModel):
|
|||||||
]
|
]
|
||||||
all_ocr_cells.extend(cells)
|
all_ocr_cells.extend(cells)
|
||||||
|
|
||||||
## Remove OCR cells which overlap with programmatic cells.
|
# Post-process the cells
|
||||||
filtered_ocr_cells = self.filter_ocr_cells(
|
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
|
||||||
all_ocr_cells, page.cells
|
|
||||||
)
|
|
||||||
|
|
||||||
page.cells.extend(filtered_ocr_cells)
|
|
||||||
|
|
||||||
# DEBUG code:
|
# DEBUG code:
|
||||||
if settings.debug.visualize_ocr:
|
if settings.debug.visualize_ocr:
|
||||||
|
@ -7,7 +7,7 @@ from typing import Iterable, Optional, Tuple
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
|
|
||||||
from docling.datamodel.base_models import OcrCell, Page
|
from docling.datamodel.base_models import Cell, OcrCell, Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
|
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
@ -170,12 +170,8 @@ class TesseractOcrCliModel(BaseOcrModel):
|
|||||||
)
|
)
|
||||||
all_ocr_cells.append(cell)
|
all_ocr_cells.append(cell)
|
||||||
|
|
||||||
## Remove OCR cells which overlap with programmatic cells.
|
# Post-process the cells
|
||||||
filtered_ocr_cells = self.filter_ocr_cells(
|
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
|
||||||
all_ocr_cells, page.cells
|
|
||||||
)
|
|
||||||
|
|
||||||
page.cells.extend(filtered_ocr_cells)
|
|
||||||
|
|
||||||
# DEBUG code:
|
# DEBUG code:
|
||||||
if settings.debug.visualize_ocr:
|
if settings.debug.visualize_ocr:
|
||||||
|
@ -3,7 +3,7 @@ from typing import Iterable
|
|||||||
|
|
||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
|
|
||||||
from docling.datamodel.base_models import OcrCell, Page
|
from docling.datamodel.base_models import Cell, OcrCell, Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import TesseractOcrOptions
|
from docling.datamodel.pipeline_options import TesseractOcrOptions
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
@ -140,12 +140,8 @@ class TesseractOcrModel(BaseOcrModel):
|
|||||||
# del high_res_image
|
# del high_res_image
|
||||||
all_ocr_cells.extend(cells)
|
all_ocr_cells.extend(cells)
|
||||||
|
|
||||||
## Remove OCR cells which overlap with programmatic cells.
|
# Post-process the cells
|
||||||
filtered_ocr_cells = self.filter_ocr_cells(
|
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
|
||||||
all_ocr_cells, page.cells
|
|
||||||
)
|
|
||||||
|
|
||||||
page.cells.extend(filtered_ocr_cells)
|
|
||||||
|
|
||||||
# DEBUG code:
|
# DEBUG code:
|
||||||
if settings.debug.visualize_ocr:
|
if settings.debug.visualize_ocr:
|
||||||
|
42
docs/examples/full_page_ocr.py
Normal file
42
docs/examples/full_page_ocr.py
Normal file
@ -0,0 +1,42 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
from docling.datamodel.pipeline_options import (
|
||||||
|
EasyOcrOptions,
|
||||||
|
PdfPipelineOptions,
|
||||||
|
TesseractCliOcrOptions,
|
||||||
|
TesseractOcrOptions,
|
||||||
|
)
|
||||||
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
input_doc = Path("./tests/data/2206.01062.pdf")
|
||||||
|
|
||||||
|
pipeline_options = PdfPipelineOptions()
|
||||||
|
pipeline_options.do_ocr = True
|
||||||
|
pipeline_options.do_table_structure = True
|
||||||
|
pipeline_options.table_structure_options.do_cell_matching = True
|
||||||
|
|
||||||
|
# Any of the OCR options can be used: EasyOcrOptions, TesseractOcrOptions, TesseractCliOcrOptions
|
||||||
|
# ocr_options = EasyOcrOptions(force_full_page_ocr=True)
|
||||||
|
# ocr_options = TesseractOcrOptions(force_full_page_ocr=True)
|
||||||
|
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)
|
||||||
|
pipeline_options.ocr_options = ocr_options
|
||||||
|
|
||||||
|
converter = DocumentConverter(
|
||||||
|
format_options={
|
||||||
|
InputFormat.PDF: PdfFormatOption(
|
||||||
|
pipeline_options=pipeline_options,
|
||||||
|
)
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
doc = converter.convert(input_doc).document
|
||||||
|
md = doc.export_to_markdown()
|
||||||
|
print(md)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
@ -71,6 +71,7 @@ nav:
|
|||||||
- "Figure enrichment": examples/develop_picture_enrichment.py
|
- "Figure enrichment": examples/develop_picture_enrichment.py
|
||||||
- "Table export": examples/export_tables.py
|
- "Table export": examples/export_tables.py
|
||||||
- "Multimodal export": examples/export_multimodal.py
|
- "Multimodal export": examples/export_multimodal.py
|
||||||
|
- "Force full page OCR": examples/full_page_ocr.py
|
||||||
- RAG / QA:
|
- RAG / QA:
|
||||||
- "RAG with LlamaIndex 🦙": examples/rag_llamaindex.ipynb
|
- "RAG with LlamaIndex 🦙": examples/rag_llamaindex.ipynb
|
||||||
- "RAG with LangChain 🦜🔗": examples/rag_langchain.ipynb
|
- "RAG with LangChain 🦜🔗": examples/rag_langchain.ipynb
|
||||||
|
@ -15,34 +15,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
|
|||||||
|
|
||||||
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
|
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
|
||||||
|
|
||||||
GENERATE = False
|
GENERATE_V1 = False
|
||||||
|
GENERATE_V2 = False
|
||||||
|
|
||||||
# Debug
|
|
||||||
def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str):
|
|
||||||
r""" """
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
|
|
||||||
parent = pdf_path.parent
|
|
||||||
eng = "" if engine is None else f".{engine}"
|
|
||||||
|
|
||||||
dict_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.json")
|
|
||||||
with open(dict_fn, "w") as fd:
|
|
||||||
json.dump(doc_result.legacy_document.export_to_dict(), fd)
|
|
||||||
|
|
||||||
pages_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.pages.json")
|
|
||||||
pages = [p.model_dump() for p in doc_result.pages]
|
|
||||||
with open(pages_fn, "w") as fd:
|
|
||||||
json.dump(pages, fd)
|
|
||||||
|
|
||||||
doctags_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.doctags.txt")
|
|
||||||
with open(doctags_fn, "w") as fd:
|
|
||||||
fd.write(doc_result.legacy_document.export_to_doctags())
|
|
||||||
|
|
||||||
md_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.md")
|
|
||||||
with open(md_fn, "w") as fd:
|
|
||||||
fd.write(doc_result.legacy_document.export_to_markdown())
|
|
||||||
|
|
||||||
|
|
||||||
def get_pdf_paths():
|
def get_pdf_paths():
|
||||||
@ -74,13 +48,15 @@ def get_converter(ocr_options: OcrOptions):
|
|||||||
|
|
||||||
|
|
||||||
def test_e2e_conversions():
|
def test_e2e_conversions():
|
||||||
|
|
||||||
pdf_paths = get_pdf_paths()
|
pdf_paths = get_pdf_paths()
|
||||||
|
|
||||||
engines: List[OcrOptions] = [
|
engines: List[OcrOptions] = [
|
||||||
EasyOcrOptions(),
|
EasyOcrOptions(),
|
||||||
TesseractOcrOptions(),
|
TesseractOcrOptions(),
|
||||||
TesseractCliOcrOptions(),
|
TesseractCliOcrOptions(),
|
||||||
|
EasyOcrOptions(force_full_page_ocr=True),
|
||||||
|
TesseractOcrOptions(force_full_page_ocr=True),
|
||||||
|
TesseractCliOcrOptions(force_full_page_ocr=True),
|
||||||
]
|
]
|
||||||
|
|
||||||
for ocr_options in engines:
|
for ocr_options in engines:
|
||||||
@ -91,20 +67,16 @@ def test_e2e_conversions():
|
|||||||
|
|
||||||
doc_result: ConversionResult = converter.convert(pdf_path)
|
doc_result: ConversionResult = converter.convert(pdf_path)
|
||||||
|
|
||||||
# Save conversions
|
|
||||||
# save_output(pdf_path, doc_result, None)
|
|
||||||
|
|
||||||
# Debug
|
|
||||||
verify_conversion_result_v1(
|
verify_conversion_result_v1(
|
||||||
input_path=pdf_path,
|
input_path=pdf_path,
|
||||||
doc_result=doc_result,
|
doc_result=doc_result,
|
||||||
generate=GENERATE,
|
generate=GENERATE_V1,
|
||||||
fuzzy=True,
|
fuzzy=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
verify_conversion_result_v2(
|
verify_conversion_result_v2(
|
||||||
input_path=pdf_path,
|
input_path=pdf_path,
|
||||||
doc_result=doc_result,
|
doc_result=doc_result,
|
||||||
generate=GENERATE,
|
generate=GENERATE_V2,
|
||||||
fuzzy=True,
|
fuzzy=True,
|
||||||
)
|
)
|
||||||
|
@ -256,15 +256,19 @@ def verify_conversion_result_v1(
|
|||||||
dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt")
|
dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt")
|
||||||
|
|
||||||
if generate: # only used when re-generating truth
|
if generate: # only used when re-generating truth
|
||||||
|
pages_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
with open(pages_path, "w") as fw:
|
with open(pages_path, "w") as fw:
|
||||||
fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))
|
fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))
|
||||||
|
|
||||||
|
json_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
with open(json_path, "w") as fw:
|
with open(json_path, "w") as fw:
|
||||||
fw.write(json.dumps(doc_pred, default=pydantic_encoder))
|
fw.write(json.dumps(doc_pred, default=pydantic_encoder))
|
||||||
|
|
||||||
|
md_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
with open(md_path, "w") as fw:
|
with open(md_path, "w") as fw:
|
||||||
fw.write(doc_pred_md)
|
fw.write(doc_pred_md)
|
||||||
|
|
||||||
|
dt_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
with open(dt_path, "w") as fw:
|
with open(dt_path, "w") as fw:
|
||||||
fw.write(doc_pred_dt)
|
fw.write(doc_pred_dt)
|
||||||
else: # default branch in test
|
else: # default branch in test
|
||||||
@ -328,15 +332,19 @@ def verify_conversion_result_v2(
|
|||||||
dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt")
|
dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt")
|
||||||
|
|
||||||
if generate: # only used when re-generating truth
|
if generate: # only used when re-generating truth
|
||||||
|
pages_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
with open(pages_path, "w") as fw:
|
with open(pages_path, "w") as fw:
|
||||||
fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))
|
fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))
|
||||||
|
|
||||||
|
json_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
with open(json_path, "w") as fw:
|
with open(json_path, "w") as fw:
|
||||||
fw.write(json.dumps(doc_pred, default=pydantic_encoder))
|
fw.write(json.dumps(doc_pred, default=pydantic_encoder))
|
||||||
|
|
||||||
|
md_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
with open(md_path, "w") as fw:
|
with open(md_path, "w") as fw:
|
||||||
fw.write(doc_pred_md)
|
fw.write(doc_pred_md)
|
||||||
|
|
||||||
|
dt_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
with open(dt_path, "w") as fw:
|
with open(dt_path, "w") as fw:
|
||||||
fw.write(doc_pred_dt)
|
fw.write(doc_pred_dt)
|
||||||
else: # default branch in test
|
else: # default branch in test
|
||||||
|
Loading…
Reference in New Issue
Block a user