feat(ocr): auto-detect rotated pages in Tesseract (#1167)

* fix(ocr): tesseract support mis-oriented documents

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* fix(ocr): update missing test data

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* fix(ocr): rotate image to the natural orientation before layout prediction

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* fix(ocr): move bounding box rotation util to orientation.py

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* fix(ocr): refactor rotation utilities

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* chore(ocr): revert layout updates

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* chore(ocr): update e2e OCR test data

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* fix(ocr): avoid swallowing tesseract errors causing orientation detection failures

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* chore(ocr): revert layout updates

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* chore(ocr): update e2e OCR test data

* chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrCliModel`

* chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrModel`

* chore(ocr): default `TesseractOcrCliModel._is_auto` to `False`

* fix(ocr): fix `TesseractOcrCliModel._is_auto` computation

* chore(ocr): improve logging in case of OSD failure in `TesseractOcrCliModel` and `TesseractOcrModel`

---------

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
This commit is contained in:
Clément Doumouro
2025-05-21 18:12:33 +02:00
committed by GitHub
parent 90875247e5
commit 45265bf8b1
96 changed files with 9864 additions and 5258 deletions

View File

@@ -1,12 +1,11 @@
from __future__ import annotations
import logging
from collections.abc import Iterable
from pathlib import Path
from typing import Optional, Type
from typing import Iterable, Optional, Type
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell
from docling_core.types.doc.page import TextCell
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
@@ -17,7 +16,11 @@ from docling.datamodel.pipeline_options import (
)
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.ocr_utils import map_tesseract_script
from docling.utils.ocr_utils import (
map_tesseract_script,
parse_tesseract_orientation,
tesseract_box_to_bounding_rectangle,
)
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
@@ -38,7 +41,7 @@ class TesseractOcrModel(BaseOcrModel):
accelerator_options=accelerator_options,
)
self.options: TesseractOcrOptions
self._is_auto: bool = "auto" in self.options.lang
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
self.reader = None
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
@@ -95,13 +98,13 @@ class TesseractOcrModel(BaseOcrModel):
if lang == "auto":
self.reader = tesserocr.PyTessBaseAPI(**tesserocr_kwargs)
self.osd_reader = tesserocr.PyTessBaseAPI(
**{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
)
else:
self.reader = tesserocr.PyTessBaseAPI(
**{"lang": lang} | tesserocr_kwargs,
)
self.osd_reader = tesserocr.PyTessBaseAPI(
**{"lang": "osd", "psm": tesserocr.PSM.OSD_ONLY} | tesserocr_kwargs
)
self.reader_RIL = tesserocr.RIL
def __del__(self):
@@ -118,19 +121,20 @@ class TesseractOcrModel(BaseOcrModel):
yield from page_batch
return
for page in page_batch:
for page_i, page in enumerate(page_batch):
assert page._backend is not None
if not page._backend.is_valid():
yield page
else:
with TimeRecorder(conv_res, "ocr"):
assert self.reader is not None
assert self.osd_reader is not None
assert self._tesserocr_languages is not None
ocr_rects = self.get_ocr_rects(page)
all_ocr_cells = []
for ocr_rect in ocr_rects:
for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
# Skip zero area boxes
if ocr_rect.area() == 0:
continue
@@ -139,16 +143,27 @@ class TesseractOcrModel(BaseOcrModel):
)
local_reader = self.reader
if "auto" in self.options.lang:
assert self.osd_reader is not None
self.osd_reader.SetImage(high_res_image)
osd = self.osd_reader.DetectOrientationScript()
# No text, probably
if osd is None:
self.osd_reader.SetImage(high_res_image)
osd = self.osd_reader.DetectOrientationScript()
# No text, or Orientation and Script detection failure
if osd is None:
_log.error(
"OSD failed for doc (doc %s, page: %s, "
"OCR rectangle: %s)",
conv_res.input.file,
page_i,
ocr_rect_i,
)
# Skipping if OSD fail when in auto mode, otherwise proceed
# to OCR in the hope OCR will succeed while OSD failed
if self._is_auto:
continue
doc_orientation = parse_tesseract_orientation(osd["orient_deg"])
if doc_orientation != 0:
high_res_image = high_res_image.rotate(
-doc_orientation, expand=True
)
if self._is_auto:
script = osd["script_name"]
script = map_tesseract_script(script)
lang = f"{self.script_prefix}{script}"
@@ -188,11 +203,23 @@ class TesseractOcrModel(BaseOcrModel):
# Extract text within the bounding box
text = local_reader.GetUTF8Text().strip()
confidence = local_reader.MeanTextConf()
left = box["x"] / self.scale
bottom = box["y"] / self.scale
right = (box["x"] + box["w"]) / self.scale
top = (box["y"] + box["h"]) / self.scale
left, top = box["x"], box["y"]
right = left + box["w"]
bottom = top + box["h"]
bbox = BoundingBox(
l=left,
t=top,
r=right,
b=bottom,
coord_origin=CoordOrigin.TOPLEFT,
)
rect = tesseract_box_to_bounding_rectangle(
bbox,
original_offset=ocr_rect,
scale=self.scale,
orientation=doc_orientation,
im_size=high_res_image.size,
)
cells.append(
TextCell(
index=ix,
@@ -200,12 +227,7 @@ class TesseractOcrModel(BaseOcrModel):
orig=text,
from_ocr=True,
confidence=confidence,
rect=BoundingRectangle.from_bounding_box(
BoundingBox.from_tuple(
coord=(left, top, right, bottom),
origin=CoordOrigin.TOPLEFT,
),
),
rect=rect,
)
)