feat(ocr): auto-detect rotated pages in Tesseract (#1167)

* fix(ocr): make tesseract support mis-oriented documents

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* fix(ocr): update missing test data

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* fix(ocr): rotate image to the natural orientation before layout prediction

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* fix(ocr): move bounding box rotation util to orientation.py

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* fix(ocr): refactor rotation utilities

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* chore(ocr): revert layout updates

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* chore(ocr): update e2e OCR test data

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* fix(ocr): avoid swallowing tesseract errors that cause orientation detection failures

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* chore(ocr): revert layout updates

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>

* chore(ocr): update e2e OCR test data

* chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrCliModel`

* chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrModel`

* chore(ocr): default `TesseractOcrCliModel._is_auto` to `False`

* fix(ocr): fix `TesseractOcrCliModel._is_auto` computation

* chore(ocr): improve logging in case of OSD failure in `TesseractOcrCliModel` and `TesseractOcrModel`

---------

Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
This commit is contained in:
Clément Doumouro
2025-05-21 18:12:33 +02:00
committed by GitHub
parent 90875247e5
commit 45265bf8b1
96 changed files with 9864 additions and 5258 deletions

View File

@@ -2,6 +2,7 @@ import csv
import io
import logging
import os
import subprocess
import tempfile
from collections.abc import Iterable
from pathlib import Path
@@ -10,7 +11,7 @@ from typing import List, Optional, Tuple, Type
import pandas as pd
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell
from docling_core.types.doc.page import TextCell
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
@@ -21,7 +22,11 @@ from docling.datamodel.pipeline_options import (
)
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.ocr_utils import map_tesseract_script
from docling.utils.ocr_utils import (
map_tesseract_script,
parse_tesseract_orientation,
tesseract_box_to_bounding_rectangle,
)
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
@@ -49,6 +54,7 @@ class TesseractOcrCliModel(BaseOcrModel):
self._version: Optional[str] = None
self._tesseract_languages: Optional[List[str]] = None
self._script_prefix: Optional[str] = None
self._is_auto: bool = "auto" in self.options.lang
if self.enabled:
try:
@@ -93,14 +99,13 @@ class TesseractOcrCliModel(BaseOcrModel):
return name, version
def _run_tesseract(self, ifilename: str):
def _run_tesseract(self, ifilename: str, osd: pd.DataFrame):
r"""
Run tesseract CLI
"""
cmd = [self.options.tesseract_cmd]
if "auto" in self.options.lang:
lang = self._detect_language(ifilename)
if self._is_auto:
lang = self._parse_language(osd)
if lang is not None:
cmd.append("-l")
cmd.append(lang)
@@ -115,13 +120,12 @@ class TesseractOcrCliModel(BaseOcrModel):
cmd += [ifilename, "stdout", "tsv"]
_log.info("command: {}".format(" ".join(cmd)))
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
output, _ = proc.communicate()
output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)
# _log.info(output)
# Decode the byte string to a regular string
decoded_data = output.decode("utf-8")
decoded_data = output.stdout.decode("utf-8")
# _log.info(decoded_data)
# Read the TSV file generated by Tesseract
@@ -139,22 +143,24 @@ class TesseractOcrCliModel(BaseOcrModel):
return df_filtered
def _detect_language(self, ifilename: str):
def _perform_osd(self, ifilename: str) -> pd.DataFrame:
r"""
Run tesseract in PSM 0 mode to detect the language
"""
assert self._tesseract_languages is not None
cmd = [self.options.tesseract_cmd]
cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
_log.info("command: {}".format(" ".join(cmd)))
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
output, _ = proc.communicate()
decoded_data = output.decode("utf-8")
output = subprocess.run(cmd, capture_output=True, check=True)
decoded_data = output.stdout.decode("utf-8")
df_detected = pd.read_csv(
io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
)
scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
return df_detected
def _parse_language(self, df_osd: pd.DataFrame) -> Optional[str]:
assert self._tesseract_languages is not None
scripts = df_osd.loc[df_osd["key"] == "Script"].value.tolist()
if len(scripts) == 0:
_log.warning("Tesseract cannot detect the script of the page")
return None
@@ -182,9 +188,8 @@ class TesseractOcrCliModel(BaseOcrModel):
cmd = [self.options.tesseract_cmd]
cmd.append("--list-langs")
_log.info("command: {}".format(" ".join(cmd)))
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
output, _ = proc.communicate()
decoded_data = output.decode("utf-8")
output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)
decoded_data = output.stdout.decode("utf-8")
df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
self._tesseract_languages = df_list[0].tolist()[1:]
@@ -203,7 +208,7 @@ class TesseractOcrCliModel(BaseOcrModel):
yield from page_batch
return
for page in page_batch:
for page_i, page in enumerate(page_batch):
assert page._backend is not None
if not page._backend.is_valid():
yield page
@@ -212,7 +217,7 @@ class TesseractOcrCliModel(BaseOcrModel):
ocr_rects = self.get_ocr_rects(page)
all_ocr_cells = []
for ocr_rect in ocr_rects:
for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
# Skip zero area boxes
if ocr_rect.area() == 0:
continue
@@ -225,8 +230,42 @@ class TesseractOcrCliModel(BaseOcrModel):
) as image_file:
fname = image_file.name
high_res_image.save(image_file)
df_result = self._run_tesseract(fname)
doc_orientation = 0
try:
df_osd = self._perform_osd(fname)
doc_orientation = _parse_orientation(df_osd)
except subprocess.CalledProcessError as exc:
_log.error(
"OSD failed (doc %s, page: %s, "
"OCR rectangle: %s, processed image file %s):\n %s",
conv_res.input.file,
page_i,
ocr_rect_i,
image_file,
exc.stderr,
)
# Skipping if OSD fail when in auto mode, otherwise proceed
# to OCR in the hope OCR will succeed while OSD failed
if self._is_auto:
continue
if doc_orientation != 0:
high_res_image = high_res_image.rotate(
-doc_orientation, expand=True
)
high_res_image.save(fname)
try:
df_result = self._run_tesseract(fname, df_osd)
except subprocess.CalledProcessError as exc:
_log.error(
"tesseract OCR failed (doc %s, page: %s, "
"OCR rectangle: %s, processed image file %s):\n %s",
conv_res.input.file,
page_i,
ocr_rect_i,
image_file,
exc.stderr,
)
continue
finally:
if os.path.exists(fname):
os.remove(fname)
@@ -238,31 +277,30 @@ class TesseractOcrCliModel(BaseOcrModel):
text = row["text"]
conf = row["conf"]
l = float(row["left"]) # noqa: E741
b = float(row["top"])
w = float(row["width"])
h = float(row["height"])
t = b + h
r = l + w
left, top = float(row["left"]), float(row["top"])
right = left + float(row["width"])
bottom = top + row["height"]
bbox = BoundingBox(
l=left,
t=top,
r=right,
b=bottom,
coord_origin=CoordOrigin.TOPLEFT,
)
rect = tesseract_box_to_bounding_rectangle(
bbox,
original_offset=ocr_rect,
scale=self.scale,
orientation=doc_orientation,
im_size=high_res_image.size,
)
cell = TextCell(
index=ix,
text=str(text),
orig=str(text),
from_ocr=True,
confidence=conf / 100.0,
rect=BoundingRectangle.from_bounding_box(
BoundingBox.from_tuple(
coord=(
(l / self.scale) + ocr_rect.l,
(b / self.scale) + ocr_rect.t,
(r / self.scale) + ocr_rect.l,
(t / self.scale) + ocr_rect.t,
),
origin=CoordOrigin.TOPLEFT,
)
),
rect=rect,
)
all_ocr_cells.append(cell)
@@ -278,3 +316,9 @@ class TesseractOcrCliModel(BaseOcrModel):
@classmethod
def get_options_type(cls) -> Type[OcrOptions]:
    """Return the options class used to configure this OCR model."""
    options_type = TesseractCliOcrOptions
    return options_type
def _parse_orientation(df_osd: pd.DataFrame) -> int:
orientations = df_osd.loc[df_osd["key"] == "Orientation in degrees"].value.tolist()
orientation = parse_tesseract_orientation(orientations[0].strip())
return orientation