feat(ocr): auto-detect rotated pages in Tesseract (#1167)
* fix(ocr): tesseract support mis-oriented documents Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): update missing test data Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): rotate image to the natural orientation before layout prediction Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): move bounding box rotation util to orientation.py Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): refactor rotation utilities Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): revert layout updates Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): update e2e OCR test data Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * fix(ocr): avoid swallowing tesseract errors causing orientation detection failures Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): revert layout updates Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com> * chore(ocr): update e2e OCR test data * chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrCliModel` * chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrModel` * chore(ocr): default `TesseractOcrCliModel._is_auto` to `False` * fix(ocr): fix `TesseractOcrCliModel._is_auto` computation * chore(ocr): improve logging in case of OSD failure in `TesseractOcrCliModel` and `TesseractOcrModel` --------- Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
This commit is contained in:
@@ -2,6 +2,7 @@ import csv
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
@@ -10,7 +11,7 @@ from typing import List, Optional, Tuple, Type
|
||||
|
||||
import pandas as pd
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||
from docling_core.types.doc.page import TextCell
|
||||
|
||||
from docling.datamodel.base_models import Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
@@ -21,7 +22,11 @@ from docling.datamodel.pipeline_options import (
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.utils.ocr_utils import map_tesseract_script
|
||||
from docling.utils.ocr_utils import (
|
||||
map_tesseract_script,
|
||||
parse_tesseract_orientation,
|
||||
tesseract_box_to_bounding_rectangle,
|
||||
)
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
@@ -49,6 +54,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
self._version: Optional[str] = None
|
||||
self._tesseract_languages: Optional[List[str]] = None
|
||||
self._script_prefix: Optional[str] = None
|
||||
self._is_auto: bool = "auto" in self.options.lang
|
||||
|
||||
if self.enabled:
|
||||
try:
|
||||
@@ -93,14 +99,13 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
|
||||
return name, version
|
||||
|
||||
def _run_tesseract(self, ifilename: str):
|
||||
def _run_tesseract(self, ifilename: str, osd: pd.DataFrame):
|
||||
r"""
|
||||
Run tesseract CLI
|
||||
"""
|
||||
cmd = [self.options.tesseract_cmd]
|
||||
|
||||
if "auto" in self.options.lang:
|
||||
lang = self._detect_language(ifilename)
|
||||
if self._is_auto:
|
||||
lang = self._parse_language(osd)
|
||||
if lang is not None:
|
||||
cmd.append("-l")
|
||||
cmd.append(lang)
|
||||
@@ -115,13 +120,12 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
cmd += [ifilename, "stdout", "tsv"]
|
||||
_log.info("command: {}".format(" ".join(cmd)))
|
||||
|
||||
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
|
||||
output, _ = proc.communicate()
|
||||
output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)
|
||||
|
||||
# _log.info(output)
|
||||
|
||||
# Decode the byte string to a regular string
|
||||
decoded_data = output.decode("utf-8")
|
||||
decoded_data = output.stdout.decode("utf-8")
|
||||
# _log.info(decoded_data)
|
||||
|
||||
# Read the TSV file generated by Tesseract
|
||||
@@ -139,22 +143,24 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
|
||||
return df_filtered
|
||||
|
||||
def _detect_language(self, ifilename: str):
|
||||
def _perform_osd(self, ifilename: str) -> pd.DataFrame:
|
||||
r"""
|
||||
Run tesseract in PSM 0 mode to detect the language
|
||||
"""
|
||||
assert self._tesseract_languages is not None
|
||||
|
||||
cmd = [self.options.tesseract_cmd]
|
||||
cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
|
||||
_log.info("command: {}".format(" ".join(cmd)))
|
||||
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
|
||||
output, _ = proc.communicate()
|
||||
decoded_data = output.decode("utf-8")
|
||||
output = subprocess.run(cmd, capture_output=True, check=True)
|
||||
decoded_data = output.stdout.decode("utf-8")
|
||||
df_detected = pd.read_csv(
|
||||
io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
|
||||
)
|
||||
scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
|
||||
return df_detected
|
||||
|
||||
def _parse_language(self, df_osd: pd.DataFrame) -> Optional[str]:
|
||||
assert self._tesseract_languages is not None
|
||||
scripts = df_osd.loc[df_osd["key"] == "Script"].value.tolist()
|
||||
if len(scripts) == 0:
|
||||
_log.warning("Tesseract cannot detect the script of the page")
|
||||
return None
|
||||
@@ -182,9 +188,8 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
cmd = [self.options.tesseract_cmd]
|
||||
cmd.append("--list-langs")
|
||||
_log.info("command: {}".format(" ".join(cmd)))
|
||||
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
|
||||
output, _ = proc.communicate()
|
||||
decoded_data = output.decode("utf-8")
|
||||
output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)
|
||||
decoded_data = output.stdout.decode("utf-8")
|
||||
df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
|
||||
self._tesseract_languages = df_list[0].tolist()[1:]
|
||||
|
||||
@@ -203,7 +208,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
yield from page_batch
|
||||
return
|
||||
|
||||
for page in page_batch:
|
||||
for page_i, page in enumerate(page_batch):
|
||||
assert page._backend is not None
|
||||
if not page._backend.is_valid():
|
||||
yield page
|
||||
@@ -212,7 +217,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
ocr_rects = self.get_ocr_rects(page)
|
||||
|
||||
all_ocr_cells = []
|
||||
for ocr_rect in ocr_rects:
|
||||
for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
|
||||
# Skip zero area boxes
|
||||
if ocr_rect.area() == 0:
|
||||
continue
|
||||
@@ -225,8 +230,42 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
) as image_file:
|
||||
fname = image_file.name
|
||||
high_res_image.save(image_file)
|
||||
|
||||
df_result = self._run_tesseract(fname)
|
||||
doc_orientation = 0
|
||||
try:
|
||||
df_osd = self._perform_osd(fname)
|
||||
doc_orientation = _parse_orientation(df_osd)
|
||||
except subprocess.CalledProcessError as exc:
|
||||
_log.error(
|
||||
"OSD failed (doc %s, page: %s, "
|
||||
"OCR rectangle: %s, processed image file %s):\n %s",
|
||||
conv_res.input.file,
|
||||
page_i,
|
||||
ocr_rect_i,
|
||||
image_file,
|
||||
exc.stderr,
|
||||
)
|
||||
# Skipping if OSD fail when in auto mode, otherwise proceed
|
||||
# to OCR in the hope OCR will succeed while OSD failed
|
||||
if self._is_auto:
|
||||
continue
|
||||
if doc_orientation != 0:
|
||||
high_res_image = high_res_image.rotate(
|
||||
-doc_orientation, expand=True
|
||||
)
|
||||
high_res_image.save(fname)
|
||||
try:
|
||||
df_result = self._run_tesseract(fname, df_osd)
|
||||
except subprocess.CalledProcessError as exc:
|
||||
_log.error(
|
||||
"tesseract OCR failed (doc %s, page: %s, "
|
||||
"OCR rectangle: %s, processed image file %s):\n %s",
|
||||
conv_res.input.file,
|
||||
page_i,
|
||||
ocr_rect_i,
|
||||
image_file,
|
||||
exc.stderr,
|
||||
)
|
||||
continue
|
||||
finally:
|
||||
if os.path.exists(fname):
|
||||
os.remove(fname)
|
||||
@@ -238,31 +277,30 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
text = row["text"]
|
||||
conf = row["conf"]
|
||||
|
||||
l = float(row["left"]) # noqa: E741
|
||||
b = float(row["top"])
|
||||
w = float(row["width"])
|
||||
h = float(row["height"])
|
||||
|
||||
t = b + h
|
||||
r = l + w
|
||||
|
||||
left, top = float(row["left"]), float(row["top"])
|
||||
right = left + float(row["width"])
|
||||
bottom = top + row["height"]
|
||||
bbox = BoundingBox(
|
||||
l=left,
|
||||
t=top,
|
||||
r=right,
|
||||
b=bottom,
|
||||
coord_origin=CoordOrigin.TOPLEFT,
|
||||
)
|
||||
rect = tesseract_box_to_bounding_rectangle(
|
||||
bbox,
|
||||
original_offset=ocr_rect,
|
||||
scale=self.scale,
|
||||
orientation=doc_orientation,
|
||||
im_size=high_res_image.size,
|
||||
)
|
||||
cell = TextCell(
|
||||
index=ix,
|
||||
text=str(text),
|
||||
orig=str(text),
|
||||
from_ocr=True,
|
||||
confidence=conf / 100.0,
|
||||
rect=BoundingRectangle.from_bounding_box(
|
||||
BoundingBox.from_tuple(
|
||||
coord=(
|
||||
(l / self.scale) + ocr_rect.l,
|
||||
(b / self.scale) + ocr_rect.t,
|
||||
(r / self.scale) + ocr_rect.l,
|
||||
(t / self.scale) + ocr_rect.t,
|
||||
),
|
||||
origin=CoordOrigin.TOPLEFT,
|
||||
)
|
||||
),
|
||||
rect=rect,
|
||||
)
|
||||
all_ocr_cells.append(cell)
|
||||
|
||||
@@ -278,3 +316,9 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
@classmethod
|
||||
def get_options_type(cls) -> Type[OcrOptions]:
|
||||
return TesseractCliOcrOptions
|
||||
|
||||
|
||||
def _parse_orientation(df_osd: pd.DataFrame) -> int:
|
||||
orientations = df_osd.loc[df_osd["key"] == "Orientation in degrees"].value.tolist()
|
||||
orientation = parse_tesseract_orientation(orientations[0].strip())
|
||||
return orientation
|
||||
|
||||
Reference in New Issue
Block a user