feat(ocr): auto-detect rotated pages in Tesseract (#1167)

* fix(ocr): support mis-oriented documents in Tesseract Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
* fix(ocr): update missing test data Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
* fix(ocr): rotate image to the natural orientation before layout prediction Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
* fix(ocr): move bounding box rotation util to orientation.py Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
* fix(ocr): refactor rotation utilities Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
* chore(ocr): revert layout updates Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
* chore(ocr): update e2e OCR test data Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
* fix(ocr): avoid swallowing Tesseract errors that cause orientation detection failures Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
* chore(ocr): revert layout updates Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
* chore(ocr): update e2e OCR test data
* chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrCliModel`
* chore(ocr): proceed to OCR without rotation when OSD fails in `TesseractOcrModel`
* chore(ocr): default `TesseractOcrCliModel._is_auto` to `False`
* fix(ocr): fix `TesseractOcrCliModel._is_auto` computation
* chore(ocr): improve logging in case of OSD failure in `TesseractOcrCliModel` and `TesseractOcrModel`
---------
Signed-off-by: Clément Doumouro <clement.doumouro@gmail.com>
2025-05-21 18:12:33 +02:00
parent 90875247e5
commit 45265bf8b1
96 changed files with 9864 additions and 5258 deletions
@@ -2,6 +2,7 @@ import csv
 import io
 import logging
 import os
+import subprocess
 import tempfile
 from collections.abc import Iterable
 from pathlib import Path
@@ -10,7 +11,7 @@ from typing import List, Optional, Tuple, Type

 import pandas as pd
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_core.types.doc.page import BoundingRectangle, TextCell
+from docling_core.types.doc.page import TextCell

 from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
@@ -21,7 +22,11 @@ from docling.datamodel.pipeline_options import (
 )
 from docling.datamodel.settings import settings
 from docling.models.base_ocr_model import BaseOcrModel
-from docling.utils.ocr_utils import map_tesseract_script
+from docling.utils.ocr_utils import (
+    map_tesseract_script,
+    parse_tesseract_orientation,
+    tesseract_box_to_bounding_rectangle,
+)
 from docling.utils.profiling import TimeRecorder

 _log = logging.getLogger(__name__)
@@ -49,6 +54,7 @@ class TesseractOcrCliModel(BaseOcrModel):
        self._version: Optional[str] = None
        self._tesseract_languages: Optional[List[str]] = None
        self._script_prefix: Optional[str] = None
+        self._is_auto: bool = "auto" in self.options.lang

        if self.enabled:
            try:
@@ -93,14 +99,13 @@ class TesseractOcrCliModel(BaseOcrModel):

        return name, version

-    def _run_tesseract(self, ifilename: str):
+    def _run_tesseract(self, ifilename: str, osd: pd.DataFrame):
        r"""
        Run tesseract CLI
        """
        cmd = [self.options.tesseract_cmd]
-
-        if "auto" in self.options.lang:
-            lang = self._detect_language(ifilename)
+        if self._is_auto:
+            lang = self._parse_language(osd)
            if lang is not None:
                cmd.append("-l")
                cmd.append(lang)
@@ -115,13 +120,12 @@ class TesseractOcrCliModel(BaseOcrModel):
        cmd += [ifilename, "stdout", "tsv"]
        _log.info("command: {}".format(" ".join(cmd)))

-        proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
-        output, _ = proc.communicate()
+        output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)

        # _log.info(output)

        # Decode the byte string to a regular string
-        decoded_data = output.decode("utf-8")
+        decoded_data = output.stdout.decode("utf-8")
        # _log.info(decoded_data)

        # Read the TSV file generated by Tesseract
@@ -139,22 +143,24 @@ class TesseractOcrCliModel(BaseOcrModel):

        return df_filtered

-    def _detect_language(self, ifilename: str):
+    def _perform_osd(self, ifilename: str) -> pd.DataFrame:
        r"""
-        Run tesseract in PSM 0 mode to detect the language
+        Run tesseract in PSM 0 mode with the ``osd`` language on ``ifilename``
+        and return its ``key: value`` stdout parsed into a DataFrame with
+        columns ``key`` and ``value``.
+
+        Raises ``subprocess.CalledProcessError`` when tesseract exits with a
+        non-zero status (the command is run with ``check=True``).
        """
-        assert self._tesseract_languages is not None

        cmd = [self.options.tesseract_cmd]
        cmd.extend(["--psm", "0", "-l", "osd", ifilename, "stdout"])
        _log.info("command: {}".format(" ".join(cmd)))
-        proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
-        output, _ = proc.communicate()
-        decoded_data = output.decode("utf-8")
+        output = subprocess.run(cmd, capture_output=True, check=True)
+        decoded_data = output.stdout.decode("utf-8")
        df_detected = pd.read_csv(
            io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
        )
-        scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
+        return df_detected
+
+    def _parse_language(self, df_osd: pd.DataFrame) -> Optional[str]:
+        assert self._tesseract_languages is not None
+        scripts = df_osd.loc[df_osd["key"] == "Script"].value.tolist()
        if len(scripts) == 0:
            _log.warning("Tesseract cannot detect the script of the page")
            return None
@@ -182,9 +188,8 @@ class TesseractOcrCliModel(BaseOcrModel):
        cmd = [self.options.tesseract_cmd]
        cmd.append("--list-langs")
        _log.info("command: {}".format(" ".join(cmd)))
-        proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
-        output, _ = proc.communicate()
-        decoded_data = output.decode("utf-8")
+        output = subprocess.run(cmd, stdout=PIPE, stderr=DEVNULL, check=True)
+        decoded_data = output.stdout.decode("utf-8")
        df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
        self._tesseract_languages = df_list[0].tolist()[1:]

@@ -203,7 +208,7 @@ class TesseractOcrCliModel(BaseOcrModel):
            yield from page_batch
            return

-        for page in page_batch:
+        for page_i, page in enumerate(page_batch):
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
@@ -212,7 +217,7 @@ class TesseractOcrCliModel(BaseOcrModel):
                    ocr_rects = self.get_ocr_rects(page)

                    all_ocr_cells = []
-                    for ocr_rect in ocr_rects:
+                    for ocr_rect_i, ocr_rect in enumerate(ocr_rects):
                        # Skip zero area boxes
                        if ocr_rect.area() == 0:
                            continue
@@ -225,8 +230,42 @@ class TesseractOcrCliModel(BaseOcrModel):
                            ) as image_file:
                                fname = image_file.name
                                high_res_image.save(image_file)
-
-                            df_result = self._run_tesseract(fname)
+                            doc_orientation = 0
+                            try:
+                                df_osd = self._perform_osd(fname)
+                                doc_orientation = _parse_orientation(df_osd)
+                            except subprocess.CalledProcessError as exc:
+                                _log.error(
+                                    "OSD failed (doc %s, page: %s, "
+                                    "OCR rectangle: %s, processed image file %s):\n %s",
+                                    conv_res.input.file,
+                                    page_i,
+                                    ocr_rect_i,
+                                    image_file,
+                                    exc.stderr,
+                                )
+                                # Skipping if OSD fail when in auto mode, otherwise proceed
+                                # to OCR in the hope OCR will succeed while OSD failed
+                                if self._is_auto:
+                                    continue
+                            if doc_orientation != 0:
+                                high_res_image = high_res_image.rotate(
+                                    -doc_orientation, expand=True
+                                )
+                                high_res_image.save(fname)
+                            try:
+                                df_result = self._run_tesseract(fname, df_osd)
+                            except subprocess.CalledProcessError as exc:
+                                _log.error(
+                                    "tesseract OCR failed (doc %s, page: %s, "
+                                    "OCR rectangle: %s, processed image file %s):\n %s",
+                                    conv_res.input.file,
+                                    page_i,
+                                    ocr_rect_i,
+                                    image_file,
+                                    exc.stderr,
+                                )
+                                continue
                        finally:
                            if os.path.exists(fname):
                                os.remove(fname)
@@ -238,31 +277,30 @@ class TesseractOcrCliModel(BaseOcrModel):
                            text = row["text"]
                            conf = row["conf"]

-                            l = float(row["left"])  # noqa: E741
-                            b = float(row["top"])
-                            w = float(row["width"])
-                            h = float(row["height"])
-
-                            t = b + h
-                            r = l + w
-
+                            left, top = float(row["left"]), float(row["top"])
+                            right = left + float(row["width"])
+                            bottom = top + row["height"]
+                            bbox = BoundingBox(
+                                l=left,
+                                t=top,
+                                r=right,
+                                b=bottom,
+                                coord_origin=CoordOrigin.TOPLEFT,
+                            )
+                            rect = tesseract_box_to_bounding_rectangle(
+                                bbox,
+                                original_offset=ocr_rect,
+                                scale=self.scale,
+                                orientation=doc_orientation,
+                                im_size=high_res_image.size,
+                            )
                            cell = TextCell(
                                index=ix,
                                text=str(text),
                                orig=str(text),
                                from_ocr=True,
                                confidence=conf / 100.0,
-                                rect=BoundingRectangle.from_bounding_box(
-                                    BoundingBox.from_tuple(
-                                        coord=(
-                                            (l / self.scale) + ocr_rect.l,
-                                            (b / self.scale) + ocr_rect.t,
-                                            (r / self.scale) + ocr_rect.l,
-                                            (t / self.scale) + ocr_rect.t,
-                                        ),
-                                        origin=CoordOrigin.TOPLEFT,
-                                    )
-                                ),
+                                rect=rect,
                            )
                            all_ocr_cells.append(cell)

@@ -278,3 +316,9 @@ class TesseractOcrCliModel(BaseOcrModel):
    @classmethod
    def get_options_type(cls) -> Type[OcrOptions]:
+        """Return ``TesseractCliOcrOptions``, the options type reported by this class."""
        return TesseractCliOcrOptions
+
+
+def _parse_orientation(df_osd: pd.DataFrame) -> int:
+    """Extract the page orientation from Tesseract OSD output.
+
+    Args:
+        df_osd: OSD output parsed into ``key``/``value`` columns, as built by
+            ``TesseractOcrCliModel._perform_osd``.
+
+    Returns:
+        The result of ``parse_tesseract_orientation`` applied to the first
+        ``Orientation in degrees`` entry, or ``0`` (no rotation) when the OSD
+        output contains no such entry.
+    """
+    orientations = df_osd.loc[df_osd["key"] == "Orientation in degrees"].value.tolist()
+    if not orientations:
+        # Same policy as `_parse_language` for a missing "Script" row: warn and
+        # fall back instead of raising IndexError, so OCR proceeds unrotated.
+        _log.warning("Tesseract cannot detect the orientation of the page")
+        return 0
+    return parse_tesseract_orientation(orientations[0].strip())