ci: add coverage and ruff (#1383)

* add coverage calculation and push Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * new codecov version and usage of token Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * enable ruff formatter instead of black and isort Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * apply ruff lint fixes Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * apply ruff unsafe fixes Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add removed imports Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * runs 1 on linter issues Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * finalize linter fixes Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * Update pyproject.toml Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> --------- Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
2025-04-14 18:01:26 +02:00
parent 293c28ca7c
commit 5458a88464
104 changed files with 665 additions and 633 deletions
--- a/docling/models/tesseract_ocr_cli_model.py
+++ b/docling/models/tesseract_ocr_cli_model.py
@@ -3,9 +3,10 @@ import io
 import logging
 import os
 import tempfile
+from collections.abc import Iterable
 from pathlib import Path
 from subprocess import DEVNULL, PIPE, Popen
-from typing import Iterable, List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type

 import pandas as pd
 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -63,8 +64,7 @@ class TesseractOcrCliModel(BaseOcrModel):
                )

    def _get_name_and_version(self) -> Tuple[str, str]:
-
-        if self._name != None and self._version != None:
+        if self._name is not None and self._version is not None:
            return self._name, self._version  # type: ignore

        cmd = [self.options.tesseract_cmd, "--version"]
@@ -125,14 +125,16 @@ class TesseractOcrCliModel(BaseOcrModel):
        # _log.info(decoded_data)

        # Read the TSV file generated by Tesseract
-        df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t")
+        df_result = pd.read_csv(
+            io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t"
+        )

        # Display the dataframe (optional)
        # _log.info("df: ", df.head())

        # Filter rows that contain actual text (ignore header or empty rows)
-        df_filtered = df[
-            df["text"].notnull() & (df["text"].apply(str).str.strip() != "")
+        df_filtered = df_result[
+            df_result["text"].notna() & (df_result["text"].apply(str).str.strip() != "")
        ]

        return df_filtered
@@ -149,10 +151,10 @@ class TesseractOcrCliModel(BaseOcrModel):
        proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
        output, _ = proc.communicate()
        decoded_data = output.decode("utf-8")
-        df = pd.read_csv(
+        df_detected = pd.read_csv(
            io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
        )
-        scripts = df.loc[df["key"] == "Script"].value.tolist()
+        scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
        if len(scripts) == 0:
            _log.warning("Tesseract cannot detect the script of the page")
            return None
@@ -183,11 +185,11 @@ class TesseractOcrCliModel(BaseOcrModel):
        proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
        output, _ = proc.communicate()
        decoded_data = output.decode("utf-8")
-        df = pd.read_csv(io.StringIO(decoded_data), header=None)
-        self._tesseract_languages = df[0].tolist()[1:]
+        df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
+        self._tesseract_languages = df_list[0].tolist()[1:]

        # Decide the script prefix
-        if any([l.startswith("script/") for l in self._tesseract_languages]):
+        if any(lang.startswith("script/") for lang in self._tesseract_languages):
            script_prefix = "script/"
        else:
            script_prefix = ""
@@ -197,7 +199,6 @@ class TesseractOcrCliModel(BaseOcrModel):
    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
-
        if not self.enabled:
            yield from page_batch
            return
@@ -225,19 +226,19 @@ class TesseractOcrCliModel(BaseOcrModel):
                                fname = image_file.name
                                high_res_image.save(image_file)

-                            df = self._run_tesseract(fname)
+                            df_result = self._run_tesseract(fname)
                        finally:
                            if os.path.exists(fname):
                                os.remove(fname)

-                        # _log.info(df)
+                        # _log.info(df_result)

                        # Print relevant columns (bounding box and text)
-                        for ix, row in df.iterrows():
+                        for ix, row in df_result.iterrows():
                            text = row["text"]
                            conf = row["conf"]

-                            l = float(row["left"])
+                            l = float(row["left"])  # noqa: E741
                            b = float(row["top"])
                            w = float(row["width"])
                            h = float(row["height"])