ci: add coverage and ruff (#1383)
* add coverage calculation and push

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* new codecov version and usage of token

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* enable ruff formatter instead of black and isort

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* apply ruff lint fixes

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* apply ruff unsafe fixes

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add removed imports

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* runs 1 on linter issues

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* finalize linter fixes

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* Update pyproject.toml

  Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
  Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
@@ -3,9 +3,10 @@ import io
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from subprocess import DEVNULL, PIPE, Popen
|
||||
from typing import Iterable, List, Optional, Tuple, Type
|
||||
from typing import List, Optional, Tuple, Type
|
||||
|
||||
import pandas as pd
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
@@ -63,8 +64,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
)
|
||||
|
||||
def _get_name_and_version(self) -> Tuple[str, str]:
|
||||
|
||||
if self._name != None and self._version != None:
|
||||
if self._name is not None and self._version is not None:
|
||||
return self._name, self._version # type: ignore
|
||||
|
||||
cmd = [self.options.tesseract_cmd, "--version"]
|
||||
@@ -125,14 +125,16 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
# _log.info(decoded_data)
|
||||
|
||||
# Read the TSV file generated by Tesseract
|
||||
df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t")
|
||||
df_result = pd.read_csv(
|
||||
io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t"
|
||||
)
|
||||
|
||||
# Display the dataframe (optional)
|
||||
# _log.info("df: ", df.head())
|
||||
|
||||
# Filter rows that contain actual text (ignore header or empty rows)
|
||||
df_filtered = df[
|
||||
df["text"].notnull() & (df["text"].apply(str).str.strip() != "")
|
||||
df_filtered = df_result[
|
||||
df_result["text"].notna() & (df_result["text"].apply(str).str.strip() != "")
|
||||
]
|
||||
|
||||
return df_filtered
|
||||
@@ -149,10 +151,10 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
|
||||
output, _ = proc.communicate()
|
||||
decoded_data = output.decode("utf-8")
|
||||
df = pd.read_csv(
|
||||
df_detected = pd.read_csv(
|
||||
io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
|
||||
)
|
||||
scripts = df.loc[df["key"] == "Script"].value.tolist()
|
||||
scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
|
||||
if len(scripts) == 0:
|
||||
_log.warning("Tesseract cannot detect the script of the page")
|
||||
return None
|
||||
@@ -183,11 +185,11 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
|
||||
output, _ = proc.communicate()
|
||||
decoded_data = output.decode("utf-8")
|
||||
df = pd.read_csv(io.StringIO(decoded_data), header=None)
|
||||
self._tesseract_languages = df[0].tolist()[1:]
|
||||
df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
|
||||
self._tesseract_languages = df_list[0].tolist()[1:]
|
||||
|
||||
# Decide the script prefix
|
||||
if any([l.startswith("script/") for l in self._tesseract_languages]):
|
||||
if any(lang.startswith("script/") for lang in self._tesseract_languages):
|
||||
script_prefix = "script/"
|
||||
else:
|
||||
script_prefix = ""
|
||||
@@ -197,7 +199,6 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
|
||||
if not self.enabled:
|
||||
yield from page_batch
|
||||
return
|
||||
@@ -225,19 +226,19 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
fname = image_file.name
|
||||
high_res_image.save(image_file)
|
||||
|
||||
df = self._run_tesseract(fname)
|
||||
df_result = self._run_tesseract(fname)
|
||||
finally:
|
||||
if os.path.exists(fname):
|
||||
os.remove(fname)
|
||||
|
||||
# _log.info(df)
|
||||
# _log.info(df_result)
|
||||
|
||||
# Print relevant columns (bounding box and text)
|
||||
for ix, row in df.iterrows():
|
||||
for ix, row in df_result.iterrows():
|
||||
text = row["text"]
|
||||
conf = row["conf"]
|
||||
|
||||
l = float(row["left"])
|
||||
l = float(row["left"]) # noqa: E741
|
||||
b = float(row["top"])
|
||||
w = float(row["width"])
|
||||
h = float(row["height"])
|
||||
|
||||
Reference in New Issue
Block a user