feat: expose ocr-lang in CLI (#375)

* feat: expose ocr-lang in CLI

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* use regex to support multiple separators

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
2024-11-19 15:58:49 +01:00
parent 926dfd29d5
commit ed785ea122
2 changed files with 19 additions and 0 deletions
@@ -1,6 +1,7 @@
 import importlib
 import json
 import logging
+import re
 import time
 import warnings
 from enum import Enum
@@ -129,6 +130,12 @@ def export_documents(
    )


+def _split_list(raw: Optional[str]) -> Optional[List[str]]:
+    if raw is None:
+        return None
+    return re.split(r"[;,]", raw)
+
+
@app.command(no_args_is_help=True)
 def convert(
    input_sources: Annotated[
@@ -163,6 +170,13 @@ def convert(
    ocr_engine: Annotated[
        OcrEngine, typer.Option(..., help="The OCR engine to use.")
    ] = OcrEngine.EASYOCR,
+    ocr_lang: Annotated[
+        Optional[str],
+        typer.Option(
+            ...,
+            help="Provide a comma-separated list of languages used by the OCR engine. Note that each OCR engine has different values for the language names.",
+        ),
+    ] = None,
    pdf_backend: Annotated[
        PdfBackend, typer.Option(..., help="The PDF backend to use.")
    ] = PdfBackend.DLPARSE_V1,
@@ -248,6 +262,10 @@ def convert(
        case _:
            raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")

+    ocr_lang_list = _split_list(ocr_lang)
+    if ocr_lang_list is not None:
+        ocr_options.lang = ocr_lang_list
+
    pipeline_options = PdfPipelineOptions(
        do_ocr=ocr,
        ocr_options=ocr_options,
@@ -22,6 +22,7 @@ class TableStructureOptions(BaseModel):

 class OcrOptions(BaseModel):
    kind: str
+    lang: List[str]
    force_full_page_ocr: bool = False  # If enabled a full page OCR is always applied
    bitmap_area_threshold: float = (
        0.05  # percentage of the area for a bitmap to processed with OCR