feat: expose ocr-lang in CLI (#375)
* feat: expose ocr-lang in CLI Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * use regex for supporting multiple sep Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> --------- Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
926dfd29d5
commit
ed785ea122
@ -1,6 +1,7 @@
|
|||||||
import importlib
|
import importlib
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import re
|
||||||
import time
|
import time
|
||||||
import warnings
|
import warnings
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
@ -129,6 +130,12 @@ def export_documents(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _split_list(raw: Optional[str]) -> Optional[List[str]]:
|
||||||
|
if raw is None:
|
||||||
|
return None
|
||||||
|
return re.split(r"[;,]", raw)
|
||||||
|
|
||||||
|
|
||||||
@app.command(no_args_is_help=True)
|
@app.command(no_args_is_help=True)
|
||||||
def convert(
|
def convert(
|
||||||
input_sources: Annotated[
|
input_sources: Annotated[
|
||||||
@ -163,6 +170,13 @@ def convert(
|
|||||||
ocr_engine: Annotated[
|
ocr_engine: Annotated[
|
||||||
OcrEngine, typer.Option(..., help="The OCR engine to use.")
|
OcrEngine, typer.Option(..., help="The OCR engine to use.")
|
||||||
] = OcrEngine.EASYOCR,
|
] = OcrEngine.EASYOCR,
|
||||||
|
ocr_lang: Annotated[
|
||||||
|
Optional[str],
|
||||||
|
typer.Option(
|
||||||
|
...,
|
||||||
|
help="Provide a comma-separated list of languages used by the OCR engine. Note that each OCR engine has different values for the language names.",
|
||||||
|
),
|
||||||
|
] = None,
|
||||||
pdf_backend: Annotated[
|
pdf_backend: Annotated[
|
||||||
PdfBackend, typer.Option(..., help="The PDF backend to use.")
|
PdfBackend, typer.Option(..., help="The PDF backend to use.")
|
||||||
] = PdfBackend.DLPARSE_V1,
|
] = PdfBackend.DLPARSE_V1,
|
||||||
@ -248,6 +262,10 @@ def convert(
|
|||||||
case _:
|
case _:
|
||||||
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
||||||
|
|
||||||
|
ocr_lang_list = _split_list(ocr_lang)
|
||||||
|
if ocr_lang_list is not None:
|
||||||
|
ocr_options.lang = ocr_lang_list
|
||||||
|
|
||||||
pipeline_options = PdfPipelineOptions(
|
pipeline_options = PdfPipelineOptions(
|
||||||
do_ocr=ocr,
|
do_ocr=ocr,
|
||||||
ocr_options=ocr_options,
|
ocr_options=ocr_options,
|
||||||
|
@ -22,6 +22,7 @@ class TableStructureOptions(BaseModel):
|
|||||||
|
|
||||||
class OcrOptions(BaseModel):
|
class OcrOptions(BaseModel):
|
||||||
kind: str
|
kind: str
|
||||||
|
lang: List[str]
|
||||||
force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
|
force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
|
||||||
bitmap_area_threshold: float = (
|
bitmap_area_threshold: float = (
|
||||||
0.05 # percentage of the area for a bitmap to processed with OCR
|
0.05 # percentage of the area for a bitmap to processed with OCR
|
||||||
|
Loading…
Reference in New Issue
Block a user