fix: Introduce Image format options in CLI. Silence the tqdm downloading messages. (#544)

* fix: main: Introduce format options for Image with the same pdf pipeline_options.
Add RapidOcrOptions to the Union of ocr_options for PdfPipelineOptions

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* fix: Silence the tqdm messages during the downloading of model files

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* fix: Code styling

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

* fix: Use the HF API to disable the tqdm progress bars

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>

---------

Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
Nikos Livathinos 2024-12-09 15:57:37 +01:00 committed by GitHub
parent aca57f0527
commit 78f61a8522
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 13 additions and 5 deletions

View File

@ -372,11 +372,13 @@ def convert(
else:
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
format_options: Dict[InputFormat, FormatOption] = {
InputFormat.PDF: PdfFormatOption(
pdf_format_option = PdfFormatOption(
pipeline_options=pipeline_options,
backend=backend, # pdf_backend
)
format_options: Dict[InputFormat, FormatOption] = {
InputFormat.PDF: pdf_format_option,
InputFormat.IMAGE: pdf_format_option,
}
doc_converter = DocumentConverter(
allowed_formats=from_formats,

View File

@ -143,7 +143,11 @@ class PdfPipelineOptions(PipelineOptions):
table_structure_options: TableStructureOptions = TableStructureOptions()
ocr_options: Union[
EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions
EasyOcrOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
OcrMacOptions,
RapidOcrOptions,
] = Field(EasyOcrOptions(), discriminator="kind")
images_scale: float = 1.0

View File

@ -97,7 +97,9 @@ class StandardPdfPipeline(PaginatedPipeline):
local_dir: Optional[Path] = None, force: bool = False
) -> Path:
from huggingface_hub import snapshot_download
from huggingface_hub.utils import disable_progress_bars
disable_progress_bars()
download_path = snapshot_download(
repo_id="ds4sd/docling-models",
force_download=force,