feat: Describe pictures using vision models (#259)

* draft for picture description models

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* vlm description using AutoModelForVision2Seq

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add generation options

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update vlm API

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* allow only localhost traffic

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* rename model

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* do not run with vlm api

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* more renaming

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix examples path

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* apply CLI download login

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix name of cli argument

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* use with_smolvlm in models download

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi
2025-02-07 16:30:42 +01:00
committed by GitHub
parent fba3cf9be7
commit 4cc6e3ea5e
14 changed files with 508 additions and 11 deletions

View File

@@ -226,6 +226,10 @@ def convert(
help="Enable the picture classification enrichment model in the pipeline.",
),
] = False,
enrich_picture_description: Annotated[
bool,
typer.Option(..., help="Enable the picture description model in the pipeline."),
] = False,
artifacts_path: Annotated[
Optional[Path],
typer.Option(..., help="If provided, the location of the model artifacts."),
@@ -382,6 +386,7 @@ def convert(
do_table_structure=True,
do_code_enrichment=enrich_code,
do_formula_enrichment=enrich_formula,
do_picture_description=enrich_picture_description,
do_picture_classification=enrich_picture_classes,
document_timeout=document_timeout,
)

View File

@@ -31,6 +31,7 @@ class _AvailableModels(str, Enum):
TABLEFORMER = "tableformer"
CODE_FORMULA = "code_formula"
PICTURE_CLASSIFIER = "picture_classifier"
SMOLVLM = "smolvlm"
EASYOCR = "easyocr"
@@ -81,6 +82,7 @@ def download(
with_tableformer=_AvailableModels.TABLEFORMER in to_download,
with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
with_smolvlm=_AvailableModels.SMOLVLM in to_download,
with_easyocr=_AvailableModels.EASYOCR in to_download,
)