
* feat: adding new vlm-models support Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fixed the transformers Signed-off-by: Peter Staar <taa@zurich.ibm.com> * got microsoft/Phi-4-multimodal-instruct to work Signed-off-by: Peter Staar <taa@zurich.ibm.com> * working on VLMs Signed-off-by: Peter Staar <taa@zurich.ibm.com> * refactoring the VLM part Signed-off-by: Peter Staar <taa@zurich.ibm.com> * all working, now serious refactoring necessary Signed-off-by: Peter Staar <taa@zurich.ibm.com> * refactoring the download_model Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added the formulate_prompt Signed-off-by: Peter Staar <taa@zurich.ibm.com> * pixtral 12b runs via MLX and native transformers Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added the VlmPredictionToken Signed-off-by: Peter Staar <taa@zurich.ibm.com> * refactoring minimal_vlm_pipeline Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fixed the MyPy Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added pipeline_model_specializations file Signed-off-by: Peter Staar <taa@zurich.ibm.com> * need to get Phi4 working again ... 
Signed-off-by: Peter Staar <taa@zurich.ibm.com> * finalising last points for vlms support Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fixed the pipeline for Phi4 Signed-off-by: Peter Staar <taa@zurich.ibm.com> * streamlining all code Signed-off-by: Peter Staar <taa@zurich.ibm.com> * reformatted the code Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fixing the tests Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added the html backend to the VLM pipeline Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fixed the static load_from_doctags Signed-off-by: Peter Staar <taa@zurich.ibm.com> * restore stable imports Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * use AutoModelForVision2Seq for Pixtral and review example (including rename) Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * remove unused value Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * refactor instances of VLM models Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * skip compare example in CI Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * use lowercase and uppercase only Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add new minimal_vlm example and refactor pipeline_options_vlm_model for cleaner import Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * rename pipeline_vlm_model_spec Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * move more argument to options and simplify model init Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add supported_devices Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * remove not-needed function Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * exclude minimal_vlm Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * missing file Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add message for transformers version Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * rename to specs Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * use module import and remove MLX from non-darwin Signed-off-by: Michele Dolfi 
<dol@zurich.ibm.com> * remove hf_vlm_model and add extra_generation_args Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * use single HF VLM model class Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * remove torch type Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add docs for vision models Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> --------- Signed-off-by: Peter Staar <taa@zurich.ibm.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
187 lines
6.0 KiB
Python
187 lines
6.0 KiB
Python
from collections.abc import Iterable
|
|
from pathlib import Path
|
|
from typing import List, Literal, Optional, Union
|
|
|
|
import numpy as np
|
|
from docling_core.types.doc import (
|
|
DoclingDocument,
|
|
NodeItem,
|
|
PictureClassificationClass,
|
|
PictureClassificationData,
|
|
PictureItem,
|
|
)
|
|
from PIL import Image
|
|
from pydantic import BaseModel
|
|
|
|
from docling.datamodel.accelerator_options import AcceleratorOptions
|
|
from docling.models.base_model import BaseEnrichmentModel
|
|
from docling.models.utils.hf_model_download import download_hf_model
|
|
from docling.utils.accelerator_utils import decide_device
|
|
|
|
|
|
class DocumentPictureClassifierOptions(BaseModel):
    """
    Configuration model for the DocumentPictureClassifier.

    Attributes
    ----------
    kind : Literal["document_picture_classifier"]
        Fixed discriminator string identifying this options type; it can
        only hold (and defaults to) "document_picture_classifier".
    """

    kind: Literal["document_picture_classifier"] = "document_picture_classifier"
|
|
|
|
|
|
class DocumentPictureClassifier(BaseEnrichmentModel):
    """
    A model for classifying pictures in documents.

    This class enriches document pictures with predicted classifications
    based on a predefined set of classes.

    Attributes
    ----------
    enabled : bool
        Whether the classifier is enabled for use.
    options : DocumentPictureClassifierOptions
        Configuration options for the classifier.
    document_picture_classifier : DocumentFigureClassifierPredictor
        The underlying prediction model. This attribute is only set when the
        classifier is enabled.

    Methods
    -------
    __init__(enabled, artifacts_path, options, accelerator_options)
        Initializes the classifier with specified configurations.
    download_models(local_dir, force, progress)
        Downloads the model artifacts and returns their location.
    is_processable(doc, element)
        Checks if the given element can be processed by the classifier.
    __call__(doc, element_batch)
        Processes a batch of elements and adds classification annotations.
    """

    # Sub-folder appended to a caller-supplied ``artifacts_path`` to locate
    # the model files.
    _model_repo_folder = "ds4sd--DocumentFigureClassifier"
    # NOTE(review): not referenced inside this class; presumably consumed by
    # the base class or the pipeline -- confirm.
    images_scale = 2

    def __init__(
        self,
        enabled: bool,
        artifacts_path: Optional[Union[Path, str]],
        options: DocumentPictureClassifierOptions,
        accelerator_options: AcceleratorOptions,
    ):
        """
        Initializes the DocumentPictureClassifier.

        Parameters
        ----------
        enabled : bool
            Indicates whether the classifier is enabled. When False, no model
            is loaded.
        artifacts_path : Optional[Union[Path, str]]
            Directory containing model artifacts; the model is looked up in
            the ``_model_repo_folder`` sub-folder of it. If None, the
            artifacts are obtained through ``download_models()``.
        options : DocumentPictureClassifierOptions
            Configuration options for the classifier.
        accelerator_options : AcceleratorOptions
            Options for configuring the device and parallelism.
        """
        self.enabled = enabled
        self.options = options

        if self.enabled:
            device = decide_device(accelerator_options.device)
            # Imported lazily so the predictor package is only required when
            # the classifier is actually enabled.
            from docling_ibm_models.document_figure_classifier_model.document_figure_classifier_predictor import (
                DocumentFigureClassifierPredictor,
            )

            if artifacts_path is None:
                artifacts_path = self.download_models()
            else:
                # Coerce to Path so plain string paths are accepted as well.
                artifacts_path = Path(artifacts_path) / self._model_repo_folder

            self.document_picture_classifier = DocumentFigureClassifierPredictor(
                artifacts_path=str(artifacts_path),
                device=device,
                num_threads=accelerator_options.num_threads,
            )

    @staticmethod
    def download_models(
        local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
    ) -> Path:
        """
        Downloads the classifier model artifacts.

        Fetches revision "v1.0.1" of the "ds4sd/DocumentFigureClassifier"
        repository through ``download_hf_model``.

        Parameters
        ----------
        local_dir : Optional[Path]
            Forwarded unchanged to ``download_hf_model``.
        force : bool
            Forwarded unchanged to ``download_hf_model``.
        progress : bool
            Forwarded unchanged to ``download_hf_model``.

        Returns
        -------
        Path
            The value returned by ``download_hf_model``.
        """
        return download_hf_model(
            repo_id="ds4sd/DocumentFigureClassifier",
            revision="v1.0.1",
            local_dir=local_dir,
            force=force,
            progress=progress,
        )

    def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
        """
        Determines if the given element can be processed by the classifier.

        Parameters
        ----------
        doc : DoclingDocument
            The document containing the element.
        element : NodeItem
            The element to be checked.

        Returns
        -------
        bool
            True if the element is a PictureItem and processing is enabled; False otherwise.
        """
        return self.enabled and isinstance(element, PictureItem)

    def __call__(
        self,
        doc: DoclingDocument,
        element_batch: Iterable[NodeItem],
    ) -> Iterable[NodeItem]:
        """
        Processes a batch of elements and enriches them with classification predictions.

        Parameters
        ----------
        doc : DoclingDocument
            The document containing the elements to be processed.
        element_batch : Iterable[NodeItem]
            A batch of pictures to classify. Every element must be a
            PictureItem whose image can be retrieved from ``doc``.

        Returns
        -------
        Iterable[NodeItem]
            The elements of the batch, in order. When the classifier is
            enabled, each yielded element has a PictureClassificationData
            entry (provenance "DocumentPictureClassifier") appended to its
            ``annotations`` list. When disabled, elements are yielded
            unchanged.
        """
        if not self.enabled:
            for element in element_batch:
                yield element
            return

        images: List[Union[Image.Image, np.ndarray]] = []
        elements: List[PictureItem] = []
        for el in element_batch:
            assert isinstance(el, PictureItem)
            elements.append(el)
            img = el.get_image(doc)
            assert img is not None
            images.append(img)

        if not images:
            # Nothing to classify: do not invoke the predictor on an empty
            # batch. No element would be yielded in this case anyway.
            return

        outputs = self.document_picture_classifier.predict(images)

        for element, output in zip(elements, outputs):
            # Each prediction is indexed as (class name, confidence).
            element.annotations.append(
                PictureClassificationData(
                    provenance="DocumentPictureClassifier",
                    predicted_classes=[
                        PictureClassificationClass(
                            class_name=pred[0],
                            confidence=pred[1],
                        )
                        for pred in output
                    ],
                )
            )

            yield element
|