feat(SmolDocling): Support MLX acceleration in VLM pipeline (#1199)
* Initial implementation to support MLX for VLM pipeline and SmolDocling Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * mlx_model unit Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Add CLI choices for VLM pipeline and model Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Initial implementation to support MLX for VLM pipeline and SmolDocling Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * mlx_model unit Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Add CLI choices for VLM pipeline and model Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Updated minimal vlm pipeline example Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * make vlm_pipeline python3.9 compatible Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Fixed extract_text_from_backend definition Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Updated README Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Updated example Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Updated documentation Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * corrections in the documentation Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Consmetic changes Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Co-authored-by: Maksym Lysak <mly@zurich.ibm.com> Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
@@ -14,8 +14,13 @@ from docling.backend.md_backend import MarkdownDocumentBackend
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat, Page
|
||||
from docling.datamodel.document import ConversionResult, InputDocument
|
||||
from docling.datamodel.pipeline_options import ResponseFormat, VlmPipelineOptions
|
||||
from docling.datamodel.pipeline_options import (
|
||||
InferenceFramework,
|
||||
ResponseFormat,
|
||||
VlmPipelineOptions,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.hf_mlx_model import HuggingFaceMlxModel
|
||||
from docling.models.hf_vlm_model import HuggingFaceVlmModel
|
||||
from docling.pipeline.base_pipeline import PaginatedPipeline
|
||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||
@@ -29,12 +34,6 @@ class VlmPipeline(PaginatedPipeline):
|
||||
super().__init__(pipeline_options)
|
||||
self.keep_backend = True
|
||||
|
||||
warnings.warn(
|
||||
"The VlmPipeline is currently experimental and may change in upcoming versions without notice.",
|
||||
category=UserWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
|
||||
self.pipeline_options: VlmPipelineOptions
|
||||
|
||||
artifacts_path: Optional[Path] = None
|
||||
@@ -58,14 +57,27 @@ class VlmPipeline(PaginatedPipeline):
|
||||
|
||||
self.keep_images = self.pipeline_options.generate_page_images
|
||||
|
||||
self.build_pipe = [
|
||||
HuggingFaceVlmModel(
|
||||
enabled=True, # must be always enabled for this pipeline to make sense.
|
||||
artifacts_path=artifacts_path,
|
||||
accelerator_options=pipeline_options.accelerator_options,
|
||||
vlm_options=self.pipeline_options.vlm_options,
|
||||
),
|
||||
]
|
||||
if (
|
||||
self.pipeline_options.vlm_options.inference_framework
|
||||
== InferenceFramework.MLX
|
||||
):
|
||||
self.build_pipe = [
|
||||
HuggingFaceMlxModel(
|
||||
enabled=True, # must be always enabled for this pipeline to make sense.
|
||||
artifacts_path=artifacts_path,
|
||||
accelerator_options=pipeline_options.accelerator_options,
|
||||
vlm_options=self.pipeline_options.vlm_options,
|
||||
),
|
||||
]
|
||||
else:
|
||||
self.build_pipe = [
|
||||
HuggingFaceVlmModel(
|
||||
enabled=True, # must be always enabled for this pipeline to make sense.
|
||||
artifacts_path=artifacts_path,
|
||||
accelerator_options=pipeline_options.accelerator_options,
|
||||
vlm_options=self.pipeline_options.vlm_options,
|
||||
),
|
||||
]
|
||||
|
||||
self.enrichment_pipe = [
|
||||
# Other models working on `NodeItem` elements in the DoclingDocument
|
||||
@@ -79,7 +91,9 @@ class VlmPipeline(PaginatedPipeline):
|
||||
|
||||
return page
|
||||
|
||||
def extract_text_from_backend(self, page: Page, bbox: BoundingBox | None) -> str:
|
||||
def extract_text_from_backend(
|
||||
self, page: Page, bbox: Union[BoundingBox, None]
|
||||
) -> str:
|
||||
# Convert bounding box normalized to 0-100 into page coordinates for cropping
|
||||
text = ""
|
||||
if bbox:
|
||||
|
||||
Reference in New Issue
Block a user