feat: new vlm-models support (#1570)
* feat: adding new vlm-models support Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fixed the transformers Signed-off-by: Peter Staar <taa@zurich.ibm.com> * got microsoft/Phi-4-multimodal-instruct to work Signed-off-by: Peter Staar <taa@zurich.ibm.com> * working on vlm's Signed-off-by: Peter Staar <taa@zurich.ibm.com> * refactoring the VLM part Signed-off-by: Peter Staar <taa@zurich.ibm.com> * all working, now serious refacgtoring necessary Signed-off-by: Peter Staar <taa@zurich.ibm.com> * refactoring the download_model Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added the formulate_prompt Signed-off-by: Peter Staar <taa@zurich.ibm.com> * pixtral 12b runs via MLX and native transformers Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added the VlmPredictionToken Signed-off-by: Peter Staar <taa@zurich.ibm.com> * refactoring minimal_vlm_pipeline Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fixed the MyPy Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added pipeline_model_specializations file Signed-off-by: Peter Staar <taa@zurich.ibm.com> * need to get Phi4 working again ... Signed-off-by: Peter Staar <taa@zurich.ibm.com> * finalising last points for vlms support Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fixed the pipeline for Phi4 Signed-off-by: Peter Staar <taa@zurich.ibm.com> * streamlining all code Signed-off-by: Peter Staar <taa@zurich.ibm.com> * reformatted the code Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fixing the tests Signed-off-by: Peter Staar <taa@zurich.ibm.com> * added the html backend to the VLM pipeline Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fixed the static load_from_doctags Signed-off-by: Peter Staar <taa@zurich.ibm.com> * restore stable imports Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * use AutoModelForVision2Seq for Pixtral and review example (including rename) Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * remove unused value Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * refactor instances of VLM models Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * skip compare example in CI Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * use lowercase and uppercase only Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add new minimal_vlm example and refactor pipeline_options_vlm_model for cleaner import Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * rename pipeline_vlm_model_spec Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * move more argument to options and simplify model init Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add supported_devices Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * remove not-needed function Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * exclude minimal_vlm Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * missing file Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add message for transformers version Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * rename to specs Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * use module import and remove MLX from non-darwin Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * remove hf_vlm_model and add extra_generation_args Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * use single HF VLM model class Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * remove torch type Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add docs for vision models Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> --------- Signed-off-by: Peter Staar <taa@zurich.ibm.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
committed by
GitHub
parent
08dcacc5cb
commit
cfdf4cea25
@@ -1,29 +1,46 @@
|
||||
import logging
|
||||
import re
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Union, cast
|
||||
|
||||
from docling_core.types import DoclingDocument
|
||||
from docling_core.types.doc import BoundingBox, DocItem, ImageRef, PictureItem, TextItem
|
||||
from docling_core.types.doc import (
|
||||
BoundingBox,
|
||||
DocItem,
|
||||
DoclingDocument,
|
||||
ImageRef,
|
||||
PictureItem,
|
||||
ProvenanceItem,
|
||||
TextItem,
|
||||
)
|
||||
from docling_core.types.doc.base import (
|
||||
BoundingBox,
|
||||
Size,
|
||||
)
|
||||
from docling_core.types.doc.document import DocTagsDocument
|
||||
from PIL import Image as PILImage
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
from docling.backend.md_backend import MarkdownDocumentBackend
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat, Page
|
||||
from docling.datamodel.document import ConversionResult, InputDocument
|
||||
from docling.datamodel.pipeline_options import (
|
||||
ApiVlmOptions,
|
||||
HuggingFaceVlmOptions,
|
||||
InferenceFramework,
|
||||
ResponseFormat,
|
||||
VlmPipelineOptions,
|
||||
)
|
||||
from docling.datamodel.pipeline_options_vlm_model import (
|
||||
ApiVlmOptions,
|
||||
InferenceFramework,
|
||||
InlineVlmOptions,
|
||||
ResponseFormat,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.api_vlm_model import ApiVlmModel
|
||||
from docling.models.hf_mlx_model import HuggingFaceMlxModel
|
||||
from docling.models.hf_vlm_model import HuggingFaceVlmModel
|
||||
from docling.models.vlm_models_inline.hf_transformers_model import (
|
||||
HuggingFaceTransformersVlmModel,
|
||||
)
|
||||
from docling.models.vlm_models_inline.mlx_model import HuggingFaceMlxModel
|
||||
from docling.pipeline.base_pipeline import PaginatedPipeline
|
||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||
|
||||
@@ -66,8 +83,8 @@ class VlmPipeline(PaginatedPipeline):
|
||||
vlm_options=cast(ApiVlmOptions, self.pipeline_options.vlm_options),
|
||||
),
|
||||
]
|
||||
elif isinstance(self.pipeline_options.vlm_options, HuggingFaceVlmOptions):
|
||||
vlm_options = cast(HuggingFaceVlmOptions, self.pipeline_options.vlm_options)
|
||||
elif isinstance(self.pipeline_options.vlm_options, InlineVlmOptions):
|
||||
vlm_options = cast(InlineVlmOptions, self.pipeline_options.vlm_options)
|
||||
if vlm_options.inference_framework == InferenceFramework.MLX:
|
||||
self.build_pipe = [
|
||||
HuggingFaceMlxModel(
|
||||
@@ -77,15 +94,19 @@ class VlmPipeline(PaginatedPipeline):
|
||||
vlm_options=vlm_options,
|
||||
),
|
||||
]
|
||||
else:
|
||||
elif vlm_options.inference_framework == InferenceFramework.TRANSFORMERS:
|
||||
self.build_pipe = [
|
||||
HuggingFaceVlmModel(
|
||||
HuggingFaceTransformersVlmModel(
|
||||
enabled=True, # must be always enabled for this pipeline to make sense.
|
||||
artifacts_path=artifacts_path,
|
||||
accelerator_options=pipeline_options.accelerator_options,
|
||||
vlm_options=vlm_options,
|
||||
),
|
||||
]
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Could not instantiate the right type of VLM pipeline: {vlm_options.inference_framework}"
|
||||
)
|
||||
|
||||
self.enrichment_pipe = [
|
||||
# Other models working on `NodeItem` elements in the DoclingDocument
|
||||
@@ -116,49 +137,19 @@ class VlmPipeline(PaginatedPipeline):
|
||||
self.pipeline_options.vlm_options.response_format
|
||||
== ResponseFormat.DOCTAGS
|
||||
):
|
||||
doctags_list = []
|
||||
image_list = []
|
||||
for page in conv_res.pages:
|
||||
predicted_doctags = ""
|
||||
img = PILImage.new("RGB", (1, 1), "rgb(255,255,255)")
|
||||
if page.predictions.vlm_response:
|
||||
predicted_doctags = page.predictions.vlm_response.text
|
||||
if page.image:
|
||||
img = page.image
|
||||
image_list.append(img)
|
||||
doctags_list.append(predicted_doctags)
|
||||
conv_res.document = self._turn_dt_into_doc(conv_res)
|
||||
|
||||
doctags_list_c = cast(List[Union[Path, str]], doctags_list)
|
||||
image_list_c = cast(List[Union[Path, PILImage.Image]], image_list)
|
||||
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
|
||||
doctags_list_c, image_list_c
|
||||
)
|
||||
conv_res.document = DoclingDocument.load_from_doctags(doctags_doc)
|
||||
|
||||
# If forced backend text, replace model predicted text with backend one
|
||||
if self.force_backend_text:
|
||||
scale = self.pipeline_options.images_scale
|
||||
for element, _level in conv_res.document.iterate_items():
|
||||
if not isinstance(element, TextItem) or len(element.prov) == 0:
|
||||
continue
|
||||
page_ix = element.prov[0].page_no - 1
|
||||
page = conv_res.pages[page_ix]
|
||||
if not page.size:
|
||||
continue
|
||||
crop_bbox = (
|
||||
element.prov[0]
|
||||
.bbox.scaled(scale=scale)
|
||||
.to_top_left_origin(page_height=page.size.height * scale)
|
||||
)
|
||||
txt = self.extract_text_from_backend(page, crop_bbox)
|
||||
element.text = txt
|
||||
element.orig = txt
|
||||
elif (
|
||||
self.pipeline_options.vlm_options.response_format
|
||||
== ResponseFormat.MARKDOWN
|
||||
):
|
||||
conv_res.document = self._turn_md_into_doc(conv_res)
|
||||
|
||||
elif (
|
||||
self.pipeline_options.vlm_options.response_format == ResponseFormat.HTML
|
||||
):
|
||||
conv_res.document = self._turn_html_into_doc(conv_res)
|
||||
|
||||
else:
|
||||
raise RuntimeError(
|
||||
f"Unsupported VLM response format {self.pipeline_options.vlm_options.response_format}"
|
||||
@@ -192,23 +183,199 @@ class VlmPipeline(PaginatedPipeline):
|
||||
|
||||
return conv_res
|
||||
|
||||
def _turn_md_into_doc(self, conv_res):
|
||||
predicted_text = ""
|
||||
for pg_idx, page in enumerate(conv_res.pages):
|
||||
def _turn_dt_into_doc(self, conv_res) -> DoclingDocument:
|
||||
doctags_list = []
|
||||
image_list = []
|
||||
for page in conv_res.pages:
|
||||
predicted_doctags = ""
|
||||
img = PILImage.new("RGB", (1, 1), "rgb(255,255,255)")
|
||||
if page.predictions.vlm_response:
|
||||
predicted_text += page.predictions.vlm_response.text + "\n\n"
|
||||
response_bytes = BytesIO(predicted_text.encode("utf8"))
|
||||
out_doc = InputDocument(
|
||||
path_or_stream=response_bytes,
|
||||
filename=conv_res.input.file.name,
|
||||
format=InputFormat.MD,
|
||||
backend=MarkdownDocumentBackend,
|
||||
predicted_doctags = page.predictions.vlm_response.text
|
||||
if page.image:
|
||||
img = page.image
|
||||
image_list.append(img)
|
||||
doctags_list.append(predicted_doctags)
|
||||
|
||||
doctags_list_c = cast(List[Union[Path, str]], doctags_list)
|
||||
image_list_c = cast(List[Union[Path, PILImage.Image]], image_list)
|
||||
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
|
||||
doctags_list_c, image_list_c
|
||||
)
|
||||
backend = MarkdownDocumentBackend(
|
||||
in_doc=out_doc,
|
||||
path_or_stream=response_bytes,
|
||||
conv_res.document = DoclingDocument.load_from_doctags(
|
||||
doctag_document=doctags_doc
|
||||
)
|
||||
return backend.convert()
|
||||
|
||||
# If forced backend text, replace model predicted text with backend one
|
||||
if page.size:
|
||||
if self.force_backend_text:
|
||||
scale = self.pipeline_options.images_scale
|
||||
for element, _level in conv_res.document.iterate_items():
|
||||
if not isinstance(element, TextItem) or len(element.prov) == 0:
|
||||
continue
|
||||
crop_bbox = (
|
||||
element.prov[0]
|
||||
.bbox.scaled(scale=scale)
|
||||
.to_top_left_origin(page_height=page.size.height * scale)
|
||||
)
|
||||
txt = self.extract_text_from_backend(page, crop_bbox)
|
||||
element.text = txt
|
||||
element.orig = txt
|
||||
|
||||
return conv_res.document
|
||||
|
||||
def _turn_md_into_doc(self, conv_res):
|
||||
def _extract_markdown_code(text):
|
||||
"""
|
||||
Extracts text from markdown code blocks (enclosed in triple backticks).
|
||||
If no code blocks are found, returns the original text.
|
||||
|
||||
Args:
|
||||
text (str): Input text that may contain markdown code blocks
|
||||
|
||||
Returns:
|
||||
str: Extracted code if code blocks exist, otherwise original text
|
||||
"""
|
||||
# Regex pattern to match content between triple backticks
|
||||
# This handles multiline content and optional language specifier
|
||||
pattern = r"^```(?:\w*\n)?(.*?)```(\n)*$"
|
||||
|
||||
# Search with DOTALL flag to match across multiple lines
|
||||
mtch = re.search(pattern, text, re.DOTALL)
|
||||
|
||||
if mtch:
|
||||
# Return only the content of the first capturing group
|
||||
return mtch.group(1)
|
||||
else:
|
||||
# No code blocks found, return original text
|
||||
return text
|
||||
|
||||
for pg_idx, page in enumerate(conv_res.pages):
|
||||
page_no = pg_idx + 1 # FIXME: might be incorrect
|
||||
|
||||
predicted_text = ""
|
||||
if page.predictions.vlm_response:
|
||||
predicted_text = page.predictions.vlm_response.text + "\n\n"
|
||||
|
||||
predicted_text = _extract_markdown_code(text=predicted_text)
|
||||
|
||||
response_bytes = BytesIO(predicted_text.encode("utf8"))
|
||||
out_doc = InputDocument(
|
||||
path_or_stream=response_bytes,
|
||||
filename=conv_res.input.file.name,
|
||||
format=InputFormat.MD,
|
||||
backend=MarkdownDocumentBackend,
|
||||
)
|
||||
backend = MarkdownDocumentBackend(
|
||||
in_doc=out_doc,
|
||||
path_or_stream=response_bytes,
|
||||
)
|
||||
page_doc = backend.convert()
|
||||
|
||||
if page.image is not None:
|
||||
pg_width = page.image.width
|
||||
pg_height = page.image.height
|
||||
else:
|
||||
pg_width = 1
|
||||
pg_height = 1
|
||||
|
||||
conv_res.document.add_page(
|
||||
page_no=page_no,
|
||||
size=Size(width=pg_width, height=pg_height),
|
||||
image=ImageRef.from_pil(image=page.image, dpi=72)
|
||||
if page.image
|
||||
else None,
|
||||
)
|
||||
|
||||
for item, level in page_doc.iterate_items():
|
||||
item.prov = [
|
||||
ProvenanceItem(
|
||||
page_no=pg_idx + 1,
|
||||
bbox=BoundingBox(
|
||||
t=0.0, b=0.0, l=0.0, r=0.0
|
||||
), # FIXME: would be nice not to have to "fake" it
|
||||
charspan=[0, 0],
|
||||
)
|
||||
]
|
||||
conv_res.document.append_child_item(child=item)
|
||||
|
||||
return conv_res.document
|
||||
|
||||
def _turn_html_into_doc(self, conv_res):
|
||||
def _extract_html_code(text):
|
||||
"""
|
||||
Extracts text from markdown code blocks (enclosed in triple backticks).
|
||||
If no code blocks are found, returns the original text.
|
||||
|
||||
Args:
|
||||
text (str): Input text that may contain markdown code blocks
|
||||
|
||||
Returns:
|
||||
str: Extracted code if code blocks exist, otherwise original text
|
||||
"""
|
||||
# Regex pattern to match content between triple backticks
|
||||
# This handles multiline content and optional language specifier
|
||||
pattern = r"^```(?:\w*\n)?(.*?)```(\n)*$"
|
||||
|
||||
# Search with DOTALL flag to match across multiple lines
|
||||
mtch = re.search(pattern, text, re.DOTALL)
|
||||
|
||||
if mtch:
|
||||
# Return only the content of the first capturing group
|
||||
return mtch.group(1)
|
||||
else:
|
||||
# No code blocks found, return original text
|
||||
return text
|
||||
|
||||
for pg_idx, page in enumerate(conv_res.pages):
|
||||
page_no = pg_idx + 1 # FIXME: might be incorrect
|
||||
|
||||
predicted_text = ""
|
||||
if page.predictions.vlm_response:
|
||||
predicted_text = page.predictions.vlm_response.text + "\n\n"
|
||||
|
||||
predicted_text = _extract_html_code(text=predicted_text)
|
||||
|
||||
response_bytes = BytesIO(predicted_text.encode("utf8"))
|
||||
out_doc = InputDocument(
|
||||
path_or_stream=response_bytes,
|
||||
filename=conv_res.input.file.name,
|
||||
format=InputFormat.MD,
|
||||
backend=HTMLDocumentBackend,
|
||||
)
|
||||
backend = HTMLDocumentBackend(
|
||||
in_doc=out_doc,
|
||||
path_or_stream=response_bytes,
|
||||
)
|
||||
page_doc = backend.convert()
|
||||
|
||||
if page.image is not None:
|
||||
pg_width = page.image.width
|
||||
pg_height = page.image.height
|
||||
else:
|
||||
pg_width = 1
|
||||
pg_height = 1
|
||||
|
||||
conv_res.document.add_page(
|
||||
page_no=page_no,
|
||||
size=Size(width=pg_width, height=pg_height),
|
||||
image=ImageRef.from_pil(image=page.image, dpi=72)
|
||||
if page.image
|
||||
else None,
|
||||
)
|
||||
|
||||
for item, level in page_doc.iterate_items():
|
||||
item.prov = [
|
||||
ProvenanceItem(
|
||||
page_no=pg_idx + 1,
|
||||
bbox=BoundingBox(
|
||||
t=0.0, b=0.0, l=0.0, r=0.0
|
||||
), # FIXME: would be nice not to have to "fake" it
|
||||
charspan=[0, 0],
|
||||
)
|
||||
]
|
||||
conv_res.document.append_child_item(child=item)
|
||||
|
||||
return conv_res.document
|
||||
|
||||
@classmethod
|
||||
def get_default_options(cls) -> VlmPipelineOptions:
|
||||
|
||||
Reference in New Issue
Block a user