feat: updated vlm pipeline (with latest changes from docling-core) (#1158)
* Draft implementation of Doctag backend

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Updated VLM pipeline doctags to docling conversion, now properly supports lists

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* preparing to migrate to new doctags deserializer

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* re-using DocTagsDocument.from_doctags_and_image_pairs

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* satisfying mypy and other checks

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Added support for force_backend_text parameter

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* removed unnecessary transformation

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Cleaned up

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Update tests

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Updated readme

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

---------

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
1a2a9e4eff
commit
2f72167ff6
@ -34,12 +34,12 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|||||||
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
||||||
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
||||||
* 🔍 Extensive OCR support for scanned PDFs and images
|
* 🔍 Extensive OCR support for scanned PDFs and images
|
||||||
|
* 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
|
||||||
* 💻 Simple and convenient CLI
|
* 💻 Simple and convenient CLI
|
||||||
|
|
||||||
### Coming soon
|
### Coming soon
|
||||||
|
|
||||||
* 📝 Metadata extraction, including title, authors, references & language
|
* 📝 Metadata extraction, including title, authors, references & language
|
||||||
* 📝 Inclusion of Visual Language Models ([SmolDocling](https://huggingface.co/blog/smolervlm#smoldocling))
|
|
||||||
* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
|
* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc)
|
||||||
* 📝 Complex chemistry understanding (Molecular structures)
|
* 📝 Complex chemistry understanding (Molecular structures)
|
||||||
|
|
||||||
|
@ -1,41 +1,20 @@
|
|||||||
import itertools
|
|
||||||
import logging
|
import logging
|
||||||
import re
|
|
||||||
import warnings
|
import warnings
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
|
||||||
# from io import BytesIO
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import List, Optional, Union, cast
|
||||||
|
|
||||||
from docling_core.types import DoclingDocument
|
# from docling_core.types import DoclingDocument
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import BoundingBox, DocItem, ImageRef, PictureItem, TextItem
|
||||||
BoundingBox,
|
from docling_core.types.doc.document import DocTagsDocument
|
||||||
DocItem,
|
from PIL import Image as PILImage
|
||||||
DocItemLabel,
|
|
||||||
DoclingDocument,
|
|
||||||
GroupLabel,
|
|
||||||
ImageRef,
|
|
||||||
ImageRefMode,
|
|
||||||
PictureItem,
|
|
||||||
ProvenanceItem,
|
|
||||||
Size,
|
|
||||||
TableCell,
|
|
||||||
TableData,
|
|
||||||
TableItem,
|
|
||||||
)
|
|
||||||
from docling_core.types.doc.tokens import DocumentToken, TableToken
|
|
||||||
|
|
||||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||||
from docling.backend.md_backend import MarkdownDocumentBackend
|
from docling.backend.md_backend import MarkdownDocumentBackend
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat, Page
|
from docling.datamodel.base_models import InputFormat, Page
|
||||||
from docling.datamodel.document import ConversionResult, InputDocument
|
from docling.datamodel.document import ConversionResult, InputDocument
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import ResponseFormat, VlmPipelineOptions
|
||||||
PdfPipelineOptions,
|
|
||||||
ResponseFormat,
|
|
||||||
VlmPipelineOptions,
|
|
||||||
)
|
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.hf_vlm_model import HuggingFaceVlmModel
|
from docling.models.hf_vlm_model import HuggingFaceVlmModel
|
||||||
from docling.pipeline.base_pipeline import PaginatedPipeline
|
from docling.pipeline.base_pipeline import PaginatedPipeline
|
||||||
@ -100,6 +79,15 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
|
|
||||||
return page
|
return page
|
||||||
|
|
||||||
|
def extract_text_from_backend(self, page: Page, bbox: BoundingBox | None) -> str:
|
||||||
|
# Convert bounding box normalized to 0-100 into page coordinates for cropping
|
||||||
|
text = ""
|
||||||
|
if bbox:
|
||||||
|
if page.size:
|
||||||
|
if page._backend:
|
||||||
|
text = page._backend.get_text_in_rect(bbox)
|
||||||
|
return text
|
||||||
|
|
||||||
def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
|
def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
|
||||||
with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
|
with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
|
||||||
|
|
||||||
@ -107,7 +95,45 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
self.pipeline_options.vlm_options.response_format
|
self.pipeline_options.vlm_options.response_format
|
||||||
== ResponseFormat.DOCTAGS
|
== ResponseFormat.DOCTAGS
|
||||||
):
|
):
|
||||||
conv_res.document = self._turn_tags_into_doc(conv_res.pages)
|
doctags_list = []
|
||||||
|
image_list = []
|
||||||
|
for page in conv_res.pages:
|
||||||
|
predicted_doctags = ""
|
||||||
|
img = PILImage.new("RGB", (1, 1), "rgb(255,255,255)")
|
||||||
|
if page.predictions.vlm_response:
|
||||||
|
predicted_doctags = page.predictions.vlm_response.text
|
||||||
|
if page.image:
|
||||||
|
img = page.image
|
||||||
|
image_list.append(img)
|
||||||
|
doctags_list.append(predicted_doctags)
|
||||||
|
|
||||||
|
doctags_list_c = cast(List[Union[Path, str]], doctags_list)
|
||||||
|
image_list_c = cast(List[Union[Path, PILImage.Image]], image_list)
|
||||||
|
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
|
||||||
|
doctags_list_c, image_list_c
|
||||||
|
)
|
||||||
|
conv_res.document.load_from_doctags(doctags_doc)
|
||||||
|
|
||||||
|
# If forced backend text, replace model predicted text with backend one
|
||||||
|
if page.size:
|
||||||
|
if self.force_backend_text:
|
||||||
|
scale = self.pipeline_options.images_scale
|
||||||
|
for element, _level in conv_res.document.iterate_items():
|
||||||
|
if (
|
||||||
|
not isinstance(element, TextItem)
|
||||||
|
or len(element.prov) == 0
|
||||||
|
):
|
||||||
|
continue
|
||||||
|
crop_bbox = (
|
||||||
|
element.prov[0]
|
||||||
|
.bbox.scaled(scale=scale)
|
||||||
|
.to_top_left_origin(
|
||||||
|
page_height=page.size.height * scale
|
||||||
|
)
|
||||||
|
)
|
||||||
|
txt = self.extract_text_from_backend(page, crop_bbox)
|
||||||
|
element.text = txt
|
||||||
|
element.orig = txt
|
||||||
elif (
|
elif (
|
||||||
self.pipeline_options.vlm_options.response_format
|
self.pipeline_options.vlm_options.response_format
|
||||||
== ResponseFormat.MARKDOWN
|
== ResponseFormat.MARKDOWN
|
||||||
@ -165,366 +191,6 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
)
|
)
|
||||||
return backend.convert()
|
return backend.convert()
|
||||||
|
|
||||||
def _turn_tags_into_doc(self, pages: list[Page]) -> DoclingDocument:
|
|
||||||
###############################################
|
|
||||||
# Tag definitions and color mappings
|
|
||||||
###############################################
|
|
||||||
|
|
||||||
# Maps the recognized tag to a Docling label.
|
|
||||||
# Code items will be given DocItemLabel.CODE
|
|
||||||
tag_to_doclabel = {
|
|
||||||
"title": DocItemLabel.TITLE,
|
|
||||||
"document_index": DocItemLabel.DOCUMENT_INDEX,
|
|
||||||
"otsl": DocItemLabel.TABLE,
|
|
||||||
"section_header_level_1": DocItemLabel.SECTION_HEADER,
|
|
||||||
"checkbox_selected": DocItemLabel.CHECKBOX_SELECTED,
|
|
||||||
"checkbox_unselected": DocItemLabel.CHECKBOX_UNSELECTED,
|
|
||||||
"text": DocItemLabel.TEXT,
|
|
||||||
"page_header": DocItemLabel.PAGE_HEADER,
|
|
||||||
"page_footer": DocItemLabel.PAGE_FOOTER,
|
|
||||||
"formula": DocItemLabel.FORMULA,
|
|
||||||
"caption": DocItemLabel.CAPTION,
|
|
||||||
"picture": DocItemLabel.PICTURE,
|
|
||||||
"list_item": DocItemLabel.LIST_ITEM,
|
|
||||||
"footnote": DocItemLabel.FOOTNOTE,
|
|
||||||
"code": DocItemLabel.CODE,
|
|
||||||
}
|
|
||||||
|
|
||||||
# Maps each tag to an associated bounding box color.
|
|
||||||
tag_to_color = {
|
|
||||||
"title": "blue",
|
|
||||||
"document_index": "darkblue",
|
|
||||||
"otsl": "green",
|
|
||||||
"section_header_level_1": "purple",
|
|
||||||
"checkbox_selected": "black",
|
|
||||||
"checkbox_unselected": "gray",
|
|
||||||
"text": "red",
|
|
||||||
"page_header": "orange",
|
|
||||||
"page_footer": "cyan",
|
|
||||||
"formula": "pink",
|
|
||||||
"caption": "magenta",
|
|
||||||
"picture": "yellow",
|
|
||||||
"list_item": "brown",
|
|
||||||
"footnote": "darkred",
|
|
||||||
"code": "lightblue",
|
|
||||||
}
|
|
||||||
|
|
||||||
def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
|
|
||||||
"""Extracts <loc_...> bounding box coords from the chunk, normalized by / 500."""
|
|
||||||
coords = re.findall(r"<loc_(\d+)>", text_chunk)
|
|
||||||
if len(coords) == 4:
|
|
||||||
l, t, r, b = map(float, coords)
|
|
||||||
return BoundingBox(l=l / 500, t=t / 500, r=r / 500, b=b / 500)
|
|
||||||
return None
|
|
||||||
|
|
||||||
def extract_inner_text(text_chunk: str) -> str:
|
|
||||||
"""Strips all <...> tags inside the chunk to get the raw text content."""
|
|
||||||
return re.sub(r"<.*?>", "", text_chunk, flags=re.DOTALL).strip()
|
|
||||||
|
|
||||||
def extract_text_from_backend(page: Page, bbox: BoundingBox | None) -> str:
|
|
||||||
# Convert bounding box normalized to 0-100 into page coordinates for cropping
|
|
||||||
text = ""
|
|
||||||
if bbox:
|
|
||||||
if page.size:
|
|
||||||
bbox.l = bbox.l * page.size.width
|
|
||||||
bbox.t = bbox.t * page.size.height
|
|
||||||
bbox.r = bbox.r * page.size.width
|
|
||||||
bbox.b = bbox.b * page.size.height
|
|
||||||
if page._backend:
|
|
||||||
text = page._backend.get_text_in_rect(bbox)
|
|
||||||
return text
|
|
||||||
|
|
||||||
def otsl_parse_texts(texts, tokens):
|
|
||||||
split_word = TableToken.OTSL_NL.value
|
|
||||||
split_row_tokens = [
|
|
||||||
list(y)
|
|
||||||
for x, y in itertools.groupby(tokens, lambda z: z == split_word)
|
|
||||||
if not x
|
|
||||||
]
|
|
||||||
table_cells = []
|
|
||||||
r_idx = 0
|
|
||||||
c_idx = 0
|
|
||||||
|
|
||||||
def count_right(tokens, c_idx, r_idx, which_tokens):
|
|
||||||
span = 0
|
|
||||||
c_idx_iter = c_idx
|
|
||||||
while tokens[r_idx][c_idx_iter] in which_tokens:
|
|
||||||
c_idx_iter += 1
|
|
||||||
span += 1
|
|
||||||
if c_idx_iter >= len(tokens[r_idx]):
|
|
||||||
return span
|
|
||||||
return span
|
|
||||||
|
|
||||||
def count_down(tokens, c_idx, r_idx, which_tokens):
|
|
||||||
span = 0
|
|
||||||
r_idx_iter = r_idx
|
|
||||||
while tokens[r_idx_iter][c_idx] in which_tokens:
|
|
||||||
r_idx_iter += 1
|
|
||||||
span += 1
|
|
||||||
if r_idx_iter >= len(tokens):
|
|
||||||
return span
|
|
||||||
return span
|
|
||||||
|
|
||||||
for i, text in enumerate(texts):
|
|
||||||
cell_text = ""
|
|
||||||
if text in [
|
|
||||||
TableToken.OTSL_FCEL.value,
|
|
||||||
TableToken.OTSL_ECEL.value,
|
|
||||||
TableToken.OTSL_CHED.value,
|
|
||||||
TableToken.OTSL_RHED.value,
|
|
||||||
TableToken.OTSL_SROW.value,
|
|
||||||
]:
|
|
||||||
row_span = 1
|
|
||||||
col_span = 1
|
|
||||||
right_offset = 1
|
|
||||||
if text != TableToken.OTSL_ECEL.value:
|
|
||||||
cell_text = texts[i + 1]
|
|
||||||
right_offset = 2
|
|
||||||
|
|
||||||
# Check next element(s) for lcel / ucel / xcel, set properly row_span, col_span
|
|
||||||
next_right_cell = ""
|
|
||||||
if i + right_offset < len(texts):
|
|
||||||
next_right_cell = texts[i + right_offset]
|
|
||||||
|
|
||||||
next_bottom_cell = ""
|
|
||||||
if r_idx + 1 < len(split_row_tokens):
|
|
||||||
if c_idx < len(split_row_tokens[r_idx + 1]):
|
|
||||||
next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
|
|
||||||
|
|
||||||
if next_right_cell in [
|
|
||||||
TableToken.OTSL_LCEL.value,
|
|
||||||
TableToken.OTSL_XCEL.value,
|
|
||||||
]:
|
|
||||||
# we have a horizontal spanning cell or 2d spanning cell
|
|
||||||
col_span += count_right(
|
|
||||||
split_row_tokens,
|
|
||||||
c_idx + 1,
|
|
||||||
r_idx,
|
|
||||||
[TableToken.OTSL_LCEL.value, TableToken.OTSL_XCEL.value],
|
|
||||||
)
|
|
||||||
if next_bottom_cell in [
|
|
||||||
TableToken.OTSL_UCEL.value,
|
|
||||||
TableToken.OTSL_XCEL.value,
|
|
||||||
]:
|
|
||||||
# we have a vertical spanning cell or 2d spanning cell
|
|
||||||
row_span += count_down(
|
|
||||||
split_row_tokens,
|
|
||||||
c_idx,
|
|
||||||
r_idx + 1,
|
|
||||||
[TableToken.OTSL_UCEL.value, TableToken.OTSL_XCEL.value],
|
|
||||||
)
|
|
||||||
|
|
||||||
table_cells.append(
|
|
||||||
TableCell(
|
|
||||||
text=cell_text.strip(),
|
|
||||||
row_span=row_span,
|
|
||||||
col_span=col_span,
|
|
||||||
start_row_offset_idx=r_idx,
|
|
||||||
end_row_offset_idx=r_idx + row_span,
|
|
||||||
start_col_offset_idx=c_idx,
|
|
||||||
end_col_offset_idx=c_idx + col_span,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
if text in [
|
|
||||||
TableToken.OTSL_FCEL.value,
|
|
||||||
TableToken.OTSL_ECEL.value,
|
|
||||||
TableToken.OTSL_CHED.value,
|
|
||||||
TableToken.OTSL_RHED.value,
|
|
||||||
TableToken.OTSL_SROW.value,
|
|
||||||
TableToken.OTSL_LCEL.value,
|
|
||||||
TableToken.OTSL_UCEL.value,
|
|
||||||
TableToken.OTSL_XCEL.value,
|
|
||||||
]:
|
|
||||||
c_idx += 1
|
|
||||||
if text == TableToken.OTSL_NL.value:
|
|
||||||
r_idx += 1
|
|
||||||
c_idx = 0
|
|
||||||
return table_cells, split_row_tokens
|
|
||||||
|
|
||||||
def otsl_extract_tokens_and_text(s: str):
|
|
||||||
# Pattern to match anything enclosed by < > (including the angle brackets themselves)
|
|
||||||
pattern = r"(<[^>]+>)"
|
|
||||||
# Find all tokens (e.g. "<otsl>", "<loc_140>", etc.)
|
|
||||||
tokens = re.findall(pattern, s)
|
|
||||||
# Remove any tokens that start with "<loc_"
|
|
||||||
tokens = [
|
|
||||||
token
|
|
||||||
for token in tokens
|
|
||||||
if not (
|
|
||||||
token.startswith(rf"<{DocumentToken.LOC.value}")
|
|
||||||
or token
|
|
||||||
in [
|
|
||||||
rf"<{DocumentToken.OTSL.value}>",
|
|
||||||
rf"</{DocumentToken.OTSL.value}>",
|
|
||||||
]
|
|
||||||
)
|
|
||||||
]
|
|
||||||
# Split the string by those tokens to get the in-between text
|
|
||||||
text_parts = re.split(pattern, s)
|
|
||||||
text_parts = [
|
|
||||||
token
|
|
||||||
for token in text_parts
|
|
||||||
if not (
|
|
||||||
token.startswith(rf"<{DocumentToken.LOC.value}")
|
|
||||||
or token
|
|
||||||
in [
|
|
||||||
rf"<{DocumentToken.OTSL.value}>",
|
|
||||||
rf"</{DocumentToken.OTSL.value}>",
|
|
||||||
]
|
|
||||||
)
|
|
||||||
]
|
|
||||||
# Remove any empty or purely whitespace strings from text_parts
|
|
||||||
text_parts = [part for part in text_parts if part.strip()]
|
|
||||||
|
|
||||||
return tokens, text_parts
|
|
||||||
|
|
||||||
def parse_table_content(otsl_content: str) -> TableData:
|
|
||||||
tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
|
|
||||||
table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)
|
|
||||||
|
|
||||||
return TableData(
|
|
||||||
num_rows=len(split_row_tokens),
|
|
||||||
num_cols=(
|
|
||||||
max(len(row) for row in split_row_tokens) if split_row_tokens else 0
|
|
||||||
),
|
|
||||||
table_cells=table_cells,
|
|
||||||
)
|
|
||||||
|
|
||||||
doc = DoclingDocument(name="Document")
|
|
||||||
for pg_idx, page in enumerate(pages):
|
|
||||||
xml_content = ""
|
|
||||||
predicted_text = ""
|
|
||||||
if page.predictions.vlm_response:
|
|
||||||
predicted_text = page.predictions.vlm_response.text
|
|
||||||
image = page.image
|
|
||||||
|
|
||||||
page_no = pg_idx + 1
|
|
||||||
bounding_boxes = []
|
|
||||||
|
|
||||||
if page.size:
|
|
||||||
pg_width = page.size.width
|
|
||||||
pg_height = page.size.height
|
|
||||||
size = Size(width=pg_width, height=pg_height)
|
|
||||||
parent_page = doc.add_page(page_no=page_no, size=size)
|
|
||||||
|
|
||||||
"""
|
|
||||||
1. Finds all <tag>...</tag> blocks in the entire string (multi-line friendly) in the order they appear.
|
|
||||||
2. For each chunk, extracts bounding box (if any) and inner text.
|
|
||||||
3. Adds the item to a DoclingDocument structure with the right label.
|
|
||||||
4. Tracks bounding boxes + color in a separate list for later visualization.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Regex for all recognized tags
|
|
||||||
tag_pattern = (
|
|
||||||
rf"<(?P<tag>{DocItemLabel.TITLE}|{DocItemLabel.DOCUMENT_INDEX}|"
|
|
||||||
rf"{DocItemLabel.CHECKBOX_UNSELECTED}|{DocItemLabel.CHECKBOX_SELECTED}|"
|
|
||||||
rf"{DocItemLabel.TEXT}|{DocItemLabel.PAGE_HEADER}|"
|
|
||||||
rf"{DocItemLabel.PAGE_FOOTER}|{DocItemLabel.FORMULA}|"
|
|
||||||
rf"{DocItemLabel.CAPTION}|{DocItemLabel.PICTURE}|"
|
|
||||||
rf"{DocItemLabel.LIST_ITEM}|{DocItemLabel.FOOTNOTE}|{DocItemLabel.CODE}|"
|
|
||||||
rf"{DocItemLabel.SECTION_HEADER}_level_1|{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
|
|
||||||
)
|
|
||||||
|
|
||||||
# DocumentToken.OTSL
|
|
||||||
pattern = re.compile(tag_pattern, re.DOTALL)
|
|
||||||
|
|
||||||
# Go through each match in order
|
|
||||||
for match in pattern.finditer(predicted_text):
|
|
||||||
full_chunk = match.group(0)
|
|
||||||
tag_name = match.group("tag")
|
|
||||||
|
|
||||||
bbox = extract_bounding_box(full_chunk)
|
|
||||||
doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)
|
|
||||||
color = tag_to_color.get(tag_name, "white")
|
|
||||||
|
|
||||||
# Store bounding box + color
|
|
||||||
if bbox:
|
|
||||||
bounding_boxes.append((bbox, color))
|
|
||||||
|
|
||||||
if tag_name == DocumentToken.OTSL.value:
|
|
||||||
table_data = parse_table_content(full_chunk)
|
|
||||||
bbox = extract_bounding_box(full_chunk)
|
|
||||||
|
|
||||||
if bbox:
|
|
||||||
prov = ProvenanceItem(
|
|
||||||
bbox=bbox.resize_by_scale(pg_width, pg_height),
|
|
||||||
charspan=(0, 0),
|
|
||||||
page_no=page_no,
|
|
||||||
)
|
|
||||||
doc.add_table(data=table_data, prov=prov)
|
|
||||||
else:
|
|
||||||
doc.add_table(data=table_data)
|
|
||||||
|
|
||||||
elif tag_name == DocItemLabel.PICTURE:
|
|
||||||
text_caption_content = extract_inner_text(full_chunk)
|
|
||||||
if image:
|
|
||||||
if bbox:
|
|
||||||
im_width, im_height = image.size
|
|
||||||
|
|
||||||
crop_box = (
|
|
||||||
int(bbox.l * im_width),
|
|
||||||
int(bbox.t * im_height),
|
|
||||||
int(bbox.r * im_width),
|
|
||||||
int(bbox.b * im_height),
|
|
||||||
)
|
|
||||||
cropped_image = image.crop(crop_box)
|
|
||||||
pic = doc.add_picture(
|
|
||||||
parent=None,
|
|
||||||
image=ImageRef.from_pil(image=cropped_image, dpi=72),
|
|
||||||
prov=(
|
|
||||||
ProvenanceItem(
|
|
||||||
bbox=bbox.resize_by_scale(pg_width, pg_height),
|
|
||||||
charspan=(0, 0),
|
|
||||||
page_no=page_no,
|
|
||||||
)
|
|
||||||
),
|
|
||||||
)
|
|
||||||
# If there is a caption to an image, add it as well
|
|
||||||
if len(text_caption_content) > 0:
|
|
||||||
caption_item = doc.add_text(
|
|
||||||
label=DocItemLabel.CAPTION,
|
|
||||||
text=text_caption_content,
|
|
||||||
parent=None,
|
|
||||||
)
|
|
||||||
pic.captions.append(caption_item.get_ref())
|
|
||||||
else:
|
|
||||||
if bbox:
|
|
||||||
# In case we don't have access to a binary of an image
|
|
||||||
doc.add_picture(
|
|
||||||
parent=None,
|
|
||||||
prov=ProvenanceItem(
|
|
||||||
bbox=bbox, charspan=(0, 0), page_no=page_no
|
|
||||||
),
|
|
||||||
)
|
|
||||||
# If there is a caption to an image, add it as well
|
|
||||||
if len(text_caption_content) > 0:
|
|
||||||
caption_item = doc.add_text(
|
|
||||||
label=DocItemLabel.CAPTION,
|
|
||||||
text=text_caption_content,
|
|
||||||
parent=None,
|
|
||||||
)
|
|
||||||
pic.captions.append(caption_item.get_ref())
|
|
||||||
else:
|
|
||||||
# For everything else, treat as text
|
|
||||||
if self.force_backend_text:
|
|
||||||
text_content = extract_text_from_backend(page, bbox)
|
|
||||||
else:
|
|
||||||
text_content = extract_inner_text(full_chunk)
|
|
||||||
doc.add_text(
|
|
||||||
label=doc_label,
|
|
||||||
text=text_content,
|
|
||||||
prov=(
|
|
||||||
ProvenanceItem(
|
|
||||||
bbox=bbox.resize_by_scale(pg_width, pg_height),
|
|
||||||
charspan=(0, len(text_content)),
|
|
||||||
page_no=page_no,
|
|
||||||
)
|
|
||||||
if bbox
|
|
||||||
else None
|
|
||||||
),
|
|
||||||
)
|
|
||||||
return doc
|
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_default_options(cls) -> VlmPipelineOptions:
|
def get_default_options(cls) -> VlmPipelineOptions:
|
||||||
return VlmPipelineOptions()
|
return VlmPipelineOptions()
|
||||||
|
10
poetry.lock
generated
10
poetry.lock
generated
@ -1,4 +1,4 @@
|
|||||||
# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand.
|
# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand.
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "accelerate"
|
name = "accelerate"
|
||||||
@ -870,13 +870,13 @@ files = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "docling-core"
|
name = "docling-core"
|
||||||
version = "2.23.0"
|
version = "2.23.1"
|
||||||
description = "A python library to define and validate data types in Docling."
|
description = "A python library to define and validate data types in Docling."
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = "<4.0,>=3.9"
|
python-versions = "<4.0,>=3.9"
|
||||||
files = [
|
files = [
|
||||||
{file = "docling_core-2.23.0-py3-none-any.whl", hash = "sha256:de17e2821216cc1817f99e226d2bad28f226289644fbffdf442ad282c842a79a"},
|
{file = "docling_core-2.23.1-py3-none-any.whl", hash = "sha256:4a3f7bcc55a735a070d69d74cf1278f7e40cb403c5059d4149672c7ca163992f"},
|
||||||
{file = "docling_core-2.23.0.tar.gz", hash = "sha256:16a5dbca0a639aa5c49b58ceb7a98e7e1dd24cd956912c68f573f77164c96526"},
|
{file = "docling_core-2.23.1.tar.gz", hash = "sha256:0708f4ffe61faef9a2dee48e71cf3890248bf1d9b409f6414cd9c0dd6c7a1681"},
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
@ -7838,4 +7838,4 @@ vlm = ["accelerate", "transformers", "transformers"]
|
|||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.9"
|
python-versions = "^3.9"
|
||||||
content-hash = "a9ace62bd5b629cb2f20186b750d7c63f383f37f2e3df04cfcc821fc83c877b8"
|
content-hash = "16324c95a8aae1a710c4151e509c59e9a97d8bb97d4c726861ab3215fbea0a9d"
|
||||||
|
@ -46,7 +46,7 @@ packages = [{ include = "docling" }]
|
|||||||
######################
|
######################
|
||||||
python = "^3.9"
|
python = "^3.9"
|
||||||
pydantic = "^2.0.0"
|
pydantic = "^2.0.0"
|
||||||
docling-core = {extras = ["chunking"], version = "^2.23.0"}
|
docling-core = {extras = ["chunking"], version = "^2.23.1"}
|
||||||
docling-ibm-models = "^3.4.0"
|
docling-ibm-models = "^3.4.0"
|
||||||
docling-parse = "^4.0.0"
|
docling-parse = "^4.0.0"
|
||||||
filetype = "^1.2.0"
|
filetype = "^1.2.0"
|
||||||
|
@ -2,7 +2,7 @@
|
|||||||
<html lang="en">
|
<html lang="en">
|
||||||
<head>
|
<head>
|
||||||
<link rel="icon" type="image/png"
|
<link rel="icon" type="image/png"
|
||||||
href="https://ds4sd.github.io/docling/assets/logo.png"/>
|
href="https://raw.githubusercontent.com/docling-project/docling/refs/heads/main/docs/assets/logo.svg"/>
|
||||||
<meta charset="UTF-8">
|
<meta charset="UTF-8">
|
||||||
<title>
|
<title>
|
||||||
Powered by Docling
|
Powered by Docling
|
||||||
|
Loading…
Reference in New Issue
Block a user