diff --git a/README.md b/README.md index 040df18..358968c 100644 --- a/README.md +++ b/README.md @@ -34,12 +34,12 @@ Docling simplifies document processing, parsing diverse formats — including ad * 🔒 Local execution capabilities for sensitive data and air-gapped environments * 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI * 🔍 Extensive OCR support for scanned PDFs and images +* 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) * 💻 Simple and convenient CLI ### Coming soon * 📝 Metadata extraction, including title, authors, references & language -* 📝 Inclusion of Visual Language Models ([SmolDocling](https://huggingface.co/blog/smolervlm#smoldocling)) * 📝 Chart understanding (Barchart, Piechart, LinePlot, etc) * 📝 Complex chemistry understanding (Molecular structures) diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py index c6920b6..4afb918 100644 --- a/docling/pipeline/vlm_pipeline.py +++ b/docling/pipeline/vlm_pipeline.py @@ -1,41 +1,20 @@ -import itertools import logging -import re import warnings from io import BytesIO - -# from io import BytesIO from pathlib import Path -from typing import Optional +from typing import List, Optional, Union, cast -from docling_core.types import DoclingDocument -from docling_core.types.doc import ( - BoundingBox, - DocItem, - DocItemLabel, - DoclingDocument, - GroupLabel, - ImageRef, - ImageRefMode, - PictureItem, - ProvenanceItem, - Size, - TableCell, - TableData, - TableItem, -) -from docling_core.types.doc.tokens import DocumentToken, TableToken +# from docling_core.types import DoclingDocument +from docling_core.types.doc import BoundingBox, DocItem, ImageRef, PictureItem, TextItem +from docling_core.types.doc.document import DocTagsDocument +from PIL import Image as PILImage from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.md_backend import MarkdownDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend from docling.datamodel.base_models import InputFormat, Page from docling.datamodel.document import ConversionResult, InputDocument -from docling.datamodel.pipeline_options import ( - PdfPipelineOptions, - ResponseFormat, - VlmPipelineOptions, -) +from docling.datamodel.pipeline_options import ResponseFormat, VlmPipelineOptions from docling.datamodel.settings import settings from docling.models.hf_vlm_model import HuggingFaceVlmModel from docling.pipeline.base_pipeline import PaginatedPipeline @@ -100,6 +79,15 @@ class VlmPipeline(PaginatedPipeline): return page + def extract_text_from_backend(self, page: Page, bbox: BoundingBox | None) -> str: + # Convert bounding box normalized to 0-100 into page coordinates for cropping + text = "" + if bbox: + if page.size: + if page._backend: + text = page._backend.get_text_in_rect(bbox) + return text + def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult: with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT): @@ -107,7 +95,45 @@ class VlmPipeline(PaginatedPipeline): self.pipeline_options.vlm_options.response_format == ResponseFormat.DOCTAGS ): - conv_res.document = self._turn_tags_into_doc(conv_res.pages) + doctags_list = [] + image_list = [] + for page in conv_res.pages: + predicted_doctags = "" + img = PILImage.new("RGB", (1, 1), "rgb(255,255,255)") + if page.predictions.vlm_response: + predicted_doctags = page.predictions.vlm_response.text + if page.image: + img = page.image + image_list.append(img) + doctags_list.append(predicted_doctags) + + doctags_list_c = cast(List[Union[Path, str]], doctags_list) + image_list_c = cast(List[Union[Path, PILImage.Image]], image_list) + doctags_doc = DocTagsDocument.from_doctags_and_image_pairs( + doctags_list_c, image_list_c + ) + conv_res.document.load_from_doctags(doctags_doc) + + # If forced backend text, replace model predicted text with backend one + if page.size: + if self.force_backend_text: + scale = self.pipeline_options.images_scale + for element, _level in conv_res.document.iterate_items(): + if ( + not isinstance(element, TextItem) + or len(element.prov) == 0 + ): + continue + crop_bbox = ( + element.prov[0] + .bbox.scaled(scale=scale) + .to_top_left_origin( + page_height=page.size.height * scale + ) + ) + txt = self.extract_text_from_backend(page, crop_bbox) + element.text = txt + element.orig = txt elif ( self.pipeline_options.vlm_options.response_format == ResponseFormat.MARKDOWN @@ -165,366 +191,6 @@ class VlmPipeline(PaginatedPipeline): ) return backend.convert() - def _turn_tags_into_doc(self, pages: list[Page]) -> DoclingDocument: - ############################################### - # Tag definitions and color mappings - ############################################### - - # Maps the recognized tag to a Docling label. - # Code items will be given DocItemLabel.CODE - tag_to_doclabel = { - "title": DocItemLabel.TITLE, - "document_index": DocItemLabel.DOCUMENT_INDEX, - "otsl": DocItemLabel.TABLE, - "section_header_level_1": DocItemLabel.SECTION_HEADER, - "checkbox_selected": DocItemLabel.CHECKBOX_SELECTED, - "checkbox_unselected": DocItemLabel.CHECKBOX_UNSELECTED, - "text": DocItemLabel.TEXT, - "page_header": DocItemLabel.PAGE_HEADER, - "page_footer": DocItemLabel.PAGE_FOOTER, - "formula": DocItemLabel.FORMULA, - "caption": DocItemLabel.CAPTION, - "picture": DocItemLabel.PICTURE, - "list_item": DocItemLabel.LIST_ITEM, - "footnote": DocItemLabel.FOOTNOTE, - "code": DocItemLabel.CODE, - } - - # Maps each tag to an associated bounding box color. - tag_to_color = { - "title": "blue", - "document_index": "darkblue", - "otsl": "green", - "section_header_level_1": "purple", - "checkbox_selected": "black", - "checkbox_unselected": "gray", - "text": "red", - "page_header": "orange", - "page_footer": "cyan", - "formula": "pink", - "caption": "magenta", - "picture": "yellow", - "list_item": "brown", - "footnote": "darkred", - "code": "lightblue", - } - - def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]: - """Extracts bounding box coords from the chunk, normalized by / 500.""" - coords = re.findall(r"", text_chunk) - if len(coords) == 4: - l, t, r, b = map(float, coords) - return BoundingBox(l=l / 500, t=t / 500, r=r / 500, b=b / 500) - return None - - def extract_inner_text(text_chunk: str) -> str: - """Strips all <...> tags inside the chunk to get the raw text content.""" - return re.sub(r"<.*?>", "", text_chunk, flags=re.DOTALL).strip() - - def extract_text_from_backend(page: Page, bbox: BoundingBox | None) -> str: - # Convert bounding box normalized to 0-100 into page coordinates for cropping - text = "" - if bbox: - if page.size: - bbox.l = bbox.l * page.size.width - bbox.t = bbox.t * page.size.height - bbox.r = bbox.r * page.size.width - bbox.b = bbox.b * page.size.height - if page._backend: - text = page._backend.get_text_in_rect(bbox) - return text - - def otsl_parse_texts(texts, tokens): - split_word = TableToken.OTSL_NL.value - split_row_tokens = [ - list(y) - for x, y in itertools.groupby(tokens, lambda z: z == split_word) - if not x - ] - table_cells = [] - r_idx = 0 - c_idx = 0 - - def count_right(tokens, c_idx, r_idx, which_tokens): - span = 0 - c_idx_iter = c_idx - while tokens[r_idx][c_idx_iter] in which_tokens: - c_idx_iter += 1 - span += 1 - if c_idx_iter >= len(tokens[r_idx]): - return span - return span - - def count_down(tokens, c_idx, r_idx, which_tokens): - span = 0 - r_idx_iter = r_idx - while tokens[r_idx_iter][c_idx] in which_tokens: - r_idx_iter += 1 - span += 1 - if r_idx_iter >= len(tokens): - return span - return span - - for i, text in enumerate(texts): - cell_text = "" - if text in [ - TableToken.OTSL_FCEL.value, - TableToken.OTSL_ECEL.value, - TableToken.OTSL_CHED.value, - TableToken.OTSL_RHED.value, - TableToken.OTSL_SROW.value, - ]: - row_span = 1 - col_span = 1 - right_offset = 1 - if text != TableToken.OTSL_ECEL.value: - cell_text = texts[i + 1] - right_offset = 2 - - # Check next element(s) for lcel / ucel / xcel, set properly row_span, col_span - next_right_cell = "" - if i + right_offset < len(texts): - next_right_cell = texts[i + right_offset] - - next_bottom_cell = "" - if r_idx + 1 < len(split_row_tokens): - if c_idx < len(split_row_tokens[r_idx + 1]): - next_bottom_cell = split_row_tokens[r_idx + 1][c_idx] - - if next_right_cell in [ - TableToken.OTSL_LCEL.value, - TableToken.OTSL_XCEL.value, - ]: - # we have horisontal spanning cell or 2d spanning cell - col_span += count_right( - split_row_tokens, - c_idx + 1, - r_idx, - [TableToken.OTSL_LCEL.value, TableToken.OTSL_XCEL.value], - ) - if next_bottom_cell in [ - TableToken.OTSL_UCEL.value, - TableToken.OTSL_XCEL.value, - ]: - # we have a vertical spanning cell or 2d spanning cell - row_span += count_down( - split_row_tokens, - c_idx, - r_idx + 1, - [TableToken.OTSL_UCEL.value, TableToken.OTSL_XCEL.value], - ) - - table_cells.append( - TableCell( - text=cell_text.strip(), - row_span=row_span, - col_span=col_span, - start_row_offset_idx=r_idx, - end_row_offset_idx=r_idx + row_span, - start_col_offset_idx=c_idx, - end_col_offset_idx=c_idx + col_span, - ) - ) - if text in [ - TableToken.OTSL_FCEL.value, - TableToken.OTSL_ECEL.value, - TableToken.OTSL_CHED.value, - TableToken.OTSL_RHED.value, - TableToken.OTSL_SROW.value, - TableToken.OTSL_LCEL.value, - TableToken.OTSL_UCEL.value, - TableToken.OTSL_XCEL.value, - ]: - c_idx += 1 - if text == TableToken.OTSL_NL.value: - r_idx += 1 - c_idx = 0 - return table_cells, split_row_tokens - - def otsl_extract_tokens_and_text(s: str): - # Pattern to match anything enclosed by < > (including the angle brackets themselves) - pattern = r"(<[^>]+>)" - # Find all tokens (e.g. "", "", etc.) - tokens = re.findall(pattern, s) - # Remove any tokens that start with "", - rf"", - ] - ) - ] - # Split the string by those tokens to get the in-between text - text_parts = re.split(pattern, s) - text_parts = [ - token - for token in text_parts - if not ( - token.startswith(rf"<{DocumentToken.LOC.value}") - or token - in [ - rf"<{DocumentToken.OTSL.value}>", - rf"", - ] - ) - ] - # Remove any empty or purely whitespace strings from text_parts - text_parts = [part for part in text_parts if part.strip()] - - return tokens, text_parts - - def parse_table_content(otsl_content: str) -> TableData: - tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content) - table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens) - - return TableData( - num_rows=len(split_row_tokens), - num_cols=( - max(len(row) for row in split_row_tokens) if split_row_tokens else 0 - ), - table_cells=table_cells, - ) - - doc = DoclingDocument(name="Document") - for pg_idx, page in enumerate(pages): - xml_content = "" - predicted_text = "" - if page.predictions.vlm_response: - predicted_text = page.predictions.vlm_response.text - image = page.image - - page_no = pg_idx + 1 - bounding_boxes = [] - - if page.size: - pg_width = page.size.width - pg_height = page.size.height - size = Size(width=pg_width, height=pg_height) - parent_page = doc.add_page(page_no=page_no, size=size) - - """ - 1. Finds all ... blocks in the entire string (multi-line friendly) in the order they appear. - 2. For each chunk, extracts bounding box (if any) and inner text. - 3. Adds the item to a DoclingDocument structure with the right label. - 4. Tracks bounding boxes + color in a separate list for later visualization. - """ - - # Regex for all recognized tags - tag_pattern = ( - rf"<(?P{DocItemLabel.TITLE}|{DocItemLabel.DOCUMENT_INDEX}|" - rf"{DocItemLabel.CHECKBOX_UNSELECTED}|{DocItemLabel.CHECKBOX_SELECTED}|" - rf"{DocItemLabel.TEXT}|{DocItemLabel.PAGE_HEADER}|" - rf"{DocItemLabel.PAGE_FOOTER}|{DocItemLabel.FORMULA}|" - rf"{DocItemLabel.CAPTION}|{DocItemLabel.PICTURE}|" - rf"{DocItemLabel.LIST_ITEM}|{DocItemLabel.FOOTNOTE}|{DocItemLabel.CODE}|" - rf"{DocItemLabel.SECTION_HEADER}_level_1|{DocumentToken.OTSL.value})>.*?" - ) - - # DocumentToken.OTSL - pattern = re.compile(tag_pattern, re.DOTALL) - - # Go through each match in order - for match in pattern.finditer(predicted_text): - full_chunk = match.group(0) - tag_name = match.group("tag") - - bbox = extract_bounding_box(full_chunk) - doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH) - color = tag_to_color.get(tag_name, "white") - - # Store bounding box + color - if bbox: - bounding_boxes.append((bbox, color)) - - if tag_name == DocumentToken.OTSL.value: - table_data = parse_table_content(full_chunk) - bbox = extract_bounding_box(full_chunk) - - if bbox: - prov = ProvenanceItem( - bbox=bbox.resize_by_scale(pg_width, pg_height), - charspan=(0, 0), - page_no=page_no, - ) - doc.add_table(data=table_data, prov=prov) - else: - doc.add_table(data=table_data) - - elif tag_name == DocItemLabel.PICTURE: - text_caption_content = extract_inner_text(full_chunk) - if image: - if bbox: - im_width, im_height = image.size - - crop_box = ( - int(bbox.l * im_width), - int(bbox.t * im_height), - int(bbox.r * im_width), - int(bbox.b * im_height), - ) - cropped_image = image.crop(crop_box) - pic = doc.add_picture( - parent=None, - image=ImageRef.from_pil(image=cropped_image, dpi=72), - prov=( - ProvenanceItem( - bbox=bbox.resize_by_scale(pg_width, pg_height), - charspan=(0, 0), - page_no=page_no, - ) - ), - ) - # If there is a caption to an image, add it as well - if len(text_caption_content) > 0: - caption_item = doc.add_text( - label=DocItemLabel.CAPTION, - text=text_caption_content, - parent=None, - ) - pic.captions.append(caption_item.get_ref()) - else: - if bbox: - # In case we don't have access to an binary of an image - doc.add_picture( - parent=None, - prov=ProvenanceItem( - bbox=bbox, charspan=(0, 0), page_no=page_no - ), - ) - # If there is a caption to an image, add it as well - if len(text_caption_content) > 0: - caption_item = doc.add_text( - label=DocItemLabel.CAPTION, - text=text_caption_content, - parent=None, - ) - pic.captions.append(caption_item.get_ref()) - else: - # For everything else, treat as text - if self.force_backend_text: - text_content = extract_text_from_backend(page, bbox) - else: - text_content = extract_inner_text(full_chunk) - doc.add_text( - label=doc_label, - text=text_content, - prov=( - ProvenanceItem( - bbox=bbox.resize_by_scale(pg_width, pg_height), - charspan=(0, len(text_content)), - page_no=page_no, - ) - if bbox - else None - ), - ) - return doc - @classmethod def get_default_options(cls) -> VlmPipelineOptions: return VlmPipelineOptions() diff --git a/poetry.lock b/poetry.lock index f33576b..45b53c6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. [[package]] name = "accelerate" @@ -870,13 +870,13 @@ files = [ [[package]] name = "docling-core" -version = "2.23.0" +version = "2.23.1" description = "A python library to define and validate data types in Docling." optional = false python-versions = "<4.0,>=3.9" files = [ - {file = "docling_core-2.23.0-py3-none-any.whl", hash = "sha256:de17e2821216cc1817f99e226d2bad28f226289644fbffdf442ad282c842a79a"}, - {file = "docling_core-2.23.0.tar.gz", hash = "sha256:16a5dbca0a639aa5c49b58ceb7a98e7e1dd24cd956912c68f573f77164c96526"}, + {file = "docling_core-2.23.1-py3-none-any.whl", hash = "sha256:4a3f7bcc55a735a070d69d74cf1278f7e40cb403c5059d4149672c7ca163992f"}, + {file = "docling_core-2.23.1.tar.gz", hash = "sha256:0708f4ffe61faef9a2dee48e71cf3890248bf1d9b409f6414cd9c0dd6c7a1681"}, ] [package.dependencies] @@ -7838,4 +7838,4 @@ vlm = ["accelerate", "transformers", "transformers"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "a9ace62bd5b629cb2f20186b750d7c63f383f37f2e3df04cfcc821fc83c877b8" +content-hash = "16324c95a8aae1a710c4151e509c59e9a97d8bb97d4c726861ab3215fbea0a9d" diff --git a/pyproject.toml b/pyproject.toml index 6a65bb2..0f85915 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,7 @@ packages = [{ include = "docling" }] ###################### python = "^3.9" pydantic = "^2.0.0" -docling-core = {extras = ["chunking"], version = "^2.23.0"} +docling-core = {extras = ["chunking"], version = "^2.23.1"} docling-ibm-models = "^3.4.0" docling-parse = "^4.0.0" filetype = "^1.2.0" diff --git a/tests/data/groundtruth/docling_v2/word_tables.docx.html b/tests/data/groundtruth/docling_v2/word_tables.docx.html index 2dc087f..527fc82 100644 --- a/tests/data/groundtruth/docling_v2/word_tables.docx.html +++ b/tests/data/groundtruth/docling_v2/word_tables.docx.html @@ -2,7 +2,7 @@ + href="https://raw.githubusercontent.com/docling-project/docling/refs/heads/main/docs/assets/logo.svg"/> Powered by Docling