From 861abcdcb0d406342b9566f81203b87cf32b7ad0 Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Wed, 18 Jun 2025 15:57:57 +0200 Subject: [PATCH] feat(markdown): add formatting & improve inline support (#1804) feat(markdown): support formatting & hyperlinks Signed-off-by: Panos Vagenas --- docling/backend/md_backend.py | 182 +++--- .../docling_v2/inline_and_formatting.md.md | 20 + .../docling_v2/inline_and_formatting.md.yaml | 565 ++++++++++++++++++ tests/data/md/inline_and_formatting.md | 18 + tests/test_backend_markdown.py | 25 +- 5 files changed, 722 insertions(+), 88 deletions(-) create mode 100644 tests/data/groundtruth/docling_v2/inline_and_formatting.md.md create mode 100644 tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml create mode 100644 tests/data/md/inline_and_formatting.md diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py index fbe17ee..b8b0e6d 100644 --- a/docling/backend/md_backend.py +++ b/docling/backend/md_backend.py @@ -1,17 +1,15 @@ import logging import re import warnings +from copy import deepcopy from io import BytesIO from pathlib import Path from typing import List, Optional, Set, Union import marko import marko.element -import marko.ext -import marko.ext.gfm import marko.inline from docling_core.types.doc import ( - DocItem, DocItemLabel, DoclingDocument, DocumentOrigin, @@ -21,7 +19,9 @@ from docling_core.types.doc import ( TableData, TextItem, ) +from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList from marko import Markdown +from pydantic import AnyUrl, TypeAdapter from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.backend.html_backend import HTMLDocumentBackend @@ -71,7 +71,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): self.in_table = False self.md_table_buffer: list[str] = [] - self.inline_texts: list[str] = [] self._html_blocks: int = 0 try: @@ -156,25 +155,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): doc.add_table(data=table_data) return - def _process_inline_text( - self, parent_item: Optional[NodeItem], doc: DoclingDocument - ): - txt = " ".join(self.inline_texts) - if len(txt) > 0: - doc.add_text( - label=DocItemLabel.PARAGRAPH, - parent=parent_item, - text=txt, - ) - self.inline_texts = [] - def _iterate_elements( # noqa: C901 self, + *, element: marko.element.Element, depth: int, doc: DoclingDocument, visited: Set[marko.element.Element], parent_item: Optional[NodeItem] = None, + formatting: Optional[Formatting] = None, + hyperlink: Optional[Union[AnyUrl, Path]] = None, ): if element in visited: return @@ -183,44 +173,32 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): # Check for different element types and process relevant details if isinstance(element, marko.block.Heading) and len(element.children) > 0: self._close_table(doc) - self._process_inline_text(parent_item, doc) _log.debug( f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore ) - if element.level == 1: - doc_label = DocItemLabel.TITLE + + if len(element.children) == 1: + child = element.children[0] + snippet_text = str(child.children) # type: ignore + visited.add(child) else: - doc_label = DocItemLabel.SECTION_HEADER + snippet_text = "" # inline group will be created - # Header could have arbitrary inclusion of bold, italic or emphasis, - # hence we need to traverse the tree to get full text of a header - strings: List[str] = [] - - # Define a recursive function to traverse the tree - def traverse(node: marko.block.BlockElement): - # Check if the node has a "children" attribute - if hasattr(node, "children"): - # If "children" is a list, continue traversal - if isinstance(node.children, list): - for child in node.children: - traverse(child) - # If "children" is text, add it to header text - elif isinstance(node.children, str): - strings.append(node.children) - - traverse(element) - snippet_text = "".join(strings) - if len(snippet_text) > 0: - if doc_label == DocItemLabel.SECTION_HEADER: - parent_item = doc.add_heading( - text=snippet_text, - level=element.level - 1, - parent=parent_item, - ) - else: - parent_item = doc.add_text( - label=doc_label, parent=parent_item, text=snippet_text - ) + if element.level == 1: + parent_item = doc.add_title( + text=snippet_text, + parent=parent_item, + formatting=formatting, + hyperlink=hyperlink, + ) + else: + parent_item = doc.add_heading( + text=snippet_text, + level=element.level - 1, + parent=parent_item, + formatting=formatting, + hyperlink=hyperlink, + ) elif isinstance(element, marko.block.List): has_non_empty_list_items = False @@ -230,7 +208,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): break self._close_table(doc) - self._process_inline_text(parent_item, doc) _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}") if has_non_empty_list_items: label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST @@ -240,41 +217,60 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): elif ( isinstance(element, marko.block.ListItem) - and len(element.children) > 0 - and isinstance((first_child := element.children[0]), marko.block.Paragraph) + and len(element.children) == 1 + and isinstance((child := element.children[0]), marko.block.Paragraph) + and len(child.children) > 0 ): self._close_table(doc) - self._process_inline_text(parent_item, doc) _log.debug(" - List item") - snippet_text = str(first_child.children[0].children) # type: ignore - is_numbered = False - if ( - parent_item is not None - and isinstance(parent_item, DocItem) - and parent_item.label == GroupLabel.ORDERED_LIST - ): - is_numbered = True - doc.add_list_item( - enumerated=is_numbered, parent=parent_item, text=snippet_text + if len(child.children) == 1: + snippet_text = str(child.children[0].children) # type: ignore + visited.add(child) + else: + snippet_text = "" # inline group will be created + is_numbered = isinstance(parent_item, OrderedList) + if not isinstance(parent_item, (OrderedList, UnorderedList)): + _log.warning("ListItem would have not had a list parent, adding one.") + parent_item = doc.add_unordered_list(parent=parent_item) + parent_item = doc.add_list_item( + enumerated=is_numbered, + parent=parent_item, + text=snippet_text, + formatting=formatting, + hyperlink=hyperlink, ) - visited.add(first_child) elif isinstance(element, marko.inline.Image): self._close_table(doc) - self._process_inline_text(parent_item, doc) _log.debug(f" - Image with alt: {element.title}, url: {element.dest}") fig_caption: Optional[TextItem] = None if element.title is not None and element.title != "": fig_caption = doc.add_text( - label=DocItemLabel.CAPTION, text=element.title + label=DocItemLabel.CAPTION, + text=element.title, + formatting=formatting, + hyperlink=hyperlink, ) doc.add_picture(parent=parent_item, caption=fig_caption) - elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0: - self._process_inline_text(parent_item, doc) + elif isinstance(element, marko.inline.Emphasis): + _log.debug(f" - Emphasis: {element.children}") + formatting = deepcopy(formatting) if formatting else Formatting() + formatting.italic = True + + elif isinstance(element, marko.inline.StrongEmphasis): + _log.debug(f" - StrongEmphasis: {element.children}") + formatting = deepcopy(formatting) if formatting else Formatting() + formatting.bold = True + + elif isinstance(element, marko.inline.Link): + _log.debug(f" - Link: {element.children}") + hyperlink = TypeAdapter(Optional[Union[AnyUrl, Path]]).validate_python( + element.dest + ) elif isinstance(element, marko.inline.RawText): _log.debug(f" - Paragraph (raw text): {element.children}") @@ -287,28 +283,41 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text else: self.md_table_buffer.append(snippet_text) - else: + elif snippet_text: self._close_table(doc) - # most likely just inline text - self.inline_texts.append(str(element.children)) + doc.add_text( + label=DocItemLabel.TEXT, + parent=parent_item, + text=snippet_text, + formatting=formatting, + hyperlink=hyperlink, + ) elif isinstance(element, marko.inline.CodeSpan): self._close_table(doc) - self._process_inline_text(parent_item, doc) _log.debug(f" - Code Span: {element.children}") snippet_text = str(element.children).strip() - doc.add_code(parent=parent_item, text=snippet_text) + doc.add_code( + parent=parent_item, + text=snippet_text, + formatting=formatting, + hyperlink=hyperlink, + ) elif ( isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode)) and len(element.children) > 0 - and isinstance((first_child := element.children[0]), marko.inline.RawText) - and len(snippet_text := (first_child.children.strip())) > 0 + and isinstance((child := element.children[0]), marko.inline.RawText) + and len(snippet_text := (child.children.strip())) > 0 ): self._close_table(doc) - self._process_inline_text(parent_item, doc) _log.debug(f" - Code Block: {element.children}") - doc.add_code(parent=parent_item, text=snippet_text) + doc.add_code( + parent=parent_item, + text=snippet_text, + formatting=formatting, + hyperlink=hyperlink, + ) elif isinstance(element, marko.inline.LineBreak): if self.in_table: @@ -317,7 +326,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): elif isinstance(element, marko.block.HTMLBlock): self._html_blocks += 1 - self._process_inline_text(parent_item, doc) self._close_table(doc) _log.debug(f"HTML Block: {element}") if ( @@ -327,14 +335,25 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): # wrap in markers to enable post-processing in convert() text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}" - doc.add_code(parent=parent_item, text=text_to_add) + doc.add_code( + parent=parent_item, + text=text_to_add, + formatting=formatting, + hyperlink=hyperlink, + ) else: if not isinstance(element, str): self._close_table(doc) _log.debug(f"Some other element: {element}") + if ( + isinstance(element, (marko.block.Paragraph, marko.block.Heading)) + and len(element.children) > 1 + ): + parent_item = doc.add_inline_group(parent=parent_item) + processed_block_types = ( - marko.block.Heading, + # marko.block.Heading, marko.block.CodeBlock, marko.block.FencedCode, marko.inline.RawText, @@ -351,6 +370,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): doc=doc, visited=visited, parent_item=parent_item, + formatting=formatting, + hyperlink=hyperlink, ) def is_valid(self) -> bool: @@ -392,7 +413,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): parent_item=None, visited=set(), ) - self._process_inline_text(None, doc) # handle last hanging inline text self._close_table(doc=doc) # handle any last hanging table # if HTML blocks were detected, export to HTML and delegate to HTML backend diff --git a/tests/data/groundtruth/docling_v2/inline_and_formatting.md.md b/tests/data/groundtruth/docling_v2/inline_and_formatting.md.md new file mode 100644 index 0000000..31c3f3b --- /dev/null +++ b/tests/data/groundtruth/docling_v2/inline_and_formatting.md.md @@ -0,0 +1,20 @@ +# Contribution guideline example + +This is simple. + +Foo *emphasis* **strong emphasis** ***both*** . + +Create your feature branch: `git checkout -b feature/AmazingFeature` . + +1. Pull the [**repository**](https://github.com/docling-project/docling) . +2. Create your feature branch ( `git checkout -b feature/AmazingFeature` ) +3. Commit your changes ( `git commit -m 'Add some AmazingFeature'` ) +4. Push to the branch ( `git push origin feature/AmazingFeature` ) +5. Open a Pull Request + +## + +*Second* section + +- **First** : Lorem ipsum. +- **Second** : Dolor `sit` amet. diff --git a/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml b/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml new file mode 100644 index 0000000..0cdc5c5 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml @@ -0,0 +1,565 @@ +body: + children: + - $ref: '#/texts/0' + - $ref: '#/texts/1' + - $ref: '#/groups/0' + - $ref: '#/groups/1' + - $ref: '#/groups/2' + - $ref: '#/texts/27' + - $ref: '#/groups/8' + content_layer: body + label: unspecified + name: _root_ + self_ref: '#/body' +form_items: [] +furniture: + children: [] + content_layer: furniture + label: unspecified + name: _root_ + self_ref: '#/furniture' +groups: +- children: + - $ref: '#/texts/2' + - $ref: '#/texts/3' + - $ref: '#/texts/4' + - $ref: '#/texts/5' + - $ref: '#/texts/6' + content_layer: body + label: inline + name: group + parent: + $ref: '#/body' + self_ref: '#/groups/0' +- children: + - $ref: '#/texts/7' + - $ref: '#/texts/8' + - $ref: '#/texts/9' + content_layer: body + label: inline + name: group + parent: + $ref: '#/body' + self_ref: '#/groups/1' +- children: + - $ref: '#/texts/10' + - $ref: '#/texts/14' + - $ref: '#/texts/18' + - $ref: '#/texts/22' + - $ref: '#/texts/26' + content_layer: body + label: ordered_list + name: list + parent: + $ref: '#/body' + self_ref: '#/groups/2' +- children: + - $ref: '#/texts/11' + - $ref: '#/texts/12' + - $ref: '#/texts/13' + content_layer: body + label: inline + name: group + parent: + $ref: '#/texts/10' + self_ref: '#/groups/3' +- children: + - $ref: '#/texts/15' + - $ref: '#/texts/16' + - $ref: '#/texts/17' + content_layer: body + label: inline + name: group + parent: + $ref: '#/texts/14' + self_ref: '#/groups/4' +- children: + - $ref: '#/texts/19' + - $ref: '#/texts/20' + - $ref: '#/texts/21' + content_layer: body + label: inline + name: group + parent: + $ref: '#/texts/18' + self_ref: '#/groups/5' +- children: + - $ref: '#/texts/23' + - $ref: '#/texts/24' + - $ref: '#/texts/25' + content_layer: body + label: inline + name: group + parent: + $ref: '#/texts/22' + self_ref: '#/groups/6' +- children: + - $ref: '#/texts/28' + - $ref: '#/texts/29' + content_layer: body + label: inline + name: group + parent: + $ref: '#/texts/27' + self_ref: '#/groups/7' +- children: + - $ref: '#/texts/30' + - $ref: '#/texts/33' + content_layer: body + label: list + name: list + parent: + $ref: '#/body' + self_ref: '#/groups/8' +- children: + - $ref: '#/texts/31' + - $ref: '#/texts/32' + content_layer: body + label: inline + name: group + parent: + $ref: '#/texts/30' + self_ref: '#/groups/9' +- children: + - $ref: '#/texts/34' + - $ref: '#/texts/35' + - $ref: '#/texts/36' + - $ref: '#/texts/37' + content_layer: body + label: inline + name: group + parent: + $ref: '#/texts/33' + self_ref: '#/groups/10' +key_value_items: [] +name: inline_and_formatting +origin: + binary_hash: 9342273634728023910 + filename: inline_and_formatting.md + mimetype: text/markdown +pages: {} +pictures: [] +schema_name: DoclingDocument +tables: [] +texts: +- children: [] + content_layer: body + label: title + orig: Contribution guideline example + parent: + $ref: '#/body' + prov: [] + self_ref: '#/texts/0' + text: Contribution guideline example +- children: [] + content_layer: body + label: text + orig: This is simple. + parent: + $ref: '#/body' + prov: [] + self_ref: '#/texts/1' + text: This is simple. +- children: [] + content_layer: body + label: text + orig: Foo + parent: + $ref: '#/groups/0' + prov: [] + self_ref: '#/texts/2' + text: Foo +- children: [] + content_layer: body + formatting: + bold: false + italic: true + strikethrough: false + underline: false + label: text + orig: emphasis + parent: + $ref: '#/groups/0' + prov: [] + self_ref: '#/texts/3' + text: emphasis +- children: [] + content_layer: body + formatting: + bold: true + italic: false + strikethrough: false + underline: false + label: text + orig: strong emphasis + parent: + $ref: '#/groups/0' + prov: [] + self_ref: '#/texts/4' + text: strong emphasis +- children: [] + content_layer: body + formatting: + bold: true + italic: true + strikethrough: false + underline: false + label: text + orig: both + parent: + $ref: '#/groups/0' + prov: [] + self_ref: '#/texts/5' + text: both +- children: [] + content_layer: body + label: text + orig: . + parent: + $ref: '#/groups/0' + prov: [] + self_ref: '#/texts/6' + text: . +- children: [] + content_layer: body + label: text + orig: 'Create your feature branch:' + parent: + $ref: '#/groups/1' + prov: [] + self_ref: '#/texts/7' + text: 'Create your feature branch:' +- captions: [] + children: [] + code_language: unknown + content_layer: body + footnotes: [] + label: code + orig: git checkout -b feature/AmazingFeature + parent: + $ref: '#/groups/1' + prov: [] + references: [] + self_ref: '#/texts/8' + text: git checkout -b feature/AmazingFeature +- children: [] + content_layer: body + label: text + orig: . + parent: + $ref: '#/groups/1' + prov: [] + self_ref: '#/texts/9' + text: . +- children: + - $ref: '#/groups/3' + content_layer: body + enumerated: true + label: list_item + marker: '-' + orig: '' + parent: + $ref: '#/groups/2' + prov: [] + self_ref: '#/texts/10' + text: '' +- children: [] + content_layer: body + label: text + orig: Pull the + parent: + $ref: '#/groups/3' + prov: [] + self_ref: '#/texts/11' + text: Pull the +- children: [] + content_layer: body + formatting: + bold: true + italic: false + strikethrough: false + underline: false + hyperlink: https://github.com/docling-project/docling + label: text + orig: repository + parent: + $ref: '#/groups/3' + prov: [] + self_ref: '#/texts/12' + text: repository +- children: [] + content_layer: body + label: text + orig: . + parent: + $ref: '#/groups/3' + prov: [] + self_ref: '#/texts/13' + text: . +- children: + - $ref: '#/groups/4' + content_layer: body + enumerated: true + label: list_item + marker: '-' + orig: '' + parent: + $ref: '#/groups/2' + prov: [] + self_ref: '#/texts/14' + text: '' +- children: [] + content_layer: body + label: text + orig: Create your feature branch ( + parent: + $ref: '#/groups/4' + prov: [] + self_ref: '#/texts/15' + text: Create your feature branch ( +- captions: [] + children: [] + code_language: unknown + content_layer: body + footnotes: [] + label: code + orig: git checkout -b feature/AmazingFeature + parent: + $ref: '#/groups/4' + prov: [] + references: [] + self_ref: '#/texts/16' + text: git checkout -b feature/AmazingFeature +- children: [] + content_layer: body + label: text + orig: ) + parent: + $ref: '#/groups/4' + prov: [] + self_ref: '#/texts/17' + text: ) +- children: + - $ref: '#/groups/5' + content_layer: body + enumerated: true + label: list_item + marker: '-' + orig: '' + parent: + $ref: '#/groups/2' + prov: [] + self_ref: '#/texts/18' + text: '' +- children: [] + content_layer: body + label: text + orig: Commit your changes ( + parent: + $ref: '#/groups/5' + prov: [] + self_ref: '#/texts/19' + text: Commit your changes ( +- captions: [] + children: [] + code_language: unknown + content_layer: body + footnotes: [] + label: code + orig: git commit -m 'Add some AmazingFeature' + parent: + $ref: '#/groups/5' + prov: [] + references: [] + self_ref: '#/texts/20' + text: git commit -m 'Add some AmazingFeature' +- children: [] + content_layer: body + label: text + orig: ) + parent: + $ref: '#/groups/5' + prov: [] + self_ref: '#/texts/21' + text: ) +- children: + - $ref: '#/groups/6' + content_layer: body + enumerated: true + label: list_item + marker: '-' + orig: '' + parent: + $ref: '#/groups/2' + prov: [] + self_ref: '#/texts/22' + text: '' +- children: [] + content_layer: body + label: text + orig: Push to the branch ( + parent: + $ref: '#/groups/6' + prov: [] + self_ref: '#/texts/23' + text: Push to the branch ( +- captions: [] + children: [] + code_language: unknown + content_layer: body + footnotes: [] + label: code + orig: git push origin feature/AmazingFeature + parent: + $ref: '#/groups/6' + prov: [] + references: [] + self_ref: '#/texts/24' + text: git push origin feature/AmazingFeature +- children: [] + content_layer: body + label: text + orig: ) + parent: + $ref: '#/groups/6' + prov: [] + self_ref: '#/texts/25' + text: ) +- children: [] + content_layer: body + enumerated: true + label: list_item + marker: '-' + orig: Open a Pull Request + parent: + $ref: '#/groups/2' + prov: [] + self_ref: '#/texts/26' + text: Open a Pull Request +- children: + - $ref: '#/groups/7' + content_layer: body + label: section_header + level: 1 + orig: '' + parent: + $ref: '#/body' + prov: [] + self_ref: '#/texts/27' + text: '' +- children: [] + content_layer: body + formatting: + bold: false + italic: true + strikethrough: false + underline: false + label: text + orig: Second + parent: + $ref: '#/groups/7' + prov: [] + self_ref: '#/texts/28' + text: Second +- children: [] + content_layer: body + label: text + orig: section + parent: + $ref: '#/groups/7' + prov: [] + self_ref: '#/texts/29' + text: section +- children: + - $ref: '#/groups/9' + content_layer: body + enumerated: false + label: list_item + marker: '-' + orig: '' + parent: + $ref: '#/groups/8' + prov: [] + self_ref: '#/texts/30' + text: '' +- children: [] + content_layer: body + formatting: + bold: true + italic: false + strikethrough: false + underline: false + label: text + orig: First + parent: + $ref: '#/groups/9' + prov: [] + self_ref: '#/texts/31' + text: First +- children: [] + content_layer: body + label: text + orig: ': Lorem ipsum.' + parent: + $ref: '#/groups/9' + prov: [] + self_ref: '#/texts/32' + text: ': Lorem ipsum.' +- children: + - $ref: '#/groups/10' + content_layer: body + enumerated: false + label: list_item + marker: '-' + orig: '' + parent: + $ref: '#/groups/8' + prov: [] + self_ref: '#/texts/33' + text: '' +- children: [] + content_layer: body + formatting: + bold: true + italic: false + strikethrough: false + underline: false + label: text + orig: Second + parent: + $ref: '#/groups/10' + prov: [] + self_ref: '#/texts/34' + text: Second +- children: [] + content_layer: body + label: text + orig: ': Dolor' + parent: + $ref: '#/groups/10' + prov: [] + self_ref: '#/texts/35' + text: ': Dolor' +- captions: [] + children: [] + code_language: unknown + content_layer: body + footnotes: [] + label: code + orig: sit + parent: + $ref: '#/groups/10' + prov: [] + references: [] + self_ref: '#/texts/36' + text: sit +- children: [] + content_layer: body + label: text + orig: amet. + parent: + $ref: '#/groups/10' + prov: [] + self_ref: '#/texts/37' + text: amet. +version: 1.3.0 diff --git a/tests/data/md/inline_and_formatting.md b/tests/data/md/inline_and_formatting.md new file mode 100644 index 0000000..e18a46c --- /dev/null +++ b/tests/data/md/inline_and_formatting.md @@ -0,0 +1,18 @@ +# Contribution guideline example + +This is simple. + +Foo *emphasis* **strong emphasis** ***both***. + +Create your feature branch: `git checkout -b feature/AmazingFeature`. + +1. Pull the [**repository**](https://github.com/docling-project/docling). +2. Create your feature branch (`git checkout -b feature/AmazingFeature`) +3. Commit your changes (`git commit -m 'Add some AmazingFeature'`) +4. Push to the branch (`git push origin feature/AmazingFeature`) +5. Open a Pull Request + +## *Second* section + +- **First**: Lorem ipsum. +- **Second**: Dolor `sit` amet. diff --git a/tests/test_backend_markdown.py b/tests/test_backend_markdown.py index 5a201ab..e499608 100644 --- a/tests/test_backend_markdown.py +++ b/tests/test_backend_markdown.py @@ -2,7 +2,7 @@ from pathlib import Path from docling.backend.md_backend import MarkdownDocumentBackend from docling.datamodel.base_models import InputFormat -from docling.datamodel.document import InputDocument +from docling.datamodel.document import DoclingDocument, InputDocument from .test_data_gen_flag import GEN_TEST_DATA @@ -11,12 +11,15 @@ def test_convert_valid(): fmt = InputFormat.MD cls = MarkdownDocumentBackend - test_data_path = Path("tests") / "data" - relevant_paths = sorted((test_data_path / "md").rglob("*.md")) + root_path = Path("tests") / "data" + relevant_paths = sorted((root_path / "md").rglob("*.md")) assert len(relevant_paths) > 0 + yaml_filter = ["inline_and_formatting"] + for in_path in relevant_paths: - gt_path = test_data_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md" + md_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md" + yaml_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.yaml" in_doc = InputDocument( path_or_stream=in_path, @@ -33,9 +36,17 @@ def test_convert_valid(): act_data = act_doc.export_to_markdown() if GEN_TEST_DATA: - with open(gt_path, mode="w", encoding="utf-8") as f: + with open(md_gt_path, mode="w", encoding="utf-8") as f: f.write(f"{act_data}\n") + + if in_path.stem in yaml_filter: + with open(yaml_gt_path, mode="w", encoding="utf-8") as f: + act_doc.save_as_yaml(yaml_gt_path) else: - with open(gt_path, encoding="utf-8") as f: + with open(md_gt_path, encoding="utf-8") as f: exp_data = f.read().rstrip() - assert exp_data == act_data + assert act_data == exp_data + + if in_path.stem in yaml_filter: + exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path) + assert act_doc == exp_doc