From 90b766e2ae1695a759191df37c272efc09be5ee3 Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Fri, 7 Feb 2025 12:55:12 +0100 Subject: [PATCH] fix(markdown): handle nested lists (#910) Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- docling/backend/md_backend.py | 108 ++++++++++-------- .../data/groundtruth/docling_v2/nested.md.md | 31 +++++ tests/data/md/nested.md | 66 +++++++++++ tests/test_backend_markdown.py | 12 +- tests/test_data_gen_flag.py | 9 ++ 5 files changed, 177 insertions(+), 49 deletions(-) create mode 100644 tests/data/groundtruth/docling_v2/nested.md.md create mode 100644 tests/data/md/nested.md create mode 100644 tests/test_data_gen_flag.py diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py index eaf4753..19a21c1 100644 --- a/docling/backend/md_backend.py +++ b/docling/backend/md_backend.py @@ -36,7 +36,7 @@ _STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#" class MarkdownDocumentBackend(DeclarativeDocumentBackend): - def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10): + def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10): # This regex will match any sequence of underscores pattern = r"_+" @@ -81,7 +81,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): # very long sequences of underscores will lead to unnecessary long processing times. # In any proper Markdown files, underscores have to be escaped, # otherwise they represent emphasis (bold or italic) - self.markdown = self.shorten_underscore_sequences(text_stream) + self.markdown = self._shorten_underscore_sequences(text_stream) if isinstance(self.path_or_stream, Path): with open(self.path_or_stream, "r", encoding="utf-8") as f: md_content = f.read() @@ -89,7 +89,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): # very long sequences of underscores will lead to unnecessary long processing times. # In any proper Markdown files, underscores have to be escaped, # otherwise they represent emphasis (bold or italic) - self.markdown = self.shorten_underscore_sequences(md_content) + self.markdown = self._shorten_underscore_sequences(md_content) self.valid = True _log.debug(self.markdown) @@ -99,7 +99,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): ) from e return - def close_table(self, doc: DoclingDocument): + def _close_table(self, doc: DoclingDocument): if self.in_table: _log.debug("=== TABLE START ===") for md_table_row in self.md_table_buffer: @@ -156,30 +156,35 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): doc.add_table(data=table_data) return - def process_inline_text( - self, parent_element: Optional[NodeItem], doc: DoclingDocument + def _process_inline_text( + self, parent_item: Optional[NodeItem], doc: DoclingDocument ): txt = " ".join(self.inline_texts) if len(txt) > 0: doc.add_text( label=DocItemLabel.PARAGRAPH, - parent=parent_element, + parent=parent_item, text=txt, ) self.inline_texts = [] - def iterate_elements( + def _iterate_elements( self, element: marko.element.Element, depth: int, doc: DoclingDocument, - parent_element: Optional[NodeItem] = None, + visited: Set[marko.element.Element], + parent_item: Optional[NodeItem] = None, ): + + if element in visited: + return + # Iterates over all elements in the AST # Check for different element types and process relevant details if isinstance(element, marko.block.Heading) and len(element.children) > 0: - self.close_table(doc) - self.process_inline_text(parent_element, doc) + self._close_table(doc) + self._process_inline_text(parent_item, doc) _log.debug( f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore ) @@ -207,8 +212,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): traverse(element) snippet_text = "".join(strings) if len(snippet_text) > 0: - parent_element = doc.add_text( - label=doc_label, parent=parent_element, text=snippet_text + parent_item = doc.add_text( + label=doc_label, parent=parent_item, text=snippet_text ) elif isinstance(element, marko.block.List): @@ -218,35 +223,37 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): has_non_empty_list_items = True break - self.close_table(doc) - self.process_inline_text(parent_element, doc) + self._close_table(doc) + self._process_inline_text(parent_item, doc) _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}") if has_non_empty_list_items: label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST - parent_element = doc.add_group( - label=label, name=f"list", parent=parent_element + parent_item = doc.add_group( + label=label, name=f"list", parent=parent_item ) elif isinstance(element, marko.block.ListItem) and len(element.children) > 0: - self.close_table(doc) - self.process_inline_text(parent_element, doc) + self._close_table(doc) + self._process_inline_text(parent_item, doc) _log.debug(" - List item") - snippet_text = str(element.children[0].children[0].children) # type: ignore + first_child = element.children[0] + snippet_text = str(first_child.children[0].children) # type: ignore is_numbered = False if ( - parent_element is not None - and isinstance(parent_element, DocItem) - and parent_element.label == GroupLabel.ORDERED_LIST + parent_item is not None + and isinstance(parent_item, DocItem) + and parent_item.label == GroupLabel.ORDERED_LIST ): is_numbered = True doc.add_list_item( - enumerated=is_numbered, parent=parent_element, text=snippet_text + enumerated=is_numbered, parent=parent_item, text=snippet_text ) + visited.add(first_child) elif isinstance(element, marko.inline.Image): - self.close_table(doc) - self.process_inline_text(parent_element, doc) + self._close_table(doc) + self._process_inline_text(parent_item, doc) _log.debug(f" - Image with alt: {element.title}, url: {element.dest}") fig_caption: Optional[TextItem] = None @@ -255,10 +262,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): label=DocItemLabel.CAPTION, text=element.title ) - doc.add_picture(parent=parent_element, caption=fig_caption) + doc.add_picture(parent=parent_item, caption=fig_caption) elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0: - self.process_inline_text(parent_element, doc) + self._process_inline_text(parent_item, doc) elif isinstance(element, marko.inline.RawText): _log.debug(f" - Paragraph (raw text): {element.children}") @@ -272,17 +279,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): else: self.md_table_buffer.append(snippet_text) else: - self.close_table(doc) - self.in_table = False + self._close_table(doc) # most likely just inline text self.inline_texts.append(str(element.children)) elif isinstance(element, marko.inline.CodeSpan): - self.close_table(doc) - self.process_inline_text(parent_element, doc) + self._close_table(doc) + self._process_inline_text(parent_item, doc) _log.debug(f" - Code Span: {element.children}") snippet_text = str(element.children).strip() - doc.add_code(parent=parent_element, text=snippet_text) + doc.add_code(parent=parent_item, text=snippet_text) elif ( isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode)) @@ -290,10 +296,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): and isinstance((first_child := element.children[0]), marko.inline.RawText) and len(snippet_text := (first_child.children.strip())) > 0 ): - self.close_table(doc) - self.process_inline_text(parent_element, doc) + self._close_table(doc) + self._process_inline_text(parent_item, doc) _log.debug(f" - Code Block: {element.children}") - doc.add_code(parent=parent_element, text=snippet_text) + doc.add_code(parent=parent_item, text=snippet_text) elif isinstance(element, marko.inline.LineBreak): if self.in_table: @@ -302,8 +308,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): elif isinstance(element, marko.block.HTMLBlock): self._html_blocks += 1 - self.process_inline_text(parent_element, doc) - self.close_table(doc) + self._process_inline_text(parent_item, doc) + self._close_table(doc) _log.debug("HTML Block: {}".format(element)) if ( len(element.body) > 0 @@ -312,18 +318,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): # wrap in markers to enable post-processing in convert() text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}" - doc.add_code(parent=parent_element, text=text_to_add) + doc.add_code(parent=parent_item, text=text_to_add) else: if not isinstance(element, str): - self.close_table(doc) + self._close_table(doc) _log.debug("Some other element: {}".format(element)) processed_block_types = ( - marko.block.ListItem, marko.block.Heading, marko.block.CodeBlock, marko.block.FencedCode, - # marko.block.Paragraph, marko.inline.RawText, ) @@ -332,7 +336,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): element, processed_block_types ): for child in element.children: - self.iterate_elements(child, depth + 1, doc, parent_element) + self._iterate_elements( + element=child, + depth=depth + 1, + doc=doc, + visited=visited, + parent_item=parent_item, + ) def is_valid(self) -> bool: return self.valid @@ -366,9 +376,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): marko_parser = Markdown() parsed_ast = marko_parser.parse(self.markdown) # Start iterating from the root of the AST - self.iterate_elements(parsed_ast, 0, doc, None) - self.process_inline_text(None, doc) # handle last hanging inline text - self.close_table(doc=doc) # handle any last hanging table + self._iterate_elements( + element=parsed_ast, + depth=0, + doc=doc, + parent_item=None, + visited=set(), + ) + self._process_inline_text(None, doc) # handle last hanging inline text + self._close_table(doc=doc) # handle any last hanging table # if HTML blocks were detected, export to HTML and delegate to HTML backend if self._html_blocks > 0: diff --git a/tests/data/groundtruth/docling_v2/nested.md.md b/tests/data/groundtruth/docling_v2/nested.md.md new file mode 100644 index 0000000..6e430e0 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/nested.md.md @@ -0,0 +1,31 @@ +# Nesting + +A list featuring nesting: + +- abc + - abc123 + - abc1234 + - abc12345 + - a. + - b. + - abcd1234: + - abcd12345: + - a. + - b. +- def: + - def1234: + - def12345。 +- after one empty line + - foo +- afer two empty lines + - bar + +- changing symbol + +A nested HTML list: + +- First item +- Second item with subitems: + - Subitem 1 + - Subitem 2 +- Last list item diff --git a/tests/data/md/nested.md b/tests/data/md/nested.md new file mode 100644 index 0000000..4e203ee --- /dev/null +++ b/tests/data/md/nested.md @@ -0,0 +1,66 @@ +# Nesting + +A list featuring nesting: + +- abc + - abc123 + - abc1234 + - abc12345 + - a. + - b. + - abcd1234: + - abcd12345: + - a. + - b. +- def: + - def1234: + - def12345。 + +- after one empty line + - foo + + +- afer two empty lines + - bar +* changing symbol + +A nested HTML list: + + + + diff --git a/tests/test_backend_markdown.py b/tests/test_backend_markdown.py index caa94d9..5a201ab 100644 --- a/tests/test_backend_markdown.py +++ b/tests/test_backend_markdown.py @@ -4,6 +4,8 @@ from docling.backend.md_backend import MarkdownDocumentBackend from docling.datamodel.base_models import InputFormat from docling.datamodel.document import InputDocument +from .test_data_gen_flag import GEN_TEST_DATA + def test_convert_valid(): fmt = InputFormat.MD @@ -30,6 +32,10 @@ def test_convert_valid(): act_doc = backend.convert() act_data = act_doc.export_to_markdown() - with open(gt_path, "r", encoding="utf-8") as f: - exp_data = f.read().rstrip() - assert act_data == exp_data + if GEN_TEST_DATA: + with open(gt_path, mode="w", encoding="utf-8") as f: + f.write(f"{act_data}\n") + else: + with open(gt_path, encoding="utf-8") as f: + exp_data = f.read().rstrip() + assert exp_data == act_data diff --git a/tests/test_data_gen_flag.py b/tests/test_data_gen_flag.py new file mode 100644 index 0000000..a4baff6 --- /dev/null +++ b/tests/test_data_gen_flag.py @@ -0,0 +1,9 @@ +import os + +from pydantic import TypeAdapter + +GEN_TEST_DATA = TypeAdapter(bool).validate_python(os.getenv("DOCLING_GEN_TEST_DATA", 0)) + + +def test_gen_test_data_flag(): + assert not GEN_TEST_DATA