fix(markdown): handle nested lists (#910)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
2025-02-07 12:55:12 +01:00 · 2025-02-07 12:55:12 +01:00 · 90b766e2ae
commit 90b766e2ae
parent 9114ada7bc
5 changed files with 177 additions and 49 deletions
--- a/docling/backend/md_backend.py
+++ b/docling/backend/md_backend.py
@ -36,7 +36,7 @@ _STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"


 class MarkdownDocumentBackend(DeclarativeDocumentBackend):
-    def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
+    def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
        # This regex will match any sequence of underscores
        pattern = r"_+"

@ -81,7 +81,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                # very long sequences of underscores will lead to unnecessary long processing times.
                # In any proper Markdown files, underscores have to be escaped,
                # otherwise they represent emphasis (bold or italic)
-                self.markdown = self.shorten_underscore_sequences(text_stream)
+                self.markdown = self._shorten_underscore_sequences(text_stream)
            if isinstance(self.path_or_stream, Path):
                with open(self.path_or_stream, "r", encoding="utf-8") as f:
                    md_content = f.read()
@ -89,7 +89,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                    # very long sequences of underscores will lead to unnecessary long processing times.
                    # In any proper Markdown files, underscores have to be escaped,
                    # otherwise they represent emphasis (bold or italic)
-                    self.markdown = self.shorten_underscore_sequences(md_content)
+                    self.markdown = self._shorten_underscore_sequences(md_content)
            self.valid = True

            _log.debug(self.markdown)
@ -99,7 +99,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
            ) from e
        return

-    def close_table(self, doc: DoclingDocument):
+    def _close_table(self, doc: DoclingDocument):
        if self.in_table:
            _log.debug("=== TABLE START ===")
            for md_table_row in self.md_table_buffer:
@ -156,30 +156,35 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                doc.add_table(data=table_data)
        return

-    def process_inline_text(
-        self, parent_element: Optional[NodeItem], doc: DoclingDocument
+    def _process_inline_text(
+        self, parent_item: Optional[NodeItem], doc: DoclingDocument
    ):
        txt = " ".join(self.inline_texts)
        if len(txt) > 0:
            doc.add_text(
                label=DocItemLabel.PARAGRAPH,
-                parent=parent_element,
+                parent=parent_item,
                text=txt,
            )
        self.inline_texts = []

-    def iterate_elements(
+    def _iterate_elements(
        self,
        element: marko.element.Element,
        depth: int,
        doc: DoclingDocument,
-        parent_element: Optional[NodeItem] = None,
+        visited: Set[marko.element.Element],
+        parent_item: Optional[NodeItem] = None,
    ):
+
+        if element in visited:
+            return
+
        # Iterates over all elements in the AST
        # Check for different element types and process relevant details
        if isinstance(element, marko.block.Heading) and len(element.children) > 0:
-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
            _log.debug(
                f" - Heading level {element.level}, content: {element.children[0].children}"  # type: ignore
            )
@ -207,8 +212,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
            traverse(element)
            snippet_text = "".join(strings)
            if len(snippet_text) > 0:
-                parent_element = doc.add_text(
-                    label=doc_label, parent=parent_element, text=snippet_text
+                parent_item = doc.add_text(
+                    label=doc_label, parent=parent_item, text=snippet_text
                )

        elif isinstance(element, marko.block.List):
@ -218,35 +223,37 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                    has_non_empty_list_items = True
                    break

-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
            _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
            if has_non_empty_list_items:
                label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
-                parent_element = doc.add_group(
-                    label=label, name=f"list", parent=parent_element
+                parent_item = doc.add_group(
+                    label=label, name=f"list", parent=parent_item
                )

        elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
            _log.debug(" - List item")

-            snippet_text = str(element.children[0].children[0].children)  # type: ignore
+            first_child = element.children[0]
+            snippet_text = str(first_child.children[0].children)  # type: ignore
            is_numbered = False
            if (
-                parent_element is not None
-                and isinstance(parent_element, DocItem)
-                and parent_element.label == GroupLabel.ORDERED_LIST
+                parent_item is not None
+                and isinstance(parent_item, DocItem)
+                and parent_item.label == GroupLabel.ORDERED_LIST
            ):
                is_numbered = True
            doc.add_list_item(
-                enumerated=is_numbered, parent=parent_element, text=snippet_text
+                enumerated=is_numbered, parent=parent_item, text=snippet_text
            )
+            visited.add(first_child)

        elif isinstance(element, marko.inline.Image):
-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
            _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")

            fig_caption: Optional[TextItem] = None
@ -255,10 +262,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                    label=DocItemLabel.CAPTION, text=element.title
                )

-            doc.add_picture(parent=parent_element, caption=fig_caption)
+            doc.add_picture(parent=parent_item, caption=fig_caption)

        elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
-            self.process_inline_text(parent_element, doc)
+            self._process_inline_text(parent_item, doc)

        elif isinstance(element, marko.inline.RawText):
            _log.debug(f" - Paragraph (raw text): {element.children}")
@ -272,17 +279,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                else:
                    self.md_table_buffer.append(snippet_text)
            else:
-                self.close_table(doc)
-                self.in_table = False
+                self._close_table(doc)
                # most likely just inline text
                self.inline_texts.append(str(element.children))

        elif isinstance(element, marko.inline.CodeSpan):
-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
            _log.debug(f" - Code Span: {element.children}")
            snippet_text = str(element.children).strip()
-            doc.add_code(parent=parent_element, text=snippet_text)
+            doc.add_code(parent=parent_item, text=snippet_text)

        elif (
            isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
@ -290,10 +296,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
            and isinstance((first_child := element.children[0]), marko.inline.RawText)
            and len(snippet_text := (first_child.children.strip())) > 0
        ):
-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
            _log.debug(f" - Code Block: {element.children}")
-            doc.add_code(parent=parent_element, text=snippet_text)
+            doc.add_code(parent=parent_item, text=snippet_text)

        elif isinstance(element, marko.inline.LineBreak):
            if self.in_table:
@ -302,8 +308,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):

        elif isinstance(element, marko.block.HTMLBlock):
            self._html_blocks += 1
-            self.process_inline_text(parent_element, doc)
-            self.close_table(doc)
+            self._process_inline_text(parent_item, doc)
+            self._close_table(doc)
            _log.debug("HTML Block: {}".format(element))
            if (
                len(element.body) > 0
@ -312,18 +318,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):

                # wrap in markers to enable post-processing in convert()
                text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
-                doc.add_code(parent=parent_element, text=text_to_add)
+                doc.add_code(parent=parent_item, text=text_to_add)
        else:
            if not isinstance(element, str):
-                self.close_table(doc)
+                self._close_table(doc)
                _log.debug("Some other element: {}".format(element))

        processed_block_types = (
-            marko.block.ListItem,
            marko.block.Heading,
            marko.block.CodeBlock,
            marko.block.FencedCode,
-            # marko.block.Paragraph,
            marko.inline.RawText,
        )

@ -332,7 +336,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
            element, processed_block_types
        ):
            for child in element.children:
-                self.iterate_elements(child, depth + 1, doc, parent_element)
+                self._iterate_elements(
+                    element=child,
+                    depth=depth + 1,
+                    doc=doc,
+                    visited=visited,
+                    parent_item=parent_item,
+                )

    def is_valid(self) -> bool:
        return self.valid
@ -366,9 +376,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
            marko_parser = Markdown()
            parsed_ast = marko_parser.parse(self.markdown)
            # Start iterating from the root of the AST
-            self.iterate_elements(parsed_ast, 0, doc, None)
-            self.process_inline_text(None, doc)  # handle last hanging inline text
-            self.close_table(doc=doc)  # handle any last hanging table
+            self._iterate_elements(
+                element=parsed_ast,
+                depth=0,
+                doc=doc,
+                parent_item=None,
+                visited=set(),
+            )
+            self._process_inline_text(None, doc)  # handle last hanging inline text
+            self._close_table(doc=doc)  # handle any last hanging table

            # if HTML blocks were detected, export to HTML and delegate to HTML backend
            if self._html_blocks > 0:
--- a/tests/data/groundtruth/docling_v2/nested.md.md
+++ b/tests/data/groundtruth/docling_v2/nested.md.md
@ -0,0 +1,31 @@
+# Nesting
+
+A list featuring nesting:
+
+- abc
+    - abc123
+        - abc1234
+            - abc12345
+                - a.
+                - b.
+        - abcd1234：
+            - abcd12345：
+                - a.
+                - b.
+- def：
+    - def1234：
+        - def12345。
+- after one empty line
+    - foo
+- afer two empty lines
+    - bar
+
+- changing symbol
+
+A nested HTML list:
+
+- First item
+- Second item with subitems:
+    - Subitem 1
+    - Subitem 2
+- Last list item
--- a/tests/data/md/nested.md
+++ b/tests/data/md/nested.md
@ -0,0 +1,66 @@
+# Nesting
+
+A list featuring nesting:
+
+- abc
+	- abc123
+		- abc1234
+			- abc12345
+				- a.
+				- b.
+		- abcd1234：
+			- abcd12345：
+				- a.
+				- b.
+- def：
+	- def1234：
+		- def12345。
+
+- after one empty line
+	- foo
+
+
+- afer two empty lines
+	- bar
+* changing symbol
+
+A nested HTML list:
+
+<ul>
+    <li>First item</li>
+    <li>Second item with subitems:
+        <ul>
+            <li>Subitem 1</li>
+            <li>Subitem 2</li>
+        </ul>
+    </li>
+    <li>Last list item</li>
+</ul>
+
+<!--
+Table nesting apparently not yet supported by HTML backend:
+
+<table>
+  <tr>
+    <td>Cell</td>
+    <td>Nested Table
+      <table>
+        <tr>
+          <td>Cell 1</td>
+		  <>
+        </tr>
+        <tr>
+          <td>Cell 2</td>
+        </tr>
+        <tr>
+          <td>Cell 3</td>
+        </tr>
+        <tr>
+          <td>Cell 4</td>
+        </tr>
+      </table>
+    </td>
+  </tr>
+  <tr><td>additional row</td></tr>
+</table>
+-->
--- a/tests/test_backend_markdown.py
+++ b/tests/test_backend_markdown.py
@ -4,6 +4,8 @@ from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument

+from .test_data_gen_flag import GEN_TEST_DATA
+

 def test_convert_valid():
    fmt = InputFormat.MD
@ -30,6 +32,10 @@ def test_convert_valid():
        act_doc = backend.convert()
        act_data = act_doc.export_to_markdown()

-        with open(gt_path, "r", encoding="utf-8") as f:
-            exp_data = f.read().rstrip()
-        assert act_data == exp_data
+        if GEN_TEST_DATA:
+            with open(gt_path, mode="w", encoding="utf-8") as f:
+                f.write(f"{act_data}\n")
+        else:
+            with open(gt_path, encoding="utf-8") as f:
+                exp_data = f.read().rstrip()
+            assert exp_data == act_data
--- a/tests/test_data_gen_flag.py
+++ b/tests/test_data_gen_flag.py
@ -0,0 +1,9 @@
+import os
+
+from pydantic import TypeAdapter
+
+GEN_TEST_DATA = TypeAdapter(bool).validate_python(os.getenv("DOCLING_GEN_TEST_DATA", 0))
+
+
+def test_gen_test_data_flag():
+    assert not GEN_TEST_DATA