From 90b766e2ae1695a759191df37c272efc09be5ee3 Mon Sep 17 00:00:00 2001
From: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Date: Fri, 7 Feb 2025 12:55:12 +0100
Subject: [PATCH] fix(markdown): handle nested lists (#910)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
---
 docling/backend/md_backend.py                 | 108 ++++++++++--------
 .../data/groundtruth/docling_v2/nested.md.md  |  31 +++++
 tests/data/md/nested.md                       |  66 +++++++++++
 tests/test_backend_markdown.py                |  12 +-
 tests/test_data_gen_flag.py                   |   9 ++
 5 files changed, 177 insertions(+), 49 deletions(-)
 create mode 100644 tests/data/groundtruth/docling_v2/nested.md.md
 create mode 100644 tests/data/md/nested.md
 create mode 100644 tests/test_data_gen_flag.py

diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py
index eaf4753..19a21c1 100644
--- a/docling/backend/md_backend.py
+++ b/docling/backend/md_backend.py
@@ -36,7 +36,7 @@ _STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
 
 
 class MarkdownDocumentBackend(DeclarativeDocumentBackend):
-    def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
+    def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
         # This regex will match any sequence of underscores
         pattern = r"_+"
 
@@ -81,7 +81,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 # very long sequences of underscores will lead to unnecessary long processing times.
                 # In any proper Markdown files, underscores have to be escaped,
                 # otherwise they represent emphasis (bold or italic)
-                self.markdown = self.shorten_underscore_sequences(text_stream)
+                self.markdown = self._shorten_underscore_sequences(text_stream)
             if isinstance(self.path_or_stream, Path):
                 with open(self.path_or_stream, "r", encoding="utf-8") as f:
                     md_content = f.read()
@@ -89,7 +89,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                     # very long sequences of underscores will lead to unnecessary long processing times.
                     # In any proper Markdown files, underscores have to be escaped,
                     # otherwise they represent emphasis (bold or italic)
-                    self.markdown = self.shorten_underscore_sequences(md_content)
+                    self.markdown = self._shorten_underscore_sequences(md_content)
             self.valid = True
 
             _log.debug(self.markdown)
@@ -99,7 +99,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             ) from e
         return
 
-    def close_table(self, doc: DoclingDocument):
+    def _close_table(self, doc: DoclingDocument):
         if self.in_table:
             _log.debug("=== TABLE START ===")
             for md_table_row in self.md_table_buffer:
@@ -156,30 +156,35 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 doc.add_table(data=table_data)
         return
 
-    def process_inline_text(
-        self, parent_element: Optional[NodeItem], doc: DoclingDocument
+    def _process_inline_text(
+        self, parent_item: Optional[NodeItem], doc: DoclingDocument
     ):
         txt = " ".join(self.inline_texts)
         if len(txt) > 0:
             doc.add_text(
                 label=DocItemLabel.PARAGRAPH,
-                parent=parent_element,
+                parent=parent_item,
                 text=txt,
             )
         self.inline_texts = []
 
-    def iterate_elements(
+    def _iterate_elements(
         self,
         element: marko.element.Element,
         depth: int,
         doc: DoclingDocument,
-        parent_element: Optional[NodeItem] = None,
+        visited: Set[marko.element.Element],
+        parent_item: Optional[NodeItem] = None,
     ):
+
+        if element in visited:
+            return
+
         # Iterates over all elements in the AST
         # Check for different element types and process relevant details
         if isinstance(element, marko.block.Heading) and len(element.children) > 0:
-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
             _log.debug(
                 f" - Heading level {element.level}, content: {element.children[0].children}"  # type: ignore
             )
@@ -207,8 +212,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             traverse(element)
             snippet_text = "".join(strings)
             if len(snippet_text) > 0:
-                parent_element = doc.add_text(
-                    label=doc_label, parent=parent_element, text=snippet_text
+                parent_item = doc.add_text(
+                    label=doc_label, parent=parent_item, text=snippet_text
                 )
 
         elif isinstance(element, marko.block.List):
@@ -218,35 +223,37 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                     has_non_empty_list_items = True
                     break
 
-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
             _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
             if has_non_empty_list_items:
                 label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
-                parent_element = doc.add_group(
-                    label=label, name=f"list", parent=parent_element
+                parent_item = doc.add_group(
+                    label=label, name=f"list", parent=parent_item
                 )
 
         elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
             _log.debug(" - List item")
 
-            snippet_text = str(element.children[0].children[0].children)  # type: ignore
+            first_child = element.children[0]
+            snippet_text = str(first_child.children[0].children)  # type: ignore
             is_numbered = False
             if (
-                parent_element is not None
-                and isinstance(parent_element, DocItem)
-                and parent_element.label == GroupLabel.ORDERED_LIST
+                parent_item is not None
+                and isinstance(parent_item, DocItem)
+                and parent_item.label == GroupLabel.ORDERED_LIST
             ):
                 is_numbered = True
             doc.add_list_item(
-                enumerated=is_numbered, parent=parent_element, text=snippet_text
+                enumerated=is_numbered, parent=parent_item, text=snippet_text
             )
+            visited.add(first_child)
 
         elif isinstance(element, marko.inline.Image):
-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
             _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
 
             fig_caption: Optional[TextItem] = None
@@ -255,10 +262,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                     label=DocItemLabel.CAPTION, text=element.title
                 )
 
-            doc.add_picture(parent=parent_element, caption=fig_caption)
+            doc.add_picture(parent=parent_item, caption=fig_caption)
 
         elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
-            self.process_inline_text(parent_element, doc)
+            self._process_inline_text(parent_item, doc)
 
         elif isinstance(element, marko.inline.RawText):
             _log.debug(f" - Paragraph (raw text): {element.children}")
@@ -272,17 +279,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 else:
                     self.md_table_buffer.append(snippet_text)
             else:
-                self.close_table(doc)
-                self.in_table = False
+                self._close_table(doc)
                 # most likely just inline text
                 self.inline_texts.append(str(element.children))
 
         elif isinstance(element, marko.inline.CodeSpan):
-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
             _log.debug(f" - Code Span: {element.children}")
             snippet_text = str(element.children).strip()
-            doc.add_code(parent=parent_element, text=snippet_text)
+            doc.add_code(parent=parent_item, text=snippet_text)
 
         elif (
             isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
@@ -290,10 +296,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             and isinstance((first_child := element.children[0]), marko.inline.RawText)
             and len(snippet_text := (first_child.children.strip())) > 0
         ):
-            self.close_table(doc)
-            self.process_inline_text(parent_element, doc)
+            self._close_table(doc)
+            self._process_inline_text(parent_item, doc)
             _log.debug(f" - Code Block: {element.children}")
-            doc.add_code(parent=parent_element, text=snippet_text)
+            doc.add_code(parent=parent_item, text=snippet_text)
 
         elif isinstance(element, marko.inline.LineBreak):
             if self.in_table:
@@ -302,8 +308,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
 
         elif isinstance(element, marko.block.HTMLBlock):
             self._html_blocks += 1
-            self.process_inline_text(parent_element, doc)
-            self.close_table(doc)
+            self._process_inline_text(parent_item, doc)
+            self._close_table(doc)
             _log.debug("HTML Block: {}".format(element))
             if (
                 len(element.body) > 0
@@ -312,18 +318,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
 
                 # wrap in markers to enable post-processing in convert()
                 text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
-                doc.add_code(parent=parent_element, text=text_to_add)
+                doc.add_code(parent=parent_item, text=text_to_add)
         else:
             if not isinstance(element, str):
-                self.close_table(doc)
+                self._close_table(doc)
                 _log.debug("Some other element: {}".format(element))
 
         processed_block_types = (
-            marko.block.ListItem,
             marko.block.Heading,
             marko.block.CodeBlock,
             marko.block.FencedCode,
-            # marko.block.Paragraph,
             marko.inline.RawText,
         )
 
@@ -332,7 +336,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             element, processed_block_types
         ):
             for child in element.children:
-                self.iterate_elements(child, depth + 1, doc, parent_element)
+                self._iterate_elements(
+                    element=child,
+                    depth=depth + 1,
+                    doc=doc,
+                    visited=visited,
+                    parent_item=parent_item,
+                )
 
     def is_valid(self) -> bool:
         return self.valid
@@ -366,9 +376,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             marko_parser = Markdown()
             parsed_ast = marko_parser.parse(self.markdown)
             # Start iterating from the root of the AST
-            self.iterate_elements(parsed_ast, 0, doc, None)
-            self.process_inline_text(None, doc)  # handle last hanging inline text
-            self.close_table(doc=doc)  # handle any last hanging table
+            self._iterate_elements(
+                element=parsed_ast,
+                depth=0,
+                doc=doc,
+                parent_item=None,
+                visited=set(),
+            )
+            self._process_inline_text(None, doc)  # handle last hanging inline text
+            self._close_table(doc=doc)  # handle any last hanging table
 
             # if HTML blocks were detected, export to HTML and delegate to HTML backend
             if self._html_blocks > 0:
diff --git a/tests/data/groundtruth/docling_v2/nested.md.md b/tests/data/groundtruth/docling_v2/nested.md.md
new file mode 100644
index 0000000..6e430e0
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/nested.md.md
@@ -0,0 +1,31 @@
+# Nesting
+
+A list featuring nesting:
+
+- abc
+    - abc123
+        - abc1234
+            - abc12345
+                - a.
+                - b.
+        - abcd1234：
+            - abcd12345：
+                - a.
+                - b.
+- def：
+    - def1234：
+        - def12345。
+- after one empty line
+    - foo
+- afer two empty lines
+    - bar
+
+- changing symbol
+
+A nested HTML list:
+
+- First item
+- Second item with subitems:
+    - Subitem 1
+    - Subitem 2
+- Last list item
diff --git a/tests/data/md/nested.md b/tests/data/md/nested.md
new file mode 100644
index 0000000..4e203ee
--- /dev/null
+++ b/tests/data/md/nested.md
@@ -0,0 +1,66 @@
+# Nesting
+
+A list featuring nesting:
+
+- abc
+	- abc123
+		- abc1234
+			- abc12345
+				- a.
+				- b.
+		- abcd1234：
+			- abcd12345：
+				- a.
+				- b.
+- def：
+	- def1234：
+		- def12345。
+
+- after one empty line
+	- foo
+
+
+- afer two empty lines
+	- bar
+* changing symbol
+
+A nested HTML list:
+
+<ul>
+    <li>First item</li>
+    <li>Second item with subitems:
+        <ul>
+            <li>Subitem 1</li>
+            <li>Subitem 2</li>
+        </ul>
+    </li>
+    <li>Last list item</li>
+</ul>
+
+<!--
+Table nesting apparently not yet supported by HTML backend:
+
+<table>
+  <tr>
+    <td>Cell</td>
+    <td>Nested Table
+      <table>
+        <tr>
+          <td>Cell 1</td>
+		  <>
+        </tr>
+        <tr>
+          <td>Cell 2</td>
+        </tr>
+        <tr>
+          <td>Cell 3</td>
+        </tr>
+        <tr>
+          <td>Cell 4</td>
+        </tr>
+      </table>
+    </td>
+  </tr>
+  <tr><td>additional row</td></tr>
+</table>
+-->
diff --git a/tests/test_backend_markdown.py b/tests/test_backend_markdown.py
index caa94d9..5a201ab 100644
--- a/tests/test_backend_markdown.py
+++ b/tests/test_backend_markdown.py
@@ -4,6 +4,8 @@ from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument
 
+from .test_data_gen_flag import GEN_TEST_DATA
+
 
 def test_convert_valid():
     fmt = InputFormat.MD
@@ -30,6 +32,10 @@ def test_convert_valid():
         act_doc = backend.convert()
         act_data = act_doc.export_to_markdown()
 
-        with open(gt_path, "r", encoding="utf-8") as f:
-            exp_data = f.read().rstrip()
-        assert act_data == exp_data
+        if GEN_TEST_DATA:
+            with open(gt_path, mode="w", encoding="utf-8") as f:
+                f.write(f"{act_data}\n")
+        else:
+            with open(gt_path, encoding="utf-8") as f:
+                exp_data = f.read().rstrip()
+            assert exp_data == act_data
diff --git a/tests/test_data_gen_flag.py b/tests/test_data_gen_flag.py
new file mode 100644
index 0000000..a4baff6
--- /dev/null
+++ b/tests/test_data_gen_flag.py
@@ -0,0 +1,9 @@
+import os
+
+from pydantic import TypeAdapter
+
+GEN_TEST_DATA = TypeAdapter(bool).validate_python(os.getenv("DOCLING_GEN_TEST_DATA", 0))
+
+
+def test_gen_test_data_flag():
+    assert not GEN_TEST_DATA