fix(markdown): handle nested lists (#910)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
Panos Vagenas 2025-02-07 12:55:12 +01:00 committed by GitHub
parent 9114ada7bc
commit 90b766e2ae
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 177 additions and 49 deletions

View File

@ -36,7 +36,7 @@ _STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
# This regex will match any sequence of underscores
pattern = r"_+"
@ -81,7 +81,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# very long sequences of underscores will lead to unnecessarily long processing times.
# In any proper Markdown files, underscores have to be escaped,
# otherwise they represent emphasis (bold or italic)
self.markdown = self.shorten_underscore_sequences(text_stream)
self.markdown = self._shorten_underscore_sequences(text_stream)
if isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, "r", encoding="utf-8") as f:
md_content = f.read()
@ -89,7 +89,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# very long sequences of underscores will lead to unnecessarily long processing times.
# In any proper Markdown files, underscores have to be escaped,
# otherwise they represent emphasis (bold or italic)
self.markdown = self.shorten_underscore_sequences(md_content)
self.markdown = self._shorten_underscore_sequences(md_content)
self.valid = True
_log.debug(self.markdown)
@ -99,7 +99,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
) from e
return
def close_table(self, doc: DoclingDocument):
def _close_table(self, doc: DoclingDocument):
if self.in_table:
_log.debug("=== TABLE START ===")
for md_table_row in self.md_table_buffer:
@ -156,30 +156,35 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
doc.add_table(data=table_data)
return
def process_inline_text(
self, parent_element: Optional[NodeItem], doc: DoclingDocument
def _process_inline_text(
self, parent_item: Optional[NodeItem], doc: DoclingDocument
):
txt = " ".join(self.inline_texts)
if len(txt) > 0:
doc.add_text(
label=DocItemLabel.PARAGRAPH,
parent=parent_element,
parent=parent_item,
text=txt,
)
self.inline_texts = []
def iterate_elements(
def _iterate_elements(
self,
element: marko.element.Element,
depth: int,
doc: DoclingDocument,
parent_element: Optional[NodeItem] = None,
visited: Set[marko.element.Element],
parent_item: Optional[NodeItem] = None,
):
if element in visited:
return
# Iterates over all elements in the AST
# Check for different element types and process relevant details
if isinstance(element, marko.block.Heading) and len(element.children) > 0:
self.close_table(doc)
self.process_inline_text(parent_element, doc)
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
)
@ -207,8 +212,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
traverse(element)
snippet_text = "".join(strings)
if len(snippet_text) > 0:
parent_element = doc.add_text(
label=doc_label, parent=parent_element, text=snippet_text
parent_item = doc.add_text(
label=doc_label, parent=parent_item, text=snippet_text
)
elif isinstance(element, marko.block.List):
@ -218,35 +223,37 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
has_non_empty_list_items = True
break
self.close_table(doc)
self.process_inline_text(parent_element, doc)
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
if has_non_empty_list_items:
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
parent_element = doc.add_group(
label=label, name=f"list", parent=parent_element
parent_item = doc.add_group(
label=label, name=f"list", parent=parent_item
)
elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
self.close_table(doc)
self.process_inline_text(parent_element, doc)
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(" - List item")
snippet_text = str(element.children[0].children[0].children) # type: ignore
first_child = element.children[0]
snippet_text = str(first_child.children[0].children) # type: ignore
is_numbered = False
if (
parent_element is not None
and isinstance(parent_element, DocItem)
and parent_element.label == GroupLabel.ORDERED_LIST
parent_item is not None
and isinstance(parent_item, DocItem)
and parent_item.label == GroupLabel.ORDERED_LIST
):
is_numbered = True
doc.add_list_item(
enumerated=is_numbered, parent=parent_element, text=snippet_text
enumerated=is_numbered, parent=parent_item, text=snippet_text
)
visited.add(first_child)
elif isinstance(element, marko.inline.Image):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
fig_caption: Optional[TextItem] = None
@ -255,10 +262,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
label=DocItemLabel.CAPTION, text=element.title
)
doc.add_picture(parent=parent_element, caption=fig_caption)
doc.add_picture(parent=parent_item, caption=fig_caption)
elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
self.process_inline_text(parent_element, doc)
self._process_inline_text(parent_item, doc)
elif isinstance(element, marko.inline.RawText):
_log.debug(f" - Paragraph (raw text): {element.children}")
@ -272,17 +279,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
else:
self.md_table_buffer.append(snippet_text)
else:
self.close_table(doc)
self.in_table = False
self._close_table(doc)
# most likely just inline text
self.inline_texts.append(str(element.children))
elif isinstance(element, marko.inline.CodeSpan):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(f" - Code Span: {element.children}")
snippet_text = str(element.children).strip()
doc.add_code(parent=parent_element, text=snippet_text)
doc.add_code(parent=parent_item, text=snippet_text)
elif (
isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
@ -290,10 +296,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
and isinstance((first_child := element.children[0]), marko.inline.RawText)
and len(snippet_text := (first_child.children.strip())) > 0
):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
self._close_table(doc)
self._process_inline_text(parent_item, doc)
_log.debug(f" - Code Block: {element.children}")
doc.add_code(parent=parent_element, text=snippet_text)
doc.add_code(parent=parent_item, text=snippet_text)
elif isinstance(element, marko.inline.LineBreak):
if self.in_table:
@ -302,8 +308,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
elif isinstance(element, marko.block.HTMLBlock):
self._html_blocks += 1
self.process_inline_text(parent_element, doc)
self.close_table(doc)
self._process_inline_text(parent_item, doc)
self._close_table(doc)
_log.debug("HTML Block: {}".format(element))
if (
len(element.body) > 0
@ -312,18 +318,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# wrap in markers to enable post-processing in convert()
text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
doc.add_code(parent=parent_element, text=text_to_add)
doc.add_code(parent=parent_item, text=text_to_add)
else:
if not isinstance(element, str):
self.close_table(doc)
self._close_table(doc)
_log.debug("Some other element: {}".format(element))
processed_block_types = (
marko.block.ListItem,
marko.block.Heading,
marko.block.CodeBlock,
marko.block.FencedCode,
# marko.block.Paragraph,
marko.inline.RawText,
)
@ -332,7 +336,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
element, processed_block_types
):
for child in element.children:
self.iterate_elements(child, depth + 1, doc, parent_element)
self._iterate_elements(
element=child,
depth=depth + 1,
doc=doc,
visited=visited,
parent_item=parent_item,
)
def is_valid(self) -> bool:
return self.valid
@ -366,9 +376,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
marko_parser = Markdown()
parsed_ast = marko_parser.parse(self.markdown)
# Start iterating from the root of the AST
self.iterate_elements(parsed_ast, 0, doc, None)
self.process_inline_text(None, doc) # handle last hanging inline text
self.close_table(doc=doc) # handle any last hanging table
self._iterate_elements(
element=parsed_ast,
depth=0,
doc=doc,
parent_item=None,
visited=set(),
)
self._process_inline_text(None, doc) # handle last hanging inline text
self._close_table(doc=doc) # handle any last hanging table
# if HTML blocks were detected, export to HTML and delegate to HTML backend
if self._html_blocks > 0:

View File

@ -0,0 +1,31 @@
# Nesting
A list featuring nesting:
- abc
- abc123
- abc1234
- abc12345
- a.
- b.
- abcd1234
- abcd12345
- a.
- b.
- def
- def1234
- def12345。
- after one empty line
- foo
- afer two empty lines
- bar
- changing symbol
A nested HTML list:
- First item
- Second item with subitems:
- Subitem 1
- Subitem 2
- Last list item

66
tests/data/md/nested.md Normal file
View File

@ -0,0 +1,66 @@
# Nesting
A list featuring nesting:
- abc
- abc123
- abc1234
- abc12345
- a.
- b.
- abcd1234
- abcd12345
- a.
- b.
- def
- def1234
- def12345。
- after one empty line
- foo
- afer two empty lines
- bar
* changing symbol
A nested HTML list:
<ul>
<li>First item</li>
<li>Second item with subitems:
<ul>
<li>Subitem 1</li>
<li>Subitem 2</li>
</ul>
</li>
<li>Last list item</li>
</ul>
<!--
Table nesting apparently not yet supported by HTML backend:
<table>
<tr>
<td>Cell</td>
<td>Nested Table
<table>
<tr>
<td>Cell 1</td>
<>
</tr>
<tr>
<td>Cell 2</td>
</tr>
<tr>
<td>Cell 3</td>
</tr>
<tr>
<td>Cell 4</td>
</tr>
</table>
</td>
</tr>
<tr><td>additional row</td></tr>
</table>
-->

View File

@ -4,6 +4,8 @@ from docling.backend.md_backend import MarkdownDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
from .test_data_gen_flag import GEN_TEST_DATA
def test_convert_valid():
fmt = InputFormat.MD
@ -30,6 +32,10 @@ def test_convert_valid():
act_doc = backend.convert()
act_data = act_doc.export_to_markdown()
with open(gt_path, "r", encoding="utf-8") as f:
exp_data = f.read().rstrip()
assert act_data == exp_data
if GEN_TEST_DATA:
with open(gt_path, mode="w", encoding="utf-8") as f:
f.write(f"{act_data}\n")
else:
with open(gt_path, encoding="utf-8") as f:
exp_data = f.read().rstrip()
assert exp_data == act_data

View File

@ -0,0 +1,9 @@
import os
from pydantic import TypeAdapter
GEN_TEST_DATA = TypeAdapter(bool).validate_python(os.getenv("DOCLING_GEN_TEST_DATA", 0))
def test_gen_test_data_flag():
    """Fail the suite whenever ground-truth regeneration is switched on.

    ``GEN_TEST_DATA`` is parsed from the ``DOCLING_GEN_TEST_DATA``
    environment variable. Tests that honor the flag write their actual
    output to the ground-truth files instead of comparing against them,
    so a run with the flag enabled verifies nothing. This guard makes
    such a run fail visibly rather than pass silently.
    """
    assert not GEN_TEST_DATA, (
        "DOCLING_GEN_TEST_DATA is enabled: ground-truth files are being "
        "regenerated instead of verified; unset it for a real test run"
    )