fix(markdown): handle nested lists (#910)
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
parent
9114ada7bc
commit
90b766e2ae
@ -36,7 +36,7 @@ _STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
|
||||
|
||||
|
||||
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
|
||||
def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
|
||||
# This regex will match any sequence of underscores
|
||||
pattern = r"_+"
|
||||
|
||||
@ -81,7 +81,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
# very long sequences of underscores will lead to unnecessarily long processing times.
|
||||
# In any proper Markdown files, underscores have to be escaped,
|
||||
# otherwise they represent emphasis (bold or italic)
|
||||
self.markdown = self.shorten_underscore_sequences(text_stream)
|
||||
self.markdown = self._shorten_underscore_sequences(text_stream)
|
||||
if isinstance(self.path_or_stream, Path):
|
||||
with open(self.path_or_stream, "r", encoding="utf-8") as f:
|
||||
md_content = f.read()
|
||||
@ -89,7 +89,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
# very long sequences of underscores will lead to unnecessarily long processing times.
|
||||
# In any proper Markdown files, underscores have to be escaped,
|
||||
# otherwise they represent emphasis (bold or italic)
|
||||
self.markdown = self.shorten_underscore_sequences(md_content)
|
||||
self.markdown = self._shorten_underscore_sequences(md_content)
|
||||
self.valid = True
|
||||
|
||||
_log.debug(self.markdown)
|
||||
@ -99,7 +99,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
) from e
|
||||
return
|
||||
|
||||
def close_table(self, doc: DoclingDocument):
|
||||
def _close_table(self, doc: DoclingDocument):
|
||||
if self.in_table:
|
||||
_log.debug("=== TABLE START ===")
|
||||
for md_table_row in self.md_table_buffer:
|
||||
@ -156,30 +156,35 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
doc.add_table(data=table_data)
|
||||
return
|
||||
|
||||
def process_inline_text(
|
||||
self, parent_element: Optional[NodeItem], doc: DoclingDocument
|
||||
def _process_inline_text(
|
||||
self, parent_item: Optional[NodeItem], doc: DoclingDocument
|
||||
):
|
||||
txt = " ".join(self.inline_texts)
|
||||
if len(txt) > 0:
|
||||
doc.add_text(
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
parent=parent_element,
|
||||
parent=parent_item,
|
||||
text=txt,
|
||||
)
|
||||
self.inline_texts = []
|
||||
|
||||
def iterate_elements(
|
||||
def _iterate_elements(
|
||||
self,
|
||||
element: marko.element.Element,
|
||||
depth: int,
|
||||
doc: DoclingDocument,
|
||||
parent_element: Optional[NodeItem] = None,
|
||||
visited: Set[marko.element.Element],
|
||||
parent_item: Optional[NodeItem] = None,
|
||||
):
|
||||
|
||||
if element in visited:
|
||||
return
|
||||
|
||||
# Iterates over all elements in the AST
|
||||
# Check for different element types and process relevant details
|
||||
if isinstance(element, marko.block.Heading) and len(element.children) > 0:
|
||||
self.close_table(doc)
|
||||
self.process_inline_text(parent_element, doc)
|
||||
self._close_table(doc)
|
||||
self._process_inline_text(parent_item, doc)
|
||||
_log.debug(
|
||||
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
|
||||
)
|
||||
@ -207,8 +212,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
traverse(element)
|
||||
snippet_text = "".join(strings)
|
||||
if len(snippet_text) > 0:
|
||||
parent_element = doc.add_text(
|
||||
label=doc_label, parent=parent_element, text=snippet_text
|
||||
parent_item = doc.add_text(
|
||||
label=doc_label, parent=parent_item, text=snippet_text
|
||||
)
|
||||
|
||||
elif isinstance(element, marko.block.List):
|
||||
@ -218,35 +223,37 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
has_non_empty_list_items = True
|
||||
break
|
||||
|
||||
self.close_table(doc)
|
||||
self.process_inline_text(parent_element, doc)
|
||||
self._close_table(doc)
|
||||
self._process_inline_text(parent_item, doc)
|
||||
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
|
||||
if has_non_empty_list_items:
|
||||
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
|
||||
parent_element = doc.add_group(
|
||||
label=label, name=f"list", parent=parent_element
|
||||
parent_item = doc.add_group(
|
||||
label=label, name=f"list", parent=parent_item
|
||||
)
|
||||
|
||||
elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
|
||||
self.close_table(doc)
|
||||
self.process_inline_text(parent_element, doc)
|
||||
self._close_table(doc)
|
||||
self._process_inline_text(parent_item, doc)
|
||||
_log.debug(" - List item")
|
||||
|
||||
snippet_text = str(element.children[0].children[0].children) # type: ignore
|
||||
first_child = element.children[0]
|
||||
snippet_text = str(first_child.children[0].children) # type: ignore
|
||||
is_numbered = False
|
||||
if (
|
||||
parent_element is not None
|
||||
and isinstance(parent_element, DocItem)
|
||||
and parent_element.label == GroupLabel.ORDERED_LIST
|
||||
parent_item is not None
|
||||
and isinstance(parent_item, DocItem)
|
||||
and parent_item.label == GroupLabel.ORDERED_LIST
|
||||
):
|
||||
is_numbered = True
|
||||
doc.add_list_item(
|
||||
enumerated=is_numbered, parent=parent_element, text=snippet_text
|
||||
enumerated=is_numbered, parent=parent_item, text=snippet_text
|
||||
)
|
||||
visited.add(first_child)
|
||||
|
||||
elif isinstance(element, marko.inline.Image):
|
||||
self.close_table(doc)
|
||||
self.process_inline_text(parent_element, doc)
|
||||
self._close_table(doc)
|
||||
self._process_inline_text(parent_item, doc)
|
||||
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
|
||||
|
||||
fig_caption: Optional[TextItem] = None
|
||||
@ -255,10 +262,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
label=DocItemLabel.CAPTION, text=element.title
|
||||
)
|
||||
|
||||
doc.add_picture(parent=parent_element, caption=fig_caption)
|
||||
doc.add_picture(parent=parent_item, caption=fig_caption)
|
||||
|
||||
elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
|
||||
self.process_inline_text(parent_element, doc)
|
||||
self._process_inline_text(parent_item, doc)
|
||||
|
||||
elif isinstance(element, marko.inline.RawText):
|
||||
_log.debug(f" - Paragraph (raw text): {element.children}")
|
||||
@ -272,17 +279,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
else:
|
||||
self.md_table_buffer.append(snippet_text)
|
||||
else:
|
||||
self.close_table(doc)
|
||||
self.in_table = False
|
||||
self._close_table(doc)
|
||||
# most likely just inline text
|
||||
self.inline_texts.append(str(element.children))
|
||||
|
||||
elif isinstance(element, marko.inline.CodeSpan):
|
||||
self.close_table(doc)
|
||||
self.process_inline_text(parent_element, doc)
|
||||
self._close_table(doc)
|
||||
self._process_inline_text(parent_item, doc)
|
||||
_log.debug(f" - Code Span: {element.children}")
|
||||
snippet_text = str(element.children).strip()
|
||||
doc.add_code(parent=parent_element, text=snippet_text)
|
||||
doc.add_code(parent=parent_item, text=snippet_text)
|
||||
|
||||
elif (
|
||||
isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
|
||||
@ -290,10 +296,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
and isinstance((first_child := element.children[0]), marko.inline.RawText)
|
||||
and len(snippet_text := (first_child.children.strip())) > 0
|
||||
):
|
||||
self.close_table(doc)
|
||||
self.process_inline_text(parent_element, doc)
|
||||
self._close_table(doc)
|
||||
self._process_inline_text(parent_item, doc)
|
||||
_log.debug(f" - Code Block: {element.children}")
|
||||
doc.add_code(parent=parent_element, text=snippet_text)
|
||||
doc.add_code(parent=parent_item, text=snippet_text)
|
||||
|
||||
elif isinstance(element, marko.inline.LineBreak):
|
||||
if self.in_table:
|
||||
@ -302,8 +308,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
elif isinstance(element, marko.block.HTMLBlock):
|
||||
self._html_blocks += 1
|
||||
self.process_inline_text(parent_element, doc)
|
||||
self.close_table(doc)
|
||||
self._process_inline_text(parent_item, doc)
|
||||
self._close_table(doc)
|
||||
_log.debug("HTML Block: {}".format(element))
|
||||
if (
|
||||
len(element.body) > 0
|
||||
@ -312,18 +318,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
# wrap in markers to enable post-processing in convert()
|
||||
text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
|
||||
doc.add_code(parent=parent_element, text=text_to_add)
|
||||
doc.add_code(parent=parent_item, text=text_to_add)
|
||||
else:
|
||||
if not isinstance(element, str):
|
||||
self.close_table(doc)
|
||||
self._close_table(doc)
|
||||
_log.debug("Some other element: {}".format(element))
|
||||
|
||||
processed_block_types = (
|
||||
marko.block.ListItem,
|
||||
marko.block.Heading,
|
||||
marko.block.CodeBlock,
|
||||
marko.block.FencedCode,
|
||||
# marko.block.Paragraph,
|
||||
marko.inline.RawText,
|
||||
)
|
||||
|
||||
@ -332,7 +336,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
element, processed_block_types
|
||||
):
|
||||
for child in element.children:
|
||||
self.iterate_elements(child, depth + 1, doc, parent_element)
|
||||
self._iterate_elements(
|
||||
element=child,
|
||||
depth=depth + 1,
|
||||
doc=doc,
|
||||
visited=visited,
|
||||
parent_item=parent_item,
|
||||
)
|
||||
|
||||
def is_valid(self) -> bool:
|
||||
return self.valid
|
||||
@ -366,9 +376,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
marko_parser = Markdown()
|
||||
parsed_ast = marko_parser.parse(self.markdown)
|
||||
# Start iterating from the root of the AST
|
||||
self.iterate_elements(parsed_ast, 0, doc, None)
|
||||
self.process_inline_text(None, doc) # handle last hanging inline text
|
||||
self.close_table(doc=doc) # handle any last hanging table
|
||||
self._iterate_elements(
|
||||
element=parsed_ast,
|
||||
depth=0,
|
||||
doc=doc,
|
||||
parent_item=None,
|
||||
visited=set(),
|
||||
)
|
||||
self._process_inline_text(None, doc) # handle last hanging inline text
|
||||
self._close_table(doc=doc) # handle any last hanging table
|
||||
|
||||
# if HTML blocks were detected, export to HTML and delegate to HTML backend
|
||||
if self._html_blocks > 0:
|
||||
|
31
tests/data/groundtruth/docling_v2/nested.md.md
Normal file
31
tests/data/groundtruth/docling_v2/nested.md.md
Normal file
@ -0,0 +1,31 @@
|
||||
# Nesting
|
||||
|
||||
A list featuring nesting:
|
||||
|
||||
- abc
|
||||
- abc123
|
||||
- abc1234
|
||||
- abc12345
|
||||
- a.
|
||||
- b.
|
||||
- abcd1234:
|
||||
- abcd12345:
|
||||
- a.
|
||||
- b.
|
||||
- def:
|
||||
- def1234:
|
||||
- def12345。
|
||||
- after one empty line
|
||||
- foo
|
||||
- afer two empty lines
|
||||
- bar
|
||||
|
||||
- changing symbol
|
||||
|
||||
A nested HTML list:
|
||||
|
||||
- First item
|
||||
- Second item with subitems:
|
||||
- Subitem 1
|
||||
- Subitem 2
|
||||
- Last list item
|
66
tests/data/md/nested.md
Normal file
66
tests/data/md/nested.md
Normal file
@ -0,0 +1,66 @@
|
||||
# Nesting
|
||||
|
||||
A list featuring nesting:
|
||||
|
||||
- abc
|
||||
- abc123
|
||||
- abc1234
|
||||
- abc12345
|
||||
- a.
|
||||
- b.
|
||||
- abcd1234:
|
||||
- abcd12345:
|
||||
- a.
|
||||
- b.
|
||||
- def:
|
||||
- def1234:
|
||||
- def12345。
|
||||
|
||||
- after one empty line
|
||||
- foo
|
||||
|
||||
|
||||
- afer two empty lines
|
||||
- bar
|
||||
* changing symbol
|
||||
|
||||
A nested HTML list:
|
||||
|
||||
<ul>
|
||||
<li>First item</li>
|
||||
<li>Second item with subitems:
|
||||
<ul>
|
||||
<li>Subitem 1</li>
|
||||
<li>Subitem 2</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li>Last list item</li>
|
||||
</ul>
|
||||
|
||||
<!--
|
||||
Table nesting apparently not yet supported by HTML backend:
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<td>Cell</td>
|
||||
<td>Nested Table
|
||||
<table>
|
||||
<tr>
|
||||
<td>Cell 1</td>
|
||||
<>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Cell 2</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Cell 3</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Cell 4</td>
|
||||
</tr>
|
||||
</table>
|
||||
</td>
|
||||
</tr>
|
||||
<tr><td>additional row</td></tr>
|
||||
</table>
|
||||
-->
|
@ -4,6 +4,8 @@ from docling.backend.md_backend import MarkdownDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
from .test_data_gen_flag import GEN_TEST_DATA
|
||||
|
||||
|
||||
def test_convert_valid():
|
||||
fmt = InputFormat.MD
|
||||
@ -30,6 +32,10 @@ def test_convert_valid():
|
||||
act_doc = backend.convert()
|
||||
act_data = act_doc.export_to_markdown()
|
||||
|
||||
with open(gt_path, "r", encoding="utf-8") as f:
|
||||
exp_data = f.read().rstrip()
|
||||
assert act_data == exp_data
|
||||
if GEN_TEST_DATA:
|
||||
with open(gt_path, mode="w", encoding="utf-8") as f:
|
||||
f.write(f"{act_data}\n")
|
||||
else:
|
||||
with open(gt_path, encoding="utf-8") as f:
|
||||
exp_data = f.read().rstrip()
|
||||
assert exp_data == act_data
|
||||
|
9
tests/test_data_gen_flag.py
Normal file
9
tests/test_data_gen_flag.py
Normal file
@ -0,0 +1,9 @@
|
||||
import os
|
||||
|
||||
from pydantic import TypeAdapter
|
||||
|
||||
GEN_TEST_DATA = TypeAdapter(bool).validate_python(os.getenv("DOCLING_GEN_TEST_DATA", 0))
|
||||
|
||||
|
||||
def test_gen_test_data_flag():
|
||||
assert not GEN_TEST_DATA
|
Loading…
Reference in New Issue
Block a user