fix(markdown): fix empty block handling (#843)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
Panos Vagenas 2025-01-30 16:22:29 +01:00 committed by GitHub
parent fea0a99a95
commit bccb022fc8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 114 additions and 30 deletions

View File

@ -6,6 +6,7 @@ from pathlib import Path
from typing import List, Optional, Set, Union
import marko
import marko.element
import marko.ext
import marko.ext.gfm
import marko.inline
@ -163,14 +164,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
def iterate_elements(
self,
element: marko.block.Element,
element: marko.element.Element,
depth: int,
doc: DoclingDocument,
parent_element: Optional[NodeItem] = None,
):
# Iterates over all elements in the AST
# Check for different element types and process relevant details
if isinstance(element, marko.block.Heading):
if isinstance(element, marko.block.Heading) and len(element.children) > 0:
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(
@ -205,17 +206,22 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
)
elif isinstance(element, marko.block.List):
has_non_empty_list_items = False
for child in element.children:
if isinstance(child, marko.block.ListItem) and len(child.children) > 0:
has_non_empty_list_items = True
break
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
list_label = GroupLabel.LIST
if element.ordered:
list_label = GroupLabel.ORDERED_LIST
parent_element = doc.add_group(
label=list_label, name=f"list", parent=parent_element
)
if has_non_empty_list_items:
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
parent_element = doc.add_group(
label=label, name=f"list", parent=parent_element
)
elif isinstance(element, marko.block.ListItem):
elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(" - List item")
@ -245,20 +251,18 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
doc.add_picture(parent=parent_element, caption=fig_caption)
elif isinstance(element, marko.block.Paragraph):
elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
self.process_inline_text(parent_element, doc)
elif isinstance(element, marko.inline.RawText):
_log.debug(f" - Paragraph (raw text): {element.children}")
snippet_text = str(element.children).strip()
snippet_text = element.children.strip()
# Detect start of the table:
if "|" in snippet_text:
# most likely part of the markdown table
self.in_table = True
if len(self.md_table_buffer) > 0:
self.md_table_buffer[len(self.md_table_buffer) - 1] += str(
snippet_text
)
self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
else:
self.md_table_buffer.append(snippet_text)
else:
@ -274,18 +278,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
snippet_text = str(element.children).strip()
doc.add_code(parent=parent_element, text=snippet_text)
elif isinstance(element, marko.block.CodeBlock):
elif (
isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
and len(element.children) > 0
and isinstance((first_child := element.children[0]), marko.inline.RawText)
and len(snippet_text := (first_child.children.strip())) > 0
):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(f" - Code Block: {element.children}")
snippet_text = str(element.children[0].children).strip() # type: ignore
doc.add_code(parent=parent_element, text=snippet_text)
elif isinstance(element, marko.block.FencedCode):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(f" - Code Block: {element.children}")
snippet_text = str(element.children[0].children).strip() # type: ignore
doc.add_code(parent=parent_element, text=snippet_text)
elif isinstance(element, marko.inline.LineBreak):
@ -309,14 +310,21 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self.close_table(doc)
_log.debug("Some other element: {}".format(element))
processed_block_types = (
marko.block.ListItem,
marko.block.Heading,
marko.block.CodeBlock,
marko.block.FencedCode,
# marko.block.Paragraph,
marko.inline.RawText,
)
# Iterate through the element's children (if any)
if not isinstance(element, marko.block.ListItem):
if not isinstance(element, marko.block.Heading):
if not isinstance(element, marko.block.FencedCode):
# if not isinstance(element, marko.block.Paragraph):
if hasattr(element, "children"):
for child in element.children:
self.iterate_elements(child, depth + 1, doc, parent_element)
if hasattr(element, "children") and not isinstance(
element, processed_block_types
):
for child in element.children:
self.iterate_elements(child, depth + 1, doc, parent_element)
def is_valid(self) -> bool:
    """Return the value of this instance's ``valid`` attribute."""
    return self.valid

View File

@ -0,0 +1,33 @@
Unordered list:
- foo
Empty unordered list:
Ordered list:
- bar
Empty ordered list:
Heading:
# my heading
Empty heading:
Indented code block:
```
print("Hi!")
```
Empty indented code block:
Fenced code block:
```
print("Hello world!")
```
Empty fenced code block:

43
tests/data/md/blocks.md Normal file
View File

@ -0,0 +1,43 @@
Unordered list:
- foo
Empty unordered list:
-
Ordered list:
1. bar
Empty ordered list:
1.
Heading:
# my heading
Empty heading:
#
Indented code block:
print("Hi!")
Empty indented code block:
Fenced code block:
```python
print("Hello world!")
```
Empty fenced code block:
```
```