fix(markdown): fix empty block handling (#843)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
Panos Vagenas 2025-01-30 16:22:29 +01:00 committed by GitHub
parent fea0a99a95
commit bccb022fc8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 114 additions and 30 deletions

View File

@ -6,6 +6,7 @@ from pathlib import Path
from typing import List, Optional, Set, Union from typing import List, Optional, Set, Union
import marko import marko
import marko.element
import marko.ext import marko.ext
import marko.ext.gfm import marko.ext.gfm
import marko.inline import marko.inline
@ -163,14 +164,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
def iterate_elements( def iterate_elements(
self, self,
element: marko.block.Element, element: marko.element.Element,
depth: int, depth: int,
doc: DoclingDocument, doc: DoclingDocument,
parent_element: Optional[NodeItem] = None, parent_element: Optional[NodeItem] = None,
): ):
# Iterates over all elements in the AST # Iterates over all elements in the AST
# Check for different element types and process relevant details # Check for different element types and process relevant details
if isinstance(element, marko.block.Heading): if isinstance(element, marko.block.Heading) and len(element.children) > 0:
self.close_table(doc) self.close_table(doc)
self.process_inline_text(parent_element, doc) self.process_inline_text(parent_element, doc)
_log.debug( _log.debug(
@ -205,17 +206,22 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
) )
elif isinstance(element, marko.block.List): elif isinstance(element, marko.block.List):
has_non_empty_list_items = False
for child in element.children:
if isinstance(child, marko.block.ListItem) and len(child.children) > 0:
has_non_empty_list_items = True
break
self.close_table(doc) self.close_table(doc)
self.process_inline_text(parent_element, doc) self.process_inline_text(parent_element, doc)
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}") _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
list_label = GroupLabel.LIST if has_non_empty_list_items:
if element.ordered: label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
list_label = GroupLabel.ORDERED_LIST parent_element = doc.add_group(
parent_element = doc.add_group( label=label, name=f"list", parent=parent_element
label=list_label, name=f"list", parent=parent_element )
)
elif isinstance(element, marko.block.ListItem): elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
self.close_table(doc) self.close_table(doc)
self.process_inline_text(parent_element, doc) self.process_inline_text(parent_element, doc)
_log.debug(" - List item") _log.debug(" - List item")
@ -245,20 +251,18 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
doc.add_picture(parent=parent_element, caption=fig_caption) doc.add_picture(parent=parent_element, caption=fig_caption)
elif isinstance(element, marko.block.Paragraph): elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
self.process_inline_text(parent_element, doc) self.process_inline_text(parent_element, doc)
elif isinstance(element, marko.inline.RawText): elif isinstance(element, marko.inline.RawText):
_log.debug(f" - Paragraph (raw text): {element.children}") _log.debug(f" - Paragraph (raw text): {element.children}")
snippet_text = str(element.children).strip() snippet_text = element.children.strip()
# Detect start of the table: # Detect start of the table:
if "|" in snippet_text: if "|" in snippet_text:
# most likely part of the markdown table # most likely part of the markdown table
self.in_table = True self.in_table = True
if len(self.md_table_buffer) > 0: if len(self.md_table_buffer) > 0:
self.md_table_buffer[len(self.md_table_buffer) - 1] += str( self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
snippet_text
)
else: else:
self.md_table_buffer.append(snippet_text) self.md_table_buffer.append(snippet_text)
else: else:
@ -274,18 +278,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
snippet_text = str(element.children).strip() snippet_text = str(element.children).strip()
doc.add_code(parent=parent_element, text=snippet_text) doc.add_code(parent=parent_element, text=snippet_text)
elif isinstance(element, marko.block.CodeBlock): elif (
isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
and len(element.children) > 0
and isinstance((first_child := element.children[0]), marko.inline.RawText)
and len(snippet_text := (first_child.children.strip())) > 0
):
self.close_table(doc) self.close_table(doc)
self.process_inline_text(parent_element, doc) self.process_inline_text(parent_element, doc)
_log.debug(f" - Code Block: {element.children}") _log.debug(f" - Code Block: {element.children}")
snippet_text = str(element.children[0].children).strip() # type: ignore
doc.add_code(parent=parent_element, text=snippet_text)
elif isinstance(element, marko.block.FencedCode):
self.close_table(doc)
self.process_inline_text(parent_element, doc)
_log.debug(f" - Code Block: {element.children}")
snippet_text = str(element.children[0].children).strip() # type: ignore
doc.add_code(parent=parent_element, text=snippet_text) doc.add_code(parent=parent_element, text=snippet_text)
elif isinstance(element, marko.inline.LineBreak): elif isinstance(element, marko.inline.LineBreak):
@ -309,14 +310,21 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self.close_table(doc) self.close_table(doc)
_log.debug("Some other element: {}".format(element)) _log.debug("Some other element: {}".format(element))
processed_block_types = (
marko.block.ListItem,
marko.block.Heading,
marko.block.CodeBlock,
marko.block.FencedCode,
# marko.block.Paragraph,
marko.inline.RawText,
)
# Iterate through the element's children (if any) # Iterate through the element's children (if any)
if not isinstance(element, marko.block.ListItem): if hasattr(element, "children") and not isinstance(
if not isinstance(element, marko.block.Heading): element, processed_block_types
if not isinstance(element, marko.block.FencedCode): ):
# if not isinstance(element, marko.block.Paragraph): for child in element.children:
if hasattr(element, "children"): self.iterate_elements(child, depth + 1, doc, parent_element)
for child in element.children:
self.iterate_elements(child, depth + 1, doc, parent_element)
def is_valid(self) -> bool: def is_valid(self) -> bool:
return self.valid return self.valid

View File

@ -0,0 +1,33 @@
Unordered list:
- foo
Empty unordered list:
Ordered list:
- bar
Empty ordered list:
Heading:
# my heading
Empty heading:
Indented code block:
```
print("Hi!")
```
Empty indented code block:
Fenced code block:
```
print("Hello world!")
```
Empty fenced code block:

43
tests/data/md/blocks.md Normal file
View File

@ -0,0 +1,43 @@
Unordered list:
- foo
Empty unordered list:
-
Ordered list:
1. bar
Empty ordered list:
1.
Heading:
# my heading
Empty heading:
#
Indented code block:
print("Hi!")
Empty indented code block:
Fenced code block:
```python
print("Hello world!")
```
Empty fenced code block:
```
```