fix(markdown): fix empty block handling (#843)
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
parent
fea0a99a95
commit
bccb022fc8
@ -6,6 +6,7 @@ from pathlib import Path
|
||||
from typing import List, Optional, Set, Union
|
||||
|
||||
import marko
|
||||
import marko.element
|
||||
import marko.ext
|
||||
import marko.ext.gfm
|
||||
import marko.inline
|
||||
@ -163,14 +164,14 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
def iterate_elements(
|
||||
self,
|
||||
element: marko.block.Element,
|
||||
element: marko.element.Element,
|
||||
depth: int,
|
||||
doc: DoclingDocument,
|
||||
parent_element: Optional[NodeItem] = None,
|
||||
):
|
||||
# Iterates over all elements in the AST
|
||||
# Check for different element types and process relevant details
|
||||
if isinstance(element, marko.block.Heading):
|
||||
if isinstance(element, marko.block.Heading) and len(element.children) > 0:
|
||||
self.close_table(doc)
|
||||
self.process_inline_text(parent_element, doc)
|
||||
_log.debug(
|
||||
@ -205,17 +206,22 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
|
||||
elif isinstance(element, marko.block.List):
|
||||
has_non_empty_list_items = False
|
||||
for child in element.children:
|
||||
if isinstance(child, marko.block.ListItem) and len(child.children) > 0:
|
||||
has_non_empty_list_items = True
|
||||
break
|
||||
|
||||
self.close_table(doc)
|
||||
self.process_inline_text(parent_element, doc)
|
||||
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
|
||||
list_label = GroupLabel.LIST
|
||||
if element.ordered:
|
||||
list_label = GroupLabel.ORDERED_LIST
|
||||
parent_element = doc.add_group(
|
||||
label=list_label, name=f"list", parent=parent_element
|
||||
)
|
||||
if has_non_empty_list_items:
|
||||
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
|
||||
parent_element = doc.add_group(
|
||||
label=label, name=f"list", parent=parent_element
|
||||
)
|
||||
|
||||
elif isinstance(element, marko.block.ListItem):
|
||||
elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
|
||||
self.close_table(doc)
|
||||
self.process_inline_text(parent_element, doc)
|
||||
_log.debug(" - List item")
|
||||
@ -245,20 +251,18 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
doc.add_picture(parent=parent_element, caption=fig_caption)
|
||||
|
||||
elif isinstance(element, marko.block.Paragraph):
|
||||
elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
|
||||
self.process_inline_text(parent_element, doc)
|
||||
|
||||
elif isinstance(element, marko.inline.RawText):
|
||||
_log.debug(f" - Paragraph (raw text): {element.children}")
|
||||
snippet_text = str(element.children).strip()
|
||||
snippet_text = element.children.strip()
|
||||
# Detect start of the table:
|
||||
if "|" in snippet_text:
|
||||
# most likely part of the markdown table
|
||||
self.in_table = True
|
||||
if len(self.md_table_buffer) > 0:
|
||||
self.md_table_buffer[len(self.md_table_buffer) - 1] += str(
|
||||
snippet_text
|
||||
)
|
||||
self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
|
||||
else:
|
||||
self.md_table_buffer.append(snippet_text)
|
||||
else:
|
||||
@ -274,18 +278,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
snippet_text = str(element.children).strip()
|
||||
doc.add_code(parent=parent_element, text=snippet_text)
|
||||
|
||||
elif isinstance(element, marko.block.CodeBlock):
|
||||
elif (
|
||||
isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
|
||||
and len(element.children) > 0
|
||||
and isinstance((first_child := element.children[0]), marko.inline.RawText)
|
||||
and len(snippet_text := (first_child.children.strip())) > 0
|
||||
):
|
||||
self.close_table(doc)
|
||||
self.process_inline_text(parent_element, doc)
|
||||
_log.debug(f" - Code Block: {element.children}")
|
||||
snippet_text = str(element.children[0].children).strip() # type: ignore
|
||||
doc.add_code(parent=parent_element, text=snippet_text)
|
||||
|
||||
elif isinstance(element, marko.block.FencedCode):
|
||||
self.close_table(doc)
|
||||
self.process_inline_text(parent_element, doc)
|
||||
_log.debug(f" - Code Block: {element.children}")
|
||||
snippet_text = str(element.children[0].children).strip() # type: ignore
|
||||
doc.add_code(parent=parent_element, text=snippet_text)
|
||||
|
||||
elif isinstance(element, marko.inline.LineBreak):
|
||||
@ -309,14 +310,21 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.close_table(doc)
|
||||
_log.debug("Some other element: {}".format(element))
|
||||
|
||||
processed_block_types = (
|
||||
marko.block.ListItem,
|
||||
marko.block.Heading,
|
||||
marko.block.CodeBlock,
|
||||
marko.block.FencedCode,
|
||||
# marko.block.Paragraph,
|
||||
marko.inline.RawText,
|
||||
)
|
||||
|
||||
# Iterate through the element's children (if any)
|
||||
if not isinstance(element, marko.block.ListItem):
|
||||
if not isinstance(element, marko.block.Heading):
|
||||
if not isinstance(element, marko.block.FencedCode):
|
||||
# if not isinstance(element, marko.block.Paragraph):
|
||||
if hasattr(element, "children"):
|
||||
for child in element.children:
|
||||
self.iterate_elements(child, depth + 1, doc, parent_element)
|
||||
if hasattr(element, "children") and not isinstance(
|
||||
element, processed_block_types
|
||||
):
|
||||
for child in element.children:
|
||||
self.iterate_elements(child, depth + 1, doc, parent_element)
|
||||
|
||||
def is_valid(self) -> bool:
|
||||
return self.valid
|
||||
|
33
tests/data/groundtruth/docling_v2/blocks.md.md
Normal file
33
tests/data/groundtruth/docling_v2/blocks.md.md
Normal file
@ -0,0 +1,33 @@
|
||||
Unordered list:
|
||||
|
||||
- foo
|
||||
|
||||
Empty unordered list:
|
||||
|
||||
Ordered list:
|
||||
|
||||
- bar
|
||||
|
||||
Empty ordered list:
|
||||
|
||||
Heading:
|
||||
|
||||
# my heading
|
||||
|
||||
Empty heading:
|
||||
|
||||
Indented code block:
|
||||
|
||||
```
|
||||
print("Hi!")
|
||||
```
|
||||
|
||||
Empty indented code block:
|
||||
|
||||
Fenced code block:
|
||||
|
||||
```
|
||||
print("Hello world!")
|
||||
```
|
||||
|
||||
Empty fenced code block:
|
43
tests/data/md/blocks.md
Normal file
43
tests/data/md/blocks.md
Normal file
@ -0,0 +1,43 @@
|
||||
Unordered list:
|
||||
|
||||
- foo
|
||||
|
||||
Empty unordered list:
|
||||
|
||||
-
|
||||
|
||||
Ordered list:
|
||||
|
||||
1. bar
|
||||
|
||||
Empty ordered list:
|
||||
|
||||
1.
|
||||
|
||||
Heading:
|
||||
|
||||
# my heading
|
||||
|
||||
Empty heading:
|
||||
|
||||
#
|
||||
|
||||
Indented code block:
|
||||
|
||||
print("Hi!")
|
||||
|
||||
Empty indented code block:
|
||||
|
||||
|
||||
|
||||
Fenced code block:
|
||||
|
||||
```python
|
||||
print("Hello world!")
|
||||
```
|
||||
|
||||
Empty fenced code block:
|
||||
|
||||
```
|
||||
|
||||
```
|
Loading…
Reference in New Issue
Block a user