fix(markdown): make parsing of rich table cells valid (#1821)

* fix: update md table classification

Signed-off-by: Michael Honaker <Michael.Honaker@ibm.com>

* Fix ground truth header changes

Signed-off-by: Michael Honaker <Michael.Honaker@ibm.com>

* Fix merge issues

Signed-off-by: Michael Honaker <Michael.Honaker@ibm.com>

* Fix minor ground truth errors

Signed-off-by: Michael Honaker <Michael.Honaker@ibm.com>

---------

Signed-off-by: Michael Honaker <Michael.Honaker@ibm.com>
This commit is contained in:
Michael Honaker 2025-06-26 13:50:45 -04:00 committed by GitHub
parent ee4781075a
commit e79e4f0ab6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 322 additions and 24 deletions

View File

@ -335,7 +335,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
_log.debug(f" - Paragraph (raw text): {element.children}")
snippet_text = element.children.strip()
# Detect start of the table:
if "|" in snippet_text:
if "|" in snippet_text or self.in_table:
# most likely part of the markdown table
self.in_table = True
if len(self.md_table_buffer) > 0:

View File

@ -16,8 +16,17 @@ Create your feature branch: `git checkout -b feature/AmazingFeature` .
# *Whole heading is italic*
- **First** : Lorem ipsum.
- **Second** : Dolor `sit` amet.
Some *`formatted_code`*
## *Partially formatted* heading to\_escape `not_to_escape`
[$$E=mc^2$$](https://en.wikipedia.org/wiki/Albert_Einstein)
## Table Heading
| Bold Heading | Italic Heading |
|----------------|------------------|
| data a | data b |

View File

@ -7,8 +7,12 @@ body:
- $ref: '#/groups/2'
- $ref: '#/texts/32'
- $ref: '#/groups/8'
- $ref: '#/texts/35'
- $ref: '#/texts/39'
- $ref: '#/groups/11'
- $ref: '#/texts/43'
- $ref: '#/texts/47'
- $ref: '#/texts/48'
- $ref: '#/groups/13'
- $ref: '#/tables/0'
content_layer: body
label: unspecified
name: _root_
@ -109,33 +113,205 @@ groups:
self_ref: '#/groups/7'
- children:
- $ref: '#/texts/33'
- $ref: '#/texts/36'
content_layer: body
label: list
name: list
parent:
$ref: '#/body'
self_ref: '#/groups/8'
- children:
- $ref: '#/texts/34'
- $ref: '#/texts/35'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/33'
self_ref: '#/groups/9'
- children:
- $ref: '#/texts/37'
- $ref: '#/texts/38'
- $ref: '#/texts/39'
- $ref: '#/texts/40'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/36'
self_ref: '#/groups/10'
- children:
- $ref: '#/texts/41'
- $ref: '#/texts/42'
content_layer: body
label: inline
name: group
parent:
$ref: '#/body'
self_ref: '#/groups/8'
self_ref: '#/groups/11'
- children:
- $ref: '#/texts/36'
- $ref: '#/texts/37'
- $ref: '#/texts/38'
- $ref: '#/texts/44'
- $ref: '#/texts/45'
- $ref: '#/texts/46'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/35'
self_ref: '#/groups/9'
$ref: '#/texts/43'
self_ref: '#/groups/12'
- children: []
content_layer: body
label: inline
name: group
parent:
$ref: '#/body'
self_ref: '#/groups/13'
key_value_items: []
name: inline_and_formatting
origin:
binary_hash: 16409076955457599155
binary_hash: 14550011543526094526
filename: inline_and_formatting.md
mimetype: text/markdown
pages: {}
pictures: []
schema_name: DoclingDocument
tables: []
tables:
- annotations: []
captions: []
children: []
content_layer: body
data:
grid:
- - col_span: 1
column_header: true
end_col_offset_idx: 1
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 0
text: Bold Heading
- col_span: 1
column_header: true
end_col_offset_idx: 2
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 0
text: Italic Heading
- - col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 1
text: data a
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 1
text: data b
num_cols: 2
num_rows: 2
table_cells:
- col_span: 1
column_header: true
end_col_offset_idx: 1
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 0
text: Bold Heading
- col_span: 1
column_header: true
end_col_offset_idx: 2
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 0
text: Italic Heading
- col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 1
text: data a
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 1
text: data b
- col_span: 1
column_header: true
end_col_offset_idx: 1
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 0
text: Bold Heading
- col_span: 1
column_header: true
end_col_offset_idx: 2
end_row_offset_idx: 1
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 0
text: Italic Heading
- col_span: 1
column_header: false
end_col_offset_idx: 1
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 0
start_row_offset_idx: 1
text: data a
- col_span: 1
column_header: false
end_col_offset_idx: 2
end_row_offset_idx: 2
row_header: false
row_section: false
row_span: 1
start_col_offset_idx: 1
start_row_offset_idx: 1
text: data b
footnotes: []
label: table
parent:
$ref: '#/body'
prov: []
references: []
self_ref: '#/tables/0'
texts:
- children: []
content_layer: body
@ -512,14 +688,108 @@ texts:
prov: []
self_ref: '#/texts/32'
text: Whole heading is italic
- children:
- $ref: '#/groups/9'
content_layer: body
enumerated: false
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/8'
prov: []
self_ref: '#/texts/33'
text: ''
- children: []
content_layer: body
formatting:
bold: true
italic: false
script: baseline
strikethrough: false
underline: false
label: text
orig: First
parent:
$ref: '#/groups/9'
prov: []
self_ref: '#/texts/34'
text: First
- children: []
content_layer: body
label: text
orig: ': Lorem ipsum.'
parent:
$ref: '#/groups/9'
prov: []
self_ref: '#/texts/35'
text: ': Lorem ipsum.'
- children:
- $ref: '#/groups/10'
content_layer: body
enumerated: false
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/8'
prov: []
self_ref: '#/texts/36'
text: ''
- children: []
content_layer: body
formatting:
bold: true
italic: false
script: baseline
strikethrough: false
underline: false
label: text
orig: Second
parent:
$ref: '#/groups/10'
prov: []
self_ref: '#/texts/37'
text: Second
- children: []
content_layer: body
label: text
orig: ': Dolor'
parent:
$ref: '#/groups/10'
prov: []
self_ref: '#/texts/38'
text: ': Dolor'
- captions: []
children: []
code_language: unknown
content_layer: body
footnotes: []
label: code
orig: sit
parent:
$ref: '#/groups/10'
prov: []
references: []
self_ref: '#/texts/39'
text: sit
- children: []
content_layer: body
label: text
orig: amet.
parent:
$ref: '#/groups/10'
prov: []
self_ref: '#/texts/40'
text: amet.
- children: []
content_layer: body
label: text
orig: Some
parent:
$ref: '#/groups/8'
$ref: '#/groups/11'
prov: []
self_ref: '#/texts/33'
self_ref: '#/texts/41'
text: Some
- captions: []
children: []
@ -535,13 +805,13 @@ texts:
label: code
orig: formatted_code
parent:
$ref: '#/groups/8'
$ref: '#/groups/11'
prov: []
references: []
self_ref: '#/texts/34'
self_ref: '#/texts/42'
text: formatted_code
- children:
- $ref: '#/groups/9'
- $ref: '#/groups/12'
content_layer: body
label: section_header
level: 1
@ -549,7 +819,7 @@ texts:
parent:
$ref: '#/body'
prov: []
self_ref: '#/texts/35'
self_ref: '#/texts/43'
text: ''
- children: []
content_layer: body
@ -562,18 +832,18 @@ texts:
label: text
orig: Partially formatted
parent:
$ref: '#/groups/9'
$ref: '#/groups/12'
prov: []
self_ref: '#/texts/36'
self_ref: '#/texts/44'
text: Partially formatted
- children: []
content_layer: body
label: text
orig: heading to_escape
parent:
$ref: '#/groups/9'
$ref: '#/groups/12'
prov: []
self_ref: '#/texts/37'
self_ref: '#/texts/45'
text: heading to_escape
- captions: []
children: []
@ -583,10 +853,10 @@ texts:
label: code
orig: not_to_escape
parent:
$ref: '#/groups/9'
$ref: '#/groups/12'
prov: []
references: []
self_ref: '#/texts/38'
self_ref: '#/texts/46'
text: not_to_escape
- children: []
content_layer: body
@ -596,6 +866,16 @@ texts:
parent:
$ref: '#/body'
prov: []
self_ref: '#/texts/39'
self_ref: '#/texts/47'
text: $$E=mc^2$$
- children: []
content_layer: body
label: section_header
level: 1
orig: Table Heading
parent:
$ref: '#/body'
prov: []
self_ref: '#/texts/48'
text: Table Heading
version: 1.4.0

View File

@ -16,8 +16,17 @@ Create your feature branch: `git checkout -b feature/AmazingFeature`.
# *Whole heading is italic*
- **First**: Lorem ipsum.
- **Second**: Dolor `sit` amet.
Some *`formatted_code`*
## *Partially formatted* heading to_escape `not_to_escape`
[$$E=mc^2$$](https://en.wikipedia.org/wiki/Albert_Einstein)
## Table Heading
| **Bold Heading** | *Italic Heading* |
|------------------|------------------|
| data a | data b |