diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py index e2e970d..58c0e6e 100644 --- a/docling/backend/md_backend.py +++ b/docling/backend/md_backend.py @@ -335,7 +335,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): _log.debug(f" - Paragraph (raw text): {element.children}") snippet_text = element.children.strip() # Detect start of the table: - if "|" in snippet_text: + if "|" in snippet_text or self.in_table: # most likely part of the markdown table self.in_table = True if len(self.md_table_buffer) > 0: diff --git a/tests/data/groundtruth/docling_v2/inline_and_formatting.md.md b/tests/data/groundtruth/docling_v2/inline_and_formatting.md.md index 130375c..282be7f 100644 --- a/tests/data/groundtruth/docling_v2/inline_and_formatting.md.md +++ b/tests/data/groundtruth/docling_v2/inline_and_formatting.md.md @@ -16,8 +16,17 @@ Create your feature branch: `git checkout -b feature/AmazingFeature` . # *Whole heading is italic* +- **First** : Lorem ipsum. +- **Second** : Dolor `sit` amet. + Some *`formatted_code`* ## *Partially formatted* heading to\_escape `not_to_escape` [$$E=mc^2$$](https://en.wikipedia.org/wiki/Albert_Einstein) + +## Table Heading + +| Bold Heading | Italic Heading | +|----------------|------------------| +| data a | data b | diff --git a/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml b/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml index 531a5e9..f04fa50 100644 --- a/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml +++ b/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml @@ -7,8 +7,12 @@ body: - $ref: '#/groups/2' - $ref: '#/texts/32' - $ref: '#/groups/8' - - $ref: '#/texts/35' - - $ref: '#/texts/39' + - $ref: '#/groups/11' + - $ref: '#/texts/43' + - $ref: '#/texts/47' + - $ref: '#/texts/48' + - $ref: '#/groups/13' + - $ref: '#/tables/0' content_layer: body label: unspecified name: _root_ @@ -109,33 +113,205 @@ groups: self_ref: '#/groups/7' - children: - $ref: '#/texts/33' + - $ref: '#/texts/36' + content_layer: body + label: list + name: list + parent: + $ref: '#/body' + self_ref: '#/groups/8' +- children: - $ref: '#/texts/34' + - $ref: '#/texts/35' + content_layer: body + label: inline + name: group + parent: + $ref: '#/texts/33' + self_ref: '#/groups/9' +- children: + - $ref: '#/texts/37' + - $ref: '#/texts/38' + - $ref: '#/texts/39' + - $ref: '#/texts/40' + content_layer: body + label: inline + name: group + parent: + $ref: '#/texts/36' + self_ref: '#/groups/10' +- children: + - $ref: '#/texts/41' + - $ref: '#/texts/42' content_layer: body label: inline name: group parent: $ref: '#/body' - self_ref: '#/groups/8' + self_ref: '#/groups/11' - children: - - $ref: '#/texts/36' - - $ref: '#/texts/37' - - $ref: '#/texts/38' + - $ref: '#/texts/44' + - $ref: '#/texts/45' + - $ref: '#/texts/46' content_layer: body label: inline name: group parent: - $ref: '#/texts/35' - self_ref: '#/groups/9' + $ref: '#/texts/43' + self_ref: '#/groups/12' +- children: [] + content_layer: body + label: inline + name: group + parent: + $ref: '#/body' + self_ref: '#/groups/13' key_value_items: [] name: inline_and_formatting origin: - binary_hash: 16409076955457599155 + binary_hash: 14550011543526094526 filename: inline_and_formatting.md mimetype: text/markdown pages: {} pictures: [] schema_name: DoclingDocument -tables: [] +tables: +- annotations: [] + captions: [] + children: [] + content_layer: body + data: + grid: + - - col_span: 1 + column_header: true + end_col_offset_idx: 1 + end_row_offset_idx: 1 + row_header: false + row_section: false + row_span: 1 + start_col_offset_idx: 0 + start_row_offset_idx: 0 + text: Bold Heading + - col_span: 1 + column_header: true + end_col_offset_idx: 2 + end_row_offset_idx: 1 + row_header: false + row_section: false + row_span: 1 + start_col_offset_idx: 1 + start_row_offset_idx: 0 + text: Italic Heading + - - col_span: 1 + column_header: false + end_col_offset_idx: 1 + end_row_offset_idx: 2 + row_header: false + row_section: false + row_span: 1 + start_col_offset_idx: 0 + start_row_offset_idx: 1 + text: data a + - col_span: 1 + column_header: false + end_col_offset_idx: 2 + end_row_offset_idx: 2 + row_header: false + row_section: false + row_span: 1 + start_col_offset_idx: 1 + start_row_offset_idx: 1 + text: data b + num_cols: 2 + num_rows: 2 + table_cells: + - col_span: 1 + column_header: true + end_col_offset_idx: 1 + end_row_offset_idx: 1 + row_header: false + row_section: false + row_span: 1 + start_col_offset_idx: 0 + start_row_offset_idx: 0 + text: Bold Heading + - col_span: 1 + column_header: true + end_col_offset_idx: 2 + end_row_offset_idx: 1 + row_header: false + row_section: false + row_span: 1 + start_col_offset_idx: 1 + start_row_offset_idx: 0 + text: Italic Heading + - col_span: 1 + column_header: false + end_col_offset_idx: 1 + end_row_offset_idx: 2 + row_header: false + row_section: false + row_span: 1 + start_col_offset_idx: 0 + start_row_offset_idx: 1 + text: data a + - col_span: 1 + column_header: false + end_col_offset_idx: 2 + end_row_offset_idx: 2 + row_header: false + row_section: false + row_span: 1 + start_col_offset_idx: 1 + start_row_offset_idx: 1 + text: data b + - col_span: 1 + column_header: true + end_col_offset_idx: 1 + end_row_offset_idx: 1 + row_header: false + row_section: false + row_span: 1 + start_col_offset_idx: 0 + start_row_offset_idx: 0 + text: Bold Heading + - col_span: 1 + column_header: true + end_col_offset_idx: 2 + end_row_offset_idx: 1 + row_header: false + row_section: false + row_span: 1 + start_col_offset_idx: 1 + start_row_offset_idx: 0 + text: Italic Heading + - col_span: 1 + column_header: false + end_col_offset_idx: 1 + end_row_offset_idx: 2 + row_header: false + row_section: false + row_span: 1 + start_col_offset_idx: 0 + start_row_offset_idx: 1 + text: data a + - col_span: 1 + column_header: false + end_col_offset_idx: 2 + end_row_offset_idx: 2 + row_header: false + row_section: false + row_span: 1 + start_col_offset_idx: 1 + start_row_offset_idx: 1 + text: data b + footnotes: [] + label: table + parent: + $ref: '#/body' + prov: [] + references: [] + self_ref: '#/tables/0' texts: - children: [] content_layer: body @@ -512,14 +688,108 @@ texts: prov: [] self_ref: '#/texts/32' text: Whole heading is italic +- children: + - $ref: '#/groups/9' + content_layer: body + enumerated: false + label: list_item + marker: '-' + orig: '' + parent: + $ref: '#/groups/8' + prov: [] + self_ref: '#/texts/33' + text: '' +- children: [] + content_layer: body + formatting: + bold: true + italic: false + script: baseline + strikethrough: false + underline: false + label: text + orig: First + parent: + $ref: '#/groups/9' + prov: [] + self_ref: '#/texts/34' + text: First +- children: [] + content_layer: body + label: text + orig: ': Lorem ipsum.' + parent: + $ref: '#/groups/9' + prov: [] + self_ref: '#/texts/35' + text: ': Lorem ipsum.' +- children: + - $ref: '#/groups/10' + content_layer: body + enumerated: false + label: list_item + marker: '-' + orig: '' + parent: + $ref: '#/groups/8' + prov: [] + self_ref: '#/texts/36' + text: '' +- children: [] + content_layer: body + formatting: + bold: true + italic: false + script: baseline + strikethrough: false + underline: false + label: text + orig: Second + parent: + $ref: '#/groups/10' + prov: [] + self_ref: '#/texts/37' + text: Second +- children: [] + content_layer: body + label: text + orig: ': Dolor' + parent: + $ref: '#/groups/10' + prov: [] + self_ref: '#/texts/38' + text: ': Dolor' +- captions: [] + children: [] + code_language: unknown + content_layer: body + footnotes: [] + label: code + orig: sit + parent: + $ref: '#/groups/10' + prov: [] + references: [] + self_ref: '#/texts/39' + text: sit +- children: [] + content_layer: body + label: text + orig: amet. + parent: + $ref: '#/groups/10' + prov: [] + self_ref: '#/texts/40' + text: amet. - children: [] content_layer: body label: text orig: Some parent: - $ref: '#/groups/8' + $ref: '#/groups/11' prov: [] - self_ref: '#/texts/33' + self_ref: '#/texts/41' text: Some - captions: [] children: [] @@ -535,13 +805,13 @@ texts: label: code orig: formatted_code parent: - $ref: '#/groups/8' + $ref: '#/groups/11' prov: [] references: [] - self_ref: '#/texts/34' + self_ref: '#/texts/42' text: formatted_code - children: - - $ref: '#/groups/9' + - $ref: '#/groups/12' content_layer: body label: section_header level: 1 @@ -549,7 +819,7 @@ texts: parent: $ref: '#/body' prov: [] - self_ref: '#/texts/35' + self_ref: '#/texts/43' text: '' - children: [] content_layer: body @@ -562,18 +832,18 @@ texts: label: text orig: Partially formatted parent: - $ref: '#/groups/9' + $ref: '#/groups/12' prov: [] - self_ref: '#/texts/36' + self_ref: '#/texts/44' text: Partially formatted - children: [] content_layer: body label: text orig: heading to_escape parent: - $ref: '#/groups/9' + $ref: '#/groups/12' prov: [] - self_ref: '#/texts/37' + self_ref: '#/texts/45' text: heading to_escape - captions: [] children: [] @@ -583,10 +853,10 @@ texts: label: code orig: not_to_escape parent: - $ref: '#/groups/9' + $ref: '#/groups/12' prov: [] references: [] - self_ref: '#/texts/38' + self_ref: '#/texts/46' text: not_to_escape - children: [] content_layer: body @@ -596,6 +866,16 @@ texts: parent: $ref: '#/body' prov: [] - self_ref: '#/texts/39' + self_ref: '#/texts/47' text: $$E=mc^2$$ +- children: [] + content_layer: body + label: section_header + level: 1 + orig: Table Heading + parent: + $ref: '#/body' + prov: [] + self_ref: '#/texts/48' + text: Table Heading version: 1.4.0 diff --git a/tests/data/md/inline_and_formatting.md b/tests/data/md/inline_and_formatting.md index 2f93669..65a8ff0 100644 --- a/tests/data/md/inline_and_formatting.md +++ b/tests/data/md/inline_and_formatting.md @@ -16,8 +16,17 @@ Create your feature branch: `git checkout -b feature/AmazingFeature`. # *Whole heading is italic* +- **First**: Lorem ipsum. +- **Second**: Dolor `sit` amet. + Some *`formatted_code`* ## *Partially formatted* heading to_escape `not_to_escape` [$$E=mc^2$$](https://en.wikipedia.org/wiki/Albert_Einstein) + +## Table Heading + +| **Bold Heading** | *Italic Heading* | +|------------------|------------------| +| data a | data b |