From df140227c3b8bcad0c68bf3d129930cccd96a07e Mon Sep 17 00:00:00 2001 From: Ayraf <85089952+ShiroYasha18@users.noreply.github.com> Date: Tue, 10 Jun 2025 20:25:59 +0530 Subject: [PATCH] feat: support xlsm files (#1520) * code for xlsm support * updated support for xlsm * updated code for xlsm support * Update docling_parse_v4_backend.py Signed-off-by: ShiroYasha18 <85089952+ShiroYasha18@users.noreply.github.com> * Update docling_parse_v4_backend.py Signed-off-by: ShiroYasha18 <85089952+ShiroYasha18@users.noreply.github.com> * Update test_backend_msexcel_xlsm.py updated the tests/test_backend_msexcel_xlsm.py: have a function starting with test removed all print statements ** To add an explicit assert {test}=={pred} Signed-off-by: ShiroYasha18 <85089952+ShiroYasha18@users.noreply.github.com> * Update base_models.py Signed-off-by: ShiroYasha18 <85089952+ShiroYasha18@users.noreply.github.com> * Update test_backend_msexcel.py Signed-off-by: ShiroYasha18 <85089952+ShiroYasha18@users.noreply.github.com> * Update test_backend_msexcel_xlsm.py Signed-off-by: ShiroYasha18 <85089952+ShiroYasha18@users.noreply.github.com> * Update document_converter.py Signed-off-by: ShiroYasha18 <85089952+ShiroYasha18@users.noreply.github.com> * Delete tests/test_backend_msexcel_xlsm.py Signed-off-by: ShiroYasha18 <85089952+ShiroYasha18@users.noreply.github.com> * xlsm file Signed-off-by: ShiroYasha18 <85089952+ShiroYasha18@users.noreply.github.com> * run tests * ran tests * Fix tests, upgrade XSLM example to a valid file Signed-off-by: Christoph Auer --------- Signed-off-by: ShiroYasha18 <85089952+ShiroYasha18@users.noreply.github.com> Signed-off-by: Christoph Auer Co-authored-by: Christoph Auer --- docling/datamodel/base_models.py | 2 +- .../docling_v1/2305.03393v1-pg9.json | 4 +- .../docling_v1/2305.03393v1-pg9.pages.json | 58 +- .../docling_v2/2305.03393v1-pg9.json | 4 +- .../docling_v2/2305.03393v1-pg9.pages.json | 58 +- .../docling_v2/example_8.html.itxt | 8 + .../docling_v2/example_8.html.json | 2008 +++++++++++++++ .../groundtruth/docling_v2/example_8.html.md | 29 + .../docling_v2/sample_sales_data.xlsm.itxt | 3 + .../docling_v2/sample_sales_data.xlsm.json | 2153 +++++++++++++++++ .../docling_v2/sample_sales_data.xlsm.md | 22 + .../groundtruth/docling_v2/textbox.docx.itxt | 127 +- .../groundtruth/docling_v2/textbox.docx.json | 800 +++--- .../groundtruth/docling_v2/textbox.docx.md | 8 +- .../docling_v2/webp-test.doctags.txt | 2 +- .../groundtruth/docling_v2/webp-test.json | 6 +- .../docling_v2/webp-test.pages.json | 152 +- tests/data/xlsx/sample_sales_data.xlsm | Bin 0 -> 9945 bytes tests/test_backend_msexcel.py | 22 +- 19 files changed, 4834 insertions(+), 632 deletions(-) create mode 100644 tests/data/groundtruth/docling_v2/example_8.html.itxt create mode 100644 tests/data/groundtruth/docling_v2/example_8.html.json create mode 100644 tests/data/groundtruth/docling_v2/example_8.html.md create mode 100644 tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.itxt create mode 100644 tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.json create mode 100644 tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.md create mode 100644 tests/data/xlsx/sample_sales_data.xlsm diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 0269e13..2cd2515 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -70,7 +70,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = { InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp", "webp"], InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"], InputFormat.CSV: ["csv"], - InputFormat.XLSX: ["xlsx"], + InputFormat.XLSX: ["xlsx", "xlsm"], InputFormat.XML_USPTO: ["xml", "txt"], InputFormat.JSON_DOCLING: ["json"], } diff --git a/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.json b/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.json index e938e2d..dd51e39 100644 --- a/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.json +++ b/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.json @@ -213,10 +213,10 @@ "prov": [ { "bbox": [ - 139.6674041748047, + 139.66741943359375, 322.5054626464844, 475.00927734375, - 454.4546203613281 + 454.45458984375 ], "page": 1, "span": [ diff --git a/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.pages.json b/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.pages.json index 3bca0d5..5db555b 100644 --- a/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.pages.json +++ b/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.pages.json @@ -2646,7 +2646,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.9373533129692078, + "confidence": 0.9373534917831421, "cells": [ { "index": 0, @@ -2686,7 +2686,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.8858679533004761, + "confidence": 0.8858680725097656, "cells": [ { "index": 1, @@ -2726,7 +2726,7 @@ "b": 152.90697999999998, "coord_origin": "TOPLEFT" }, - "confidence": 0.9806435108184814, + "confidence": 0.9806433916091919, "cells": [ { "index": 2, @@ -2881,7 +2881,7 @@ "b": 255.42400999999995, "coord_origin": "TOPLEFT" }, - "confidence": 0.9850425124168396, + "confidence": 0.98504239320755, "cells": [ { "index": 7, @@ -3096,7 +3096,7 @@ "b": 327.98218, "coord_origin": "TOPLEFT" }, - "confidence": 0.9591907262802124, + "confidence": 0.9591909050941467, "cells": [ { "index": 15, @@ -3280,8 +3280,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -7787,7 +7787,7 @@ "b": 518.17419, "coord_origin": "TOPLEFT" }, - "confidence": 0.9589295387268066, + "confidence": 0.9589294195175171, "cells": [ { "index": 91, @@ -7852,7 +7852,7 @@ "b": 618.3, "coord_origin": "TOPLEFT" }, - "confidence": 0.9849976301193237, + "confidence": 0.9849975109100342, "cells": [ { "index": 93, @@ -8184,8 +8184,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -13582,7 +13582,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.9373533129692078, + "confidence": 0.9373534917831421, "cells": [ { "index": 0, @@ -13628,7 +13628,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.8858679533004761, + "confidence": 0.8858680725097656, "cells": [ { "index": 1, @@ -13674,7 +13674,7 @@ "b": 152.90697999999998, "coord_origin": "TOPLEFT" }, - "confidence": 0.9806435108184814, + "confidence": 0.9806433916091919, "cells": [ { "index": 2, @@ -13841,7 +13841,7 @@ "b": 255.42400999999995, "coord_origin": "TOPLEFT" }, - "confidence": 0.9850425124168396, + "confidence": 0.98504239320755, "cells": [ { "index": 7, @@ -14062,7 +14062,7 @@ "b": 327.98218, "coord_origin": "TOPLEFT" }, - "confidence": 0.9591907262802124, + "confidence": 0.9591909050941467, "cells": [ { "index": 15, @@ -14252,8 +14252,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -19642,7 +19642,7 @@ "b": 518.17419, "coord_origin": "TOPLEFT" }, - "confidence": 0.9589295387268066, + "confidence": 0.9589294195175171, "cells": [ { "index": 91, @@ -19713,7 +19713,7 @@ "b": 618.3, "coord_origin": "TOPLEFT" }, - "confidence": 0.9849976301193237, + "confidence": 0.9849975109100342, "cells": [ { "index": 93, @@ -20057,7 +20057,7 @@ "b": 152.90697999999998, "coord_origin": "TOPLEFT" }, - "confidence": 0.9806435108184814, + "confidence": 0.9806433916091919, "cells": [ { "index": 2, @@ -20224,7 +20224,7 @@ "b": 255.42400999999995, "coord_origin": "TOPLEFT" }, - "confidence": 0.9850425124168396, + "confidence": 0.98504239320755, "cells": [ { "index": 7, @@ -20445,7 +20445,7 @@ "b": 327.98218, "coord_origin": "TOPLEFT" }, - "confidence": 0.9591907262802124, + "confidence": 0.9591909050941467, "cells": [ { "index": 15, @@ -20635,8 +20635,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -26025,7 +26025,7 @@ "b": 518.17419, "coord_origin": "TOPLEFT" }, - "confidence": 0.9589295387268066, + "confidence": 0.9589294195175171, "cells": [ { "index": 91, @@ -26096,7 +26096,7 @@ "b": 618.3, "coord_origin": "TOPLEFT" }, - "confidence": 0.9849976301193237, + "confidence": 0.9849975109100342, "cells": [ { "index": 93, @@ -26440,7 +26440,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.9373533129692078, + "confidence": 0.9373534917831421, "cells": [ { "index": 0, @@ -26486,7 +26486,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.8858679533004761, + "confidence": 0.8858680725097656, "cells": [ { "index": 1, diff --git a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json index c057009..f281a44 100644 --- a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json +++ b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json @@ -336,8 +336,8 @@ { "page_no": 1, "bbox": { - "l": 139.6674041748047, - "t": 454.4546203613281, + "l": 139.66741943359375, + "t": 454.45458984375, "r": 475.00927734375, "b": 322.5054626464844, "coord_origin": "BOTTOMLEFT" diff --git a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.pages.json b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.pages.json index 3bca0d5..5db555b 100644 --- a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.pages.json +++ b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.pages.json @@ -2646,7 +2646,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.9373533129692078, + "confidence": 0.9373534917831421, "cells": [ { "index": 0, @@ -2686,7 +2686,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.8858679533004761, + "confidence": 0.8858680725097656, "cells": [ { "index": 1, @@ -2726,7 +2726,7 @@ "b": 152.90697999999998, "coord_origin": "TOPLEFT" }, - "confidence": 0.9806435108184814, + "confidence": 0.9806433916091919, "cells": [ { "index": 2, @@ -2881,7 +2881,7 @@ "b": 255.42400999999995, "coord_origin": "TOPLEFT" }, - "confidence": 0.9850425124168396, + "confidence": 0.98504239320755, "cells": [ { "index": 7, @@ -3096,7 +3096,7 @@ "b": 327.98218, "coord_origin": "TOPLEFT" }, - "confidence": 0.9591907262802124, + "confidence": 0.9591909050941467, "cells": [ { "index": 15, @@ -3280,8 +3280,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -7787,7 +7787,7 @@ "b": 518.17419, "coord_origin": "TOPLEFT" }, - "confidence": 0.9589295387268066, + "confidence": 0.9589294195175171, "cells": [ { "index": 91, @@ -7852,7 +7852,7 @@ "b": 618.3, "coord_origin": "TOPLEFT" }, - "confidence": 0.9849976301193237, + "confidence": 0.9849975109100342, "cells": [ { "index": 93, @@ -8184,8 +8184,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -13582,7 +13582,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.9373533129692078, + "confidence": 0.9373534917831421, "cells": [ { "index": 0, @@ -13628,7 +13628,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.8858679533004761, + "confidence": 0.8858680725097656, "cells": [ { "index": 1, @@ -13674,7 +13674,7 @@ "b": 152.90697999999998, "coord_origin": "TOPLEFT" }, - "confidence": 0.9806435108184814, + "confidence": 0.9806433916091919, "cells": [ { "index": 2, @@ -13841,7 +13841,7 @@ "b": 255.42400999999995, "coord_origin": "TOPLEFT" }, - "confidence": 0.9850425124168396, + "confidence": 0.98504239320755, "cells": [ { "index": 7, @@ -14062,7 +14062,7 @@ "b": 327.98218, "coord_origin": "TOPLEFT" }, - "confidence": 0.9591907262802124, + "confidence": 0.9591909050941467, "cells": [ { "index": 15, @@ -14252,8 +14252,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -19642,7 +19642,7 @@ "b": 518.17419, "coord_origin": "TOPLEFT" }, - "confidence": 0.9589295387268066, + "confidence": 0.9589294195175171, "cells": [ { "index": 91, @@ -19713,7 +19713,7 @@ "b": 618.3, "coord_origin": "TOPLEFT" }, - "confidence": 0.9849976301193237, + "confidence": 0.9849975109100342, "cells": [ { "index": 93, @@ -20057,7 +20057,7 @@ "b": 152.90697999999998, "coord_origin": "TOPLEFT" }, - "confidence": 0.9806435108184814, + "confidence": 0.9806433916091919, "cells": [ { "index": 2, @@ -20224,7 +20224,7 @@ "b": 255.42400999999995, "coord_origin": "TOPLEFT" }, - "confidence": 0.9850425124168396, + "confidence": 0.98504239320755, "cells": [ { "index": 7, @@ -20445,7 +20445,7 @@ "b": 327.98218, "coord_origin": "TOPLEFT" }, - "confidence": 0.9591907262802124, + "confidence": 0.9591909050941467, "cells": [ { "index": 15, @@ -20635,8 +20635,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -26025,7 +26025,7 @@ "b": 518.17419, "coord_origin": "TOPLEFT" }, - "confidence": 0.9589295387268066, + "confidence": 0.9589294195175171, "cells": [ { "index": 91, @@ -26096,7 +26096,7 @@ "b": 618.3, "coord_origin": "TOPLEFT" }, - "confidence": 0.9849976301193237, + "confidence": 0.9849975109100342, "cells": [ { "index": 93, @@ -26440,7 +26440,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.9373533129692078, + "confidence": 0.9373534917831421, "cells": [ { "index": 0, @@ -26486,7 +26486,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.8858679533004761, + "confidence": 0.8858680725097656, "cells": [ { "index": 1, diff --git a/tests/data/groundtruth/docling_v2/example_8.html.itxt b/tests/data/groundtruth/docling_v2/example_8.html.itxt new file mode 100644 index 0000000..505408e --- /dev/null +++ b/tests/data/groundtruth/docling_v2/example_8.html.itxt @@ -0,0 +1,8 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: section: group header-1 + item-2 at level 2: section_header: Pivot table with with 1 row header + item-3 at level 3: table with [6x4] + item-4 at level 2: section_header: Pivot table with 2 row headers + item-5 at level 3: table with [6x5] + item-6 at level 2: section_header: Equivalent pivot table + item-7 at level 3: table with [6x5] \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/example_8.html.json b/tests/data/groundtruth/docling_v2/example_8.html.json new file mode 100644 index 0000000..e77d5cf --- /dev/null +++ b/tests/data/groundtruth/docling_v2/example_8.html.json @@ -0,0 +1,2008 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.3.0", + "name": "example_8", + "origin": { + "mimetype": "text/html", + "binary_hash": 12799593797322619937, + "filename": "example_8.html" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/groups/0" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/2" + } + ], + "content_layer": "body", + "name": "header-1", + "label": "section" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/groups/0" + }, + "children": [ + { + "$ref": "#/tables/0" + } + ], + "content_layer": "body", + "label": "section_header", + "prov": [], + "orig": "Pivot table with with 1 row header", + "text": "Pivot table with with 1 row header", + "level": 1 + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/groups/0" + }, + "children": [ + { + "$ref": "#/tables/1" + } + ], + "content_layer": "body", + "label": "section_header", + "prov": [], + "orig": "Pivot table with 2 row headers", + "text": "Pivot table with 2 row headers", + "level": 1 + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/groups/0" + }, + "children": [ + { + "$ref": "#/tables/2" + } + ], + "content_layer": "body", + "label": "section_header", + "prov": [], + "orig": "Equivalent pivot table", + "text": "Equivalent pivot table", + "level": 1 + } + ], + "pictures": [], + "tables": [ + { + "self_ref": "#/tables/0", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "content_layer": "body", + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Year", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Month", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Revenue", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Cost", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 5, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "January", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "$134", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$162", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "February", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "$150", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$155", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "March", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "$160", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$143", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "April", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "$210", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$150", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "May", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "$280", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$120", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 6, + "num_cols": 4, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Year", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Month", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Revenue", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Cost", + "column_header": true, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 5, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "January", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "$134", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$162", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 5, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "February", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "$150", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$155", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 5, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "March", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "$160", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$143", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 5, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "April", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "$210", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$150", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 5, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "May", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "$280", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$120", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + }, + { + "self_ref": "#/tables/1", + "parent": { + "$ref": "#/texts/1" + }, + "children": [], + "content_layer": "body", + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Year", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Quarter", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Month", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Revenue", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "Cost", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 6, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Q1", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "January", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$134", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$162", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "February", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$150", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$155", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "March", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$160", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$143", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Q2", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "April", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$210", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$150", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "May", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$280", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$120", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 6, + "num_cols": 5, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Year", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Quarter", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Month", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Revenue", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "Cost", + "column_header": true, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 6, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Q1", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "January", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$134", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$162", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 6, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Q1", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "February", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$150", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$155", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 6, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Q1", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "March", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$160", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$143", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 6, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Q2", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "April", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$210", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$150", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 6, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Q2", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "May", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$280", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$120", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + }, + { + "self_ref": "#/tables/2", + "parent": { + "$ref": "#/texts/2" + }, + "children": [], + "content_layer": "body", + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Year", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Quarter", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Month", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Revenue", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "Cost", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 7, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Q1", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "January", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$134", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$162", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "February", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$150", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$155", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "March", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$160", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$143", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Q2", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "April", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$210", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$150", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "May", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$280", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$120", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 6, + "num_cols": 5, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Year", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Quarter", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Month", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Revenue", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "Cost", + "column_header": true, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 7, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Q1", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "January", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$134", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$162", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 7, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Q1", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "February", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$150", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$155", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 7, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Q1", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "March", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$160", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$143", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 7, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Q2", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "April", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$210", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$150", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 7, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2025", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Q2", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "May", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "$280", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "$120", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + } + ], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/example_8.html.md b/tests/data/groundtruth/docling_v2/example_8.html.md new file mode 100644 index 0000000..462a810 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/example_8.html.md @@ -0,0 +1,29 @@ +## Pivot table with with 1 row header + +| Year | Month | Revenue | Cost | +|--------|----------|-----------|--------| +| 2025 | January | $134 | $162 | +| 2025 | February | $150 | $155 | +| 2025 | March | $160 | $143 | +| 2025 | April | $210 | $150 | +| 2025 | May | $280 | $120 | + +## Pivot table with 2 row headers + +| Year | Quarter | Month | Revenue | Cost | +|--------|-----------|----------|-----------|--------| +| 2025 | Q1 | January | $134 | $162 | +| 2025 | Q1 | February | $150 | $155 | +| 2025 | Q1 | March | $160 | $143 | +| 2025 | Q2 | April | $210 | $150 | +| 2025 | Q2 | May | $280 | $120 | + +## Equivalent pivot table + +| Year | Quarter | Month | Revenue | Cost | +|--------|-----------|----------|-----------|--------| +| 2025 | Q1 | January | $134 | $162 | +| 2025 | Q1 | February | $150 | $155 | +| 2025 | Q1 | March | $160 | $143 | +| 2025 | Q2 | April | $210 | $150 | +| 2025 | Q2 | May | $280 | $120 | \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.itxt b/tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.itxt new file mode 100644 index 0000000..f7965d2 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.itxt @@ -0,0 +1,3 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: section: group sheet: SalesData + item-2 at level 2: table with [21x4] \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.json b/tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.json new file mode 100644 index 0000000..04f8198 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.json @@ -0,0 +1,2153 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.3.0", + "name": "sample_sales_data", + "origin": { + "mimetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "binary_hash": 14806485565397602516, + "filename": "sample_sales_data.xlsm" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/groups/0" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/tables/0" + } + ], + "content_layer": "body", + "name": "sheet: SalesData", + "label": "section" + } + ], + "texts": [], + "pictures": [], + "tables": [ + { + "self_ref": "#/tables/0", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "table", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 4.0, + "b": 21.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Product", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Date", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Quantity", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Revenue", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget A", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-01 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "5", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "5000", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget B", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-02 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "10", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "12000", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget C", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-03 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "3000", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget D", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-04 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "8", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "8000", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget A", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-05 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "7", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "7000", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget B", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-06 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "6000", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget C", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-07 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "12", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "15000", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget D", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-08 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "9", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "9000", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 9, + "end_row_offset_idx": 10, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget A", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 9, + "end_row_offset_idx": 10, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-09 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 9, + "end_row_offset_idx": 10, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "4", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 9, + "end_row_offset_idx": 10, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "4000", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 10, + "end_row_offset_idx": 11, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget B", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 10, + "end_row_offset_idx": 11, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-10 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 10, + "end_row_offset_idx": 11, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "11", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 10, + "end_row_offset_idx": 11, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "11000", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 11, + "end_row_offset_idx": 12, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget C", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 11, + "end_row_offset_idx": 12, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-11 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 11, + "end_row_offset_idx": 12, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "5", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 11, + "end_row_offset_idx": 12, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "5000", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 12, + "end_row_offset_idx": 13, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget D", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 12, + "end_row_offset_idx": 13, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-12 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 12, + "end_row_offset_idx": 13, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "8", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 12, + "end_row_offset_idx": 13, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "8500", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 13, + "end_row_offset_idx": 14, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget A", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 13, + "end_row_offset_idx": 14, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-13 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 13, + "end_row_offset_idx": 14, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 13, + "end_row_offset_idx": 14, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "6200", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 14, + "end_row_offset_idx": 15, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget B", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 14, + "end_row_offset_idx": 15, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-14 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 14, + "end_row_offset_idx": 15, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "7", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 14, + "end_row_offset_idx": 15, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "7100", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 15, + "end_row_offset_idx": 16, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget C", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 15, + "end_row_offset_idx": 16, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-15 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 15, + "end_row_offset_idx": 16, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "10", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 15, + "end_row_offset_idx": 16, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "10500", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 16, + "end_row_offset_idx": 17, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget D", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 16, + "end_row_offset_idx": 17, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-16 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 16, + "end_row_offset_idx": 17, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 16, + "end_row_offset_idx": 17, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "3200", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 17, + "end_row_offset_idx": 18, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget A", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 17, + "end_row_offset_idx": 18, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-17 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 17, + "end_row_offset_idx": 18, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "9", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 17, + "end_row_offset_idx": 18, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "9400", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 18, + "end_row_offset_idx": 19, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget B", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 18, + "end_row_offset_idx": 19, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-18 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 18, + "end_row_offset_idx": 19, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "12", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 18, + "end_row_offset_idx": 19, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "12500", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 19, + "end_row_offset_idx": 20, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget C", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 19, + "end_row_offset_idx": 20, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-19 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 19, + "end_row_offset_idx": 20, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 19, + "end_row_offset_idx": 20, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "6100", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 20, + "end_row_offset_idx": 21, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget D", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 20, + "end_row_offset_idx": 21, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-20 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 20, + "end_row_offset_idx": 21, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "8", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 20, + "end_row_offset_idx": 21, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "8900", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 21, + "num_cols": 4, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Product", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Date", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Quantity", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Revenue", + "column_header": true, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget A", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-01 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "5", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "5000", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget B", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-02 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "10", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "12000", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget C", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-03 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "3000", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget D", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-04 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "8", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "8000", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget A", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-05 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "7", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "7000", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget B", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-06 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "6000", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget C", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-07 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "12", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "15000", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget D", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-08 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "9", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "9000", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 9, + "end_row_offset_idx": 10, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget A", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 9, + "end_row_offset_idx": 10, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-09 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 9, + "end_row_offset_idx": 10, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "4", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 9, + "end_row_offset_idx": 10, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "4000", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 10, + "end_row_offset_idx": 11, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget B", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 10, + "end_row_offset_idx": 11, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-10 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 10, + "end_row_offset_idx": 11, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "11", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 10, + "end_row_offset_idx": 11, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "11000", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 11, + "end_row_offset_idx": 12, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget C", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 11, + "end_row_offset_idx": 12, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-11 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 11, + "end_row_offset_idx": 12, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "5", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 11, + "end_row_offset_idx": 12, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "5000", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 12, + "end_row_offset_idx": 13, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget D", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 12, + "end_row_offset_idx": 13, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-12 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 12, + "end_row_offset_idx": 13, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "8", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 12, + "end_row_offset_idx": 13, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "8500", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 13, + "end_row_offset_idx": 14, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget A", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 13, + "end_row_offset_idx": 14, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-13 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 13, + "end_row_offset_idx": 14, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 13, + "end_row_offset_idx": 14, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "6200", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 14, + "end_row_offset_idx": 15, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget B", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 14, + "end_row_offset_idx": 15, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-14 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 14, + "end_row_offset_idx": 15, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "7", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 14, + "end_row_offset_idx": 15, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "7100", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 15, + "end_row_offset_idx": 16, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget C", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 15, + "end_row_offset_idx": 16, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-15 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 15, + "end_row_offset_idx": 16, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "10", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 15, + "end_row_offset_idx": 16, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "10500", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 16, + "end_row_offset_idx": 17, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget D", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 16, + "end_row_offset_idx": 17, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-16 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 16, + "end_row_offset_idx": 17, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 16, + "end_row_offset_idx": 17, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "3200", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 17, + "end_row_offset_idx": 18, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget A", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 17, + "end_row_offset_idx": 18, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-17 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 17, + "end_row_offset_idx": 18, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "9", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 17, + "end_row_offset_idx": 18, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "9400", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 18, + "end_row_offset_idx": 19, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget B", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 18, + "end_row_offset_idx": 19, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-18 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 18, + "end_row_offset_idx": 19, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "12", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 18, + "end_row_offset_idx": 19, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "12500", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 19, + "end_row_offset_idx": 20, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget C", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 19, + "end_row_offset_idx": 20, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-19 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 19, + "end_row_offset_idx": 20, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 19, + "end_row_offset_idx": 20, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "6100", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 20, + "end_row_offset_idx": 21, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget D", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 20, + "end_row_offset_idx": 21, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-20 00:00:00", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 20, + "end_row_offset_idx": 21, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "8", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 20, + "end_row_offset_idx": 21, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "8900", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + } + ], + "key_value_items": [], + "form_items": [], + "pages": { + "1": { + "size": { + "width": 4.0, + "height": 21.0 + }, + "page_no": 1 + } + } +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.md b/tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.md new file mode 100644 index 0000000..55e52de --- /dev/null +++ b/tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.md @@ -0,0 +1,22 @@ +| Product | Date | Quantity | Revenue | +|-----------|---------------------|------------|-----------| +| Widget A | 2024-01-01 00:00:00 | 5 | 5000 | +| Widget B | 2024-01-02 00:00:00 | 10 | 12000 | +| Widget C | 2024-01-03 00:00:00 | 3 | 3000 | +| Widget D | 2024-01-04 00:00:00 | 8 | 8000 | +| Widget A | 2024-01-05 00:00:00 | 7 | 7000 | +| Widget B | 2024-01-06 00:00:00 | 6 | 6000 | +| Widget C | 2024-01-07 00:00:00 | 12 | 15000 | +| Widget D | 2024-01-08 00:00:00 | 9 | 9000 | +| Widget A | 2024-01-09 00:00:00 | 4 | 4000 | +| Widget B | 2024-01-10 00:00:00 | 11 | 11000 | +| Widget C | 2024-01-11 00:00:00 | 5 | 5000 | +| Widget D | 2024-01-12 00:00:00 | 8 | 8500 | +| Widget A | 2024-01-13 00:00:00 | 6 | 6200 | +| Widget B | 2024-01-14 00:00:00 | 7 | 7100 | +| Widget C | 2024-01-15 00:00:00 | 10 | 10500 | +| Widget D | 2024-01-16 00:00:00 | 3 | 3200 | +| Widget A | 2024-01-17 00:00:00 | 9 | 9400 | +| Widget B | 2024-01-18 00:00:00 | 12 | 12500 | +| Widget C | 2024-01-19 00:00:00 | 6 | 6100 | +| Widget D | 2024-01-20 00:00:00 | 8 | 8900 | \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/textbox.docx.itxt b/tests/data/groundtruth/docling_v2/textbox.docx.itxt index e17e2be..406de95 100644 --- a/tests/data/groundtruth/docling_v2/textbox.docx.itxt +++ b/tests/data/groundtruth/docling_v2/textbox.docx.itxt @@ -5,92 +5,89 @@ item-0 at level 0: unspecified: group _root_ item-4 at level 1: section: group textbox item-5 at level 2: paragraph: Student falls ill item-6 at level 2: paragraph: - item-7 at level 2: paragraph: - item-8 at level 2: list: group list - item-9 at level 3: list_item: Suggested Reportable Symptoms: + item-7 at level 2: list: group list + item-8 at level 3: list_item: Suggested Reportable Symptoms: * ... sh * Blisters * Headache * Sore throat - item-10 at level 1: list_item: + item-9 at level 1: list_item: + item-10 at level 1: paragraph: item-11 at level 1: paragraph: - item-12 at level 1: paragraph: - item-13 at level 1: section: group textbox - item-14 at level 2: paragraph: If a caregiver suspects that wit ... the same suggested reportable symptoms + item-12 at level 1: section: group textbox + item-13 at level 2: paragraph: If a caregiver suspects that wit ... the same suggested reportable symptoms + item-14 at level 1: paragraph: item-15 at level 1: paragraph: item-16 at level 1: paragraph: item-17 at level 1: paragraph: - item-18 at level 1: paragraph: - item-19 at level 1: section: group textbox - item-20 at level 2: paragraph: Yes + item-18 at level 1: section: group textbox + item-19 at level 2: paragraph: Yes + item-20 at level 1: paragraph: item-21 at level 1: paragraph: - item-22 at level 1: paragraph: - item-23 at level 1: section: group textbox - item-24 at level 2: list: group list - item-25 at level 3: list_item: A report must be submitted withi ... saster Prevention Information Network. - item-26 at level 3: list_item: A report must also be submitted ... d Infectious Disease Reporting System. - item-27 at level 2: paragraph: - item-28 at level 2: paragraph: - item-29 at level 1: list: group list - item-30 at level 2: list_item: + item-22 at level 1: section: group textbox + item-23 at level 2: list: group list + item-24 at level 3: list_item: A report must be submitted withi ... saster Prevention Information Network. + item-25 at level 3: list_item: A report must also be submitted ... d Infectious Disease Reporting System. + item-26 at level 2: paragraph: + item-27 at level 1: list: group list + item-28 at level 2: list_item: + item-29 at level 1: paragraph: + item-30 at level 1: paragraph: item-31 at level 1: paragraph: item-32 at level 1: paragraph: item-33 at level 1: paragraph: - item-34 at level 1: paragraph: - item-35 at level 1: paragraph: - item-36 at level 1: section: group textbox - item-37 at level 2: paragraph: Health Bureau: - item-38 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control. - item-39 at level 2: list: group list - item-40 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection. - item-41 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act. - item-42 at level 2: paragraph: - item-43 at level 2: paragraph: - item-44 at level 1: list: group list - item-45 at level 2: list_item: - item-46 at level 1: paragraph: - item-47 at level 1: section: group textbox - item-48 at level 2: paragraph: Department of Education: + item-34 at level 1: section: group textbox + item-35 at level 2: paragraph: Health Bureau: + item-36 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control. + item-37 at level 2: list: group list + item-38 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection. + item-39 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act. + item-40 at level 2: paragraph: + item-41 at level 1: list: group list + item-42 at level 2: list_item: + item-43 at level 1: paragraph: + item-44 at level 1: section: group textbox + item-45 at level 2: paragraph: Department of Education: Collabo ... vention measures at all school levels. + item-46 at level 1: paragraph: + item-47 at level 1: paragraph: + item-48 at level 1: paragraph: item-49 at level 1: paragraph: item-50 at level 1: paragraph: item-51 at level 1: paragraph: item-52 at level 1: paragraph: - item-53 at level 1: paragraph: - item-54 at level 1: paragraph: - item-55 at level 1: paragraph: - item-56 at level 1: section: group textbox - item-57 at level 2: inline: group group - item-58 at level 3: paragraph: The Health Bureau will handle - item-59 at level 3: paragraph: reporting and specimen collection - item-60 at level 3: paragraph: . - item-61 at level 2: paragraph: - item-62 at level 2: paragraph: - item-63 at level 1: paragraph: - item-64 at level 1: paragraph: + item-53 at level 1: section: group textbox + item-54 at level 2: inline: group group + item-55 at level 3: paragraph: The Health Bureau will handle + item-56 at level 3: paragraph: reporting and specimen collection + item-57 at level 3: paragraph: . + item-58 at level 2: paragraph: + item-59 at level 1: paragraph: + item-60 at level 1: paragraph: + item-61 at level 1: paragraph: + item-62 at level 1: section: group textbox + item-63 at level 2: paragraph: Whether the epidemic has eased. + item-64 at level 2: paragraph: item-65 at level 1: paragraph: item-66 at level 1: section: group textbox - item-67 at level 2: paragraph: Whether the epidemic has eased. - item-68 at level 2: paragraph: - item-69 at level 2: paragraph: + item-67 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease. + item-68 at level 2: paragraph: No + item-69 at level 1: paragraph: item-70 at level 1: paragraph: item-71 at level 1: section: group textbox - item-72 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease. - item-73 at level 2: paragraph: No - item-74 at level 1: paragraph: - item-75 at level 1: paragraph: - item-76 at level 1: section: group textbox + item-72 at level 2: paragraph: Yes + item-73 at level 1: paragraph: + item-74 at level 1: section: group textbox + item-75 at level 2: paragraph: Yes + item-76 at level 1: paragraph: item-77 at level 1: paragraph: item-78 at level 1: section: group textbox - item-79 at level 1: paragraph: - item-80 at level 1: paragraph: - item-81 at level 1: section: group textbox - item-82 at level 2: paragraph: Case closed. - item-83 at level 2: paragraph: - item-84 at level 2: paragraph: - item-85 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary. + item-79 at level 2: paragraph: Case closed. + item-80 at level 2: paragraph: + item-81 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary. + item-82 at level 1: paragraph: + item-83 at level 1: section: group textbox + item-84 at level 2: paragraph: No + item-85 at level 1: paragraph: item-86 at level 1: paragraph: - item-87 at level 1: section: group textbox - item-88 at level 1: paragraph: - item-89 at level 1: paragraph: - item-90 at level 1: paragraph: \ No newline at end of file + item-87 at level 1: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/textbox.docx.json b/tests/data/groundtruth/docling_v2/textbox.docx.json index 743fb57..840e937 100644 --- a/tests/data/groundtruth/docling_v2/textbox.docx.json +++ b/tests/data/groundtruth/docling_v2/textbox.docx.json @@ -29,6 +29,9 @@ { "$ref": "#/groups/0" }, + { + "$ref": "#/texts/6" + }, { "$ref": "#/texts/7" }, @@ -36,10 +39,10 @@ "$ref": "#/texts/8" }, { - "$ref": "#/texts/9" + "$ref": "#/groups/2" }, { - "$ref": "#/groups/2" + "$ref": "#/texts/10" }, { "$ref": "#/texts/11" @@ -50,17 +53,14 @@ { "$ref": "#/texts/13" }, - { - "$ref": "#/texts/14" - }, { "$ref": "#/groups/3" }, { - "$ref": "#/texts/16" + "$ref": "#/texts/15" }, { - "$ref": "#/texts/17" + "$ref": "#/texts/16" }, { "$ref": "#/groups/4" @@ -68,6 +68,12 @@ { "$ref": "#/groups/6" }, + { + "$ref": "#/texts/21" + }, + { + "$ref": "#/texts/22" + }, { "$ref": "#/texts/23" }, @@ -77,12 +83,6 @@ { "$ref": "#/texts/25" }, - { - "$ref": "#/texts/26" - }, - { - "$ref": "#/texts/27" - }, { "$ref": "#/groups/7" }, @@ -90,11 +90,20 @@ "$ref": "#/groups/9" }, { - "$ref": "#/texts/35" + "$ref": "#/texts/32" }, { "$ref": "#/groups/10" }, + { + "$ref": "#/texts/34" + }, + { + "$ref": "#/texts/35" + }, + { + "$ref": "#/texts/36" + }, { "$ref": "#/texts/37" }, @@ -107,74 +116,65 @@ { "$ref": "#/texts/40" }, - { - "$ref": "#/texts/41" - }, - { - "$ref": "#/texts/42" - }, - { - "$ref": "#/texts/43" - }, { "$ref": "#/groups/11" }, { - "$ref": "#/texts/49" + "$ref": "#/texts/45" }, { - "$ref": "#/texts/50" + "$ref": "#/texts/46" }, { - "$ref": "#/texts/51" + "$ref": "#/texts/47" }, { "$ref": "#/groups/13" }, { - "$ref": "#/texts/55" + "$ref": "#/texts/50" }, { "$ref": "#/groups/14" }, + { + "$ref": "#/texts/53" + }, + { + "$ref": "#/texts/54" + }, + { + "$ref": "#/groups/15" + }, + { + "$ref": "#/texts/56" + }, + { + "$ref": "#/groups/16" + }, { "$ref": "#/texts/58" }, { "$ref": "#/texts/59" }, - { - "$ref": "#/groups/15" - }, - { - "$ref": "#/texts/60" - }, - { - "$ref": "#/groups/16" - }, - { - "$ref": "#/texts/61" - }, - { - "$ref": "#/texts/62" - }, { "$ref": "#/groups/17" }, { - "$ref": "#/texts/67" + "$ref": "#/texts/63" }, { "$ref": "#/groups/18" }, { - "$ref": "#/texts/68" + "$ref": "#/texts/65" }, { - "$ref": "#/texts/69" + "$ref": "#/texts/66" }, { - "$ref": "#/texts/70" + "$ref": "#/texts/67" } ], "content_layer": "body", @@ -194,9 +194,6 @@ { "$ref": "#/texts/4" }, - { - "$ref": "#/texts/5" - }, { "$ref": "#/groups/1" } @@ -212,7 +209,7 @@ }, "children": [ { - "$ref": "#/texts/6" + "$ref": "#/texts/5" } ], "content_layer": "body", @@ -226,7 +223,7 @@ }, "children": [ { - "$ref": "#/texts/10" + "$ref": "#/texts/9" } ], "content_layer": "body", @@ -240,7 +237,7 @@ }, "children": [ { - "$ref": "#/texts/15" + "$ref": "#/texts/14" } ], "content_layer": "body", @@ -257,10 +254,7 @@ "$ref": "#/groups/5" }, { - "$ref": "#/texts/20" - }, - { - "$ref": "#/texts/21" + "$ref": "#/texts/19" } ], "content_layer": "body", @@ -274,10 +268,10 @@ }, "children": [ { - "$ref": "#/texts/18" + "$ref": "#/texts/17" }, { - "$ref": "#/texts/19" + "$ref": "#/texts/18" } ], "content_layer": "body", @@ -291,7 +285,7 @@ }, "children": [ { - "$ref": "#/texts/22" + "$ref": "#/texts/20" } ], "content_layer": "body", @@ -305,19 +299,16 @@ }, "children": [ { - "$ref": "#/texts/28" + "$ref": "#/texts/26" }, { - "$ref": "#/texts/29" + "$ref": "#/texts/27" }, { "$ref": "#/groups/8" }, { - "$ref": "#/texts/32" - }, - { - "$ref": "#/texts/33" + "$ref": "#/texts/30" } ], "content_layer": "body", @@ -331,10 +322,10 @@ }, "children": [ { - "$ref": "#/texts/30" + "$ref": "#/texts/28" }, { - "$ref": "#/texts/31" + "$ref": "#/texts/29" } ], "content_layer": "body", @@ -348,7 +339,7 @@ }, "children": [ { - "$ref": "#/texts/34" + "$ref": "#/texts/31" } ], "content_layer": "body", @@ -362,7 +353,7 @@ }, "children": [ { - "$ref": "#/texts/36" + "$ref": "#/texts/33" } ], "content_layer": "body", @@ -379,10 +370,7 @@ "$ref": "#/groups/12" }, { - "$ref": "#/texts/47" - }, - { - "$ref": "#/texts/48" + "$ref": "#/texts/44" } ], "content_layer": "body", @@ -396,13 +384,13 @@ }, "children": [ { - "$ref": "#/texts/44" + "$ref": "#/texts/41" }, { - "$ref": "#/texts/45" + "$ref": "#/texts/42" }, { - "$ref": "#/texts/46" + "$ref": "#/texts/43" } ], "content_layer": "body", @@ -416,13 +404,10 @@ }, "children": [ { - "$ref": "#/texts/52" + "$ref": "#/texts/48" }, { - "$ref": "#/texts/53" - }, - { - "$ref": "#/texts/54" + "$ref": "#/texts/49" } ], "content_layer": "body", @@ -436,10 +421,10 @@ }, "children": [ { - "$ref": "#/texts/56" + "$ref": "#/texts/51" }, { - "$ref": "#/texts/57" + "$ref": "#/texts/52" } ], "content_layer": "body", @@ -451,7 +436,11 @@ "parent": { "$ref": "#/body" }, - "children": [], + "children": [ + { + "$ref": "#/texts/55" + } + ], "content_layer": "body", "name": "textbox", "label": "section" @@ -461,7 +450,11 @@ "parent": { "$ref": "#/body" }, - "children": [], + "children": [ + { + "$ref": "#/texts/57" + } + ], "content_layer": "body", "name": "textbox", "label": "section" @@ -473,16 +466,13 @@ }, "children": [ { - "$ref": "#/texts/63" + "$ref": "#/texts/60" }, { - "$ref": "#/texts/64" + "$ref": "#/texts/61" }, { - "$ref": "#/texts/65" - }, - { - "$ref": "#/texts/66" + "$ref": "#/texts/62" } ], "content_layer": "body", @@ -494,7 +484,11 @@ "parent": { "$ref": "#/body" }, - "children": [], + "children": [ + { + "$ref": "#/texts/64" + } + ], "content_layer": "body", "name": "textbox", "label": "section" @@ -581,18 +575,6 @@ }, { "self_ref": "#/texts/5", - "parent": { - "$ref": "#/groups/0" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/6", "parent": { "$ref": "#/groups/1" }, @@ -612,7 +594,7 @@ "marker": "-" }, { - "self_ref": "#/texts/7", + "self_ref": "#/texts/6", "parent": { "$ref": "#/body" }, @@ -625,6 +607,18 @@ "enumerated": false, "marker": "-" }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/8", "parent": { @@ -639,18 +633,6 @@ }, { "self_ref": "#/texts/9", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/10", "parent": { "$ref": "#/groups/2" }, @@ -667,6 +649,18 @@ "strikethrough": false } }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/11", "parent": { @@ -705,18 +699,6 @@ }, { "self_ref": "#/texts/14", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/15", "parent": { "$ref": "#/groups/3" }, @@ -733,6 +715,18 @@ "strikethrough": false } }, + { + "self_ref": "#/texts/15", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/16", "parent": { @@ -747,18 +741,6 @@ }, { "self_ref": "#/texts/17", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/18", "parent": { "$ref": "#/groups/5" }, @@ -778,7 +760,7 @@ "marker": "-" }, { - "self_ref": "#/texts/19", + "self_ref": "#/texts/18", "parent": { "$ref": "#/groups/5" }, @@ -797,32 +779,20 @@ "enumerated": false, "marker": "-" }, + { + "self_ref": "#/texts/19", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/20", - "parent": { - "$ref": "#/groups/4" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/21", - "parent": { - "$ref": "#/groups/4" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/22", "parent": { "$ref": "#/groups/6" }, @@ -835,6 +805,30 @@ "enumerated": false, "marker": "-" }, + { + "self_ref": "#/texts/21", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/22", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/23", "parent": { @@ -873,30 +867,6 @@ }, { "self_ref": "#/texts/26", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/27", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/28", "parent": { "$ref": "#/groups/7" }, @@ -914,7 +884,7 @@ } }, { - "self_ref": "#/texts/29", + "self_ref": "#/texts/27", "parent": { "$ref": "#/groups/7" }, @@ -932,7 +902,7 @@ } }, { - "self_ref": "#/texts/30", + "self_ref": "#/texts/28", "parent": { "$ref": "#/groups/8" }, @@ -952,7 +922,7 @@ "marker": "-" }, { - "self_ref": "#/texts/31", + "self_ref": "#/texts/29", "parent": { "$ref": "#/groups/8" }, @@ -972,7 +942,7 @@ "marker": "-" }, { - "self_ref": "#/texts/32", + "self_ref": "#/texts/30", "parent": { "$ref": "#/groups/7" }, @@ -984,19 +954,7 @@ "text": "" }, { - "self_ref": "#/texts/33", - "parent": { - "$ref": "#/groups/7" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/34", + "self_ref": "#/texts/31", "parent": { "$ref": "#/groups/9" }, @@ -1009,6 +967,48 @@ "enumerated": false, "marker": "-" }, + { + "self_ref": "#/texts/32", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/33", + "parent": { + "$ref": "#/groups/10" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Department of Education:\nCollaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.", + "text": "Department of Education:\nCollaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/34", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/35", "parent": { @@ -1024,20 +1024,14 @@ { "self_ref": "#/texts/36", "parent": { - "$ref": "#/groups/10" + "$ref": "#/body" }, "children": [], "content_layer": "body", "label": "paragraph", "prov": [], - "orig": "Department of Education:\nCollaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.", - "text": "Department of Education:\nCollaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false - } + "orig": "", + "text": "" }, { "self_ref": "#/texts/37", @@ -1089,42 +1083,6 @@ }, { "self_ref": "#/texts/41", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/42", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/43", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/44", "parent": { "$ref": "#/groups/12" }, @@ -1142,7 +1100,7 @@ } }, { - "self_ref": "#/texts/45", + "self_ref": "#/texts/42", "parent": { "$ref": "#/groups/12" }, @@ -1160,7 +1118,7 @@ } }, { - "self_ref": "#/texts/46", + "self_ref": "#/texts/43", "parent": { "$ref": "#/groups/12" }, @@ -1178,7 +1136,7 @@ } }, { - "self_ref": "#/texts/47", + "self_ref": "#/texts/44", "parent": { "$ref": "#/groups/11" }, @@ -1189,22 +1147,64 @@ "orig": "", "text": "" }, + { + "self_ref": "#/texts/45", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/46", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/47", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/48", "parent": { - "$ref": "#/groups/11" + "$ref": "#/groups/13" }, "children": [], "content_layer": "body", "label": "paragraph", "prov": [], - "orig": "", - "text": "" + "orig": "Whether the epidemic has eased.", + "text": "Whether the epidemic has eased.", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + } }, { "self_ref": "#/texts/49", "parent": { - "$ref": "#/body" + "$ref": "#/groups/13" }, "children": [], "content_layer": "body", @@ -1227,72 +1227,6 @@ }, { "self_ref": "#/texts/51", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/52", - "parent": { - "$ref": "#/groups/13" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "Whether the epidemic has eased.", - "text": "Whether the epidemic has eased.", - "formatting": { - "bold": true, - "italic": false, - "underline": false, - "strikethrough": false - } - }, - { - "self_ref": "#/texts/53", - "parent": { - "$ref": "#/groups/13" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/54", - "parent": { - "$ref": "#/groups/13" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/55", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/56", "parent": { "$ref": "#/groups/14" }, @@ -1310,7 +1244,7 @@ } }, { - "self_ref": "#/texts/57", + "self_ref": "#/texts/52", "parent": { "$ref": "#/groups/14" }, @@ -1327,6 +1261,78 @@ "strikethrough": false } }, + { + "self_ref": "#/texts/53", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/54", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/55", + "parent": { + "$ref": "#/groups/15" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Yes", + "text": "Yes", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/56", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/57", + "parent": { + "$ref": "#/groups/16" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Yes", + "text": "Yes", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, { "self_ref": "#/texts/58", "parent": { @@ -1353,42 +1359,6 @@ }, { "self_ref": "#/texts/60", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/61", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/62", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/63", "parent": { "$ref": "#/groups/17" }, @@ -1406,7 +1376,7 @@ } }, { - "self_ref": "#/texts/64", + "self_ref": "#/texts/61", "parent": { "$ref": "#/groups/17" }, @@ -1418,19 +1388,7 @@ "text": "" }, { - "self_ref": "#/texts/65", - "parent": { - "$ref": "#/groups/17" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/66", + "self_ref": "#/texts/62", "parent": { "$ref": "#/groups/17" }, @@ -1447,6 +1405,60 @@ "strikethrough": false } }, + { + "self_ref": "#/texts/63", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/64", + "parent": { + "$ref": "#/groups/18" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "No", + "text": "No", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/65", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/66", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/67", "parent": { @@ -1458,42 +1470,6 @@ "prov": [], "orig": "", "text": "" - }, - { - "self_ref": "#/texts/68", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/69", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/70", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" } ], "pictures": [], diff --git a/tests/data/groundtruth/docling_v2/textbox.docx.md b/tests/data/groundtruth/docling_v2/textbox.docx.md index 9458bd0..293c4d8 100644 --- a/tests/data/groundtruth/docling_v2/textbox.docx.md +++ b/tests/data/groundtruth/docling_v2/textbox.docx.md @@ -40,6 +40,12 @@ The Health Bureau will handle **reporting and specimen collection** . No +Yes + +Yes + **Case closed.** -The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary. \ No newline at end of file +The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary. + +No \ No newline at end of file diff --git a/tests/data/webp/groundtruth/docling_v2/webp-test.doctags.txt b/tests/data/webp/groundtruth/docling_v2/webp-test.doctags.txt index 76fe886..5682a13 100644 --- a/tests/data/webp/groundtruth/docling_v2/webp-test.doctags.txt +++ b/tests/data/webp/groundtruth/docling_v2/webp-test.doctags.txt @@ -1,2 +1,2 @@ -Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package +Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package \ No newline at end of file diff --git a/tests/data/webp/groundtruth/docling_v2/webp-test.json b/tests/data/webp/groundtruth/docling_v2/webp-test.json index 94c9bda..bf14a5c 100644 --- a/tests/data/webp/groundtruth/docling_v2/webp-test.json +++ b/tests/data/webp/groundtruth/docling_v2/webp-test.json @@ -42,10 +42,10 @@ { "page_no": 1, "bbox": { - "l": 238.19302423176944, + "l": 234.08627147881114, "t": 2570.0959833241664, - "r": 1696.0985546594009, - "b": 2315.204273887442, + "r": 1696.0985042090742, + "b": 2319.1220927976665, "coord_origin": "BOTTOMLEFT" }, "charspan": [ diff --git a/tests/data/webp/groundtruth/docling_v2/webp-test.pages.json b/tests/data/webp/groundtruth/docling_v2/webp-test.pages.json index 67ad465..732403c 100644 --- a/tests/data/webp/groundtruth/docling_v2/webp-test.pages.json +++ b/tests/data/webp/groundtruth/docling_v2/webp-test.pages.json @@ -40,14 +40,14 @@ "a": 255 }, "rect": { - "r_x0": 238.19302423176944, - "r_y0": 415.36904822716525, - "r_x1": 1696.0985546594009, - "r_y1": 415.36904822716525, - "r_x2": 1696.0985546594009, - "r_y2": 345.20535775097477, - "r_x3": 238.19302423176944, - "r_y3": 345.20535775097477, + "r_x0": 234.08627147881114, + "r_y0": 419.5788697734327, + "r_x1": 1696.0985042090742, + "r_y1": 419.5788697734327, + "r_x2": 1696.0985042090742, + "r_y2": 349.4151792972422, + "r_x3": 234.08627147881114, + "r_y3": 349.4151792972422, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -65,14 +65,14 @@ "a": 255 }, "rect": { - "r_x0": 245.43122061153045, - "r_y0": 513.795726112558, - "r_x1": 514.3223724413002, - "r_y1": 513.795726112558, - "r_x2": 514.3223724413002, - "r_y2": 436.0574704074058, - "r_x3": 245.43122061153045, - "r_y3": 436.0574704074058, + "r_x0": 242.29979922858777, + "r_y0": 509.8779072023336, + "r_x1": 513.3470125989277, + "r_y1": 509.8779072023336, + "r_x2": 513.3470125989277, + "r_y2": 439.9752910477536, + "r_x3": 242.29979922858777, + "r_y3": 439.9752910477536, "coord_origin": "TOPLEFT" }, "text": "package", @@ -90,13 +90,13 @@ "id": 0, "label": "text", "bbox": { - "l": 238.19302423176944, + "l": 234.08627147881114, "t": 258.9040166758338, - "r": 1696.0985546594009, - "b": 513.795726112558, + "r": 1696.0985042090742, + "b": 509.8779072023336, "coord_origin": "TOPLEFT" }, - "confidence": 0.9721010327339172, + "confidence": 0.9721011519432068, "cells": [ { "index": 0, @@ -132,14 +132,14 @@ "a": 255 }, "rect": { - "r_x0": 238.19302423176944, - "r_y0": 415.36904822716525, - "r_x1": 1696.0985546594009, - "r_y1": 415.36904822716525, - "r_x2": 1696.0985546594009, - "r_y2": 345.20535775097477, - "r_x3": 238.19302423176944, - "r_y3": 345.20535775097477, + "r_x0": 234.08627147881114, + "r_y0": 419.5788697734327, + "r_x1": 1696.0985042090742, + "r_y1": 419.5788697734327, + "r_x2": 1696.0985042090742, + "r_y2": 349.4151792972422, + "r_x3": 234.08627147881114, + "r_y3": 349.4151792972422, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -157,14 +157,14 @@ "a": 255 }, "rect": { - "r_x0": 245.43122061153045, - "r_y0": 513.795726112558, - "r_x1": 514.3223724413002, - "r_y1": 513.795726112558, - "r_x2": 514.3223724413002, - "r_y2": 436.0574704074058, - "r_x3": 245.43122061153045, - "r_y3": 436.0574704074058, + "r_x0": 242.29979922858777, + "r_y0": 509.8779072023336, + "r_x1": 513.3470125989277, + "r_y1": 509.8779072023336, + "r_x2": 513.3470125989277, + "r_y2": 439.9752910477536, + "r_x3": 242.29979922858777, + "r_y3": 439.9752910477536, "coord_origin": "TOPLEFT" }, "text": "package", @@ -195,13 +195,13 @@ "id": 0, "label": "text", "bbox": { - "l": 238.19302423176944, + "l": 234.08627147881114, "t": 258.9040166758338, - "r": 1696.0985546594009, - "b": 513.795726112558, + "r": 1696.0985042090742, + "b": 509.8779072023336, "coord_origin": "TOPLEFT" }, - "confidence": 0.9721010327339172, + "confidence": 0.9721011519432068, "cells": [ { "index": 0, @@ -237,14 +237,14 @@ "a": 255 }, "rect": { - "r_x0": 238.19302423176944, - "r_y0": 415.36904822716525, - "r_x1": 1696.0985546594009, - "r_y1": 415.36904822716525, - "r_x2": 1696.0985546594009, - "r_y2": 345.20535775097477, - "r_x3": 238.19302423176944, - "r_y3": 345.20535775097477, + "r_x0": 234.08627147881114, + "r_y0": 419.5788697734327, + "r_x1": 1696.0985042090742, + "r_y1": 419.5788697734327, + "r_x2": 1696.0985042090742, + "r_y2": 349.4151792972422, + "r_x3": 234.08627147881114, + "r_y3": 349.4151792972422, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -262,14 +262,14 @@ "a": 255 }, "rect": { - "r_x0": 245.43122061153045, - "r_y0": 513.795726112558, - "r_x1": 514.3223724413002, - "r_y1": 513.795726112558, - "r_x2": 514.3223724413002, - "r_y2": 436.0574704074058, - "r_x3": 245.43122061153045, - "r_y3": 436.0574704074058, + "r_x0": 242.29979922858777, + "r_y0": 509.8779072023336, + "r_x1": 513.3470125989277, + "r_y1": 509.8779072023336, + "r_x2": 513.3470125989277, + "r_y2": 439.9752910477536, + "r_x3": 242.29979922858777, + "r_y3": 439.9752910477536, "coord_origin": "TOPLEFT" }, "text": "package", @@ -293,13 +293,13 @@ "id": 0, "label": "text", "bbox": { - "l": 238.19302423176944, + "l": 234.08627147881114, "t": 258.9040166758338, - "r": 1696.0985546594009, - "b": 513.795726112558, + "r": 1696.0985042090742, + "b": 509.8779072023336, "coord_origin": "TOPLEFT" }, - "confidence": 0.9721010327339172, + "confidence": 0.9721011519432068, "cells": [ { "index": 0, @@ -335,14 +335,14 @@ "a": 255 }, "rect": { - "r_x0": 238.19302423176944, - "r_y0": 415.36904822716525, - "r_x1": 1696.0985546594009, - "r_y1": 415.36904822716525, - "r_x2": 1696.0985546594009, - "r_y2": 345.20535775097477, - "r_x3": 238.19302423176944, - "r_y3": 345.20535775097477, + "r_x0": 234.08627147881114, + "r_y0": 419.5788697734327, + "r_x1": 1696.0985042090742, + "r_y1": 419.5788697734327, + "r_x2": 1696.0985042090742, + "r_y2": 349.4151792972422, + "r_x3": 234.08627147881114, + "r_y3": 349.4151792972422, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -360,14 +360,14 @@ "a": 255 }, "rect": { - "r_x0": 245.43122061153045, - "r_y0": 513.795726112558, - "r_x1": 514.3223724413002, - "r_y1": 513.795726112558, - "r_x2": 514.3223724413002, - "r_y2": 436.0574704074058, - "r_x3": 245.43122061153045, - "r_y3": 436.0574704074058, + "r_x0": 242.29979922858777, + "r_y0": 509.8779072023336, + "r_x1": 513.3470125989277, + "r_y1": 509.8779072023336, + "r_x2": 513.3470125989277, + "r_y2": 439.9752910477536, + "r_x3": 242.29979922858777, + "r_y3": 439.9752910477536, "coord_origin": "TOPLEFT" }, "text": "package", diff --git a/tests/data/xlsx/sample_sales_data.xlsm b/tests/data/xlsx/sample_sales_data.xlsm new file mode 100644 index 0000000000000000000000000000000000000000..0f3832a0b178f816e214f8fcde79c572af372bdb GIT binary patch literal 9945 zcmeHt1y@|j)^-!zAvEsp?i$=71Pku&(!o7QaCZp~2_Bpv!QDx4*Wd*Ax09LsWrmsW z7ue=?}qa+6j1q8qV-~a#sDIgYBzrz~>0Jwny05AY>&$Pwt?Lek> zAbnML2UBNVCO2Cfl3b`~G}(Y>;OGB${TJ^*X@Y`cCktB8rSt;n4=hSMO_IPg7YI}M zL>gF!151prRgBTv6Z7mYzEH&+*)AB=RomX9>_n)Nwt3Z@Zlz_knr%B(ND(z3xx(XG zUcNu|Ofp21gttXbuz~E8?A$A92dx9x!?-j)(Lr5);T=H0^ACDsU}0wGP5#XL2HI7W zsr%WLZG~%Z0VjlfpV*W^XFJAFw`_>b!o*P-1}dKBMT0JmCMCwmcdamC#p}6Z;sh@# z8XC5;_Kz-ZVq7>cWZ$v3LAp3rB;e`O_fd6(V2{TjFIhbp-aDBceBxnzuFI7oMlXV` z0TGysJ~3Z~&W_AMZB^~+7LY&|Ta@z%yK#B8?IKh`Nkjh{nX*NATcnuc+YZ?wMT_ed zoT}!^clXvHib;sB%5gvJr33Yva2tV$rUYI&%o42`?aHZ|?R#i8adri^*-W?8;ZE!GL>wgaj!4&1!4a zSjf-7GxA`gLn8g_oTyNWn+acOiY3Ul(K0MCDvX zq(76Y`uNE#pw>p_QQ$AOQ4^r4;=P5G@@?_C8<<}dh}`WXy;$QY4Sx>gC$DoY3r@aq zd;?2I<&-4pSi0JU?(+5g>qV-RtUHZMOEg_cLvfB=|1!DcAurRh2N21yyKPMpGd{*b2c)cE%)lPAw9pw zQ&X|vwX8JCa^fR%*Eg~Gaw?wQhH>x3sF=~GOvds2ifvS?mood@SF48OWH8gciyOYP zr0=*tAe^{(2^_%wX(aDIzBKv(HV`%#000%dGj2A_uJ%sWM)vmBKf_wS>b&AQ3!1mS zLsWjQv+)bIMx1*A(RohD0P_EHHadUB|=6dsDnSS%L_`387ttV%1o_2 zao!(?=>y~q7j}W|IKl;bLqM9E8jL(PrSMP?&~Dz(NVU?WNp)_l5vqQ#INA!*9!VeA zF=lE1Y;s&J$UsuN#eo{2;F2GG7cKf+0+zHm+W43?pNQ3q(F^6WRQ^sz{e+0#2Rgii zIZuM5Op+cR>L}jgUa%g8D2q98-JAA<|rj9DY6uxkG8H44hIk(8vQDBz{ zU?qW1*2>cQhh+0vgYtt|Yr^#o`n#EtlEoYEvoE~yHP177-x>%KJ_d3uPH<8n=R-uK zr)Axgf`~EMWzMcZi+fR2_v0ixC?1(BR#2VU^Y~EflS`H}_zG*)gM;P{JzemVHBQ&z zL;|2N&zL*%NoKA3BpP|T*p>-51pTZpqesEtF7^p8GLQ6*BfZt4mUC}JiskL`qU#Qt zhy#4ndyYxil8YLMMQ*J}eH^NW{B5^O_fT{A1WJI!mAFSU@+vGA7vIdE9-cDZ}_8iIs!{&$|+J5+S z7!uCtpo}O$3L4}2{_bAM0U}D8Ge)COTpvRgkZ0?Ph7va z1<4_6U3~2)^^C7pdBu%tw)KL>0#UENGzhnDK3I^njo`x`2w^ZRzL%0P5E~(X=`q1Dy z=Rywa8o}&1=5wk$N56L&UNG?Ic8pbLcsUth$>ixlmT*B7Y^hCaVA!QDcJvYrG0gMq zh>y2{T_N1lA#Hxx1qVJ4R(wiX#y3Jqwua{Q-jaqM{m^ZULIVkGxNFqt6$8dPx2zU| zb%v#PfQicwuk;5DPa_>2>H6#Cpnj^M4#i;P=G-5nngc}4rFRJUzR%Tl3|3udo#@lU z9u$@<&|9m=@~5{1E}}=wBfAJuG|>@lCGpzUeMx zWxj76W}<9Nb6*XA=u5vHjze$HeTIXKNzJAQnBJ&sPDif}mX`OU$!^UAy{osU=BM|{ z*H-M_gfFOWzBiU}`FeW_2tHn8k4}ESb9Dc(prRYu8LFaBo175p(-vhnqt|QbIZ(xo zduqzir=i=E8^I}OBL_k>vK7u>fUbfTFYhECy0U$J>G%9ftil-9PxH;;LpQXpadzH9 z5X!>{{&Xzpa#%W@I8A)paM?J?Ix!mMybU$Q3A>eWpJ+PPtMYS>f|oF1^~Z@fFnj~L zI~T&eZzu@&?RK6oLX2>g&@GM#OEp?;I}*ykdg8l;qO{VQsTIuQ@6fzH5GW`A?nl0l zLCDfu`Vj;RNx1pZ`=@Y*ApQ19GGygQ8(cMlJF!@7O_zN6-2%-Z)C^w& z29j|hn$=j}p-)j3j%dt`P$49+h~j5xvIgP=Z>!~#H>)5$!$TcDEkzbYa-i4-s&ZzTvy$v`6dat2K*;S@Y(9%uV@?X5;NbvjWir9EVqxjN zunI46v!pMvWAH9;+@Y^YBR%P78N7L=UD_>Y-;PZR8;3m+nFg5o6;R7DKm_aOsYS*o>jeyqFM zO=ML4HsuXO%ON@cRO(YVff{P4VOH->lk*)kN!IvYAF9GHH{UQ{l3j$F?L_vZR^+R% zP>mLh@Z8-7xbV77IXh^)IbG+}%e{ANReGvcU|T=&9iU&^OiZf0WN)!sq2 zhiPJ=>&@-ad*Qz9Msem=uOIW()Xc3nXO~Aro-5kVPzR2M6hg)8`_SjMX?M(!)@2o8 z*1Y&{;0s4kG61%y-^*nQR?6VbgmlsydOp)pnRL_)koDd<8KgV*7n9A{BR}p)|T-ETQ?$p*$(K#0{C^iNy z#Xax{V=Lu^kBC0wM*6~WU;w5qRBa98hXGj!TRA zUTe5L6LkpbiIP4AjNA9*7SAc(D_CAM84b~>uDf=-dGOh`k8j9@W@1pf;U4O1xYeKV z%#)+-g~N`1Zn`Z$gn7Jf`pl3#s%5(kjg_|Hu5Z({0kVu2$d4c_h9~DP_#RJ5fb|L$ zO99fpuwiyAf%>?X<`9JmYi3_0pd`9j7Ug+{ShOxDxjqBpM3p#iGRu6bDZ5u3M$Isf zOwKS@yrJ$WQXP?Wsf)Ww^7KIUYD7dT(3ss<5x30pnYn45%jQ9fTA8Zc@YY9l&fHYx z?P4w!T(5>=Jcf}{cwM-FCFs%xKXlzelxlc0iV2r`=hG?1wDX`QX(}GFpy-;VaG{TQ z)C$I?c#=fK-+7OK#EA`@oU`>SE?CrwvtN@c(>;!Cn>h&Ww)H05J;?|))fE+!a`M#2 zN-MrW0@s=`kqk?NDlAd-lABIC8G=9OD8b`j6j4>fPd9R>KU9`>C9N|~8 z*Ba=KG2&7dUwVv)-`>6{4iG|5bl^9n3-IwP;jpzm9bg=hm`G%ehKPNHt0sppd@ zO0`*~iZ*9Up{6fGM+Kb@*d_3{(NS$OOD8(vJdUc4Z^P?gmrBe*k}hl@M1WfBo-wI* z3a6=Y>zU<#>BMG@_8|Ibk%;%K%wtC+qt;7?pN(MLk_LE8O z5zy%C+WsdBO_4rvRT-UB?JC%fn9A02e`N1ESQ>vN?yvbYkT9-n4-4C5Z(*wJoE|=8 zMk{Qc*1S=@6i1ipM)f~`i_;YG(hO+zdb z-y*Jl-vsw}(5U6Urt{V(XQz|%#qH||U5%NlH;t3$DR^-gCSki57JPj;y(#*&11|(l z+PxjlxO7gUQ{yhI9wBs2+7k?YDf~9-M>w^)AHnt5KT~flr4XJ2@C*PJ06_jDEIEVR zZA_hiF%W7B_VX-AEvPfTXdqY9&yGn@b>Rgc{GaKvW)oJ5F^4fyi=G=w%Wm7u_!67) zhlZoYC$R-zExwt(G^M7~%2v3M-Z)NPcZ zwQspiu52FBrQURi)K7`4Q+qoNnzU|#P39ogE+rsd3txuNW;l4Jheb>Chypsa(?OPE z6X6mLQS$-$UH+EiG*m$n@*A&VXxCd5L~>20a&B4tCM@CFNU9ofTMkZEt6@@Ub8mF1 z9}^ZO)~+xO;?tbWvvN(&Y}7RZq!2&mV{!unPpTKRPbKr=S@%OduN|^JW-{bg7f`YNX;T&ke$ zLkIDB&Rv;6?eaJVra`EH(0uMN4y+rLdM)&*-R(s?R>_<5TdUk=cPp4H9K@rIjd!>9&hy6LeB_CFF10!^f_YI zt4M^f%3;XFu+c)8w3b(Vm!X+B>*!myh5+;n6H!EJlbMEj{7wd|x#!wD57LvO>->dWF@XwB7(AhiDqhv64dfv3{v%U$VL8~e-`9_&^f=$@?T_5V01e|2Me0`QNSbGh24Y)T5Ms@S#!C?6)*RKOtoWB9QU1LcD~5KW$?1{z#T(zld}Hi&m!Sp zjvj+lpr;$WaC$H!@#oQl42{4pk@?r~Clm3Dn+-mCy++8eTrV?+!1zifP&hGW22*Yp zmSrTr_L%fXPK{{uZPdDw(qy=m_bLzj1@~h_@9AD82xnPgXBEVj@}1@F z%^hmW5q4d6OAb{KYLp%`lh2T#XRPpXjSJ?Y+j5fT))frU*?2NQ@}+U95s~i!pK}?s z`e$Un*KFw<*QSLjV;tzR-J||AX8L0MYm&7?;n<`>%dQNvK<1oDKxBnzyGH#1JuDAM!A+(82d1 zs*XeVN)QlpXy;qEak9}Iz;TP*hK;a@PNVccBrbwd^kP%I-@OwYyFDW+`v7?<)y}s= zk+jc1!=(|oAa;}zqS7@(zKvca)%C*PGWy;%jE_H^k3Uza9SgLRZ?-JEd-JT~POqcI zPX4|6G<4<(_7qt$y;mvzX&vy}H3%288=KG;)yBYs&7R&~0`V%K3V*#Ev|})A>>|^* z_V2vqo;7XoEAX+KgQFojn71^sH&$}8cW`Dlws$i9v(5ZJx)N+LfpM?oI$3~0%g~Qv z{hsNC#*;yHV$`0>Xc;uEK^u0+b(Yd>pC3FYT}lWMMD^mF4|{o&^Gl4?VDt@2OVxs< zfq4L6MUd((Qi~n;DkDOvv25ih^_Uzi2CdEM*R<0K(=b^IW$irhT*nlqw9~kB8IwTc zLB!&e$YwEj$Jgm$yuO^CrMQnKo?m1Lm>r;_;J0uN{c?yxcLIT4;=6BYJ^fa2WQ&D) z!)3&eqo@5p?7xR>VRKZUsGW1=>(2ZHP24r^UE{>TrGUX%EfhTZ^7wXgT~41f;p03e zY(i0hik)BM+=j02BGDl5CDCW-q@gD2w!KU!$K|cP4(UBBT_{&r7TSk{%zG0#EKruxr@NhJIzz6kAtAfKb^SFmijyQXJZ4KLcG4^e z_xHCytT+Ch3|&EKVGRZ25D^jpK>I5W4ILc*2SYIa{=72cM1CfxK}V1m#7K+yyo*Xm z{-s71+-H?W5OH&B-vUFRan-e(C3_UjN|m)0Y1cvxULxde@ z%XnT?)*gE2J3N~KclPkd%9w3_4uBF>)LAhT2m+Hyqf#-cj02T{(J3-ET*iP8yh+Xn z%@^+voPg4fOaw?&Xt){Vc7m|^?}zc&9^bn0QWN1)Ey0_Ui;OX{G>Z|~GltxoK6dt+ zd+gN7aIIQbVvu?}n#e|eSQso>Lw z>Tl6`F!T1Lz4}!6?|rM^q5wb@{4e4EPZ#T{o~O->-jA9oc4hC;&++%RP<@?`&-l%?+?+ZnebB$PxHs$8s32E&%d{K{xzF? zs_$t~Ndy4=BiKI`|GQWItN01YU&R0P(ModAV88(Y QNZ=0!7_*f>lW4&I0ZH2!6aWAK literal 0 HcmV?d00001 diff --git a/tests/test_backend_msexcel.py b/tests/test_backend_msexcel.py index 65f636e..6dd7342 100644 --- a/tests/test_backend_msexcel.py +++ b/tests/test_backend_msexcel.py @@ -16,13 +16,13 @@ _log = logging.getLogger(__name__) GENERATE = GEN_TEST_DATA -def get_xlsx_paths(): +def get_excel_paths(): # Define the directory you want to search directory = Path("./tests/data/xlsx/") - # List all PDF files in the directory and its subdirectories - pdf_files = sorted(directory.rglob("*.xlsx")) - return pdf_files + # List all Excel files in the directory and its subdirectories + excel_files = sorted(directory.rglob("*.xlsx")) + sorted(directory.rglob("*.xlsm")) + return excel_files def get_converter(): @@ -35,17 +35,17 @@ def get_converter(): def documents() -> list[tuple[Path, DoclingDocument]]: documents: list[dict[Path, DoclingDocument]] = [] - xlsx_paths = get_xlsx_paths() + excel_paths = get_excel_paths() converter = get_converter() - for xlsx_path in xlsx_paths: - _log.debug(f"converting {xlsx_path}") + for excel_path in excel_paths: + _log.debug(f"converting {excel_path}") gt_path = ( - xlsx_path.parent.parent / "groundtruth" / "docling_v2" / xlsx_path.name + excel_path.parent.parent / "groundtruth" / "docling_v2" / excel_path.name ) - conv_result: ConversionResult = converter.convert(xlsx_path) + conv_result: ConversionResult = converter.convert(excel_path) doc: DoclingDocument = conv_result.document @@ -55,7 +55,7 @@ def documents() -> list[tuple[Path, DoclingDocument]]: return documents -def test_e2e_xlsx_conversions(documents) -> None: +def test_e2e_excel_conversions(documents) -> None: for gt_path, doc in documents: pred_md: str = doc.export_to_markdown() assert verify_export(pred_md, str(gt_path) + ".md"), "export to md" @@ -79,7 +79,7 @@ def test_pages(documents) -> None: documents: The paths and converted documents. """ # number of pages from the backend method - path = next(item for item in get_xlsx_paths() if item.stem == "test-01") + path = next(item for item in get_excel_paths() if item.stem == "test-01") in_doc = InputDocument( path_or_stream=path, format=InputFormat.XLSX,