feat: leverage new list modeling, capture default markers (#1856)

* chore: update docling-core & regenerate test data

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

* update backends to leverage new list modeling

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

* repin docling-core

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

* ensure availability of latest docling-core API

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

---------

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
This commit is contained in:
Panos Vagenas
2025-06-27 16:37:15 +02:00
committed by GitHub
parent e79e4f0ab6
commit 0533da1923
90 changed files with 2252 additions and 2240 deletions
+70 -70
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "2203.01017v2",
"origin": {
"mimetype": "application/pdf",
@@ -1340,7 +1340,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/13",
@@ -2096,7 +2096,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/39",
@@ -3055,7 +3055,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/72",
@@ -3086,7 +3086,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/73",
@@ -3117,7 +3117,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/74",
@@ -3148,7 +3148,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/75",
@@ -9249,7 +9249,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/284",
@@ -9280,7 +9280,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/285",
@@ -11288,7 +11288,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/354",
@@ -11348,7 +11348,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/356",
@@ -11379,7 +11379,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/357",
@@ -11410,7 +11410,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/358",
@@ -11441,7 +11441,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/359",
@@ -11472,7 +11472,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/360",
@@ -11503,7 +11503,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/361",
@@ -11534,7 +11534,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/362",
@@ -11565,7 +11565,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/363",
@@ -11596,7 +11596,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/364",
@@ -11627,7 +11627,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/365",
@@ -11658,7 +11658,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/366",
@@ -11689,7 +11689,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/367",
@@ -11720,7 +11720,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/368",
@@ -11751,7 +11751,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/369",
@@ -11782,7 +11782,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/370",
@@ -11813,7 +11813,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/371",
@@ -11844,7 +11844,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/372",
@@ -11875,7 +11875,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/373",
@@ -11906,7 +11906,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/374",
@@ -11937,7 +11937,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/375",
@@ -11968,7 +11968,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/376",
@@ -11999,7 +11999,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/377",
@@ -12030,7 +12030,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/378",
@@ -12061,7 +12061,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/379",
@@ -12092,7 +12092,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/380",
@@ -12181,7 +12181,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/383",
@@ -12212,7 +12212,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/384",
@@ -12243,7 +12243,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/385",
@@ -12274,7 +12274,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/386",
@@ -12305,7 +12305,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/387",
@@ -12336,7 +12336,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/388",
@@ -12367,7 +12367,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/389",
@@ -12398,7 +12398,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/390",
@@ -12429,7 +12429,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/391",
@@ -12460,7 +12460,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/392",
@@ -12491,7 +12491,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/393",
@@ -12522,7 +12522,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/394",
@@ -12553,7 +12553,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/395",
@@ -12584,7 +12584,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/396",
@@ -12923,7 +12923,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/407",
@@ -12954,7 +12954,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/408",
@@ -12985,7 +12985,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/409",
@@ -13016,7 +13016,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/410",
@@ -13047,7 +13047,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/411",
@@ -14906,7 +14906,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/475",
@@ -14937,7 +14937,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/476",
@@ -15055,7 +15055,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/480",
@@ -15086,7 +15086,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/481",
@@ -15117,7 +15117,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/482",
@@ -15148,7 +15148,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/483",
@@ -15179,7 +15179,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/484",
@@ -15268,7 +15268,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/487",
@@ -15299,7 +15299,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/488",
@@ -15330,7 +15330,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/489",
@@ -15361,7 +15361,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/490",
@@ -15392,7 +15392,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/491",
@@ -15452,7 +15452,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/493",
@@ -15483,7 +15483,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/494",
@@ -15514,7 +15514,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/495",
@@ -15545,7 +15545,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/496",
+35 -35
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "2206.01062",
"origin": {
"mimetype": "application/pdf",
@@ -10866,7 +10866,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/356",
@@ -10897,7 +10897,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/357",
@@ -10928,7 +10928,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/358",
@@ -10959,7 +10959,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/359",
@@ -11048,7 +11048,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/362",
@@ -12430,7 +12430,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/409",
@@ -12461,7 +12461,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/410",
@@ -12492,7 +12492,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/411",
@@ -12523,7 +12523,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/412",
@@ -12554,7 +12554,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/413",
@@ -12585,7 +12585,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/414",
@@ -14713,7 +14713,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/487",
@@ -14744,7 +14744,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/488",
@@ -14775,7 +14775,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/489",
@@ -14806,7 +14806,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/490",
@@ -14837,7 +14837,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/491",
@@ -14868,7 +14868,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/492",
@@ -14899,7 +14899,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/493",
@@ -14930,7 +14930,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/494",
@@ -14961,7 +14961,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/495",
@@ -14992,7 +14992,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/496",
@@ -15023,7 +15023,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/497",
@@ -15054,7 +15054,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/498",
@@ -15085,7 +15085,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/499",
@@ -15580,7 +15580,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/516",
@@ -15611,7 +15611,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/517",
@@ -15642,7 +15642,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/518",
@@ -15673,7 +15673,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/519",
@@ -15704,7 +15704,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/520",
@@ -15735,7 +15735,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/521",
@@ -15766,7 +15766,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/522",
@@ -15797,7 +15797,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/523",
@@ -15828,7 +15828,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/524",
@@ -15859,7 +15859,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
}
],
"pictures": [
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "2305.03393v1-pg9",
"origin": {
"mimetype": "application/pdf",
@@ -60,8 +60,6 @@
<page_header><loc_159><loc_59><loc_366><loc_64>Optimized Table Tokenization for Table Structure Recognition</page_header>
<page_header><loc_389><loc_59><loc_393><loc_64>7</page_header>
<picture><loc_135><loc_103><loc_367><loc_177><caption><loc_110><loc_79><loc_393><loc_98>Fig. 3. OTSL description of table structure: A - table example; B - graphical representation of table structure; C - mapping structure on a grid; D - OTSL structure encoding; E - explanation on cell encoding</caption></picture>
<unordered_list><list_item><loc_273><loc_172><loc_349><loc_176>4 - 2d merges: "C", "L", "U", "X"</list_item>
</unordered_list>
<section_header_level_1><loc_110><loc_193><loc_202><loc_198>4.2 Language Syntax</section_header_level_1>
<text><loc_110><loc_205><loc_297><loc_211>The OTSL representation follows these syntax rules:</text>
<unordered_list><list_item><loc_114><loc_219><loc_393><loc_232>1. Left-looking cell rule : The left neighbour of an "L" cell must be either another "L" cell or a "C" cell.</list_item>
File diff suppressed because it is too large Load Diff
-2
View File
@@ -84,8 +84,6 @@ Fig. 3. OTSL description of table structure: A - table example; B - graphical re
<!-- image -->
- 4 - 2d merges: "C", "L", "U", "X"
## 4.2 Language Syntax
The OTSL representation follows these syntax rules:
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "amt_handbook_sample",
"origin": {
"mimetype": "application/pdf",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "code_and_formula",
"origin": {
"mimetype": "application/pdf",
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "csv-comma-in-cell",
"origin": {
"mimetype": "text/csv",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "csv-comma",
"origin": {
"mimetype": "text/csv",
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "csv-inconsistent-header",
"origin": {
"mimetype": "text/csv",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "csv-pipe",
"origin": {
"mimetype": "text/csv",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "csv-semicolon",
"origin": {
"mimetype": "text/csv",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "csv-tab",
"origin": {
"mimetype": "text/csv",
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "csv-too-few-columns",
"origin": {
"mimetype": "text/csv",
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "csv-too-many-columns",
"origin": {
"mimetype": "text/csv",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "equations",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+5 -2
View File
@@ -7,6 +7,9 @@ item-0 at level 0: unspecified: group _root_
item-6 at level 3: list: group list
item-7 at level 4: list_item: First item in unordered list
item-8 at level 4: list_item: Second item in unordered list
item-9 at level 3: ordered_list: group ordered list
item-9 at level 3: list: group ordered list
item-10 at level 4: list_item: First item in ordered list
item-11 at level 4: list_item: Second item in ordered list
item-11 at level 4: list_item: Second item in ordered list
item-12 at level 3: list: group ordered list start 42
item-13 at level 4: list_item: First item in ordered list with start
item-14 at level 4: list_item: Second item in ordered list with start
+55 -7
View File
@@ -1,10 +1,10 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "example_01",
"origin": {
"mimetype": "text/html",
"binary_hash": 13782069548509991617,
"binary_hash": 13726679883013609282,
"filename": "example_01.html"
},
"furniture": {
@@ -58,7 +58,24 @@
],
"content_layer": "body",
"name": "ordered list",
"label": "ordered_list"
"label": "list"
},
{
"self_ref": "#/groups/2",
"parent": {
"$ref": "#/texts/2"
},
"children": [
{
"$ref": "#/texts/8"
},
{
"$ref": "#/texts/9"
}
],
"content_layer": "body",
"name": "ordered list start 42",
"label": "list"
}
],
"texts": [
@@ -110,6 +127,9 @@
},
{
"$ref": "#/groups/1"
},
{
"$ref": "#/groups/2"
}
],
"content_layer": "body",
@@ -143,7 +163,7 @@
"orig": "First item in unordered list",
"text": "First item in unordered list",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/5",
@@ -157,7 +177,7 @@
"orig": "Second item in unordered list",
"text": "Second item in unordered list",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/6",
@@ -171,7 +191,7 @@
"orig": "First item in ordered list",
"text": "First item in ordered list",
"enumerated": true,
"marker": "1."
"marker": ""
},
{
"self_ref": "#/texts/7",
@@ -185,7 +205,35 @@
"orig": "Second item in ordered list",
"text": "Second item in ordered list",
"enumerated": true,
"marker": "2."
"marker": ""
},
{
"self_ref": "#/texts/8",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "First item in ordered list with start",
"text": "First item in ordered list with start",
"enumerated": true,
"marker": "42."
},
{
"self_ref": "#/texts/9",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Second item in ordered list with start",
"text": "Second item in ordered list with start",
"enumerated": true,
"marker": "43."
}
],
"pictures": [
+4 -1
View File
@@ -12,4 +12,7 @@ Some background information here.
- Second item in unordered list
1. First item in ordered list
2. Second item in ordered list
2. Second item in ordered list
42. First item in ordered list with start
43. Second item in ordered list with start
+1 -1
View File
@@ -6,6 +6,6 @@ item-0 at level 0: unspecified: group _root_
item-5 at level 3: list: group list
item-6 at level 4: list_item: First item in unordered list
item-7 at level 4: list_item: Second item in unordered list
item-8 at level 3: ordered_list: group ordered list
item-8 at level 3: list: group ordered list
item-9 at level 4: list_item: First item in ordered list
item-10 at level 4: list_item: Second item in ordered list
+6 -6
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "example_02",
"origin": {
"mimetype": "text/html",
@@ -58,7 +58,7 @@
],
"content_layer": "body",
"name": "ordered list",
"label": "ordered_list"
"label": "list"
}
],
"texts": [
@@ -140,7 +140,7 @@
"orig": "First item in unordered list",
"text": "First item in unordered list",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/5",
@@ -154,7 +154,7 @@
"orig": "Second item in unordered list",
"text": "Second item in unordered list",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/6",
@@ -168,7 +168,7 @@
"orig": "First item in ordered list",
"text": "First item in ordered list",
"enumerated": true,
"marker": "1."
"marker": ""
},
{
"self_ref": "#/texts/7",
@@ -182,7 +182,7 @@
"orig": "Second item in ordered list",
"text": "Second item in ordered list",
"enumerated": true,
"marker": "2."
"marker": ""
}
],
"pictures": [],
+2 -2
View File
@@ -10,9 +10,9 @@ item-0 at level 0: unspecified: group _root_
item-9 at level 6: list_item: Nested item 1
item-10 at level 6: list_item: Nested item 2
item-11 at level 4: list_item: Second item in unordered list
item-12 at level 3: ordered_list: group ordered list
item-12 at level 3: list: group ordered list
item-13 at level 4: list_item: First item in ordered list
item-14 at level 5: ordered_list: group ordered list
item-14 at level 5: list: group ordered list
item-15 at level 6: list_item: Nested ordered item 1
item-16 at level 6: list_item: Nested ordered item 2
item-17 at level 4: list_item: Second item in ordered list
+11 -11
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "example_03",
"origin": {
"mimetype": "text/html",
@@ -75,7 +75,7 @@
],
"content_layer": "body",
"name": "ordered list",
"label": "ordered_list"
"label": "list"
},
{
"self_ref": "#/groups/3",
@@ -92,7 +92,7 @@
],
"content_layer": "body",
"name": "ordered list",
"label": "ordered_list"
"label": "list"
}
],
"texts": [
@@ -198,7 +198,7 @@
"orig": "First item in unordered list",
"text": "First item in unordered list",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/6",
@@ -212,7 +212,7 @@
"orig": "Nested item 1",
"text": "Nested item 1",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/7",
@@ -226,7 +226,7 @@
"orig": "Nested item 2",
"text": "Nested item 2",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/8",
@@ -240,7 +240,7 @@
"orig": "Second item in unordered list",
"text": "Second item in unordered list",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/9",
@@ -258,7 +258,7 @@
"orig": "First item in ordered list",
"text": "First item in ordered list",
"enumerated": true,
"marker": "1"
"marker": ""
},
{
"self_ref": "#/texts/10",
@@ -272,7 +272,7 @@
"orig": "Nested ordered item 1",
"text": "Nested ordered item 1",
"enumerated": true,
"marker": "1."
"marker": ""
},
{
"self_ref": "#/texts/11",
@@ -286,7 +286,7 @@
"orig": "Nested ordered item 2",
"text": "Nested ordered item 2",
"enumerated": true,
"marker": "2."
"marker": ""
},
{
"self_ref": "#/texts/12",
@@ -300,7 +300,7 @@
"orig": "Second item in ordered list",
"text": "Second item in ordered list",
"enumerated": true,
"marker": "2."
"marker": ""
},
{
"self_ref": "#/texts/13",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "example_04",
"origin": {
"mimetype": "text/html",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "example_05",
"origin": {
"mimetype": "text/html",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "example_06",
"origin": {
"mimetype": "text/html",
+15 -15
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "example_07",
"origin": {
"mimetype": "text/html",
@@ -169,7 +169,7 @@
"orig": "Asia",
"text": "Asia",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/1",
@@ -183,7 +183,7 @@
"orig": "China",
"text": "China",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/2",
@@ -197,7 +197,7 @@
"orig": "Japan",
"text": "Japan",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/3",
@@ -211,7 +211,7 @@
"orig": "Thailand",
"text": "Thailand",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/4",
@@ -229,7 +229,7 @@
"orig": "Europe",
"text": "Europe",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/5",
@@ -243,7 +243,7 @@
"orig": "UK",
"text": "UK",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/6",
@@ -257,7 +257,7 @@
"orig": "Germany",
"text": "Germany",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/7",
@@ -275,7 +275,7 @@
"orig": "Switzerland",
"text": "Switzerland",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/8",
@@ -289,7 +289,7 @@
"orig": "Bern",
"text": "Bern",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/9",
@@ -303,7 +303,7 @@
"orig": "Aargau",
"text": "Aargau",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/10",
@@ -321,7 +321,7 @@
"orig": "Italy",
"text": "Italy",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/11",
@@ -335,7 +335,7 @@
"orig": "Piedmont",
"text": "Piedmont",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/12",
@@ -349,7 +349,7 @@
"orig": "Liguria",
"text": "Liguria",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/13",
@@ -363,7 +363,7 @@
"orig": "Africa",
"text": "Africa",
"enumerated": false,
"marker": "-"
"marker": ""
}
],
"pictures": [],
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "example_08",
"origin": {
"mimetype": "text/html",
@@ -56,7 +56,7 @@ groups:
- $ref: '#/texts/27'
- $ref: '#/texts/28'
content_layer: body
label: ordered_list
label: list
name: list
parent:
$ref: '#/body'
@@ -430,7 +430,7 @@ texts:
content_layer: body
enumerated: true
label: list_item
marker: '-'
marker: ''
orig: ''
parent:
$ref: '#/groups/2'
@@ -476,7 +476,7 @@ texts:
content_layer: body
enumerated: true
label: list_item
marker: '-'
marker: ''
orig: ''
parent:
$ref: '#/groups/2'
@@ -519,7 +519,7 @@ texts:
content_layer: body
enumerated: true
label: list_item
marker: '-'
marker: ''
orig: ''
parent:
$ref: '#/groups/2'
@@ -562,7 +562,7 @@ texts:
content_layer: body
enumerated: true
label: list_item
marker: '-'
marker: ''
orig: ''
parent:
$ref: '#/groups/2'
@@ -604,7 +604,7 @@ texts:
content_layer: body
enumerated: true
label: list_item
marker: '-'
marker: ''
orig: Open a Pull Request
parent:
$ref: '#/groups/2'
@@ -621,7 +621,7 @@ texts:
strikethrough: false
underline: false
label: list_item
marker: '-'
marker: ''
orig: Whole list item has same formatting
parent:
$ref: '#/groups/2'
@@ -633,7 +633,7 @@ texts:
content_layer: body
enumerated: true
label: list_item
marker: '-'
marker: ''
orig: ''
parent:
$ref: '#/groups/2'
@@ -693,7 +693,7 @@ texts:
content_layer: body
enumerated: false
label: list_item
marker: '-'
marker: ''
orig: ''
parent:
$ref: '#/groups/8'
@@ -729,7 +729,7 @@ texts:
content_layer: body
enumerated: false
label: list_item
marker: '-'
marker: ''
orig: ''
parent:
$ref: '#/groups/8'
@@ -878,4 +878,4 @@ texts:
prov: []
self_ref: '#/texts/48'
text: Table Heading
version: 1.4.0
version: 1.5.0
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "ipa20180000016.xml",
"origin": {
"mimetype": "application/xml",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "ipa20200022300.xml",
"origin": {
"mimetype": "application/xml",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "lorem_ipsum",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+27 -27
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "multi_page",
"origin": {
"mimetype": "application/pdf",
@@ -534,7 +534,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/8",
@@ -565,7 +565,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/9",
@@ -684,7 +684,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/13",
@@ -715,7 +715,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/14",
@@ -834,7 +834,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/18",
@@ -865,7 +865,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/19",
@@ -896,7 +896,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/20",
@@ -1074,7 +1074,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/26",
@@ -1105,7 +1105,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/27",
@@ -1136,7 +1136,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/28",
@@ -1226,7 +1226,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/31",
@@ -1257,7 +1257,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/32",
@@ -1288,7 +1288,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/33",
@@ -1319,7 +1319,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/34",
@@ -1350,7 +1350,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/35",
@@ -1440,7 +1440,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/38",
@@ -1471,7 +1471,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/39",
@@ -1502,7 +1502,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/40",
@@ -1592,7 +1592,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/43",
@@ -1623,7 +1623,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/44",
@@ -1654,7 +1654,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/45",
@@ -1685,7 +1685,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/46",
@@ -1716,7 +1716,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/47",
@@ -1806,7 +1806,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/50",
@@ -1837,7 +1837,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/51",
@@ -1868,7 +1868,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/52",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "pa20010031492.xml",
"origin": {
"mimetype": "application/xml",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "pftaps057006474.txt",
"origin": {
"mimetype": "text/plain",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "pg06442728.xml",
"origin": {
"mimetype": "application/xml",
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "picture_classification",
"origin": {
"mimetype": "application/pdf",
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "powerpoint_bad_text",
"origin": {
"mimetype": "application/vnd.ms-powerpoint",
@@ -11,7 +11,7 @@ item-0 at level 0: unspecified: group _root_
item-10 at level 2: paragraph: And baz things
item-11 at level 2: paragraph: A rectangle shape with this text inside.
item-12 at level 1: chapter: group slide-2
item-13 at level 2: ordered_list: group list
item-13 at level 2: list: group list
item-14 at level 3: list_item: List item4
item-15 at level 3: list_item: List item5
item-16 at level 3: list_item: List item6
@@ -25,7 +25,7 @@ item-0 at level 0: unspecified: group _root_
item-24 at level 3: list_item: Item A
item-25 at level 3: list_item: Item B
item-26 at level 2: paragraph: Maybe a list?
item-27 at level 2: ordered_list: group list
item-27 at level 2: list: group list
item-28 at level 3: list_item: List1
item-29 at level 3: list_item: List2
item-30 at level 3: list_item: List3
+12 -12
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "powerpoint_sample",
"origin": {
"mimetype": "application/vnd.ms-powerpoint",
@@ -137,7 +137,7 @@
],
"content_layer": "body",
"name": "list",
"label": "ordered_list"
"label": "list"
},
{
"self_ref": "#/groups/4",
@@ -197,7 +197,7 @@
],
"content_layer": "body",
"name": "list",
"label": "ordered_list"
"label": "list"
},
{
"self_ref": "#/groups/7",
@@ -578,7 +578,7 @@
"orig": "I1",
"text": "I1",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/13",
@@ -607,7 +607,7 @@
"orig": "I2",
"text": "I2",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/14",
@@ -636,7 +636,7 @@
"orig": "I3",
"text": "I3",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/15",
@@ -665,7 +665,7 @@
"orig": "I4",
"text": "I4",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/16",
@@ -721,7 +721,7 @@
"orig": "Item A",
"text": "Item A",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/18",
@@ -750,7 +750,7 @@
"orig": "Item B",
"text": "Item B",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/19",
@@ -893,7 +893,7 @@
"orig": "l1 ",
"text": "l1 ",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/24",
@@ -922,7 +922,7 @@
"orig": "l2",
"text": "l2",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/25",
@@ -951,7 +951,7 @@
"orig": "l3",
"text": "l3",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/26",
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "powerpoint_with_image",
"origin": {
"mimetype": "application/vnd.ms-powerpoint",
+44 -44
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "redp5110_sampled",
"origin": {
"mimetype": "application/pdf",
@@ -1295,7 +1295,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/15",
@@ -1326,7 +1326,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/16",
@@ -1357,7 +1357,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/17",
@@ -1388,7 +1388,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/18",
@@ -1683,7 +1683,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/28",
@@ -1714,7 +1714,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/29",
@@ -1745,7 +1745,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/30",
@@ -1776,7 +1776,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/31",
@@ -1807,7 +1807,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/32",
@@ -1838,7 +1838,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/33",
@@ -1869,7 +1869,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/34",
@@ -1900,7 +1900,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/35",
@@ -1931,7 +1931,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/36",
@@ -2400,7 +2400,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/52",
@@ -2431,7 +2431,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/53",
@@ -2462,7 +2462,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/54",
@@ -2668,7 +2668,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/61",
@@ -2699,7 +2699,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/62",
@@ -2759,7 +2759,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/64",
@@ -3344,7 +3344,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/84",
@@ -3375,7 +3375,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/85",
@@ -3406,7 +3406,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/86",
@@ -5992,7 +5992,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/175",
@@ -6023,7 +6023,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/176",
@@ -6054,7 +6054,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/177",
@@ -6085,7 +6085,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/178",
@@ -6116,7 +6116,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/179",
@@ -6787,7 +6787,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/202",
@@ -6818,7 +6818,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/203",
@@ -6849,7 +6849,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/204",
@@ -7064,7 +7064,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/211",
@@ -7095,7 +7095,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/212",
@@ -7126,7 +7126,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/213",
@@ -7157,7 +7157,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/214",
@@ -7188,7 +7188,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/215",
@@ -7219,7 +7219,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/216",
@@ -7379,7 +7379,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/221",
@@ -7498,7 +7498,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/225",
@@ -7559,7 +7559,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/227",
@@ -7590,7 +7590,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/228",
@@ -7737,7 +7737,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/233",
@@ -7855,7 +7855,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/237",
@@ -7915,7 +7915,7 @@
"formatting": null,
"hyperlink": null,
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/239",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "right_to_left_01",
"origin": {
"mimetype": "application/pdf",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "right_to_left_02",
"origin": {
"mimetype": "application/pdf",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "right_to_left_03",
"origin": {
"mimetype": "application/pdf",
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "sample_sales_data",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+3 -3
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "tablecell",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -82,7 +82,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/1",
@@ -103,7 +103,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/2",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "test-01",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "test_emf_docx",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+48 -50
View File
@@ -29,64 +29,62 @@ item-0 at level 0: unspecified: group _root_
item-24 at level 3: list_item: A report must also be submitted ... d Infectious Disease Reporting System.
item-25 at level 2: paragraph:
item-26 at level 1: list: group list
item-27 at level 2: list_item:
item-27 at level 1: paragraph:
item-28 at level 1: paragraph:
item-29 at level 1: paragraph:
item-30 at level 1: paragraph:
item-31 at level 1: paragraph:
item-32 at level 1: paragraph:
item-33 at level 1: section: group textbox
item-34 at level 2: paragraph: Health Bureau:
item-35 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control.
item-36 at level 2: list: group list
item-37 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection.
item-38 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act.
item-39 at level 2: paragraph:
item-40 at level 1: list: group list
item-41 at level 2: list_item:
item-42 at level 1: paragraph:
item-43 at level 1: section: group textbox
item-44 at level 2: paragraph: Department of Education:
item-32 at level 1: section: group textbox
item-33 at level 2: paragraph: Health Bureau:
item-34 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control.
item-35 at level 2: list: group list
item-36 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection.
item-37 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act.
item-38 at level 2: paragraph:
item-39 at level 1: list: group list
item-40 at level 1: paragraph:
item-41 at level 1: section: group textbox
item-42 at level 2: paragraph: Department of Education:
Collabo ... vention measures at all school levels.
item-43 at level 1: paragraph:
item-44 at level 1: paragraph:
item-45 at level 1: paragraph:
item-46 at level 1: paragraph:
item-47 at level 1: paragraph:
item-48 at level 1: paragraph:
item-49 at level 1: paragraph:
item-50 at level 1: paragraph:
item-51 at level 1: paragraph:
item-52 at level 1: section: group textbox
item-53 at level 2: inline: group group
item-54 at level 3: paragraph: The Health Bureau will handle
item-55 at level 3: paragraph: reporting and specimen collection
item-56 at level 3: paragraph: .
item-57 at level 2: paragraph:
item-50 at level 1: section: group textbox
item-51 at level 2: inline: group group
item-52 at level 3: paragraph: The Health Bureau will handle
item-53 at level 3: paragraph: reporting and specimen collection
item-54 at level 3: paragraph: .
item-55 at level 2: paragraph:
item-56 at level 1: paragraph:
item-57 at level 1: paragraph:
item-58 at level 1: paragraph:
item-59 at level 1: paragraph:
item-60 at level 1: paragraph:
item-61 at level 1: section: group textbox
item-62 at level 2: paragraph: Whether the epidemic has eased.
item-63 at level 2: paragraph:
item-64 at level 1: paragraph:
item-65 at level 1: section: group textbox
item-66 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease.
item-67 at level 2: paragraph: No
item-68 at level 1: paragraph:
item-69 at level 1: paragraph:
item-70 at level 1: section: group textbox
item-71 at level 2: paragraph: Yes
item-72 at level 1: paragraph:
item-73 at level 1: section: group textbox
item-74 at level 2: paragraph: Yes
item-75 at level 1: paragraph:
item-76 at level 1: paragraph:
item-77 at level 1: section: group textbox
item-78 at level 2: paragraph: Case closed.
item-79 at level 2: paragraph:
item-80 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary.
item-81 at level 1: paragraph:
item-82 at level 1: section: group textbox
item-83 at level 2: paragraph: No
item-84 at level 1: paragraph:
item-85 at level 1: paragraph:
item-86 at level 1: paragraph:
item-59 at level 1: section: group textbox
item-60 at level 2: paragraph: Whether the epidemic has eased.
item-61 at level 2: paragraph:
item-62 at level 1: paragraph:
item-63 at level 1: section: group textbox
item-64 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease.
item-65 at level 2: paragraph: No
item-66 at level 1: paragraph:
item-67 at level 1: paragraph:
item-68 at level 1: section: group textbox
item-69 at level 2: paragraph: Yes
item-70 at level 1: paragraph:
item-71 at level 1: section: group textbox
item-72 at level 2: paragraph: Yes
item-73 at level 1: paragraph:
item-74 at level 1: paragraph:
item-75 at level 1: section: group textbox
item-76 at level 2: paragraph: Case closed.
item-77 at level 2: paragraph:
item-78 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary.
item-79 at level 1: paragraph:
item-80 at level 1: section: group textbox
item-81 at level 2: paragraph: No
item-82 at level 1: paragraph:
item-83 at level 1: paragraph:
item-84 at level 1: paragraph:
+199 -235
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "textbox",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -65,6 +65,9 @@
{
"$ref": "#/groups/6"
},
{
"$ref": "#/texts/19"
},
{
"$ref": "#/texts/20"
},
@@ -77,9 +80,6 @@
{
"$ref": "#/texts/23"
},
{
"$ref": "#/texts/24"
},
{
"$ref": "#/groups/7"
},
@@ -87,11 +87,17 @@
"$ref": "#/groups/9"
},
{
"$ref": "#/texts/31"
"$ref": "#/texts/29"
},
{
"$ref": "#/groups/10"
},
{
"$ref": "#/texts/31"
},
{
"$ref": "#/texts/32"
},
{
"$ref": "#/texts/33"
},
@@ -107,71 +113,65 @@
{
"$ref": "#/texts/37"
},
{
"$ref": "#/texts/38"
},
{
"$ref": "#/texts/39"
},
{
"$ref": "#/groups/11"
},
{
"$ref": "#/texts/42"
},
{
"$ref": "#/texts/43"
},
{
"$ref": "#/texts/44"
},
{
"$ref": "#/texts/45"
},
{
"$ref": "#/texts/46"
},
{
"$ref": "#/groups/13"
},
{
"$ref": "#/texts/49"
"$ref": "#/texts/47"
},
{
"$ref": "#/groups/14"
},
{
"$ref": "#/texts/52"
"$ref": "#/texts/50"
},
{
"$ref": "#/texts/53"
"$ref": "#/texts/51"
},
{
"$ref": "#/groups/15"
},
{
"$ref": "#/texts/55"
"$ref": "#/texts/53"
},
{
"$ref": "#/groups/16"
},
{
"$ref": "#/texts/57"
"$ref": "#/texts/55"
},
{
"$ref": "#/texts/58"
"$ref": "#/texts/56"
},
{
"$ref": "#/groups/17"
},
{
"$ref": "#/texts/62"
"$ref": "#/texts/60"
},
{
"$ref": "#/groups/18"
},
{
"$ref": "#/texts/62"
},
{
"$ref": "#/texts/63"
},
{
"$ref": "#/texts/64"
},
{
"$ref": "#/texts/65"
},
{
"$ref": "#/texts/66"
}
],
"content_layer": "body",
@@ -280,11 +280,7 @@
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/19"
}
],
"children": [],
"content_layer": "body",
"name": "list",
"label": "list"
@@ -296,16 +292,16 @@
},
"children": [
{
"$ref": "#/texts/25"
"$ref": "#/texts/24"
},
{
"$ref": "#/texts/26"
"$ref": "#/texts/25"
},
{
"$ref": "#/groups/8"
},
{
"$ref": "#/texts/29"
"$ref": "#/texts/28"
}
],
"content_layer": "body",
@@ -319,10 +315,10 @@
},
"children": [
{
"$ref": "#/texts/27"
"$ref": "#/texts/26"
},
{
"$ref": "#/texts/28"
"$ref": "#/texts/27"
}
],
"content_layer": "body",
@@ -334,11 +330,7 @@
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/30"
}
],
"children": [],
"content_layer": "body",
"name": "list",
"label": "list"
@@ -350,7 +342,7 @@
},
"children": [
{
"$ref": "#/texts/32"
"$ref": "#/texts/30"
}
],
"content_layer": "body",
@@ -367,7 +359,7 @@
"$ref": "#/groups/12"
},
{
"$ref": "#/texts/43"
"$ref": "#/texts/41"
}
],
"content_layer": "body",
@@ -380,14 +372,14 @@
"$ref": "#/groups/11"
},
"children": [
{
"$ref": "#/texts/38"
},
{
"$ref": "#/texts/39"
},
{
"$ref": "#/texts/40"
},
{
"$ref": "#/texts/41"
},
{
"$ref": "#/texts/42"
}
],
"content_layer": "body",
@@ -401,10 +393,10 @@
},
"children": [
{
"$ref": "#/texts/47"
"$ref": "#/texts/45"
},
{
"$ref": "#/texts/48"
"$ref": "#/texts/46"
}
],
"content_layer": "body",
@@ -418,10 +410,10 @@
},
"children": [
{
"$ref": "#/texts/50"
"$ref": "#/texts/48"
},
{
"$ref": "#/texts/51"
"$ref": "#/texts/49"
}
],
"content_layer": "body",
@@ -435,7 +427,7 @@
},
"children": [
{
"$ref": "#/texts/54"
"$ref": "#/texts/52"
}
],
"content_layer": "body",
@@ -449,7 +441,7 @@
},
"children": [
{
"$ref": "#/texts/56"
"$ref": "#/texts/54"
}
],
"content_layer": "body",
@@ -462,14 +454,14 @@
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/57"
},
{
"$ref": "#/texts/58"
},
{
"$ref": "#/texts/59"
},
{
"$ref": "#/texts/60"
},
{
"$ref": "#/texts/61"
}
],
"content_layer": "body",
@@ -483,7 +475,7 @@
},
"children": [
{
"$ref": "#/texts/63"
"$ref": "#/texts/61"
}
],
"content_layer": "body",
@@ -592,7 +584,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/6",
@@ -747,7 +739,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/17",
@@ -768,7 +760,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/18",
@@ -785,16 +777,14 @@
{
"self_ref": "#/texts/19",
"parent": {
"$ref": "#/groups/6"
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"label": "paragraph",
"prov": [],
"orig": "",
"text": "",
"enumerated": false,
"marker": "-"
"text": ""
},
{
"self_ref": "#/texts/20",
@@ -846,18 +836,6 @@
},
{
"self_ref": "#/texts/24",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/25",
"parent": {
"$ref": "#/groups/7"
},
@@ -876,7 +854,7 @@
}
},
{
"self_ref": "#/texts/26",
"self_ref": "#/texts/25",
"parent": {
"$ref": "#/groups/7"
},
@@ -895,7 +873,7 @@
}
},
{
"self_ref": "#/texts/27",
"self_ref": "#/texts/26",
"parent": {
"$ref": "#/groups/8"
},
@@ -913,10 +891,10 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/28",
"self_ref": "#/texts/27",
"parent": {
"$ref": "#/groups/8"
},
@@ -934,10 +912,10 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/29",
"self_ref": "#/texts/28",
"parent": {
"$ref": "#/groups/7"
},
@@ -949,21 +927,7 @@
"text": ""
},
{
"self_ref": "#/texts/30",
"parent": {
"$ref": "#/groups/9"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "",
"text": "",
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/31",
"self_ref": "#/texts/29",
"parent": {
"$ref": "#/body"
},
@@ -975,7 +939,7 @@
"text": ""
},
{
"self_ref": "#/texts/32",
"self_ref": "#/texts/30",
"parent": {
"$ref": "#/groups/10"
},
@@ -993,6 +957,30 @@
"script": "baseline"
}
},
{
"self_ref": "#/texts/31",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/32",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/33",
"parent": {
@@ -1055,30 +1043,6 @@
},
{
"self_ref": "#/texts/38",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/39",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/40",
"parent": {
"$ref": "#/groups/12"
},
@@ -1097,7 +1061,7 @@
}
},
{
"self_ref": "#/texts/41",
"self_ref": "#/texts/39",
"parent": {
"$ref": "#/groups/12"
},
@@ -1116,7 +1080,7 @@
}
},
{
"self_ref": "#/texts/42",
"self_ref": "#/texts/40",
"parent": {
"$ref": "#/groups/12"
},
@@ -1135,7 +1099,7 @@
}
},
{
"self_ref": "#/texts/43",
"self_ref": "#/texts/41",
"parent": {
"$ref": "#/groups/11"
},
@@ -1146,6 +1110,30 @@
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/42",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/43",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/44",
"parent": {
@@ -1160,30 +1148,6 @@
},
{
"self_ref": "#/texts/45",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/46",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/47",
"parent": {
"$ref": "#/groups/13"
},
@@ -1202,7 +1166,7 @@
}
},
{
"self_ref": "#/texts/48",
"self_ref": "#/texts/46",
"parent": {
"$ref": "#/groups/13"
},
@@ -1214,7 +1178,7 @@
"text": ""
},
{
"self_ref": "#/texts/49",
"self_ref": "#/texts/47",
"parent": {
"$ref": "#/body"
},
@@ -1226,7 +1190,7 @@
"text": ""
},
{
"self_ref": "#/texts/50",
"self_ref": "#/texts/48",
"parent": {
"$ref": "#/groups/14"
},
@@ -1245,7 +1209,7 @@
}
},
{
"self_ref": "#/texts/51",
"self_ref": "#/texts/49",
"parent": {
"$ref": "#/groups/14"
},
@@ -1264,7 +1228,7 @@
}
},
{
"self_ref": "#/texts/52",
"self_ref": "#/texts/50",
"parent": {
"$ref": "#/body"
},
@@ -1275,6 +1239,37 @@
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/51",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/52",
"parent": {
"$ref": "#/groups/15"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "Yes",
"text": "Yes",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/53",
"parent": {
@@ -1290,7 +1285,7 @@
{
"self_ref": "#/texts/54",
"parent": {
"$ref": "#/groups/15"
"$ref": "#/groups/16"
},
"children": [],
"content_layer": "body",
@@ -1321,48 +1316,17 @@
{
"self_ref": "#/texts/56",
"parent": {
"$ref": "#/groups/16"
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "Yes",
"text": "Yes",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/57",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/58",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/59",
"parent": {
"$ref": "#/groups/17"
},
@@ -1381,7 +1345,7 @@
}
},
{
"self_ref": "#/texts/60",
"self_ref": "#/texts/58",
"parent": {
"$ref": "#/groups/17"
},
@@ -1393,7 +1357,7 @@
"text": ""
},
{
"self_ref": "#/texts/61",
"self_ref": "#/texts/59",
"parent": {
"$ref": "#/groups/17"
},
@@ -1411,6 +1375,37 @@
"script": "baseline"
}
},
{
"self_ref": "#/texts/60",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/61",
"parent": {
"$ref": "#/groups/18"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "No",
"text": "No",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/62",
"parent": {
@@ -1426,21 +1421,14 @@
{
"self_ref": "#/texts/63",
"parent": {
"$ref": "#/groups/18"
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "No",
"text": "No",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/64",
@@ -1453,30 +1441,6 @@
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/65",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/66",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
}
],
"pictures": [],
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "unit_test_01",
"origin": {
"mimetype": "text/html",
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "unit_test_formatting",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -429,7 +429,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/14",
@@ -450,7 +450,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/15",
@@ -471,7 +471,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/16",
@@ -489,7 +489,7 @@
"orig": "",
"text": "",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/17",
@@ -583,7 +583,7 @@
"orig": "",
"text": "",
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/22",
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "unit_test_headers",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "unit_test_headers_numbered",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+26 -26
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "unit_test_lists",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -456,7 +456,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/9",
@@ -477,7 +477,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/10",
@@ -498,7 +498,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/11",
@@ -551,7 +551,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/14",
@@ -572,7 +572,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/15",
@@ -593,7 +593,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/16",
@@ -646,7 +646,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/19",
@@ -667,7 +667,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/20",
@@ -688,7 +688,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/21",
@@ -709,7 +709,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/22",
@@ -730,7 +730,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/23",
@@ -751,7 +751,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/24",
@@ -804,7 +804,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/27",
@@ -825,7 +825,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/28",
@@ -846,7 +846,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/29",
@@ -899,7 +899,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/32",
@@ -920,7 +920,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/33",
@@ -941,7 +941,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/34",
@@ -962,7 +962,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/35",
@@ -1021,7 +1021,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/38",
@@ -1042,7 +1042,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/39",
@@ -1063,7 +1063,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/40",
@@ -1084,7 +1084,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/41",
@@ -1105,7 +1105,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/42",
@@ -1126,7 +1126,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/43",
+1 -1
View File
@@ -302,7 +302,7 @@ item-0 at level 0: unspecified: group _root_
item-288 at level 4: list_item: Rubber duck
item-289 at level 2: section_header: Notes
item-290 at level 3: section_header: Citations
item-291 at level 4: ordered_list: group ordered list
item-291 at level 4: list: group ordered list
item-292 at level 5: list_item: ^ "Duckling". The American Herit ... n Company. 2006. Retrieved 2015-05-22.
item-293 at level 5: list_item: ^ "Duckling". Kernerman English ... Ltd. 20002006. Retrieved 2015-05-22.
item-294 at level 5: list_item: ^ Dohner, Janet Vorwald (2001). ... University Press. ISBN 978-0300138139.
File diff suppressed because it is too large Load Diff
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "word_image_anchors",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+10 -10
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "word_sample",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
@@ -243,7 +243,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/7",
@@ -264,7 +264,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/8",
@@ -285,7 +285,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/9",
@@ -325,7 +325,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/11",
@@ -346,7 +346,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/12",
@@ -367,7 +367,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/13",
@@ -530,7 +530,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/21",
@@ -551,7 +551,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
},
{
"self_ref": "#/texts/22",
@@ -572,7 +572,7 @@
"script": "baseline"
},
"enumerated": false,
"marker": "-"
"marker": ""
}
],
"pictures": [
+1 -1
View File
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
"version": "1.4.0",
"version": "1.5.0",
"name": "word_tables",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",