Docling/tests/data/groundtruth/docling_v1/right_to_left_03.json
Pedro Ribeiro 98b5eeb844
fix(pypdfium): resolve overlapping text when merging bounding boxes (#1549)
get merged_text from boundingbox instead of merging it to prevent overlaps

Signed-off-by: Pedro Ribeiro <pedro_ribeiro_93@hotmail.com>
2025-05-19 15:26:00 +02:00

778 lines
18 KiB
JSON

{
"_name": "",
"type": "pdf-document",
"description": {
"title": null,
"abstract": null,
"authors": null,
"affiliations": null,
"subjects": null,
"keywords": null,
"publication_date": null,
"languages": null,
"license": null,
"publishers": null,
"url_refs": null,
"references": null,
"publication": null,
"reference_count": null,
"citation_count": null,
"citation_date": null,
"advanced": null,
"analytics": null,
"logs": [],
"collection": null,
"acquisition": null
},
"file-info": {
"filename": "right_to_left_03.pdf",
"filename-prov": null,
"document-hash": "367cb9ca8606ce5676164d44f08ba7e28b794379a2124402672712e12a160bee",
"#-pages": 1,
"collection-name": null,
"description": null,
"page-hashes": [
{
"hash": "c13f4c78e4268264071589d2e5620246a5c3b3bf286522a5fed5edb9b6fdc1bc",
"model": "default",
"page": 1
}
]
},
"main-text": [
{
"prov": [
{
"bbox": [
68.78399669083697,
761.0098882171737,
267.65960879695194,
779.3882381741187
],
"page": 1,
"span": [
0,
42
],
"__ref_s3_data": null
}
],
"text": "\u06cc\u0644\u062e\u0627\u062f \u06cc\u0644\u0627\u0627\u06a9 - \u06cc\u0644\u0635\u0627 \u0631\u0627\u0632\u0627\u0628 \u0631\u062f \u0634\u0631\u064a\u0630\u067e \u0647\u0645\u0627\u0646\u062f\u064a\u0645\u0627",
"type": "subtitle-level-1",
"payload": null,
"name": "Section-header",
"font": null
},
{
"name": "Picture",
"type": "figure",
"$ref": "#/figures/0"
},
{
"prov": [
{
"bbox": [
373.9899883190294,
685.3749983943645,
479.52999784465936,
703.4050283521253
],
"page": 1,
"span": [
0,
19
],
"__ref_s3_data": null
}
],
"text": "\u0644\u0627\u0627\u06a9 \u062f\u0631\u0627\u062f\u0646\u0627\u062a\u0633\u0627 -2-5",
"type": "subtitle-level-1",
"payload": null,
"name": "Section-header",
"font": null
},
{
"prov": [
{
"bbox": [
458.7399879381041,
662.7401084473915,
519.2383976661823,
679.6162084078558
],
"page": 1,
"span": [
0,
13
],
"__ref_s3_data": null
}
],
"text": "\u062f\u0631\u0627\u062f\u0646\u0627\u062a\u0633\u0627 \u0645\u0627\u0646",
"type": "paragraph",
"payload": null,
"name": "Text",
"font": null
},
{
"prov": [
{
"bbox": [
66.26399970216359,
631.5399785204845,
264.81795880972436,
681.171998404211
],
"page": 1,
"span": [
0,
97
],
"__ref_s3_data": null
}
],
"text": "\u06cc\u0631\u06af \u0647\u062a\u062e\u064a\u0631 \u0634\u0648\u0631 \u0647\u0628 \u0647\u062f\u0634 \u062f\u064a\u0644\u0648\u062a \u0644\u0627\u0634\u0645\u0634 \u0648 \u0647\u0634\u0645\u0634 \u0641\u0631\u0635\u0645 \u062f\u0631\u0648\u0645 \u0647\u062a\u0633\u0648\u064a\u067e \u06cc\u0627 \u0647\u0632\u0627\u0633 \u06cc\u0627\u0647\u062f\u0644\u0627\u0648\u0641 \u0631\u062f - \u0642\u0628\u0627\u0637\u0645 \u062a\u0633\u0648\u064a\u067e \u0632\u064a\u0644\u0627\u0646\u0622",
"type": "paragraph",
"payload": null,
"name": "Text",
"font": null
},
{
"prov": [
{
"bbox": [
420.9099981081384,
608.8601085736167,
519.1619876665258,
625.7362085340809
],
"page": 1,
"span": [
0,
19
],
"__ref_s3_data": null
}
],
"text": "\u06cc\u0644\u0645 \u062f\u0631\u0627\u062f\u0646\u0627\u062a\u0633\u0627 \u0647\u0631\u0627\u0645\u0634",
"type": "paragraph",
"payload": null,
"name": "Text",
"font": null
},
{
"prov": [
{
"bbox": [
236.80999893561153,
613.2999885632154,
265.01000880886113,
627.2919885304362
],
"page": 1,
"span": [
0,
5
],
"__ref_s3_data": null
}
],
"text": "20300",
"type": "paragraph",
"payload": null,
"name": "Text",
"font": null
},
{
"prov": [
{
"bbox": [
406.9899881707045,
586.1501486268197,
519.1415376666176,
603.0262485872838
],
"page": 1,
"span": [
0,
21
],
"__ref_s3_data": null
}
],
"text": "\u061f\u062a\u0633\u0627 \u06cc\u0631\u0627\u0628\u062c\u0627 \u062f\u0631\u0627\u062f\u0646\u0627\u062a\u0633\u0627",
"type": "paragraph",
"payload": null,
"name": "Text",
"font": null
},
{
"prov": [
{
"bbox": [
197.32999911306206,
590.5900286164182,
264.91399880929265,
604.5820285836392
],
"page": 1,
"span": [
0,
13
],
"__ref_s3_data": null
}
],
"text": "\u0631\u064a\u062e \u06cc\u0644\u0628",
"type": "checkbox-unselected",
"payload": null,
"name": "Checkbox-Unselected",
"font": null
},
{
"prov": [
{
"bbox": [
389.4699982494516,
563.4701486799523,
519.2136776662934,
580.3462486404165
],
"page": 1,
"span": [
0,
24
],
"__ref_s3_data": null
}
],
"text": "\u062f\u0631\u0627\u062f\u0646\u0627\u062a\u0633\u0627 \u0647\u062f\u0646\u0646\u06a9\u0631\u062f\u0627\u0635 \u0639\u062c\u0631\u0645",
"type": "paragraph",
"payload": null,
"name": "Text",
"font": null
},
{
"prov": [
{
"bbox": [
166.5799992512739,
567.9100286695509,
264.77599880991295,
581.9020386367717
],
"page": 1,
"span": [
0,
26
],
"__ref_s3_data": null
}
],
"text": "\u0646\u0627\u0631\u064a\u0627 \u062f\u0631\u0627\u062f\u0646\u0627\u062a\u0633\u0627 \u06cc\u0644\u0645 \u0646\u0627\u0645\u0632\u0627\u0633",
"type": "paragraph",
"payload": null,
"name": "Text",
"font": null
},
{
"prov": [
{
"bbox": [
292.129998686965,
518.5901487850932,
519.2351676661968,
557.6661986935493
],
"page": 1,
"span": [
0,
55
],
"__ref_s3_data": null
}
],
"text": "\u0630\u062e\u0627 \u0627\u0631 \u0631\u0648\u06a9\u0630\u0645 \u062f\u0631\u0627\u062f\u0646\u0627\u062a\u0633\u0627 \u060c\u0644\u0648\u0635\u062d\u0645 \u0647\u062f\u0646\u0646\u06a9\u062f\u064a\u0644\u0648\u062a \u0627\u064a\u0622 \u061f\u062a\u0633\u0627 \u0647\u062f\u0648\u0645\u0646",
"type": "paragraph",
"payload": null,
"name": "Text",
"font": null
},
{
"prov": [
{
"bbox": [
197.32999911306206,
545.2299787226838,
208.04769906488926,
559.2219786899045
],
"page": 1,
"span": [
0,
3
],
"__ref_s3_data": null
}
],
"text": "\u0631\u064a\u062e",
"type": "checkbox-selected",
"payload": null,
"name": "Checkbox-Selected",
"font": null
},
{
"prov": [
{
"bbox": [
236.62821893642857,
545.2299787226838,
247.34591888825577,
559.2219786899045
],
"page": 1,
"span": [
0,
3
],
"__ref_s3_data": null
}
],
"text": "\u06cc\u0644\u0628",
"type": "checkbox-unselected",
"payload": null,
"name": "Checkbox-Unselected",
"font": null
},
{
"prov": [
{
"bbox": [
409.0299981615353,
473.71013889023413,
505.7644977267433,
490.58620885069837
],
"page": 1,
"span": [
0,
16
],
"__ref_s3_data": null
}
],
"text": "\u0633\u0631\u0648\u0628 \u0631\u062f \u0634\u0631\u064a\u0630\u067e -3",
"type": "subtitle-level-1",
"payload": null,
"name": "Section-header",
"font": null
},
{
"prov": [
{
"bbox": [
405.30999817825557,
451.01012894341363,
492.6107177858655,
467.88619890387787
],
"page": 1,
"span": [
0,
17
],
"__ref_s3_data": null
}
],
"text": "\u06a9\u0631\u0627\u062f\u0645 \u0647\u0626\u0627\u0631\u0627 \u062e\u064a\u0631\u0627\u062a",
"type": "paragraph",
"payload": null,
"name": "Text",
"font": null
},
{
"prov": [
{
"bbox": [
137.89998938018175,
455.4699989329655,
187.8199891558066,
469.4620089001862
],
"page": 1,
"span": [
0,
10
],
"__ref_s3_data": null
}
],
"text": "1403/09/19",
"type": "paragraph",
"payload": null,
"name": "Text",
"font": null
},
{
"prov": [
{
"bbox": [
426.309998083867,
428.3301389965463,
492.59463778593783,
445.2062089570106
],
"page": 1,
"span": [
0,
11
],
"__ref_s3_data": null
}
],
"text": "\u0634\u0631\u064a\u0630\u067e \u062e\u064a\u0631\u0627\u062a",
"type": "paragraph",
"payload": null,
"name": "Text",
"font": null
},
{
"prov": [
{
"bbox": [
137.89998938018175,
432.7700189861449,
187.8199891558066,
446.7620189533657
],
"page": 1,
"span": [
0,
10
],
"__ref_s3_data": null
}
],
"text": "1403/10/04",
"type": "paragraph",
"payload": null,
"name": "Text",
"font": null
},
{
"prov": [
{
"bbox": [
367.14998834977314,
405.65011904967906,
492.68526778553047,
422.5261790101433
],
"page": 1,
"span": [
0,
21
],
"__ref_s3_data": null
}
],
"text": "\u0647\u0636\u0631\u0639 \u0647\u062a\u064a\u0645\u06a9 \u0647\u0633\u0644\u062c \u0647\u0631\u0627\u0645\u0634",
"type": "paragraph",
"payload": null,
"name": "Text",
"font": null
},
{
"prov": [
{
"bbox": [
154.69999930467083,
409.96999903955884,
171.19999923050838,
423.96200900677957
],
"page": 1,
"span": [
0,
3
],
"__ref_s3_data": null
}
],
"text": "436",
"type": "paragraph",
"payload": null,
"name": "Text",
"font": null
},
{
"prov": [
{
"bbox": [
399.42998820468443,
382.8501291030928,
492.62752778578994,
399.72619906355703
],
"page": 1,
"span": [
0,
18
],
"__ref_s3_data": null
}
],
"text": "\u0647\u0645\u0627\u0646\u062f\u064a\u0645\u0627 \u062c\u0631\u062f \u062e\u064a\u0631\u0627\u062a",
"type": "paragraph",
"payload": null,
"name": "Text",
"font": null
},
{
"prov": [
{
"bbox": [
137.89998938018175,
387.29000909269143,
187.8199891558066,
401.2820090599123
],
"page": 1,
"span": [
0,
10
],
"__ref_s3_data": null
}
],
"text": "1403/10/05",
"type": "paragraph",
"payload": null,
"name": "Text",
"font": null
},
{
"prov": [
{
"bbox": [
422.82998809950857,
360.17013915622545,
492.6789577855588,
377.04619911668976
],
"page": 1,
"span": [
0,
11
],
"__ref_s3_data": null
}
],
"text": "\u0634\u0631\u064a\u0630\u067e \u0631\u0648\u0627\u0634\u0645",
"type": "paragraph",
"payload": null,
"name": "Text",
"font": null
},
{
"prov": [
{
"bbox": [
122.05999945137766,
364.6100191458242,
203.6480090846645,
378.6020191130449
],
"page": 1,
"span": [
0,
19
],
"__ref_s3_data": null
}
],
"text": "\u0633\u0631\u0648\u0628 \u0646\u0648\u0645\u0631\u0622 \u06cc\u0631\u0627\u0632\u06af\u0631\u0627\u06a9",
"type": "paragraph",
"payload": null,
"name": "Text",
"font": null
},
{
"prov": [
{
"bbox": [
281.3299887355078,
313.730129265021,
492.70525778544066,
352.6861891737582
],
"page": 1,
"span": [
0,
45
],
"__ref_s3_data": null
}
],
"text": "\u0631\u062f \u0644\u0627\u0627\u06a9 \u0634\u0631\u064a\u0630\u067e \u0632\u0627 \u0633\u067e \u0647\u064a\u0627\u067e \u062a\u0645\u064a\u0642 \u0646\u064a\u064a\u0639\u062a \u0629\u0648\u062d\u0646 \u0633\u0631\u0648\u0628",
"type": "paragraph",
"payload": null,
"name": "Text",
"font": null
},
{
"prov": [
{
"bbox": [
109.21999950908952,
340.2499992028926,
213.67396903960088,
354.24199917011344
],
"page": 1,
"span": [
0,
23
],
"__ref_s3_data": null
}
],
"text": "\u06cc\u0646\u0627\u0647\u062c \u06cc\u0627\u0647 \u062a\u0645\u064a\u0642 \u0633\u0627\u0633\u0627 \u0631\u0628",
"type": "paragraph",
"payload": null,
"name": "Text",
"font": null
},
{
"prov": [
{
"bbox": [
266.5700088018494,
268.82012937023217,
492.7008677854604,
307.7761792789694
],
"page": 1,
"span": [
0,
45
],
"__ref_s3_data": null
}
],
"text": "\u0634\u0648\u0631\u0641 /\u0634\u0648\u0631\u0641 \u0644\u06a9 /\u062f\u064a\u0644\u0648\u062a \u0632\u0627 \u0647\u0636\u0631\u0639 \u062f\u0635\u0631\u062f \u0644\u0642\u0627\u062f\u062d \u06cc\u0644\u062e\u0627\u062f",
"type": "paragraph",
"payload": null,
"name": "Text",
"font": null
},
{
"prov": [
{
"bbox": [
85.4639966158655,
295.33999930810376,
240.36199891964634,
309.3319992753245
],
"page": 1,
"span": [
0,
39
],
"__ref_s3_data": null
}
],
"text": "\u0646\u062a 47.500 \u0627\u064a \u0647\u0646\u0627\u064a\u0644\u0627\u0633 \u062f\u064a\u0644\u0648\u062a \u0632\u0627 %50 \u0644\u0642\u0627\u062f\u062d",
"type": "paragraph",
"payload": null,
"name": "Text",
"font": null
},
{
"prov": [
{
"bbox": [
404.2300081831098,
246.02010942364598,
492.6399177857343,
262.8962093841102
],
"page": 1,
"span": [
0,
15
],
"__ref_s3_data": null
}
],
"text": "\u0644\u064a\u0648\u062d\u062a \u0632\u0627\u062c\u0645 \u06cc\u0627\u0637\u062e",
"type": "paragraph",
"payload": null,
"name": "Text",
"font": null
},
{
"prov": [
{
"bbox": [
106.93999951933742,
250.45998941324467,
218.89399901613845,
264.4519993804654
],
"page": 1,
"span": [
0,
26
],
"__ref_s3_data": null
}
],
"text": "\u0644\u064a\u0648\u062d\u062a \u0644\u0628\u0627\u0642 \u0647\u0644\u0648\u0645\u062d\u0645 \u0646\u064a\u0631\u062e\u0622 5%",
"type": "paragraph",
"payload": null,
"name": "Text",
"font": null
}
],
"figures": [
{
"prov": [
{
"bbox": [
388.5767822265625,
739.034423828125,
482.4759216308594,
806.0040969848633
],
"page": 1,
"span": [
0,
0
],
"__ref_s3_data": null
}
],
"text": "",
"type": "figure",
"payload": null,
"bounding-box": null
}
],
"tables": [],
"bitmaps": null,
"equations": [],
"footnotes": [],
"page-dimensions": [
{
"height": 842.0399780273438,
"page": 1,
"width": 595.3200073242188
}
],
"page-footers": [],
"page-headers": [],
"_s3_data": null,
"identifiers": null
}