
get merged_text from boundingbox instead of merging it to prevent overlaps Signed-off-by: Pedro Ribeiro <pedro_ribeiro_93@hotmail.com>
83 lines
1.8 KiB
JSON
83 lines
1.8 KiB
JSON
{
|
|
"_name": "",
|
|
"type": "pdf-document",
|
|
"description": {
|
|
"title": null,
|
|
"abstract": null,
|
|
"authors": null,
|
|
"affiliations": null,
|
|
"subjects": null,
|
|
"keywords": null,
|
|
"publication_date": null,
|
|
"languages": null,
|
|
"license": null,
|
|
"publishers": null,
|
|
"url_refs": null,
|
|
"references": null,
|
|
"publication": null,
|
|
"reference_count": null,
|
|
"citation_count": null,
|
|
"citation_date": null,
|
|
"advanced": null,
|
|
"analytics": null,
|
|
"logs": [],
|
|
"collection": null,
|
|
"acquisition": null
|
|
},
|
|
"file-info": {
|
|
"filename": "ocr_test.pdf",
|
|
"filename-prov": null,
|
|
"document-hash": "80f38f5b87a84870681556176a9622186fd200dd32c5557be9e0c0af05b8bc61",
|
|
"#-pages": 1,
|
|
"collection-name": null,
|
|
"description": null,
|
|
"page-hashes": [
|
|
{
|
|
"hash": "14d896dc8bcb7ee7c08c0347eb6be8dcb92a3782501992f1ea14d2e58077d4e3",
|
|
"model": "default",
|
|
"page": 1
|
|
}
|
|
]
|
|
},
|
|
"main-text": [
|
|
{
|
|
"prov": [
|
|
{
|
|
"bbox": [
|
|
69.6796630536824,
|
|
689.0124221922704,
|
|
504.8720051760782,
|
|
764.9216921155637
|
|
],
|
|
"page": 1,
|
|
"span": [
|
|
0,
|
|
94
|
|
],
|
|
"__ref_s3_data": null
|
|
}
|
|
],
|
|
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package",
|
|
"type": "paragraph",
|
|
"payload": null,
|
|
"name": "Text",
|
|
"font": null
|
|
}
|
|
],
|
|
"figures": [],
|
|
"tables": [],
|
|
"bitmaps": null,
|
|
"equations": [],
|
|
"footnotes": [],
|
|
"page-dimensions": [
|
|
{
|
|
"height": 841.9216918945312,
|
|
"page": 1,
|
|
"width": 595.201171875
|
|
}
|
|
],
|
|
"page-footers": [],
|
|
"page-headers": [],
|
|
"_s3_data": null,
|
|
"identifiers": null
|
|
} |