chore: format JSON test files to enable comparison (#1511)

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
This commit is contained in:
Panos Vagenas 2025-05-02 11:52:18 +03:00 committed by GitHub
parent b147331f2a
commit de56523974
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
49 changed files with 2373840 additions and 52 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1 +1,83 @@
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test.pdf", "filename-prov": null, "document-hash": "80f38f5b87a84870681556176a9622186fd200dd32c5557be9e0c0af05b8bc61", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "14d896dc8bcb7ee7c08c0347eb6be8dcb92a3782501992f1ea14d2e58077d4e3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [69.0, 688.5883585611979, 506.6666666666667, 767.2550252278646], "page": 1, "span": [0, 94], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
{
"_name": "",
"type": "pdf-document",
"description": {
"title": null,
"abstract": null,
"authors": null,
"affiliations": null,
"subjects": null,
"keywords": null,
"publication_date": null,
"languages": null,
"license": null,
"publishers": null,
"url_refs": null,
"references": null,
"publication": null,
"reference_count": null,
"citation_count": null,
"citation_date": null,
"advanced": null,
"analytics": null,
"logs": [],
"collection": null,
"acquisition": null
},
"file-info": {
"filename": "ocr_test.pdf",
"filename-prov": null,
"document-hash": "80f38f5b87a84870681556176a9622186fd200dd32c5557be9e0c0af05b8bc61",
"#-pages": 1,
"collection-name": null,
"description": null,
"page-hashes": [
{
"hash": "14d896dc8bcb7ee7c08c0347eb6be8dcb92a3782501992f1ea14d2e58077d4e3",
"model": "default",
"page": 1
}
]
},
"main-text": [
{
"prov": [
{
"bbox": [
69.0,
688.5883585611979,
506.6666666666667,
767.2550252278646
],
"page": 1,
"span": [
0,
94
],
"__ref_s3_data": null
}
],
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package",
"type": "paragraph",
"payload": null,
"name": "Text",
"font": null
}
],
"figures": [],
"tables": [],
"bitmaps": null,
"equations": [],
"footnotes": [],
"page-dimensions": [
{
"height": 841.9216918945312,
"page": 1,
"width": 595.201171875
}
],
"page-footers": [],
"page-headers": [],
"_s3_data": null,
"identifiers": null
}

File diff suppressed because one or more lines are too long

View File

@ -1 +1,77 @@
{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 69.0, "t": 767.2550252278646, "r": 506.6666666666667, "b": 688.5883585611979, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
{
"schema_name": "DoclingDocument",
"version": "1.3.0",
"name": "ocr_test",
"origin": {
"mimetype": "application/pdf",
"binary_hash": 14853448746796404529,
"filename": "ocr_test.pdf",
"uri": null
},
"furniture": {
"self_ref": "#/furniture",
"parent": null,
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"parent": null,
"children": [
{
"cref": "#/texts/0"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"cref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [
{
"page_no": 1,
"bbox": {
"l": 69.0,
"t": 767.2550252278646,
"r": 506.6666666666667,
"b": 688.5883585611979,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
94
]
}
],
"orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package",
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package",
"formatting": null,
"hyperlink": null
}
],
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {
"1": {
"size": {
"width": 595.201171875,
"height": 841.9216918945312
},
"image": null,
"page_no": 1
}
}
}

File diff suppressed because one or more lines are too long

View File

@ -293,6 +293,7 @@ def verify_conversion_result_v1(
generate: bool = False,
ocr_engine: Optional[str] = None,
fuzzy: bool = False,
indent: int = 2,
):
PageList = TypeAdapter(List[Page])
@ -323,11 +324,13 @@ def verify_conversion_result_v1(
if generate: # only used when re-generating truth
pages_path.parent.mkdir(parents=True, exist_ok=True)
with open(pages_path, "w") as fw:
fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))
fw.write(
json.dumps(doc_pred_pages, default=pydantic_encoder, indent=indent)
)
json_path.parent.mkdir(parents=True, exist_ok=True)
with open(json_path, "w") as fw:
fw.write(json.dumps(doc_pred, default=pydantic_encoder))
fw.write(json.dumps(doc_pred, default=pydantic_encoder, indent=indent))
md_path.parent.mkdir(parents=True, exist_ok=True)
with open(md_path, "w") as fw:
@ -377,6 +380,7 @@ def verify_conversion_result_v2(
generate: bool = False,
ocr_engine: Optional[str] = None,
fuzzy: bool = False,
indent: int = 2,
):
PageList = TypeAdapter(List[Page])
@ -405,11 +409,13 @@ def verify_conversion_result_v2(
if generate: # only used when re-generating truth
pages_path.parent.mkdir(parents=True, exist_ok=True)
with open(pages_path, "w") as fw:
fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))
fw.write(
json.dumps(doc_pred_pages, default=pydantic_encoder, indent=indent)
)
json_path.parent.mkdir(parents=True, exist_ok=True)
with open(json_path, "w") as fw:
fw.write(json.dumps(doc_pred, default=pydantic_encoder))
fw.write(json.dumps(doc_pred, default=pydantic_encoder, indent=indent))
md_path.parent.mkdir(parents=True, exist_ok=True)
with open(md_path, "w") as fw: