chore: format JSON test files to enable comparison (#1511)
Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>
This commit is contained in:
parent
b147331f2a
commit
de56523974
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -1 +1,83 @@
|
||||
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test.pdf", "filename-prov": null, "document-hash": "80f38f5b87a84870681556176a9622186fd200dd32c5557be9e0c0af05b8bc61", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "14d896dc8bcb7ee7c08c0347eb6be8dcb92a3782501992f1ea14d2e58077d4e3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [69.0, 688.5883585611979, 506.6666666666667, 767.2550252278646], "page": 1, "span": [0, 94], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
|
||||
{
|
||||
"_name": "",
|
||||
"type": "pdf-document",
|
||||
"description": {
|
||||
"title": null,
|
||||
"abstract": null,
|
||||
"authors": null,
|
||||
"affiliations": null,
|
||||
"subjects": null,
|
||||
"keywords": null,
|
||||
"publication_date": null,
|
||||
"languages": null,
|
||||
"license": null,
|
||||
"publishers": null,
|
||||
"url_refs": null,
|
||||
"references": null,
|
||||
"publication": null,
|
||||
"reference_count": null,
|
||||
"citation_count": null,
|
||||
"citation_date": null,
|
||||
"advanced": null,
|
||||
"analytics": null,
|
||||
"logs": [],
|
||||
"collection": null,
|
||||
"acquisition": null
|
||||
},
|
||||
"file-info": {
|
||||
"filename": "ocr_test.pdf",
|
||||
"filename-prov": null,
|
||||
"document-hash": "80f38f5b87a84870681556176a9622186fd200dd32c5557be9e0c0af05b8bc61",
|
||||
"#-pages": 1,
|
||||
"collection-name": null,
|
||||
"description": null,
|
||||
"page-hashes": [
|
||||
{
|
||||
"hash": "14d896dc8bcb7ee7c08c0347eb6be8dcb92a3782501992f1ea14d2e58077d4e3",
|
||||
"model": "default",
|
||||
"page": 1
|
||||
}
|
||||
]
|
||||
},
|
||||
"main-text": [
|
||||
{
|
||||
"prov": [
|
||||
{
|
||||
"bbox": [
|
||||
69.0,
|
||||
688.5883585611979,
|
||||
506.6666666666667,
|
||||
767.2550252278646
|
||||
],
|
||||
"page": 1,
|
||||
"span": [
|
||||
0,
|
||||
94
|
||||
],
|
||||
"__ref_s3_data": null
|
||||
}
|
||||
],
|
||||
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package",
|
||||
"type": "paragraph",
|
||||
"payload": null,
|
||||
"name": "Text",
|
||||
"font": null
|
||||
}
|
||||
],
|
||||
"figures": [],
|
||||
"tables": [],
|
||||
"bitmaps": null,
|
||||
"equations": [],
|
||||
"footnotes": [],
|
||||
"page-dimensions": [
|
||||
{
|
||||
"height": 841.9216918945312,
|
||||
"page": 1,
|
||||
"width": 595.201171875
|
||||
}
|
||||
],
|
||||
"page-footers": [],
|
||||
"page-headers": [],
|
||||
"_s3_data": null,
|
||||
"identifiers": null
|
||||
}
|
File diff suppressed because one or more lines are too long
@ -1 +1,77 @@
|
||||
{"schema_name": "DoclingDocument", "version": "1.3.0", "name": "ocr_test", "origin": {"mimetype": "application/pdf", "binary_hash": 14853448746796404529, "filename": "ocr_test.pdf", "uri": null}, "furniture": {"self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified"}, "body": {"self_ref": "#/body", "parent": null, "children": [{"cref": "#/texts/0"}], "content_layer": "body", "name": "_root_", "label": "unspecified"}, "groups": [], "texts": [{"self_ref": "#/texts/0", "parent": {"cref": "#/body"}, "children": [], "content_layer": "body", "label": "text", "prov": [{"page_no": 1, "bbox": {"l": 69.0, "t": 767.2550252278646, "r": 506.6666666666667, "b": 688.5883585611979, "coord_origin": "BOTTOMLEFT"}, "charspan": [0, 94]}], "orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "formatting": null, "hyperlink": null}], "pictures": [], "tables": [], "key_value_items": [], "form_items": [], "pages": {"1": {"size": {"width": 595.201171875, "height": 841.9216918945312}, "image": null, "page_no": 1}}}
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.3.0",
|
||||
"name": "ocr_test",
|
||||
"origin": {
|
||||
"mimetype": "application/pdf",
|
||||
"binary_hash": 14853448746796404529,
|
||||
"filename": "ocr_test.pdf",
|
||||
"uri": null
|
||||
},
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
"parent": null,
|
||||
"children": [],
|
||||
"content_layer": "furniture",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"body": {
|
||||
"self_ref": "#/body",
|
||||
"parent": null,
|
||||
"children": [
|
||||
{
|
||||
"cref": "#/texts/0"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"groups": [],
|
||||
"texts": [
|
||||
{
|
||||
"self_ref": "#/texts/0",
|
||||
"parent": {
|
||||
"cref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
"label": "text",
|
||||
"prov": [
|
||||
{
|
||||
"page_no": 1,
|
||||
"bbox": {
|
||||
"l": 69.0,
|
||||
"t": 767.2550252278646,
|
||||
"r": 506.6666666666667,
|
||||
"b": 688.5883585611979,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"charspan": [
|
||||
0,
|
||||
94
|
||||
]
|
||||
}
|
||||
],
|
||||
"orig": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package",
|
||||
"text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package",
|
||||
"formatting": null,
|
||||
"hyperlink": null
|
||||
}
|
||||
],
|
||||
"pictures": [],
|
||||
"tables": [],
|
||||
"key_value_items": [],
|
||||
"form_items": [],
|
||||
"pages": {
|
||||
"1": {
|
||||
"size": {
|
||||
"width": 595.201171875,
|
||||
"height": 841.9216918945312
|
||||
},
|
||||
"image": null,
|
||||
"page_no": 1
|
||||
}
|
||||
}
|
||||
}
|
File diff suppressed because one or more lines are too long
@ -293,6 +293,7 @@ def verify_conversion_result_v1(
|
||||
generate: bool = False,
|
||||
ocr_engine: Optional[str] = None,
|
||||
fuzzy: bool = False,
|
||||
indent: int = 2,
|
||||
):
|
||||
PageList = TypeAdapter(List[Page])
|
||||
|
||||
@ -323,11 +324,13 @@ def verify_conversion_result_v1(
|
||||
if generate: # only used when re-generating truth
|
||||
pages_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(pages_path, "w") as fw:
|
||||
fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))
|
||||
fw.write(
|
||||
json.dumps(doc_pred_pages, default=pydantic_encoder, indent=indent)
|
||||
)
|
||||
|
||||
json_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(json_path, "w") as fw:
|
||||
fw.write(json.dumps(doc_pred, default=pydantic_encoder))
|
||||
fw.write(json.dumps(doc_pred, default=pydantic_encoder, indent=indent))
|
||||
|
||||
md_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(md_path, "w") as fw:
|
||||
@ -377,6 +380,7 @@ def verify_conversion_result_v2(
|
||||
generate: bool = False,
|
||||
ocr_engine: Optional[str] = None,
|
||||
fuzzy: bool = False,
|
||||
indent: int = 2,
|
||||
):
|
||||
PageList = TypeAdapter(List[Page])
|
||||
|
||||
@ -405,11 +409,13 @@ def verify_conversion_result_v2(
|
||||
if generate: # only used when re-generating truth
|
||||
pages_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(pages_path, "w") as fw:
|
||||
fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))
|
||||
fw.write(
|
||||
json.dumps(doc_pred_pages, default=pydantic_encoder, indent=indent)
|
||||
)
|
||||
|
||||
json_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(json_path, "w") as fw:
|
||||
fw.write(json.dumps(doc_pred, default=pydantic_encoder))
|
||||
fw.write(json.dumps(doc_pred, default=pydantic_encoder, indent=indent))
|
||||
|
||||
md_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(md_path, "w") as fw:
|
||||
|
Loading…
Reference in New Issue
Block a user