* Implement new reading-order model, replacing DS GLM model (WIP) Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update reading-order model branch Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update lockfile [skip ci] Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add captions, footnotes and merges [skip ci] Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Updates for reading-order implementation Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Updates for reading-order implementation Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update tests and lockfile Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fixes, update tests Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add normalization, update tests again Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update tests with code Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Push final lockfile Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * sanitize text Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * Inlcude furniture, Update tests with furniture Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix content_layer assignment Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * chore: Delete empty file docling/models/ds_glm_model.py Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Nikos Livathinos <nli@zurich.ibm.com>
1 line
9.2 KiB
JSON
1 line
9.2 KiB
JSON
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "right_to_left_01.pdf", "filename-prov": null, "document-hash": "85c9c0772fa51fd26f16eaae6abd522c96a4d169ceb7b72cbcfe3444ce22db79", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "6400df9d1750f707e1e0b310224d0b988ed99457bd230029715def0a6030dd06", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [223.85000999999997, 704.4510500000001, 521.98181, 719.4619800000002], "page": 1, "span": [0, 59], "__ref_s3_data": null}], "text": "Python\u0648 R \u0629\u063a\u0644\u0628 \u0629\u062c\u0645\u0631\u0628\u0644\u0627 \u0644\u0644\u0627\u062e \u0646\u0645 \u062a\u0644\u0627\u0643\u0634\u0645\u0644\u0627 \u0644\u062d\u0648 \u0629\u064a\u062c\u0627\u062a\u0646\u0644\u0625\u0627 \u0646\u064a\u0633\u062d\u062a", "type": "subtitle-level-1", "payload": null, "name": "Section-header", "font": null}, {"prov": [{"bbox": [90.744003, 635.30804, 522.19, 689.992], "page": 1, "span": [0, 345], "__ref_s3_data": null}], "text": "Python \u0648 R \u0629\u063a\u0644\u0628 \u0629\u062c\u0645\u0631\u0628\u0644\u0627 \u0631\u0628\u062a\u0639\u062a \u0629\u0644\u0627\u0639\u0641 \u0644\u0648\u0644\u062d \u062f\u0627\u062c\u064a\u0625 \u064a\u0641 \u062f\u0639\u0627\u0633\u062a\u0648 \u0629\u064a\u062c\u0627\u062a\u0646\u0644\u0625\u0627 \u0632\u0632\u0639\u062a \u0646\u0623 \u0646\u0643\u0645\u064a \u064a\u062a\u0644\u0627 \u0629\u064a\u0648\u0642\u0644\u0627 \u062a\u0627\u0648\u062f\u0644\u0623\u0627 \u0646\u0645 \u0621\u0627\u0645\u0644\u0639\u0644\u0627\u0648 \u0646\u064a\u0644\u0644\u062d\u0645\u0644\u0627 \u0649\u0644\u0639 \u0644\u0647\u0633\u064a \u0627\u0645\u0645 \u060c\u062a\u0627\u0646\u0627\u064a\u0628\u0644\u0627 \u0644\u064a\u0644\u062d\u062a\u0644 \u0629\u064a\u0644\u0627\u062b\u0645 \u0627\u0647\u0644\u0639\u062c\u062a \u0629\u062f\u064a\u0631\u0641 \u062a\u0627\u0632\u064a\u0645Python \u0648 R \u0646\u0645 \u0644\u0643 \u0643\u0644\u062a\u0645\u064a .\u062a\u0644\u0627\u0643\u0634\u0645\u0644\u0644 \u0646\u0627\u0643 \u0627\u0630\u0625 .\u0629\u0644\u0627\u0639\u0641\u0648 \u0629\u0639\u064a\u0631\u0633 \u0629\u0642\u064a\u0631\u0637\u0628 \u0629\u062f\u0642\u0639\u0645 \u062a\u0644\u0627\u064a\u0644\u062d\u062a \u0621\u0627\u0631\u062c\u0625 \u0645\u0647\u0633\u064a \u0646\u0623 \u0646\u0643\u0645\u064a \u062a\u0627\u063a\u0644\u0644\u0627 \u0647\u0630\u0647 \u0645\u0627\u062f\u062e\u062a\u0633\u0627 \u0646\u0625\u0641 \u060c\u0629\u064a\u0644\u064a\u0644\u062d\u062a \u0629\u064a\u0644\u0642\u0639 \u0643\u064a\u062f\u0644 .\u0644\u0645\u0639\u0644\u0627 \u062c\u0626\u0627\u062a\u0646 \u0646\u064a\u0633\u062d\u062a \u064a\u0641 \u0631\u064a\u0628\u0643 \u0644\u0643\u0634\u0628", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [208.10402, 579.38806, 208.10402, 592.67206], "page": 1, "span": [0, 1], "__ref_s3_data": null}], "text": "\u064b", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [509.34990999999997, 564.74799, 509.34990999999997, 578.03198], "page": 1, "span": [0, 1], "__ref_s3_data": null}], "text": "\u064b", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [99.863998, 566.06799, 522.23792, 620.75201], "page": 1, "span": [0, 348], "__ref_s3_data": null}], "text": "\u062c\u0627\u0631\u062e\u062a\u0633\u0627\u0648 \u062a\u0627\u0646\u0627\u064a\u0628\u0644\u0627 \u0646\u0645 \u0629\u0644\u0626\u0627\u0647 \u062a\u0627\u064a\u0645\u0643 \u0629\u062c\u0644\u0627\u0639\u0645 \u0646\u0643\u0645\u0645\u0644\u0627 \u0646\u0645 \u062d\u0628\u0635\u064a \u060c\u0629\u062c\u0645\u0631\u0628\u0644\u0627 \u062a\u0627\u0631\u0627\u0647\u0645 \u0639\u0645 \u064a\u0644\u064a\u0644\u062d\u062a\u0644\u0627 \u0631\u064a\u0643\u0641\u062a\u0644\u0627 \u0639\u0645\u062a\u062c\u064a \u0627\u0645\u062f\u0646\u0639 \u0630\u064a\u0641\u0646\u062a\u0644Python \u0648 R \u0645\u0627\u062f\u062e\u062a\u0633\u0627 \u0646\u064a\u062c\u0645\u0631\u0628\u0645\u0644\u0644 \u0646\u0643\u0645\u064a .\u0627\u0647\u0646\u0645 \u062a\u0627\u0647\u062c\u0648\u062a\u0644\u0627\u0648 \u0637\u0627\u0645\u0646\u0644\u0623\u0627 \u0629\u062c\u0630\u0645\u0646\u0644\u0627 \u0644\u062b\u0645 \u060c\u0629\u0645\u062f\u0642\u062a\u0645 \u0629\u064a\u0644\u064a\u0644\u062d\u062a \u062a\u0627\u064a\u0644\u0645\u0639 \u0629\u0642\u062f \u0631\u062b\u0643\u0623 \u062a\u0627\u0631\u0627\u0631\u0642 \u0630\u0627\u062e\u062a\u0627 \u0649\u0644\u0625 \u0627 \u0636\u064a\u0623 \u064a\u062f\u0624\u064a \u0646\u0623 \u0646\u0643\u0645\u064a \u0644\u0628 \u060c\u062a\u0642\u0648\u0644\u0627 \u0631\u0641\u0648\u064a \u0637\u0642\u0641 \u0633\u064a\u0644 \u0627\u0630\u0647 .\u0629\u0631\u064a\u0628\u0643\u0644\u0627 \u062a\u0627\u0646\u0627\u064a\u0628\u0644\u0627 \u0644\u064a\u0644\u062d\u062a\u0648 \u0629\u064a\u0626\u0627\u0635\u062d\u0644\u0625\u0627 \u062a\u0627\u0646\u0627\u064a\u0628\u0644\u0627 \u0649\u0644\u0639 \u0629\u0645\u0626\u0627\u0642 \u062a\u0627\u062c\u0627\u062a\u0646\u062a\u0633\u0627 \u0649\u0644\u0639 \u0621\u0627\u0646\u0628 .", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [92.903999, 496.91799999999995, 522.10596, 551.63202], "page": 1, "span": [0, 375], "__ref_s3_data": null}], "text": "\u0644\u064a\u0644\u062d\u062a\u0644\u0627 \u0646\u0645 \u060c\u062a\u0627\u0642\u064a\u0628\u0637\u062a\u0644\u0627 \u0646\u0645 \u0629\u0639\u0633\u0627\u0648 \u0629\u0639\u0648\u0645\u062c\u0645 \u0645\u0639\u062f\u062a \u0629\u064a\u0646\u063a \u062a\u0627\u0648\u062f\u0623\u0648 \u062a\u0627\u0628\u062a\u0643\u0645Python \u0648 R \u0646\u0645 \u0644\u0643 \u0631\u0641\u0648\u062a \u060c\u0643\u0644\u0630 \u0649\u0644\u0639 \u0629\u0648\u0644\u0627\u0639 \u0649\u0644\u0639 .\u0629\u0641\u0644\u062a\u062e\u0645\u0644\u0627 \u062a\u0644\u0627\u0643\u0634\u0645\u0644\u0644 \u0629\u0631\u0643\u062a\u0628\u0645 \u0644\u0648\u0644\u062d \u0631\u064a\u0648\u0637\u062a\u0644 \u062a\u0627\u0628\u062a\u0643\u0645\u0644\u0627 \u0647\u0630\u0647 \u0646\u0645 \u0629\u062f\u0627\u0641\u062a\u0633\u0644\u0627\u0627 \u0646\u064a\u0645\u062f\u062e\u062a\u0633\u0645\u0644\u0644 \u0646\u0643\u0645\u064a .\u064a\u0644\u0644\u0622\u0627 \u0645\u0644\u0639\u062a\u0644\u0627 \u0649\u0644\u0625 \u064a\u0646\u0627\u064a\u0628\u0644\u0627 R \u0631\u0641\u0648\u062a \u0627\u0645\u0646\u064a\u0628 \u060c\u0629\u0621\u0627\u0641\u0643\u0628 \u062a\u0627\u0646\u0627\u064a\u0628\u0644\u0627 \u0629\u0631\u0627\u062f\u0644\u0625 Python \u064a\u0641 pandas \u0629\u0628\u062a\u0643\u0645 \u0645\u0627\u062f\u062e\u062a\u0633\u0627 \u0646\u0643\u0645\u064a \u060c\u0644\u0627\u062b\u0645\u0644\u0627 \u0644\u064a\u0628\u0633 \u0645\u0633\u0631\u0644\u0644 \u0629\u064a\u0648\u0642 \u062a\u0627\u0648\u062f\u0623 .\u0646\u064a\u0644\u0644\u062d\u0645\u0644\u0627\u0648 \u0646\u064a\u062b\u062d\u0627\u0628\u0644\u0644 \u0629\u064a\u0644\u0627\u062b\u0645 \u0627\u0647\u0644\u0639\u062c\u064a \u0627\u0645\u0645 \u060c\u064a\u0626\u0627\u0635\u062d\u0644\u0625\u0627 \u0644\u064a\u0644\u062d\u062a\u0644\u0627\u0648 \u064a\u0646\u0627\u064a\u0628\u0644\u0627", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [96.863998, 441.478, 522.07404, 482.362], "page": 1, "span": [0, 267], "__ref_s3_data": null}], "text": "Python \u0648 R \u0629\u063a\u0644\u0628 \u0629\u062c\u0645\u0631\u0628\u0644\u0627 \u064a\u062f\u0624\u062a \u0646\u0623 \u0646\u0643\u0645\u064a \u060c\u0629\u064a\u0627\u0647\u0646\u0644\u0627 \u064a\u0641 \u0629\u0631\u0643\u062a\u0628\u0645 \u0644\u0648\u0644\u062d \u0631\u064a\u0641\u0648\u062a\u0648 \u0629\u064a\u062c\u0627\u062a\u0646\u0644\u0625\u0627 \u0646\u064a\u0633\u062d\u062a \u0649\u0644\u0625 \u0629\u064a\u0644\u064a\u0644\u062d\u062a \u0629\u064a\u0644\u0642\u0639 \u0639\u0645 \u0627\u0647\u0644 \u0646\u0648\u0643\u062a \u0646\u0623 \u0646\u0643\u0645\u064a \u0629\u0628\u0633\u0627\u0646\u0645\u0644\u0627 \u0629\u064a\u062c\u0645\u0631\u0628\u0644\u0627 \u0628\u064a\u0644\u0627\u0633\u0644\u0623\u0627 \u0642\u064a\u0628\u0637\u062a\u0648 \u0644\u0627\u0639\u0641 \u0644\u0643\u0634\u0628 \u062a\u0627\u0646\u0627\u064a\u0628\u0644\u0627 \u0644\u064a\u0644\u062d\u062a \u0649\u0644\u0639 \u0629\u0631\u062f\u0642\u0644\u0627 \u0646\u0625 .\u0629\u062f\u0642\u0639\u0645\u0644\u0627 \u062a\u0644\u0627\u0643\u0634\u0645\u0644\u0644 .\u064a\u0646\u0647\u0645\u0644\u0627\u0648 \u064a\u0635\u062e\u0634\u0644\u0627 \u0621\u0627\u062f\u0644\u0623\u0627 \u0649\u0644\u0639 \u0649\u062f\u0645\u0644\u0627 \u0629\u062f\u064a\u0639\u0628 \u0629\u064a\u0628\u0627\u062c\u064a\u0625 \u062a\u0627\u0631\u064a\u062b\u0623\u062a", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 792.0, "page": 1, "width": 612.0}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null} |