
fix: Support for RTL programmatic documents fix(parser): detect and handle rotated pages fix(parser): fix bug causing duplicated text fix(formula): improve stopping criteria chore: update lock file fix: temporary constrain beautifulsoup * switch to code formula model v1.0.1 and new test pdf Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> * switch to code formula model v1.0.1 and new test pdf Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> * cleaned up the data folder in the tests Signed-off-by: Peter Staar <taa@zurich.ibm.com> * switch to code formula model v1.0.1 and new test pdf Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> * added three test-files for right-to-left Signed-off-by: Peter Staar <taa@zurich.ibm.com> * fix black Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> * added new gt for test_e2e_conversion Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> * added new gt for test_e2e_conversion Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> * Add code to expose text direction of cell Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * new test file Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> * update lock Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * fix mypy reports Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * fix example filepaths Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * add test data results Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * pin wheel of latest docling-parse release Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * use latest docling-core Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * remove debugging code Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * fix path to files in example Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * Revert unwanted RTL additions Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix test data paths in examples Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> Signed-off-by: Peter Staar <taa@zurich.ibm.com> Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Matteo-Omenetti <Matteo.Omenetti1@ibm.com> Co-authored-by: Peter Staar <taa@zurich.ibm.com> Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
1 line
12 KiB
JSON
1 line
12 KiB
JSON
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "right_to_left_03.pdf", "filename-prov": null, "document-hash": "367cb9ca8606ce5676164d44f08ba7e28b794379a2124402672712e12a160bee", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "c13f4c78e4268264071589d2e5620246a5c3b3bf286522a5fed5edb9b6fdc1bc", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [68.78399658203125, 761.0098876953125, 267.65960693359375, 779.3882446289062], "page": 1, "span": [0, 42], "__ref_s3_data": null}], "text": "\u06cc\u0644\u062e\u0627\u062f \u06cc\u0644\u0627\u0627\u06a9 - \u06cc\u0644\u0635\u0627 \u0631\u0627\u0632\u0627\u0628 \u0631\u062f \u0634\u0631\u064a\u0630\u067e \u0647\u0645\u0627\u0646\u062f\u064a\u0645\u0627", "type": "subtitle-level-1", "payload": null, "name": "Section-header", "font": null}, {"name": "Picture", "type": "figure", "$ref": "#/figures/0"}, {"prov": [{"bbox": [373.989990234375, 685.375, 479.5299987792969, 703.405029296875], "page": 1, "span": [0, 19], "__ref_s3_data": null}], "text": "\u0644\u0627\u0627\u06a9 \u062f\u0631\u0627\u062f\u0646\u0627\u062a\u0633\u0627 -2-5", "type": "subtitle-level-1", "payload": null, "name": "Section-header", "font": null}, {"prov": [{"bbox": [458.7399879381041, 662.7401084473915, 519.2383976661823, 679.6162084078558], "page": 1, "span": [0, 13], "__ref_s3_data": null}], "text": "\u062f\u0631\u0627\u062f\u0646\u0627\u062a\u0633\u0627 \u0645\u0627\u0646", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [66.26399970216359, 631.5399785204845, 264.81795880972436, 681.171998404211], "page": 1, "span": [0, 97], "__ref_s3_data": null}], "text": "\u06cc\u0631\u06af \u0647\u062a\u062e\u064a\u0631 \u0634\u0648\u0631 \u0647\u0628 \u0647\u062f\u0634 \u062f\u064a\u0644\u0648\u062a \u0644\u0627\u0634\u0645\u0634 \u0648 \u0647\u0634\u0645\u0634 \u0641\u0631\u0635\u0645 \u062f\u0631\u0648\u0645 \u0647\u062a\u0633\u0648\u064a\u067e \u06cc\u0627 \u0647\u0632\u0627\u0633 \u06cc\u0627\u0647\u062f\u0644\u0627\u0648\u0641 \u0631\u062f - \u0642\u0628\u0627\u0637\u0645 \u062a\u0633\u0648\u064a\u067e \u0632\u064a\u0644\u0627\u0646\u0622", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [420.9099981081384, 608.8601085736167, 519.1619876665258, 625.7362085340809], "page": 1, "span": [0, 19], "__ref_s3_data": null}], "text": "\u06cc\u0644\u0645 \u062f\u0631\u0627\u062f\u0646\u0627\u062a\u0633\u0627 \u0647\u0631\u0627\u0645\u0634", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [236.80999893561153, 613.2999885632154, 265.01000880886113, 627.2919885304362], "page": 1, "span": [0, 5], "__ref_s3_data": null}], "text": "20300", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [406.9899881707045, 586.1501486268197, 519.1415376666176, 603.0262485872838], "page": 1, "span": [0, 21], "__ref_s3_data": null}], "text": "\u061f\u062a\u0633\u0627 \u06cc\u0631\u0627\u0628\u062c\u0627 \u062f\u0631\u0627\u062f\u0646\u0627\u062a\u0633\u0627", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [197.32999911306206, 590.5900286164182, 264.91399880929265, 604.5820285836392], "page": 1, "span": [0, 13], "__ref_s3_data": null}], "text": "\u0631\u064a\u062e \u06cc\u0644\u0628", "type": "checkbox-unselected", "payload": null, "name": "Checkbox-Unselected", "font": null}, {"prov": [{"bbox": [389.4699982494516, 563.4701486799523, 519.2136776662934, 580.3462486404165], "page": 1, "span": [0, 24], "__ref_s3_data": null}], "text": "\u062f\u0631\u0627\u062f\u0646\u0627\u062a\u0633\u0627 \u0647\u062f\u0646\u0646\u06a9\u0631\u062f\u0627\u0635 \u0639\u062c\u0631\u0645", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [166.5799992512739, 567.9100286695509, 264.77599880991295, 581.9020386367717], "page": 1, "span": [0, 26], "__ref_s3_data": null}], "text": "\u0646\u0627\u0631\u064a\u0627 \u062f\u0631\u0627\u062f\u0646\u0627\u062a\u0633\u0627 \u06cc\u0644\u0645 \u0646\u0627\u0645\u0632\u0627\u0633", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [292.129998686965, 518.5901487850932, 519.2351676661968, 557.6661986935493], "page": 1, "span": [0, 55], "__ref_s3_data": null}], "text": "\u0630\u062e\u0627 \u0627\u0631 \u0631\u0648\u06a9\u0630\u0645 \u062f\u0631\u0627\u062f\u0646\u0627\u062a\u0633\u0627 \u060c\u0644\u0648\u0635\u062d\u0645 \u0647\u062f\u0646\u0646\u06a9\u062f\u064a\u0644\u0648\u062a \u0627\u064a\u0622 \u061f\u062a\u0633\u0627 \u0647\u062f\u0648\u0645\u0646", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [197.32999911306206, 545.2299787226838, 208.04769906488926, 559.2219786899045], "page": 1, "span": [0, 3], "__ref_s3_data": null}], "text": "\u0631\u064a\u062e", "type": "checkbox-selected", "payload": null, "name": "Checkbox-Selected", "font": null}, {"prov": [{"bbox": [236.62821893642857, 545.2299787226838, 247.34591888825577, 559.2219786899045], "page": 1, "span": [0, 3], "__ref_s3_data": null}], "text": "\u06cc\u0644\u0628", "type": "checkbox-unselected", "payload": null, "name": "Checkbox-Unselected", "font": null}, {"prov": [{"bbox": [409.0299987792969, 473.71014404296875, 505.7644958496094, 490.5862121582031], "page": 1, "span": [0, 16], "__ref_s3_data": null}], "text": "\u0633\u0631\u0648\u0628 \u0631\u062f \u0634\u0631\u064a\u0630\u067e -3", "type": "subtitle-level-1", "payload": null, "name": "Section-header", "font": null}, {"prov": [{"bbox": [405.30999817825557, 451.01012894341363, 492.6107177858655, 467.88619890387787], "page": 1, "span": [0, 17], "__ref_s3_data": null}], "text": "\u06a9\u0631\u0627\u062f\u0645 \u0647\u0626\u0627\u0631\u0627 \u062e\u064a\u0631\u0627\u062a", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [137.89998938018175, 455.4699989329655, 187.8199891558066, 469.4620089001862], "page": 1, "span": [0, 10], "__ref_s3_data": null}], "text": "1403/09/19", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [426.309998083867, 428.3301389965463, 492.59463778593783, 445.2062089570106], "page": 1, "span": [0, 11], "__ref_s3_data": null}], "text": "\u0634\u0631\u064a\u0630\u067e \u062e\u064a\u0631\u0627\u062a", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [137.89998938018175, 432.7700189861449, 187.8199891558066, 446.7620189533657], "page": 1, "span": [0, 10], "__ref_s3_data": null}], "text": "1403/10/04", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [367.14998834977314, 405.65011904967906, 492.68526778553047, 422.5261790101433], "page": 1, "span": [0, 21], "__ref_s3_data": null}], "text": "\u0647\u0636\u0631\u0639 \u0647\u062a\u064a\u0645\u06a9 \u0647\u0633\u0644\u062c \u0647\u0631\u0627\u0645\u0634", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [154.69999930467083, 409.96999903955884, 171.19999923050838, 423.96200900677957], "page": 1, "span": [0, 3], "__ref_s3_data": null}], "text": "436", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [399.42998820468443, 382.8501291030928, 492.62752778578994, 399.72619906355703], "page": 1, "span": [0, 18], "__ref_s3_data": null}], "text": "\u0647\u0645\u0627\u0646\u062f\u064a\u0645\u0627 \u062c\u0631\u062f \u062e\u064a\u0631\u0627\u062a", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [137.89998938018175, 387.29000909269143, 187.8199891558066, 401.2820090599123], "page": 1, "span": [0, 10], "__ref_s3_data": null}], "text": "1403/10/05", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [422.82998809950857, 360.17013915622545, 492.6789577855588, 377.04619911668976], "page": 1, "span": [0, 11], "__ref_s3_data": null}], "text": "\u0634\u0631\u064a\u0630\u067e \u0631\u0648\u0627\u0634\u0645", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [122.05999945137766, 364.6100191458242, 203.6480090846645, 378.6020191130449], "page": 1, "span": [0, 19], "__ref_s3_data": null}], "text": "\u0633\u0631\u0648\u0628 \u0646\u0648\u0645\u0631\u0622 \u06cc\u0631\u0627\u0632\u06af\u0631\u0627\u06a9", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [281.3299887355078, 313.730129265021, 492.70525778544066, 352.6861891737582], "page": 1, "span": [0, 45], "__ref_s3_data": null}], "text": "\u0631\u062f \u0644\u0627\u0627\u06a9 \u0634\u0631\u064a\u0630\u067e \u0632\u0627 \u0633\u067e \u0647\u064a\u0627\u067e \u062a\u0645\u064a\u0642 \u0646\u064a\u064a\u0639\u062a \u0629\u0648\u062d\u0646 \u0633\u0631\u0648\u0628", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [109.21999950908952, 340.2499992028926, 213.67396903960088, 354.24199917011344], "page": 1, "span": [0, 23], "__ref_s3_data": null}], "text": "\u06cc\u0646\u0627\u0647\u062c \u06cc\u0627\u0647 \u062a\u0645\u064a\u0642 \u0633\u0627\u0633\u0627 \u0631\u0628", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [266.5700088018494, 268.82012937023217, 492.7008677854604, 307.7761792789694], "page": 1, "span": [0, 45], "__ref_s3_data": null}], "text": "\u0634\u0648\u0631\u0641 /\u0634\u0648\u0631\u0641 \u0644\u06a9 /\u062f\u064a\u0644\u0648\u062a \u0632\u0627 \u0647\u0636\u0631\u0639 \u062f\u0635\u0631\u062f \u0644\u0642\u0627\u062f\u062d \u06cc\u0644\u062e\u0627\u062f", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [85.4639966158655, 295.33999930810376, 240.36199891964634, 309.3319992753245], "page": 1, "span": [0, 39], "__ref_s3_data": null}], "text": "\u0646\u062a 47.500 \u0627\u064a \u0647\u0646\u0627\u064a\u0644\u0627\u0633 \u062f\u064a\u0644\u0648\u062a \u0632\u0627 %50 \u0644\u0642\u0627\u062f\u062d", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [404.2300081831098, 246.02010942364598, 492.6399177857343, 262.8962093841102], "page": 1, "span": [0, 15], "__ref_s3_data": null}], "text": "\u0644\u064a\u0648\u062d\u062a \u0632\u0627\u062c\u0645 \u06cc\u0627\u0637\u062e", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [106.93999951933742, 250.45998941324467, 218.89399901613845, 264.4519993804654], "page": 1, "span": [0, 26], "__ref_s3_data": null}], "text": "\u0644\u064a\u0648\u062d\u062a \u0644\u0628\u0627\u0642 \u0647\u0644\u0648\u0645\u062d\u0645 \u0646\u064a\u0631\u062e\u0622 5%", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [267.2900085449219, 52.00199890136719, 294.5899963378906, 62.54999923706055], "page": 1, "span": [0, 6], "__ref_s3_data": null}], "text": "Page 7", "type": "page-footer", "payload": null, "name": "Page-footer", "font": null}], "figures": [{"prov": [{"bbox": [388.5767822265625, 739.034423828125, 482.4759216308594, 806.0040893554688], "page": 1, "span": [0, 0], "__ref_s3_data": null}], "text": "", "type": "figure", "payload": null, "bounding-box": null}], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 842.0399780273438, "page": 1, "width": 595.3200073242188}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null} |