From e979750ce93b2fae89dbb60ff06333f80c1c2908 Mon Sep 17 00:00:00 2001 From: Maras Ioannis <34172738+IoannisMaras@users.noreply.github.com> Date: Tue, 10 Jun 2025 11:57:45 +0300 Subject: [PATCH] fix(tesseract): initialize df_osd to avoid uninitialized variable error (#1718) * fix: initialize df_osd to avoid uninitialized variable error Signed-off-by: IoannisMaras * Fix formatting Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> * Satisfy mypy, regenerate OCR tests Signed-off-by: Christoph Auer --------- Signed-off-by: IoannisMaras Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> Signed-off-by: Christoph Auer Co-authored-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> Co-authored-by: Christoph Auer --- docling/models/tesseract_ocr_cli_model.py | 5 +- .../groundtruth/docling_v1/ocr_test.json | 6 +- .../docling_v1/ocr_test.pages.json | 146 +++++------ .../docling_v1/ocr_test_rotated_180.json | 16 +- .../ocr_test_rotated_180.pages.json | 240 +++++++++--------- .../docling_v1/ocr_test_rotated_270.json | 8 +- .../ocr_test_rotated_270.pages.json | 170 ++++++------- .../docling_v1/ocr_test_rotated_90.pages.json | 152 +++++------ .../docling_v2/ocr_test.doctags.txt | 2 +- .../groundtruth/docling_v2/ocr_test.json | 6 +- .../docling_v2/ocr_test.pages.json | 146 +++++------ .../ocr_test_rotated_180.doctags.txt | 2 +- .../docling_v2/ocr_test_rotated_180.json | 16 +- .../ocr_test_rotated_180.pages.json | 240 +++++++++--------- .../ocr_test_rotated_270.doctags.txt | 2 +- .../docling_v2/ocr_test_rotated_270.json | 14 +- .../ocr_test_rotated_270.pages.json | 170 ++++++------- .../ocr_test_rotated_90.doctags.txt | 2 +- .../docling_v2/ocr_test_rotated_90.json | 8 +- .../docling_v2/ocr_test_rotated_90.pages.json | 152 +++++------ 20 files changed, 752 insertions(+), 751 deletions(-) diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py index e690104..5c8d0ae 100644 --- a/docling/models/tesseract_ocr_cli_model.py +++ b/docling/models/tesseract_ocr_cli_model.py @@ -99,12 +99,12 @@ class TesseractOcrCliModel(BaseOcrModel): return name, version - def _run_tesseract(self, ifilename: str, osd: pd.DataFrame): + def _run_tesseract(self, ifilename: str, osd: Optional[pd.DataFrame]): r""" Run tesseract CLI """ cmd = [self.options.tesseract_cmd] - if self._is_auto: + if self._is_auto and osd is not None: lang = self._parse_language(osd) if lang is not None: cmd.append("-l") @@ -231,6 +231,7 @@ class TesseractOcrCliModel(BaseOcrModel): fname = image_file.name high_res_image.save(image_file) doc_orientation = 0 + df_osd: Optional[pd.DataFrame] = None try: df_osd = self._perform_osd(fname) doc_orientation = _parse_orientation(df_osd) diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test.json b/tests/data_scanned/groundtruth/docling_v1/ocr_test.json index 9895cda..8dbfff1 100644 --- a/tests/data_scanned/groundtruth/docling_v1/ocr_test.json +++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test.json @@ -44,9 +44,9 @@ "prov": [ { "bbox": [ - 70.90211866351085, - 689.216658542347, - 504.8720079864275, + 69.6796630536824, + 689.0124221922704, + 504.8720051760782, 764.9216921155637 ], "page": 1, diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test.pages.json b/tests/data_scanned/groundtruth/docling_v1/ocr_test.pages.json index 10d2f99..f5c2816 100644 --- a/tests/data_scanned/groundtruth/docling_v1/ocr_test.pages.json +++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test.pages.json @@ -40,14 +40,14 @@ "a": 255 }, "rect": { - "r_x0": 70.90211866351085, - "r_y0": 124.83139551297342, - "r_x1": 504.8720079864275, - "r_y1": 124.83139551297342, - "r_x2": 504.8720079864275, - "r_y2": 102.66666671251768, - "r_x3": 70.90211866351085, - "r_y3": 102.66666671251768, + "r_x0": 69.6796630536824, + "r_y0": 124.83139494707741, + "r_x1": 504.8720051760782, + "r_y1": 124.83139494707741, + "r_x2": 504.8720051760782, + "r_y2": 104.00000011573796, + "r_x3": 69.6796630536824, + "r_y3": 104.00000011573796, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -65,14 +65,14 @@ "a": 255 }, "rect": { - "r_x0": 73.10852522817731, - "r_y0": 152.70503335218433, - "r_x1": 153.04479435252625, - "r_y1": 152.70503335218433, - "r_x2": 153.04479435252625, - "r_y2": 130.00136157890958, - "r_x3": 73.10852522817731, - "r_y3": 130.00136157890958, + "r_x0": 71.84193505100733, + "r_y0": 152.90926970226084, + "r_x1": 153.088934155825, + "r_y1": 152.90926970226084, + "r_x2": 153.088934155825, + "r_y2": 129.797125232046, + "r_x3": 71.84193505100733, + "r_y3": 129.797125232046, "coord_origin": "TOPLEFT" }, "text": "package", @@ -90,10 +90,10 @@ "id": 0, "label": "text", "bbox": { - "l": 70.90211866351085, + "l": 69.6796630536824, "t": 76.99999977896756, - "r": 504.8720079864275, - "b": 152.70503335218433, + "r": 504.8720051760782, + "b": 152.90926970226084, "coord_origin": "TOPLEFT" }, "confidence": 0.9715733528137207, @@ -132,14 +132,14 @@ "a": 255 }, "rect": { - "r_x0": 70.90211866351085, - "r_y0": 124.83139551297342, - "r_x1": 504.8720079864275, - "r_y1": 124.83139551297342, - "r_x2": 504.8720079864275, - "r_y2": 102.66666671251768, - "r_x3": 70.90211866351085, - "r_y3": 102.66666671251768, + "r_x0": 69.6796630536824, + "r_y0": 124.83139494707741, + "r_x1": 504.8720051760782, + "r_y1": 124.83139494707741, + "r_x2": 504.8720051760782, + "r_y2": 104.00000011573796, + "r_x3": 69.6796630536824, + "r_y3": 104.00000011573796, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -157,14 +157,14 @@ "a": 255 }, "rect": { - "r_x0": 73.10852522817731, - "r_y0": 152.70503335218433, - "r_x1": 153.04479435252625, - "r_y1": 152.70503335218433, - "r_x2": 153.04479435252625, - "r_y2": 130.00136157890958, - "r_x3": 73.10852522817731, - "r_y3": 130.00136157890958, + "r_x0": 71.84193505100733, + "r_y0": 152.90926970226084, + "r_x1": 153.088934155825, + "r_y1": 152.90926970226084, + "r_x2": 153.088934155825, + "r_y2": 129.797125232046, + "r_x3": 71.84193505100733, + "r_y3": 129.797125232046, "coord_origin": "TOPLEFT" }, "text": "package", @@ -195,10 +195,10 @@ "id": 0, "label": "text", "bbox": { - "l": 70.90211866351085, + "l": 69.6796630536824, "t": 76.99999977896756, - "r": 504.8720079864275, - "b": 152.70503335218433, + "r": 504.8720051760782, + "b": 152.90926970226084, "coord_origin": "TOPLEFT" }, "confidence": 0.9715733528137207, @@ -237,14 +237,14 @@ "a": 255 }, "rect": { - "r_x0": 70.90211866351085, - "r_y0": 124.83139551297342, - "r_x1": 504.8720079864275, - "r_y1": 124.83139551297342, - "r_x2": 504.8720079864275, - "r_y2": 102.66666671251768, - "r_x3": 70.90211866351085, - "r_y3": 102.66666671251768, + "r_x0": 69.6796630536824, + "r_y0": 124.83139494707741, + "r_x1": 504.8720051760782, + "r_y1": 124.83139494707741, + "r_x2": 504.8720051760782, + "r_y2": 104.00000011573796, + "r_x3": 69.6796630536824, + "r_y3": 104.00000011573796, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -262,14 +262,14 @@ "a": 255 }, "rect": { - "r_x0": 73.10852522817731, - "r_y0": 152.70503335218433, - "r_x1": 153.04479435252625, - "r_y1": 152.70503335218433, - "r_x2": 153.04479435252625, - "r_y2": 130.00136157890958, - "r_x3": 73.10852522817731, - "r_y3": 130.00136157890958, + "r_x0": 71.84193505100733, + "r_y0": 152.90926970226084, + "r_x1": 153.088934155825, + "r_y1": 152.90926970226084, + "r_x2": 153.088934155825, + "r_y2": 129.797125232046, + "r_x3": 71.84193505100733, + "r_y3": 129.797125232046, "coord_origin": "TOPLEFT" }, "text": "package", @@ -293,10 +293,10 @@ "id": 0, "label": "text", "bbox": { - "l": 70.90211866351085, + "l": 69.6796630536824, "t": 76.99999977896756, - "r": 504.8720079864275, - "b": 152.70503335218433, + "r": 504.8720051760782, + "b": 152.90926970226084, "coord_origin": "TOPLEFT" }, "confidence": 0.9715733528137207, @@ -335,14 +335,14 @@ "a": 255 }, "rect": { - "r_x0": 70.90211866351085, - "r_y0": 124.83139551297342, - "r_x1": 504.8720079864275, - "r_y1": 124.83139551297342, - "r_x2": 504.8720079864275, - "r_y2": 102.66666671251768, - "r_x3": 70.90211866351085, - "r_y3": 102.66666671251768, + "r_x0": 69.6796630536824, + "r_y0": 124.83139494707741, + "r_x1": 504.8720051760782, + "r_y1": 124.83139494707741, + "r_x2": 504.8720051760782, + "r_y2": 104.00000011573796, + "r_x3": 69.6796630536824, + "r_y3": 104.00000011573796, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -360,14 +360,14 @@ "a": 255 }, "rect": { - "r_x0": 73.10852522817731, - "r_y0": 152.70503335218433, - "r_x1": 153.04479435252625, - "r_y1": 152.70503335218433, - "r_x2": 153.04479435252625, - "r_y2": 130.00136157890958, - "r_x3": 73.10852522817731, - "r_y3": 130.00136157890958, + "r_x0": 71.84193505100733, + "r_y0": 152.90926970226084, + "r_x1": 153.088934155825, + "r_y1": 152.90926970226084, + "r_x2": 153.088934155825, + "r_y2": 129.797125232046, + "r_x3": 71.84193505100733, + "r_y3": 129.797125232046, "coord_origin": "TOPLEFT" }, "text": "package", diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.json b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.json index 1722766..8de137d 100644 --- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.json +++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.json @@ -44,10 +44,10 @@ "prov": [ { "bbox": [ - 441.304584329099, - 132.09610360960653, - 521.9863114205704, - 151.67751306395223 + 441.2561096985719, + 131.89488404865142, + 522.0347860494834, + 151.87873262042876 ], "page": 1, "span": [ @@ -67,10 +67,10 @@ "prov": [ { "bbox": [ - 89.12133215549848, - 77.02339849621205, - 523.3501733013318, - 124.86176457554109 + 89.23887497045128, + 77.02339852098021, + 523.208764293368, + 124.75312428291147 ], "page": 1, "span": [ diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.pages.json b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.pages.json index 1141247..23614fb 100644 --- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.pages.json +++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.pages.json @@ -15,14 +15,14 @@ "a": 255 }, "rect": { - "r_x0": 90.46133071208328, - "r_y0": 764.8982933983192, - "r_x1": 520.7638616365624, - "r_y1": 764.8982933983192, - "r_x2": 520.7638616365624, - "r_y2": 744.0929853742306, - "r_x3": 90.46133071208328, - "r_y3": 744.0929853742306, + "r_x0": 89.2388782764286, + "r_y0": 764.898293373551, + "r_x1": 521.9863147998661, + "r_y1": 764.898293373551, + "r_x2": 521.9863147998661, + "r_y2": 744.0929853494625, + "r_x3": 89.2388782764286, + "r_y3": 744.0929853494625, "coord_origin": "TOPLEFT" }, "text": "Docling bundles PDF document conversion to", @@ -40,14 +40,14 @@ "a": 255 }, "rect": { - "r_x0": 89.12133215549848, - "r_y0": 741.5247710689902, - "r_x1": 523.3501733013318, - "r_y1": 741.5247710689902, - "r_x2": 523.3501733013318, - "r_y2": 717.0599273189902, - "r_x3": 89.12133215549848, - "r_y3": 717.0599273189902, + "r_x0": 89.23887497045128, + "r_y0": 739.1977118987292, + "r_x1": 523.208764293368, + "r_y1": 739.1977118987292, + "r_x2": 523.208764293368, + "r_y2": 717.1685676116198, + "r_x3": 89.23887497045128, + "r_y3": 717.1685676116198, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -65,14 +65,14 @@ "a": 255 }, "rect": { - "r_x0": 441.304584329099, - "r_y0": 709.8255882849247, - "r_x1": 521.9863114205704, - "r_y1": 709.8255882849247, - "r_x2": 521.9863114205704, - "r_y2": 690.244178830579, - "r_x3": 441.304584329099, - "r_y3": 690.244178830579, + "r_x0": 441.2561096985719, + "r_y0": 710.0268078458798, + "r_x1": 522.0347860494834, + "r_y1": 710.0268078458798, + "r_x2": 522.0347860494834, + "r_y2": 690.0429592741025, + "r_x3": 441.2561096985719, + "r_y3": 690.0429592741025, "coord_origin": "TOPLEFT" }, "text": "package", @@ -90,10 +90,10 @@ "id": 0, "label": "text", "bbox": { - "l": 89.12133215549848, - "t": 717.0599273189902, - "r": 523.3501733013318, - "b": 764.8982933983192, + "l": 89.23887497045128, + "t": 717.1685676116198, + "r": 523.208764293368, + "b": 764.898293373551, "coord_origin": "TOPLEFT" }, "confidence": 0.7318570613861084, @@ -107,14 +107,14 @@ "a": 255 }, "rect": { - "r_x0": 90.46133071208328, - "r_y0": 764.8982933983192, - "r_x1": 520.7638616365624, - "r_y1": 764.8982933983192, - "r_x2": 520.7638616365624, - "r_y2": 744.0929853742306, - "r_x3": 90.46133071208328, - "r_y3": 744.0929853742306, + "r_x0": 89.2388782764286, + "r_y0": 764.898293373551, + "r_x1": 521.9863147998661, + "r_y1": 764.898293373551, + "r_x2": 521.9863147998661, + "r_y2": 744.0929853494625, + "r_x3": 89.2388782764286, + "r_y3": 744.0929853494625, "coord_origin": "TOPLEFT" }, "text": "Docling bundles PDF document conversion to", @@ -132,14 +132,14 @@ "a": 255 }, "rect": { - "r_x0": 89.12133215549848, - "r_y0": 741.5247710689902, - "r_x1": 523.3501733013318, - "r_y1": 741.5247710689902, - "r_x2": 523.3501733013318, - "r_y2": 717.0599273189902, - "r_x3": 89.12133215549848, - "r_y3": 717.0599273189902, + "r_x0": 89.23887497045128, + "r_y0": 739.1977118987292, + "r_x1": 523.208764293368, + "r_y1": 739.1977118987292, + "r_x2": 523.208764293368, + "r_y2": 717.1685676116198, + "r_x3": 89.23887497045128, + "r_y3": 717.1685676116198, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -155,10 +155,10 @@ "id": 2, "label": "text", "bbox": { - "l": 441.304584329099, - "t": 690.244178830579, - "r": 521.9863114205704, - "b": 709.8255882849247, + "l": 441.2561096985719, + "t": 690.0429592741025, + "r": 522.0347860494834, + "b": 710.0268078458798, "coord_origin": "TOPLEFT" }, "confidence": 0.5982133150100708, @@ -172,14 +172,14 @@ "a": 255 }, "rect": { - "r_x0": 441.304584329099, - "r_y0": 709.8255882849247, - "r_x1": 521.9863114205704, - "r_y1": 709.8255882849247, - "r_x2": 521.9863114205704, - "r_y2": 690.244178830579, - "r_x3": 441.304584329099, - "r_y3": 690.244178830579, + "r_x0": 441.2561096985719, + "r_y0": 710.0268078458798, + "r_x1": 522.0347860494834, + "r_y1": 710.0268078458798, + "r_x2": 522.0347860494834, + "r_y2": 690.0429592741025, + "r_x3": 441.2561096985719, + "r_y3": 690.0429592741025, "coord_origin": "TOPLEFT" }, "text": "package", @@ -210,10 +210,10 @@ "id": 0, "label": "text", "bbox": { - "l": 89.12133215549848, - "t": 717.0599273189902, - "r": 523.3501733013318, - "b": 764.8982933983192, + "l": 89.23887497045128, + "t": 717.1685676116198, + "r": 523.208764293368, + "b": 764.898293373551, "coord_origin": "TOPLEFT" }, "confidence": 0.7318570613861084, @@ -227,14 +227,14 @@ "a": 255 }, "rect": { - "r_x0": 90.46133071208328, - "r_y0": 764.8982933983192, - "r_x1": 520.7638616365624, - "r_y1": 764.8982933983192, - "r_x2": 520.7638616365624, - "r_y2": 744.0929853742306, - "r_x3": 90.46133071208328, - "r_y3": 744.0929853742306, + "r_x0": 89.2388782764286, + "r_y0": 764.898293373551, + "r_x1": 521.9863147998661, + "r_y1": 764.898293373551, + "r_x2": 521.9863147998661, + "r_y2": 744.0929853494625, + "r_x3": 89.2388782764286, + "r_y3": 744.0929853494625, "coord_origin": "TOPLEFT" }, "text": "Docling bundles PDF document conversion to", @@ -252,14 +252,14 @@ "a": 255 }, "rect": { - "r_x0": 89.12133215549848, - "r_y0": 741.5247710689902, - "r_x1": 523.3501733013318, - "r_y1": 741.5247710689902, - "r_x2": 523.3501733013318, - "r_y2": 717.0599273189902, - "r_x3": 89.12133215549848, - "r_y3": 717.0599273189902, + "r_x0": 89.23887497045128, + "r_y0": 739.1977118987292, + "r_x1": 523.208764293368, + "r_y1": 739.1977118987292, + "r_x2": 523.208764293368, + "r_y2": 717.1685676116198, + "r_x3": 89.23887497045128, + "r_y3": 717.1685676116198, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -281,10 +281,10 @@ "id": 2, "label": "text", "bbox": { - "l": 441.304584329099, - "t": 690.244178830579, - "r": 521.9863114205704, - "b": 709.8255882849247, + "l": 441.2561096985719, + "t": 690.0429592741025, + "r": 522.0347860494834, + "b": 710.0268078458798, "coord_origin": "TOPLEFT" }, "confidence": 0.5982133150100708, @@ -298,14 +298,14 @@ "a": 255 }, "rect": { - "r_x0": 441.304584329099, - "r_y0": 709.8255882849247, - "r_x1": 521.9863114205704, - "r_y1": 709.8255882849247, - "r_x2": 521.9863114205704, - "r_y2": 690.244178830579, - "r_x3": 441.304584329099, - "r_y3": 690.244178830579, + "r_x0": 441.2561096985719, + "r_y0": 710.0268078458798, + "r_x1": 522.0347860494834, + "r_y1": 710.0268078458798, + "r_x2": 522.0347860494834, + "r_y2": 690.0429592741025, + "r_x3": 441.2561096985719, + "r_y3": 690.0429592741025, "coord_origin": "TOPLEFT" }, "text": "package", @@ -329,10 +329,10 @@ "id": 0, "label": "text", "bbox": { - "l": 89.12133215549848, - "t": 717.0599273189902, - "r": 523.3501733013318, - "b": 764.8982933983192, + "l": 89.23887497045128, + "t": 717.1685676116198, + "r": 523.208764293368, + "b": 764.898293373551, "coord_origin": "TOPLEFT" }, "confidence": 0.7318570613861084, @@ -346,14 +346,14 @@ "a": 255 }, "rect": { - "r_x0": 90.46133071208328, - "r_y0": 764.8982933983192, - "r_x1": 520.7638616365624, - "r_y1": 764.8982933983192, - "r_x2": 520.7638616365624, - "r_y2": 744.0929853742306, - "r_x3": 90.46133071208328, - "r_y3": 744.0929853742306, + "r_x0": 89.2388782764286, + "r_y0": 764.898293373551, + "r_x1": 521.9863147998661, + "r_y1": 764.898293373551, + "r_x2": 521.9863147998661, + "r_y2": 744.0929853494625, + "r_x3": 89.2388782764286, + "r_y3": 744.0929853494625, "coord_origin": "TOPLEFT" }, "text": "Docling bundles PDF document conversion to", @@ -371,14 +371,14 @@ "a": 255 }, "rect": { - "r_x0": 89.12133215549848, - "r_y0": 741.5247710689902, - "r_x1": 523.3501733013318, - "r_y1": 741.5247710689902, - "r_x2": 523.3501733013318, - "r_y2": 717.0599273189902, - "r_x3": 89.12133215549848, - "r_y3": 717.0599273189902, + "r_x0": 89.23887497045128, + "r_y0": 739.1977118987292, + "r_x1": 523.208764293368, + "r_y1": 739.1977118987292, + "r_x2": 523.208764293368, + "r_y2": 717.1685676116198, + "r_x3": 89.23887497045128, + "r_y3": 717.1685676116198, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -400,10 +400,10 @@ "id": 2, "label": "text", "bbox": { - "l": 441.304584329099, - "t": 690.244178830579, - "r": 521.9863114205704, - "b": 709.8255882849247, + "l": 441.2561096985719, + "t": 690.0429592741025, + "r": 522.0347860494834, + "b": 710.0268078458798, "coord_origin": "TOPLEFT" }, "confidence": 0.5982133150100708, @@ -417,14 +417,14 @@ "a": 255 }, "rect": { - "r_x0": 441.304584329099, - "r_y0": 709.8255882849247, - "r_x1": 521.9863114205704, - "r_y1": 709.8255882849247, - "r_x2": 521.9863114205704, - "r_y2": 690.244178830579, - "r_x3": 441.304584329099, - "r_y3": 690.244178830579, + "r_x0": 441.2561096985719, + "r_y0": 710.0268078458798, + "r_x1": 522.0347860494834, + "r_y1": 710.0268078458798, + "r_x2": 522.0347860494834, + "r_y2": 690.0429592741025, + "r_x3": 441.2561096985719, + "r_y3": 690.0429592741025, "coord_origin": "TOPLEFT" }, "text": "package", diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.json b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.json index b143922..fed4d9e 100644 --- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.json +++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.json @@ -44,10 +44,10 @@ "prov": [ { "bbox": [ - 691.4680194659409, - 442.3948768148814, - 709.8255850278712, - 523.0765988200898 + 690.2441821046808, + 442.39487414368364, + 709.8255852011977, + 523.076601235155 ], "page": 1, "span": [ diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.pages.json b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.pages.json index dac1284..69af79a 100644 --- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.pages.json +++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.pages.json @@ -40,14 +40,14 @@ "a": 255 }, "rect": { - "r_x0": 717.1685859527342, - "r_y0": 504.8720063438988, - "r_x1": 737.9738558298501, - "r_y1": 504.8720063438988, - "r_x2": 737.9738558298501, - "r_y2": 70.90211702098213, - "r_x3": 717.1685859527342, - "r_y3": 70.90211702098213, + "r_x0": 717.168585936602, + "r_y0": 504.8720061466397, + "r_x1": 737.9738558137178, + "r_y1": 504.8720061466397, + "r_x2": 737.9738558137178, + "r_y2": 70.90211682372312, + "r_x3": 717.168585936602, + "r_y3": 70.90211682372312, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -65,14 +65,14 @@ "a": 255 }, "rect": { - "r_x0": 691.4680194659409, - "r_y0": 152.80629506011857, - "r_x1": 709.8255850278712, - "r_y1": 152.80629506011857, - "r_x2": 709.8255850278712, - "r_y2": 72.12457305491027, - "r_x3": 691.4680194659409, - "r_y3": 72.12457305491027, + "r_x0": 690.2441821046808, + "r_y0": 152.80629773131633, + "r_x1": 709.8255852011977, + "r_y1": 152.80629773131633, + "r_x2": 709.8255852011977, + "r_y2": 72.124570639845, + "r_x3": 690.2441821046808, + "r_y3": 72.124570639845, "coord_origin": "TOPLEFT" }, "text": "package", @@ -90,10 +90,10 @@ "id": 0, "label": "page_header", "bbox": { - "l": 717.1685859527342, - "t": 70.90211702098213, + "l": 717.168585936602, + "t": 70.90211682372312, "r": 764.8982839673505, - "b": 504.8720063438988, + "b": 504.8720061466397, "coord_origin": "TOPLEFT" }, "confidence": 0.6915205121040344, @@ -132,14 +132,14 @@ "a": 255 }, "rect": { - "r_x0": 717.1685859527342, - "r_y0": 504.8720063438988, - "r_x1": 737.9738558298501, - "r_y1": 504.8720063438988, - "r_x2": 737.9738558298501, - "r_y2": 70.90211702098213, - "r_x3": 717.1685859527342, - "r_y3": 70.90211702098213, + "r_x0": 717.168585936602, + "r_y0": 504.8720061466397, + "r_x1": 737.9738558137178, + "r_y1": 504.8720061466397, + "r_x2": 737.9738558137178, + "r_y2": 70.90211682372312, + "r_x3": 717.168585936602, + "r_y3": 70.90211682372312, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -155,10 +155,10 @@ "id": 8, "label": "text", "bbox": { - "l": 691.4680194659409, - "t": 72.12457305491027, - "r": 709.8255850278712, - "b": 152.80629506011857, + "l": 690.2441821046808, + "t": 72.124570639845, + "r": 709.8255852011977, + "b": 152.80629773131633, "coord_origin": "TOPLEFT" }, "confidence": 1.0, @@ -172,14 +172,14 @@ "a": 255 }, "rect": { - "r_x0": 691.4680194659409, - "r_y0": 152.80629506011857, - "r_x1": 709.8255850278712, - "r_y1": 152.80629506011857, - "r_x2": 709.8255850278712, - "r_y2": 72.12457305491027, - "r_x3": 691.4680194659409, - "r_y3": 72.12457305491027, + "r_x0": 690.2441821046808, + "r_y0": 152.80629773131633, + "r_x1": 709.8255852011977, + "r_y1": 152.80629773131633, + "r_x2": 709.8255852011977, + "r_y2": 72.124570639845, + "r_x3": 690.2441821046808, + "r_y3": 72.124570639845, "coord_origin": "TOPLEFT" }, "text": "package", @@ -210,10 +210,10 @@ "id": 0, "label": "page_header", "bbox": { - "l": 717.1685859527342, - "t": 70.90211702098213, + "l": 717.168585936602, + "t": 70.90211682372312, "r": 764.8982839673505, - "b": 504.8720063438988, + "b": 504.8720061466397, "coord_origin": "TOPLEFT" }, "confidence": 0.6915205121040344, @@ -252,14 +252,14 @@ "a": 255 }, "rect": { - "r_x0": 717.1685859527342, - "r_y0": 504.8720063438988, - "r_x1": 737.9738558298501, - "r_y1": 504.8720063438988, - "r_x2": 737.9738558298501, - "r_y2": 70.90211702098213, - "r_x3": 717.1685859527342, - "r_y3": 70.90211702098213, + "r_x0": 717.168585936602, + "r_y0": 504.8720061466397, + "r_x1": 737.9738558137178, + "r_y1": 504.8720061466397, + "r_x2": 737.9738558137178, + "r_y2": 70.90211682372312, + "r_x3": 717.168585936602, + "r_y3": 70.90211682372312, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -281,10 +281,10 @@ "id": 8, "label": "text", "bbox": { - "l": 691.4680194659409, - "t": 72.12457305491027, - "r": 709.8255850278712, - "b": 152.80629506011857, + "l": 690.2441821046808, + "t": 72.124570639845, + "r": 709.8255852011977, + "b": 152.80629773131633, "coord_origin": "TOPLEFT" }, "confidence": 1.0, @@ -298,14 +298,14 @@ "a": 255 }, "rect": { - "r_x0": 691.4680194659409, - "r_y0": 152.80629506011857, - "r_x1": 709.8255850278712, - "r_y1": 152.80629506011857, - "r_x2": 709.8255850278712, - "r_y2": 72.12457305491027, - "r_x3": 691.4680194659409, - "r_y3": 72.12457305491027, + "r_x0": 690.2441821046808, + "r_y0": 152.80629773131633, + "r_x1": 709.8255852011977, + "r_y1": 152.80629773131633, + "r_x2": 709.8255852011977, + "r_y2": 72.124570639845, + "r_x3": 690.2441821046808, + "r_y3": 72.124570639845, "coord_origin": "TOPLEFT" }, "text": "package", @@ -329,10 +329,10 @@ "id": 8, "label": "text", "bbox": { - "l": 691.4680194659409, - "t": 72.12457305491027, - "r": 709.8255850278712, - "b": 152.80629506011857, + "l": 690.2441821046808, + "t": 72.124570639845, + "r": 709.8255852011977, + "b": 152.80629773131633, "coord_origin": "TOPLEFT" }, "confidence": 1.0, @@ -346,14 +346,14 @@ "a": 255 }, "rect": { - "r_x0": 691.4680194659409, - "r_y0": 152.80629506011857, - "r_x1": 709.8255850278712, - "r_y1": 152.80629506011857, - "r_x2": 709.8255850278712, - "r_y2": 72.12457305491027, - "r_x3": 691.4680194659409, - "r_y3": 72.12457305491027, + "r_x0": 690.2441821046808, + "r_y0": 152.80629773131633, + "r_x1": 709.8255852011977, + "r_y1": 152.80629773131633, + "r_x2": 709.8255852011977, + "r_y2": 72.124570639845, + "r_x3": 690.2441821046808, + "r_y3": 72.124570639845, "coord_origin": "TOPLEFT" }, "text": "package", @@ -377,10 +377,10 @@ "id": 0, "label": "page_header", "bbox": { - "l": 717.1685859527342, - "t": 70.90211702098213, + "l": 717.168585936602, + "t": 70.90211682372312, "r": 764.8982839673505, - "b": 504.8720063438988, + "b": 504.8720061466397, "coord_origin": "TOPLEFT" }, "confidence": 0.6915205121040344, @@ -419,14 +419,14 @@ "a": 255 }, "rect": { - "r_x0": 717.1685859527342, - "r_y0": 504.8720063438988, - "r_x1": 737.9738558298501, - "r_y1": 504.8720063438988, - "r_x2": 737.9738558298501, - "r_y2": 70.90211702098213, - "r_x3": 717.1685859527342, - "r_y3": 70.90211702098213, + "r_x0": 717.168585936602, + "r_y0": 504.8720061466397, + "r_x1": 737.9738558137178, + "r_y1": 504.8720061466397, + "r_x2": 737.9738558137178, + "r_y2": 70.90211682372312, + "r_x3": 717.168585936602, + "r_y3": 70.90211682372312, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", diff --git a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.pages.json b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.pages.json index 7b76b51..b0defdc 100644 --- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.pages.json +++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.pages.json @@ -15,14 +15,14 @@ "a": 255 }, "rect": { - "r_x0": 77.10171546422428, - "r_y0": 520.7638577050515, - "r_x1": 96.6831586150625, - "r_y1": 520.7638577050515, - "r_x2": 96.6831586150625, - "r_y2": 89.23887398109309, - "r_x3": 77.10171546422428, - "r_y3": 89.23887398109309, + "r_x0": 77.10171545548258, + "r_y0": 520.7638571913312, + "r_x1": 96.68315797053792, + "r_y1": 520.7638571913312, + "r_x2": 96.68315797053792, + "r_y2": 89.2388734673729, + "r_x3": 77.10171545548258, + "r_y3": 89.2388734673729, "coord_origin": "TOPLEFT" }, "text": "Docling bundles PDF document conversion to", @@ -40,14 +40,14 @@ "a": 255 }, "rect": { - "r_x0": 100.55299576256091, - "r_y0": 523.3155494272656, - "r_x1": 124.91101654503161, - "r_y1": 523.3155494272656, - "r_x2": 124.91101654503161, - "r_y2": 89.12381765643227, - "r_x3": 100.55299576256091, - "r_y3": 89.12381765643227, + "r_x0": 100.64168123325977, + "r_y0": 523.3236155182395, + "r_x1": 126.08064862014129, + "r_y1": 523.3236155182395, + "r_x2": 126.08064862014129, + "r_y2": 89.1266754140729, + "r_x3": 100.64168123325977, + "r_y3": 89.1266754140729, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -90,10 +90,10 @@ "id": 0, "label": "page_header", "bbox": { - "l": 77.10171546422428, - "t": 89.12381765643227, - "r": 124.91101654503161, - "b": 523.3155494272656, + "l": 77.10171545548258, + "t": 89.1266754140729, + "r": 126.08064862014129, + "b": 523.3236155182395, "coord_origin": "TOPLEFT" }, "confidence": 0.6016772389411926, @@ -107,14 +107,14 @@ "a": 255 }, "rect": { - "r_x0": 77.10171546422428, - "r_y0": 520.7638577050515, - "r_x1": 96.6831586150625, - "r_y1": 520.7638577050515, - "r_x2": 96.6831586150625, - "r_y2": 89.23887398109309, - "r_x3": 77.10171546422428, - "r_y3": 89.23887398109309, + "r_x0": 77.10171545548258, + "r_y0": 520.7638571913312, + "r_x1": 96.68315797053792, + "r_y1": 520.7638571913312, + "r_x2": 96.68315797053792, + "r_y2": 89.2388734673729, + "r_x3": 77.10171545548258, + "r_y3": 89.2388734673729, "coord_origin": "TOPLEFT" }, "text": "Docling bundles PDF document conversion to", @@ -132,14 +132,14 @@ "a": 255 }, "rect": { - "r_x0": 100.55299576256091, - "r_y0": 523.3155494272656, - "r_x1": 124.91101654503161, - "r_y1": 523.3155494272656, - "r_x2": 124.91101654503161, - "r_y2": 89.12381765643227, - "r_x3": 100.55299576256091, - "r_y3": 89.12381765643227, + "r_x0": 100.64168123325977, + "r_y0": 523.3236155182395, + "r_x1": 126.08064862014129, + "r_y1": 523.3236155182395, + "r_x2": 126.08064862014129, + "r_y2": 89.1266754140729, + "r_x3": 100.64168123325977, + "r_y3": 89.1266754140729, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -210,10 +210,10 @@ "id": 0, "label": "page_header", "bbox": { - "l": 77.10171546422428, - "t": 89.12381765643227, - "r": 124.91101654503161, - "b": 523.3155494272656, + "l": 77.10171545548258, + "t": 89.1266754140729, + "r": 126.08064862014129, + "b": 523.3236155182395, "coord_origin": "TOPLEFT" }, "confidence": 0.6016772389411926, @@ -227,14 +227,14 @@ "a": 255 }, "rect": { - "r_x0": 77.10171546422428, - "r_y0": 520.7638577050515, - "r_x1": 96.6831586150625, - "r_y1": 520.7638577050515, - "r_x2": 96.6831586150625, - "r_y2": 89.23887398109309, - "r_x3": 77.10171546422428, - "r_y3": 89.23887398109309, + "r_x0": 77.10171545548258, + "r_y0": 520.7638571913312, + "r_x1": 96.68315797053792, + "r_y1": 520.7638571913312, + "r_x2": 96.68315797053792, + "r_y2": 89.2388734673729, + "r_x3": 77.10171545548258, + "r_y3": 89.2388734673729, "coord_origin": "TOPLEFT" }, "text": "Docling bundles PDF document conversion to", @@ -252,14 +252,14 @@ "a": 255 }, "rect": { - "r_x0": 100.55299576256091, - "r_y0": 523.3155494272656, - "r_x1": 124.91101654503161, - "r_y1": 523.3155494272656, - "r_x2": 124.91101654503161, - "r_y2": 89.12381765643227, - "r_x3": 100.55299576256091, - "r_y3": 89.12381765643227, + "r_x0": 100.64168123325977, + "r_y0": 523.3236155182395, + "r_x1": 126.08064862014129, + "r_y1": 523.3236155182395, + "r_x2": 126.08064862014129, + "r_y2": 89.1266754140729, + "r_x3": 100.64168123325977, + "r_y3": 89.1266754140729, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -377,10 +377,10 @@ "id": 0, "label": "page_header", "bbox": { - "l": 77.10171546422428, - "t": 89.12381765643227, - "r": 124.91101654503161, - "b": 523.3155494272656, + "l": 77.10171545548258, + "t": 89.1266754140729, + "r": 126.08064862014129, + "b": 523.3236155182395, "coord_origin": "TOPLEFT" }, "confidence": 0.6016772389411926, @@ -394,14 +394,14 @@ "a": 255 }, "rect": { - "r_x0": 77.10171546422428, - "r_y0": 520.7638577050515, - "r_x1": 96.6831586150625, - "r_y1": 520.7638577050515, - "r_x2": 96.6831586150625, - "r_y2": 89.23887398109309, - "r_x3": 77.10171546422428, - "r_y3": 89.23887398109309, + "r_x0": 77.10171545548258, + "r_y0": 520.7638571913312, + "r_x1": 96.68315797053792, + "r_y1": 520.7638571913312, + "r_x2": 96.68315797053792, + "r_y2": 89.2388734673729, + "r_x3": 77.10171545548258, + "r_y3": 89.2388734673729, "coord_origin": "TOPLEFT" }, "text": "Docling bundles PDF document conversion to", @@ -419,14 +419,14 @@ "a": 255 }, "rect": { - "r_x0": 100.55299576256091, - "r_y0": 523.3155494272656, - "r_x1": 124.91101654503161, - "r_y1": 523.3155494272656, - "r_x2": 124.91101654503161, - "r_y2": 89.12381765643227, - "r_x3": 100.55299576256091, - "r_y3": 89.12381765643227, + "r_x0": 100.64168123325977, + "r_y0": 523.3236155182395, + "r_x1": 126.08064862014129, + "r_y1": 523.3236155182395, + "r_x2": 126.08064862014129, + "r_y2": 89.1266754140729, + "r_x3": 100.64168123325977, + "r_y3": 89.1266754140729, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test.doctags.txt b/tests/data_scanned/groundtruth/docling_v2/ocr_test.doctags.txt index 76fe886..c210e4d 100644 --- a/tests/data_scanned/groundtruth/docling_v2/ocr_test.doctags.txt +++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test.doctags.txt @@ -1,2 +1,2 @@ -Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package +Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test.json b/tests/data_scanned/groundtruth/docling_v2/ocr_test.json index 2ffbc91..e08af9c 100644 --- a/tests/data_scanned/groundtruth/docling_v2/ocr_test.json +++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test.json @@ -42,10 +42,10 @@ { "page_no": 1, "bbox": { - "l": 70.90211866351085, + "l": 69.6796630536824, "t": 764.9216921155637, - "r": 504.8720079864275, - "b": 689.216658542347, + "r": 504.8720051760782, + "b": 689.0124221922704, "coord_origin": "BOTTOMLEFT" }, "charspan": [ diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test.pages.json b/tests/data_scanned/groundtruth/docling_v2/ocr_test.pages.json index 10d2f99..f5c2816 100644 --- a/tests/data_scanned/groundtruth/docling_v2/ocr_test.pages.json +++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test.pages.json @@ -40,14 +40,14 @@ "a": 255 }, "rect": { - "r_x0": 70.90211866351085, - "r_y0": 124.83139551297342, - "r_x1": 504.8720079864275, - "r_y1": 124.83139551297342, - "r_x2": 504.8720079864275, - "r_y2": 102.66666671251768, - "r_x3": 70.90211866351085, - "r_y3": 102.66666671251768, + "r_x0": 69.6796630536824, + "r_y0": 124.83139494707741, + "r_x1": 504.8720051760782, + "r_y1": 124.83139494707741, + "r_x2": 504.8720051760782, + "r_y2": 104.00000011573796, + "r_x3": 69.6796630536824, + "r_y3": 104.00000011573796, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -65,14 +65,14 @@ "a": 255 }, "rect": { - "r_x0": 73.10852522817731, - "r_y0": 152.70503335218433, - "r_x1": 153.04479435252625, - "r_y1": 152.70503335218433, - "r_x2": 153.04479435252625, - "r_y2": 130.00136157890958, - "r_x3": 73.10852522817731, - "r_y3": 130.00136157890958, + "r_x0": 71.84193505100733, + "r_y0": 152.90926970226084, + "r_x1": 153.088934155825, + "r_y1": 152.90926970226084, + "r_x2": 153.088934155825, + "r_y2": 129.797125232046, + "r_x3": 71.84193505100733, + "r_y3": 129.797125232046, "coord_origin": "TOPLEFT" }, "text": "package", @@ -90,10 +90,10 @@ "id": 0, "label": "text", "bbox": { - "l": 70.90211866351085, + "l": 69.6796630536824, "t": 76.99999977896756, - "r": 504.8720079864275, - "b": 152.70503335218433, + "r": 504.8720051760782, + "b": 152.90926970226084, "coord_origin": "TOPLEFT" }, "confidence": 0.9715733528137207, @@ -132,14 +132,14 @@ "a": 255 }, "rect": { - "r_x0": 70.90211866351085, - "r_y0": 124.83139551297342, - "r_x1": 504.8720079864275, - "r_y1": 124.83139551297342, - "r_x2": 504.8720079864275, - "r_y2": 102.66666671251768, - "r_x3": 70.90211866351085, - "r_y3": 102.66666671251768, + "r_x0": 69.6796630536824, + "r_y0": 124.83139494707741, + "r_x1": 504.8720051760782, + "r_y1": 124.83139494707741, + "r_x2": 504.8720051760782, + "r_y2": 104.00000011573796, + "r_x3": 69.6796630536824, + "r_y3": 104.00000011573796, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -157,14 +157,14 @@ "a": 255 }, "rect": { - "r_x0": 73.10852522817731, - "r_y0": 152.70503335218433, - "r_x1": 153.04479435252625, - "r_y1": 152.70503335218433, - "r_x2": 153.04479435252625, - "r_y2": 130.00136157890958, - "r_x3": 73.10852522817731, - "r_y3": 130.00136157890958, + "r_x0": 71.84193505100733, + "r_y0": 152.90926970226084, + "r_x1": 153.088934155825, + "r_y1": 152.90926970226084, + "r_x2": 153.088934155825, + "r_y2": 129.797125232046, + "r_x3": 71.84193505100733, + "r_y3": 129.797125232046, "coord_origin": "TOPLEFT" }, "text": "package", @@ -195,10 +195,10 @@ "id": 0, "label": "text", "bbox": { - "l": 70.90211866351085, + "l": 69.6796630536824, "t": 76.99999977896756, - "r": 504.8720079864275, - "b": 152.70503335218433, + "r": 504.8720051760782, + "b": 152.90926970226084, "coord_origin": "TOPLEFT" }, "confidence": 0.9715733528137207, @@ -237,14 +237,14 @@ "a": 255 }, "rect": { - "r_x0": 70.90211866351085, - "r_y0": 124.83139551297342, - "r_x1": 504.8720079864275, - "r_y1": 124.83139551297342, - "r_x2": 504.8720079864275, - "r_y2": 102.66666671251768, - "r_x3": 70.90211866351085, - "r_y3": 102.66666671251768, + "r_x0": 69.6796630536824, + "r_y0": 124.83139494707741, + "r_x1": 504.8720051760782, + "r_y1": 124.83139494707741, + "r_x2": 504.8720051760782, + "r_y2": 104.00000011573796, + "r_x3": 69.6796630536824, + "r_y3": 104.00000011573796, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -262,14 +262,14 @@ "a": 255 }, "rect": { - "r_x0": 73.10852522817731, - "r_y0": 152.70503335218433, - "r_x1": 153.04479435252625, - "r_y1": 152.70503335218433, - "r_x2": 153.04479435252625, - "r_y2": 130.00136157890958, - "r_x3": 73.10852522817731, - "r_y3": 130.00136157890958, + "r_x0": 71.84193505100733, + "r_y0": 152.90926970226084, + "r_x1": 153.088934155825, + "r_y1": 152.90926970226084, + "r_x2": 153.088934155825, + "r_y2": 129.797125232046, + "r_x3": 71.84193505100733, + "r_y3": 129.797125232046, "coord_origin": "TOPLEFT" }, "text": "package", @@ -293,10 +293,10 @@ "id": 0, "label": "text", "bbox": { - "l": 70.90211866351085, + "l": 69.6796630536824, "t": 76.99999977896756, - "r": 504.8720079864275, - "b": 152.70503335218433, + "r": 504.8720051760782, + "b": 152.90926970226084, "coord_origin": "TOPLEFT" }, "confidence": 0.9715733528137207, @@ -335,14 +335,14 @@ "a": 255 }, "rect": { - "r_x0": 70.90211866351085, - "r_y0": 124.83139551297342, - "r_x1": 504.8720079864275, - "r_y1": 124.83139551297342, - "r_x2": 504.8720079864275, - "r_y2": 102.66666671251768, - "r_x3": 70.90211866351085, - "r_y3": 102.66666671251768, + "r_x0": 69.6796630536824, + "r_y0": 124.83139494707741, + "r_x1": 504.8720051760782, + "r_y1": 124.83139494707741, + "r_x2": 504.8720051760782, + "r_y2": 104.00000011573796, + "r_x3": 69.6796630536824, + "r_y3": 104.00000011573796, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -360,14 +360,14 @@ "a": 255 }, "rect": { - "r_x0": 73.10852522817731, - "r_y0": 152.70503335218433, - "r_x1": 153.04479435252625, - "r_y1": 152.70503335218433, - "r_x2": 153.04479435252625, - "r_y2": 130.00136157890958, - "r_x3": 73.10852522817731, - "r_y3": 130.00136157890958, + "r_x0": 71.84193505100733, + "r_y0": 152.90926970226084, + "r_x1": 153.088934155825, + "r_y1": 152.90926970226084, + "r_x2": 153.088934155825, + "r_y2": 129.797125232046, + "r_x3": 71.84193505100733, + "r_y3": 129.797125232046, "coord_origin": "TOPLEFT" }, "text": "package", diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.doctags.txt b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.doctags.txt index da0deb0..405aa96 100644 --- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.doctags.txt +++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.doctags.txt @@ -1,3 +1,3 @@ -package +package Docling bundles PDF document conversion to JSON and Markdown in an easy self contained \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.json b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.json index 9306f8b..835b1c7 100644 --- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.json +++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.json @@ -45,10 +45,10 @@ { "page_no": 1, "bbox": { - "l": 441.304584329099, - "t": 151.67751306395223, - "r": 521.9863114205704, - "b": 132.09610360960653, + "l": 441.2561096985719, + "t": 151.87873262042876, + "r": 522.0347860494834, + "b": 131.89488404865142, "coord_origin": "BOTTOMLEFT" }, "charspan": [ @@ -74,10 +74,10 @@ { "page_no": 1, "bbox": { - "l": 89.12133215549848, - "t": 124.86176457554109, - "r": 523.3501733013318, - "b": 77.02339849621205, + "l": 89.23887497045128, + "t": 124.75312428291147, + "r": 523.208764293368, + "b": 77.02339852098021, "coord_origin": "BOTTOMLEFT" }, "charspan": [ diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.pages.json b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.pages.json index 1141247..23614fb 100644 --- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.pages.json +++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.pages.json @@ -15,14 +15,14 @@ "a": 255 }, "rect": { - "r_x0": 90.46133071208328, - "r_y0": 764.8982933983192, - "r_x1": 520.7638616365624, - "r_y1": 764.8982933983192, - "r_x2": 520.7638616365624, - "r_y2": 744.0929853742306, - "r_x3": 90.46133071208328, - "r_y3": 744.0929853742306, + "r_x0": 89.2388782764286, + "r_y0": 764.898293373551, + "r_x1": 521.9863147998661, + "r_y1": 764.898293373551, + "r_x2": 521.9863147998661, + "r_y2": 744.0929853494625, + "r_x3": 89.2388782764286, + "r_y3": 744.0929853494625, "coord_origin": "TOPLEFT" }, "text": "Docling bundles PDF document conversion to", @@ -40,14 +40,14 @@ "a": 255 }, "rect": { - "r_x0": 89.12133215549848, - "r_y0": 741.5247710689902, - "r_x1": 523.3501733013318, - "r_y1": 741.5247710689902, - "r_x2": 523.3501733013318, - "r_y2": 717.0599273189902, - "r_x3": 89.12133215549848, - "r_y3": 717.0599273189902, + "r_x0": 89.23887497045128, + "r_y0": 739.1977118987292, + "r_x1": 523.208764293368, + "r_y1": 739.1977118987292, + "r_x2": 523.208764293368, + "r_y2": 717.1685676116198, + "r_x3": 89.23887497045128, + "r_y3": 717.1685676116198, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -65,14 +65,14 @@ "a": 255 }, "rect": { - "r_x0": 441.304584329099, - "r_y0": 709.8255882849247, - "r_x1": 521.9863114205704, - "r_y1": 709.8255882849247, - "r_x2": 521.9863114205704, - "r_y2": 690.244178830579, - "r_x3": 441.304584329099, - "r_y3": 690.244178830579, + "r_x0": 441.2561096985719, + "r_y0": 710.0268078458798, + "r_x1": 522.0347860494834, + "r_y1": 710.0268078458798, + "r_x2": 522.0347860494834, + "r_y2": 690.0429592741025, + "r_x3": 441.2561096985719, + "r_y3": 690.0429592741025, "coord_origin": "TOPLEFT" }, "text": "package", @@ -90,10 +90,10 @@ "id": 0, "label": "text", "bbox": { - "l": 89.12133215549848, - "t": 717.0599273189902, - "r": 523.3501733013318, - "b": 764.8982933983192, + "l": 89.23887497045128, + "t": 717.1685676116198, + "r": 523.208764293368, + "b": 764.898293373551, "coord_origin": "TOPLEFT" }, "confidence": 0.7318570613861084, @@ -107,14 +107,14 @@ "a": 255 }, "rect": { - "r_x0": 90.46133071208328, - "r_y0": 764.8982933983192, - "r_x1": 520.7638616365624, - "r_y1": 764.8982933983192, - "r_x2": 520.7638616365624, - "r_y2": 744.0929853742306, - "r_x3": 90.46133071208328, - "r_y3": 744.0929853742306, + "r_x0": 89.2388782764286, + "r_y0": 764.898293373551, + "r_x1": 521.9863147998661, + "r_y1": 764.898293373551, + "r_x2": 521.9863147998661, + "r_y2": 744.0929853494625, + "r_x3": 89.2388782764286, + "r_y3": 744.0929853494625, "coord_origin": "TOPLEFT" }, "text": "Docling bundles PDF document conversion to", @@ -132,14 +132,14 @@ "a": 255 }, "rect": { - "r_x0": 89.12133215549848, - "r_y0": 741.5247710689902, - "r_x1": 523.3501733013318, - "r_y1": 741.5247710689902, - "r_x2": 523.3501733013318, - "r_y2": 717.0599273189902, - "r_x3": 89.12133215549848, - "r_y3": 717.0599273189902, + "r_x0": 89.23887497045128, + "r_y0": 739.1977118987292, + "r_x1": 523.208764293368, + "r_y1": 739.1977118987292, + "r_x2": 523.208764293368, + "r_y2": 717.1685676116198, + "r_x3": 89.23887497045128, + "r_y3": 717.1685676116198, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -155,10 +155,10 @@ "id": 2, "label": "text", "bbox": { - "l": 441.304584329099, - "t": 690.244178830579, - "r": 521.9863114205704, - "b": 709.8255882849247, + "l": 441.2561096985719, + "t": 690.0429592741025, + "r": 522.0347860494834, + "b": 710.0268078458798, "coord_origin": "TOPLEFT" }, "confidence": 0.5982133150100708, @@ -172,14 +172,14 @@ "a": 255 }, "rect": { - "r_x0": 441.304584329099, - "r_y0": 709.8255882849247, - "r_x1": 521.9863114205704, - "r_y1": 709.8255882849247, - "r_x2": 521.9863114205704, - "r_y2": 690.244178830579, - "r_x3": 441.304584329099, - "r_y3": 690.244178830579, + "r_x0": 441.2561096985719, + "r_y0": 710.0268078458798, + "r_x1": 522.0347860494834, + "r_y1": 710.0268078458798, + "r_x2": 522.0347860494834, + "r_y2": 690.0429592741025, + "r_x3": 441.2561096985719, + "r_y3": 690.0429592741025, "coord_origin": "TOPLEFT" }, "text": "package", @@ -210,10 +210,10 @@ "id": 0, "label": "text", "bbox": { - "l": 89.12133215549848, - "t": 717.0599273189902, - "r": 523.3501733013318, - "b": 764.8982933983192, + "l": 89.23887497045128, + "t": 717.1685676116198, + "r": 523.208764293368, + "b": 764.898293373551, "coord_origin": "TOPLEFT" }, "confidence": 0.7318570613861084, @@ -227,14 +227,14 @@ "a": 255 }, "rect": { - "r_x0": 90.46133071208328, - "r_y0": 764.8982933983192, - "r_x1": 520.7638616365624, - "r_y1": 764.8982933983192, - "r_x2": 520.7638616365624, - "r_y2": 744.0929853742306, - "r_x3": 90.46133071208328, - "r_y3": 744.0929853742306, + "r_x0": 89.2388782764286, + "r_y0": 764.898293373551, + "r_x1": 521.9863147998661, + "r_y1": 764.898293373551, + "r_x2": 521.9863147998661, + "r_y2": 744.0929853494625, + "r_x3": 89.2388782764286, + "r_y3": 744.0929853494625, "coord_origin": "TOPLEFT" }, "text": "Docling bundles PDF document conversion to", @@ -252,14 +252,14 @@ "a": 255 }, "rect": { - "r_x0": 89.12133215549848, - "r_y0": 741.5247710689902, - "r_x1": 523.3501733013318, - "r_y1": 741.5247710689902, - "r_x2": 523.3501733013318, - "r_y2": 717.0599273189902, - "r_x3": 89.12133215549848, - "r_y3": 717.0599273189902, + "r_x0": 89.23887497045128, + "r_y0": 739.1977118987292, + "r_x1": 523.208764293368, + "r_y1": 739.1977118987292, + "r_x2": 523.208764293368, + "r_y2": 717.1685676116198, + "r_x3": 89.23887497045128, + "r_y3": 717.1685676116198, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -281,10 +281,10 @@ "id": 2, "label": "text", "bbox": { - "l": 441.304584329099, - "t": 690.244178830579, - "r": 521.9863114205704, - "b": 709.8255882849247, + "l": 441.2561096985719, + "t": 690.0429592741025, + "r": 522.0347860494834, + "b": 710.0268078458798, "coord_origin": "TOPLEFT" }, "confidence": 0.5982133150100708, @@ -298,14 +298,14 @@ "a": 255 }, "rect": { - "r_x0": 441.304584329099, - "r_y0": 709.8255882849247, - "r_x1": 521.9863114205704, - "r_y1": 709.8255882849247, - "r_x2": 521.9863114205704, - "r_y2": 690.244178830579, - "r_x3": 441.304584329099, - "r_y3": 690.244178830579, + "r_x0": 441.2561096985719, + "r_y0": 710.0268078458798, + "r_x1": 522.0347860494834, + "r_y1": 710.0268078458798, + "r_x2": 522.0347860494834, + "r_y2": 690.0429592741025, + "r_x3": 441.2561096985719, + "r_y3": 690.0429592741025, "coord_origin": "TOPLEFT" }, "text": "package", @@ -329,10 +329,10 @@ "id": 0, "label": "text", "bbox": { - "l": 89.12133215549848, - "t": 717.0599273189902, - "r": 523.3501733013318, - "b": 764.8982933983192, + "l": 89.23887497045128, + "t": 717.1685676116198, + "r": 523.208764293368, + "b": 764.898293373551, "coord_origin": "TOPLEFT" }, "confidence": 0.7318570613861084, @@ -346,14 +346,14 @@ "a": 255 }, "rect": { - "r_x0": 90.46133071208328, - "r_y0": 764.8982933983192, - "r_x1": 520.7638616365624, - "r_y1": 764.8982933983192, - "r_x2": 520.7638616365624, - "r_y2": 744.0929853742306, - "r_x3": 90.46133071208328, - "r_y3": 744.0929853742306, + "r_x0": 89.2388782764286, + "r_y0": 764.898293373551, + "r_x1": 521.9863147998661, + "r_y1": 764.898293373551, + "r_x2": 521.9863147998661, + "r_y2": 744.0929853494625, + "r_x3": 89.2388782764286, + "r_y3": 744.0929853494625, "coord_origin": "TOPLEFT" }, "text": "Docling bundles PDF document conversion to", @@ -371,14 +371,14 @@ "a": 255 }, "rect": { - "r_x0": 89.12133215549848, - "r_y0": 741.5247710689902, - "r_x1": 523.3501733013318, - "r_y1": 741.5247710689902, - "r_x2": 523.3501733013318, - "r_y2": 717.0599273189902, - "r_x3": 89.12133215549848, - "r_y3": 717.0599273189902, + "r_x0": 89.23887497045128, + "r_y0": 739.1977118987292, + "r_x1": 523.208764293368, + "r_y1": 739.1977118987292, + "r_x2": 523.208764293368, + "r_y2": 717.1685676116198, + "r_x3": 89.23887497045128, + "r_y3": 717.1685676116198, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -400,10 +400,10 @@ "id": 2, "label": "text", "bbox": { - "l": 441.304584329099, - "t": 690.244178830579, - "r": 521.9863114205704, - "b": 709.8255882849247, + "l": 441.2561096985719, + "t": 690.0429592741025, + "r": 522.0347860494834, + "b": 710.0268078458798, "coord_origin": "TOPLEFT" }, "confidence": 0.5982133150100708, @@ -417,14 +417,14 @@ "a": 255 }, "rect": { - "r_x0": 441.304584329099, - "r_y0": 709.8255882849247, - "r_x1": 521.9863114205704, - "r_y1": 709.8255882849247, - "r_x2": 521.9863114205704, - "r_y2": 690.244178830579, - "r_x3": 441.304584329099, - "r_y3": 690.244178830579, + "r_x0": 441.2561096985719, + "r_y0": 710.0268078458798, + "r_x1": 522.0347860494834, + "r_y1": 710.0268078458798, + "r_x2": 522.0347860494834, + "r_y2": 690.0429592741025, + "r_x3": 441.2561096985719, + "r_y3": 690.0429592741025, "coord_origin": "TOPLEFT" }, "text": "package", diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.doctags.txt b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.doctags.txt index 95999c0..70ee51c 100644 --- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.doctags.txt +++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.doctags.txt @@ -1,3 +1,3 @@ Docling bundles PDF document conversion to JSON and Markdown in an easy self contained -package +package \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.json b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.json index ac8fa44..69a028d 100644 --- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.json +++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.json @@ -45,10 +45,10 @@ { "page_no": 1, "bbox": { - "l": 717.1685859527342, - "t": 524.2990548540179, + "l": 717.168585936602, + "t": 524.2990550512769, "r": 764.8982839673505, - "b": 90.32916553110118, + "b": 90.3291657283603, "coord_origin": "BOTTOMLEFT" }, "charspan": [ @@ -74,10 +74,10 @@ { "page_no": 1, "bbox": { - "l": 691.4680194659409, - "t": 523.0765988200898, - "r": 709.8255850278712, - "b": 442.3948768148814, + "l": 690.2441821046808, + "t": 523.076601235155, + "r": 709.8255852011977, + "b": 442.39487414368364, "coord_origin": "BOTTOMLEFT" }, "charspan": [ diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.pages.json b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.pages.json index dac1284..69af79a 100644 --- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.pages.json +++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.pages.json @@ -40,14 +40,14 @@ "a": 255 }, "rect": { - "r_x0": 717.1685859527342, - "r_y0": 504.8720063438988, - "r_x1": 737.9738558298501, - "r_y1": 504.8720063438988, - "r_x2": 737.9738558298501, - "r_y2": 70.90211702098213, - "r_x3": 717.1685859527342, - "r_y3": 70.90211702098213, + "r_x0": 717.168585936602, + "r_y0": 504.8720061466397, + "r_x1": 737.9738558137178, + "r_y1": 504.8720061466397, + "r_x2": 737.9738558137178, + "r_y2": 70.90211682372312, + "r_x3": 717.168585936602, + "r_y3": 70.90211682372312, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -65,14 +65,14 @@ "a": 255 }, "rect": { - "r_x0": 691.4680194659409, - "r_y0": 152.80629506011857, - "r_x1": 709.8255850278712, - "r_y1": 152.80629506011857, - "r_x2": 709.8255850278712, - "r_y2": 72.12457305491027, - "r_x3": 691.4680194659409, - "r_y3": 72.12457305491027, + "r_x0": 690.2441821046808, + "r_y0": 152.80629773131633, + "r_x1": 709.8255852011977, + "r_y1": 152.80629773131633, + "r_x2": 709.8255852011977, + "r_y2": 72.124570639845, + "r_x3": 690.2441821046808, + "r_y3": 72.124570639845, "coord_origin": "TOPLEFT" }, "text": "package", @@ -90,10 +90,10 @@ "id": 0, "label": "page_header", "bbox": { - "l": 717.1685859527342, - "t": 70.90211702098213, + "l": 717.168585936602, + "t": 70.90211682372312, "r": 764.8982839673505, - "b": 504.8720063438988, + "b": 504.8720061466397, "coord_origin": "TOPLEFT" }, "confidence": 0.6915205121040344, @@ -132,14 +132,14 @@ "a": 255 }, "rect": { - "r_x0": 717.1685859527342, - "r_y0": 504.8720063438988, - "r_x1": 737.9738558298501, - "r_y1": 504.8720063438988, - "r_x2": 737.9738558298501, - "r_y2": 70.90211702098213, - "r_x3": 717.1685859527342, - "r_y3": 70.90211702098213, + "r_x0": 717.168585936602, + "r_y0": 504.8720061466397, + "r_x1": 737.9738558137178, + "r_y1": 504.8720061466397, + "r_x2": 737.9738558137178, + "r_y2": 70.90211682372312, + "r_x3": 717.168585936602, + "r_y3": 70.90211682372312, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -155,10 +155,10 @@ "id": 8, "label": "text", "bbox": { - "l": 691.4680194659409, - "t": 72.12457305491027, - "r": 709.8255850278712, - "b": 152.80629506011857, + "l": 690.2441821046808, + "t": 72.124570639845, + "r": 709.8255852011977, + "b": 152.80629773131633, "coord_origin": "TOPLEFT" }, "confidence": 1.0, @@ -172,14 +172,14 @@ "a": 255 }, "rect": { - "r_x0": 691.4680194659409, - "r_y0": 152.80629506011857, - "r_x1": 709.8255850278712, - "r_y1": 152.80629506011857, - "r_x2": 709.8255850278712, - "r_y2": 72.12457305491027, - "r_x3": 691.4680194659409, - "r_y3": 72.12457305491027, + "r_x0": 690.2441821046808, + "r_y0": 152.80629773131633, + "r_x1": 709.8255852011977, + "r_y1": 152.80629773131633, + "r_x2": 709.8255852011977, + "r_y2": 72.124570639845, + "r_x3": 690.2441821046808, + "r_y3": 72.124570639845, "coord_origin": "TOPLEFT" }, "text": "package", @@ -210,10 +210,10 @@ "id": 0, "label": "page_header", "bbox": { - "l": 717.1685859527342, - "t": 70.90211702098213, + "l": 717.168585936602, + "t": 70.90211682372312, "r": 764.8982839673505, - "b": 504.8720063438988, + "b": 504.8720061466397, "coord_origin": "TOPLEFT" }, "confidence": 0.6915205121040344, @@ -252,14 +252,14 @@ "a": 255 }, "rect": { - "r_x0": 717.1685859527342, - "r_y0": 504.8720063438988, - "r_x1": 737.9738558298501, - "r_y1": 504.8720063438988, - "r_x2": 737.9738558298501, - "r_y2": 70.90211702098213, - "r_x3": 717.1685859527342, - "r_y3": 70.90211702098213, + "r_x0": 717.168585936602, + "r_y0": 504.8720061466397, + "r_x1": 737.9738558137178, + "r_y1": 504.8720061466397, + "r_x2": 737.9738558137178, + "r_y2": 70.90211682372312, + "r_x3": 717.168585936602, + "r_y3": 70.90211682372312, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -281,10 +281,10 @@ "id": 8, "label": "text", "bbox": { - "l": 691.4680194659409, - "t": 72.12457305491027, - "r": 709.8255850278712, - "b": 152.80629506011857, + "l": 690.2441821046808, + "t": 72.124570639845, + "r": 709.8255852011977, + "b": 152.80629773131633, "coord_origin": "TOPLEFT" }, "confidence": 1.0, @@ -298,14 +298,14 @@ "a": 255 }, "rect": { - "r_x0": 691.4680194659409, - "r_y0": 152.80629506011857, - "r_x1": 709.8255850278712, - "r_y1": 152.80629506011857, - "r_x2": 709.8255850278712, - "r_y2": 72.12457305491027, - "r_x3": 691.4680194659409, - "r_y3": 72.12457305491027, + "r_x0": 690.2441821046808, + "r_y0": 152.80629773131633, + "r_x1": 709.8255852011977, + "r_y1": 152.80629773131633, + "r_x2": 709.8255852011977, + "r_y2": 72.124570639845, + "r_x3": 690.2441821046808, + "r_y3": 72.124570639845, "coord_origin": "TOPLEFT" }, "text": "package", @@ -329,10 +329,10 @@ "id": 8, "label": "text", "bbox": { - "l": 691.4680194659409, - "t": 72.12457305491027, - "r": 709.8255850278712, - "b": 152.80629506011857, + "l": 690.2441821046808, + "t": 72.124570639845, + "r": 709.8255852011977, + "b": 152.80629773131633, "coord_origin": "TOPLEFT" }, "confidence": 1.0, @@ -346,14 +346,14 @@ "a": 255 }, "rect": { - "r_x0": 691.4680194659409, - "r_y0": 152.80629506011857, - "r_x1": 709.8255850278712, - "r_y1": 152.80629506011857, - "r_x2": 709.8255850278712, - "r_y2": 72.12457305491027, - "r_x3": 691.4680194659409, - "r_y3": 72.12457305491027, + "r_x0": 690.2441821046808, + "r_y0": 152.80629773131633, + "r_x1": 709.8255852011977, + "r_y1": 152.80629773131633, + "r_x2": 709.8255852011977, + "r_y2": 72.124570639845, + "r_x3": 690.2441821046808, + "r_y3": 72.124570639845, "coord_origin": "TOPLEFT" }, "text": "package", @@ -377,10 +377,10 @@ "id": 0, "label": "page_header", "bbox": { - "l": 717.1685859527342, - "t": 70.90211702098213, + "l": 717.168585936602, + "t": 70.90211682372312, "r": 764.8982839673505, - "b": 504.8720063438988, + "b": 504.8720061466397, "coord_origin": "TOPLEFT" }, "confidence": 0.6915205121040344, @@ -419,14 +419,14 @@ "a": 255 }, "rect": { - "r_x0": 717.1685859527342, - "r_y0": 504.8720063438988, - "r_x1": 737.9738558298501, - "r_y1": 504.8720063438988, - "r_x2": 737.9738558298501, - "r_y2": 70.90211702098213, - "r_x3": 717.1685859527342, - "r_y3": 70.90211702098213, + "r_x0": 717.168585936602, + "r_y0": 504.8720061466397, + "r_x1": 737.9738558137178, + "r_y1": 504.8720061466397, + "r_x2": 737.9738558137178, + "r_y2": 70.90211682372312, + "r_x3": 717.168585936602, + "r_y3": 70.90211682372312, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.doctags.txt b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.doctags.txt index c99f4b1..d8b8721 100644 --- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.doctags.txt +++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.doctags.txt @@ -1,3 +1,3 @@ -Docling bundles PDF document conversion to JSON and Markdown in an easy self contained +Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package \ No newline at end of file diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.json b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.json index bb34723..94dc806 100644 --- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.json +++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.json @@ -45,10 +45,10 @@ { "page_no": 1, "bbox": { - "l": 77.10171546422428, - "t": 506.07735421856773, - "r": 124.91101654503161, - "b": 71.88562244773436, + "l": 77.10171545548258, + "t": 506.0744964609271, + "r": 126.08064862014129, + "b": 71.87755635676046, "coord_origin": "BOTTOMLEFT" }, "charspan": [ diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.pages.json b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.pages.json index 7b76b51..b0defdc 100644 --- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.pages.json +++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.pages.json @@ -15,14 +15,14 @@ "a": 255 }, "rect": { - "r_x0": 77.10171546422428, - "r_y0": 520.7638577050515, - "r_x1": 96.6831586150625, - "r_y1": 520.7638577050515, - "r_x2": 96.6831586150625, - "r_y2": 89.23887398109309, - "r_x3": 77.10171546422428, - "r_y3": 89.23887398109309, + "r_x0": 77.10171545548258, + "r_y0": 520.7638571913312, + "r_x1": 96.68315797053792, + "r_y1": 520.7638571913312, + "r_x2": 96.68315797053792, + "r_y2": 89.2388734673729, + "r_x3": 77.10171545548258, + "r_y3": 89.2388734673729, "coord_origin": "TOPLEFT" }, "text": "Docling bundles PDF document conversion to", @@ -40,14 +40,14 @@ "a": 255 }, "rect": { - "r_x0": 100.55299576256091, - "r_y0": 523.3155494272656, - "r_x1": 124.91101654503161, - "r_y1": 523.3155494272656, - "r_x2": 124.91101654503161, - "r_y2": 89.12381765643227, - "r_x3": 100.55299576256091, - "r_y3": 89.12381765643227, + "r_x0": 100.64168123325977, + "r_y0": 523.3236155182395, + "r_x1": 126.08064862014129, + "r_y1": 523.3236155182395, + "r_x2": 126.08064862014129, + "r_y2": 89.1266754140729, + "r_x3": 100.64168123325977, + "r_y3": 89.1266754140729, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -90,10 +90,10 @@ "id": 0, "label": "page_header", "bbox": { - "l": 77.10171546422428, - "t": 89.12381765643227, - "r": 124.91101654503161, - "b": 523.3155494272656, + "l": 77.10171545548258, + "t": 89.1266754140729, + "r": 126.08064862014129, + "b": 523.3236155182395, "coord_origin": "TOPLEFT" }, "confidence": 0.6016772389411926, @@ -107,14 +107,14 @@ "a": 255 }, "rect": { - "r_x0": 77.10171546422428, - "r_y0": 520.7638577050515, - "r_x1": 96.6831586150625, - "r_y1": 520.7638577050515, - "r_x2": 96.6831586150625, - "r_y2": 89.23887398109309, - "r_x3": 77.10171546422428, - "r_y3": 89.23887398109309, + "r_x0": 77.10171545548258, + "r_y0": 520.7638571913312, + "r_x1": 96.68315797053792, + "r_y1": 520.7638571913312, + "r_x2": 96.68315797053792, + "r_y2": 89.2388734673729, + "r_x3": 77.10171545548258, + "r_y3": 89.2388734673729, "coord_origin": "TOPLEFT" }, "text": "Docling bundles PDF document conversion to", @@ -132,14 +132,14 @@ "a": 255 }, "rect": { - "r_x0": 100.55299576256091, - "r_y0": 523.3155494272656, - "r_x1": 124.91101654503161, - "r_y1": 523.3155494272656, - "r_x2": 124.91101654503161, - "r_y2": 89.12381765643227, - "r_x3": 100.55299576256091, - "r_y3": 89.12381765643227, + "r_x0": 100.64168123325977, + "r_y0": 523.3236155182395, + "r_x1": 126.08064862014129, + "r_y1": 523.3236155182395, + "r_x2": 126.08064862014129, + "r_y2": 89.1266754140729, + "r_x3": 100.64168123325977, + "r_y3": 89.1266754140729, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -210,10 +210,10 @@ "id": 0, "label": "page_header", "bbox": { - "l": 77.10171546422428, - "t": 89.12381765643227, - "r": 124.91101654503161, - "b": 523.3155494272656, + "l": 77.10171545548258, + "t": 89.1266754140729, + "r": 126.08064862014129, + "b": 523.3236155182395, "coord_origin": "TOPLEFT" }, "confidence": 0.6016772389411926, @@ -227,14 +227,14 @@ "a": 255 }, "rect": { - "r_x0": 77.10171546422428, - "r_y0": 520.7638577050515, - "r_x1": 96.6831586150625, - "r_y1": 520.7638577050515, - "r_x2": 96.6831586150625, - "r_y2": 89.23887398109309, - "r_x3": 77.10171546422428, - "r_y3": 89.23887398109309, + "r_x0": 77.10171545548258, + "r_y0": 520.7638571913312, + "r_x1": 96.68315797053792, + "r_y1": 520.7638571913312, + "r_x2": 96.68315797053792, + "r_y2": 89.2388734673729, + "r_x3": 77.10171545548258, + "r_y3": 89.2388734673729, "coord_origin": "TOPLEFT" }, "text": "Docling bundles PDF document conversion to", @@ -252,14 +252,14 @@ "a": 255 }, "rect": { - "r_x0": 100.55299576256091, - "r_y0": 523.3155494272656, - "r_x1": 124.91101654503161, - "r_y1": 523.3155494272656, - "r_x2": 124.91101654503161, - "r_y2": 89.12381765643227, - "r_x3": 100.55299576256091, - "r_y3": 89.12381765643227, + "r_x0": 100.64168123325977, + "r_y0": 523.3236155182395, + "r_x1": 126.08064862014129, + "r_y1": 523.3236155182395, + "r_x2": 126.08064862014129, + "r_y2": 89.1266754140729, + "r_x3": 100.64168123325977, + "r_y3": 89.1266754140729, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -377,10 +377,10 @@ "id": 0, "label": "page_header", "bbox": { - "l": 77.10171546422428, - "t": 89.12381765643227, - "r": 124.91101654503161, - "b": 523.3155494272656, + "l": 77.10171545548258, + "t": 89.1266754140729, + "r": 126.08064862014129, + "b": 523.3236155182395, "coord_origin": "TOPLEFT" }, "confidence": 0.6016772389411926, @@ -394,14 +394,14 @@ "a": 255 }, "rect": { - "r_x0": 77.10171546422428, - "r_y0": 520.7638577050515, - "r_x1": 96.6831586150625, - "r_y1": 520.7638577050515, - "r_x2": 96.6831586150625, - "r_y2": 89.23887398109309, - "r_x3": 77.10171546422428, - "r_y3": 89.23887398109309, + "r_x0": 77.10171545548258, + "r_y0": 520.7638571913312, + "r_x1": 96.68315797053792, + "r_y1": 520.7638571913312, + "r_x2": 96.68315797053792, + "r_y2": 89.2388734673729, + "r_x3": 77.10171545548258, + "r_y3": 89.2388734673729, "coord_origin": "TOPLEFT" }, "text": "Docling bundles PDF document conversion to", @@ -419,14 +419,14 @@ "a": 255 }, "rect": { - "r_x0": 100.55299576256091, - "r_y0": 523.3155494272656, - "r_x1": 124.91101654503161, - "r_y1": 523.3155494272656, - "r_x2": 124.91101654503161, - "r_y2": 89.12381765643227, - "r_x3": 100.55299576256091, - "r_y3": 89.12381765643227, + "r_x0": 100.64168123325977, + "r_y0": 523.3236155182395, + "r_x1": 126.08064862014129, + "r_y1": 523.3236155182395, + "r_x2": 126.08064862014129, + "r_y2": 89.1266754140729, + "r_x3": 100.64168123325977, + "r_y3": 89.1266754140729, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained",