fix: bumped the glm version and adjusted the tests (#83)

* bumped the glm version and adjusted the tests

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* updated the poetry lock

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* fix hooks

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fixed the tests

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* reformatted the code

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* added the tests for tables

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Peter W. J. Staar 2024-09-18 07:43:49 +02:00 committed by GitHub
parent 8242bce4fa
commit 442443a102
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
11 changed files with 406 additions and 361 deletions

736
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -25,7 +25,7 @@ python = "^3.10"
 pydantic = "^2.0.0"
 docling-core = "^1.3.0"
 docling-ibm-models = "^1.2.0"
-deepsearch-glm = "^0.21.0"
+deepsearch-glm = "^0.21.1"
 filetype = "^1.2.0"
 pypdfium2 = "^4.30.0"
 pydantic-settings = "^2.3.0"

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -96,10 +96,17 @@ def verify_tables(doc_pred: DsDocument, doc_true: DsDocument):
         for i, row in enumerate(true_item.data):
             for j, col in enumerate(true_item.data[i]):
+                # print("true: ", true_item.data[i][j])
+                # print("pred: ", pred_item.data[i][j])
                 assert (
                     true_item.data[i][j].text == pred_item.data[i][j].text
                 ), "table-cell does not have the same text"
+                assert (
+                    true_item.data[i][j].obj_type == pred_item.data[i][j].obj_type
+                ), "table-cell does not have the same type"
     return True
@@ -156,9 +163,13 @@ def verify_conversion_result(
     ), f"Mismatch in PDF cell prediction for {input_path}"
     # assert verify_output(
     #     doc_pred, doc_true
     # ), f"Mismatch in JSON prediction for {input_path}"
+    assert verify_tables(
+        doc_pred, doc_true
+    ), f"verify_tables(doc_pred, doc_true) mismatch for {input_path}"
     assert verify_md(
         doc_pred_md, doc_true_md
     ), f"Mismatch in Markdown prediction for {input_path}"