fix: Call into docling-core for legacy document transform (#551)
Call into docling-core for legacy document transform

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
78f61a8522
commit
7972d47f88
@ -33,6 +33,7 @@ from docling_core.types.legacy_doc.document import (
|
|||||||
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
||||||
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
||||||
from docling_core.utils.file import resolve_source_to_stream
|
from docling_core.utils.file import resolve_source_to_stream
|
||||||
|
from docling_core.utils.legacy import docling_document_to_legacy
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from typing_extensions import deprecated
|
from typing_extensions import deprecated
|
||||||
|
|
||||||
@ -189,259 +190,7 @@ class ConversionResult(BaseModel):
|
|||||||
@property
|
@property
|
||||||
@deprecated("Use document instead.")
|
@deprecated("Use document instead.")
|
||||||
def legacy_document(self):
|
def legacy_document(self):
|
||||||
reverse_label_mapping = {
|
return docling_document_to_legacy(self.document)
|
||||||
DocItemLabel.CAPTION.value: "Caption",
|
|
||||||
DocItemLabel.FOOTNOTE.value: "Footnote",
|
|
||||||
DocItemLabel.FORMULA.value: "Formula",
|
|
||||||
DocItemLabel.LIST_ITEM.value: "List-item",
|
|
||||||
DocItemLabel.PAGE_FOOTER.value: "Page-footer",
|
|
||||||
DocItemLabel.PAGE_HEADER.value: "Page-header",
|
|
||||||
DocItemLabel.PICTURE.value: "Picture", # low threshold adjust to capture chemical structures for examples.
|
|
||||||
DocItemLabel.SECTION_HEADER.value: "Section-header",
|
|
||||||
DocItemLabel.TABLE.value: "Table",
|
|
||||||
DocItemLabel.TEXT.value: "Text",
|
|
||||||
DocItemLabel.TITLE.value: "Title",
|
|
||||||
DocItemLabel.DOCUMENT_INDEX.value: "Document Index",
|
|
||||||
DocItemLabel.CODE.value: "Code",
|
|
||||||
DocItemLabel.CHECKBOX_SELECTED.value: "Checkbox-Selected",
|
|
||||||
DocItemLabel.CHECKBOX_UNSELECTED.value: "Checkbox-Unselected",
|
|
||||||
DocItemLabel.FORM.value: "Form",
|
|
||||||
DocItemLabel.KEY_VALUE_REGION.value: "Key-Value Region",
|
|
||||||
DocItemLabel.PARAGRAPH.value: "paragraph",
|
|
||||||
}
|
|
||||||
|
|
||||||
title = ""
|
|
||||||
desc = DsDocumentDescription(logs=[])
|
|
||||||
|
|
||||||
page_hashes = [
|
|
||||||
PageReference(
|
|
||||||
hash=create_hash(self.input.document_hash + ":" + str(p.page_no - 1)),
|
|
||||||
page=p.page_no,
|
|
||||||
model="default",
|
|
||||||
)
|
|
||||||
for p in self.document.pages.values()
|
|
||||||
]
|
|
||||||
|
|
||||||
file_info = DsFileInfoObject(
|
|
||||||
filename=self.input.file.name,
|
|
||||||
document_hash=self.input.document_hash,
|
|
||||||
num_pages=self.input.page_count,
|
|
||||||
page_hashes=page_hashes,
|
|
||||||
)
|
|
||||||
|
|
||||||
main_text = []
|
|
||||||
tables = []
|
|
||||||
figures = []
|
|
||||||
equations = []
|
|
||||||
footnotes = []
|
|
||||||
page_headers = []
|
|
||||||
page_footers = []
|
|
||||||
|
|
||||||
embedded_captions = set()
|
|
||||||
for ix, (item, level) in enumerate(
|
|
||||||
self.document.iterate_items(self.document.body)
|
|
||||||
):
|
|
||||||
|
|
||||||
if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0:
|
|
||||||
caption = item.caption_text(self.document)
|
|
||||||
if caption:
|
|
||||||
embedded_captions.add(caption)
|
|
||||||
|
|
||||||
for item, level in self.document.iterate_items():
|
|
||||||
if isinstance(item, DocItem):
|
|
||||||
item_type = item.label
|
|
||||||
|
|
||||||
if isinstance(item, (TextItem, ListItem, SectionHeaderItem)):
|
|
||||||
|
|
||||||
if isinstance(item, ListItem) and item.marker:
|
|
||||||
text = f"{item.marker} {item.text}"
|
|
||||||
else:
|
|
||||||
text = item.text
|
|
||||||
|
|
||||||
# Can be empty.
|
|
||||||
prov = [
|
|
||||||
Prov(
|
|
||||||
bbox=p.bbox.as_tuple(),
|
|
||||||
page=p.page_no,
|
|
||||||
span=[0, len(item.text)],
|
|
||||||
)
|
|
||||||
for p in item.prov
|
|
||||||
]
|
|
||||||
main_text.append(
|
|
||||||
BaseText(
|
|
||||||
text=text,
|
|
||||||
obj_type=layout_label_to_ds_type.get(item.label),
|
|
||||||
name=reverse_label_mapping[item.label],
|
|
||||||
prov=prov,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
# skip captions if they are embedded in the actual
|
|
||||||
# floating object
|
|
||||||
if item_type == DocItemLabel.CAPTION and text in embedded_captions:
|
|
||||||
continue
|
|
||||||
|
|
||||||
elif isinstance(item, TableItem) and item.data:
|
|
||||||
index = len(tables)
|
|
||||||
ref_str = f"#/tables/{index}"
|
|
||||||
main_text.append(
|
|
||||||
Ref(
|
|
||||||
name=reverse_label_mapping[item.label],
|
|
||||||
obj_type=layout_label_to_ds_type.get(item.label),
|
|
||||||
ref=ref_str,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
# Initialise empty table data grid (only empty cells)
|
|
||||||
table_data = [
|
|
||||||
[
|
|
||||||
TableCell(
|
|
||||||
text="",
|
|
||||||
# bbox=[0,0,0,0],
|
|
||||||
spans=[[i, j]],
|
|
||||||
obj_type="body",
|
|
||||||
)
|
|
||||||
for j in range(item.data.num_cols)
|
|
||||||
]
|
|
||||||
for i in range(item.data.num_rows)
|
|
||||||
]
|
|
||||||
|
|
||||||
# Overwrite cells in table data for which there is actual cell content.
|
|
||||||
for cell in item.data.table_cells:
|
|
||||||
for i in range(
|
|
||||||
min(cell.start_row_offset_idx, item.data.num_rows),
|
|
||||||
min(cell.end_row_offset_idx, item.data.num_rows),
|
|
||||||
):
|
|
||||||
for j in range(
|
|
||||||
min(cell.start_col_offset_idx, item.data.num_cols),
|
|
||||||
min(cell.end_col_offset_idx, item.data.num_cols),
|
|
||||||
):
|
|
||||||
celltype = "body"
|
|
||||||
if cell.column_header:
|
|
||||||
celltype = "col_header"
|
|
||||||
elif cell.row_header:
|
|
||||||
celltype = "row_header"
|
|
||||||
elif cell.row_section:
|
|
||||||
celltype = "row_section"
|
|
||||||
|
|
||||||
def make_spans(cell):
|
|
||||||
for rspan in range(
|
|
||||||
min(
|
|
||||||
cell.start_row_offset_idx,
|
|
||||||
item.data.num_rows,
|
|
||||||
),
|
|
||||||
min(
|
|
||||||
cell.end_row_offset_idx, item.data.num_rows
|
|
||||||
),
|
|
||||||
):
|
|
||||||
for cspan in range(
|
|
||||||
min(
|
|
||||||
cell.start_col_offset_idx,
|
|
||||||
item.data.num_cols,
|
|
||||||
),
|
|
||||||
min(
|
|
||||||
cell.end_col_offset_idx,
|
|
||||||
item.data.num_cols,
|
|
||||||
),
|
|
||||||
):
|
|
||||||
yield [rspan, cspan]
|
|
||||||
|
|
||||||
spans = list(make_spans(cell))
|
|
||||||
table_data[i][j] = GlmTableCell(
|
|
||||||
text=cell.text,
|
|
||||||
bbox=(
|
|
||||||
cell.bbox.as_tuple()
|
|
||||||
if cell.bbox is not None
|
|
||||||
else None
|
|
||||||
), # check if this is bottom-left
|
|
||||||
spans=spans,
|
|
||||||
obj_type=celltype,
|
|
||||||
col=j,
|
|
||||||
row=i,
|
|
||||||
row_header=cell.row_header,
|
|
||||||
row_section=cell.row_section,
|
|
||||||
col_header=cell.column_header,
|
|
||||||
row_span=[
|
|
||||||
cell.start_row_offset_idx,
|
|
||||||
cell.end_row_offset_idx,
|
|
||||||
],
|
|
||||||
col_span=[
|
|
||||||
cell.start_col_offset_idx,
|
|
||||||
cell.end_col_offset_idx,
|
|
||||||
],
|
|
||||||
)
|
|
||||||
|
|
||||||
# Compute the caption
|
|
||||||
caption = item.caption_text(self.document)
|
|
||||||
|
|
||||||
tables.append(
|
|
||||||
DsSchemaTable(
|
|
||||||
text=caption,
|
|
||||||
num_cols=item.data.num_cols,
|
|
||||||
num_rows=item.data.num_rows,
|
|
||||||
obj_type=layout_label_to_ds_type.get(item.label),
|
|
||||||
data=table_data,
|
|
||||||
prov=[
|
|
||||||
Prov(
|
|
||||||
bbox=p.bbox.as_tuple(),
|
|
||||||
page=p.page_no,
|
|
||||||
span=[0, 0],
|
|
||||||
)
|
|
||||||
for p in item.prov
|
|
||||||
],
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
elif isinstance(item, PictureItem):
|
|
||||||
index = len(figures)
|
|
||||||
ref_str = f"#/figures/{index}"
|
|
||||||
main_text.append(
|
|
||||||
Ref(
|
|
||||||
name=reverse_label_mapping[item.label],
|
|
||||||
obj_type=layout_label_to_ds_type.get(item.label),
|
|
||||||
ref=ref_str,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
# Compute the caption
|
|
||||||
caption = item.caption_text(self.document)
|
|
||||||
|
|
||||||
figures.append(
|
|
||||||
Figure(
|
|
||||||
prov=[
|
|
||||||
Prov(
|
|
||||||
bbox=p.bbox.as_tuple(),
|
|
||||||
page=p.page_no,
|
|
||||||
span=[0, len(caption)],
|
|
||||||
)
|
|
||||||
for p in item.prov
|
|
||||||
],
|
|
||||||
obj_type=layout_label_to_ds_type.get(item.label),
|
|
||||||
text=caption,
|
|
||||||
# data=[[]],
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
page_dimensions = [
|
|
||||||
PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
|
|
||||||
for p in self.document.pages.values()
|
|
||||||
]
|
|
||||||
|
|
||||||
ds_doc = DsDocument(
|
|
||||||
name=title,
|
|
||||||
description=desc,
|
|
||||||
file_info=file_info,
|
|
||||||
main_text=main_text,
|
|
||||||
equations=equations,
|
|
||||||
footnotes=footnotes,
|
|
||||||
page_headers=page_headers,
|
|
||||||
page_footers=page_footers,
|
|
||||||
tables=tables,
|
|
||||||
figures=figures,
|
|
||||||
page_dimensions=page_dimensions,
|
|
||||||
)
|
|
||||||
|
|
||||||
return ds_doc
|
|
||||||
|
|
||||||
|
|
||||||
class _DummyBackend(AbstractDocumentBackend):
|
class _DummyBackend(AbstractDocumentBackend):
|
||||||
|
13
poetry.lock
generated
13
poetry.lock
generated
@ -888,13 +888,13 @@ files = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "docling-core"
|
name = "docling-core"
|
||||||
version = "2.8.0"
|
version = "2.9.0"
|
||||||
description = "A python library to define and validate data types in Docling."
|
description = "A python library to define and validate data types in Docling."
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = "<4.0,>=3.9"
|
python-versions = "<4.0,>=3.9"
|
||||||
files = [
|
files = [
|
||||||
{file = "docling_core-2.8.0-py3-none-any.whl", hash = "sha256:392aad49e25f5fd1d279410118fbd91d9aaab9dd92d043738d20c10c57193d86"},
|
{file = "docling_core-2.9.0-py3-none-any.whl", hash = "sha256:b44b077db5d2ac8a900f30a15abe329c165b1f2eb7f1c90d1275c423c1c3d668"},
|
||||||
{file = "docling_core-2.8.0.tar.gz", hash = "sha256:6ac5cbc6f0abcbdf599c2a4b1a3f7b52fd8baebf3c4ebf94d7b7e2ee061a654e"},
|
{file = "docling_core-2.9.0.tar.gz", hash = "sha256:1bf12fe67ee4852330e9bac33fe62b45598ff885481e03a88fa8e1bf48252424"},
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
@ -6061,6 +6061,11 @@ files = [
|
|||||||
{file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"},
|
{file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f60021ec1574e56632be2a36b946f8143bf4e5e6af4a06d85281adc22938e0dd"},
|
||||||
{file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"},
|
{file = "scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:394397841449853c2290a32050382edaec3da89e35b3e03d6cc966aebc6a8ae6"},
|
||||||
{file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"},
|
{file = "scikit_learn-1.5.2-cp312-cp312-win_amd64.whl", hash = "sha256:57cc1786cfd6bd118220a92ede80270132aa353647684efa385a74244a41e3b1"},
|
||||||
|
{file = "scikit_learn-1.5.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9a702e2de732bbb20d3bad29ebd77fc05a6b427dc49964300340e4c9328b3f5"},
|
||||||
|
{file = "scikit_learn-1.5.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:b0768ad641981f5d3a198430a1d31c3e044ed2e8a6f22166b4d546a5116d7908"},
|
||||||
|
{file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:178ddd0a5cb0044464fc1bfc4cca5b1833bfc7bb022d70b05db8530da4bb3dd3"},
|
||||||
|
{file = "scikit_learn-1.5.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f7284ade780084d94505632241bf78c44ab3b6f1e8ccab3d2af58e0e950f9c12"},
|
||||||
|
{file = "scikit_learn-1.5.2-cp313-cp313-win_amd64.whl", hash = "sha256:b7b0f9a0b1040830d38c39b91b3a44e1b643f4b36e36567b80b7c6bd2202a27f"},
|
||||||
{file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"},
|
{file = "scikit_learn-1.5.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:757c7d514ddb00ae249832fe87100d9c73c6ea91423802872d9e74970a0e40b9"},
|
||||||
{file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"},
|
{file = "scikit_learn-1.5.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:52788f48b5d8bca5c0736c175fa6bdaab2ef00a8f536cda698db61bd89c551c1"},
|
||||||
{file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"},
|
{file = "scikit_learn-1.5.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:643964678f4b5fbdc95cbf8aec638acc7aa70f5f79ee2cdad1eec3df4ba6ead8"},
|
||||||
@ -7597,4 +7602,4 @@ tesserocr = ["tesserocr"]
|
|||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.9"
|
python-versions = "^3.9"
|
||||||
content-hash = "621f8de238fd1f82cfd783531b6ab7c1598378a499c0dcfac323d66bc7ab32ea"
|
content-hash = "3e66a54bd0433581e4909003124e2b79b42bdd1fb90d17c037f3294aeff56aa9"
|
||||||
|
@ -25,7 +25,7 @@ packages = [{include = "docling"}]
|
|||||||
# actual dependencies:
|
# actual dependencies:
|
||||||
######################
|
######################
|
||||||
python = "^3.9"
|
python = "^3.9"
|
||||||
docling-core = { version = "^2.8.0", extras = ["chunking"] }
|
docling-core = { version = "^2.9.0", extras = ["chunking"] }
|
||||||
pydantic = "^2.0.0"
|
pydantic = "^2.0.0"
|
||||||
docling-ibm-models = "^2.0.6"
|
docling-ibm-models = "^2.0.6"
|
||||||
deepsearch-glm = "^1.0.0"
|
deepsearch-glm = "^1.0.0"
|
||||||
|
Loading…
Reference in New Issue
Block a user