feat: add table exports (#86)
* feat: expose docling-core table exporters and add examples Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * remove temp internal implementation of html export Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * pin latest docling-core 1.4.0 with table exports Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> --------- Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
442443a102
commit
f19bd43798
@ -9,67 +9,6 @@ from docling.datamodel.document import ConversionResult, Page
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _export_table_to_html(table: Table):
    """Render ``table`` as an HTML ``<table>`` string.

    Cells whose ``obj_type`` is ``col_header`` or ``col_multi_header`` are
    emitted as ``<th>``; every other cell (row headers included) is emitted
    as ``<td>``. ``rowspan`` / ``colspan`` attributes are written only when
    the cell covers more than one row / column.

    Args:
        table: Table exposing ``num_rows``, ``num_cols`` and a ``data`` grid
            indexed as ``data[row][col]``.

    Returns:
        The HTML markup, or an empty string when ``table.data`` is ``None``.
    """
    # TODO: this is flagged as internal, because we will move it
    # to the docling-core package.
    from html import escape  # local import keeps this block self-contained

    def _get_tablecell_span(cell: TableCell, ix):
        """Return ``(extent, first, last)`` of the cell's span along axis ``ix``.

        ``ix`` selects the component of each entry in ``cell.spans``
        (0 is used for rows, 1 for columns by the caller). A cell without
        span information yields ``(1, None, None)``.
        """
        if cell.spans is None:
            span = set()
        else:
            span = {s[ix] for s in cell.spans}
        if not span:
            return 1, None, None
        return len(span), min(span), max(span)

    nrows = table.num_rows
    ncols = table.num_cols

    if table.data is None:
        return ""

    rows = []
    for i in range(nrows):
        cells = []
        for j in range(ncols):
            cell: TableCell = table.data[i][j]

            rowspan, rowstart, _ = _get_tablecell_span(cell, 0)
            colspan, colstart, _ = _get_tablecell_span(cell, 1)

            # Emit a cell only at the first row/column index of its span;
            # any other grid position carrying the same span is skipped.
            if rowstart is not None and rowstart != i:
                continue
            if colstart is not None and colstart != j:
                continue

            # Cell text is plain text, not markup: escape it so characters
            # such as "<", ">" and "&" cannot corrupt the generated HTML.
            content = escape(cell.text.strip())

            if cell.obj_type in ("col_header", "col_multi_header"):
                celltag = "th"
            else:
                celltag = "td"

            opening_tag = celltag
            if rowspan > 1:
                opening_tag += f' rowspan="{rowspan}"'
            if colspan > 1:
                opening_tag += f' colspan="{colspan}"'

            cells.append(f"<{opening_tag}>{content}</{celltag}>")
        rows.append("<tr>" + "".join(cells) + "</tr>")

    return "<table>" + "".join(rows) + "</table>"
|
||||
|
||||
|
||||
def generate_multimodal_pages(
|
||||
doc_result: ConversionResult,
|
||||
) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
|
||||
@ -129,7 +68,7 @@ def generate_multimodal_pages(
|
||||
}
|
||||
|
||||
if isinstance(item, Table):
|
||||
table_html = _export_table_to_html(item)
|
||||
table_html = item.export_to_html()
|
||||
new_segment["data"].append(
|
||||
{
|
||||
"html_seq": table_html,
|
||||
|
74
examples/export_tables.py
Normal file
74
examples/export_tables.py
Normal file
@ -0,0 +1,74 @@
|
||||
import logging
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Tuple
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from docling.datamodel.base_models import ConversionStatus
|
||||
from docling.datamodel.document import DocumentConversionInput
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def main():
    """Convert the sample PDF and export each of its tables as CSV and HTML.

    Every table of every successfully converted document is printed to
    stdout as Markdown and written to ``./scratch`` as
    ``<document stem>-table-<n>.csv`` and ``<document stem>-table-<n>.html``,
    where ``n`` starts at 1.

    Raises:
        RuntimeError: If at least one input document failed to convert.
    """
    logging.basicConfig(level=logging.INFO)

    input_doc_paths = [
        Path("./tests/data/2206.01062.pdf"),
    ]
    output_dir = Path("./scratch")

    input_files = DocumentConversionInput.from_paths(input_doc_paths)

    doc_converter = DocumentConverter()

    start_time = time.time()

    conv_results = doc_converter.convert(input_files)

    failure_count = 0
    output_dir.mkdir(parents=True, exist_ok=True)
    for conv_res in conv_results:
        if conv_res.status != ConversionStatus.SUCCESS:
            _log.info(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1
            continue

        doc_filename = conv_res.input.file.stem

        # Export tables
        for table_ix, table in enumerate(conv_res.output.tables):
            table_df: pd.DataFrame = table.export_to_dataframe()
            print(f"## Table {table_ix}")
            print(table_df.to_markdown())

            # Save the table as csv
            element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
            _log.info(f"Saving CSV table to {element_csv_filename}")
            table_df.to_csv(element_csv_filename)

            # Save the table as html
            element_html_filename = (
                output_dir / f"{doc_filename}-table-{table_ix+1}.html"
            )
            _log.info(f"Saving HTML table to {element_html_filename}")
            # Explicit UTF-8: the platform default encoding may be unable to
            # represent non-ASCII characters found in table text.
            with element_html_filename.open("w", encoding="utf-8") as fp:
                fp.write(table.export_to_html())

    # Wall-clock duration of the whole conversion + export run.
    elapsed = time.time() - start_time

    _log.info(f"All documents were converted in {elapsed:.2f} seconds.")

    if failure_count > 0:
        raise RuntimeError(
            f"The example failed converting {failure_count} on {len(input_doc_paths)}."
        )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
46
poetry.lock
generated
46
poetry.lock
generated
@ -957,13 +957,13 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "docling-core"
|
||||
version = "1.3.0"
|
||||
version = "1.4.0"
|
||||
description = "A python library to define and validate data types in Docling."
|
||||
optional = false
|
||||
python-versions = "<4.0,>=3.9"
|
||||
files = [
|
||||
{file = "docling_core-1.3.0-py3-none-any.whl", hash = "sha256:31779b9a5cce7e925d01d3b78fa8a835c531fa74646205ae2a8721f534eb8b27"},
|
||||
{file = "docling_core-1.3.0.tar.gz", hash = "sha256:beb55fb0018c912209bdf12958e4cf5a6c8bbe73fd097d03da25fc3979260fab"},
|
||||
{file = "docling_core-1.4.0-py3-none-any.whl", hash = "sha256:11cd6228d5f321fd11427cf61f40148afd544170e82236228794300f14f8a15a"},
|
||||
{file = "docling_core-1.4.0.tar.gz", hash = "sha256:6ea151974172a87a9bca0d63787dc16bdb4170ecb73f18e61e3c2e95eb3fe3d8"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@ -1151,18 +1151,18 @@ tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipyth
|
||||
|
||||
[[package]]
|
||||
name = "filelock"
|
||||
version = "3.16.0"
|
||||
version = "3.16.1"
|
||||
description = "A platform independent file lock."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "filelock-3.16.0-py3-none-any.whl", hash = "sha256:f6ed4c963184f4c84dd5557ce8fece759a3724b37b80c6c4f20a2f63a4dc6609"},
|
||||
{file = "filelock-3.16.0.tar.gz", hash = "sha256:81de9eb8453c769b63369f87f11131a7ab04e367f8d97ad39dc230daa07e3bec"},
|
||||
{file = "filelock-3.16.1-py3-none-any.whl", hash = "sha256:2082e5703d51fbf98ea75855d9d5527e33d8ff23099bec374a134febee6946b0"},
|
||||
{file = "filelock-3.16.1.tar.gz", hash = "sha256:c249fbfcd5db47e5e2d6d62198e565475ee65e4831e2561c8e313fa7eb961435"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
docs = ["furo (>=2024.8.6)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4)"]
|
||||
testing = ["covdefaults (>=2.3)", "coverage (>=7.6.1)", "diff-cover (>=9.1.1)", "pytest (>=8.3.2)", "pytest-asyncio (>=0.24)", "pytest-cov (>=5)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.26.3)"]
|
||||
docs = ["furo (>=2024.8.6)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4.1)"]
|
||||
testing = ["covdefaults (>=2.3)", "coverage (>=7.6.1)", "diff-cover (>=9.2)", "pytest (>=8.3.3)", "pytest-asyncio (>=0.24)", "pytest-cov (>=5)", "pytest-mock (>=3.14)", "pytest-timeout (>=2.3.1)", "virtualenv (>=20.26.4)"]
|
||||
typing = ["typing-extensions (>=4.12.2)"]
|
||||
|
||||
[[package]]
|
||||
@ -2383,17 +2383,17 @@ transformers = ">=4.39.0"
|
||||
|
||||
[[package]]
|
||||
name = "langchain-milvus"
|
||||
version = "0.1.4"
|
||||
version = "0.1.5"
|
||||
description = "An integration package connecting Milvus and LangChain"
|
||||
optional = true
|
||||
python-versions = "<4.0,>=3.8.1"
|
||||
files = [
|
||||
{file = "langchain_milvus-0.1.4-py3-none-any.whl", hash = "sha256:f5c1f2d023c6853d1acc22dc8d0b61ca4d99015c1b095b0cf84ec84a9ba2936e"},
|
||||
{file = "langchain_milvus-0.1.4.tar.gz", hash = "sha256:1cd67f127d60c73ffb07cd789705766479137630d43f8ff547c69eee4775dae8"},
|
||||
{file = "langchain_milvus-0.1.5-py3-none-any.whl", hash = "sha256:74aa487738afde4c3e1346433ef26f9556e599826161562b308d3357d86529fd"},
|
||||
{file = "langchain_milvus-0.1.5.tar.gz", hash = "sha256:1cceab384783ba264055102e5831451482fd726a68feb64258f6dbbd8d702557"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
langchain-core = ">=0.2.20,<0.3.0"
|
||||
langchain-core = {version = ">=0.2.38,<0.4", markers = "python_version >= \"3.9\""}
|
||||
pymilvus = ">=2.4.3,<3.0.0"
|
||||
scipy = [
|
||||
{version = ">=1.7,<2.0", markers = "python_version < \"3.12\""},
|
||||
@ -3950,13 +3950,13 @@ testing = ["pytest", "pytest-cov", "wheel"]
|
||||
|
||||
[[package]]
|
||||
name = "platformdirs"
|
||||
version = "4.3.4"
|
||||
version = "4.3.6"
|
||||
description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "platformdirs-4.3.4-py3-none-any.whl", hash = "sha256:8b4ba85412f5065dae40aa19feaa02ac2be584c8b14abd70712b5cd11ad80034"},
|
||||
{file = "platformdirs-4.3.4.tar.gz", hash = "sha256:9e8a037c36fe1b1f1b5de4482e60464272cc8dca725e40b568bf2c285f7509cf"},
|
||||
{file = "platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb"},
|
||||
{file = "platformdirs-4.3.6.tar.gz", hash = "sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
@ -4500,17 +4500,17 @@ wheel = "*"
|
||||
|
||||
[[package]]
|
||||
name = "pyreadline3"
|
||||
version = "3.5.2"
|
||||
version = "3.5.3"
|
||||
description = "A python implementation of GNU readline."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "pyreadline3-3.5.2-py3-none-any.whl", hash = "sha256:a87d56791e2965b2b187e2ea33dcf664600842c997c0623c95cf8ef07db83de9"},
|
||||
{file = "pyreadline3-3.5.2.tar.gz", hash = "sha256:ba82292e52c5a3bb256b291af0c40b457c1e8699cac9a873abbcaac8aef3a1bb"},
|
||||
{file = "pyreadline3-3.5.3-py3-none-any.whl", hash = "sha256:ddede153a92e5aad9c1fe63d692efd6a3e478f686adcd4938a051ffb63ec4f52"},
|
||||
{file = "pyreadline3-3.5.3.tar.gz", hash = "sha256:9234684ca75a00a702fda42b17cc26ca665bc9d7c2da06af450468253099ff61"},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
dev = ["build", "flake8", "pytest", "twine"]
|
||||
dev = ["build", "flake8", "mypy", "pytest", "twine"]
|
||||
|
||||
[[package]]
|
||||
name = "pytest"
|
||||
@ -6862,13 +6862,13 @@ zstd = ["zstandard (>=0.18.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "virtualenv"
|
||||
version = "20.26.4"
|
||||
version = "20.26.5"
|
||||
description = "Virtual Python Environment builder"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "virtualenv-20.26.4-py3-none-any.whl", hash = "sha256:48f2695d9809277003f30776d155615ffc11328e6a0a8c1f0ec80188d7874a55"},
|
||||
{file = "virtualenv-20.26.4.tar.gz", hash = "sha256:c17f4e0f3e6036e9f26700446f85c76ab11df65ff6d8a9cbfad9f71aabfcf23c"},
|
||||
{file = "virtualenv-20.26.5-py3-none-any.whl", hash = "sha256:4f3ac17b81fba3ce3bd6f4ead2749a72da5929c01774948e243db9ba41df4ff6"},
|
||||
{file = "virtualenv-20.26.5.tar.gz", hash = "sha256:ce489cac131aa58f4b25e321d6d186171f78e6cb13fafbf32a840cee67733ff4"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@ -7257,4 +7257,4 @@ examples = ["langchain-huggingface", "langchain-milvus", "langchain-text-splitte
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.10"
|
||||
content-hash = "ae5c784c10b8d5635bc8fd7490c89049a8b4f0247e2b8ddd7b0d65106c10dda5"
|
||||
content-hash = "7dc789b3c981898fdabec03f85ebb92273f2bb55b2bf1e18dad1d4c361c6b97b"
|
||||
|
@ -23,7 +23,7 @@ packages = [{include = "docling"}]
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.10"
|
||||
pydantic = "^2.0.0"
|
||||
docling-core = "^1.3.0"
|
||||
docling-core = "^1.4.0"
|
||||
docling-ibm-models = "^1.2.0"
|
||||
deepsearch-glm = "^0.21.1"
|
||||
filetype = "^1.2.0"
|
||||
|
Loading…
Reference in New Issue
Block a user