feat: adding txt and doctags output (#68)

* feat: adding txt and doctags output

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* cleaned up the export

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* Fix datamodel usage for Figure

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* updated all the examples to deal with new rendering

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Peter W. J. Staar 2024-09-10 17:30:52 +02:00 committed by GitHub
parent cd5b6293cc
commit bdfdfbf092
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 784 additions and 1173 deletions

View File

@ -11,6 +11,7 @@ from docling_core.types import FileInfoObject as DsFileInfoObject
from docling_core.types import PageDimensions, PageReference, Prov, Ref from docling_core.types import PageDimensions, PageReference, Prov, Ref
from docling_core.types import Table as DsSchemaTable from docling_core.types import Table as DsSchemaTable
from docling_core.types import TableCell from docling_core.types import TableCell
from docling_core.types.doc.base import Figure
from pydantic import BaseModel from pydantic import BaseModel
from typing_extensions import deprecated from typing_extensions import deprecated
@ -279,7 +280,7 @@ class ConvertedDocument(BaseModel):
), ),
) )
figures.append( figures.append(
BaseCell( Figure(
prov=[ prov=[
Prov( Prov(
bbox=target_bbox, bbox=target_bbox,
@ -312,8 +313,76 @@ class ConvertedDocument(BaseModel):
def render_as_dict(self): def render_as_dict(self):
return self.output.model_dump(by_alias=True, exclude_none=True) return self.output.model_dump(by_alias=True, exclude_none=True)
def render_as_markdown(self): def render_as_markdown(
return self.output.export_to_markdown() self,
delim: str = "\n\n",
main_text_start: int = 0,
main_text_stop: Optional[int] = None,
main_text_labels: list[str] = [
"title",
"subtitle-level-1",
"paragraph",
"caption",
"table",
],
strict_text: bool = False,
):
return self.output.export_to_markdown(
delim=delim,
main_text_start=main_text_start,
main_text_stop=main_text_stop,
main_text_labels=main_text_labels,
strict_text=strict_text,
)
def render_as_text(
    self,
    delim: str = "\n\n",
    main_text_start: int = 0,
    main_text_stop: Optional[int] = None,
    main_text_labels: Optional[list[str]] = None,
):
    """Render the converted document as plain text.

    Delegates to ``self.output.export_to_markdown`` with ``strict_text=True``;
    all other arguments are forwarded unchanged.

    Args:
        delim: Delimiter forwarded to the exporter.
        main_text_start: Start index forwarded to the exporter
            (presumably an index into the main-text items -- confirm
            against the exporter's documentation).
        main_text_stop: Stop index forwarded to the exporter; ``None``
            is passed through as-is.
        main_text_labels: Labels forwarded to the exporter. When omitted,
            a fresh list of ``title``, ``subtitle-level-1``, ``paragraph``
            and ``caption`` is used.

    Returns:
        Whatever ``self.output.export_to_markdown`` returns.
    """
    # Build the default per call: a list literal in the signature would be
    # a single object shared by every call, so any mutation by the exporter
    # would leak into later calls.
    if main_text_labels is None:
        main_text_labels = [
            "title",
            "subtitle-level-1",
            "paragraph",
            "caption",
        ]

    return self.output.export_to_markdown(
        delim=delim,
        main_text_start=main_text_start,
        main_text_stop=main_text_stop,
        main_text_labels=main_text_labels,
        strict_text=True,
    )
def render_as_doctags(
    self,
    delim: str = "\n\n",
    main_text_start: int = 0,
    main_text_stop: Optional[int] = None,
    main_text_labels: Optional[list[str]] = None,
    page_tagging: bool = True,
    location_tagging: bool = True,
    location_dimensions: Tuple[int, int] = (100, 100),
    add_new_line: bool = True,
) -> str:
    """Render the converted document in the document-tokens (doctags) format.

    Thin wrapper that forwards every argument to
    ``self.output.export_to_document_tokens``.

    Args:
        delim: Delimiter forwarded to the exporter.
        main_text_start: Start index forwarded to the exporter
            (presumably an index into the main-text items -- confirm
            against the exporter's documentation).
        main_text_stop: Stop index forwarded to the exporter; ``None``
            is passed through as-is.
        main_text_labels: Labels forwarded to the exporter. When omitted,
            a fresh list of ``title``, ``subtitle-level-1``, ``paragraph``,
            ``caption``, ``table`` and ``figure`` is used.
        page_tagging: Forwarded to the exporter unchanged.
        location_tagging: Forwarded to the exporter unchanged.
        location_dimensions: Forwarded to the exporter unchanged.
        add_new_line: Forwarded to the exporter unchanged.

    Returns:
        The value produced by ``self.output.export_to_document_tokens``.
    """
    # Build the default per call: a list literal in the signature would be
    # a single object shared by every call, so any mutation by the exporter
    # would leak into later calls.
    if main_text_labels is None:
        main_text_labels = [
            "title",
            "subtitle-level-1",
            "paragraph",
            "caption",
            "table",
            "figure",
        ]

    return self.output.export_to_document_tokens(
        delim=delim,
        main_text_start=main_text_start,
        main_text_stop=main_text_stop,
        main_text_labels=main_text_labels,
        page_tagging=page_tagging,
        location_tagging=location_tagging,
        location_dimensions=location_dimensions,
        add_new_line=add_new_line,
    )
def render_element_images( def render_element_images(
self, element_types: Tuple[PageElement] = (FigureElement,) self, element_types: Tuple[PageElement] = (FigureElement,)

View File

@ -163,8 +163,12 @@ def generate_multimodal_pages(
content_md = doc.export_to_markdown( content_md = doc.export_to_markdown(
main_text_start=start_ix, main_text_stop=end_ix main_text_start=start_ix, main_text_stop=end_ix
) )
# No page-tagging since we only process one page at a time
content_dt = doc.export_to_document_tokens(
main_text_start=start_ix, main_text_stop=end_ix, page_tagging=False
)
return content_text, content_md, page_cells, page_segments, page return content_text, content_md, content_dt, page_cells, page_segments, page
for ix, orig_item in enumerate(doc.main_text): for ix, orig_item in enumerate(doc.main_text):

View File

@ -30,9 +30,18 @@ def export_documents(
with (output_dir / f"{doc_filename}.json").open("w") as fp: with (output_dir / f"{doc_filename}.json").open("w") as fp:
fp.write(json.dumps(conv_res.render_as_dict())) fp.write(json.dumps(conv_res.render_as_dict()))
# Export Text format:
with (output_dir / f"{doc_filename}.txt").open("w") as fp:
fp.write(conv_res.render_as_text())
# Export Markdown format: # Export Markdown format:
with (output_dir / f"{doc_filename}.md").open("w") as fp: with (output_dir / f"{doc_filename}.md").open("w") as fp:
fp.write(conv_res.render_as_markdown()) fp.write(conv_res.render_as_markdown())
# Export Document Tags format:
with (output_dir / f"{doc_filename}.doctags").open("w") as fp:
fp.write(conv_res.render_as_doctags())
elif conv_res.status == ConversionStatus.PARTIAL_SUCCESS: elif conv_res.status == ConversionStatus.PARTIAL_SUCCESS:
_log.info( _log.info(
f"Document {conv_res.input.file} was partially converted with the following errors:" f"Document {conv_res.input.file} was partially converted with the following errors:"

View File

@ -31,9 +31,18 @@ def export_documents(
with (output_dir / f"{doc_filename}.json").open("w") as fp: with (output_dir / f"{doc_filename}.json").open("w") as fp:
fp.write(json.dumps(conv_res.render_as_dict())) fp.write(json.dumps(conv_res.render_as_dict()))
# Export Text format:
with (output_dir / f"{doc_filename}.txt").open("w") as fp:
fp.write(conv_res.render_as_text())
# Export Markdown format: # Export Markdown format:
with (output_dir / f"{doc_filename}.md").open("w") as fp: with (output_dir / f"{doc_filename}.md").open("w") as fp:
fp.write(conv_res.render_as_markdown()) fp.write(conv_res.render_as_markdown())
# Export Document Tags format:
with (output_dir / f"{doc_filename}.doctags").open("w") as fp:
fp.write(conv_res.render_as_doctags())
else: else:
_log.info(f"Document {conv_res.input.file} failed to convert.") _log.info(f"Document {conv_res.input.file} failed to convert.")
failure_count += 1 failure_count += 1

View File

@ -51,6 +51,7 @@ def main():
for ( for (
content_text, content_text,
content_md, content_md,
content_dt,
page_cells, page_cells,
page_segments, page_segments,
page, page,
@ -71,6 +72,7 @@ def main():
"cells": page_cells, "cells": page_cells,
"contents": content_text, "contents": content_text,
"contents_md": content_md, "contents_md": content_md,
"contents_dt": content_dt,
"segments": page_segments, "segments": page_segments,
"extra": { "extra": {
"page_num": page.page_no + 1, "page_num": page.page_no + 1,

1852
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -23,8 +23,8 @@ packages = [{include = "docling"}]
[tool.poetry.dependencies] [tool.poetry.dependencies]
python = "^3.10" python = "^3.10"
pydantic = "^2.0.0" pydantic = "^2.0.0"
docling-core = "^1.1.3" docling-core = "^1.2.0"
docling-ibm-models = "^1.1.3" docling-ibm-models = "^1.1.7"
deepsearch-glm = "^0.21.0" deepsearch-glm = "^0.21.0"
filetype = "^1.2.0" filetype = "^1.2.0"
pypdfium2 = "^4.30.0" pypdfium2 = "^4.30.0"