feat: adding txt and doctags output (#68)
* feat: adding txt and doctags output
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* cleaned up the export
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* Fix datamodel usage for Figure
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* updated all the examples to deal with new rendering
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---------
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent: cd5b6293cc
commit: bdfdfbf092
@@ -11,6 +11,7 @@ from docling_core.types import FileInfoObject as DsFileInfoObject
 from docling_core.types import PageDimensions, PageReference, Prov, Ref
 from docling_core.types import Table as DsSchemaTable
 from docling_core.types import TableCell
+from docling_core.types.doc.base import Figure
 from pydantic import BaseModel
 from typing_extensions import deprecated

@@ -279,7 +280,7 @@ class ConvertedDocument(BaseModel):
                     ),
                 )
             figures.append(
-                BaseCell(
+                Figure(
                     prov=[
                         Prov(
                             bbox=target_bbox,
@@ -312,8 +313,76 @@ class ConvertedDocument(BaseModel):
     def render_as_dict(self):
         return self.output.model_dump(by_alias=True, exclude_none=True)

-    def render_as_markdown(self):
-        return self.output.export_to_markdown()
+    def render_as_markdown(
+        self,
+        delim: str = "\n\n",
+        main_text_start: int = 0,
+        main_text_stop: Optional[int] = None,
+        main_text_labels: list[str] = [
+            "title",
+            "subtitle-level-1",
+            "paragraph",
+            "caption",
+            "table",
+        ],
+        strict_text: bool = False,
+    ):
+        return self.output.export_to_markdown(
+            delim=delim,
+            main_text_start=main_text_start,
+            main_text_stop=main_text_stop,
+            main_text_labels=main_text_labels,
+            strict_text=strict_text,
+        )
+
+    def render_as_text(
+        self,
+        delim: str = "\n\n",
+        main_text_start: int = 0,
+        main_text_stop: Optional[int] = None,
+        main_text_labels: list[str] = [
+            "title",
+            "subtitle-level-1",
+            "paragraph",
+            "caption",
+        ],
+    ):
+        return self.output.export_to_markdown(
+            delim=delim,
+            main_text_start=main_text_start,
+            main_text_stop=main_text_stop,
+            main_text_labels=main_text_labels,
+            strict_text=True,
+        )
+
+    def render_as_doctags(
+        self,
+        delim: str = "\n\n",
+        main_text_start: int = 0,
+        main_text_stop: Optional[int] = None,
+        main_text_labels: list[str] = [
+            "title",
+            "subtitle-level-1",
+            "paragraph",
+            "caption",
+            "table",
+            "figure",
+        ],
+        page_tagging: bool = True,
+        location_tagging: bool = True,
+        location_dimensions: Tuple[int, int] = (100, 100),
+        add_new_line: bool = True,
+    ) -> str:
+        return self.output.export_to_document_tokens(
+            delim=delim,
+            main_text_start=main_text_start,
+            main_text_stop=main_text_stop,
+            main_text_labels=main_text_labels,
+            page_tagging=page_tagging,
+            location_tagging=location_tagging,
+            location_dimensions=location_dimensions,
+            add_new_line=add_new_line,
+        )

     def render_element_images(
         self, element_types: Tuple[PageElement] = (FigureElement,)
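For orientation, a minimal usage sketch of the new render methods. The helper name and the way conv_res is obtained are assumptions; the method names, keyword arguments and defaults come from the hunk above.

    from pathlib import Path

    # Sketch: write every new output format for one conversion result. `conv_res`
    # is assumed to be a ConvertedDocument produced by DocumentConverter, as in
    # the export_documents(...) example hunks further down in this commit.
    def write_all_formats(conv_res, output_dir: Path, stem: str) -> None:
        output_dir.mkdir(parents=True, exist_ok=True)

        # Plain text: render_as_text() reuses the markdown exporter with
        # strict_text=True, so no markdown syntax and no table rendering.
        (output_dir / f"{stem}.txt").write_text(conv_res.render_as_text())

        # Markdown: keyword arguments are optional; shown with their defaults.
        (output_dir / f"{stem}.md").write_text(
            conv_res.render_as_markdown(delim="\n\n", main_text_start=0)
        )

        # DocTags: page/location tagging is on by default, with locations
        # quantized to a (100, 100) grid unless overridden.
        (output_dir / f"{stem}.doctags").write_text(
            conv_res.render_as_doctags(location_dimensions=(100, 100))
        )

Note that render_as_text() is deliberately a thin wrapper that forces strict_text=True on the markdown exporter, so the text and markdown outputs stay in sync.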
@@ -163,8 +163,12 @@ def generate_multimodal_pages(
         content_md = doc.export_to_markdown(
             main_text_start=start_ix, main_text_stop=end_ix
         )
+        # No page-tagging since we only do 1 page at the time
+        content_dt = doc.export_to_document_tokens(
+            main_text_start=start_ix, main_text_stop=end_ix, page_tagging=False
+        )

-        return content_text, content_md, page_cells, page_segments, page
+        return content_text, content_md, content_dt, page_cells, page_segments, page

     for ix, orig_item in enumerate(doc.main_text):

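The multimodal export helper now yields a six-element tuple per page. A short sketch of consuming it, assuming generate_multimodal_pages lives in docling's export utilities and takes the conversion result, as in the def main() hunks below:

    from docling.utils.export import generate_multimodal_pages  # assumed import path

    # Sketch: build one record per page; the yielded tuple now carries the
    # per-page DocTags string (content_dt) between content_md and page_cells.
    def collect_pages(conv_res):
        rows = []
        for (
            content_text,
            content_md,
            content_dt,
            page_cells,
            page_segments,
            page,
        ) in generate_multimodal_pages(conv_res):
            rows.append(
                {
                    "contents": content_text,
                    "contents_md": content_md,
                    "contents_dt": content_dt,  # new field, mirrors the example below
                    "cells": page_cells,
                    "segments": page_segments,
                    "page_num": page.page_no + 1,
                }
            )
        return rows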
@@ -30,9 +30,18 @@ def export_documents(
             with (output_dir / f"{doc_filename}.json").open("w") as fp:
                 fp.write(json.dumps(conv_res.render_as_dict()))

+            # Export Text format:
+            with (output_dir / f"{doc_filename}.txt").open("w") as fp:
+                fp.write(conv_res.render_as_text())
+
             # Export Markdown format:
             with (output_dir / f"{doc_filename}.md").open("w") as fp:
                 fp.write(conv_res.render_as_markdown())

+            # Export Document Tags format:
+            with (output_dir / f"{doc_filename}.doctags").open("w") as fp:
+                fp.write(conv_res.render_as_doctags())
+
         elif conv_res.status == ConversionStatus.PARTIAL_SUCCESS:
             _log.info(
                 f"Document {conv_res.input.file} was partially converted with the following errors:"

@@ -31,9 +31,18 @@ def export_documents(
             with (output_dir / f"{doc_filename}.json").open("w") as fp:
                 fp.write(json.dumps(conv_res.render_as_dict()))

+            # Export Text format:
+            with (output_dir / f"{doc_filename}.txt").open("w") as fp:
+                fp.write(conv_res.render_as_text())
+
             # Export Markdown format:
             with (output_dir / f"{doc_filename}.md").open("w") as fp:
                 fp.write(conv_res.render_as_markdown())

+            # Export Document Tags format:
+            with (output_dir / f"{doc_filename}.doctags").open("w") as fp:
+                fp.write(conv_res.render_as_doctags())
+
         else:
             _log.info(f"Document {conv_res.input.file} failed to convert.")
             failure_count += 1

@@ -51,6 +51,7 @@ def main():
     for (
         content_text,
         content_md,
+        content_dt,
         page_cells,
         page_segments,
         page,
@@ -71,6 +72,7 @@ def main():
                 "cells": page_cells,
                 "contents": content_text,
                 "contents_md": content_md,
+                "contents_dt": content_dt,
                 "segments": page_segments,
                 "extra": {
                     "page_num": page.page_no + 1,

poetry.lock (generated): 1852 changed lines, diff suppressed because it is too large.
@@ -23,8 +23,8 @@ packages = [{include = "docling"}]
 [tool.poetry.dependencies]
 python = "^3.10"
 pydantic = "^2.0.0"
-docling-core = "^1.1.3"
-docling-ibm-models = "^1.1.3"
+docling-core = "^1.2.0"
+docling-ibm-models = "^1.1.7"
 deepsearch-glm = "^0.21.0"
 filetype = "^1.2.0"
 pypdfium2 = "^4.30.0"