feat: adding txt and doctags output (#68)
* feat: adding txt and doctags output
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* cleaned up the export
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
* Fix datamodel usage for Figure
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* updated all the examples to deal with new rendering
  Signed-off-by: Peter Staar <taa@zurich.ibm.com>
---------
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
cd5b6293cc
commit
bdfdfbf092
@ -11,6 +11,7 @@ from docling_core.types import FileInfoObject as DsFileInfoObject
|
||||
from docling_core.types import PageDimensions, PageReference, Prov, Ref
|
||||
from docling_core.types import Table as DsSchemaTable
|
||||
from docling_core.types import TableCell
|
||||
from docling_core.types.doc.base import Figure
|
||||
from pydantic import BaseModel
|
||||
from typing_extensions import deprecated
|
||||
|
||||
@ -279,7 +280,7 @@ class ConvertedDocument(BaseModel):
|
||||
),
|
||||
)
|
||||
figures.append(
|
||||
BaseCell(
|
||||
Figure(
|
||||
prov=[
|
||||
Prov(
|
||||
bbox=target_bbox,
|
||||
@ -312,8 +313,76 @@ class ConvertedDocument(BaseModel):
|
||||
def render_as_dict(self):
    """Return ``self.output`` dumped to a plain dictionary.

    The dump is produced by ``self.output.model_dump`` with
    ``by_alias=True`` and ``exclude_none=True``.
    """
    document = self.output
    return document.model_dump(by_alias=True, exclude_none=True)
|
||||
|
||||
def render_as_markdown(self):
|
||||
return self.output.export_to_markdown()
|
||||
def render_as_markdown(
    self,
    delim: str = "\n\n",
    main_text_start: int = 0,
    main_text_stop: Optional[int] = None,
    main_text_labels: Optional[list[str]] = None,
    strict_text: bool = False,
) -> str:
    """Render the converted document as Markdown.

    Thin wrapper around ``self.output.export_to_markdown``; every argument
    is forwarded by keyword and its exact semantics are defined there.

    Args:
        delim: Forwarded as ``delim`` (presumably the separator placed
            between exported items -- confirm against docling-core).
        main_text_start: Forwarded as ``main_text_start``.
        main_text_stop: Forwarded as ``main_text_stop``; ``None`` is passed
            through unchanged.
        main_text_labels: Labels forwarded as ``main_text_labels``. When
            omitted (``None``) a fresh list of ``"title"``,
            ``"subtitle-level-1"``, ``"paragraph"``, ``"caption"`` and
            ``"table"`` is used.
        strict_text: Forwarded as ``strict_text``.

    Returns:
        Whatever ``export_to_markdown`` returns (callers write it to a
        text file, so a ``str`` is expected).
    """
    # Build the default per call: a list literal in the signature would be
    # one shared object that any callee or caller could mutate for everyone.
    if main_text_labels is None:
        main_text_labels = [
            "title",
            "subtitle-level-1",
            "paragraph",
            "caption",
            "table",
        ]
    return self.output.export_to_markdown(
        delim=delim,
        main_text_start=main_text_start,
        main_text_stop=main_text_stop,
        main_text_labels=main_text_labels,
        strict_text=strict_text,
    )
|
||||
|
||||
def render_as_text(
    self,
    delim: str = "\n\n",
    main_text_start: int = 0,
    main_text_stop: Optional[int] = None,
    main_text_labels: Optional[list[str]] = None,
) -> str:
    """Render the converted document as plain text.

    Calls ``self.output.export_to_markdown`` with ``strict_text=True``
    hard-wired; the remaining arguments are forwarded by keyword and their
    exact semantics are defined by that method.

    Args:
        delim: Forwarded as ``delim`` (presumably the separator placed
            between exported items -- confirm against docling-core).
        main_text_start: Forwarded as ``main_text_start``.
        main_text_stop: Forwarded as ``main_text_stop``; ``None`` is passed
            through unchanged.
        main_text_labels: Labels forwarded as ``main_text_labels``. When
            omitted (``None``) a fresh list of ``"title"``,
            ``"subtitle-level-1"``, ``"paragraph"`` and ``"caption"`` is
            used (no ``"table"`` entry, unlike the Markdown default).

    Returns:
        Whatever ``export_to_markdown`` returns (callers write it to a
        text file, so a ``str`` is expected).
    """
    # Build the default per call: a list literal in the signature would be
    # one shared object that any callee or caller could mutate for everyone.
    if main_text_labels is None:
        main_text_labels = [
            "title",
            "subtitle-level-1",
            "paragraph",
            "caption",
        ]
    return self.output.export_to_markdown(
        delim=delim,
        main_text_start=main_text_start,
        main_text_stop=main_text_stop,
        main_text_labels=main_text_labels,
        strict_text=True,
    )
|
||||
|
||||
def render_as_doctags(
    self,
    delim: str = "\n\n",
    main_text_start: int = 0,
    main_text_stop: Optional[int] = None,
    main_text_labels: Optional[list[str]] = None,
    page_tagging: bool = True,
    location_tagging: bool = True,
    location_dimensions: Tuple[int, int] = (100, 100),
    add_new_line: bool = True,
) -> str:
    """Render the converted document in the document-tags format.

    Thin wrapper around ``self.output.export_to_document_tokens``; every
    argument is forwarded by keyword and its exact semantics are defined
    there.

    Args:
        delim: Forwarded as ``delim``.
        main_text_start: Forwarded as ``main_text_start``.
        main_text_stop: Forwarded as ``main_text_stop``; ``None`` is passed
            through unchanged.
        main_text_labels: Labels forwarded as ``main_text_labels``. When
            omitted (``None``) a fresh list of ``"title"``,
            ``"subtitle-level-1"``, ``"paragraph"``, ``"caption"``,
            ``"table"`` and ``"figure"`` is used.
        page_tagging: Forwarded as ``page_tagging``.
        location_tagging: Forwarded as ``location_tagging``.
        location_dimensions: Forwarded as ``location_dimensions``
            (presumably the grid onto which locations are scaled --
            confirm against docling-core).
        add_new_line: Forwarded as ``add_new_line``.

    Returns:
        The string produced by ``export_to_document_tokens``.
    """
    # Build the default per call: a list literal in the signature would be
    # one shared object that any callee or caller could mutate for everyone.
    if main_text_labels is None:
        main_text_labels = [
            "title",
            "subtitle-level-1",
            "paragraph",
            "caption",
            "table",
            "figure",
        ]
    return self.output.export_to_document_tokens(
        delim=delim,
        main_text_start=main_text_start,
        main_text_stop=main_text_stop,
        main_text_labels=main_text_labels,
        page_tagging=page_tagging,
        location_tagging=location_tagging,
        location_dimensions=location_dimensions,
        add_new_line=add_new_line,
    )
|
||||
|
||||
def render_element_images(
|
||||
self, element_types: Tuple[PageElement] = (FigureElement,)
|
||||
|
@ -163,8 +163,12 @@ def generate_multimodal_pages(
|
||||
content_md = doc.export_to_markdown(
|
||||
main_text_start=start_ix, main_text_stop=end_ix
|
||||
)
|
||||
# No page-tagging, since we only process one page at a time
|
||||
content_dt = doc.export_to_document_tokens(
|
||||
main_text_start=start_ix, main_text_stop=end_ix, page_tagging=False
|
||||
)
|
||||
|
||||
return content_text, content_md, page_cells, page_segments, page
|
||||
return content_text, content_md, content_dt, page_cells, page_segments, page
|
||||
|
||||
for ix, orig_item in enumerate(doc.main_text):
|
||||
|
||||
|
@ -30,9 +30,18 @@ def export_documents(
|
||||
with (output_dir / f"{doc_filename}.json").open("w") as fp:
|
||||
fp.write(json.dumps(conv_res.render_as_dict()))
|
||||
|
||||
# Export Text format:
|
||||
with (output_dir / f"{doc_filename}.txt").open("w") as fp:
|
||||
fp.write(conv_res.render_as_text())
|
||||
|
||||
# Export Markdown format:
|
||||
with (output_dir / f"{doc_filename}.md").open("w") as fp:
|
||||
fp.write(conv_res.render_as_markdown())
|
||||
|
||||
# Export Document Tags format:
|
||||
with (output_dir / f"{doc_filename}.doctags").open("w") as fp:
|
||||
fp.write(conv_res.render_as_doctags())
|
||||
|
||||
elif conv_res.status == ConversionStatus.PARTIAL_SUCCESS:
|
||||
_log.info(
|
||||
f"Document {conv_res.input.file} was partially converted with the following errors:"
|
||||
|
@ -31,9 +31,18 @@ def export_documents(
|
||||
with (output_dir / f"{doc_filename}.json").open("w") as fp:
|
||||
fp.write(json.dumps(conv_res.render_as_dict()))
|
||||
|
||||
# Export Text format:
|
||||
with (output_dir / f"{doc_filename}.txt").open("w") as fp:
|
||||
fp.write(conv_res.render_as_text())
|
||||
|
||||
# Export Markdown format:
|
||||
with (output_dir / f"{doc_filename}.md").open("w") as fp:
|
||||
fp.write(conv_res.render_as_markdown())
|
||||
|
||||
# Export Document Tags format:
|
||||
with (output_dir / f"{doc_filename}.doctags").open("w") as fp:
|
||||
fp.write(conv_res.render_as_doctags())
|
||||
|
||||
else:
|
||||
_log.info(f"Document {conv_res.input.file} failed to convert.")
|
||||
failure_count += 1
|
||||
|
@ -51,6 +51,7 @@ def main():
|
||||
for (
|
||||
content_text,
|
||||
content_md,
|
||||
content_dt,
|
||||
page_cells,
|
||||
page_segments,
|
||||
page,
|
||||
@ -71,6 +72,7 @@ def main():
|
||||
"cells": page_cells,
|
||||
"contents": content_text,
|
||||
"contents_md": content_md,
|
||||
"contents_dt": content_dt,
|
||||
"segments": page_segments,
|
||||
"extra": {
|
||||
"page_num": page.page_no + 1,
|
||||
|
1852
poetry.lock
generated
1852
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -23,8 +23,8 @@ packages = [{include = "docling"}]
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.10"
|
||||
pydantic = "^2.0.0"
|
||||
docling-core = "^1.1.3"
|
||||
docling-ibm-models = "^1.1.3"
|
||||
docling-core = "^1.2.0"
|
||||
docling-ibm-models = "^1.1.7"
|
||||
deepsearch-glm = "^0.21.0"
|
||||
filetype = "^1.2.0"
|
||||
pypdfium2 = "^4.30.0"
|
||||
|
Loading…
Reference in New Issue
Block a user