feat: adding txt and doctags output (#68)

* feat: adding txt and doctags output Signed-off-by: Peter Staar <taa@zurich.ibm.com> * cleaned up the export Signed-off-by: Peter Staar <taa@zurich.ibm.com> * Fix datamodel usage for Figure Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * updated all the examples to deal with new rendering Signed-off-by: Peter Staar <taa@zurich.ibm.com> --------- Signed-off-by: Peter Staar <taa@zurich.ibm.com> Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
2024-09-10 17:30:52 +02:00 · 2024-09-10 17:30:52 +02:00 · bdfdfbf092
commit bdfdfbf092
parent cd5b6293cc
7 changed files with 784 additions and 1173 deletions
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@ -11,6 +11,7 @@ from docling_core.types import FileInfoObject as DsFileInfoObject
 from docling_core.types import PageDimensions, PageReference, Prov, Ref
 from docling_core.types import Table as DsSchemaTable
 from docling_core.types import TableCell
 from docling_core.types.doc.base import Figure
 from pydantic import BaseModel
 from typing_extensions import deprecated
@ -279,7 +280,7 @@ class ConvertedDocument(BaseModel):
                    ),
                )
                figures.append(
-                    BaseCell(
+                    Figure(
                        prov=[
                            Prov(
                                bbox=target_bbox,
@ -312,8 +313,76 @@ class ConvertedDocument(BaseModel):
    def render_as_dict(self):
        return self.output.model_dump(by_alias=True, exclude_none=True)
-    def render_as_markdown(self):
+    def render_as_markdown(
-        return self.output.export_to_markdown()
+        self,
        delim: str = "\n\n",
        main_text_start: int = 0,
        main_text_stop: Optional[int] = None,
        main_text_labels: list[str] = [
            "title",
            "subtitle-level-1",
            "paragraph",
            "caption",
            "table",
        ],
        strict_text: bool = False,
    ):
        return self.output.export_to_markdown(
            delim=delim,
            main_text_start=main_text_start,
            main_text_stop=main_text_stop,
            main_text_labels=main_text_labels,
            strict_text=strict_text,
        )
    def render_as_text(
        self,
        delim: str = "\n\n",
        main_text_start: int = 0,
        main_text_stop: Optional[int] = None,
        main_text_labels: Optional[list[str]] = None,
    ):
        """Render the converted document through the strict-text export path.

        Thin wrapper around ``self.output.export_to_markdown`` that always
        passes ``strict_text=True``; every other argument is forwarded
        unchanged.

        Args:
            delim: Forwarded as ``delim`` (presumably the separator placed
                between exported items -- confirm against docling-core).
            main_text_start: Forwarded as ``main_text_start``.
            main_text_stop: Forwarded as ``main_text_stop``; ``None`` is
                passed through as-is.
            main_text_labels: Labels of the main-text items to export. When
                ``None`` (the default), ``["title", "subtitle-level-1",
                "paragraph", "caption"]`` is used.

        Returns:
            Whatever ``self.output.export_to_markdown`` returns.
        """
        # Build the default per call instead of using a list literal in the
        # signature: a mutable default is shared by every call, so a mutation
        # by the exporter (or by a caller holding the list) would leak into
        # later calls.
        if main_text_labels is None:
            main_text_labels = [
                "title",
                "subtitle-level-1",
                "paragraph",
                "caption",
            ]
        return self.output.export_to_markdown(
            delim=delim,
            main_text_start=main_text_start,
            main_text_stop=main_text_stop,
            main_text_labels=main_text_labels,
            strict_text=True,
        )
    def render_as_doctags(
        self,
        delim: str = "\n\n",
        main_text_start: int = 0,
        main_text_stop: Optional[int] = None,
        main_text_labels: Optional[list[str]] = None,
        page_tagging: bool = True,
        location_tagging: bool = True,
        location_dimensions: Tuple[int, int] = (100, 100),
        add_new_line: bool = True,
    ) -> str:
        """Render the converted document in the document-tokens (doctags) format.

        Thin wrapper around ``self.output.export_to_document_tokens``; every
        argument is forwarded unchanged under the same keyword name.

        Args:
            delim: Forwarded as ``delim``.
            main_text_start: Forwarded as ``main_text_start``.
            main_text_stop: Forwarded as ``main_text_stop``; ``None`` is
                passed through as-is.
            main_text_labels: Labels of the main-text items to export. When
                ``None`` (the default), ``["title", "subtitle-level-1",
                "paragraph", "caption", "table", "figure"]`` is used.
            page_tagging: Forwarded as ``page_tagging``.
            location_tagging: Forwarded as ``location_tagging``.
            location_dimensions: Forwarded as ``location_dimensions``
                (presumably the grid onto which locations are quantised --
                confirm against docling-core).
            add_new_line: Forwarded as ``add_new_line``.

        Returns:
            The result of ``self.output.export_to_document_tokens``.
        """
        # Build the default per call instead of using a list literal in the
        # signature: a mutable default is shared by every call, so a mutation
        # downstream would leak into later calls.
        if main_text_labels is None:
            main_text_labels = [
                "title",
                "subtitle-level-1",
                "paragraph",
                "caption",
                "table",
                "figure",
            ]
        return self.output.export_to_document_tokens(
            delim=delim,
            main_text_start=main_text_start,
            main_text_stop=main_text_stop,
            main_text_labels=main_text_labels,
            page_tagging=page_tagging,
            location_tagging=location_tagging,
            location_dimensions=location_dimensions,
            add_new_line=add_new_line,
        )
    def render_element_images(
        self, element_types: Tuple[PageElement] = (FigureElement,)
--- a/docling/utils/export.py
+++ b/docling/utils/export.py
@ -163,8 +163,12 @@ def generate_multimodal_pages(
        content_md = doc.export_to_markdown(
            main_text_start=start_ix, main_text_stop=end_ix
        )
        # No page-tagging since we only process one page at a time
        content_dt = doc.export_to_document_tokens(
            main_text_start=start_ix, main_text_stop=end_ix, page_tagging=False
        )
-        return content_text, content_md, page_cells, page_segments, page
+        return content_text, content_md, content_dt, page_cells, page_segments, page
    for ix, orig_item in enumerate(doc.main_text):
--- a/examples/batch_convert.py
+++ b/examples/batch_convert.py
@ -30,9 +30,18 @@ def export_documents(
            with (output_dir / f"{doc_filename}.json").open("w") as fp:
                fp.write(json.dumps(conv_res.render_as_dict()))
            # Export Text format:
            with (output_dir / f"{doc_filename}.txt").open("w") as fp:
                fp.write(conv_res.render_as_text())
            # Export Markdown format:
            with (output_dir / f"{doc_filename}.md").open("w") as fp:
                fp.write(conv_res.render_as_markdown())
            # Export Document Tags format:
            with (output_dir / f"{doc_filename}.doctags").open("w") as fp:
                fp.write(conv_res.render_as_doctags())
        elif conv_res.status == ConversionStatus.PARTIAL_SUCCESS:
            _log.info(
                f"Document {conv_res.input.file} was partially converted with the following errors:"
--- a/examples/custom_convert.py
+++ b/examples/custom_convert.py
@ -31,9 +31,18 @@ def export_documents(
            with (output_dir / f"{doc_filename}.json").open("w") as fp:
                fp.write(json.dumps(conv_res.render_as_dict()))
            # Export Text format:
            with (output_dir / f"{doc_filename}.txt").open("w") as fp:
                fp.write(conv_res.render_as_text())
            # Export Markdown format:
            with (output_dir / f"{doc_filename}.md").open("w") as fp:
                fp.write(conv_res.render_as_markdown())
            # Export Document Tags format:
            with (output_dir / f"{doc_filename}.doctags").open("w") as fp:
                fp.write(conv_res.render_as_doctags())
        else:
            _log.info(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1
--- a/examples/export_multimodal.py
+++ b/examples/export_multimodal.py
@ -51,6 +51,7 @@ def main():
        for (
            content_text,
            content_md,
            content_dt,
            page_cells,
            page_segments,
            page,
@ -71,6 +72,7 @@ def main():
                    "cells": page_cells,
                    "contents": content_text,
                    "contents_md": content_md,
                    "contents_dt": content_dt,
                    "segments": page_segments,
                    "extra": {
                        "page_num": page.page_no + 1,
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -23,8 +23,8 @@ packages = [{include = "docling"}]
 [tool.poetry.dependencies]
 python = "^3.10"
 pydantic = "^2.0.0"
-docling-core = "^1.1.3"
+docling-core = "^1.2.0"
-docling-ibm-models = "^1.1.3"
+docling-ibm-models = "^1.1.7"
 deepsearch-glm = "^0.21.0"
 filetype = "^1.2.0"
 pypdfium2 = "^4.30.0"