Docling/docling/utils/export.py
Christoph Auer 3960b199d6
feat: Add DoclingParseV4 backend, using high-level docling-parse API (#905)
* Add DoclingParseV3 backend implementation

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Use docling-core with docling-parse types

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fixes and test updates

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fix streams

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fix streams

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Reset tests

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* update test cases

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* update test units

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add back DoclingParse v1 backend, pipeline options

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update locks

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fix: update docling-core to 2.22.0

Update dependency library docling-core to latest release 2.22.0
Fix regression tests and ground truth files

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>

* Ground-truth files updated

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update tests, use TextCell.from_ocr property

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Text fixes, new test data

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Rename docling backend to v4

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Test all backends, fixes

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Reset all tests to use docling-parse v1 for now

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fixes for DPv4 backend init, better test coverage

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* test_input_doc use default backend

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
2025-03-18 10:38:19 +01:00

149 lines
4.6 KiB
Python

import logging
from typing import Any, Dict, Iterable, List, Tuple, Union
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import TextCell
from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
from docling.datamodel.document import ConversionResult, Page
_log = logging.getLogger(__name__)
def generate_multimodal_pages(
    doc_result: ConversionResult,
) -> Iterable[
    Tuple[str, str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]
]:
    """Yield per-page multimodal export data for a converted document.

    Walks ``doc_result.legacy_document.main_text`` in order and groups
    consecutive items by the page they appear on. For each completed page a
    6-tuple is yielded:

    ``(content_text, content_md, content_dt, page_cells, page_segments, page)``

    - ``content_text``: plain text of the page's main-text items, joined with
      single spaces.
    - ``content_md``: markdown export of the page's slice of the legacy doc.
    - ``content_dt``: document-tokens export of the same slice (page tags
      suppressed, since only one page is exported at a time).
    - ``page_cells``: one dict per raw text cell on the page (text,
      normalized top-left-origin bbox, OCR flag and confidence).
    - ``page_segments``: one dict per layout item on the page (index in the
      main text, DocLayNet label, text, normalized bbox, and HTML data for
      tables).
    - ``page``: the corresponding :class:`Page` object.

    Yields nothing when the document has no ``main_text``.
    """
    # Map legacy item type names onto DocLayNet label names.
    label_to_doclaynet = {
        "title": "title",
        "table-of-contents": "document_index",
        "subtitle-level-1": "section_header",
        "checkbox-selected": "checkbox_selected",
        "checkbox-unselected": "checkbox_unselected",
        "caption": "caption",
        "page-header": "page_header",
        "page-footer": "page_footer",
        "footnote": "footnote",
        "table": "table",
        "formula": "formula",
        "list-item": "list_item",
        "code": "code",
        "figure": "picture",
        "picture": "picture",
        "reference": "text",
        "paragraph": "text",
        "text": "text",
    }

    # Accumulator state for the page currently being collected. The nested
    # helpers below read these via closure, so they always see the values as
    # of the moment _process_page() is called.
    content_text = ""
    page_no = 0  # 1-based page number being accumulated (0 = none started yet)
    start_ix = 0  # main-text index where the current page's slice starts
    end_ix = 0  # main-text index of the last item seen on the current page
    doc_items: List[Tuple[int, Union[BaseCell, BaseText]]] = []
    doc = doc_result.legacy_document

    def _process_page_segments(
        doc_items: List[Tuple[int, Union[BaseCell, BaseText]]], page: Page
    ) -> List[Dict[str, Any]]:
        """Convert one page's layout items into segment dicts."""
        segments = []
        for ix, item in doc_items:
            item_type = item.obj_type
            label = label_to_doclaynet.get(item_type, None)
            # Skip items with no known label, no provenance, or no page
            # geometry to normalize against.
            if label is None or item.prov is None or page.size is None:
                continue
            # Legacy provenance bboxes are bottom-left origin; convert to
            # top-left origin and normalize to the page size.
            bbox = BoundingBox.from_tuple(
                tuple(item.prov[0].bbox), origin=CoordOrigin.BOTTOMLEFT
            )
            new_bbox = bbox.to_top_left_origin(page_height=page.size.height).normalized(
                page_size=page.size
            )
            new_segment = {
                "index_in_doc": ix,
                "label": label,
                "text": item.text if item.text is not None else "",
                "bbox": new_bbox.as_tuple(),
                "data": [],
            }
            # Tables additionally carry their HTML serialization; the OTSL
            # sequence is left empty here.
            if isinstance(item, Table):
                table_html = item.export_to_html()
                new_segment["data"].append(
                    {
                        "html_seq": table_html,
                        "otsl_seq": "",
                    }
                )
            segments.append(new_segment)
        return segments

    def _process_page_cells(page: Page) -> List[dict]:
        """Convert one page's raw text cells into cell dicts."""
        cells: List[dict] = []
        # Without page geometry the bboxes cannot be normalized.
        if page.size is None:
            return cells
        for cell in page.cells:
            new_bbox = (
                cell.rect.to_bounding_box()
                .to_top_left_origin(page_height=page.size.height)
                .normalized(page_size=page.size)
            )
            is_ocr = cell.from_ocr
            ocr_confidence = cell.confidence
            cells.append(
                {
                    "text": cell.text,
                    "bbox": new_bbox.as_tuple(),
                    "ocr": is_ocr,
                    "ocr_confidence": ocr_confidence,
                }
            )
        return cells

    def _process_page():
        """Assemble the yield tuple for the page accumulated so far."""
        # page_no is 1-based; doc_result.pages is indexed from 0.
        # NOTE(review): assumes pages appear contiguously from page 1 so that
        # page_no - 1 indexes the right Page — confirm against callers.
        page_ix = page_no - 1
        page = doc_result.pages[page_ix]
        page_cells = _process_page_cells(page=page)
        page_segments = _process_page_segments(doc_items=doc_items, page=page)
        # Export only this page's slice of the main text.
        content_md = doc.export_to_markdown(
            main_text_start=start_ix, main_text_stop=end_ix
        )
        # No page-tagging since we only do 1 page at the time
        content_dt = doc.export_to_document_tokens(
            main_text_start=start_ix, main_text_stop=end_ix, add_page_index=False
        )
        return content_text, content_md, content_dt, page_cells, page_segments, page

    if doc.main_text is None:
        return
    for ix, orig_item in enumerate(doc.main_text):
        # Main-text entries may be references into other doc collections;
        # resolve them to the concrete item first.
        item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item
        if item is None or item.prov is None or len(item.prov) == 0:
            _log.debug(f"Skipping item {orig_item}")
            continue

        item_page = item.prov[0].page

        # Page is complete
        if page_no > 0 and item_page > page_no:
            # Emit the finished page, then reset the accumulators for the
            # page this item belongs to.
            yield _process_page()

            start_ix = ix
            doc_items = []
            content_text = ""

        page_no = item_page
        end_ix = ix
        doc_items.append((ix, item))
        if item.text is not None and item.text != "":
            content_text += item.text + " "

    # Flush the final page (loop only yields when a later page is seen).
    if len(doc_items) > 0:
        yield _process_page()