
* Upgraded Layout Postprocessing, sending old code back to ERZ Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Implement hierarchical cluster layout processing Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Pass nested cluster processing through full pipeline Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Pass nested clusters through GLM as payload Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Move to_docling_document from ds-glm to this repo Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Clean up imports again Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * feat(Accelerator): Introduce options to control the num_threads and device from API, envvars, CLI. - Introduce the AcceleratorOptions, AcceleratorDevice and use them to set the device where the models run. - Introduce the accelerator_utils with a function to decide the device and resolve the AUTO setting. - Refactor how the docling-ibm-models are called to match the new init signature of models. - Translate the accelerator options to the specific inputs for third-party models. - Extend the docling CLI with parameters to set the num_threads and device. - Add new unit tests. - Write a new example of how to use the accelerator options. * fix: Improve the pydantic objects in the pipeline_options and imports. Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * fix: TableStructureModel: Refactor the artifacts path to use the new structure for fast/accurate model Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * Updated test ground-truth Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Updated test ground-truth (again), bugfix for empty layout Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * fix: Do proper check to set the device in EasyOCR, RapidOCR.
Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * fix: Correct the way to set GPU for EasyOCR, RapidOCR Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * fix: Ocr AcceleratorDevice Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> * Merge pull request #556 from DS4SD/cau/layout-processing-improvement feat: layout processing improvements and bugfixes * Update lockfile Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update tests Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update HF model ref, reset test generate Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Repin to release package versions Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Many layout processing improvements, add document index type Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update pinnings to docling-core Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update test GT Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix table box snapping Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fixes for cluster pre-ordering Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Introduce OCR confidence, propagate to orphan in post-processing Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix form and key value area groups Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Adjust confidence in EasyOcr Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Roll back CLI changes from main Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update test GT Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update docling-core pinning Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Annoying fixes for historical python versions Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Updated test GT for legacy Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Comment cleanup Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com>
Co-authored-by: Nikos Livathinos <nli@zurich.ibm.com>
185 lines
7.4 KiB
Python
185 lines
7.4 KiB
Python
import logging
|
|
import re
|
|
from typing import Iterable, List
|
|
|
|
from pydantic import BaseModel
|
|
|
|
from docling.datamodel.base_models import (
|
|
AssembledUnit,
|
|
ContainerElement,
|
|
FigureElement,
|
|
Page,
|
|
PageElement,
|
|
Table,
|
|
TextElement,
|
|
)
|
|
from docling.datamodel.document import ConversionResult
|
|
from docling.models.base_model import BasePageModel
|
|
from docling.models.layout_model import LayoutModel
|
|
from docling.utils.profiling import TimeRecorder
|
|
|
|
_log = logging.getLogger(__name__)
|
|
|
|
|
|
class PageAssembleOptions(BaseModel):
    """Configuration for the page assembly step."""

    # When False (the default), PageAssembleModel resets the page's image
    # cache to an empty dict once the page has been assembled.
    keep_images: bool = False
|
|
|
|
|
|
class PageAssembleModel(BasePageModel):
    """Page model that turns layout clusters into assembled page elements.

    For every page with a valid backend, the clusters found in
    ``page.predictions.layout`` are converted into text, table, figure,
    formula and container elements, and the result is stored on
    ``page.assembled`` as an ``AssembledUnit``.
    """

    def __init__(self, options: PageAssembleOptions):
        """Store the assembly options.

        Args:
            options: Settings for this model. ``options.keep_images`` decides
                whether the page image cache is kept after assembly.
        """
        self.options = options

    def sanitize_text(self, lines: List[str]) -> str:
        """Merge the text lines of one cluster into a single string.

        Consecutive lines are joined as follows:

        * A line that ends with ``-`` whose last word, and the first word of
          the following line, are both alphanumeric is treated as a
          hyphenated line break: the trailing hyphen is dropped and the two
          lines are joined without a space.
        * A line that ends with ``-`` but does not satisfy that condition is
          joined to the following line unchanged (no space is inserted).
        * Every other line (except the last) gets a single space appended.

        The input list is never modified; merging happens on a copy.

        Args:
            lines: The text lines in reading order.

        Returns:
            For zero or one line, the lines joined with a space (i.e. the
            line itself or an empty string). Otherwise the merged text with
            leading and trailing whitespace stripped.
        """
        if len(lines) <= 1:
            return " ".join(lines)

        # Work on a copy so the caller's list is left untouched.
        merged = list(lines)

        for ix, line in enumerate(merged[1:]):
            # ``line`` is the element at position ix + 1, so ``prev_line`` is
            # the one directly before it.
            prev_line = merged[ix]

            if prev_line.endswith("-"):
                prev_words = re.findall(r"\b[\w]+\b", prev_line)
                line_words = re.findall(r"\b[\w]+\b", line)

                if (
                    prev_words
                    and line_words
                    and prev_words[-1].isalnum()
                    and line_words[0].isalnum()
                ):
                    # Hyphenated word split across lines: drop the hyphen.
                    merged[ix] = prev_line[:-1]
            else:
                merged[ix] += " "

        # Strip any leading or trailing whitespace.
        return "".join(merged).strip()

    def _cluster_text(self, cluster) -> str:
        """Return the sanitized text of all non-blank cells of ``cluster``."""
        textlines = [
            # "\x02" is replaced by a regular hyphen before merging.
            cell.text.replace("\x02", "-").strip()
            for cell in cluster.cells
            if cell.text.strip()
        ]
        return self.sanitize_text(textlines)

    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        """Assemble the elements of every page in ``page_batch``.

        Pages whose backend is not valid are yielded unchanged. For all other
        pages, ``page.assembled`` is set, the image cache is emptied unless
        ``options.keep_images`` is set, and the page backend is unloaded.

        Args:
            conv_res: Conversion result passed to ``TimeRecorder`` for the
                ``"page_assemble"`` timing scope.
            page_batch: The pages to process.

        Yields:
            Each page of ``page_batch``, in order.

        Raises:
            AssertionError: If a page has no backend, or a valid page has no
                layout prediction.
        """
        for page in page_batch:
            assert page._backend is not None
            if not page._backend.is_valid():
                # Nothing to assemble for an invalid page; pass it through.
                yield page
                continue

            with TimeRecorder(conv_res, "page_assemble"):
                assert page.predictions.layout is not None

                # ``elements`` collects everything; each element is also
                # filed under either ``headers`` or ``body``.
                elements: List[PageElement] = []
                headers: List[PageElement] = []
                body: List[PageElement] = []

                for cluster in page.predictions.layout.clusters:
                    if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
                        text_el = TextElement(
                            label=cluster.label,
                            id=cluster.id,
                            text=self._cluster_text(cluster),
                            page_no=page.page_no,
                            cluster=cluster,
                        )
                        elements.append(text_el)

                        if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
                            headers.append(text_el)
                        else:
                            body.append(text_el)
                    elif cluster.label in LayoutModel.TABLE_LABELS:
                        tbl = None
                        if page.predictions.tablestructure:
                            tbl = page.predictions.tablestructure.table_map.get(
                                cluster.id, None
                            )
                        if not tbl:
                            # Fallback: add the table without structure if
                            # no prediction is present.
                            tbl = Table(
                                label=cluster.label,
                                id=cluster.id,
                                text="",
                                otsl_seq=[],
                                table_cells=[],
                                cluster=cluster,
                                page_no=page.page_no,
                            )

                        elements.append(tbl)
                        body.append(tbl)
                    elif cluster.label == LayoutModel.FIGURE_LABEL:
                        fig = None
                        if page.predictions.figures_classification:
                            fig = page.predictions.figures_classification.figure_map.get(
                                cluster.id, None
                            )
                        if not fig:
                            # Fallback: add the figure without classification
                            # if no prediction is present.
                            fig = FigureElement(
                                label=cluster.label,
                                id=cluster.id,
                                text="",
                                data=None,
                                cluster=cluster,
                                page_no=page.page_no,
                            )
                        elements.append(fig)
                        body.append(fig)
                    elif cluster.label == LayoutModel.FORMULA_LABEL:
                        equation = None
                        if page.predictions.equations_prediction:
                            equation = page.predictions.equations_prediction.equation_map.get(
                                cluster.id, None
                            )
                        if not equation:
                            # Fallback: build a plain text element from the
                            # cluster cells if no prediction is present.
                            equation = TextElement(
                                label=cluster.label,
                                id=cluster.id,
                                cluster=cluster,
                                page_no=page.page_no,
                                text=self._cluster_text(cluster),
                            )
                        elements.append(equation)
                        body.append(equation)
                    elif cluster.label in LayoutModel.CONTAINER_LABELS:
                        container_el = ContainerElement(
                            label=cluster.label,
                            id=cluster.id,
                            page_no=page.page_no,
                            cluster=cluster,
                        )
                        elements.append(container_el)
                        body.append(container_el)

                page.assembled = AssembledUnit(
                    elements=elements, headers=headers, body=body
                )

            # Remove page images (can be disabled)
            if not self.options.keep_images:
                page._image_cache = {}

            # Unload backend
            page._backend.unload()

            yield page
|