feat: Add pipeline timings and toggle visualization, establish debug settings (#183)

* Add settings to turn visualization on or off

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add profiling code to all models

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Refactor and fix profiling codes

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Visualization codes output PNG to debug dir

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fixes for time logging

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Optimize imports

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update lockfile

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add start_timestamps to ProfilingItem

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2024-10-30 15:04:19 +01:00
committed by GitHub
parent 94a5290789
commit 2a2c65bf4f
23 changed files with 998 additions and 771 deletions

View File

@@ -8,8 +8,11 @@ import pandas as pd
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
@@ -102,7 +105,9 @@ class TesseractOcrCliModel(BaseOcrModel):
return df_filtered
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
if not self.enabled:
yield from page_batch
@@ -113,62 +118,67 @@ class TesseractOcrCliModel(BaseOcrModel):
if not page._backend.is_valid():
yield page
else:
ocr_rects = self.get_ocr_rects(page)
with TimeRecorder(conv_res, "ocr"):
all_ocr_cells = []
for ocr_rect in ocr_rects:
# Skip zero area boxes
if ocr_rect.area() == 0:
continue
high_res_image = page._backend.get_page_image(
scale=self.scale, cropbox=ocr_rect
ocr_rects = self.get_ocr_rects(page)
all_ocr_cells = []
for ocr_rect in ocr_rects:
# Skip zero area boxes
if ocr_rect.area() == 0:
continue
high_res_image = page._backend.get_page_image(
scale=self.scale, cropbox=ocr_rect
)
with tempfile.NamedTemporaryFile(
suffix=".png", mode="w"
) as image_file:
fname = image_file.name
high_res_image.save(fname)
df = self._run_tesseract(fname)
# _log.info(df)
# Print relevant columns (bounding box and text)
for ix, row in df.iterrows():
text = row["text"]
conf = row["conf"]
l = float(row["left"])
b = float(row["top"])
w = float(row["width"])
h = float(row["height"])
t = b + h
r = l + w
cell = OcrCell(
id=ix,
text=text,
confidence=conf / 100.0,
bbox=BoundingBox.from_tuple(
coord=(
(l / self.scale) + ocr_rect.l,
(b / self.scale) + ocr_rect.t,
(r / self.scale) + ocr_rect.l,
(t / self.scale) + ocr_rect.t,
),
origin=CoordOrigin.TOPLEFT,
),
)
all_ocr_cells.append(cell)
## Remove OCR cells which overlap with programmatic cells.
filtered_ocr_cells = self.filter_ocr_cells(
all_ocr_cells, page.cells
)
with tempfile.NamedTemporaryFile(
suffix=".png", mode="w"
) as image_file:
fname = image_file.name
high_res_image.save(fname)
df = self._run_tesseract(fname)
# _log.info(df)
# Print relevant columns (bounding box and text)
for ix, row in df.iterrows():
text = row["text"]
conf = row["conf"]
l = float(row["left"])
b = float(row["top"])
w = float(row["width"])
h = float(row["height"])
t = b + h
r = l + w
cell = OcrCell(
id=ix,
text=text,
confidence=conf / 100.0,
bbox=BoundingBox.from_tuple(
coord=(
(l / self.scale) + ocr_rect.l,
(b / self.scale) + ocr_rect.t,
(r / self.scale) + ocr_rect.l,
(t / self.scale) + ocr_rect.t,
),
origin=CoordOrigin.TOPLEFT,
),
)
all_ocr_cells.append(cell)
## Remove OCR cells which overlap with programmatic cells.
filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
page.cells.extend(filtered_ocr_cells)
page.cells.extend(filtered_ocr_cells)
# DEBUG code:
# self.draw_ocr_rects_and_cells(page, ocr_rects)
if settings.debug.visualize_ocr:
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
yield page