Docling/docling/models/easyocr_model.py
2024-07-15 09:42:42 +02:00

78 lines
2.2 KiB
Python

import copy
import logging
import random
from typing import Iterable
import numpy
from PIL import ImageDraw
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
# Module-level logger, named after this module via ``__name__``.
_log = logging.getLogger(__name__)
class EasyOcrModel:
    """Pipeline stage that fills ``page.cells`` with text recognized by EasyOCR.

    The stage is driven by a config mapping that must provide ``"enabled"``
    and, when that flag is truthy, ``"lang"`` (handed to ``easyocr.Reader``).
    """

    def __init__(self, config):
        """Store the configuration and, if enabled, build the EasyOCR reader.

        Args:
            config: Mapping with an ``"enabled"`` entry; ``"lang"`` is only
                read when ``"enabled"`` is truthy.

        Raises:
            KeyError: If a required config entry is missing.
        """
        self.config = config
        self.enabled = config["enabled"]
        # Render multiplier relative to the 72 dpi base, i.e. 216 dpi.
        self.scale = 3
        # Always defined, so a disabled model has no missing attribute.
        self.reader = None

        if self.enabled:
            # Imported here so easyocr is only needed when OCR is enabled.
            import easyocr

            self.reader = easyocr.Reader(config["lang"])

    @staticmethod
    def _quad_to_ltrb(quad, scale):
        """Return the axis-aligned box enclosing *quad*, divided by *scale*.

        Args:
            quad: Sequence of ``(x, y)`` corner points.
            scale: Divisor applied to every coordinate.

        Returns:
            A ``(left, top, right, bottom)`` tuple.
        """
        # Taking min/max over every corner keeps the box correct when the
        # quadrilateral is rotated; for an axis-aligned rectangle this equals
        # using corners 0 and 2 directly.
        xs = [point[0] for point in quad]
        ys = [point[1] for point in quad]
        return (
            min(xs) / scale,
            min(ys) / scale,
            max(xs) / scale,
            max(ys) / scale,
        )

    @staticmethod
    def _show_cells(page, cells):
        """Debug aid: show a copy of ``page.image`` with each cell outlined."""
        image = copy.deepcopy(page.image)
        draw = ImageDraw.Draw(image)
        cell_color = (
            random.randint(30, 140),
            random.randint(30, 140),
            random.randint(30, 140),
        )
        for tc in cells:
            x0, y0, x1, y1 = tc.bbox.as_tuple()
            draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
        image.show()

    def __call__(self, page_batch: Iterable["Page"]) -> Iterable["Page"]:
        """Yield every page of *page_batch*, running OCR on it when enabled.

        When the model is disabled the pages are passed through untouched.
        Otherwise each page is rendered at ``self.scale``, read with EasyOCR,
        and its ``cells`` attribute is replaced by the recognized cells.

        Args:
            page_batch: Pages to process.

        Yields:
            The same page objects, in order.
        """
        if not self.enabled:
            yield from page_batch
            return

        for page in page_batch:
            high_res_image = page._backend.get_page_image(scale=self.scale)
            im = numpy.array(high_res_image)
            # Each result entry is indexed as (corner points, text, confidence).
            result = self.reader.readtext(im)
            # The rendered image and its array are not needed past this point.
            del high_res_image
            del im

            cells = [
                OcrCell(
                    id=ix,
                    text=line[1],
                    confidence=line[2],
                    bbox=BoundingBox.from_tuple(
                        # Coordinates are divided by the render scale.
                        coord=self._quad_to_ltrb(line[0], self.scale),
                        origin=CoordOrigin.TOPLEFT,
                    ),
                )
                for ix, line in enumerate(result)
            ]

            page.cells = cells  # For now, just overwrites all digital cells.

            # Debugging: call self._show_cells(page, cells) here to inspect
            # the detected boxes.

            yield page