feat: Add adaptive OCR, factor out treatment of OCR areas and cell filtering (#38)

* Introduce adaptive OCR

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Factor out BaseOcrModel, add docling-parse backend tests, fixes

* Make easyocr default dep

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-08-20 15:28:03 +02:00 committed by GitHub
parent 47b8ad917e
commit e94d317c02
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
13 changed files with 285 additions and 83 deletions

View File

@ -18,6 +18,10 @@ class PdfPageBackend(ABC):
def get_text_cells(self) -> Iterable["Cell"]: def get_text_cells(self) -> Iterable["Cell"]:
pass pass
@abstractmethod
def get_bitmap_rects(self, scale: int = 1) -> Iterable["BoundingBox"]:
    """Return the bounding boxes of the bitmap regions found on this page.

    Args:
        scale: Scaling factor supplied by the caller.
            NOTE(review): the concrete backends added in this change apply
            it to the returned boxes -- confirm that every implementation
            is expected to do the same.

    Returns:
        An iterable of ``BoundingBox`` objects; must be provided by each
        concrete page backend.
    """
    pass
@abstractmethod @abstractmethod
def get_page_image( def get_page_image(
self, scale: int = 1, cropbox: Optional["BoundingBox"] = None self, scale: int = 1, cropbox: Optional["BoundingBox"] = None

View File

@ -3,7 +3,7 @@ import random
import time import time
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import Iterable, List, Optional, Union from typing import Iterable, Optional, Union
import pypdfium2 as pdfium import pypdfium2 as pdfium
from docling_parse.docling_parse import pdf_parser from docling_parse.docling_parse import pdf_parser
@ -43,7 +43,7 @@ class DoclingParsePageBackend(PdfPageBackend):
r=x1 * scale * page_size.width / parser_width, r=x1 * scale * page_size.width / parser_width,
t=y1 * scale * page_size.height / parser_height, t=y1 * scale * page_size.height / parser_height,
coord_origin=CoordOrigin.BOTTOMLEFT, coord_origin=CoordOrigin.BOTTOMLEFT,
).to_top_left_origin(page_size.height * scale) ).to_top_left_origin(page_height=page_size.height * scale)
overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area() overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
@ -66,6 +66,12 @@ class DoclingParsePageBackend(PdfPageBackend):
for i in range(len(self._dpage["cells"])): for i in range(len(self._dpage["cells"])):
rect = self._dpage["cells"][i]["box"]["device"] rect = self._dpage["cells"][i]["box"]["device"]
x0, y0, x1, y1 = rect x0, y0, x1, y1 = rect
if x1 < x0:
x0, x1 = x1, x0
if y1 < y0:
y0, y1 = y1, y0
text_piece = self._dpage["cells"][i]["content"]["rnormalized"] text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
cells.append( cells.append(
Cell( Cell(
@ -108,6 +114,20 @@ class DoclingParsePageBackend(PdfPageBackend):
return cells return cells
def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
    """Yield the bounding boxes of the bitmap images listed for this page.

    Every entry of ``self._dpage["images"]`` contributes its ``"box"``, which
    is read as a bottom-left-origin tuple and converted to top-left origin
    using the page height.

    Args:
        scale: Factor applied to each box that is yielded.

    Yields:
        Top-left-origin boxes whose unscaled area exceeds 32 * 32; smaller
        bitmaps are skipped.
    """
    AREA_THRESHOLD = 32 * 32
    # The page height does not change between bitmaps, so look it up once.
    page_height = self.get_size().height

    for bitmap in self._dpage["images"]:
        cropbox = BoundingBox.from_tuple(
            bitmap["box"], origin=CoordOrigin.BOTTOMLEFT
        ).to_top_left_origin(page_height=page_height)

        # The size filter is evaluated before scaling, i.e. on the box as
        # reported by the parser.
        if cropbox.area() > AREA_THRESHOLD:
            yield cropbox.scaled(scale=scale)
def get_page_image( def get_page_image(
self, scale: int = 1, cropbox: Optional[BoundingBox] = None self, scale: int = 1, cropbox: Optional[BoundingBox] = None
) -> Image.Image: ) -> Image.Image:
@ -173,7 +193,7 @@ class DoclingParseDocumentBackend(PdfDocumentBackend):
def page_count(self) -> int: def page_count(self) -> int:
return len(self._parser_doc["pages"]) return len(self._parser_doc["pages"])
def load_page(self, page_no: int) -> PdfPage: def load_page(self, page_no: int) -> DoclingParsePageBackend:
return DoclingParsePageBackend( return DoclingParsePageBackend(
self._pdoc[page_no], self._parser_doc["pages"][page_no] self._pdoc[page_no], self._parser_doc["pages"][page_no]
) )

View File

@ -4,6 +4,7 @@ from pathlib import Path
from typing import Iterable, List, Optional, Union from typing import Iterable, List, Optional, Union
import pypdfium2 as pdfium import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c
from PIL import Image, ImageDraw from PIL import Image, ImageDraw
from pypdfium2 import PdfPage from pypdfium2 import PdfPage
@ -17,6 +18,19 @@ class PyPdfiumPageBackend(PdfPageBackend):
self._ppage = page_obj self._ppage = page_obj
self.text_page = None self.text_page = None
def get_bitmap_rects(self, scale: int = 1) -> Iterable[BoundingBox]:
    """Yield the bounding boxes of the image objects on this page.

    Image page objects are requested from the underlying pdfium page; each
    object's position is read as a bottom-left-origin tuple and converted to
    top-left origin using the page height.

    Args:
        scale: Factor applied to each box that is yielded.

    Yields:
        Top-left-origin boxes whose unscaled area exceeds 32 * 32.
    """
    min_area = 32 * 32
    image_objects = self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE])

    for image_obj in image_objects:
        bottom_left_box = BoundingBox.from_tuple(
            image_obj.get_pos(), origin=CoordOrigin.BOTTOMLEFT
        )
        page_height = self.get_size().height
        top_left_box = bottom_left_box.to_top_left_origin(page_height=page_height)

        # Skip tiny bitmaps; the size check uses the unscaled box.
        if not top_left_box.area() > min_area:
            continue

        yield top_left_box.scaled(scale=scale)
def get_text_in_rect(self, bbox: BoundingBox) -> str: def get_text_in_rect(self, bbox: BoundingBox) -> str:
if not self.text_page: if not self.text_page:
self.text_page = self._ppage.get_textpage() self.text_page = self._ppage.get_textpage()
@ -208,7 +222,7 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
def page_count(self) -> int: def page_count(self) -> int:
return len(self._pdoc) return len(self._pdoc)
def load_page(self, page_no: int) -> PdfPage: def load_page(self, page_no: int) -> PyPdfiumPageBackend:
return PyPdfiumPageBackend(self._pdoc[page_no]) return PyPdfiumPageBackend(self._pdoc[page_no])
def is_valid(self) -> bool: def is_valid(self) -> bool:

View File

@ -68,13 +68,21 @@ class BoundingBox(BaseModel):
@classmethod @classmethod
def from_tuple(cls, coord: Tuple[float], origin: CoordOrigin): def from_tuple(cls, coord: Tuple[float], origin: CoordOrigin):
if origin == CoordOrigin.TOPLEFT: if origin == CoordOrigin.TOPLEFT:
return BoundingBox( l, t, r, b = coord[0], coord[1], coord[2], coord[3]
l=coord[0], t=coord[1], r=coord[2], b=coord[3], coord_origin=origin if r < l:
) l, r = r, l
if b < t:
b, t = t, b
return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
elif origin == CoordOrigin.BOTTOMLEFT: elif origin == CoordOrigin.BOTTOMLEFT:
return BoundingBox( l, b, r, t = coord[0], coord[1], coord[2], coord[3]
l=coord[0], b=coord[1], r=coord[2], t=coord[3], coord_origin=origin if r < l:
) l, r = r, l
if b > t:
b, t = t, b
return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
def area(self) -> float: def area(self) -> float:
return (self.r - self.l) * (self.b - self.t) return (self.r - self.l) * (self.b - self.t)
@ -280,7 +288,7 @@ class TableStructureOptions(BaseModel):
class PipelineOptions(BaseModel): class PipelineOptions(BaseModel):
do_table_structure: bool = True # True: perform table structure extraction do_table_structure: bool = True # True: perform table structure extraction
do_ocr: bool = False # True: perform OCR, replace programmatic PDF text do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
table_structure_options: TableStructureOptions = TableStructureOptions() table_structure_options: TableStructureOptions = TableStructureOptions()

View File

@ -35,8 +35,6 @@ _log = logging.getLogger(__name__)
class DocumentConverter: class DocumentConverter:
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
_table_model_path = "model_artifacts/tableformer"
_default_download_filename = "file.pdf" _default_download_filename = "file.pdf"
def __init__( def __init__(

View File

@ -0,0 +1,124 @@
import copy
import logging
from abc import abstractmethod
from typing import Iterable, List, Tuple
import numpy
import numpy as np
from PIL import Image, ImageDraw
from rtree import index
from scipy.ndimage import find_objects, label
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
_log = logging.getLogger(__name__)
class BaseOcrModel:
    """Common base for OCR models.

    Bundles the engine-independent helpers: choosing which page regions to
    OCR, dropping OCR cells that collide with programmatic cells, and a debug
    visualisation. Concrete models implement ``__call__``.
    """

    def __init__(self, config):
        """Keep the configuration and cache its ``"enabled"`` entry.

        Args:
            config: Subscriptable configuration object; must provide the key
                ``"enabled"``.

        Raises:
            KeyError: If a dict ``config`` has no ``"enabled"`` entry.
        """
        self.config = config
        self.enabled = config["enabled"]

    def get_ocr_rects(self, page: Page) -> List[BoundingBox]:
        """Compute the rectangles of ``page`` that should be sent to OCR.

        The bitmap rectangles reported by the page backend are rasterised
        into a binary mask of the page. If more than 75% of the page area is
        covered, one rectangle spanning the whole page is returned; otherwise
        one enclosing rectangle per connected bitmap region is returned.

        Args:
            page: Page providing ``size`` and ``_backend.get_bitmap_rects()``.

        Returns:
            A list of top-left-origin bounding boxes (possibly empty).
        """
        BITMAP_COVERAGE_THRESHOLD = 0.75

        def find_ocr_rects(size, bitmap_rects) -> Tuple[float, List[BoundingBox]]:
            """Return (covered page fraction, enclosing boxes of bitmap regions)."""
            image = Image.new(
                "1", (round(size.width), round(size.height))
            )  # '1' mode is binary

            # Draw all bitmap rects into the binary image
            draw = ImageDraw.Draw(image)
            for rect in bitmap_rects:
                x0, y0, x1, y1 = (round(coord) for coord in rect.as_tuple())
                draw.rectangle([(x0, y0), (x1, y1)], fill=1)

            np_image = np.array(image)
            covered = np_image > 0

            # Label the connected components of the covered (non-zero) pixels
            labeled_image, _num_features = label(covered)

            # One pair of slices (rows, columns) per connected component.
            # Slice stops are exclusive, hence ``stop - 1`` for the last
            # row/column that still belongs to the component.
            bounding_boxes = [
                BoundingBox(
                    l=slc[1].start,
                    t=slc[0].start,
                    r=slc[1].stop - 1,
                    b=slc[0].stop - 1,
                    coord_origin=CoordOrigin.TOPLEFT,
                )
                for slc in find_objects(labeled_image)
            ]

            # Fraction of the page area covered by bitmaps
            area_frac = np.sum(covered) / (size.width * size.height)

            return (area_frac, bounding_boxes)

        bitmap_rects = page._backend.get_bitmap_rects()
        coverage, ocr_rects = find_ocr_rects(page.size, bitmap_rects)

        # Full-page rectangle if the page is sufficiently covered with bitmaps
        if coverage > BITMAP_COVERAGE_THRESHOLD:
            return [
                BoundingBox(
                    l=0,
                    t=0,
                    r=page.size.width,
                    b=page.size.height,
                    coord_origin=CoordOrigin.TOPLEFT,
                )
            ]

        # Otherwise the individual rectangles. This branch also handles a
        # coverage exactly equal to the threshold, which must not fall
        # through and return None.
        return ocr_rects

    def filter_ocr_cells(self, ocr_cells, programmatic_cells):
        """Drop every OCR cell that intersects an existing programmatic cell.

        Args:
            ocr_cells: Cells produced by OCR; each must expose ``bbox.as_tuple()``.
            programmatic_cells: Cells already present on the page; each must
                expose ``bbox.as_tuple()``.

        Returns:
            A new list with the OCR cells whose bounding box intersects no
            programmatic cell's bounding box, in their original order.
        """
        # Create R-tree index for programmatic cells
        p = index.Property()
        p.dimension = 2
        idx = index.Index(properties=p)
        for i, cell in enumerate(programmatic_cells):
            idx.insert(i, cell.bbox.as_tuple())

        def is_overlapping_with_existing_cells(ocr_cell):
            # Any bounding-box intersection counts as overlap; this is a weak
            # criterion but it works.
            possible_matches_index = list(idx.intersection(ocr_cell.bbox.as_tuple()))
            return len(possible_matches_index) > 0

        return [
            ocr_cell
            for ocr_cell in ocr_cells
            if not is_overlapping_with_existing_cells(ocr_cell)
        ]

    def draw_ocr_rects_and_cells(self, page, ocr_rects):
        """Debug helper: display the page image with OCR regions and cells.

        Works on a deep copy of ``page.image`` and opens it with
        ``Image.show()``; the page itself is not modified.

        Args:
            page: Page providing ``image`` and ``cells``.
            ocr_rects: Rectangles that were selected for OCR.
        """
        image = copy.deepcopy(page.image)
        draw = ImageDraw.Draw(image, "RGBA")

        # Draw OCR rectangles as yellow filled rect
        shade_color = (255, 255, 0, 40)  # transparent yellow
        for rect in ocr_rects:
            x0, y0, x1, y1 = rect.as_tuple()
            draw.rectangle([(x0, y0), (x1, y1)], fill=shade_color, outline=None)

        # Draw OCR cells (magenta) and programmatic cells (red)
        for tc in page.cells:
            x0, y0, x1, y1 = tc.bbox.as_tuple()
            color = "magenta" if isinstance(tc, OcrCell) else "red"
            draw.rectangle([(x0, y0), (x1, y1)], outline=color)

        image.show()

    # NOTE: the class does not derive from abc.ABC, so this decorator only
    # documents the intent; instantiation of the base class is not blocked.
    @abstractmethod
    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        """Process a batch of pages; to be implemented by concrete OCR models."""
        pass

View File

@ -1,20 +1,18 @@
import copy
import logging import logging
import random
from typing import Iterable from typing import Iterable
import numpy import numpy
from PIL import ImageDraw
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
from docling.models.base_ocr_model import BaseOcrModel
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
class EasyOcrModel: class EasyOcrModel(BaseOcrModel):
def __init__(self, config): def __init__(self, config):
self.config = config super().__init__(config)
self.enabled = config["enabled"]
self.scale = 3 # multiplier for 72 dpi == 216 dpi. self.scale = 3 # multiplier for 72 dpi == 216 dpi.
if self.enabled: if self.enabled:
@ -29,8 +27,13 @@ class EasyOcrModel:
return return
for page in page_batch: for page in page_batch:
# rects = page._fpage. ocr_rects = self.get_ocr_rects(page)
high_res_image = page.get_image(scale=self.scale)
all_ocr_cells = []
for ocr_rect in ocr_rects:
high_res_image = page._backend.get_page_image(
scale=self.scale, cropbox=ocr_rect
)
im = numpy.array(high_res_image) im = numpy.array(high_res_image)
result = self.reader.readtext(im) result = self.reader.readtext(im)
@ -44,34 +47,24 @@ class EasyOcrModel:
confidence=line[2], confidence=line[2],
bbox=BoundingBox.from_tuple( bbox=BoundingBox.from_tuple(
coord=( coord=(
line[0][0][0] / self.scale, (line[0][0][0] / self.scale) + ocr_rect.l,
line[0][0][1] / self.scale, (line[0][0][1] / self.scale) + ocr_rect.t,
line[0][2][0] / self.scale, (line[0][2][0] / self.scale) + ocr_rect.l,
line[0][2][1] / self.scale, (line[0][2][1] / self.scale) + ocr_rect.t,
), ),
origin=CoordOrigin.TOPLEFT, origin=CoordOrigin.TOPLEFT,
), ),
) )
for ix, line in enumerate(result) for ix, line in enumerate(result)
] ]
all_ocr_cells.extend(cells)
page.cells = cells # For now, just overwrites all digital cells. ## Remove OCR cells which overlap with programmatic cells.
filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
page.cells.extend(filtered_ocr_cells)
# DEBUG code: # DEBUG code:
def draw_clusters_and_cells(): # self.draw_ocr_rects_and_cells(page, ocr_rects)
image = copy.deepcopy(page.image)
draw = ImageDraw.Draw(image)
cell_color = (
random.randint(30, 140),
random.randint(30, 140),
random.randint(30, 140),
)
for tc in cells:
x0, y0, x1, y1 = tc.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
image.show()
# draw_clusters_and_cells()
yield page yield page

View File

@ -1,5 +1,4 @@
import copy import copy
import random
from typing import Iterable, List from typing import Iterable, List
import numpy import numpy

View File

@ -1,4 +1,3 @@
from abc import abstractmethod
from pathlib import Path from pathlib import Path
from typing import Iterable from typing import Iterable

View File

@ -1,10 +1,8 @@
from pathlib import Path from pathlib import Path
from typing import Iterable
from docling.datamodel.base_models import Page, PipelineOptions from docling.datamodel.base_models import PipelineOptions
from docling.models.easyocr_model import EasyOcrModel from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel from docling.models.layout_model import LayoutModel
from docling.models.page_assemble_model import PageAssembleModel
from docling.models.table_structure_model import TableStructureModel from docling.models.table_structure_model import TableStructureModel
from docling.pipeline.base_model_pipeline import BaseModelPipeline from docling.pipeline.base_model_pipeline import BaseModelPipeline

48
poetry.lock generated
View File

@ -966,7 +966,7 @@ pgp = ["gpg"]
name = "easyocr" name = "easyocr"
version = "1.7.1" version = "1.7.1"
description = "End-to-End Multi-Lingual Optical Character Recognition (OCR) Solution" description = "End-to-End Multi-Lingual Optical Character Recognition (OCR) Solution"
optional = true optional = false
python-versions = "*" python-versions = "*"
files = [ files = [
{file = "easyocr-1.7.1-py3-none-any.whl", hash = "sha256:5b0a2e7cfdfc6c1ec99d9583663e570e4189dca6fbf373f074b21b8809e44d2b"}, {file = "easyocr-1.7.1-py3-none-any.whl", hash = "sha256:5b0a2e7cfdfc6c1ec99d9583663e570e4189dca6fbf373f074b21b8809e44d2b"},
@ -1336,7 +1336,7 @@ files = [
name = "imageio" name = "imageio"
version = "2.34.2" version = "2.34.2"
description = "Library for reading and writing a wide range of image, video, scientific, and volumetric data formats." description = "Library for reading and writing a wide range of image, video, scientific, and volumetric data formats."
optional = true optional = false
python-versions = ">=3.8" python-versions = ">=3.8"
files = [ files = [
{file = "imageio-2.34.2-py3-none-any.whl", hash = "sha256:a0bb27ec9d5bab36a9f4835e51b21d2cb099e1f78451441f94687ff3404b79f8"}, {file = "imageio-2.34.2-py3-none-any.whl", hash = "sha256:a0bb27ec9d5bab36a9f4835e51b21d2cb099e1f78451441f94687ff3404b79f8"},
@ -1760,7 +1760,7 @@ files = [
name = "lazy-loader" name = "lazy-loader"
version = "0.4" version = "0.4"
description = "Makes it easy to load subpackages and functions on demand." description = "Makes it easy to load subpackages and functions on demand."
optional = true optional = false
python-versions = ">=3.7" python-versions = ">=3.7"
files = [ files = [
{file = "lazy_loader-0.4-py3-none-any.whl", hash = "sha256:342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc"}, {file = "lazy_loader-0.4-py3-none-any.whl", hash = "sha256:342aa8e14d543a154047afb4ba8ef17f5563baad3fc610d7b15b213b0f119efc"},
@ -2389,7 +2389,7 @@ files = [
name = "ninja" name = "ninja"
version = "1.11.1.1" version = "1.11.1.1"
description = "Ninja is a small build system with a focus on speed" description = "Ninja is a small build system with a focus on speed"
optional = true optional = false
python-versions = "*" python-versions = "*"
files = [ files = [
{file = "ninja-1.11.1.1-py2.py3-none-macosx_10_9_universal2.macosx_10_9_x86_64.macosx_11_0_arm64.macosx_11_0_universal2.whl", hash = "sha256:376889c76d87b95b5719fdd61dd7db193aa7fd4432e5d52d2e44e4c497bdbbee"}, {file = "ninja-1.11.1.1-py2.py3-none-macosx_10_9_universal2.macosx_10_9_x86_64.macosx_11_0_arm64.macosx_11_0_universal2.whl", hash = "sha256:376889c76d87b95b5719fdd61dd7db193aa7fd4432e5d52d2e44e4c497bdbbee"},
@ -3132,7 +3132,7 @@ global = ["pybind11-global (==2.13.1)"]
name = "pyclipper" name = "pyclipper"
version = "1.3.0.post5" version = "1.3.0.post5"
description = "Cython wrapper for the C++ translation of the Angus Johnson's Clipper library (ver. 6.4.2)" description = "Cython wrapper for the C++ translation of the Angus Johnson's Clipper library (ver. 6.4.2)"
optional = true optional = false
python-versions = "*" python-versions = "*"
files = [ files = [
{file = "pyclipper-1.3.0.post5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3c45f99b8180dd4df4c86642657ca92b7d5289a5e3724521822e0f9461961fe2"}, {file = "pyclipper-1.3.0.post5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3c45f99b8180dd4df4c86642657ca92b7d5289a5e3724521822e0f9461961fe2"},
@ -3535,7 +3535,7 @@ testing = ["filelock"]
name = "python-bidi" name = "python-bidi"
version = "0.6.0" version = "0.6.0"
description = "Python Bidi layout wrapping the Rust crate unicode-bidi" description = "Python Bidi layout wrapping the Rust crate unicode-bidi"
optional = true optional = false
python-versions = "*" python-versions = "*"
files = [ files = [
{file = "python_bidi-0.6.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:032b16f70c5d4f48c8dc5a4ade071826a0fb64172e0435d49deba6ea66fc5d42"}, {file = "python_bidi-0.6.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:032b16f70c5d4f48c8dc5a4ade071826a0fb64172e0435d49deba6ea66fc5d42"},
@ -4155,11 +4155,30 @@ files = [
{file = "rpds_py-0.20.0.tar.gz", hash = "sha256:d72a210824facfdaf8768cf2d7ca25a042c30320b3020de2fa04640920d4e121"}, {file = "rpds_py-0.20.0.tar.gz", hash = "sha256:d72a210824facfdaf8768cf2d7ca25a042c30320b3020de2fa04640920d4e121"},
] ]
[[package]]
name = "rtree"
version = "1.3.0"
description = "R-Tree spatial index for Python GIS"
optional = false
python-versions = ">=3.8"
files = [
{file = "Rtree-1.3.0-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:80879d9db282a2273ca3a0d896c84583940e9777477727a277624ebfd424c517"},
{file = "Rtree-1.3.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:4328e9e421797c347e6eb08efbbade962fe3664ebd60c1dffe82c40911b1e125"},
{file = "Rtree-1.3.0-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:037130d3ce1fc029de81941ec416ba5546f66228380ba19bb41f2ea1294e8423"},
{file = "Rtree-1.3.0-py3-none-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:864a05d0c3b7ce6c5e34378b7ab630057603b79179368bc50624258bdf2ff631"},
{file = "Rtree-1.3.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ec2ed6d1635753dab966e68f592a9c4896f3f4ec6ad2b09b776d592eacd883a9"},
{file = "Rtree-1.3.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:b4485fb3e5c5e85b94a95f0a930a3848e040d2699cfb012940ba5b0130f1e09a"},
{file = "Rtree-1.3.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:7e2e9211f4fb404c06a08fd2cbebb03234214f73c51913bb371c3d9954e99cc9"},
{file = "Rtree-1.3.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:c021f4772b25cc24915da8073e553ded6fa8d0b317caa4202255ed26b2344c1c"},
{file = "Rtree-1.3.0-py3-none-win_amd64.whl", hash = "sha256:97f835801d24c10bbf02381abe5e327345c8296ec711dde7658792376abafc66"},
{file = "rtree-1.3.0.tar.gz", hash = "sha256:b36e9dd2dc60ffe3d02e367242d2c26f7281b00e1aaf0c39590442edaaadd916"},
]
[[package]] [[package]]
name = "scikit-image" name = "scikit-image"
version = "0.24.0" version = "0.24.0"
description = "Image processing in Python" description = "Image processing in Python"
optional = true optional = false
python-versions = ">=3.9" python-versions = ">=3.9"
files = [ files = [
{file = "scikit_image-0.24.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:cb3bc0264b6ab30b43c4179ee6156bc18b4861e78bb329dd8d16537b7bbf827a"}, {file = "scikit_image-0.24.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:cb3bc0264b6ab30b43c4179ee6156bc18b4861e78bb329dd8d16537b7bbf827a"},
@ -4207,7 +4226,7 @@ test = ["asv", "numpydoc (>=1.7)", "pooch (>=1.6.0)", "pytest (>=7.0)", "pytest-
name = "scipy" name = "scipy"
version = "1.14.0" version = "1.14.0"
description = "Fundamental algorithms for scientific computing in Python" description = "Fundamental algorithms for scientific computing in Python"
optional = true optional = false
python-versions = ">=3.10" python-versions = ">=3.10"
files = [ files = [
{file = "scipy-1.14.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7e911933d54ead4d557c02402710c2396529540b81dd554fc1ba270eb7308484"}, {file = "scipy-1.14.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7e911933d54ead4d557c02402710c2396529540b81dd554fc1ba270eb7308484"},
@ -4291,41 +4310,35 @@ test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "importlib-metadata
name = "shapely" name = "shapely"
version = "2.0.5" version = "2.0.5"
description = "Manipulation and analysis of geometric objects" description = "Manipulation and analysis of geometric objects"
optional = true optional = false
python-versions = ">=3.7" python-versions = ">=3.7"
files = [ files = [
{file = "shapely-2.0.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:89d34787c44f77a7d37d55ae821f3a784fa33592b9d217a45053a93ade899375"}, {file = "shapely-2.0.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:89d34787c44f77a7d37d55ae821f3a784fa33592b9d217a45053a93ade899375"},
{file = "shapely-2.0.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:798090b426142df2c5258779c1d8d5734ec6942f778dab6c6c30cfe7f3bf64ff"}, {file = "shapely-2.0.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:798090b426142df2c5258779c1d8d5734ec6942f778dab6c6c30cfe7f3bf64ff"},
{file = "shapely-2.0.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45211276900c4790d6bfc6105cbf1030742da67594ea4161a9ce6812a6721e68"},
{file = "shapely-2.0.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e119444bc27ca33e786772b81760f2028d930ac55dafe9bc50ef538b794a8e1"}, {file = "shapely-2.0.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2e119444bc27ca33e786772b81760f2028d930ac55dafe9bc50ef538b794a8e1"},
{file = "shapely-2.0.5-cp310-cp310-win32.whl", hash = "sha256:9a4492a2b2ccbeaebf181e7310d2dfff4fdd505aef59d6cb0f217607cb042fb3"}, {file = "shapely-2.0.5-cp310-cp310-win32.whl", hash = "sha256:9a4492a2b2ccbeaebf181e7310d2dfff4fdd505aef59d6cb0f217607cb042fb3"},
{file = "shapely-2.0.5-cp310-cp310-win_amd64.whl", hash = "sha256:1e5cb5ee72f1bc7ace737c9ecd30dc174a5295fae412972d3879bac2e82c8fae"}, {file = "shapely-2.0.5-cp310-cp310-win_amd64.whl", hash = "sha256:1e5cb5ee72f1bc7ace737c9ecd30dc174a5295fae412972d3879bac2e82c8fae"},
{file = "shapely-2.0.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5bbfb048a74cf273db9091ff3155d373020852805a37dfc846ab71dde4be93ec"}, {file = "shapely-2.0.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5bbfb048a74cf273db9091ff3155d373020852805a37dfc846ab71dde4be93ec"},
{file = "shapely-2.0.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:93be600cbe2fbaa86c8eb70656369f2f7104cd231f0d6585c7d0aa555d6878b8"}, {file = "shapely-2.0.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:93be600cbe2fbaa86c8eb70656369f2f7104cd231f0d6585c7d0aa555d6878b8"},
{file = "shapely-2.0.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0f8e71bb9a46814019f6644c4e2560a09d44b80100e46e371578f35eaaa9da1c"},
{file = "shapely-2.0.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5251c28a29012e92de01d2e84f11637eb1d48184ee8f22e2df6c8c578d26760"}, {file = "shapely-2.0.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5251c28a29012e92de01d2e84f11637eb1d48184ee8f22e2df6c8c578d26760"},
{file = "shapely-2.0.5-cp311-cp311-win32.whl", hash = "sha256:35110e80070d664781ec7955c7de557456b25727a0257b354830abb759bf8311"}, {file = "shapely-2.0.5-cp311-cp311-win32.whl", hash = "sha256:35110e80070d664781ec7955c7de557456b25727a0257b354830abb759bf8311"},
{file = "shapely-2.0.5-cp311-cp311-win_amd64.whl", hash = "sha256:6c6b78c0007a34ce7144f98b7418800e0a6a5d9a762f2244b00ea560525290c9"}, {file = "shapely-2.0.5-cp311-cp311-win_amd64.whl", hash = "sha256:6c6b78c0007a34ce7144f98b7418800e0a6a5d9a762f2244b00ea560525290c9"},
{file = "shapely-2.0.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:03bd7b5fa5deb44795cc0a503999d10ae9d8a22df54ae8d4a4cd2e8a93466195"}, {file = "shapely-2.0.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:03bd7b5fa5deb44795cc0a503999d10ae9d8a22df54ae8d4a4cd2e8a93466195"},
{file = "shapely-2.0.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2ff9521991ed9e201c2e923da014e766c1aa04771bc93e6fe97c27dcf0d40ace"}, {file = "shapely-2.0.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2ff9521991ed9e201c2e923da014e766c1aa04771bc93e6fe97c27dcf0d40ace"},
{file = "shapely-2.0.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1b65365cfbf657604e50d15161ffcc68de5cdb22a601bbf7823540ab4918a98d"},
{file = "shapely-2.0.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21f64e647a025b61b19585d2247137b3a38a35314ea68c66aaf507a1c03ef6fe"}, {file = "shapely-2.0.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21f64e647a025b61b19585d2247137b3a38a35314ea68c66aaf507a1c03ef6fe"},
{file = "shapely-2.0.5-cp312-cp312-win32.whl", hash = "sha256:3ac7dc1350700c139c956b03d9c3df49a5b34aaf91d024d1510a09717ea39199"}, {file = "shapely-2.0.5-cp312-cp312-win32.whl", hash = "sha256:3ac7dc1350700c139c956b03d9c3df49a5b34aaf91d024d1510a09717ea39199"},
{file = "shapely-2.0.5-cp312-cp312-win_amd64.whl", hash = "sha256:30e8737983c9d954cd17feb49eb169f02f1da49e24e5171122cf2c2b62d65c95"}, {file = "shapely-2.0.5-cp312-cp312-win_amd64.whl", hash = "sha256:30e8737983c9d954cd17feb49eb169f02f1da49e24e5171122cf2c2b62d65c95"},
{file = "shapely-2.0.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:ff7731fea5face9ec08a861ed351734a79475631b7540ceb0b66fb9732a5f529"}, {file = "shapely-2.0.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:ff7731fea5face9ec08a861ed351734a79475631b7540ceb0b66fb9732a5f529"},
{file = "shapely-2.0.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ff9e520af0c5a578e174bca3c18713cd47a6c6a15b6cf1f50ac17dc8bb8db6a2"},
{file = "shapely-2.0.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49b299b91557b04acb75e9732645428470825061f871a2edc36b9417d66c1fc5"}, {file = "shapely-2.0.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49b299b91557b04acb75e9732645428470825061f871a2edc36b9417d66c1fc5"},
{file = "shapely-2.0.5-cp37-cp37m-win32.whl", hash = "sha256:b5870633f8e684bf6d1ae4df527ddcb6f3895f7b12bced5c13266ac04f47d231"}, {file = "shapely-2.0.5-cp37-cp37m-win32.whl", hash = "sha256:b5870633f8e684bf6d1ae4df527ddcb6f3895f7b12bced5c13266ac04f47d231"},
{file = "shapely-2.0.5-cp37-cp37m-win_amd64.whl", hash = "sha256:401cb794c5067598f50518e5a997e270cd7642c4992645479b915c503866abed"}, {file = "shapely-2.0.5-cp37-cp37m-win_amd64.whl", hash = "sha256:401cb794c5067598f50518e5a997e270cd7642c4992645479b915c503866abed"},
{file = "shapely-2.0.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:e91ee179af539100eb520281ba5394919067c6b51824e6ab132ad4b3b3e76dd0"}, {file = "shapely-2.0.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:e91ee179af539100eb520281ba5394919067c6b51824e6ab132ad4b3b3e76dd0"},
{file = "shapely-2.0.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:8af6f7260f809c0862741ad08b1b89cb60c130ae30efab62320bbf4ee9cc71fa"}, {file = "shapely-2.0.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:8af6f7260f809c0862741ad08b1b89cb60c130ae30efab62320bbf4ee9cc71fa"},
{file = "shapely-2.0.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f5456dd522800306ba3faef77c5ba847ec30a0bd73ab087a25e0acdd4db2514f"},
{file = "shapely-2.0.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b714a840402cde66fd7b663bb08cacb7211fa4412ea2a209688f671e0d0631fd"}, {file = "shapely-2.0.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b714a840402cde66fd7b663bb08cacb7211fa4412ea2a209688f671e0d0631fd"},
{file = "shapely-2.0.5-cp38-cp38-win32.whl", hash = "sha256:7e8cf5c252fac1ea51b3162be2ec3faddedc82c256a1160fc0e8ddbec81b06d2"}, {file = "shapely-2.0.5-cp38-cp38-win32.whl", hash = "sha256:7e8cf5c252fac1ea51b3162be2ec3faddedc82c256a1160fc0e8ddbec81b06d2"},
{file = "shapely-2.0.5-cp38-cp38-win_amd64.whl", hash = "sha256:4461509afdb15051e73ab178fae79974387f39c47ab635a7330d7fee02c68a3f"}, {file = "shapely-2.0.5-cp38-cp38-win_amd64.whl", hash = "sha256:4461509afdb15051e73ab178fae79974387f39c47ab635a7330d7fee02c68a3f"},
{file = "shapely-2.0.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7545a39c55cad1562be302d74c74586f79e07b592df8ada56b79a209731c0219"}, {file = "shapely-2.0.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:7545a39c55cad1562be302d74c74586f79e07b592df8ada56b79a209731c0219"},
{file = "shapely-2.0.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4c83a36f12ec8dee2066946d98d4d841ab6512a6ed7eb742e026a64854019b5f"}, {file = "shapely-2.0.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4c83a36f12ec8dee2066946d98d4d841ab6512a6ed7eb742e026a64854019b5f"},
{file = "shapely-2.0.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:89e640c2cd37378480caf2eeda9a51be64201f01f786d127e78eaeff091ec897"},
{file = "shapely-2.0.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:06efe39beafde3a18a21dde169d32f315c57da962826a6d7d22630025200c5e6"}, {file = "shapely-2.0.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:06efe39beafde3a18a21dde169d32f315c57da962826a6d7d22630025200c5e6"},
{file = "shapely-2.0.5-cp39-cp39-win32.whl", hash = "sha256:8203a8b2d44dcb366becbc8c3d553670320e4acf0616c39e218c9561dd738d92"}, {file = "shapely-2.0.5-cp39-cp39-win32.whl", hash = "sha256:8203a8b2d44dcb366becbc8c3d553670320e4acf0616c39e218c9561dd738d92"},
{file = "shapely-2.0.5-cp39-cp39-win_amd64.whl", hash = "sha256:7fed9dbfbcfec2682d9a047b9699db8dcc890dfca857ecba872c42185fc9e64e"}, {file = "shapely-2.0.5-cp39-cp39-win_amd64.whl", hash = "sha256:7fed9dbfbcfec2682d9a047b9699db8dcc890dfca857ecba872c42185fc9e64e"},
@ -4544,7 +4557,7 @@ files = [
name = "tifffile" name = "tifffile"
version = "2024.7.24" version = "2024.7.24"
description = "Read and write TIFF files" description = "Read and write TIFF files"
optional = true optional = false
python-versions = ">=3.9" python-versions = ">=3.9"
files = [ files = [
{file = "tifffile-2024.7.24-py3-none-any.whl", hash = "sha256:f5cce1a915c37bc44ae4a792e3b42c07a30a3fa88406f5c6060a3de076487ed1"}, {file = "tifffile-2024.7.24-py3-none-any.whl", hash = "sha256:f5cce1a915c37bc44ae4a792e3b42c07a30a3fa88406f5c6060a3de076487ed1"},
@ -5105,10 +5118,9 @@ doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linke
test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"] test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"]
[extras] [extras]
easyocr = ["easyocr"]
ocr = ["easyocr"] ocr = ["easyocr"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.10" python-versions = "^3.10"
content-hash = "1b8f8f79c26b79a1421f9c587eb8972a4434cc2ea8d7112b97ebb56ab7cda845" content-hash = "b2eabf8ecd0ce4a702875d0f785eac86d2cfe3c7d36c09b25d503ee31ea19bd4"

View File

@ -31,9 +31,10 @@ pypdfium2 = "^4.30.0"
pydantic-settings = "^2.3.0" pydantic-settings = "^2.3.0"
huggingface_hub = ">=0.23,<1" huggingface_hub = ">=0.23,<1"
requests = "^2.32.3" requests = "^2.32.3"
easyocr = { version = "^1.7", optional = true } easyocr = { version = "^1.7"}
docling-parse = "^0.2.0" docling-parse = "^0.2.0"
certifi = ">=2024.7.4" certifi = ">=2024.7.4"
rtree = "^1.3.0"
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]
black = {extras = ["jupyter"], version = "^24.4.2"} black = {extras = ["jupyter"], version = "^24.4.2"}
@ -50,7 +51,6 @@ flake8-pyproject = "^1.2.3"
pylint = "^2.17.5" pylint = "^2.17.5"
[tool.poetry.extras] [tool.poetry.extras]
easyocr = ["easyocr"]
ocr = ["easyocr"] ocr = ["easyocr"]
[build-system] [build-system]

View File

@ -0,0 +1,33 @@
from pathlib import Path
import pytest
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend, DoclingParsePageBackend
from docling.datamodel.base_models import BoundingBox
@pytest.fixture
def test_doc_path():
    """Provide the relative path of the sample PDF used by the backend tests."""
    sample_pdf = Path("./data/2206.01062.pdf")
    return sample_pdf
def test_get_text_from_rect(test_doc_path):
    """Text extracted from the title region of page 0 must equal the paper title."""
    expected_title = (
        "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis"
    )

    document = DoclingParseDocumentBackend(test_doc_path)
    first_page: DoclingParsePageBackend = document.load_page(0)

    # Rectangle enclosing the title text of the DocLayNet paper
    title_region = BoundingBox(l=102, t=77, r=511, b=124)
    extracted = first_page.get_text_in_rect(bbox=title_region)

    assert extracted.strip() == expected_title
def test_crop_page_image(test_doc_path):
    """Smoke test: rendering a cropped region of page 0 must not raise."""
    document = DoclingParseDocumentBackend(test_doc_path)
    first_page: DoclingParsePageBackend = document.load_page(0)

    # Region of "Figure 1" in the DocLayNet paper
    figure_region = BoundingBox(l=317, t=246, r=574, b=527)
    im = first_page.get_page_image(scale=2, cropbox=figure_region)
    # Call im.show() here to inspect the crop manually.
def test_num_pages(test_doc_path):
    """The backend must report the page count of the sample document."""
    doc_backend = DoclingParseDocumentBackend(test_doc_path)
    # The comparison has to be asserted: as a bare expression its result is
    # discarded and the test would pass for any page count.
    assert doc_backend.page_count() == 9