feat: introducing docling_backend (#26)
Uses our own docling_parse to reliably get PDF cells To get page images, this backend uses pypdfium2 Signed-off-by: Maxim Lysak <mly@zurich.ibm.com> Co-authored-by: Maxim Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
62ba4aaf31
commit
b8f5e38a8c
171
docling/backend/docling_parse_backend.py
Normal file
171
docling/backend/docling_parse_backend.py
Normal file
@ -0,0 +1,171 @@
|
||||
import random
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List, Optional, Union
|
||||
|
||||
import pypdfium2 as pdfium
|
||||
from docling_parse.docling_parse import pdf_parser
|
||||
from PIL import Image, ImageDraw
|
||||
from pypdfium2 import PdfPage
|
||||
|
||||
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docling.datamodel.base_models import BoundingBox, Cell, CoordOrigin, PageSize
|
||||
|
||||
|
||||
class DoclingParsePageBackend(PdfPageBackend):
|
||||
def __init__(self, page_obj: PdfPage, docling_page_obj):
|
||||
super().__init__(page_obj)
|
||||
self._ppage = page_obj
|
||||
self._dpage = docling_page_obj
|
||||
self.text_page = None
|
||||
|
||||
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
||||
# Find intersecting cells on the page
|
||||
text_piece = ""
|
||||
page_size = self.get_size()
|
||||
parser_width = self._dpage["width"]
|
||||
parser_height = self._dpage["height"]
|
||||
|
||||
scale = (
|
||||
1 # FIX - Replace with param in get_text_in_rect across backends (optional)
|
||||
)
|
||||
|
||||
for i in range(len(self._dpage["cells"])):
|
||||
rect = self._dpage["cells"][i]["box"]["device"]
|
||||
x0, y0, x1, y1 = rect
|
||||
cell_bbox = BoundingBox(
|
||||
l=x0 * scale * page_size.width / parser_width,
|
||||
b=y0 * scale * page_size.height / parser_height,
|
||||
r=x1 * scale * page_size.width / parser_width,
|
||||
t=y1 * scale * page_size.height / parser_height,
|
||||
coord_origin=CoordOrigin.BOTTOMLEFT,
|
||||
).to_top_left_origin(page_size.height * scale)
|
||||
|
||||
overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
|
||||
|
||||
if overlap_frac > 0.5:
|
||||
if len(text_piece) > 0:
|
||||
text_piece += " "
|
||||
text_piece += self._dpage["cells"][i]["content"]["rnormalized"]
|
||||
|
||||
return text_piece
|
||||
|
||||
def get_text_cells(self) -> Iterable[Cell]:
|
||||
cells = []
|
||||
cell_counter = 0
|
||||
|
||||
page_size = self.get_size()
|
||||
|
||||
parser_width = self._dpage["width"]
|
||||
parser_height = self._dpage["height"]
|
||||
|
||||
for i in range(len(self._dpage["cells"])):
|
||||
rect = self._dpage["cells"][i]["box"]["device"]
|
||||
x0, y0, x1, y1 = rect
|
||||
text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
|
||||
cells.append(
|
||||
Cell(
|
||||
id=cell_counter,
|
||||
text=text_piece,
|
||||
bbox=BoundingBox(
|
||||
# l=x0, b=y0, r=x1, t=y1,
|
||||
l=x0 * page_size.width / parser_width,
|
||||
b=y0 * page_size.height / parser_height,
|
||||
r=x1 * page_size.width / parser_width,
|
||||
t=y1 * page_size.height / parser_height,
|
||||
coord_origin=CoordOrigin.BOTTOMLEFT,
|
||||
).to_top_left_origin(page_size.height),
|
||||
)
|
||||
)
|
||||
cell_counter += 1
|
||||
|
||||
def draw_clusters_and_cells():
|
||||
image = self.get_page_image()
|
||||
draw = ImageDraw.Draw(image)
|
||||
for c in cells:
|
||||
x0, y0, x1, y1 = c.bbox.as_tuple()
|
||||
cell_color = (
|
||||
random.randint(30, 140),
|
||||
random.randint(30, 140),
|
||||
random.randint(30, 140),
|
||||
)
|
||||
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
||||
image.show()
|
||||
|
||||
# before merge:
|
||||
# draw_clusters_and_cells()
|
||||
|
||||
# cells = merge_horizontal_cells(cells)
|
||||
|
||||
# after merge:
|
||||
# draw_clusters_and_cells()
|
||||
|
||||
return cells
|
||||
|
||||
def get_page_image(
|
||||
self, scale: int = 1, cropbox: Optional[BoundingBox] = None
|
||||
) -> Image.Image:
|
||||
|
||||
page_size = self.get_size()
|
||||
|
||||
if not cropbox:
|
||||
cropbox = BoundingBox(
|
||||
l=0,
|
||||
r=page_size.width,
|
||||
t=0,
|
||||
b=page_size.height,
|
||||
coord_origin=CoordOrigin.TOPLEFT,
|
||||
)
|
||||
padbox = BoundingBox(
|
||||
l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT
|
||||
)
|
||||
else:
|
||||
padbox = cropbox.to_bottom_left_origin(page_size.height)
|
||||
padbox.r = page_size.width - padbox.r
|
||||
padbox.t = page_size.height - padbox.t
|
||||
|
||||
image = (
|
||||
self._ppage.render(
|
||||
scale=scale * 1.5,
|
||||
rotation=0, # no additional rotation
|
||||
crop=padbox.as_tuple(),
|
||||
)
|
||||
.to_pil()
|
||||
.resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
|
||||
) # We resize the image from 1.5x the given scale to make it sharper.
|
||||
|
||||
return image
|
||||
|
||||
def get_size(self) -> PageSize:
|
||||
return PageSize(width=self._ppage.get_width(), height=self._ppage.get_height())
|
||||
|
||||
def unload(self):
|
||||
self._ppage = None
|
||||
self._dpage = None
|
||||
self.text_page = None
|
||||
|
||||
|
||||
class DoclingParseDocumentBackend(PdfDocumentBackend):
|
||||
def __init__(self, path_or_stream: Iterable[Union[BytesIO, Path]]):
|
||||
super().__init__(path_or_stream)
|
||||
self._pdoc = pdfium.PdfDocument(path_or_stream)
|
||||
# Parsing cells with docling_parser call
|
||||
print("PARSING WITH DOCLING PARSE")
|
||||
parser = pdf_parser()
|
||||
self._parser_doc = parser.find_cells(str(path_or_stream))
|
||||
|
||||
def page_count(self) -> int:
|
||||
return len(self._parser_doc["pages"])
|
||||
|
||||
def load_page(self, page_no: int) -> PdfPage:
|
||||
return DoclingParsePageBackend(
|
||||
self._pdoc[page_no], self._parser_doc["pages"][page_no]
|
||||
)
|
||||
|
||||
def is_valid(self) -> bool:
|
||||
return self.page_count() > 0
|
||||
|
||||
def unload(self):
|
||||
self._pdoc.close()
|
||||
self._pdoc = None
|
||||
self._parser_doc = None
|
@ -4,7 +4,8 @@ import time
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
# from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
|
||||
from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
|
||||
from docling.document_converter import DocumentConverter
|
||||
@ -54,11 +55,12 @@ def main():
|
||||
artifacts_path = DocumentConverter.download_models_hf()
|
||||
|
||||
pipeline_options = PipelineOptions(do_table_structure=True)
|
||||
# use text cells predicted from table structure model, instead of matching with pdf cells
|
||||
pipeline_options.table_structure_options.do_cell_matching = False
|
||||
pipeline_options.table_structure_options.do_cell_matching = True
|
||||
|
||||
doc_converter = DocumentConverter(
|
||||
artifacts_path=artifacts_path, pipeline_options=pipeline_options
|
||||
artifacts_path=artifacts_path,
|
||||
pipeline_options=pipeline_options,
|
||||
pdf_backend=DoclingParseDocumentBackend,
|
||||
)
|
||||
|
||||
input = DocumentConversionInput.from_paths(input_doc_paths)
|
||||
|
27
poetry.lock
generated
27
poetry.lock
generated
@ -759,6 +759,30 @@ torch = "2.2.2"
|
||||
torchvision = "0.17.2"
|
||||
tqdm = ">=4.64.0,<5.0.0"
|
||||
|
||||
[[package]]
|
||||
name = "docling-parse"
|
||||
version = "0.0.1"
|
||||
description = "Simple package to extract text with coordinates from programmatic PDFs"
|
||||
optional = false
|
||||
python-versions = "<4.0,>=3.9"
|
||||
files = [
|
||||
{file = "docling_parse-0.0.1-cp310-cp310-macosx_13_6_arm64.whl", hash = "sha256:d6301dde11157f94b6436bb87186b4723cce7b1e59e0f74b0a7333339d6f911d"},
|
||||
{file = "docling_parse-0.0.1-cp310-cp310-macosx_13_6_x86_64.whl", hash = "sha256:ac5fb3b6ac568159930103521f2e7002b78c37f6555f23d767b2e247ddbce740"},
|
||||
{file = "docling_parse-0.0.1-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:ec9066ad9e7f11a18aa230f67b733d64433185be1da8e887ac273c9683e02938"},
|
||||
{file = "docling_parse-0.0.1-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:3e5d560ac3527a9bda5bf01905ec6a5fb9eb889a5bec2c3c909cf9c75642e2d3"},
|
||||
{file = "docling_parse-0.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d56de1a5b45b19117d4fe1f444878501796ec5f17de880c06c1ce3184ac360e7"},
|
||||
{file = "docling_parse-0.0.1-cp311-cp311-macosx_13_6_arm64.whl", hash = "sha256:110a08f4663ee18833b2b89013993c2326b519a7fe21a64940d9f2789f52be29"},
|
||||
{file = "docling_parse-0.0.1-cp311-cp311-macosx_13_6_x86_64.whl", hash = "sha256:19cf275ce78d2ebb7c3e577b5126f1f2af6fd28557b63c42d1455f1cc87be454"},
|
||||
{file = "docling_parse-0.0.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:1fdd07ac20951935e3f74b1ec1f503c4493440664aaa8e30ab7fa6334c2a4937"},
|
||||
{file = "docling_parse-0.0.1-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:d8018263eba239c702f79149ed16ec4e749bdec5396aea9e78b9cdfbae1b86bd"},
|
||||
{file = "docling_parse-0.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:299281bfc14ca95cc1db677f48f152105be0f96beab171313004cdb7ce448df4"},
|
||||
{file = "docling_parse-0.0.1-cp312-cp312-macosx_13_6_arm64.whl", hash = "sha256:b05d40d6570212ca1e3b98fb55ce1c861d28484db2bde513b6c5e8b3339f4021"},
|
||||
{file = "docling_parse-0.0.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:11bdddc8f767bdd14b317bcb25d7fc46b656f867f137a5d8fe6d0f95d61d2ce9"},
|
||||
{file = "docling_parse-0.0.1-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:14a52b46c887c00b0a1da0f5ea4e6652ab9e23deeac43f6d98b239a6cba7fbf1"},
|
||||
{file = "docling_parse-0.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:17caa551f7432555823f01a4882e869068198a8b27eec1449afc6c821b594330"},
|
||||
{file = "docling_parse-0.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:27aac51dd7753fac57466fa5de55e0ff0294367cf62a539941e72cfff8fb7e87"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "docutils"
|
||||
version = "0.21.2"
|
||||
@ -2510,7 +2534,6 @@ description = "Nvidia JIT LTO Library"
|
||||
optional = false
|
||||
python-versions = ">=3"
|
||||
files = [
|
||||
{file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_aarch64.whl", hash = "sha256:98103729cc5226e13ca319a10bbf9433bbbd44ef64fe72f45f067cacc14b8d27"},
|
||||
{file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f9b37bc5c8cf7509665cb6ada5aaa0ce65618f2332b7d3e78e9790511f111212"},
|
||||
{file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-win_amd64.whl", hash = "sha256:e782564d705ff0bf61ac3e1bf730166da66dd2fe9012f111ede5fc49b64ae697"},
|
||||
]
|
||||
@ -4882,4 +4905,4 @@ ocr = ["easyocr"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.10"
|
||||
content-hash = "a6685d5cf1b283d805e10193a437662a1807f99dad40b56ab1e58e1b708fc184"
|
||||
content-hash = "9dfea6fabd2b8be0183a671c1540446cadc1da45a5460e636c71ae5b24abee0d"
|
||||
|
@ -32,6 +32,7 @@ pydantic-settings = "^2.3.0"
|
||||
huggingface_hub = ">=0.23,<1"
|
||||
requests = "^2.32.3"
|
||||
easyocr = { version = "^1.7", optional = true }
|
||||
docling-parse = "^0.0.1"
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
black = {extras = ["jupyter"], version = "^24.4.2"}
|
||||
|
Loading…
Reference in New Issue
Block a user