feat: Make Page.parsed_page the only source of truth for text cells, add OCR cells to it (#1745)

* Keep page.parsed_page.textline_cells and page.cells in sync, including OCR

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Make page.parsed_page the only source of truth for text cells

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Small fix

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Correctly compute PDF boxes from pymupdf

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Use different OCR engine order

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add type hints and fix mypy

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* One more test fix

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Remove with pypdfium2_lock from caller sites

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fix typing

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2025-06-13 19:01:55 +02:00 committed by GitHub
parent 0432a31b2f
commit 7d3302cb48
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
50 changed files with 339091 additions and 330047 deletions

View File

@ -7,12 +7,17 @@ from typing import List, Optional, Union
import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
from docling_core.types.doc.page import (
BoundingRectangle,
SegmentedPdfPage,
TextCell,
)
from docling_parse.pdf_parsers import pdf_parser_v1
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.backend.pypdfium2_backend import get_pdf_page_geometry
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
@ -36,6 +41,51 @@ class DoclingParsePageBackend(PdfPageBackend):
def is_valid(self) -> bool:
return self.valid
def _compute_text_cells(self) -> List[TextCell]:
"""Compute text cells from docling-parse data."""
cells: List[TextCell] = []
cell_counter = 0
if not self.valid:
return cells
page_size = self.get_size()
parser_width = self._dpage["width"]
parser_height = self._dpage["height"]
for i in range(len(self._dpage["cells"])):
rect = self._dpage["cells"][i]["box"]["device"]
x0, y0, x1, y1 = rect
if x1 < x0:
x0, x1 = x1, x0
if y1 < y0:
y0, y1 = y1, y0
text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
cells.append(
TextCell(
index=cell_counter,
text=text_piece,
orig=text_piece,
from_ocr=False,
rect=BoundingRectangle.from_bounding_box(
BoundingBox(
l=x0 * page_size.width / parser_width,
b=y0 * page_size.height / parser_height,
r=x1 * page_size.width / parser_width,
t=y1 * page_size.height / parser_height,
coord_origin=CoordOrigin.BOTTOMLEFT,
)
).to_top_left_origin(page_size.height),
)
)
cell_counter += 1
return cells
def get_text_in_rect(self, bbox: BoundingBox) -> str:
if not self.valid:
return ""
@ -70,75 +120,27 @@ class DoclingParsePageBackend(PdfPageBackend):
return text_piece
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
return None
if not self.valid:
return None
text_cells = self._compute_text_cells()
# Get the PDF page geometry from pypdfium2
dimension = get_pdf_page_geometry(self._ppage)
# Create SegmentedPdfPage
return SegmentedPdfPage(
dimension=dimension,
textline_cells=text_cells,
char_cells=[],
word_cells=[],
has_lines=len(text_cells) > 0,
has_words=False,
has_chars=False,
)
def get_text_cells(self) -> Iterable[TextCell]:
cells: List[TextCell] = []
cell_counter = 0
if not self.valid:
return cells
page_size = self.get_size()
parser_width = self._dpage["width"]
parser_height = self._dpage["height"]
for i in range(len(self._dpage["cells"])):
rect = self._dpage["cells"][i]["box"]["device"]
x0, y0, x1, y1 = rect
if x1 < x0:
x0, x1 = x1, x0
if y1 < y0:
y0, y1 = y1, y0
text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
cells.append(
TextCell(
index=cell_counter,
text=text_piece,
orig=text_piece,
from_ocr=False,
rect=BoundingRectangle.from_bounding_box(
BoundingBox(
# l=x0, b=y0, r=x1, t=y1,
l=x0 * page_size.width / parser_width,
b=y0 * page_size.height / parser_height,
r=x1 * page_size.width / parser_width,
t=y1 * page_size.height / parser_height,
coord_origin=CoordOrigin.BOTTOMLEFT,
)
).to_top_left_origin(page_size.height),
)
)
cell_counter += 1
def draw_clusters_and_cells():
image = (
self.get_page_image()
) # make new image to avoid drawing on the saved ones
draw = ImageDraw.Draw(image)
for c in cells:
x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
cell_color = (
random.randint(30, 140),
random.randint(30, 140),
random.randint(30, 140),
)
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
image.show()
# before merge:
# draw_clusters_and_cells()
# cells = merge_horizontal_cells(cells)
# after merge:
# draw_clusters_and_cells()
return cells
return self._compute_text_cells()
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
AREA_THRESHOLD = 0 # 32 * 32

View File

@ -7,12 +7,19 @@ from typing import TYPE_CHECKING, List, Optional, Union
import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
from docling_core.types.doc.page import (
BoundingRectangle,
PdfPageBoundaryType,
PdfPageGeometry,
SegmentedPdfPage,
TextCell,
)
from docling_parse.pdf_parsers import pdf_parser_v2
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.backend.pypdfium2_backend import get_pdf_page_geometry
from docling.datamodel.base_models import Size
from docling.utils.locks import pypdfium2_lock
@ -40,6 +47,55 @@ class DoclingParseV2PageBackend(PdfPageBackend):
def is_valid(self) -> bool:
return self.valid
def _compute_text_cells(self) -> List[TextCell]:
"""Compute text cells from docling-parse v2 data."""
cells: List[TextCell] = []
cell_counter = 0
if not self.valid:
return cells
page_size = self.get_size()
parser_width = self._dpage["sanitized"]["dimension"]["width"]
parser_height = self._dpage["sanitized"]["dimension"]["height"]
cells_data = self._dpage["sanitized"]["cells"]["data"]
cells_header = self._dpage["sanitized"]["cells"]["header"]
for i, cell_data in enumerate(cells_data):
x0 = cell_data[cells_header.index("x0")]
y0 = cell_data[cells_header.index("y0")]
x1 = cell_data[cells_header.index("x1")]
y1 = cell_data[cells_header.index("y1")]
if x1 < x0:
x0, x1 = x1, x0
if y1 < y0:
y0, y1 = y1, y0
text_piece = cell_data[cells_header.index("text")]
cells.append(
TextCell(
index=cell_counter,
text=text_piece,
orig=text_piece,
from_ocr=False,
rect=BoundingRectangle.from_bounding_box(
BoundingBox(
l=x0 * page_size.width / parser_width,
b=y0 * page_size.height / parser_height,
r=x1 * page_size.width / parser_width,
t=y1 * page_size.height / parser_height,
coord_origin=CoordOrigin.BOTTOMLEFT,
)
).to_top_left_origin(page_size.height),
)
)
cell_counter += 1
return cells
def get_text_in_rect(self, bbox: BoundingBox) -> str:
if not self.valid:
return ""
@ -81,73 +137,27 @@ class DoclingParseV2PageBackend(PdfPageBackend):
return text_piece
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
return None
if not self.valid:
return None
text_cells = self._compute_text_cells()
# Get the PDF page geometry from pypdfium2
dimension = get_pdf_page_geometry(self._ppage)
# Create SegmentedPdfPage
return SegmentedPdfPage(
dimension=dimension,
textline_cells=text_cells,
char_cells=[],
word_cells=[],
has_textlines=len(text_cells) > 0,
has_words=False,
has_chars=False,
)
def get_text_cells(self) -> Iterable[TextCell]:
cells: List[TextCell] = []
cell_counter = 0
if not self.valid:
return cells
page_size = self.get_size()
parser_width = self._dpage["sanitized"]["dimension"]["width"]
parser_height = self._dpage["sanitized"]["dimension"]["height"]
cells_data = self._dpage["sanitized"]["cells"]["data"]
cells_header = self._dpage["sanitized"]["cells"]["header"]
for i, cell_data in enumerate(cells_data):
x0 = cell_data[cells_header.index("x0")]
y0 = cell_data[cells_header.index("y0")]
x1 = cell_data[cells_header.index("x1")]
y1 = cell_data[cells_header.index("y1")]
if x1 < x0:
x0, x1 = x1, x0
if y1 < y0:
y0, y1 = y1, y0
text_piece = cell_data[cells_header.index("text")]
cells.append(
TextCell(
index=cell_counter,
text=text_piece,
orig=text_piece,
from_ocr=False,
rect=BoundingRectangle.from_bounding_box(
BoundingBox(
# l=x0, b=y0, r=x1, t=y1,
l=x0 * page_size.width / parser_width,
b=y0 * page_size.height / parser_height,
r=x1 * page_size.width / parser_width,
t=y1 * page_size.height / parser_height,
coord_origin=CoordOrigin.BOTTOMLEFT,
)
).to_top_left_origin(page_size.height),
)
)
cell_counter += 1
def draw_clusters_and_cells():
image = (
self.get_page_image()
) # make new image to avoid drawing on the saved ones
draw = ImageDraw.Draw(image)
for c in cells:
x0, y0, x1, y1 = c.bbox.as_tuple()
cell_color = (
random.randint(30, 140),
random.randint(30, 140),
random.randint(30, 140),
)
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
image.show()
# draw_clusters_and_cells()
return cells
return self._compute_text_cells()
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
AREA_THRESHOLD = 0 # 32 * 32

View File

@ -59,20 +59,6 @@ class DoclingParseV4PageBackend(PdfPageBackend):
return self._dpage
def get_text_cells(self) -> Iterable[TextCell]:
page_size = self.get_size()
[tc.to_top_left_origin(page_size.height) for tc in self._dpage.textline_cells]
# for cell in self._dpage.textline_cells:
# rect = cell.rect
#
# assert (
# rect.to_bounding_box().l <= rect.to_bounding_box().r
# ), f"left is > right on bounding box {rect.to_bounding_box()} of rect {rect}"
# assert (
# rect.to_bounding_box().t <= rect.to_bounding_box().b
# ), f"top is > bottom on bounding box {rect.to_bounding_box()} of rect {rect}"
return self._dpage.textline_cells
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
@ -171,12 +157,28 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
self, page_no: int, create_words: bool = True, create_textlines: bool = True
) -> DoclingParseV4PageBackend:
with pypdfium2_lock:
seg_page = self.dp_doc.get_page(
page_no + 1,
create_words=create_words,
create_textlines=create_textlines,
)
# In Docling, all TextCell instances are expected with top-left origin.
[
tc.to_top_left_origin(seg_page.dimension.height)
for tc in seg_page.textline_cells
]
[
tc.to_top_left_origin(seg_page.dimension.height)
for tc in seg_page.char_cells
]
[
tc.to_top_left_origin(seg_page.dimension.height)
for tc in seg_page.word_cells
]
return DoclingParseV4PageBackend(
self.dp_doc.get_page(
page_no + 1,
create_words=create_words,
create_textlines=create_textlines,
),
seg_page,
self._pdoc[page_no],
)

View File

@ -8,7 +8,13 @@ from typing import TYPE_CHECKING, List, Optional, Union
import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
from docling_core.types.doc.page import (
BoundingRectangle,
PdfPageBoundaryType,
PdfPageGeometry,
SegmentedPdfPage,
TextCell,
)
from PIL import Image, ImageDraw
from pypdfium2 import PdfTextPage
from pypdfium2._helpers.misc import PdfiumError
@ -16,6 +22,76 @@ from pypdfium2._helpers.misc import PdfiumError
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.utils.locks import pypdfium2_lock
def get_pdf_page_geometry(
ppage: pdfium.PdfPage,
angle: float = 0.0,
boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX,
) -> PdfPageGeometry:
"""
Create PdfPageGeometry from a pypdfium2 PdfPage object.
Args:
ppage: pypdfium2 PdfPage object
angle: Page rotation angle in degrees (default: 0.0)
boundary_type: The boundary type for the page (default: CROP_BOX)
Returns:
PdfPageGeometry with all the different bounding boxes properly set
"""
with pypdfium2_lock:
# Get the main bounding box (intersection of crop_box and media_box)
bbox_tuple = ppage.get_bbox()
bbox = BoundingBox.from_tuple(bbox_tuple, CoordOrigin.BOTTOMLEFT)
# Get all the different page boxes from pypdfium2
media_box_tuple = ppage.get_mediabox()
crop_box_tuple = ppage.get_cropbox()
art_box_tuple = ppage.get_artbox()
bleed_box_tuple = ppage.get_bleedbox()
trim_box_tuple = ppage.get_trimbox()
# Convert to BoundingBox objects using existing from_tuple method
# pypdfium2 returns (x0, y0, x1, y1) in PDF coordinate system (bottom-left origin)
# Use bbox as fallback when specific box types are not defined
media_bbox = (
BoundingBox.from_tuple(media_box_tuple, CoordOrigin.BOTTOMLEFT)
if media_box_tuple
else bbox
)
crop_bbox = (
BoundingBox.from_tuple(crop_box_tuple, CoordOrigin.BOTTOMLEFT)
if crop_box_tuple
else bbox
)
art_bbox = (
BoundingBox.from_tuple(art_box_tuple, CoordOrigin.BOTTOMLEFT)
if art_box_tuple
else bbox
)
bleed_bbox = (
BoundingBox.from_tuple(bleed_box_tuple, CoordOrigin.BOTTOMLEFT)
if bleed_box_tuple
else bbox
)
trim_bbox = (
BoundingBox.from_tuple(trim_box_tuple, CoordOrigin.BOTTOMLEFT)
if trim_box_tuple
else bbox
)
return PdfPageGeometry(
angle=angle,
rect=BoundingRectangle.from_bounding_box(bbox),
boundary_type=boundary_type,
art_bbox=art_bbox,
bleed_bbox=bleed_bbox,
crop_bbox=crop_bbox,
media_bbox=media_bbox,
trim_bbox=trim_bbox,
)
if TYPE_CHECKING:
from docling.datamodel.document import InputDocument
@ -41,38 +117,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
def is_valid(self) -> bool:
return self.valid
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
AREA_THRESHOLD = 0 # 32 * 32
page_size = self.get_size()
with pypdfium2_lock:
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
pos = obj.get_pos()
cropbox = BoundingBox.from_tuple(
pos, origin=CoordOrigin.BOTTOMLEFT
).to_top_left_origin(page_height=page_size.height)
if cropbox.area() > AREA_THRESHOLD:
cropbox = cropbox.scaled(scale=scale)
yield cropbox
def get_text_in_rect(self, bbox: BoundingBox) -> str:
with pypdfium2_lock:
if not self.text_page:
self.text_page = self._ppage.get_textpage()
if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
bbox = bbox.to_bottom_left_origin(self.get_size().height)
with pypdfium2_lock:
text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
return text_piece
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
return None
def get_text_cells(self) -> Iterable[TextCell]:
def _compute_text_cells(self) -> List[TextCell]:
"""Compute text cells from pypdfium."""
with pypdfium2_lock:
if not self.text_page:
self.text_page = self._ppage.get_textpage()
@ -203,30 +249,58 @@ class PyPdfiumPageBackend(PdfPageBackend):
return merged_cells
def draw_clusters_and_cells():
image = (
self.get_page_image()
) # make new image to avoid drawing on the saved ones
draw = ImageDraw.Draw(image)
for c in cells:
x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
cell_color = (
random.randint(30, 140),
random.randint(30, 140),
random.randint(30, 140),
)
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
image.show()
return merge_horizontal_cells(cells)
# before merge:
# draw_clusters_and_cells()
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
AREA_THRESHOLD = 0 # 32 * 32
page_size = self.get_size()
with pypdfium2_lock:
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
pos = obj.get_pos()
cropbox = BoundingBox.from_tuple(
pos, origin=CoordOrigin.BOTTOMLEFT
).to_top_left_origin(page_height=page_size.height)
cells = merge_horizontal_cells(cells)
if cropbox.area() > AREA_THRESHOLD:
cropbox = cropbox.scaled(scale=scale)
# after merge:
# draw_clusters_and_cells()
yield cropbox
return cells
def get_text_in_rect(self, bbox: BoundingBox) -> str:
with pypdfium2_lock:
if not self.text_page:
self.text_page = self._ppage.get_textpage()
if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
bbox = bbox.to_bottom_left_origin(self.get_size().height)
with pypdfium2_lock:
text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
return text_piece
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
if not self.valid:
return None
text_cells = self._compute_text_cells()
# Get the PDF page geometry from pypdfium2
dimension = get_pdf_page_geometry(self._ppage)
# Create SegmentedPdfPage
return SegmentedPdfPage(
dimension=dimension,
textline_cells=text_cells,
char_cells=[],
word_cells=[],
has_textlines=len(text_cells) > 0,
has_words=False,
has_chars=False,
)
def get_text_cells(self) -> Iterable[TextCell]:
return self._compute_text_cells()
def get_page_image(
self, scale: float = 1, cropbox: Optional[BoundingBox] = None

View File

@ -232,7 +232,6 @@ class Page(BaseModel):
page_no: int
# page_hash: Optional[str] = None
size: Optional[Size] = None
cells: List[TextCell] = []
parsed_page: Optional[SegmentedPdfPage] = None
predictions: PagePredictions = PagePredictions()
assembled: Optional[AssembledUnit] = None
@ -245,6 +244,14 @@ class Page(BaseModel):
float, Image
] = {} # Cache of images in different scales. By default it is cleared during assembling.
@property
def cells(self) -> List[TextCell]:
"""Return text cells as a read-only view of parsed_page.textline_cells."""
if self.parsed_page is not None:
return self.parsed_page.textline_cells
else:
return []
def get_image(
self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
) -> Optional[Image]:

View File

@ -292,7 +292,9 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
),
)
generate_parsed_pages: bool = False
generate_parsed_pages: Literal[True] = (
True # Always True since parsed_page is now mandatory
)
class PdfPipeline(str, Enum):

View File

@ -7,6 +7,7 @@ from typing import List, Optional, Type
import numpy as np
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import TextCell
from PIL import Image, ImageDraw
from rtree import index
from scipy.ndimage import binary_dilation, find_objects, label
@ -107,7 +108,9 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
return []
# Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
def _filter_ocr_cells(
self, ocr_cells: List[TextCell], programmatic_cells: List[TextCell]
) -> List[TextCell]:
# Create R-tree index for programmatic cells
p = index.Property()
p.dimension = 2
@ -130,19 +133,38 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
]
return filtered_ocr_cells
def post_process_cells(self, ocr_cells, programmatic_cells):
def post_process_cells(self, ocr_cells: List[TextCell], page: Page) -> None:
r"""
Post-process the ocr and programmatic cells and return the final list of of cells
Post-process the OCR cells and update the page object.
Updates parsed_page.textline_cells directly since page.cells is now read-only.
"""
if self.options.force_full_page_ocr:
# If a full page OCR is forced, use only the OCR cells
cells = ocr_cells
return cells
# Get existing cells from the read-only property
existing_cells = page.cells
## Remove OCR cells which overlap with programmatic cells.
filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, programmatic_cells)
programmatic_cells.extend(filtered_ocr_cells)
return programmatic_cells
# Combine existing and OCR cells with overlap filtering
final_cells = self._combine_cells(existing_cells, ocr_cells)
assert page.parsed_page is not None
# Update parsed_page.textline_cells directly
page.parsed_page.textline_cells = final_cells
page.parsed_page.has_lines = len(final_cells) > 0
def _combine_cells(
self, existing_cells: List[TextCell], ocr_cells: List[TextCell]
) -> List[TextCell]:
"""Combine existing and OCR cells with filtering and re-indexing."""
if self.options.force_full_page_ocr:
combined = ocr_cells
else:
filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, existing_cells)
combined = list(existing_cells) + filtered_ocr_cells
# Re-index in-place
for i, cell in enumerate(combined):
cell.index = i
return combined
def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
image = copy.deepcopy(page.image)

View File

@ -177,7 +177,7 @@ class EasyOcrModel(BaseOcrModel):
all_ocr_cells.extend(cells)
# Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
self.post_process_cells(all_ocr_cells, page)
# DEBUG code:
if settings.debug.visualize_ocr:

View File

@ -176,9 +176,9 @@ class LayoutModel(BasePageModel):
# Apply postprocessing
processed_clusters, processed_cells = LayoutPostprocessor(
page.cells, clusters, page.size
page, clusters
).postprocess()
# processed_clusters, processed_cells = clusters, page.cells
# Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
with warnings.catch_warnings():
warnings.filterwarnings(
@ -198,7 +198,6 @@ class LayoutModel(BasePageModel):
)
)
page.cells = processed_cells
page.predictions.layout = LayoutPrediction(
clusters=processed_clusters
)

View File

@ -132,7 +132,7 @@ class OcrMacModel(BaseOcrModel):
all_ocr_cells.extend(cells)
# Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
self.post_process_cells(all_ocr_cells, page)
# DEBUG code:
if settings.debug.visualize_ocr:

View File

@ -2,7 +2,7 @@ import re
import warnings
from collections.abc import Iterable
from pathlib import Path
from typing import Optional
from typing import Literal, Optional
import numpy as np
from PIL import ImageDraw
@ -17,7 +17,6 @@ from docling.utils.profiling import TimeRecorder
class PagePreprocessingOptions(BaseModel):
images_scale: Optional[float]
create_parsed_page: bool
class PagePreprocessingModel(BasePageModel):
@ -66,10 +65,8 @@ class PagePreprocessingModel(BasePageModel):
def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
assert page._backend is not None
page.cells = list(page._backend.get_text_cells())
if self.options.create_parsed_page:
page.parsed_page = page._backend.get_segmented_page()
page.parsed_page = page._backend.get_segmented_page()
assert page.parsed_page is not None
# Rate the text quality from the PDF parser, and aggregate on page
text_scores = []

View File

@ -134,7 +134,7 @@ class RapidOcrModel(BaseOcrModel):
all_ocr_cells.extend(cells)
# Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
self.post_process_cells(all_ocr_cells, page)
# DEBUG code:
if settings.debug.visualize_ocr:

View File

@ -306,7 +306,7 @@ class TesseractOcrCliModel(BaseOcrModel):
all_ocr_cells.append(cell)
# Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
self.post_process_cells(all_ocr_cells, page)
# DEBUG code:
if settings.debug.visualize_ocr:

View File

@ -235,7 +235,7 @@ class TesseractOcrModel(BaseOcrModel):
all_ocr_cells.extend(cells)
# Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
self.post_process_cells(all_ocr_cells, page)
# DEBUG code:
if settings.debug.visualize_ocr:

View File

@ -72,7 +72,6 @@ class StandardPdfPipeline(PaginatedPipeline):
PagePreprocessingModel(
options=PagePreprocessingOptions(
images_scale=pipeline_options.images_scale,
create_parsed_page=pipeline_options.generate_parsed_pages,
)
),
# OCR

View File

@ -8,7 +8,7 @@ from docling_core.types.doc import DocItemLabel, Size
from docling_core.types.doc.page import TextCell
from rtree import index
from docling.datamodel.base_models import BoundingBox, Cluster
from docling.datamodel.base_models import BoundingBox, Cluster, Page
_log = logging.getLogger(__name__)
@ -194,11 +194,11 @@ class LayoutPostprocessor:
DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
}
def __init__(self, cells: List[TextCell], clusters: List[Cluster], page_size: Size):
"""Initialize processor with cells and clusters."""
"""Initialize processor with cells and spatial indices."""
self.cells = cells
self.page_size = page_size
def __init__(self, page: Page, clusters: List[Cluster]) -> None:
"""Initialize processor with page and clusters."""
self.cells = page.cells
self.page = page
self.page_size = page.size
self.all_clusters = clusters
self.regular_clusters = [
c for c in clusters if c.label not in self.SPECIAL_TYPES
@ -240,6 +240,10 @@ class LayoutPostprocessor:
for child in cluster.children:
child.cells = self._sort_cells(child.cells)
assert self.page.parsed_page is not None
self.page.parsed_page.textline_cells = self.cells
self.page.parsed_page.has_lines = len(self.cells) > 0
return final_clusters, self.cells
def _process_regular_clusters(self) -> List[Cluster]:
@ -301,6 +305,7 @@ class LayoutPostprocessor:
special_clusters = self._handle_cross_type_overlaps(special_clusters)
# Calculate page area from known page size
assert self.page_size is not None
page_area = self.page_size.width * self.page_size.height
if page_area > 0:
# Filter out full-page pictures

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -5,84 +5,159 @@
"width": 2000.0,
"height": 2829.0
},
"cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 246.4065456254215,
"r_y0": 329.06770715202435,
"r_x1": 1691.991797818404,
"r_y1": 329.06770715202435,
"r_x2": 1691.991797818404,
"r_y2": 258.9040166758338,
"r_x3": 246.4065456254215,
"r_y3": 258.9040166758338,
"coord_origin": "TOPLEFT"
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 2000.0,
"r_y1": 0.0,
"r_x2": 2000.0,
"r_y2": 2829.0,
"r_x3": 0.0,
"r_y3": 2829.0,
"coord_origin": "BOTTOMLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 2829.0,
"r": 2000.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 2829.0,
"r": 2000.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 2829.0,
"r": 2000.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 2829.0,
"r": 2000.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 2829.0,
"r": 2000.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
"bitmap_resources": [
{
"index": 0,
"rect": {
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 2000.0,
"r_y1": 0.0,
"r_x2": 2000.0,
"r_y2": 2829.0,
"r_x3": 0.0,
"r_y3": 2829.0,
"coord_origin": "BOTTOMLEFT"
},
"uri": null
}
],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 246.4065456254215,
"r_y0": 329.06770715202435,
"r_x1": 1691.991797818404,
"r_y1": 329.06770715202435,
"r_x2": 1691.991797818404,
"r_y2": 258.9040166758338,
"r_x3": 246.4065456254215,
"r_y3": 258.9040166758338,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"rect": {
"r_x0": 234.08627147881114,
"r_y0": 419.5788697734327,
"r_x1": 1696.0985042090742,
"r_y1": 419.5788697734327,
"r_x2": 1696.0985042090742,
"r_y2": 349.4151792972422,
"r_x3": 234.08627147881114,
"r_y3": 349.4151792972422,
"coord_origin": "TOPLEFT"
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 234.08627147881114,
"r_y0": 419.5788697734327,
"r_x1": 1696.0985042090742,
"r_y1": 419.5788697734327,
"r_x2": 1696.0985042090742,
"r_y2": 349.4151792972422,
"r_x3": 234.08627147881114,
"r_y3": 349.4151792972422,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 242.29979922858777,
"r_y0": 509.8779072023336,
"r_x1": 513.3470125989277,
"r_y1": 509.8779072023336,
"r_x2": 513.3470125989277,
"r_y2": 439.9752910477536,
"r_x3": 242.29979922858777,
"r_y3": 439.9752910477536,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"parsed_page": null,
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 242.29979922858777,
"r_y0": 509.8779072023336,
"r_x1": 513.3470125989277,
"r_y1": 509.8779072023336,
"r_x2": 513.3470125989277,
"r_y2": 439.9752910477536,
"r_x3": 242.29979922858777,
"r_y3": 439.9752910477536,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -5,84 +5,143 @@
"width": 595.201171875,
"height": 841.9216918945312
},
"cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 73.34702132031646,
"r_y0": 97.99999977896755,
"r_x1": 503.64955224479564,
"r_y1": 97.99999977896755,
"r_x2": 503.64955224479564,
"r_y2": 76.99999977896756,
"r_x3": 73.34702132031646,
"r_y3": 76.99999977896756,
"coord_origin": "TOPLEFT"
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.201171875,
"r_y1": 0.0,
"r_x2": 595.201171875,
"r_y2": 841.9216918945312,
"r_x3": 0.0,
"r_y3": 841.9216918945312,
"coord_origin": "BOTTOMLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 73.34702132031646,
"r_y0": 97.99999977896755,
"r_x1": 503.64955224479564,
"r_y1": 97.99999977896755,
"r_x2": 503.64955224479564,
"r_y2": 76.99999977896756,
"r_x3": 73.34702132031646,
"r_y3": 76.99999977896756,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"rect": {
"r_x0": 69.6796630536824,
"r_y0": 124.83139494707741,
"r_x1": 504.8720051760782,
"r_y1": 124.83139494707741,
"r_x2": 504.8720051760782,
"r_y2": 104.00000011573796,
"r_x3": 69.6796630536824,
"r_y3": 104.00000011573796,
"coord_origin": "TOPLEFT"
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 69.6796630536824,
"r_y0": 124.83139494707741,
"r_x1": 504.8720051760782,
"r_y1": 124.83139494707741,
"r_x2": 504.8720051760782,
"r_y2": 104.00000011573796,
"r_x3": 69.6796630536824,
"r_y3": 104.00000011573796,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 71.84193505100733,
"r_y0": 152.90926970226084,
"r_x1": 153.088934155825,
"r_y1": 152.90926970226084,
"r_x2": 153.088934155825,
"r_y2": 129.797125232046,
"r_x3": 71.84193505100733,
"r_y3": 129.797125232046,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"parsed_page": null,
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 71.84193505100733,
"r_y0": 152.90926970226084,
"r_x1": 153.088934155825,
"r_y1": 152.90926970226084,
"r_x2": 153.088934155825,
"r_y2": 129.797125232046,
"r_x3": 71.84193505100733,
"r_y3": 129.797125232046,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -5,84 +5,143 @@
"width": 595.201171875,
"height": 841.9216918945312
},
"cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 89.2388782764286,
"r_y0": 764.898293373551,
"r_x1": 521.9863147998661,
"r_y1": 764.898293373551,
"r_x2": 521.9863147998661,
"r_y2": 744.0929853494625,
"r_x3": 89.2388782764286,
"r_y3": 744.0929853494625,
"coord_origin": "TOPLEFT"
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.201171875,
"r_y1": 0.0,
"r_x2": 595.201171875,
"r_y2": 841.9216918945312,
"r_x3": 0.0,
"r_y3": 841.9216918945312,
"coord_origin": "BOTTOMLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 89.2388782764286,
"r_y0": 764.898293373551,
"r_x1": 521.9863147998661,
"r_y1": 764.898293373551,
"r_x2": 521.9863147998661,
"r_y2": 744.0929853494625,
"r_x3": 89.2388782764286,
"r_y3": 744.0929853494625,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"rect": {
"r_x0": 89.23887497045128,
"r_y0": 739.1977118987292,
"r_x1": 523.208764293368,
"r_y1": 739.1977118987292,
"r_x2": 523.208764293368,
"r_y2": 717.1685676116198,
"r_x3": 89.23887497045128,
"r_y3": 717.1685676116198,
"coord_origin": "TOPLEFT"
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 89.23887497045128,
"r_y0": 739.1977118987292,
"r_x1": 523.208764293368,
"r_y1": 739.1977118987292,
"r_x2": 523.208764293368,
"r_y2": 717.1685676116198,
"r_x3": 89.23887497045128,
"r_y3": 717.1685676116198,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 441.2561096985719,
"r_y0": 710.0268078458798,
"r_x1": 522.0347860494834,
"r_y1": 710.0268078458798,
"r_x2": 522.0347860494834,
"r_y2": 690.0429592741025,
"r_x3": 441.2561096985719,
"r_y3": 690.0429592741025,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"parsed_page": null,
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 441.2561096985719,
"r_y0": 710.0268078458798,
"r_x1": 522.0347860494834,
"r_y1": 710.0268078458798,
"r_x2": 522.0347860494834,
"r_y2": 690.0429592741025,
"r_x3": 441.2561096985719,
"r_y3": 690.0429592741025,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -5,84 +5,143 @@
"width": 841.9216918945312,
"height": 595.201171875
},
"cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 744.0930045534915,
"r_y0": 504.87200373583954,
"r_x1": 764.8982839673505,
"r_y1": 504.87200373583954,
"r_x2": 764.8982839673505,
"r_y2": 73.34702001188118,
"r_x3": 744.0930045534915,
"r_y3": 73.34702001188118,
"coord_origin": "TOPLEFT"
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.201171875,
"r_y1": 0.0,
"r_x2": 595.201171875,
"r_y2": 841.9216918945312,
"r_x3": 0.0,
"r_y3": 841.9216918945312,
"coord_origin": "BOTTOMLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 744.0930045534915,
"r_y0": 504.87200373583954,
"r_x1": 764.8982839673505,
"r_y1": 504.87200373583954,
"r_x2": 764.8982839673505,
"r_y2": 73.34702001188118,
"r_x3": 744.0930045534915,
"r_y3": 73.34702001188118,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"rect": {
"r_x0": 717.168585936602,
"r_y0": 504.8720061466397,
"r_x1": 737.9738558137178,
"r_y1": 504.8720061466397,
"r_x2": 737.9738558137178,
"r_y2": 70.90211682372312,
"r_x3": 717.168585936602,
"r_y3": 70.90211682372312,
"coord_origin": "TOPLEFT"
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 717.168585936602,
"r_y0": 504.8720061466397,
"r_x1": 737.9738558137178,
"r_y1": 504.8720061466397,
"r_x2": 737.9738558137178,
"r_y2": 70.90211682372312,
"r_x3": 717.168585936602,
"r_y3": 70.90211682372312,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 690.2441821046808,
"r_y0": 152.80629773131633,
"r_x1": 709.8255852011977,
"r_y1": 152.80629773131633,
"r_x2": 709.8255852011977,
"r_y2": 72.124570639845,
"r_x3": 690.2441821046808,
"r_y3": 72.124570639845,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"parsed_page": null,
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 690.2441821046808,
"r_y0": 152.80629773131633,
"r_x1": 709.8255852011977,
"r_y1": 152.80629773131633,
"r_x2": 709.8255852011977,
"r_y2": 72.124570639845,
"r_x3": 690.2441821046808,
"r_y3": 72.124570639845,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -5,84 +5,143 @@
"width": 841.9216918945312,
"height": 595.201171875
},
"cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 77.10171545548258,
"r_y0": 520.7638571913312,
"r_x1": 96.68315797053792,
"r_y1": 520.7638571913312,
"r_x2": 96.68315797053792,
"r_y2": 89.2388734673729,
"r_x3": 77.10171545548258,
"r_y3": 89.2388734673729,
"coord_origin": "TOPLEFT"
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.201171875,
"r_y1": 0.0,
"r_x2": 595.201171875,
"r_y2": 841.9216918945312,
"r_x3": 0.0,
"r_y3": 841.9216918945312,
"coord_origin": "BOTTOMLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 77.10171545548258,
"r_y0": 520.7638571913312,
"r_x1": 96.68315797053792,
"r_y1": 520.7638571913312,
"r_x2": 96.68315797053792,
"r_y2": 89.2388734673729,
"r_x3": 77.10171545548258,
"r_y3": 89.2388734673729,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"rect": {
"r_x0": 100.64168123325977,
"r_y0": 523.3236155182395,
"r_x1": 126.08064862014129,
"r_y1": 523.3236155182395,
"r_x2": 126.08064862014129,
"r_y2": 89.1266754140729,
"r_x3": 100.64168123325977,
"r_y3": 89.1266754140729,
"coord_origin": "TOPLEFT"
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 100.64168123325977,
"r_y0": 523.3236155182395,
"r_x1": 126.08064862014129,
"r_y1": 523.3236155182395,
"r_x2": 126.08064862014129,
"r_y2": 89.1266754140729,
"r_x3": 100.64168123325977,
"r_y3": 89.1266754140729,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 131.21306574279092,
"r_y0": 521.0762158417759,
"r_x1": 152.19606490864376,
"r_y1": 521.0762158417759,
"r_x2": 152.19606490864376,
"r_y2": 441.0071698212682,
"r_x3": 131.21306574279092,
"r_y3": 441.0071698212682,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"parsed_page": null,
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 131.21306574279092,
"r_y0": 521.0762158417759,
"r_x1": 152.19606490864376,
"r_y1": 521.0762158417759,
"r_x2": 152.19606490864376,
"r_y2": 441.0071698212682,
"r_x3": 131.21306574279092,
"r_y3": 441.0071698212682,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -5,84 +5,143 @@
"width": 595.201171875,
"height": 841.9216918945312
},
"cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 73.34702132031646,
"r_y0": 97.99999977896755,
"r_x1": 503.64955224479564,
"r_y1": 97.99999977896755,
"r_x2": 503.64955224479564,
"r_y2": 76.99999977896756,
"r_x3": 73.34702132031646,
"r_y3": 76.99999977896756,
"coord_origin": "TOPLEFT"
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.201171875,
"r_y1": 0.0,
"r_x2": 595.201171875,
"r_y2": 841.9216918945312,
"r_x3": 0.0,
"r_y3": 841.9216918945312,
"coord_origin": "BOTTOMLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 73.34702132031646,
"r_y0": 97.99999977896755,
"r_x1": 503.64955224479564,
"r_y1": 97.99999977896755,
"r_x2": 503.64955224479564,
"r_y2": 76.99999977896756,
"r_x3": 73.34702132031646,
"r_y3": 76.99999977896756,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"rect": {
"r_x0": 69.6796630536824,
"r_y0": 124.83139494707741,
"r_x1": 504.8720051760782,
"r_y1": 124.83139494707741,
"r_x2": 504.8720051760782,
"r_y2": 104.00000011573796,
"r_x3": 69.6796630536824,
"r_y3": 104.00000011573796,
"coord_origin": "TOPLEFT"
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 69.6796630536824,
"r_y0": 124.83139494707741,
"r_x1": 504.8720051760782,
"r_y1": 124.83139494707741,
"r_x2": 504.8720051760782,
"r_y2": 104.00000011573796,
"r_x3": 69.6796630536824,
"r_y3": 104.00000011573796,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 71.84193505100733,
"r_y0": 152.90926970226084,
"r_x1": 153.088934155825,
"r_y1": 152.90926970226084,
"r_x2": 153.088934155825,
"r_y2": 129.797125232046,
"r_x3": 71.84193505100733,
"r_y3": 129.797125232046,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"parsed_page": null,
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 71.84193505100733,
"r_y0": 152.90926970226084,
"r_x1": 153.088934155825,
"r_y1": 152.90926970226084,
"r_x2": 153.088934155825,
"r_y2": 129.797125232046,
"r_x3": 71.84193505100733,
"r_y3": 129.797125232046,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -5,84 +5,143 @@
"width": 595.201171875,
"height": 841.9216918945312
},
"cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 89.2388782764286,
"r_y0": 764.898293373551,
"r_x1": 521.9863147998661,
"r_y1": 764.898293373551,
"r_x2": 521.9863147998661,
"r_y2": 744.0929853494625,
"r_x3": 89.2388782764286,
"r_y3": 744.0929853494625,
"coord_origin": "TOPLEFT"
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.201171875,
"r_y1": 0.0,
"r_x2": 595.201171875,
"r_y2": 841.9216918945312,
"r_x3": 0.0,
"r_y3": 841.9216918945312,
"coord_origin": "BOTTOMLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 89.2388782764286,
"r_y0": 764.898293373551,
"r_x1": 521.9863147998661,
"r_y1": 764.898293373551,
"r_x2": 521.9863147998661,
"r_y2": 744.0929853494625,
"r_x3": 89.2388782764286,
"r_y3": 744.0929853494625,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"rect": {
"r_x0": 89.23887497045128,
"r_y0": 739.1977118987292,
"r_x1": 523.208764293368,
"r_y1": 739.1977118987292,
"r_x2": 523.208764293368,
"r_y2": 717.1685676116198,
"r_x3": 89.23887497045128,
"r_y3": 717.1685676116198,
"coord_origin": "TOPLEFT"
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 89.23887497045128,
"r_y0": 739.1977118987292,
"r_x1": 523.208764293368,
"r_y1": 739.1977118987292,
"r_x2": 523.208764293368,
"r_y2": 717.1685676116198,
"r_x3": 89.23887497045128,
"r_y3": 717.1685676116198,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 441.2561096985719,
"r_y0": 710.0268078458798,
"r_x1": 522.0347860494834,
"r_y1": 710.0268078458798,
"r_x2": 522.0347860494834,
"r_y2": 690.0429592741025,
"r_x3": 441.2561096985719,
"r_y3": 690.0429592741025,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"parsed_page": null,
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 441.2561096985719,
"r_y0": 710.0268078458798,
"r_x1": 522.0347860494834,
"r_y1": 710.0268078458798,
"r_x2": 522.0347860494834,
"r_y2": 690.0429592741025,
"r_x3": 441.2561096985719,
"r_y3": 690.0429592741025,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -5,84 +5,143 @@
"width": 841.9216918945312,
"height": 595.201171875
},
"cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 744.0930045534915,
"r_y0": 504.87200373583954,
"r_x1": 764.8982839673505,
"r_y1": 504.87200373583954,
"r_x2": 764.8982839673505,
"r_y2": 73.34702001188118,
"r_x3": 744.0930045534915,
"r_y3": 73.34702001188118,
"coord_origin": "TOPLEFT"
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.201171875,
"r_y1": 0.0,
"r_x2": 595.201171875,
"r_y2": 841.9216918945312,
"r_x3": 0.0,
"r_y3": 841.9216918945312,
"coord_origin": "BOTTOMLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 744.0930045534915,
"r_y0": 504.87200373583954,
"r_x1": 764.8982839673505,
"r_y1": 504.87200373583954,
"r_x2": 764.8982839673505,
"r_y2": 73.34702001188118,
"r_x3": 744.0930045534915,
"r_y3": 73.34702001188118,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"rect": {
"r_x0": 717.168585936602,
"r_y0": 504.8720061466397,
"r_x1": 737.9738558137178,
"r_y1": 504.8720061466397,
"r_x2": 737.9738558137178,
"r_y2": 70.90211682372312,
"r_x3": 717.168585936602,
"r_y3": 70.90211682372312,
"coord_origin": "TOPLEFT"
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 717.168585936602,
"r_y0": 504.8720061466397,
"r_x1": 737.9738558137178,
"r_y1": 504.8720061466397,
"r_x2": 737.9738558137178,
"r_y2": 70.90211682372312,
"r_x3": 717.168585936602,
"r_y3": 70.90211682372312,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 690.2441821046808,
"r_y0": 152.80629773131633,
"r_x1": 709.8255852011977,
"r_y1": 152.80629773131633,
"r_x2": 709.8255852011977,
"r_y2": 72.124570639845,
"r_x3": 690.2441821046808,
"r_y3": 72.124570639845,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"parsed_page": null,
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 690.2441821046808,
"r_y0": 152.80629773131633,
"r_x1": 709.8255852011977,
"r_y1": 152.80629773131633,
"r_x2": 709.8255852011977,
"r_y2": 72.124570639845,
"r_x3": 690.2441821046808,
"r_y3": 72.124570639845,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -5,84 +5,143 @@
"width": 841.9216918945312,
"height": 595.201171875
},
"cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"parsed_page": {
"dimension": {
"angle": 0.0,
"rect": {
"r_x0": 77.10171545548258,
"r_y0": 520.7638571913312,
"r_x1": 96.68315797053792,
"r_y1": 520.7638571913312,
"r_x2": 96.68315797053792,
"r_y2": 89.2388734673729,
"r_x3": 77.10171545548258,
"r_y3": 89.2388734673729,
"coord_origin": "TOPLEFT"
"r_x0": 0.0,
"r_y0": 0.0,
"r_x1": 595.201171875,
"r_y1": 0.0,
"r_x2": 595.201171875,
"r_y2": 841.9216918945312,
"r_x3": 0.0,
"r_y3": 841.9216918945312,
"coord_origin": "BOTTOMLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
"boundary_type": "crop_box",
"art_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
},
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
"bitmap_resources": [],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 77.10171545548258,
"r_y0": 520.7638571913312,
"r_x1": 96.68315797053792,
"r_y1": 520.7638571913312,
"r_x2": 96.68315797053792,
"r_y2": 89.2388734673729,
"r_x3": 77.10171545548258,
"r_y3": 89.2388734673729,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"rect": {
"r_x0": 100.64168123325977,
"r_y0": 523.3236155182395,
"r_x1": 126.08064862014129,
"r_y1": 523.3236155182395,
"r_x2": 126.08064862014129,
"r_y2": 89.1266754140729,
"r_x3": 100.64168123325977,
"r_y3": 89.1266754140729,
"coord_origin": "TOPLEFT"
{
"index": 1,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 100.64168123325977,
"r_y0": 523.3236155182395,
"r_x1": 126.08064862014129,
"r_y1": 523.3236155182395,
"r_x2": 126.08064862014129,
"r_y2": 89.1266754140729,
"r_x3": 100.64168123325977,
"r_y3": 89.1266754140729,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
},
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 131.21306574279092,
"r_y0": 521.0762158417759,
"r_x1": 152.19606490864376,
"r_y1": 521.0762158417759,
"r_x2": 152.19606490864376,
"r_y2": 441.0071698212682,
"r_x3": 131.21306574279092,
"r_y3": 441.0071698212682,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"parsed_page": null,
{
"index": 2,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 131.21306574279092,
"r_y0": 521.0762158417759,
"r_x1": 152.19606490864376,
"r_y1": 521.0762158417759,
"r_x2": 152.19606490864376,
"r_y2": 441.0071698212682,
"r_x3": 131.21306574279092,
"r_y3": 441.0071698212682,
"coord_origin": "TOPLEFT"
},
"text": "package",
"orig": "package",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}
],
"has_chars": false,
"has_words": false,
"has_lines": true,
"image": null,
"lines": []
},
"predictions": {
"layout": {
"clusters": [

View File

@ -57,14 +57,14 @@ def test_e2e_conversions():
pdf_paths = get_pdf_paths()
engines: List[Tuple[OcrOptions, bool]] = [
(EasyOcrOptions(), False),
(TesseractOcrOptions(), True),
(TesseractCliOcrOptions(), True),
(EasyOcrOptions(force_full_page_ocr=True), False),
(EasyOcrOptions(), False),
(TesseractOcrOptions(force_full_page_ocr=True), True),
(TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
(TesseractCliOcrOptions(force_full_page_ocr=True), True),
(TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
(EasyOcrOptions(force_full_page_ocr=True), False),
]
# rapidocr is only available for Python >=3.6,<3.13