feat: Make Page.parsed_page the only source of truth for text cells, add OCR cells to it (#1745)

* Keep page.parsed_page.textline_cells and page.cells in sync, including OCR

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Make page.parsed_page the only source of truth for text cells

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Small fix

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Correctly compute PDF boxes from pymupdf

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Use different OCR engine order

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Add type hints and fix mypy

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* One more test fix

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Remove with pypdfium2_lock from caller sites

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Fix typing

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2025-06-13 19:01:55 +02:00 committed by GitHub
parent 0432a31b2f
commit 7d3302cb48
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
50 changed files with 339091 additions and 330047 deletions

View File

@ -7,12 +7,17 @@ from typing import List, Optional, Union
import pypdfium2 as pdfium import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin, Size from docling_core.types.doc import BoundingBox, CoordOrigin, Size
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell from docling_core.types.doc.page import (
BoundingRectangle,
SegmentedPdfPage,
TextCell,
)
from docling_parse.pdf_parsers import pdf_parser_v1 from docling_parse.pdf_parsers import pdf_parser_v1
from PIL import Image, ImageDraw from PIL import Image, ImageDraw
from pypdfium2 import PdfPage from pypdfium2 import PdfPage
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.backend.pypdfium2_backend import get_pdf_page_geometry
from docling.datamodel.document import InputDocument from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
@ -36,6 +41,51 @@ class DoclingParsePageBackend(PdfPageBackend):
def is_valid(self) -> bool: def is_valid(self) -> bool:
return self.valid return self.valid
def _compute_text_cells(self) -> List[TextCell]:
"""Compute text cells from docling-parse data."""
cells: List[TextCell] = []
cell_counter = 0
if not self.valid:
return cells
page_size = self.get_size()
parser_width = self._dpage["width"]
parser_height = self._dpage["height"]
for i in range(len(self._dpage["cells"])):
rect = self._dpage["cells"][i]["box"]["device"]
x0, y0, x1, y1 = rect
if x1 < x0:
x0, x1 = x1, x0
if y1 < y0:
y0, y1 = y1, y0
text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
cells.append(
TextCell(
index=cell_counter,
text=text_piece,
orig=text_piece,
from_ocr=False,
rect=BoundingRectangle.from_bounding_box(
BoundingBox(
l=x0 * page_size.width / parser_width,
b=y0 * page_size.height / parser_height,
r=x1 * page_size.width / parser_width,
t=y1 * page_size.height / parser_height,
coord_origin=CoordOrigin.BOTTOMLEFT,
)
).to_top_left_origin(page_size.height),
)
)
cell_counter += 1
return cells
def get_text_in_rect(self, bbox: BoundingBox) -> str: def get_text_in_rect(self, bbox: BoundingBox) -> str:
if not self.valid: if not self.valid:
return "" return ""
@ -70,75 +120,27 @@ class DoclingParsePageBackend(PdfPageBackend):
return text_piece return text_piece
def get_segmented_page(self) -> Optional[SegmentedPdfPage]: def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
return None if not self.valid:
return None
text_cells = self._compute_text_cells()
# Get the PDF page geometry from pypdfium2
dimension = get_pdf_page_geometry(self._ppage)
# Create SegmentedPdfPage
return SegmentedPdfPage(
dimension=dimension,
textline_cells=text_cells,
char_cells=[],
word_cells=[],
has_lines=len(text_cells) > 0,
has_words=False,
has_chars=False,
)
def get_text_cells(self) -> Iterable[TextCell]: def get_text_cells(self) -> Iterable[TextCell]:
cells: List[TextCell] = [] return self._compute_text_cells()
cell_counter = 0
if not self.valid:
return cells
page_size = self.get_size()
parser_width = self._dpage["width"]
parser_height = self._dpage["height"]
for i in range(len(self._dpage["cells"])):
rect = self._dpage["cells"][i]["box"]["device"]
x0, y0, x1, y1 = rect
if x1 < x0:
x0, x1 = x1, x0
if y1 < y0:
y0, y1 = y1, y0
text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
cells.append(
TextCell(
index=cell_counter,
text=text_piece,
orig=text_piece,
from_ocr=False,
rect=BoundingRectangle.from_bounding_box(
BoundingBox(
# l=x0, b=y0, r=x1, t=y1,
l=x0 * page_size.width / parser_width,
b=y0 * page_size.height / parser_height,
r=x1 * page_size.width / parser_width,
t=y1 * page_size.height / parser_height,
coord_origin=CoordOrigin.BOTTOMLEFT,
)
).to_top_left_origin(page_size.height),
)
)
cell_counter += 1
def draw_clusters_and_cells():
image = (
self.get_page_image()
) # make new image to avoid drawing on the saved ones
draw = ImageDraw.Draw(image)
for c in cells:
x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
cell_color = (
random.randint(30, 140),
random.randint(30, 140),
random.randint(30, 140),
)
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
image.show()
# before merge:
# draw_clusters_and_cells()
# cells = merge_horizontal_cells(cells)
# after merge:
# draw_clusters_and_cells()
return cells
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]: def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
AREA_THRESHOLD = 0 # 32 * 32 AREA_THRESHOLD = 0 # 32 * 32

View File

@ -7,12 +7,19 @@ from typing import TYPE_CHECKING, List, Optional, Union
import pypdfium2 as pdfium import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell from docling_core.types.doc.page import (
BoundingRectangle,
PdfPageBoundaryType,
PdfPageGeometry,
SegmentedPdfPage,
TextCell,
)
from docling_parse.pdf_parsers import pdf_parser_v2 from docling_parse.pdf_parsers import pdf_parser_v2
from PIL import Image, ImageDraw from PIL import Image, ImageDraw
from pypdfium2 import PdfPage from pypdfium2 import PdfPage
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.backend.pypdfium2_backend import get_pdf_page_geometry
from docling.datamodel.base_models import Size from docling.datamodel.base_models import Size
from docling.utils.locks import pypdfium2_lock from docling.utils.locks import pypdfium2_lock
@ -40,6 +47,55 @@ class DoclingParseV2PageBackend(PdfPageBackend):
def is_valid(self) -> bool: def is_valid(self) -> bool:
return self.valid return self.valid
def _compute_text_cells(self) -> List[TextCell]:
"""Compute text cells from docling-parse v2 data."""
cells: List[TextCell] = []
cell_counter = 0
if not self.valid:
return cells
page_size = self.get_size()
parser_width = self._dpage["sanitized"]["dimension"]["width"]
parser_height = self._dpage["sanitized"]["dimension"]["height"]
cells_data = self._dpage["sanitized"]["cells"]["data"]
cells_header = self._dpage["sanitized"]["cells"]["header"]
for i, cell_data in enumerate(cells_data):
x0 = cell_data[cells_header.index("x0")]
y0 = cell_data[cells_header.index("y0")]
x1 = cell_data[cells_header.index("x1")]
y1 = cell_data[cells_header.index("y1")]
if x1 < x0:
x0, x1 = x1, x0
if y1 < y0:
y0, y1 = y1, y0
text_piece = cell_data[cells_header.index("text")]
cells.append(
TextCell(
index=cell_counter,
text=text_piece,
orig=text_piece,
from_ocr=False,
rect=BoundingRectangle.from_bounding_box(
BoundingBox(
l=x0 * page_size.width / parser_width,
b=y0 * page_size.height / parser_height,
r=x1 * page_size.width / parser_width,
t=y1 * page_size.height / parser_height,
coord_origin=CoordOrigin.BOTTOMLEFT,
)
).to_top_left_origin(page_size.height),
)
)
cell_counter += 1
return cells
def get_text_in_rect(self, bbox: BoundingBox) -> str: def get_text_in_rect(self, bbox: BoundingBox) -> str:
if not self.valid: if not self.valid:
return "" return ""
@ -81,73 +137,27 @@ class DoclingParseV2PageBackend(PdfPageBackend):
return text_piece return text_piece
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
    """Return the page as a ``SegmentedPdfPage`` carrying its text-line cells.

    Returns:
        ``None`` when the page backend is not valid. Otherwise a segmented
        page whose geometry is read from the pypdfium2 page and whose
        text-line cells come from ``_compute_text_cells()``; character and
        word cells are left empty.
    """
    if not self.valid:
        return None

    text_cells = self._compute_text_cells()

    # Get the PDF page geometry from pypdfium2
    dimension = get_pdf_page_geometry(self._ppage)

    return SegmentedPdfPage(
        dimension=dimension,
        textline_cells=text_cells,
        char_cells=[],
        word_cells=[],
        # The text-line flag is ``has_lines`` (not ``has_textlines``): it is the
        # attribute the OCR and layout post-processing steps assign when they
        # replace ``textline_cells``.
        has_lines=len(text_cells) > 0,
        has_words=False,
        has_chars=False,
    )
def get_text_cells(self) -> Iterable[TextCell]: def get_text_cells(self) -> Iterable[TextCell]:
cells: List[TextCell] = [] return self._compute_text_cells()
cell_counter = 0
if not self.valid:
return cells
page_size = self.get_size()
parser_width = self._dpage["sanitized"]["dimension"]["width"]
parser_height = self._dpage["sanitized"]["dimension"]["height"]
cells_data = self._dpage["sanitized"]["cells"]["data"]
cells_header = self._dpage["sanitized"]["cells"]["header"]
for i, cell_data in enumerate(cells_data):
x0 = cell_data[cells_header.index("x0")]
y0 = cell_data[cells_header.index("y0")]
x1 = cell_data[cells_header.index("x1")]
y1 = cell_data[cells_header.index("y1")]
if x1 < x0:
x0, x1 = x1, x0
if y1 < y0:
y0, y1 = y1, y0
text_piece = cell_data[cells_header.index("text")]
cells.append(
TextCell(
index=cell_counter,
text=text_piece,
orig=text_piece,
from_ocr=False,
rect=BoundingRectangle.from_bounding_box(
BoundingBox(
# l=x0, b=y0, r=x1, t=y1,
l=x0 * page_size.width / parser_width,
b=y0 * page_size.height / parser_height,
r=x1 * page_size.width / parser_width,
t=y1 * page_size.height / parser_height,
coord_origin=CoordOrigin.BOTTOMLEFT,
)
).to_top_left_origin(page_size.height),
)
)
cell_counter += 1
def draw_clusters_and_cells():
image = (
self.get_page_image()
) # make new image to avoid drawing on the saved ones
draw = ImageDraw.Draw(image)
for c in cells:
x0, y0, x1, y1 = c.bbox.as_tuple()
cell_color = (
random.randint(30, 140),
random.randint(30, 140),
random.randint(30, 140),
)
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
image.show()
# draw_clusters_and_cells()
return cells
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]: def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
AREA_THRESHOLD = 0 # 32 * 32 AREA_THRESHOLD = 0 # 32 * 32

View File

@ -59,20 +59,6 @@ class DoclingParseV4PageBackend(PdfPageBackend):
return self._dpage return self._dpage
def get_text_cells(self) -> Iterable[TextCell]: def get_text_cells(self) -> Iterable[TextCell]:
page_size = self.get_size()
[tc.to_top_left_origin(page_size.height) for tc in self._dpage.textline_cells]
# for cell in self._dpage.textline_cells:
# rect = cell.rect
#
# assert (
# rect.to_bounding_box().l <= rect.to_bounding_box().r
# ), f"left is > right on bounding box {rect.to_bounding_box()} of rect {rect}"
# assert (
# rect.to_bounding_box().t <= rect.to_bounding_box().b
# ), f"top is > bottom on bounding box {rect.to_bounding_box()} of rect {rect}"
return self._dpage.textline_cells return self._dpage.textline_cells
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]: def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
@ -171,12 +157,28 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
self, page_no: int, create_words: bool = True, create_textlines: bool = True self, page_no: int, create_words: bool = True, create_textlines: bool = True
) -> DoclingParseV4PageBackend: ) -> DoclingParseV4PageBackend:
with pypdfium2_lock: with pypdfium2_lock:
seg_page = self.dp_doc.get_page(
page_no + 1,
create_words=create_words,
create_textlines=create_textlines,
)
# In Docling, all TextCell instances are expected with top-left origin.
[
tc.to_top_left_origin(seg_page.dimension.height)
for tc in seg_page.textline_cells
]
[
tc.to_top_left_origin(seg_page.dimension.height)
for tc in seg_page.char_cells
]
[
tc.to_top_left_origin(seg_page.dimension.height)
for tc in seg_page.word_cells
]
return DoclingParseV4PageBackend( return DoclingParseV4PageBackend(
self.dp_doc.get_page( seg_page,
page_no + 1,
create_words=create_words,
create_textlines=create_textlines,
),
self._pdoc[page_no], self._pdoc[page_no],
) )

View File

@ -8,7 +8,13 @@ from typing import TYPE_CHECKING, List, Optional, Union
import pypdfium2 as pdfium import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c import pypdfium2.raw as pdfium_c
from docling_core.types.doc import BoundingBox, CoordOrigin, Size from docling_core.types.doc import BoundingBox, CoordOrigin, Size
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell from docling_core.types.doc.page import (
BoundingRectangle,
PdfPageBoundaryType,
PdfPageGeometry,
SegmentedPdfPage,
TextCell,
)
from PIL import Image, ImageDraw from PIL import Image, ImageDraw
from pypdfium2 import PdfTextPage from pypdfium2 import PdfTextPage
from pypdfium2._helpers.misc import PdfiumError from pypdfium2._helpers.misc import PdfiumError
@ -16,6 +22,76 @@ from pypdfium2._helpers.misc import PdfiumError
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.utils.locks import pypdfium2_lock from docling.utils.locks import pypdfium2_lock
def get_pdf_page_geometry(
    ppage: pdfium.PdfPage,
    angle: float = 0.0,
    boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX,
) -> PdfPageGeometry:
    """Describe a pypdfium2 page as a ``PdfPageGeometry``.

    Args:
        ppage: The pypdfium2 page to read the boxes from.
        angle: Page rotation angle in degrees, stored as given.
        boundary_type: Boundary type recorded on the geometry.

    Returns:
        A geometry whose ``rect`` is built from ``ppage.get_bbox()`` and whose
        media, crop, art, bleed and trim boxes come from the matching
        pypdfium2 getters. Any box the page does not provide falls back to the
        ``get_bbox()`` box.
    """
    with pypdfium2_lock:
        # Main bounding box (intersection of crop box and media box).
        page_bbox = BoundingBox.from_tuple(ppage.get_bbox(), CoordOrigin.BOTTOMLEFT)

        raw_media = ppage.get_mediabox()
        raw_crop = ppage.get_cropbox()
        raw_art = ppage.get_artbox()
        raw_bleed = ppage.get_bleedbox()
        raw_trim = ppage.get_trimbox()

    def _box_or_page_bbox(raw_box):
        # pypdfium2 reports boxes as (x0, y0, x1, y1) with a bottom-left origin;
        # a missing box is replaced by the page's main bounding box.
        if raw_box:
            return BoundingBox.from_tuple(raw_box, CoordOrigin.BOTTOMLEFT)
        return page_bbox

    return PdfPageGeometry(
        angle=angle,
        rect=BoundingRectangle.from_bounding_box(page_bbox),
        boundary_type=boundary_type,
        art_bbox=_box_or_page_bbox(raw_art),
        bleed_bbox=_box_or_page_bbox(raw_bleed),
        crop_bbox=_box_or_page_bbox(raw_crop),
        media_bbox=_box_or_page_bbox(raw_media),
        trim_bbox=_box_or_page_bbox(raw_trim),
    )
if TYPE_CHECKING: if TYPE_CHECKING:
from docling.datamodel.document import InputDocument from docling.datamodel.document import InputDocument
@ -41,38 +117,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
def is_valid(self) -> bool: def is_valid(self) -> bool:
return self.valid return self.valid
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]: def _compute_text_cells(self) -> List[TextCell]:
AREA_THRESHOLD = 0 # 32 * 32 """Compute text cells from pypdfium."""
page_size = self.get_size()
with pypdfium2_lock:
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
pos = obj.get_pos()
cropbox = BoundingBox.from_tuple(
pos, origin=CoordOrigin.BOTTOMLEFT
).to_top_left_origin(page_height=page_size.height)
if cropbox.area() > AREA_THRESHOLD:
cropbox = cropbox.scaled(scale=scale)
yield cropbox
def get_text_in_rect(self, bbox: BoundingBox) -> str:
with pypdfium2_lock:
if not self.text_page:
self.text_page = self._ppage.get_textpage()
if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
bbox = bbox.to_bottom_left_origin(self.get_size().height)
with pypdfium2_lock:
text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
return text_piece
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
return None
def get_text_cells(self) -> Iterable[TextCell]:
with pypdfium2_lock: with pypdfium2_lock:
if not self.text_page: if not self.text_page:
self.text_page = self._ppage.get_textpage() self.text_page = self._ppage.get_textpage()
@ -203,30 +249,58 @@ class PyPdfiumPageBackend(PdfPageBackend):
return merged_cells return merged_cells
def draw_clusters_and_cells(): return merge_horizontal_cells(cells)
image = (
self.get_page_image()
) # make new image to avoid drawing on the saved ones
draw = ImageDraw.Draw(image)
for c in cells:
x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
cell_color = (
random.randint(30, 140),
random.randint(30, 140),
random.randint(30, 140),
)
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
image.show()
# before merge: def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
# draw_clusters_and_cells() AREA_THRESHOLD = 0 # 32 * 32
page_size = self.get_size()
with pypdfium2_lock:
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
pos = obj.get_pos()
cropbox = BoundingBox.from_tuple(
pos, origin=CoordOrigin.BOTTOMLEFT
).to_top_left_origin(page_height=page_size.height)
cells = merge_horizontal_cells(cells) if cropbox.area() > AREA_THRESHOLD:
cropbox = cropbox.scaled(scale=scale)
# after merge: yield cropbox
# draw_clusters_and_cells()
return cells def get_text_in_rect(self, bbox: BoundingBox) -> str:
with pypdfium2_lock:
if not self.text_page:
self.text_page = self._ppage.get_textpage()
if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
bbox = bbox.to_bottom_left_origin(self.get_size().height)
with pypdfium2_lock:
text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
return text_piece
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
    """Return the page as a ``SegmentedPdfPage`` carrying its text-line cells.

    Returns:
        ``None`` when the page backend is not valid. Otherwise a segmented
        page whose geometry is read from the pypdfium2 page and whose
        text-line cells come from ``_compute_text_cells()``; character and
        word cells are left empty.
    """
    if not self.valid:
        return None

    text_cells = self._compute_text_cells()

    # Get the PDF page geometry from pypdfium2
    dimension = get_pdf_page_geometry(self._ppage)

    return SegmentedPdfPage(
        dimension=dimension,
        textline_cells=text_cells,
        char_cells=[],
        word_cells=[],
        # The text-line flag is ``has_lines`` (not ``has_textlines``): it is the
        # attribute the OCR and layout post-processing steps assign when they
        # replace ``textline_cells``.
        has_lines=len(text_cells) > 0,
        has_words=False,
        has_chars=False,
    )
def get_text_cells(self) -> Iterable[TextCell]:
    """Return the page's text cells, as produced by ``_compute_text_cells()``."""
    return self._compute_text_cells()
def get_page_image( def get_page_image(
self, scale: float = 1, cropbox: Optional[BoundingBox] = None self, scale: float = 1, cropbox: Optional[BoundingBox] = None

View File

@ -232,7 +232,6 @@ class Page(BaseModel):
page_no: int page_no: int
# page_hash: Optional[str] = None # page_hash: Optional[str] = None
size: Optional[Size] = None size: Optional[Size] = None
cells: List[TextCell] = []
parsed_page: Optional[SegmentedPdfPage] = None parsed_page: Optional[SegmentedPdfPage] = None
predictions: PagePredictions = PagePredictions() predictions: PagePredictions = PagePredictions()
assembled: Optional[AssembledUnit] = None assembled: Optional[AssembledUnit] = None
@ -245,6 +244,14 @@ class Page(BaseModel):
float, Image float, Image
] = {} # Cache of images in different scales. By default it is cleared during assembling. ] = {} # Cache of images in different scales. By default it is cleared during assembling.
@property
def cells(self) -> List[TextCell]:
    """Expose the text-line cells held by ``parsed_page``.

    Returns:
        ``parsed_page.textline_cells`` itself (the same list object, not a
        copy) when a parsed page is set; otherwise a new empty list.
    """
    parsed = self.parsed_page
    return [] if parsed is None else parsed.textline_cells
def get_image( def get_image(
self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
) -> Optional[Image]: ) -> Optional[Image]:

View File

@ -292,7 +292,9 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
), ),
) )
generate_parsed_pages: bool = False generate_parsed_pages: Literal[True] = (
True # Always True since parsed_page is now mandatory
)
class PdfPipeline(str, Enum): class PdfPipeline(str, Enum):

View File

@ -7,6 +7,7 @@ from typing import List, Optional, Type
import numpy as np import numpy as np
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import TextCell
from PIL import Image, ImageDraw from PIL import Image, ImageDraw
from rtree import index from rtree import index
from scipy.ndimage import binary_dilation, find_objects, label from scipy.ndimage import binary_dilation, find_objects, label
@ -107,7 +108,9 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
return [] return []
# Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell. # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
def _filter_ocr_cells(self, ocr_cells, programmatic_cells): def _filter_ocr_cells(
self, ocr_cells: List[TextCell], programmatic_cells: List[TextCell]
) -> List[TextCell]:
# Create R-tree index for programmatic cells # Create R-tree index for programmatic cells
p = index.Property() p = index.Property()
p.dimension = 2 p.dimension = 2
@ -130,19 +133,38 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
] ]
return filtered_ocr_cells return filtered_ocr_cells
def post_process_cells(self, ocr_cells, programmatic_cells): def post_process_cells(self, ocr_cells: List[TextCell], page: Page) -> None:
r""" r"""
Post-process the ocr and programmatic cells and return the final list of of cells Post-process the OCR cells and update the page object.
Updates parsed_page.textline_cells directly since page.cells is now read-only.
""" """
if self.options.force_full_page_ocr: # Get existing cells from the read-only property
# If a full page OCR is forced, use only the OCR cells existing_cells = page.cells
cells = ocr_cells
return cells
## Remove OCR cells which overlap with programmatic cells. # Combine existing and OCR cells with overlap filtering
filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, programmatic_cells) final_cells = self._combine_cells(existing_cells, ocr_cells)
programmatic_cells.extend(filtered_ocr_cells)
return programmatic_cells assert page.parsed_page is not None
# Update parsed_page.textline_cells directly
page.parsed_page.textline_cells = final_cells
page.parsed_page.has_lines = len(final_cells) > 0
def _combine_cells(
self, existing_cells: List[TextCell], ocr_cells: List[TextCell]
) -> List[TextCell]:
"""Combine existing and OCR cells with filtering and re-indexing."""
if self.options.force_full_page_ocr:
combined = ocr_cells
else:
filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, existing_cells)
combined = list(existing_cells) + filtered_ocr_cells
# Re-index in-place
for i, cell in enumerate(combined):
cell.index = i
return combined
def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False): def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
image = copy.deepcopy(page.image) image = copy.deepcopy(page.image)

View File

@ -177,7 +177,7 @@ class EasyOcrModel(BaseOcrModel):
all_ocr_cells.extend(cells) all_ocr_cells.extend(cells)
# Post-process the cells # Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells) self.post_process_cells(all_ocr_cells, page)
# DEBUG code: # DEBUG code:
if settings.debug.visualize_ocr: if settings.debug.visualize_ocr:

View File

@ -176,9 +176,9 @@ class LayoutModel(BasePageModel):
# Apply postprocessing # Apply postprocessing
processed_clusters, processed_cells = LayoutPostprocessor( processed_clusters, processed_cells = LayoutPostprocessor(
page.cells, clusters, page.size page, clusters
).postprocess() ).postprocess()
# processed_clusters, processed_cells = clusters, page.cells # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
with warnings.catch_warnings(): with warnings.catch_warnings():
warnings.filterwarnings( warnings.filterwarnings(
@ -198,7 +198,6 @@ class LayoutModel(BasePageModel):
) )
) )
page.cells = processed_cells
page.predictions.layout = LayoutPrediction( page.predictions.layout = LayoutPrediction(
clusters=processed_clusters clusters=processed_clusters
) )

View File

@ -132,7 +132,7 @@ class OcrMacModel(BaseOcrModel):
all_ocr_cells.extend(cells) all_ocr_cells.extend(cells)
# Post-process the cells # Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells) self.post_process_cells(all_ocr_cells, page)
# DEBUG code: # DEBUG code:
if settings.debug.visualize_ocr: if settings.debug.visualize_ocr:

View File

@ -2,7 +2,7 @@ import re
import warnings import warnings
from collections.abc import Iterable from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Optional from typing import Literal, Optional
import numpy as np import numpy as np
from PIL import ImageDraw from PIL import ImageDraw
@ -17,7 +17,6 @@ from docling.utils.profiling import TimeRecorder
class PagePreprocessingOptions(BaseModel): class PagePreprocessingOptions(BaseModel):
images_scale: Optional[float] images_scale: Optional[float]
create_parsed_page: bool
class PagePreprocessingModel(BasePageModel): class PagePreprocessingModel(BasePageModel):
@ -66,10 +65,8 @@ class PagePreprocessingModel(BasePageModel):
def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page: def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
assert page._backend is not None assert page._backend is not None
page.cells = list(page._backend.get_text_cells()) page.parsed_page = page._backend.get_segmented_page()
assert page.parsed_page is not None
if self.options.create_parsed_page:
page.parsed_page = page._backend.get_segmented_page()
# Rate the text quality from the PDF parser, and aggregate on page # Rate the text quality from the PDF parser, and aggregate on page
text_scores = [] text_scores = []

View File

@ -134,7 +134,7 @@ class RapidOcrModel(BaseOcrModel):
all_ocr_cells.extend(cells) all_ocr_cells.extend(cells)
# Post-process the cells # Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells) self.post_process_cells(all_ocr_cells, page)
# DEBUG code: # DEBUG code:
if settings.debug.visualize_ocr: if settings.debug.visualize_ocr:

View File

@ -306,7 +306,7 @@ class TesseractOcrCliModel(BaseOcrModel):
all_ocr_cells.append(cell) all_ocr_cells.append(cell)
# Post-process the cells # Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells) self.post_process_cells(all_ocr_cells, page)
# DEBUG code: # DEBUG code:
if settings.debug.visualize_ocr: if settings.debug.visualize_ocr:

View File

@ -235,7 +235,7 @@ class TesseractOcrModel(BaseOcrModel):
all_ocr_cells.extend(cells) all_ocr_cells.extend(cells)
# Post-process the cells # Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells) self.post_process_cells(all_ocr_cells, page)
# DEBUG code: # DEBUG code:
if settings.debug.visualize_ocr: if settings.debug.visualize_ocr:

View File

@ -72,7 +72,6 @@ class StandardPdfPipeline(PaginatedPipeline):
PagePreprocessingModel( PagePreprocessingModel(
options=PagePreprocessingOptions( options=PagePreprocessingOptions(
images_scale=pipeline_options.images_scale, images_scale=pipeline_options.images_scale,
create_parsed_page=pipeline_options.generate_parsed_pages,
) )
), ),
# OCR # OCR

View File

@ -8,7 +8,7 @@ from docling_core.types.doc import DocItemLabel, Size
from docling_core.types.doc.page import TextCell from docling_core.types.doc.page import TextCell
from rtree import index from rtree import index
from docling.datamodel.base_models import BoundingBox, Cluster from docling.datamodel.base_models import BoundingBox, Cluster, Page
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
@ -194,11 +194,11 @@ class LayoutPostprocessor:
DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER, DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
} }
def __init__(self, cells: List[TextCell], clusters: List[Cluster], page_size: Size): def __init__(self, page: Page, clusters: List[Cluster]) -> None:
"""Initialize processor with cells and clusters.""" """Initialize processor with page and clusters."""
"""Initialize processor with cells and spatial indices.""" self.cells = page.cells
self.cells = cells self.page = page
self.page_size = page_size self.page_size = page.size
self.all_clusters = clusters self.all_clusters = clusters
self.regular_clusters = [ self.regular_clusters = [
c for c in clusters if c.label not in self.SPECIAL_TYPES c for c in clusters if c.label not in self.SPECIAL_TYPES
@ -240,6 +240,10 @@ class LayoutPostprocessor:
for child in cluster.children: for child in cluster.children:
child.cells = self._sort_cells(child.cells) child.cells = self._sort_cells(child.cells)
assert self.page.parsed_page is not None
self.page.parsed_page.textline_cells = self.cells
self.page.parsed_page.has_lines = len(self.cells) > 0
return final_clusters, self.cells return final_clusters, self.cells
def _process_regular_clusters(self) -> List[Cluster]: def _process_regular_clusters(self) -> List[Cluster]:
@ -301,6 +305,7 @@ class LayoutPostprocessor:
special_clusters = self._handle_cross_type_overlaps(special_clusters) special_clusters = self._handle_cross_type_overlaps(special_clusters)
# Calculate page area from known page size # Calculate page area from known page size
assert self.page_size is not None
page_area = self.page_size.width * self.page_size.height page_area = self.page_size.width * self.page_size.height
if page_area > 0: if page_area > 0:
# Filter out full-page pictures # Filter out full-page pictures

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -5,84 +5,159 @@
"width": 2000.0, "width": 2000.0,
"height": 2829.0 "height": 2829.0
}, },
"cells": [ "parsed_page": {
{ "dimension": {
"index": 0, "angle": 0.0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": { "rect": {
"r_x0": 246.4065456254215, "r_x0": 0.0,
"r_y0": 329.06770715202435, "r_y0": 0.0,
"r_x1": 1691.991797818404, "r_x1": 2000.0,
"r_y1": 329.06770715202435, "r_y1": 0.0,
"r_x2": 1691.991797818404, "r_x2": 2000.0,
"r_y2": 258.9040166758338, "r_y2": 2829.0,
"r_x3": 246.4065456254215, "r_x3": 0.0,
"r_y3": 258.9040166758338, "r_y3": 2829.0,
"coord_origin": "TOPLEFT" "coord_origin": "BOTTOMLEFT"
}, },
"text": "Docling bundles PDF document conversion to", "boundary_type": "crop_box",
"orig": "Docling bundles PDF document conversion to", "art_bbox": {
"text_direction": "left_to_right", "l": 0.0,
"confidence": 1.0, "t": 2829.0,
"from_ocr": true "r": 2000.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 2829.0,
"r": 2000.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 2829.0,
"r": 2000.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 2829.0,
"r": 2000.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 2829.0,
"r": 2000.0,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
}, },
{ "bitmap_resources": [
"index": 1, {
"rgba": { "index": 0,
"r": 0, "rect": {
"g": 0, "r_x0": 0.0,
"b": 0, "r_y0": 0.0,
"a": 255 "r_x1": 2000.0,
"r_y1": 0.0,
"r_x2": 2000.0,
"r_y2": 2829.0,
"r_x3": 0.0,
"r_y3": 2829.0,
"coord_origin": "BOTTOMLEFT"
},
"uri": null
}
],
"char_cells": [],
"word_cells": [],
"textline_cells": [
{
"index": 0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 246.4065456254215,
"r_y0": 329.06770715202435,
"r_x1": 1691.991797818404,
"r_y1": 329.06770715202435,
"r_x2": 1691.991797818404,
"r_y2": 258.9040166758338,
"r_x3": 246.4065456254215,
"r_y3": 258.9040166758338,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}, },
"rect": { {
"r_x0": 234.08627147881114, "index": 1,
"r_y0": 419.5788697734327, "rgba": {
"r_x1": 1696.0985042090742, "r": 0,
"r_y1": 419.5788697734327, "g": 0,
"r_x2": 1696.0985042090742, "b": 0,
"r_y2": 349.4151792972422, "a": 255
"r_x3": 234.08627147881114, },
"r_y3": 349.4151792972422, "rect": {
"coord_origin": "TOPLEFT" "r_x0": 234.08627147881114,
"r_y0": 419.5788697734327,
"r_x1": 1696.0985042090742,
"r_y1": 419.5788697734327,
"r_x2": 1696.0985042090742,
"r_y2": 349.4151792972422,
"r_x3": 234.08627147881114,
"r_y3": 349.4151792972422,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}, },
"text": "JSON and Markdown in an easy self contained", {
"orig": "JSON and Markdown in an easy self contained", "index": 2,
"text_direction": "left_to_right", "rgba": {
"confidence": 1.0, "r": 0,
"from_ocr": true "g": 0,
}, "b": 0,
{ "a": 255
"index": 2, },
"rgba": { "rect": {
"r": 0, "r_x0": 242.29979922858777,
"g": 0, "r_y0": 509.8779072023336,
"b": 0, "r_x1": 513.3470125989277,
"a": 255 "r_y1": 509.8779072023336,
}, "r_x2": 513.3470125989277,
"rect": { "r_y2": 439.9752910477536,
"r_x0": 242.29979922858777, "r_x3": 242.29979922858777,
"r_y0": 509.8779072023336, "r_y3": 439.9752910477536,
"r_x1": 513.3470125989277, "coord_origin": "TOPLEFT"
"r_y1": 509.8779072023336, },
"r_x2": 513.3470125989277, "text": "package",
"r_y2": 439.9752910477536, "orig": "package",
"r_x3": 242.29979922858777, "text_direction": "left_to_right",
"r_y3": 439.9752910477536, "confidence": 1.0,
"coord_origin": "TOPLEFT" "from_ocr": true
}, }
"text": "package", ],
"orig": "package", "has_chars": false,
"text_direction": "left_to_right", "has_words": false,
"confidence": 1.0, "has_lines": true,
"from_ocr": true "image": null,
} "lines": []
], },
"parsed_page": null,
"predictions": { "predictions": {
"layout": { "layout": {
"clusters": [ "clusters": [

View File

@ -5,84 +5,143 @@
"width": 595.201171875, "width": 595.201171875,
"height": 841.9216918945312 "height": 841.9216918945312
}, },
"cells": [ "parsed_page": {
{ "dimension": {
"index": 0, "angle": 0.0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": { "rect": {
"r_x0": 73.34702132031646, "r_x0": 0.0,
"r_y0": 97.99999977896755, "r_y0": 0.0,
"r_x1": 503.64955224479564, "r_x1": 595.201171875,
"r_y1": 97.99999977896755, "r_y1": 0.0,
"r_x2": 503.64955224479564, "r_x2": 595.201171875,
"r_y2": 76.99999977896756, "r_y2": 841.9216918945312,
"r_x3": 73.34702132031646, "r_x3": 0.0,
"r_y3": 76.99999977896756, "r_y3": 841.9216918945312,
"coord_origin": "TOPLEFT" "coord_origin": "BOTTOMLEFT"
}, },
"text": "Docling bundles PDF document conversion to", "boundary_type": "crop_box",
"orig": "Docling bundles PDF document conversion to", "art_bbox": {
"text_direction": "left_to_right", "l": 0.0,
"confidence": 1.0, "t": 841.9216918945312,
"from_ocr": true "r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
}, },
{ "bitmap_resources": [],
"index": 1, "char_cells": [],
"rgba": { "word_cells": [],
"r": 0, "textline_cells": [
"g": 0, {
"b": 0, "index": 0,
"a": 255 "rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 73.34702132031646,
"r_y0": 97.99999977896755,
"r_x1": 503.64955224479564,
"r_y1": 97.99999977896755,
"r_x2": 503.64955224479564,
"r_y2": 76.99999977896756,
"r_x3": 73.34702132031646,
"r_y3": 76.99999977896756,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}, },
"rect": { {
"r_x0": 69.6796630536824, "index": 1,
"r_y0": 124.83139494707741, "rgba": {
"r_x1": 504.8720051760782, "r": 0,
"r_y1": 124.83139494707741, "g": 0,
"r_x2": 504.8720051760782, "b": 0,
"r_y2": 104.00000011573796, "a": 255
"r_x3": 69.6796630536824, },
"r_y3": 104.00000011573796, "rect": {
"coord_origin": "TOPLEFT" "r_x0": 69.6796630536824,
"r_y0": 124.83139494707741,
"r_x1": 504.8720051760782,
"r_y1": 124.83139494707741,
"r_x2": 504.8720051760782,
"r_y2": 104.00000011573796,
"r_x3": 69.6796630536824,
"r_y3": 104.00000011573796,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}, },
"text": "JSON and Markdown in an easy self contained", {
"orig": "JSON and Markdown in an easy self contained", "index": 2,
"text_direction": "left_to_right", "rgba": {
"confidence": 1.0, "r": 0,
"from_ocr": true "g": 0,
}, "b": 0,
{ "a": 255
"index": 2, },
"rgba": { "rect": {
"r": 0, "r_x0": 71.84193505100733,
"g": 0, "r_y0": 152.90926970226084,
"b": 0, "r_x1": 153.088934155825,
"a": 255 "r_y1": 152.90926970226084,
}, "r_x2": 153.088934155825,
"rect": { "r_y2": 129.797125232046,
"r_x0": 71.84193505100733, "r_x3": 71.84193505100733,
"r_y0": 152.90926970226084, "r_y3": 129.797125232046,
"r_x1": 153.088934155825, "coord_origin": "TOPLEFT"
"r_y1": 152.90926970226084, },
"r_x2": 153.088934155825, "text": "package",
"r_y2": 129.797125232046, "orig": "package",
"r_x3": 71.84193505100733, "text_direction": "left_to_right",
"r_y3": 129.797125232046, "confidence": 1.0,
"coord_origin": "TOPLEFT" "from_ocr": true
}, }
"text": "package", ],
"orig": "package", "has_chars": false,
"text_direction": "left_to_right", "has_words": false,
"confidence": 1.0, "has_lines": true,
"from_ocr": true "image": null,
} "lines": []
], },
"parsed_page": null,
"predictions": { "predictions": {
"layout": { "layout": {
"clusters": [ "clusters": [

View File

@ -5,84 +5,143 @@
"width": 595.201171875, "width": 595.201171875,
"height": 841.9216918945312 "height": 841.9216918945312
}, },
"cells": [ "parsed_page": {
{ "dimension": {
"index": 0, "angle": 0.0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": { "rect": {
"r_x0": 89.2388782764286, "r_x0": 0.0,
"r_y0": 764.898293373551, "r_y0": 0.0,
"r_x1": 521.9863147998661, "r_x1": 595.201171875,
"r_y1": 764.898293373551, "r_y1": 0.0,
"r_x2": 521.9863147998661, "r_x2": 595.201171875,
"r_y2": 744.0929853494625, "r_y2": 841.9216918945312,
"r_x3": 89.2388782764286, "r_x3": 0.0,
"r_y3": 744.0929853494625, "r_y3": 841.9216918945312,
"coord_origin": "TOPLEFT" "coord_origin": "BOTTOMLEFT"
}, },
"text": "Docling bundles PDF document conversion to", "boundary_type": "crop_box",
"orig": "Docling bundles PDF document conversion to", "art_bbox": {
"text_direction": "left_to_right", "l": 0.0,
"confidence": 1.0, "t": 841.9216918945312,
"from_ocr": true "r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
}, },
{ "bitmap_resources": [],
"index": 1, "char_cells": [],
"rgba": { "word_cells": [],
"r": 0, "textline_cells": [
"g": 0, {
"b": 0, "index": 0,
"a": 255 "rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 89.2388782764286,
"r_y0": 764.898293373551,
"r_x1": 521.9863147998661,
"r_y1": 764.898293373551,
"r_x2": 521.9863147998661,
"r_y2": 744.0929853494625,
"r_x3": 89.2388782764286,
"r_y3": 744.0929853494625,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}, },
"rect": { {
"r_x0": 89.23887497045128, "index": 1,
"r_y0": 739.1977118987292, "rgba": {
"r_x1": 523.208764293368, "r": 0,
"r_y1": 739.1977118987292, "g": 0,
"r_x2": 523.208764293368, "b": 0,
"r_y2": 717.1685676116198, "a": 255
"r_x3": 89.23887497045128, },
"r_y3": 717.1685676116198, "rect": {
"coord_origin": "TOPLEFT" "r_x0": 89.23887497045128,
"r_y0": 739.1977118987292,
"r_x1": 523.208764293368,
"r_y1": 739.1977118987292,
"r_x2": 523.208764293368,
"r_y2": 717.1685676116198,
"r_x3": 89.23887497045128,
"r_y3": 717.1685676116198,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}, },
"text": "JSON and Markdown in an easy self contained", {
"orig": "JSON and Markdown in an easy self contained", "index": 2,
"text_direction": "left_to_right", "rgba": {
"confidence": 1.0, "r": 0,
"from_ocr": true "g": 0,
}, "b": 0,
{ "a": 255
"index": 2, },
"rgba": { "rect": {
"r": 0, "r_x0": 441.2561096985719,
"g": 0, "r_y0": 710.0268078458798,
"b": 0, "r_x1": 522.0347860494834,
"a": 255 "r_y1": 710.0268078458798,
}, "r_x2": 522.0347860494834,
"rect": { "r_y2": 690.0429592741025,
"r_x0": 441.2561096985719, "r_x3": 441.2561096985719,
"r_y0": 710.0268078458798, "r_y3": 690.0429592741025,
"r_x1": 522.0347860494834, "coord_origin": "TOPLEFT"
"r_y1": 710.0268078458798, },
"r_x2": 522.0347860494834, "text": "package",
"r_y2": 690.0429592741025, "orig": "package",
"r_x3": 441.2561096985719, "text_direction": "left_to_right",
"r_y3": 690.0429592741025, "confidence": 1.0,
"coord_origin": "TOPLEFT" "from_ocr": true
}, }
"text": "package", ],
"orig": "package", "has_chars": false,
"text_direction": "left_to_right", "has_words": false,
"confidence": 1.0, "has_lines": true,
"from_ocr": true "image": null,
} "lines": []
], },
"parsed_page": null,
"predictions": { "predictions": {
"layout": { "layout": {
"clusters": [ "clusters": [

View File

@ -5,84 +5,143 @@
"width": 841.9216918945312, "width": 841.9216918945312,
"height": 595.201171875 "height": 595.201171875
}, },
"cells": [ "parsed_page": {
{ "dimension": {
"index": 0, "angle": 0.0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": { "rect": {
"r_x0": 744.0930045534915, "r_x0": 0.0,
"r_y0": 504.87200373583954, "r_y0": 0.0,
"r_x1": 764.8982839673505, "r_x1": 595.201171875,
"r_y1": 504.87200373583954, "r_y1": 0.0,
"r_x2": 764.8982839673505, "r_x2": 595.201171875,
"r_y2": 73.34702001188118, "r_y2": 841.9216918945312,
"r_x3": 744.0930045534915, "r_x3": 0.0,
"r_y3": 73.34702001188118, "r_y3": 841.9216918945312,
"coord_origin": "TOPLEFT" "coord_origin": "BOTTOMLEFT"
}, },
"text": "Docling bundles PDF document conversion to", "boundary_type": "crop_box",
"orig": "Docling bundles PDF document conversion to", "art_bbox": {
"text_direction": "left_to_right", "l": 0.0,
"confidence": 1.0, "t": 841.9216918945312,
"from_ocr": true "r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
}, },
{ "bitmap_resources": [],
"index": 1, "char_cells": [],
"rgba": { "word_cells": [],
"r": 0, "textline_cells": [
"g": 0, {
"b": 0, "index": 0,
"a": 255 "rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 744.0930045534915,
"r_y0": 504.87200373583954,
"r_x1": 764.8982839673505,
"r_y1": 504.87200373583954,
"r_x2": 764.8982839673505,
"r_y2": 73.34702001188118,
"r_x3": 744.0930045534915,
"r_y3": 73.34702001188118,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}, },
"rect": { {
"r_x0": 717.168585936602, "index": 1,
"r_y0": 504.8720061466397, "rgba": {
"r_x1": 737.9738558137178, "r": 0,
"r_y1": 504.8720061466397, "g": 0,
"r_x2": 737.9738558137178, "b": 0,
"r_y2": 70.90211682372312, "a": 255
"r_x3": 717.168585936602, },
"r_y3": 70.90211682372312, "rect": {
"coord_origin": "TOPLEFT" "r_x0": 717.168585936602,
"r_y0": 504.8720061466397,
"r_x1": 737.9738558137178,
"r_y1": 504.8720061466397,
"r_x2": 737.9738558137178,
"r_y2": 70.90211682372312,
"r_x3": 717.168585936602,
"r_y3": 70.90211682372312,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}, },
"text": "JSON and Markdown in an easy self contained", {
"orig": "JSON and Markdown in an easy self contained", "index": 2,
"text_direction": "left_to_right", "rgba": {
"confidence": 1.0, "r": 0,
"from_ocr": true "g": 0,
}, "b": 0,
{ "a": 255
"index": 2, },
"rgba": { "rect": {
"r": 0, "r_x0": 690.2441821046808,
"g": 0, "r_y0": 152.80629773131633,
"b": 0, "r_x1": 709.8255852011977,
"a": 255 "r_y1": 152.80629773131633,
}, "r_x2": 709.8255852011977,
"rect": { "r_y2": 72.124570639845,
"r_x0": 690.2441821046808, "r_x3": 690.2441821046808,
"r_y0": 152.80629773131633, "r_y3": 72.124570639845,
"r_x1": 709.8255852011977, "coord_origin": "TOPLEFT"
"r_y1": 152.80629773131633, },
"r_x2": 709.8255852011977, "text": "package",
"r_y2": 72.124570639845, "orig": "package",
"r_x3": 690.2441821046808, "text_direction": "left_to_right",
"r_y3": 72.124570639845, "confidence": 1.0,
"coord_origin": "TOPLEFT" "from_ocr": true
}, }
"text": "package", ],
"orig": "package", "has_chars": false,
"text_direction": "left_to_right", "has_words": false,
"confidence": 1.0, "has_lines": true,
"from_ocr": true "image": null,
} "lines": []
], },
"parsed_page": null,
"predictions": { "predictions": {
"layout": { "layout": {
"clusters": [ "clusters": [

View File

@ -5,84 +5,143 @@
"width": 841.9216918945312, "width": 841.9216918945312,
"height": 595.201171875 "height": 595.201171875
}, },
"cells": [ "parsed_page": {
{ "dimension": {
"index": 0, "angle": 0.0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": { "rect": {
"r_x0": 77.10171545548258, "r_x0": 0.0,
"r_y0": 520.7638571913312, "r_y0": 0.0,
"r_x1": 96.68315797053792, "r_x1": 595.201171875,
"r_y1": 520.7638571913312, "r_y1": 0.0,
"r_x2": 96.68315797053792, "r_x2": 595.201171875,
"r_y2": 89.2388734673729, "r_y2": 841.9216918945312,
"r_x3": 77.10171545548258, "r_x3": 0.0,
"r_y3": 89.2388734673729, "r_y3": 841.9216918945312,
"coord_origin": "TOPLEFT" "coord_origin": "BOTTOMLEFT"
}, },
"text": "Docling bundles PDF document conversion to", "boundary_type": "crop_box",
"orig": "Docling bundles PDF document conversion to", "art_bbox": {
"text_direction": "left_to_right", "l": 0.0,
"confidence": 1.0, "t": 841.9216918945312,
"from_ocr": true "r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
}, },
{ "bitmap_resources": [],
"index": 1, "char_cells": [],
"rgba": { "word_cells": [],
"r": 0, "textline_cells": [
"g": 0, {
"b": 0, "index": 0,
"a": 255 "rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 77.10171545548258,
"r_y0": 520.7638571913312,
"r_x1": 96.68315797053792,
"r_y1": 520.7638571913312,
"r_x2": 96.68315797053792,
"r_y2": 89.2388734673729,
"r_x3": 77.10171545548258,
"r_y3": 89.2388734673729,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}, },
"rect": { {
"r_x0": 100.64168123325977, "index": 1,
"r_y0": 523.3236155182395, "rgba": {
"r_x1": 126.08064862014129, "r": 0,
"r_y1": 523.3236155182395, "g": 0,
"r_x2": 126.08064862014129, "b": 0,
"r_y2": 89.1266754140729, "a": 255
"r_x3": 100.64168123325977, },
"r_y3": 89.1266754140729, "rect": {
"coord_origin": "TOPLEFT" "r_x0": 100.64168123325977,
"r_y0": 523.3236155182395,
"r_x1": 126.08064862014129,
"r_y1": 523.3236155182395,
"r_x2": 126.08064862014129,
"r_y2": 89.1266754140729,
"r_x3": 100.64168123325977,
"r_y3": 89.1266754140729,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}, },
"text": "JSON and Markdown in an easy self contained", {
"orig": "JSON and Markdown in an easy self contained", "index": 2,
"text_direction": "left_to_right", "rgba": {
"confidence": 1.0, "r": 0,
"from_ocr": true "g": 0,
}, "b": 0,
{ "a": 255
"index": 2, },
"rgba": { "rect": {
"r": 0, "r_x0": 131.21306574279092,
"g": 0, "r_y0": 521.0762158417759,
"b": 0, "r_x1": 152.19606490864376,
"a": 255 "r_y1": 521.0762158417759,
}, "r_x2": 152.19606490864376,
"rect": { "r_y2": 441.0071698212682,
"r_x0": 131.21306574279092, "r_x3": 131.21306574279092,
"r_y0": 521.0762158417759, "r_y3": 441.0071698212682,
"r_x1": 152.19606490864376, "coord_origin": "TOPLEFT"
"r_y1": 521.0762158417759, },
"r_x2": 152.19606490864376, "text": "package",
"r_y2": 441.0071698212682, "orig": "package",
"r_x3": 131.21306574279092, "text_direction": "left_to_right",
"r_y3": 441.0071698212682, "confidence": 1.0,
"coord_origin": "TOPLEFT" "from_ocr": true
}, }
"text": "package", ],
"orig": "package", "has_chars": false,
"text_direction": "left_to_right", "has_words": false,
"confidence": 1.0, "has_lines": true,
"from_ocr": true "image": null,
} "lines": []
], },
"parsed_page": null,
"predictions": { "predictions": {
"layout": { "layout": {
"clusters": [ "clusters": [

View File

@ -5,84 +5,143 @@
"width": 595.201171875, "width": 595.201171875,
"height": 841.9216918945312 "height": 841.9216918945312
}, },
"cells": [ "parsed_page": {
{ "dimension": {
"index": 0, "angle": 0.0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": { "rect": {
"r_x0": 73.34702132031646, "r_x0": 0.0,
"r_y0": 97.99999977896755, "r_y0": 0.0,
"r_x1": 503.64955224479564, "r_x1": 595.201171875,
"r_y1": 97.99999977896755, "r_y1": 0.0,
"r_x2": 503.64955224479564, "r_x2": 595.201171875,
"r_y2": 76.99999977896756, "r_y2": 841.9216918945312,
"r_x3": 73.34702132031646, "r_x3": 0.0,
"r_y3": 76.99999977896756, "r_y3": 841.9216918945312,
"coord_origin": "TOPLEFT" "coord_origin": "BOTTOMLEFT"
}, },
"text": "Docling bundles PDF document conversion to", "boundary_type": "crop_box",
"orig": "Docling bundles PDF document conversion to", "art_bbox": {
"text_direction": "left_to_right", "l": 0.0,
"confidence": 1.0, "t": 841.9216918945312,
"from_ocr": true "r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
}, },
{ "bitmap_resources": [],
"index": 1, "char_cells": [],
"rgba": { "word_cells": [],
"r": 0, "textline_cells": [
"g": 0, {
"b": 0, "index": 0,
"a": 255 "rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 73.34702132031646,
"r_y0": 97.99999977896755,
"r_x1": 503.64955224479564,
"r_y1": 97.99999977896755,
"r_x2": 503.64955224479564,
"r_y2": 76.99999977896756,
"r_x3": 73.34702132031646,
"r_y3": 76.99999977896756,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}, },
"rect": { {
"r_x0": 69.6796630536824, "index": 1,
"r_y0": 124.83139494707741, "rgba": {
"r_x1": 504.8720051760782, "r": 0,
"r_y1": 124.83139494707741, "g": 0,
"r_x2": 504.8720051760782, "b": 0,
"r_y2": 104.00000011573796, "a": 255
"r_x3": 69.6796630536824, },
"r_y3": 104.00000011573796, "rect": {
"coord_origin": "TOPLEFT" "r_x0": 69.6796630536824,
"r_y0": 124.83139494707741,
"r_x1": 504.8720051760782,
"r_y1": 124.83139494707741,
"r_x2": 504.8720051760782,
"r_y2": 104.00000011573796,
"r_x3": 69.6796630536824,
"r_y3": 104.00000011573796,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}, },
"text": "JSON and Markdown in an easy self contained", {
"orig": "JSON and Markdown in an easy self contained", "index": 2,
"text_direction": "left_to_right", "rgba": {
"confidence": 1.0, "r": 0,
"from_ocr": true "g": 0,
}, "b": 0,
{ "a": 255
"index": 2, },
"rgba": { "rect": {
"r": 0, "r_x0": 71.84193505100733,
"g": 0, "r_y0": 152.90926970226084,
"b": 0, "r_x1": 153.088934155825,
"a": 255 "r_y1": 152.90926970226084,
}, "r_x2": 153.088934155825,
"rect": { "r_y2": 129.797125232046,
"r_x0": 71.84193505100733, "r_x3": 71.84193505100733,
"r_y0": 152.90926970226084, "r_y3": 129.797125232046,
"r_x1": 153.088934155825, "coord_origin": "TOPLEFT"
"r_y1": 152.90926970226084, },
"r_x2": 153.088934155825, "text": "package",
"r_y2": 129.797125232046, "orig": "package",
"r_x3": 71.84193505100733, "text_direction": "left_to_right",
"r_y3": 129.797125232046, "confidence": 1.0,
"coord_origin": "TOPLEFT" "from_ocr": true
}, }
"text": "package", ],
"orig": "package", "has_chars": false,
"text_direction": "left_to_right", "has_words": false,
"confidence": 1.0, "has_lines": true,
"from_ocr": true "image": null,
} "lines": []
], },
"parsed_page": null,
"predictions": { "predictions": {
"layout": { "layout": {
"clusters": [ "clusters": [

View File

@ -5,84 +5,143 @@
"width": 595.201171875, "width": 595.201171875,
"height": 841.9216918945312 "height": 841.9216918945312
}, },
"cells": [ "parsed_page": {
{ "dimension": {
"index": 0, "angle": 0.0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": { "rect": {
"r_x0": 89.2388782764286, "r_x0": 0.0,
"r_y0": 764.898293373551, "r_y0": 0.0,
"r_x1": 521.9863147998661, "r_x1": 595.201171875,
"r_y1": 764.898293373551, "r_y1": 0.0,
"r_x2": 521.9863147998661, "r_x2": 595.201171875,
"r_y2": 744.0929853494625, "r_y2": 841.9216918945312,
"r_x3": 89.2388782764286, "r_x3": 0.0,
"r_y3": 744.0929853494625, "r_y3": 841.9216918945312,
"coord_origin": "TOPLEFT" "coord_origin": "BOTTOMLEFT"
}, },
"text": "Docling bundles PDF document conversion to", "boundary_type": "crop_box",
"orig": "Docling bundles PDF document conversion to", "art_bbox": {
"text_direction": "left_to_right", "l": 0.0,
"confidence": 1.0, "t": 841.9216918945312,
"from_ocr": true "r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
}, },
{ "bitmap_resources": [],
"index": 1, "char_cells": [],
"rgba": { "word_cells": [],
"r": 0, "textline_cells": [
"g": 0, {
"b": 0, "index": 0,
"a": 255 "rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 89.2388782764286,
"r_y0": 764.898293373551,
"r_x1": 521.9863147998661,
"r_y1": 764.898293373551,
"r_x2": 521.9863147998661,
"r_y2": 744.0929853494625,
"r_x3": 89.2388782764286,
"r_y3": 744.0929853494625,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}, },
"rect": { {
"r_x0": 89.23887497045128, "index": 1,
"r_y0": 739.1977118987292, "rgba": {
"r_x1": 523.208764293368, "r": 0,
"r_y1": 739.1977118987292, "g": 0,
"r_x2": 523.208764293368, "b": 0,
"r_y2": 717.1685676116198, "a": 255
"r_x3": 89.23887497045128, },
"r_y3": 717.1685676116198, "rect": {
"coord_origin": "TOPLEFT" "r_x0": 89.23887497045128,
"r_y0": 739.1977118987292,
"r_x1": 523.208764293368,
"r_y1": 739.1977118987292,
"r_x2": 523.208764293368,
"r_y2": 717.1685676116198,
"r_x3": 89.23887497045128,
"r_y3": 717.1685676116198,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}, },
"text": "JSON and Markdown in an easy self contained", {
"orig": "JSON and Markdown in an easy self contained", "index": 2,
"text_direction": "left_to_right", "rgba": {
"confidence": 1.0, "r": 0,
"from_ocr": true "g": 0,
}, "b": 0,
{ "a": 255
"index": 2, },
"rgba": { "rect": {
"r": 0, "r_x0": 441.2561096985719,
"g": 0, "r_y0": 710.0268078458798,
"b": 0, "r_x1": 522.0347860494834,
"a": 255 "r_y1": 710.0268078458798,
}, "r_x2": 522.0347860494834,
"rect": { "r_y2": 690.0429592741025,
"r_x0": 441.2561096985719, "r_x3": 441.2561096985719,
"r_y0": 710.0268078458798, "r_y3": 690.0429592741025,
"r_x1": 522.0347860494834, "coord_origin": "TOPLEFT"
"r_y1": 710.0268078458798, },
"r_x2": 522.0347860494834, "text": "package",
"r_y2": 690.0429592741025, "orig": "package",
"r_x3": 441.2561096985719, "text_direction": "left_to_right",
"r_y3": 690.0429592741025, "confidence": 1.0,
"coord_origin": "TOPLEFT" "from_ocr": true
}, }
"text": "package", ],
"orig": "package", "has_chars": false,
"text_direction": "left_to_right", "has_words": false,
"confidence": 1.0, "has_lines": true,
"from_ocr": true "image": null,
} "lines": []
], },
"parsed_page": null,
"predictions": { "predictions": {
"layout": { "layout": {
"clusters": [ "clusters": [

View File

@ -5,84 +5,143 @@
"width": 841.9216918945312, "width": 841.9216918945312,
"height": 595.201171875 "height": 595.201171875
}, },
"cells": [ "parsed_page": {
{ "dimension": {
"index": 0, "angle": 0.0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": { "rect": {
"r_x0": 744.0930045534915, "r_x0": 0.0,
"r_y0": 504.87200373583954, "r_y0": 0.0,
"r_x1": 764.8982839673505, "r_x1": 595.201171875,
"r_y1": 504.87200373583954, "r_y1": 0.0,
"r_x2": 764.8982839673505, "r_x2": 595.201171875,
"r_y2": 73.34702001188118, "r_y2": 841.9216918945312,
"r_x3": 744.0930045534915, "r_x3": 0.0,
"r_y3": 73.34702001188118, "r_y3": 841.9216918945312,
"coord_origin": "TOPLEFT" "coord_origin": "BOTTOMLEFT"
}, },
"text": "Docling bundles PDF document conversion to", "boundary_type": "crop_box",
"orig": "Docling bundles PDF document conversion to", "art_bbox": {
"text_direction": "left_to_right", "l": 0.0,
"confidence": 1.0, "t": 841.9216918945312,
"from_ocr": true "r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
}, },
{ "bitmap_resources": [],
"index": 1, "char_cells": [],
"rgba": { "word_cells": [],
"r": 0, "textline_cells": [
"g": 0, {
"b": 0, "index": 0,
"a": 255 "rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 744.0930045534915,
"r_y0": 504.87200373583954,
"r_x1": 764.8982839673505,
"r_y1": 504.87200373583954,
"r_x2": 764.8982839673505,
"r_y2": 73.34702001188118,
"r_x3": 744.0930045534915,
"r_y3": 73.34702001188118,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}, },
"rect": { {
"r_x0": 717.168585936602, "index": 1,
"r_y0": 504.8720061466397, "rgba": {
"r_x1": 737.9738558137178, "r": 0,
"r_y1": 504.8720061466397, "g": 0,
"r_x2": 737.9738558137178, "b": 0,
"r_y2": 70.90211682372312, "a": 255
"r_x3": 717.168585936602, },
"r_y3": 70.90211682372312, "rect": {
"coord_origin": "TOPLEFT" "r_x0": 717.168585936602,
"r_y0": 504.8720061466397,
"r_x1": 737.9738558137178,
"r_y1": 504.8720061466397,
"r_x2": 737.9738558137178,
"r_y2": 70.90211682372312,
"r_x3": 717.168585936602,
"r_y3": 70.90211682372312,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}, },
"text": "JSON and Markdown in an easy self contained", {
"orig": "JSON and Markdown in an easy self contained", "index": 2,
"text_direction": "left_to_right", "rgba": {
"confidence": 1.0, "r": 0,
"from_ocr": true "g": 0,
}, "b": 0,
{ "a": 255
"index": 2, },
"rgba": { "rect": {
"r": 0, "r_x0": 690.2441821046808,
"g": 0, "r_y0": 152.80629773131633,
"b": 0, "r_x1": 709.8255852011977,
"a": 255 "r_y1": 152.80629773131633,
}, "r_x2": 709.8255852011977,
"rect": { "r_y2": 72.124570639845,
"r_x0": 690.2441821046808, "r_x3": 690.2441821046808,
"r_y0": 152.80629773131633, "r_y3": 72.124570639845,
"r_x1": 709.8255852011977, "coord_origin": "TOPLEFT"
"r_y1": 152.80629773131633, },
"r_x2": 709.8255852011977, "text": "package",
"r_y2": 72.124570639845, "orig": "package",
"r_x3": 690.2441821046808, "text_direction": "left_to_right",
"r_y3": 72.124570639845, "confidence": 1.0,
"coord_origin": "TOPLEFT" "from_ocr": true
}, }
"text": "package", ],
"orig": "package", "has_chars": false,
"text_direction": "left_to_right", "has_words": false,
"confidence": 1.0, "has_lines": true,
"from_ocr": true "image": null,
} "lines": []
], },
"parsed_page": null,
"predictions": { "predictions": {
"layout": { "layout": {
"clusters": [ "clusters": [

View File

@ -5,84 +5,143 @@
"width": 841.9216918945312, "width": 841.9216918945312,
"height": 595.201171875 "height": 595.201171875
}, },
"cells": [ "parsed_page": {
{ "dimension": {
"index": 0, "angle": 0.0,
"rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": { "rect": {
"r_x0": 77.10171545548258, "r_x0": 0.0,
"r_y0": 520.7638571913312, "r_y0": 0.0,
"r_x1": 96.68315797053792, "r_x1": 595.201171875,
"r_y1": 520.7638571913312, "r_y1": 0.0,
"r_x2": 96.68315797053792, "r_x2": 595.201171875,
"r_y2": 89.2388734673729, "r_y2": 841.9216918945312,
"r_x3": 77.10171545548258, "r_x3": 0.0,
"r_y3": 89.2388734673729, "r_y3": 841.9216918945312,
"coord_origin": "TOPLEFT" "coord_origin": "BOTTOMLEFT"
}, },
"text": "Docling bundles PDF document conversion to", "boundary_type": "crop_box",
"orig": "Docling bundles PDF document conversion to", "art_bbox": {
"text_direction": "left_to_right", "l": 0.0,
"confidence": 1.0, "t": 841.9216918945312,
"from_ocr": true "r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"bleed_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"crop_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"media_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
},
"trim_bbox": {
"l": 0.0,
"t": 841.9216918945312,
"r": 595.201171875,
"b": 0.0,
"coord_origin": "BOTTOMLEFT"
}
}, },
{ "bitmap_resources": [],
"index": 1, "char_cells": [],
"rgba": { "word_cells": [],
"r": 0, "textline_cells": [
"g": 0, {
"b": 0, "index": 0,
"a": 255 "rgba": {
"r": 0,
"g": 0,
"b": 0,
"a": 255
},
"rect": {
"r_x0": 77.10171545548258,
"r_y0": 520.7638571913312,
"r_x1": 96.68315797053792,
"r_y1": 520.7638571913312,
"r_x2": 96.68315797053792,
"r_y2": 89.2388734673729,
"r_x3": 77.10171545548258,
"r_y3": 89.2388734673729,
"coord_origin": "TOPLEFT"
},
"text": "Docling bundles PDF document conversion to",
"orig": "Docling bundles PDF document conversion to",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}, },
"rect": { {
"r_x0": 100.64168123325977, "index": 1,
"r_y0": 523.3236155182395, "rgba": {
"r_x1": 126.08064862014129, "r": 0,
"r_y1": 523.3236155182395, "g": 0,
"r_x2": 126.08064862014129, "b": 0,
"r_y2": 89.1266754140729, "a": 255
"r_x3": 100.64168123325977, },
"r_y3": 89.1266754140729, "rect": {
"coord_origin": "TOPLEFT" "r_x0": 100.64168123325977,
"r_y0": 523.3236155182395,
"r_x1": 126.08064862014129,
"r_y1": 523.3236155182395,
"r_x2": 126.08064862014129,
"r_y2": 89.1266754140729,
"r_x3": 100.64168123325977,
"r_y3": 89.1266754140729,
"coord_origin": "TOPLEFT"
},
"text": "JSON and Markdown in an easy self contained",
"orig": "JSON and Markdown in an easy self contained",
"text_direction": "left_to_right",
"confidence": 1.0,
"from_ocr": true
}, },
"text": "JSON and Markdown in an easy self contained", {
"orig": "JSON and Markdown in an easy self contained", "index": 2,
"text_direction": "left_to_right", "rgba": {
"confidence": 1.0, "r": 0,
"from_ocr": true "g": 0,
}, "b": 0,
{ "a": 255
"index": 2, },
"rgba": { "rect": {
"r": 0, "r_x0": 131.21306574279092,
"g": 0, "r_y0": 521.0762158417759,
"b": 0, "r_x1": 152.19606490864376,
"a": 255 "r_y1": 521.0762158417759,
}, "r_x2": 152.19606490864376,
"rect": { "r_y2": 441.0071698212682,
"r_x0": 131.21306574279092, "r_x3": 131.21306574279092,
"r_y0": 521.0762158417759, "r_y3": 441.0071698212682,
"r_x1": 152.19606490864376, "coord_origin": "TOPLEFT"
"r_y1": 521.0762158417759, },
"r_x2": 152.19606490864376, "text": "package",
"r_y2": 441.0071698212682, "orig": "package",
"r_x3": 131.21306574279092, "text_direction": "left_to_right",
"r_y3": 441.0071698212682, "confidence": 1.0,
"coord_origin": "TOPLEFT" "from_ocr": true
}, }
"text": "package", ],
"orig": "package", "has_chars": false,
"text_direction": "left_to_right", "has_words": false,
"confidence": 1.0, "has_lines": true,
"from_ocr": true "image": null,
} "lines": []
], },
"parsed_page": null,
"predictions": { "predictions": {
"layout": { "layout": {
"clusters": [ "clusters": [

View File

@ -57,14 +57,14 @@ def test_e2e_conversions():
pdf_paths = get_pdf_paths() pdf_paths = get_pdf_paths()
engines: List[Tuple[OcrOptions, bool]] = [ engines: List[Tuple[OcrOptions, bool]] = [
(EasyOcrOptions(), False),
(TesseractOcrOptions(), True), (TesseractOcrOptions(), True),
(TesseractCliOcrOptions(), True), (TesseractCliOcrOptions(), True),
(EasyOcrOptions(force_full_page_ocr=True), False), (EasyOcrOptions(), False),
(TesseractOcrOptions(force_full_page_ocr=True), True), (TesseractOcrOptions(force_full_page_ocr=True), True),
(TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]), True), (TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
(TesseractCliOcrOptions(force_full_page_ocr=True), True), (TesseractCliOcrOptions(force_full_page_ocr=True), True),
(TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]), True), (TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
(EasyOcrOptions(force_full_page_ocr=True), False),
] ]
# rapidocr is only available for Python >=3.6,<3.13 # rapidocr is only available for Python >=3.6,<3.13