feat: Make Page.parsed_page the only source of truth for text cells, add OCR cells to it (#1745)
* Keep page.parsed_page.textline_cells and page.cells in sync, including OCR Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Make page.parsed_page the only source of truth for text cells Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Small fix Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Correctly compute PDF boxes from pymupdf Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Use different OCR engine order Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add type hints and fix mypy Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * One more test fix Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Remove with pypdfium2_lock from caller sites Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix typing Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
0432a31b2f
commit
7d3302cb48
@ -7,12 +7,17 @@ from typing import List, Optional, Union
|
||||
|
||||
import pypdfium2 as pdfium
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
||||
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
|
||||
from docling_core.types.doc.page import (
|
||||
BoundingRectangle,
|
||||
SegmentedPdfPage,
|
||||
TextCell,
|
||||
)
|
||||
from docling_parse.pdf_parsers import pdf_parser_v1
|
||||
from PIL import Image, ImageDraw
|
||||
from pypdfium2 import PdfPage
|
||||
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docling.backend.pypdfium2_backend import get_pdf_page_geometry
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
@ -36,6 +41,51 @@ class DoclingParsePageBackend(PdfPageBackend):
|
||||
def is_valid(self) -> bool:
|
||||
return self.valid
|
||||
|
||||
def _compute_text_cells(self) -> List[TextCell]:
    """Build the page's text cells from the docling-parse page dictionary.

    Every entry of ``self._dpage["cells"]`` yields one ``TextCell``:

    * its ``box.device`` rectangle is normalised so that ``x0 <= x1`` and
      ``y0 <= y1``;
    * the rectangle is rescaled from the parser's page dimensions
      (``self._dpage["width"]`` / ``["height"]``) to the size reported by
      ``self.get_size()``;
    * the result is declared bottom-left based and converted to top-left
      origin.

    The cell text is ``content.rnormalized`` and is stored as both ``text``
    and ``orig``; ``from_ocr`` is always ``False``.

    Returns:
        The cells in parser order, indexed from 0; an empty list when the
        page is not valid.
    """
    if not self.valid:
        return []

    page_size = self.get_size()
    parser_width = self._dpage["width"]
    parser_height = self._dpage["height"]

    text_cells: List[TextCell] = []
    for position, raw_cell in enumerate(self._dpage["cells"]):
        left, bottom, right, top = raw_cell["box"]["device"]

        # The parser may emit flipped boxes; put the corners in order.
        if right < left:
            left, right = right, left
        if top < bottom:
            bottom, top = top, bottom

        content = raw_cell["content"]["rnormalized"]

        scaled_box = BoundingBox(
            l=left * page_size.width / parser_width,
            b=bottom * page_size.height / parser_height,
            r=right * page_size.width / parser_width,
            t=top * page_size.height / parser_height,
            coord_origin=CoordOrigin.BOTTOMLEFT,
        )
        rect = BoundingRectangle.from_bounding_box(scaled_box).to_top_left_origin(
            page_size.height
        )

        text_cells.append(
            TextCell(
                index=position,
                text=content,
                orig=content,
                from_ocr=False,
                rect=rect,
            )
        )

    return text_cells
|
||||
|
||||
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
||||
if not self.valid:
|
||||
return ""
|
||||
@ -70,75 +120,27 @@ class DoclingParsePageBackend(PdfPageBackend):
|
||||
return text_piece
|
||||
|
||||
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
    """Return this page as a ``SegmentedPdfPage`` holding text-line cells only.

    The text lines come from ``self._compute_text_cells()`` and the page
    geometry from ``get_pdf_page_geometry(self._ppage)``. Character and word
    cells are left empty and flagged as not available.

    Returns:
        The segmented page, or ``None`` when the page is not valid.
    """
    # The leftover unconditional `return None` stub is gone: it made
    # everything below unreachable.
    if not self.valid:
        return None

    text_cells = self._compute_text_cells()

    # Page geometry is read from the pypdfium2 page object.
    dimension = get_pdf_page_geometry(self._ppage)

    return SegmentedPdfPage(
        dimension=dimension,
        textline_cells=text_cells,
        char_cells=[],
        word_cells=[],
        has_lines=len(text_cells) > 0,
        has_words=False,
        has_chars=False,
    )
|
||||
|
||||
def get_text_cells(self) -> Iterable[TextCell]:
    """Return the text cells of this page.

    Delegates to ``self._compute_text_cells()``, which holds the single copy
    of the cell-building logic. The duplicated loop, the never-called
    ``draw_clusters_and_cells`` debug helper and the unreachable second
    ``return`` that used to live here are removed.

    Returns:
        Whatever ``_compute_text_cells`` returns (an empty list for an
        invalid page).
    """
    return self._compute_text_cells()
|
||||
|
||||
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
||||
AREA_THRESHOLD = 0 # 32 * 32
|
||||
|
@ -7,12 +7,19 @@ from typing import TYPE_CHECKING, List, Optional, Union
|
||||
|
||||
import pypdfium2 as pdfium
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
|
||||
from docling_core.types.doc.page import (
|
||||
BoundingRectangle,
|
||||
PdfPageBoundaryType,
|
||||
PdfPageGeometry,
|
||||
SegmentedPdfPage,
|
||||
TextCell,
|
||||
)
|
||||
from docling_parse.pdf_parsers import pdf_parser_v2
|
||||
from PIL import Image, ImageDraw
|
||||
from pypdfium2 import PdfPage
|
||||
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docling.backend.pypdfium2_backend import get_pdf_page_geometry
|
||||
from docling.datamodel.base_models import Size
|
||||
from docling.utils.locks import pypdfium2_lock
|
||||
|
||||
@ -40,6 +47,55 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
||||
def is_valid(self) -> bool:
|
||||
return self.valid
|
||||
|
||||
def _compute_text_cells(self) -> List[TextCell]:
    """Build the page's text cells from the docling-parse v2 page dictionary.

    Cells are read from ``self._dpage["sanitized"]["cells"]``, a table with a
    ``header`` (column names) and ``data`` (one row per cell). For every row
    the ``x0``/``y0``/``x1``/``y1`` columns are normalised so that
    ``x0 <= x1`` and ``y0 <= y1``, rescaled from the parser's page dimension
    (``sanitized.dimension``) to the size reported by ``self.get_size()``,
    declared bottom-left based and converted to top-left origin. The ``text``
    column is stored as both ``text`` and ``orig``; ``from_ocr`` is ``False``.

    Returns:
        The cells in row order, indexed from 0; an empty list when the page
        is not valid or has no cell rows.

    Raises:
        ValueError: if there are cell rows but the header lacks one of the
            required column names.
    """
    cells: List[TextCell] = []

    if not self.valid:
        return cells

    page_size = self.get_size()

    sanitized = self._dpage["sanitized"]
    parser_width = sanitized["dimension"]["width"]
    parser_height = sanitized["dimension"]["height"]

    cells_data = sanitized["cells"]["data"]
    cells_header = sanitized["cells"]["header"]

    # Nothing to convert; returning here also keeps the old behaviour of not
    # touching the header at all when there are no rows.
    if not cells_data:
        return cells

    # Column positions are the same for every row, so resolve them once
    # instead of running five linear header searches per cell.
    x0_col = cells_header.index("x0")
    y0_col = cells_header.index("y0")
    x1_col = cells_header.index("x1")
    y1_col = cells_header.index("y1")
    text_col = cells_header.index("text")

    for cell_index, cell_data in enumerate(cells_data):
        x0 = cell_data[x0_col]
        y0 = cell_data[y0_col]
        x1 = cell_data[x1_col]
        y1 = cell_data[y1_col]

        # The parser may emit flipped boxes; put the corners in order.
        if x1 < x0:
            x0, x1 = x1, x0
        if y1 < y0:
            y0, y1 = y1, y0

        text_piece = cell_data[text_col]
        cells.append(
            TextCell(
                index=cell_index,
                text=text_piece,
                orig=text_piece,
                from_ocr=False,
                rect=BoundingRectangle.from_bounding_box(
                    BoundingBox(
                        l=x0 * page_size.width / parser_width,
                        b=y0 * page_size.height / parser_height,
                        r=x1 * page_size.width / parser_width,
                        t=y1 * page_size.height / parser_height,
                        coord_origin=CoordOrigin.BOTTOMLEFT,
                    )
                ).to_top_left_origin(page_size.height),
            )
        )

    return cells
|
||||
|
||||
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
||||
if not self.valid:
|
||||
return ""
|
||||
@ -81,73 +137,27 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
||||
return text_piece
|
||||
|
||||
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
    """Return this page as a ``SegmentedPdfPage`` holding text-line cells only.

    The text lines come from ``self._compute_text_cells()`` and the page
    geometry from ``get_pdf_page_geometry(self._ppage)``. Character and word
    cells are left empty and flagged as not available.

    Returns:
        The segmented page, or ``None`` when the page is not valid.
    """
    # The leftover unconditional `return None` stub is gone: it made
    # everything below unreachable.
    if not self.valid:
        return None

    text_cells = self._compute_text_cells()

    # Page geometry is read from the pypdfium2 page object.
    dimension = get_pdf_page_geometry(self._ppage)

    return SegmentedPdfPage(
        dimension=dimension,
        textline_cells=text_cells,
        char_cells=[],
        word_cells=[],
        # `has_lines` is the flag the rest of the code base reads and writes
        # on the parsed page; this call previously passed `has_textlines`.
        # NOTE(review): the old keyword was presumably dropped silently by
        # the model - confirm against docling-core.
        has_lines=len(text_cells) > 0,
        has_words=False,
        has_chars=False,
    )
|
||||
|
||||
def get_text_cells(self) -> Iterable[TextCell]:
    """Return the text cells of this page.

    Delegates to ``self._compute_text_cells()``, which holds the single copy
    of the cell-building logic. The duplicated loop, the never-called
    ``draw_clusters_and_cells`` debug helper and the unreachable second
    ``return`` that used to live here are removed.

    Returns:
        Whatever ``_compute_text_cells`` returns (an empty list for an
        invalid page).
    """
    return self._compute_text_cells()
|
||||
|
||||
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
||||
AREA_THRESHOLD = 0 # 32 * 32
|
||||
|
@ -59,20 +59,6 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
||||
return self._dpage
|
||||
|
||||
def get_text_cells(self) -> Iterable[TextCell]:
    """Return the parsed page's text-line cells after a top-left conversion pass.

    ``to_top_left_origin`` is invoked on every cell of
    ``self._dpage.textline_cells`` with the page height; its return value is
    ignored, so the call presumably updates the cell in place (confirm against
    docling-core). The list handed back is ``self._dpage.textline_cells``
    itself.
    """
    page_size = self.get_size()

    # Plain loop: the calls are made for their effect, not to build a list.
    for text_cell in self._dpage.textline_cells:
        text_cell.to_top_left_origin(page_size.height)

    return self._dpage.textline_cells
|
||||
|
||||
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
||||
@ -171,12 +157,28 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
|
||||
self, page_no: int, create_words: bool = True, create_textlines: bool = True
|
||||
) -> DoclingParseV4PageBackend:
|
||||
with pypdfium2_lock:
|
||||
seg_page = self.dp_doc.get_page(
|
||||
page_no + 1,
|
||||
create_words=create_words,
|
||||
create_textlines=create_textlines,
|
||||
)
|
||||
|
||||
# In Docling, all TextCell instances are expected with top-left origin.
|
||||
[
|
||||
tc.to_top_left_origin(seg_page.dimension.height)
|
||||
for tc in seg_page.textline_cells
|
||||
]
|
||||
[
|
||||
tc.to_top_left_origin(seg_page.dimension.height)
|
||||
for tc in seg_page.char_cells
|
||||
]
|
||||
[
|
||||
tc.to_top_left_origin(seg_page.dimension.height)
|
||||
for tc in seg_page.word_cells
|
||||
]
|
||||
|
||||
return DoclingParseV4PageBackend(
|
||||
self.dp_doc.get_page(
|
||||
page_no + 1,
|
||||
create_words=create_words,
|
||||
create_textlines=create_textlines,
|
||||
),
|
||||
seg_page,
|
||||
self._pdoc[page_no],
|
||||
)
|
||||
|
||||
|
@ -8,7 +8,13 @@ from typing import TYPE_CHECKING, List, Optional, Union
|
||||
import pypdfium2 as pdfium
|
||||
import pypdfium2.raw as pdfium_c
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
||||
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
|
||||
from docling_core.types.doc.page import (
|
||||
BoundingRectangle,
|
||||
PdfPageBoundaryType,
|
||||
PdfPageGeometry,
|
||||
SegmentedPdfPage,
|
||||
TextCell,
|
||||
)
|
||||
from PIL import Image, ImageDraw
|
||||
from pypdfium2 import PdfTextPage
|
||||
from pypdfium2._helpers.misc import PdfiumError
|
||||
@ -16,6 +22,76 @@ from pypdfium2._helpers.misc import PdfiumError
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docling.utils.locks import pypdfium2_lock
|
||||
|
||||
|
||||
def get_pdf_page_geometry(
    ppage: pdfium.PdfPage,
    angle: float = 0.0,
    boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX,
) -> PdfPageGeometry:
    """Describe a pypdfium2 page as a ``PdfPageGeometry``.

    Args:
        ppage: The pypdfium2 page to read the boxes from.
        angle: Rotation angle stored on the geometry; passed through as-is.
        boundary_type: Boundary type stored on the geometry; passed through
            as-is.

    Returns:
        A geometry whose ``rect`` is built from ``ppage.get_bbox()`` and whose
        media/crop/art/bleed/trim boxes come from the matching ``get_*box()``
        call. Any box that pypdfium2 reports as falsy (not defined) falls back
        to the ``get_bbox()`` box. All boxes are bottom-left based.
    """

    def _box_or_default(raw_box, default: BoundingBox) -> BoundingBox:
        # pypdfium2 hands back (x0, y0, x1, y1) in PDF (bottom-left) space.
        if raw_box:
            return BoundingBox.from_tuple(raw_box, CoordOrigin.BOTTOMLEFT)
        return default

    with pypdfium2_lock:
        # Main page box (per pypdfium2: intersection of crop box and media box).
        page_bbox = BoundingBox.from_tuple(ppage.get_bbox(), CoordOrigin.BOTTOMLEFT)

        # Query every page box first, in the same order as before.
        raw_media = ppage.get_mediabox()
        raw_crop = ppage.get_cropbox()
        raw_art = ppage.get_artbox()
        raw_bleed = ppage.get_bleedbox()
        raw_trim = ppage.get_trimbox()

        media_bbox = _box_or_default(raw_media, page_bbox)
        crop_bbox = _box_or_default(raw_crop, page_bbox)
        art_bbox = _box_or_default(raw_art, page_bbox)
        bleed_bbox = _box_or_default(raw_bleed, page_bbox)
        trim_bbox = _box_or_default(raw_trim, page_bbox)

        return PdfPageGeometry(
            angle=angle,
            rect=BoundingRectangle.from_bounding_box(page_bbox),
            boundary_type=boundary_type,
            art_bbox=art_bbox,
            bleed_bbox=bleed_bbox,
            crop_bbox=crop_bbox,
            media_bbox=media_bbox,
            trim_bbox=trim_bbox,
        )
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
@ -41,38 +117,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
||||
def is_valid(self) -> bool:
|
||||
return self.valid
|
||||
|
||||
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
||||
AREA_THRESHOLD = 0 # 32 * 32
|
||||
page_size = self.get_size()
|
||||
with pypdfium2_lock:
|
||||
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
|
||||
pos = obj.get_pos()
|
||||
cropbox = BoundingBox.from_tuple(
|
||||
pos, origin=CoordOrigin.BOTTOMLEFT
|
||||
).to_top_left_origin(page_height=page_size.height)
|
||||
|
||||
if cropbox.area() > AREA_THRESHOLD:
|
||||
cropbox = cropbox.scaled(scale=scale)
|
||||
|
||||
yield cropbox
|
||||
|
||||
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
||||
with pypdfium2_lock:
|
||||
if not self.text_page:
|
||||
self.text_page = self._ppage.get_textpage()
|
||||
|
||||
if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
|
||||
bbox = bbox.to_bottom_left_origin(self.get_size().height)
|
||||
|
||||
with pypdfium2_lock:
|
||||
text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
|
||||
|
||||
return text_piece
|
||||
|
||||
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
|
||||
return None
|
||||
|
||||
def get_text_cells(self) -> Iterable[TextCell]:
|
||||
def _compute_text_cells(self) -> List[TextCell]:
|
||||
"""Compute text cells from pypdfium."""
|
||||
with pypdfium2_lock:
|
||||
if not self.text_page:
|
||||
self.text_page = self._ppage.get_textpage()
|
||||
@ -203,30 +249,58 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
||||
|
||||
return merged_cells
|
||||
|
||||
def draw_clusters_and_cells():
|
||||
image = (
|
||||
self.get_page_image()
|
||||
) # make new image to avoid drawing on the saved ones
|
||||
draw = ImageDraw.Draw(image)
|
||||
for c in cells:
|
||||
x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
|
||||
cell_color = (
|
||||
random.randint(30, 140),
|
||||
random.randint(30, 140),
|
||||
random.randint(30, 140),
|
||||
)
|
||||
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
|
||||
image.show()
|
||||
return merge_horizontal_cells(cells)
|
||||
|
||||
# before merge:
|
||||
# draw_clusters_and_cells()
|
||||
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
|
||||
AREA_THRESHOLD = 0 # 32 * 32
|
||||
page_size = self.get_size()
|
||||
with pypdfium2_lock:
|
||||
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
|
||||
pos = obj.get_pos()
|
||||
cropbox = BoundingBox.from_tuple(
|
||||
pos, origin=CoordOrigin.BOTTOMLEFT
|
||||
).to_top_left_origin(page_height=page_size.height)
|
||||
|
||||
cells = merge_horizontal_cells(cells)
|
||||
if cropbox.area() > AREA_THRESHOLD:
|
||||
cropbox = cropbox.scaled(scale=scale)
|
||||
|
||||
# after merge:
|
||||
# draw_clusters_and_cells()
|
||||
yield cropbox
|
||||
|
||||
return cells
|
||||
def get_text_in_rect(self, bbox: BoundingBox) -> str:
|
||||
with pypdfium2_lock:
|
||||
if not self.text_page:
|
||||
self.text_page = self._ppage.get_textpage()
|
||||
|
||||
if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
|
||||
bbox = bbox.to_bottom_left_origin(self.get_size().height)
|
||||
|
||||
with pypdfium2_lock:
|
||||
text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
|
||||
|
||||
return text_piece
|
||||
|
||||
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
    """Return this page as a ``SegmentedPdfPage`` holding text-line cells only.

    The text lines come from ``self._compute_text_cells()`` and the page
    geometry from ``get_pdf_page_geometry(self._ppage)``. Character and word
    cells are left empty and flagged as not available.

    Returns:
        The segmented page, or ``None`` when the page is not valid.
    """
    if not self.valid:
        return None

    text_cells = self._compute_text_cells()

    # Page geometry is read from the pypdfium2 page object.
    dimension = get_pdf_page_geometry(self._ppage)

    return SegmentedPdfPage(
        dimension=dimension,
        textline_cells=text_cells,
        char_cells=[],
        word_cells=[],
        # `has_lines` is the flag the rest of the code base reads and writes
        # on the parsed page; this call previously passed `has_textlines`.
        # NOTE(review): the old keyword was presumably dropped silently by
        # the model - confirm against docling-core.
        has_lines=len(text_cells) > 0,
        has_words=False,
        has_chars=False,
    )
|
||||
|
||||
def get_text_cells(self) -> Iterable[TextCell]:
    """Return the text cells produced by ``self._compute_text_cells()``."""
    text_cells = self._compute_text_cells()
    return text_cells
|
||||
|
||||
def get_page_image(
|
||||
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
||||
|
@ -232,7 +232,6 @@ class Page(BaseModel):
|
||||
page_no: int
|
||||
# page_hash: Optional[str] = None
|
||||
size: Optional[Size] = None
|
||||
cells: List[TextCell] = []
|
||||
parsed_page: Optional[SegmentedPdfPage] = None
|
||||
predictions: PagePredictions = PagePredictions()
|
||||
assembled: Optional[AssembledUnit] = None
|
||||
@ -245,6 +244,14 @@ class Page(BaseModel):
|
||||
float, Image
|
||||
] = {} # Cache of images in different scales. By default it is cleared during assembling.
|
||||
|
||||
@property
def cells(self) -> List[TextCell]:
    """Text cells of the page, sourced from ``parsed_page.textline_cells``.

    The property has no setter. When a parsed page is present, the list
    returned is the one stored on ``parsed_page`` (not a copy); otherwise a
    fresh empty list is returned.
    """
    parsed = self.parsed_page
    if parsed is None:
        return []
    return parsed.textline_cells
|
||||
|
||||
def get_image(
|
||||
self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
|
||||
) -> Optional[Image]:
|
||||
|
@ -292,7 +292,9 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
|
||||
),
|
||||
)
|
||||
|
||||
generate_parsed_pages: bool = False
|
||||
generate_parsed_pages: Literal[True] = (
|
||||
True # Always True since parsed_page is now mandatory
|
||||
)
|
||||
|
||||
|
||||
class PdfPipeline(str, Enum):
|
||||
|
@ -7,6 +7,7 @@ from typing import List, Optional, Type
|
||||
|
||||
import numpy as np
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import TextCell
|
||||
from PIL import Image, ImageDraw
|
||||
from rtree import index
|
||||
from scipy.ndimage import binary_dilation, find_objects, label
|
||||
@ -107,7 +108,9 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
|
||||
return []
|
||||
|
||||
# Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
|
||||
def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
|
||||
def _filter_ocr_cells(
|
||||
self, ocr_cells: List[TextCell], programmatic_cells: List[TextCell]
|
||||
) -> List[TextCell]:
|
||||
# Create R-tree index for programmatic cells
|
||||
p = index.Property()
|
||||
p.dimension = 2
|
||||
@ -130,19 +133,38 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
|
||||
]
|
||||
return filtered_ocr_cells
|
||||
|
||||
def post_process_cells(self, ocr_cells, programmatic_cells):
|
||||
def post_process_cells(self, ocr_cells: List[TextCell], page: Page) -> None:
|
||||
r"""
|
||||
Post-process the ocr and programmatic cells and return the final list of of cells
|
||||
Post-process the OCR cells and update the page object.
|
||||
Updates parsed_page.textline_cells directly since page.cells is now read-only.
|
||||
"""
|
||||
if self.options.force_full_page_ocr:
|
||||
# If a full page OCR is forced, use only the OCR cells
|
||||
cells = ocr_cells
|
||||
return cells
|
||||
# Get existing cells from the read-only property
|
||||
existing_cells = page.cells
|
||||
|
||||
## Remove OCR cells which overlap with programmatic cells.
|
||||
filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, programmatic_cells)
|
||||
programmatic_cells.extend(filtered_ocr_cells)
|
||||
return programmatic_cells
|
||||
# Combine existing and OCR cells with overlap filtering
|
||||
final_cells = self._combine_cells(existing_cells, ocr_cells)
|
||||
|
||||
assert page.parsed_page is not None
|
||||
|
||||
# Update parsed_page.textline_cells directly
|
||||
page.parsed_page.textline_cells = final_cells
|
||||
page.parsed_page.has_lines = len(final_cells) > 0
|
||||
|
||||
def _combine_cells(
    self, existing_cells: List[TextCell], ocr_cells: List[TextCell]
) -> List[TextCell]:
    """Merge existing cells with OCR cells and renumber the result.

    With ``self.options.force_full_page_ocr`` set, the result is the
    ``ocr_cells`` list itself. Otherwise the OCR cells are first passed
    through ``self._filter_ocr_cells`` against the existing cells, and the
    result is a new list of the existing cells followed by the surviving OCR
    cells.

    In both cases every cell's ``index`` attribute is overwritten in place
    with its position in the returned list.
    """
    if not self.options.force_full_page_ocr:
        surviving_ocr = self._filter_ocr_cells(ocr_cells, existing_cells)
        merged = [*existing_cells, *surviving_ocr]
    else:
        merged = ocr_cells

    # Renumber so that indices are contiguous across both sources.
    for new_index, text_cell in enumerate(merged):
        text_cell.index = new_index

    return merged
|
||||
|
||||
def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
|
||||
image = copy.deepcopy(page.image)
|
||||
|
@ -177,7 +177,7 @@ class EasyOcrModel(BaseOcrModel):
|
||||
all_ocr_cells.extend(cells)
|
||||
|
||||
# Post-process the cells
|
||||
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
|
||||
self.post_process_cells(all_ocr_cells, page)
|
||||
|
||||
# DEBUG code:
|
||||
if settings.debug.visualize_ocr:
|
||||
|
@ -176,9 +176,9 @@ class LayoutModel(BasePageModel):
|
||||
# Apply postprocessing
|
||||
|
||||
processed_clusters, processed_cells = LayoutPostprocessor(
|
||||
page.cells, clusters, page.size
|
||||
page, clusters
|
||||
).postprocess()
|
||||
# processed_clusters, processed_cells = clusters, page.cells
|
||||
# Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
|
||||
|
||||
with warnings.catch_warnings():
|
||||
warnings.filterwarnings(
|
||||
@ -198,7 +198,6 @@ class LayoutModel(BasePageModel):
|
||||
)
|
||||
)
|
||||
|
||||
page.cells = processed_cells
|
||||
page.predictions.layout = LayoutPrediction(
|
||||
clusters=processed_clusters
|
||||
)
|
||||
|
@ -132,7 +132,7 @@ class OcrMacModel(BaseOcrModel):
|
||||
all_ocr_cells.extend(cells)
|
||||
|
||||
# Post-process the cells
|
||||
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
|
||||
self.post_process_cells(all_ocr_cells, page)
|
||||
|
||||
# DEBUG code:
|
||||
if settings.debug.visualize_ocr:
|
||||
|
@ -2,7 +2,7 @@ import re
|
||||
import warnings
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from typing import Literal, Optional
|
||||
|
||||
import numpy as np
|
||||
from PIL import ImageDraw
|
||||
@ -17,7 +17,6 @@ from docling.utils.profiling import TimeRecorder
|
||||
|
||||
class PagePreprocessingOptions(BaseModel):
|
||||
images_scale: Optional[float]
|
||||
create_parsed_page: bool
|
||||
|
||||
|
||||
class PagePreprocessingModel(BasePageModel):
|
||||
@ -66,10 +65,8 @@ class PagePreprocessingModel(BasePageModel):
|
||||
def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
|
||||
assert page._backend is not None
|
||||
|
||||
page.cells = list(page._backend.get_text_cells())
|
||||
|
||||
if self.options.create_parsed_page:
|
||||
page.parsed_page = page._backend.get_segmented_page()
|
||||
page.parsed_page = page._backend.get_segmented_page()
|
||||
assert page.parsed_page is not None
|
||||
|
||||
# Rate the text quality from the PDF parser, and aggregate on page
|
||||
text_scores = []
|
||||
|
@ -134,7 +134,7 @@ class RapidOcrModel(BaseOcrModel):
|
||||
all_ocr_cells.extend(cells)
|
||||
|
||||
# Post-process the cells
|
||||
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
|
||||
self.post_process_cells(all_ocr_cells, page)
|
||||
|
||||
# DEBUG code:
|
||||
if settings.debug.visualize_ocr:
|
||||
|
@ -306,7 +306,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
all_ocr_cells.append(cell)
|
||||
|
||||
# Post-process the cells
|
||||
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
|
||||
self.post_process_cells(all_ocr_cells, page)
|
||||
|
||||
# DEBUG code:
|
||||
if settings.debug.visualize_ocr:
|
||||
|
@ -235,7 +235,7 @@ class TesseractOcrModel(BaseOcrModel):
|
||||
all_ocr_cells.extend(cells)
|
||||
|
||||
# Post-process the cells
|
||||
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
|
||||
self.post_process_cells(all_ocr_cells, page)
|
||||
|
||||
# DEBUG code:
|
||||
if settings.debug.visualize_ocr:
|
||||
|
@ -72,7 +72,6 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
PagePreprocessingModel(
|
||||
options=PagePreprocessingOptions(
|
||||
images_scale=pipeline_options.images_scale,
|
||||
create_parsed_page=pipeline_options.generate_parsed_pages,
|
||||
)
|
||||
),
|
||||
# OCR
|
||||
|
@ -8,7 +8,7 @@ from docling_core.types.doc import DocItemLabel, Size
|
||||
from docling_core.types.doc.page import TextCell
|
||||
from rtree import index
|
||||
|
||||
from docling.datamodel.base_models import BoundingBox, Cluster
|
||||
from docling.datamodel.base_models import BoundingBox, Cluster, Page
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@ -194,11 +194,11 @@ class LayoutPostprocessor:
|
||||
DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
|
||||
}
|
||||
|
||||
def __init__(self, cells: List[TextCell], clusters: List[Cluster], page_size: Size):
|
||||
"""Initialize processor with cells and clusters."""
|
||||
"""Initialize processor with cells and spatial indices."""
|
||||
self.cells = cells
|
||||
self.page_size = page_size
|
||||
def __init__(self, page: Page, clusters: List[Cluster]) -> None:
|
||||
"""Initialize processor with page and clusters."""
|
||||
self.cells = page.cells
|
||||
self.page = page
|
||||
self.page_size = page.size
|
||||
self.all_clusters = clusters
|
||||
self.regular_clusters = [
|
||||
c for c in clusters if c.label not in self.SPECIAL_TYPES
|
||||
@ -240,6 +240,10 @@ class LayoutPostprocessor:
|
||||
for child in cluster.children:
|
||||
child.cells = self._sort_cells(child.cells)
|
||||
|
||||
assert self.page.parsed_page is not None
|
||||
self.page.parsed_page.textline_cells = self.cells
|
||||
self.page.parsed_page.has_lines = len(self.cells) > 0
|
||||
|
||||
return final_clusters, self.cells
|
||||
|
||||
def _process_regular_clusters(self) -> List[Cluster]:
|
||||
@ -301,6 +305,7 @@ class LayoutPostprocessor:
|
||||
special_clusters = self._handle_cross_type_overlaps(special_clusters)
|
||||
|
||||
# Calculate page area from known page size
|
||||
assert self.page_size is not None
|
||||
page_area = self.page_size.width * self.page_size.height
|
||||
if page_area > 0:
|
||||
# Filter out full-page pictures
|
||||
|
101474
tests/data/groundtruth/docling_v1/2203.01017v2.pages.json
vendored
101474
tests/data/groundtruth/docling_v1/2203.01017v2.pages.json
vendored
File diff suppressed because it is too large
Load Diff
89985
tests/data/groundtruth/docling_v1/2206.01062.pages.json
vendored
89985
tests/data/groundtruth/docling_v1/2206.01062.pages.json
vendored
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
56232
tests/data/groundtruth/docling_v1/2305.03393v1.pages.json
vendored
56232
tests/data/groundtruth/docling_v1/2305.03393v1.pages.json
vendored
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
9633
tests/data/groundtruth/docling_v1/multi_page.pages.json
vendored
9633
tests/data/groundtruth/docling_v1/multi_page.pages.json
vendored
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
101474
tests/data/groundtruth/docling_v2/2203.01017v2.pages.json
vendored
101474
tests/data/groundtruth/docling_v2/2203.01017v2.pages.json
vendored
File diff suppressed because it is too large
Load Diff
89985
tests/data/groundtruth/docling_v2/2206.01062.pages.json
vendored
89985
tests/data/groundtruth/docling_v2/2206.01062.pages.json
vendored
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
56232
tests/data/groundtruth/docling_v2/2305.03393v1.pages.json
vendored
56232
tests/data/groundtruth/docling_v2/2305.03393v1.pages.json
vendored
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
9633
tests/data/groundtruth/docling_v2/multi_page.pages.json
vendored
9633
tests/data/groundtruth/docling_v2/multi_page.pages.json
vendored
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -5,84 +5,159 @@
|
||||
"width": 2000.0,
|
||||
"height": 2829.0
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 246.4065456254215,
|
||||
"r_y0": 329.06770715202435,
|
||||
"r_x1": 1691.991797818404,
|
||||
"r_y1": 329.06770715202435,
|
||||
"r_x2": 1691.991797818404,
|
||||
"r_y2": 258.9040166758338,
|
||||
"r_x3": 246.4065456254215,
|
||||
"r_y3": 258.9040166758338,
|
||||
"coord_origin": "TOPLEFT"
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 2000.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 2000.0,
|
||||
"r_y2": 2829.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 2829.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to",
|
||||
"orig": "Docling bundles PDF document conversion to",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 2829.0,
|
||||
"r": 2000.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 2829.0,
|
||||
"r": 2000.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 2829.0,
|
||||
"r": 2000.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 2829.0,
|
||||
"r": 2000.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 2829.0,
|
||||
"r": 2000.0,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
"bitmap_resources": [
|
||||
{
|
||||
"index": 0,
|
||||
"rect": {
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 2000.0,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 2000.0,
|
||||
"r_y2": 2829.0,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 2829.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"uri": null
|
||||
}
|
||||
],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 246.4065456254215,
|
||||
"r_y0": 329.06770715202435,
|
||||
"r_x1": 1691.991797818404,
|
||||
"r_y1": 329.06770715202435,
|
||||
"r_x2": 1691.991797818404,
|
||||
"r_y2": 258.9040166758338,
|
||||
"r_x3": 246.4065456254215,
|
||||
"r_y3": 258.9040166758338,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to",
|
||||
"orig": "Docling bundles PDF document conversion to",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 234.08627147881114,
|
||||
"r_y0": 419.5788697734327,
|
||||
"r_x1": 1696.0985042090742,
|
||||
"r_y1": 419.5788697734327,
|
||||
"r_x2": 1696.0985042090742,
|
||||
"r_y2": 349.4151792972422,
|
||||
"r_x3": 234.08627147881114,
|
||||
"r_y3": 349.4151792972422,
|
||||
"coord_origin": "TOPLEFT"
|
||||
{
|
||||
"index": 1,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 234.08627147881114,
|
||||
"r_y0": 419.5788697734327,
|
||||
"r_x1": 1696.0985042090742,
|
||||
"r_y1": 419.5788697734327,
|
||||
"r_x2": 1696.0985042090742,
|
||||
"r_y2": 349.4151792972422,
|
||||
"r_x3": 234.08627147881114,
|
||||
"r_y3": 349.4151792972422,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
"orig": "JSON and Markdown in an easy self contained",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
"orig": "JSON and Markdown in an easy self contained",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
{
|
||||
"index": 2,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 242.29979922858777,
|
||||
"r_y0": 509.8779072023336,
|
||||
"r_x1": 513.3470125989277,
|
||||
"r_y1": 509.8779072023336,
|
||||
"r_x2": 513.3470125989277,
|
||||
"r_y2": 439.9752910477536,
|
||||
"r_x3": 242.29979922858777,
|
||||
"r_y3": 439.9752910477536,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
"orig": "package",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
{
|
||||
"index": 2,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 242.29979922858777,
|
||||
"r_y0": 509.8779072023336,
|
||||
"r_x1": 513.3470125989277,
|
||||
"r_y1": 509.8779072023336,
|
||||
"r_x2": 513.3470125989277,
|
||||
"r_y2": 439.9752910477536,
|
||||
"r_x3": 242.29979922858777,
|
||||
"r_y3": 439.9752910477536,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
"orig": "package",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
@ -5,84 +5,143 @@
|
||||
"width": 595.201171875,
|
||||
"height": 841.9216918945312
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 73.34702132031646,
|
||||
"r_y0": 97.99999977896755,
|
||||
"r_x1": 503.64955224479564,
|
||||
"r_y1": 97.99999977896755,
|
||||
"r_x2": 503.64955224479564,
|
||||
"r_y2": 76.99999977896756,
|
||||
"r_x3": 73.34702132031646,
|
||||
"r_y3": 76.99999977896756,
|
||||
"coord_origin": "TOPLEFT"
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.201171875,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.201171875,
|
||||
"r_y2": 841.9216918945312,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 841.9216918945312,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to",
|
||||
"orig": "Docling bundles PDF document conversion to",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 73.34702132031646,
|
||||
"r_y0": 97.99999977896755,
|
||||
"r_x1": 503.64955224479564,
|
||||
"r_y1": 97.99999977896755,
|
||||
"r_x2": 503.64955224479564,
|
||||
"r_y2": 76.99999977896756,
|
||||
"r_x3": 73.34702132031646,
|
||||
"r_y3": 76.99999977896756,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to",
|
||||
"orig": "Docling bundles PDF document conversion to",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 69.6796630536824,
|
||||
"r_y0": 124.83139494707741,
|
||||
"r_x1": 504.8720051760782,
|
||||
"r_y1": 124.83139494707741,
|
||||
"r_x2": 504.8720051760782,
|
||||
"r_y2": 104.00000011573796,
|
||||
"r_x3": 69.6796630536824,
|
||||
"r_y3": 104.00000011573796,
|
||||
"coord_origin": "TOPLEFT"
|
||||
{
|
||||
"index": 1,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 69.6796630536824,
|
||||
"r_y0": 124.83139494707741,
|
||||
"r_x1": 504.8720051760782,
|
||||
"r_y1": 124.83139494707741,
|
||||
"r_x2": 504.8720051760782,
|
||||
"r_y2": 104.00000011573796,
|
||||
"r_x3": 69.6796630536824,
|
||||
"r_y3": 104.00000011573796,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
"orig": "JSON and Markdown in an easy self contained",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
"orig": "JSON and Markdown in an easy self contained",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
{
|
||||
"index": 2,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 71.84193505100733,
|
||||
"r_y0": 152.90926970226084,
|
||||
"r_x1": 153.088934155825,
|
||||
"r_y1": 152.90926970226084,
|
||||
"r_x2": 153.088934155825,
|
||||
"r_y2": 129.797125232046,
|
||||
"r_x3": 71.84193505100733,
|
||||
"r_y3": 129.797125232046,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
"orig": "package",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
{
|
||||
"index": 2,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 71.84193505100733,
|
||||
"r_y0": 152.90926970226084,
|
||||
"r_x1": 153.088934155825,
|
||||
"r_y1": 152.90926970226084,
|
||||
"r_x2": 153.088934155825,
|
||||
"r_y2": 129.797125232046,
|
||||
"r_x3": 71.84193505100733,
|
||||
"r_y3": 129.797125232046,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
"orig": "package",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
@ -5,84 +5,143 @@
|
||||
"width": 595.201171875,
|
||||
"height": 841.9216918945312
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 89.2388782764286,
|
||||
"r_y0": 764.898293373551,
|
||||
"r_x1": 521.9863147998661,
|
||||
"r_y1": 764.898293373551,
|
||||
"r_x2": 521.9863147998661,
|
||||
"r_y2": 744.0929853494625,
|
||||
"r_x3": 89.2388782764286,
|
||||
"r_y3": 744.0929853494625,
|
||||
"coord_origin": "TOPLEFT"
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.201171875,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.201171875,
|
||||
"r_y2": 841.9216918945312,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 841.9216918945312,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to",
|
||||
"orig": "Docling bundles PDF document conversion to",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 89.2388782764286,
|
||||
"r_y0": 764.898293373551,
|
||||
"r_x1": 521.9863147998661,
|
||||
"r_y1": 764.898293373551,
|
||||
"r_x2": 521.9863147998661,
|
||||
"r_y2": 744.0929853494625,
|
||||
"r_x3": 89.2388782764286,
|
||||
"r_y3": 744.0929853494625,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to",
|
||||
"orig": "Docling bundles PDF document conversion to",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 89.23887497045128,
|
||||
"r_y0": 739.1977118987292,
|
||||
"r_x1": 523.208764293368,
|
||||
"r_y1": 739.1977118987292,
|
||||
"r_x2": 523.208764293368,
|
||||
"r_y2": 717.1685676116198,
|
||||
"r_x3": 89.23887497045128,
|
||||
"r_y3": 717.1685676116198,
|
||||
"coord_origin": "TOPLEFT"
|
||||
{
|
||||
"index": 1,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 89.23887497045128,
|
||||
"r_y0": 739.1977118987292,
|
||||
"r_x1": 523.208764293368,
|
||||
"r_y1": 739.1977118987292,
|
||||
"r_x2": 523.208764293368,
|
||||
"r_y2": 717.1685676116198,
|
||||
"r_x3": 89.23887497045128,
|
||||
"r_y3": 717.1685676116198,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
"orig": "JSON and Markdown in an easy self contained",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
"orig": "JSON and Markdown in an easy self contained",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
{
|
||||
"index": 2,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 441.2561096985719,
|
||||
"r_y0": 710.0268078458798,
|
||||
"r_x1": 522.0347860494834,
|
||||
"r_y1": 710.0268078458798,
|
||||
"r_x2": 522.0347860494834,
|
||||
"r_y2": 690.0429592741025,
|
||||
"r_x3": 441.2561096985719,
|
||||
"r_y3": 690.0429592741025,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
"orig": "package",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
{
|
||||
"index": 2,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 441.2561096985719,
|
||||
"r_y0": 710.0268078458798,
|
||||
"r_x1": 522.0347860494834,
|
||||
"r_y1": 710.0268078458798,
|
||||
"r_x2": 522.0347860494834,
|
||||
"r_y2": 690.0429592741025,
|
||||
"r_x3": 441.2561096985719,
|
||||
"r_y3": 690.0429592741025,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
"orig": "package",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
@ -5,84 +5,143 @@
|
||||
"width": 841.9216918945312,
|
||||
"height": 595.201171875
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 744.0930045534915,
|
||||
"r_y0": 504.87200373583954,
|
||||
"r_x1": 764.8982839673505,
|
||||
"r_y1": 504.87200373583954,
|
||||
"r_x2": 764.8982839673505,
|
||||
"r_y2": 73.34702001188118,
|
||||
"r_x3": 744.0930045534915,
|
||||
"r_y3": 73.34702001188118,
|
||||
"coord_origin": "TOPLEFT"
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.201171875,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.201171875,
|
||||
"r_y2": 841.9216918945312,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 841.9216918945312,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to",
|
||||
"orig": "Docling bundles PDF document conversion to",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 744.0930045534915,
|
||||
"r_y0": 504.87200373583954,
|
||||
"r_x1": 764.8982839673505,
|
||||
"r_y1": 504.87200373583954,
|
||||
"r_x2": 764.8982839673505,
|
||||
"r_y2": 73.34702001188118,
|
||||
"r_x3": 744.0930045534915,
|
||||
"r_y3": 73.34702001188118,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to",
|
||||
"orig": "Docling bundles PDF document conversion to",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 717.168585936602,
|
||||
"r_y0": 504.8720061466397,
|
||||
"r_x1": 737.9738558137178,
|
||||
"r_y1": 504.8720061466397,
|
||||
"r_x2": 737.9738558137178,
|
||||
"r_y2": 70.90211682372312,
|
||||
"r_x3": 717.168585936602,
|
||||
"r_y3": 70.90211682372312,
|
||||
"coord_origin": "TOPLEFT"
|
||||
{
|
||||
"index": 1,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 717.168585936602,
|
||||
"r_y0": 504.8720061466397,
|
||||
"r_x1": 737.9738558137178,
|
||||
"r_y1": 504.8720061466397,
|
||||
"r_x2": 737.9738558137178,
|
||||
"r_y2": 70.90211682372312,
|
||||
"r_x3": 717.168585936602,
|
||||
"r_y3": 70.90211682372312,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
"orig": "JSON and Markdown in an easy self contained",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
"orig": "JSON and Markdown in an easy self contained",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
{
|
||||
"index": 2,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 690.2441821046808,
|
||||
"r_y0": 152.80629773131633,
|
||||
"r_x1": 709.8255852011977,
|
||||
"r_y1": 152.80629773131633,
|
||||
"r_x2": 709.8255852011977,
|
||||
"r_y2": 72.124570639845,
|
||||
"r_x3": 690.2441821046808,
|
||||
"r_y3": 72.124570639845,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
"orig": "package",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
{
|
||||
"index": 2,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 690.2441821046808,
|
||||
"r_y0": 152.80629773131633,
|
||||
"r_x1": 709.8255852011977,
|
||||
"r_y1": 152.80629773131633,
|
||||
"r_x2": 709.8255852011977,
|
||||
"r_y2": 72.124570639845,
|
||||
"r_x3": 690.2441821046808,
|
||||
"r_y3": 72.124570639845,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
"orig": "package",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
@ -5,84 +5,143 @@
|
||||
"width": 841.9216918945312,
|
||||
"height": 595.201171875
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 77.10171545548258,
|
||||
"r_y0": 520.7638571913312,
|
||||
"r_x1": 96.68315797053792,
|
||||
"r_y1": 520.7638571913312,
|
||||
"r_x2": 96.68315797053792,
|
||||
"r_y2": 89.2388734673729,
|
||||
"r_x3": 77.10171545548258,
|
||||
"r_y3": 89.2388734673729,
|
||||
"coord_origin": "TOPLEFT"
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.201171875,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.201171875,
|
||||
"r_y2": 841.9216918945312,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 841.9216918945312,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to",
|
||||
"orig": "Docling bundles PDF document conversion to",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 77.10171545548258,
|
||||
"r_y0": 520.7638571913312,
|
||||
"r_x1": 96.68315797053792,
|
||||
"r_y1": 520.7638571913312,
|
||||
"r_x2": 96.68315797053792,
|
||||
"r_y2": 89.2388734673729,
|
||||
"r_x3": 77.10171545548258,
|
||||
"r_y3": 89.2388734673729,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to",
|
||||
"orig": "Docling bundles PDF document conversion to",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 100.64168123325977,
|
||||
"r_y0": 523.3236155182395,
|
||||
"r_x1": 126.08064862014129,
|
||||
"r_y1": 523.3236155182395,
|
||||
"r_x2": 126.08064862014129,
|
||||
"r_y2": 89.1266754140729,
|
||||
"r_x3": 100.64168123325977,
|
||||
"r_y3": 89.1266754140729,
|
||||
"coord_origin": "TOPLEFT"
|
||||
{
|
||||
"index": 1,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 100.64168123325977,
|
||||
"r_y0": 523.3236155182395,
|
||||
"r_x1": 126.08064862014129,
|
||||
"r_y1": 523.3236155182395,
|
||||
"r_x2": 126.08064862014129,
|
||||
"r_y2": 89.1266754140729,
|
||||
"r_x3": 100.64168123325977,
|
||||
"r_y3": 89.1266754140729,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
"orig": "JSON and Markdown in an easy self contained",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
"orig": "JSON and Markdown in an easy self contained",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
{
|
||||
"index": 2,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 131.21306574279092,
|
||||
"r_y0": 521.0762158417759,
|
||||
"r_x1": 152.19606490864376,
|
||||
"r_y1": 521.0762158417759,
|
||||
"r_x2": 152.19606490864376,
|
||||
"r_y2": 441.0071698212682,
|
||||
"r_x3": 131.21306574279092,
|
||||
"r_y3": 441.0071698212682,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
"orig": "package",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
{
|
||||
"index": 2,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 131.21306574279092,
|
||||
"r_y0": 521.0762158417759,
|
||||
"r_x1": 152.19606490864376,
|
||||
"r_y1": 521.0762158417759,
|
||||
"r_x2": 152.19606490864376,
|
||||
"r_y2": 441.0071698212682,
|
||||
"r_x3": 131.21306574279092,
|
||||
"r_y3": 441.0071698212682,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
"orig": "package",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
@ -5,84 +5,143 @@
|
||||
"width": 595.201171875,
|
||||
"height": 841.9216918945312
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 73.34702132031646,
|
||||
"r_y0": 97.99999977896755,
|
||||
"r_x1": 503.64955224479564,
|
||||
"r_y1": 97.99999977896755,
|
||||
"r_x2": 503.64955224479564,
|
||||
"r_y2": 76.99999977896756,
|
||||
"r_x3": 73.34702132031646,
|
||||
"r_y3": 76.99999977896756,
|
||||
"coord_origin": "TOPLEFT"
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.201171875,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.201171875,
|
||||
"r_y2": 841.9216918945312,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 841.9216918945312,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to",
|
||||
"orig": "Docling bundles PDF document conversion to",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 73.34702132031646,
|
||||
"r_y0": 97.99999977896755,
|
||||
"r_x1": 503.64955224479564,
|
||||
"r_y1": 97.99999977896755,
|
||||
"r_x2": 503.64955224479564,
|
||||
"r_y2": 76.99999977896756,
|
||||
"r_x3": 73.34702132031646,
|
||||
"r_y3": 76.99999977896756,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to",
|
||||
"orig": "Docling bundles PDF document conversion to",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 69.6796630536824,
|
||||
"r_y0": 124.83139494707741,
|
||||
"r_x1": 504.8720051760782,
|
||||
"r_y1": 124.83139494707741,
|
||||
"r_x2": 504.8720051760782,
|
||||
"r_y2": 104.00000011573796,
|
||||
"r_x3": 69.6796630536824,
|
||||
"r_y3": 104.00000011573796,
|
||||
"coord_origin": "TOPLEFT"
|
||||
{
|
||||
"index": 1,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 69.6796630536824,
|
||||
"r_y0": 124.83139494707741,
|
||||
"r_x1": 504.8720051760782,
|
||||
"r_y1": 124.83139494707741,
|
||||
"r_x2": 504.8720051760782,
|
||||
"r_y2": 104.00000011573796,
|
||||
"r_x3": 69.6796630536824,
|
||||
"r_y3": 104.00000011573796,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
"orig": "JSON and Markdown in an easy self contained",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
"orig": "JSON and Markdown in an easy self contained",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
{
|
||||
"index": 2,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 71.84193505100733,
|
||||
"r_y0": 152.90926970226084,
|
||||
"r_x1": 153.088934155825,
|
||||
"r_y1": 152.90926970226084,
|
||||
"r_x2": 153.088934155825,
|
||||
"r_y2": 129.797125232046,
|
||||
"r_x3": 71.84193505100733,
|
||||
"r_y3": 129.797125232046,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
"orig": "package",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
{
|
||||
"index": 2,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 71.84193505100733,
|
||||
"r_y0": 152.90926970226084,
|
||||
"r_x1": 153.088934155825,
|
||||
"r_y1": 152.90926970226084,
|
||||
"r_x2": 153.088934155825,
|
||||
"r_y2": 129.797125232046,
|
||||
"r_x3": 71.84193505100733,
|
||||
"r_y3": 129.797125232046,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
"orig": "package",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
@ -5,84 +5,143 @@
|
||||
"width": 595.201171875,
|
||||
"height": 841.9216918945312
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 89.2388782764286,
|
||||
"r_y0": 764.898293373551,
|
||||
"r_x1": 521.9863147998661,
|
||||
"r_y1": 764.898293373551,
|
||||
"r_x2": 521.9863147998661,
|
||||
"r_y2": 744.0929853494625,
|
||||
"r_x3": 89.2388782764286,
|
||||
"r_y3": 744.0929853494625,
|
||||
"coord_origin": "TOPLEFT"
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.201171875,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.201171875,
|
||||
"r_y2": 841.9216918945312,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 841.9216918945312,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to",
|
||||
"orig": "Docling bundles PDF document conversion to",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 89.2388782764286,
|
||||
"r_y0": 764.898293373551,
|
||||
"r_x1": 521.9863147998661,
|
||||
"r_y1": 764.898293373551,
|
||||
"r_x2": 521.9863147998661,
|
||||
"r_y2": 744.0929853494625,
|
||||
"r_x3": 89.2388782764286,
|
||||
"r_y3": 744.0929853494625,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to",
|
||||
"orig": "Docling bundles PDF document conversion to",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 89.23887497045128,
|
||||
"r_y0": 739.1977118987292,
|
||||
"r_x1": 523.208764293368,
|
||||
"r_y1": 739.1977118987292,
|
||||
"r_x2": 523.208764293368,
|
||||
"r_y2": 717.1685676116198,
|
||||
"r_x3": 89.23887497045128,
|
||||
"r_y3": 717.1685676116198,
|
||||
"coord_origin": "TOPLEFT"
|
||||
{
|
||||
"index": 1,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 89.23887497045128,
|
||||
"r_y0": 739.1977118987292,
|
||||
"r_x1": 523.208764293368,
|
||||
"r_y1": 739.1977118987292,
|
||||
"r_x2": 523.208764293368,
|
||||
"r_y2": 717.1685676116198,
|
||||
"r_x3": 89.23887497045128,
|
||||
"r_y3": 717.1685676116198,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
"orig": "JSON and Markdown in an easy self contained",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
"orig": "JSON and Markdown in an easy self contained",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
{
|
||||
"index": 2,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 441.2561096985719,
|
||||
"r_y0": 710.0268078458798,
|
||||
"r_x1": 522.0347860494834,
|
||||
"r_y1": 710.0268078458798,
|
||||
"r_x2": 522.0347860494834,
|
||||
"r_y2": 690.0429592741025,
|
||||
"r_x3": 441.2561096985719,
|
||||
"r_y3": 690.0429592741025,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
"orig": "package",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
{
|
||||
"index": 2,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 441.2561096985719,
|
||||
"r_y0": 710.0268078458798,
|
||||
"r_x1": 522.0347860494834,
|
||||
"r_y1": 710.0268078458798,
|
||||
"r_x2": 522.0347860494834,
|
||||
"r_y2": 690.0429592741025,
|
||||
"r_x3": 441.2561096985719,
|
||||
"r_y3": 690.0429592741025,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
"orig": "package",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
@ -5,84 +5,143 @@
|
||||
"width": 841.9216918945312,
|
||||
"height": 595.201171875
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 744.0930045534915,
|
||||
"r_y0": 504.87200373583954,
|
||||
"r_x1": 764.8982839673505,
|
||||
"r_y1": 504.87200373583954,
|
||||
"r_x2": 764.8982839673505,
|
||||
"r_y2": 73.34702001188118,
|
||||
"r_x3": 744.0930045534915,
|
||||
"r_y3": 73.34702001188118,
|
||||
"coord_origin": "TOPLEFT"
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.201171875,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.201171875,
|
||||
"r_y2": 841.9216918945312,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 841.9216918945312,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to",
|
||||
"orig": "Docling bundles PDF document conversion to",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 744.0930045534915,
|
||||
"r_y0": 504.87200373583954,
|
||||
"r_x1": 764.8982839673505,
|
||||
"r_y1": 504.87200373583954,
|
||||
"r_x2": 764.8982839673505,
|
||||
"r_y2": 73.34702001188118,
|
||||
"r_x3": 744.0930045534915,
|
||||
"r_y3": 73.34702001188118,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to",
|
||||
"orig": "Docling bundles PDF document conversion to",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 717.168585936602,
|
||||
"r_y0": 504.8720061466397,
|
||||
"r_x1": 737.9738558137178,
|
||||
"r_y1": 504.8720061466397,
|
||||
"r_x2": 737.9738558137178,
|
||||
"r_y2": 70.90211682372312,
|
||||
"r_x3": 717.168585936602,
|
||||
"r_y3": 70.90211682372312,
|
||||
"coord_origin": "TOPLEFT"
|
||||
{
|
||||
"index": 1,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 717.168585936602,
|
||||
"r_y0": 504.8720061466397,
|
||||
"r_x1": 737.9738558137178,
|
||||
"r_y1": 504.8720061466397,
|
||||
"r_x2": 737.9738558137178,
|
||||
"r_y2": 70.90211682372312,
|
||||
"r_x3": 717.168585936602,
|
||||
"r_y3": 70.90211682372312,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
"orig": "JSON and Markdown in an easy self contained",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
"orig": "JSON and Markdown in an easy self contained",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
{
|
||||
"index": 2,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 690.2441821046808,
|
||||
"r_y0": 152.80629773131633,
|
||||
"r_x1": 709.8255852011977,
|
||||
"r_y1": 152.80629773131633,
|
||||
"r_x2": 709.8255852011977,
|
||||
"r_y2": 72.124570639845,
|
||||
"r_x3": 690.2441821046808,
|
||||
"r_y3": 72.124570639845,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
"orig": "package",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
{
|
||||
"index": 2,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 690.2441821046808,
|
||||
"r_y0": 152.80629773131633,
|
||||
"r_x1": 709.8255852011977,
|
||||
"r_y1": 152.80629773131633,
|
||||
"r_x2": 709.8255852011977,
|
||||
"r_y2": 72.124570639845,
|
||||
"r_x3": 690.2441821046808,
|
||||
"r_y3": 72.124570639845,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
"orig": "package",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
@ -5,84 +5,143 @@
|
||||
"width": 841.9216918945312,
|
||||
"height": 595.201171875
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"parsed_page": {
|
||||
"dimension": {
|
||||
"angle": 0.0,
|
||||
"rect": {
|
||||
"r_x0": 77.10171545548258,
|
||||
"r_y0": 520.7638571913312,
|
||||
"r_x1": 96.68315797053792,
|
||||
"r_y1": 520.7638571913312,
|
||||
"r_x2": 96.68315797053792,
|
||||
"r_y2": 89.2388734673729,
|
||||
"r_x3": 77.10171545548258,
|
||||
"r_y3": 89.2388734673729,
|
||||
"coord_origin": "TOPLEFT"
|
||||
"r_x0": 0.0,
|
||||
"r_y0": 0.0,
|
||||
"r_x1": 595.201171875,
|
||||
"r_y1": 0.0,
|
||||
"r_x2": 595.201171875,
|
||||
"r_y2": 841.9216918945312,
|
||||
"r_x3": 0.0,
|
||||
"r_y3": 841.9216918945312,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to",
|
||||
"orig": "Docling bundles PDF document conversion to",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
"boundary_type": "crop_box",
|
||||
"art_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"bleed_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"crop_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"media_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
},
|
||||
"trim_bbox": {
|
||||
"l": 0.0,
|
||||
"t": 841.9216918945312,
|
||||
"r": 595.201171875,
|
||||
"b": 0.0,
|
||||
"coord_origin": "BOTTOMLEFT"
|
||||
}
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
"bitmap_resources": [],
|
||||
"char_cells": [],
|
||||
"word_cells": [],
|
||||
"textline_cells": [
|
||||
{
|
||||
"index": 0,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 77.10171545548258,
|
||||
"r_y0": 520.7638571913312,
|
||||
"r_x1": 96.68315797053792,
|
||||
"r_y1": 520.7638571913312,
|
||||
"r_x2": 96.68315797053792,
|
||||
"r_y2": 89.2388734673729,
|
||||
"r_x3": 77.10171545548258,
|
||||
"r_y3": 89.2388734673729,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "Docling bundles PDF document conversion to",
|
||||
"orig": "Docling bundles PDF document conversion to",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 100.64168123325977,
|
||||
"r_y0": 523.3236155182395,
|
||||
"r_x1": 126.08064862014129,
|
||||
"r_y1": 523.3236155182395,
|
||||
"r_x2": 126.08064862014129,
|
||||
"r_y2": 89.1266754140729,
|
||||
"r_x3": 100.64168123325977,
|
||||
"r_y3": 89.1266754140729,
|
||||
"coord_origin": "TOPLEFT"
|
||||
{
|
||||
"index": 1,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 100.64168123325977,
|
||||
"r_y0": 523.3236155182395,
|
||||
"r_x1": 126.08064862014129,
|
||||
"r_y1": 523.3236155182395,
|
||||
"r_x2": 126.08064862014129,
|
||||
"r_y2": 89.1266754140729,
|
||||
"r_x3": 100.64168123325977,
|
||||
"r_y3": 89.1266754140729,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
"orig": "JSON and Markdown in an easy self contained",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
"text": "JSON and Markdown in an easy self contained",
|
||||
"orig": "JSON and Markdown in an easy self contained",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
},
|
||||
{
|
||||
"index": 2,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 131.21306574279092,
|
||||
"r_y0": 521.0762158417759,
|
||||
"r_x1": 152.19606490864376,
|
||||
"r_y1": 521.0762158417759,
|
||||
"r_x2": 152.19606490864376,
|
||||
"r_y2": 441.0071698212682,
|
||||
"r_x3": 131.21306574279092,
|
||||
"r_y3": 441.0071698212682,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
"orig": "package",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"parsed_page": null,
|
||||
{
|
||||
"index": 2,
|
||||
"rgba": {
|
||||
"r": 0,
|
||||
"g": 0,
|
||||
"b": 0,
|
||||
"a": 255
|
||||
},
|
||||
"rect": {
|
||||
"r_x0": 131.21306574279092,
|
||||
"r_y0": 521.0762158417759,
|
||||
"r_x1": 152.19606490864376,
|
||||
"r_y1": 521.0762158417759,
|
||||
"r_x2": 152.19606490864376,
|
||||
"r_y2": 441.0071698212682,
|
||||
"r_x3": 131.21306574279092,
|
||||
"r_y3": 441.0071698212682,
|
||||
"coord_origin": "TOPLEFT"
|
||||
},
|
||||
"text": "package",
|
||||
"orig": "package",
|
||||
"text_direction": "left_to_right",
|
||||
"confidence": 1.0,
|
||||
"from_ocr": true
|
||||
}
|
||||
],
|
||||
"has_chars": false,
|
||||
"has_words": false,
|
||||
"has_lines": true,
|
||||
"image": null,
|
||||
"lines": []
|
||||
},
|
||||
"predictions": {
|
||||
"layout": {
|
||||
"clusters": [
|
||||
|
@ -57,14 +57,14 @@ def test_e2e_conversions():
|
||||
pdf_paths = get_pdf_paths()
|
||||
|
||||
engines: List[Tuple[OcrOptions, bool]] = [
|
||||
(EasyOcrOptions(), False),
|
||||
(TesseractOcrOptions(), True),
|
||||
(TesseractCliOcrOptions(), True),
|
||||
(EasyOcrOptions(force_full_page_ocr=True), False),
|
||||
(EasyOcrOptions(), False),
|
||||
(TesseractOcrOptions(force_full_page_ocr=True), True),
|
||||
(TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
|
||||
(TesseractCliOcrOptions(force_full_page_ocr=True), True),
|
||||
(TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
|
||||
(EasyOcrOptions(force_full_page_ocr=True), False),
|
||||
]
|
||||
|
||||
# rapidocr is only available for Python >=3.6,<3.13
|
||||
|
Loading…
Reference in New Issue
Block a user