Ensure all models work only on valid pages (#158)

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-10-18 08:54:06 +02:00 committed by GitHub
parent 034a411057
commit a00c937e19
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 413 additions and 376 deletions

View File

@ -202,6 +202,7 @@ class GlmModel:
page_dimensions = [ page_dimensions = [
PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width) PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
for p in conv_res.pages for p in conv_res.pages
if p.size is not None
] ]
ds_doc: DsDocument = DsDocument( ds_doc: DsDocument = DsDocument(

View File

@ -41,48 +41,50 @@ class EasyOcrModel(BaseOcrModel):
for page in page_batch: for page in page_batch:
assert page._backend is not None assert page._backend is not None
if not page._backend.is_valid():
yield page
else:
ocr_rects = self.get_ocr_rects(page)
ocr_rects = self.get_ocr_rects(page) all_ocr_cells = []
for ocr_rect in ocr_rects:
all_ocr_cells = [] # Skip zero area boxes
for ocr_rect in ocr_rects: if ocr_rect.area() == 0:
# Skip zero area boxes continue
if ocr_rect.area() == 0: high_res_image = page._backend.get_page_image(
continue scale=self.scale, cropbox=ocr_rect
high_res_image = page._backend.get_page_image(
scale=self.scale, cropbox=ocr_rect
)
im = numpy.array(high_res_image)
result = self.reader.readtext(im)
del high_res_image
del im
cells = [
OcrCell(
id=ix,
text=line[1],
confidence=line[2],
bbox=BoundingBox.from_tuple(
coord=(
(line[0][0][0] / self.scale) + ocr_rect.l,
(line[0][0][1] / self.scale) + ocr_rect.t,
(line[0][2][0] / self.scale) + ocr_rect.l,
(line[0][2][1] / self.scale) + ocr_rect.t,
),
origin=CoordOrigin.TOPLEFT,
),
) )
for ix, line in enumerate(result) im = numpy.array(high_res_image)
] result = self.reader.readtext(im)
all_ocr_cells.extend(cells)
## Remove OCR cells which overlap with programmatic cells. del high_res_image
filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells) del im
page.cells.extend(filtered_ocr_cells) cells = [
OcrCell(
id=ix,
text=line[1],
confidence=line[2],
bbox=BoundingBox.from_tuple(
coord=(
(line[0][0][0] / self.scale) + ocr_rect.l,
(line[0][0][1] / self.scale) + ocr_rect.t,
(line[0][2][0] / self.scale) + ocr_rect.l,
(line[0][2][1] / self.scale) + ocr_rect.t,
),
origin=CoordOrigin.TOPLEFT,
),
)
for ix, line in enumerate(result)
]
all_ocr_cells.extend(cells)
# DEBUG code: ## Remove OCR cells which overlap with programmatic cells.
# self.draw_ocr_rects_and_cells(page, ocr_rects) filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
yield page page.cells.extend(filtered_ocr_cells)
# DEBUG code:
# self.draw_ocr_rects_and_cells(page, ocr_rects)
yield page

View File

@ -273,68 +273,72 @@ class LayoutModel(BasePageModel):
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
for page in page_batch: for page in page_batch:
assert page.size is not None assert page._backend is not None
if not page._backend.is_valid():
yield page
else:
assert page.size is not None
clusters = [] clusters = []
for ix, pred_item in enumerate( for ix, pred_item in enumerate(
self.layout_predictor.predict(page.get_image(scale=1.0)) self.layout_predictor.predict(page.get_image(scale=1.0))
): ):
label = DocItemLabel( label = DocItemLabel(
pred_item["label"].lower().replace(" ", "_").replace("-", "_") pred_item["label"].lower().replace(" ", "_").replace("-", "_")
) # Temporary, until docling-ibm-model uses docling-core types ) # Temporary, until docling-ibm-model uses docling-core types
cluster = Cluster( cluster = Cluster(
id=ix, id=ix,
label=label, label=label,
confidence=pred_item["confidence"], confidence=pred_item["confidence"],
bbox=BoundingBox.model_validate(pred_item), bbox=BoundingBox.model_validate(pred_item),
cells=[], cells=[],
)
clusters.append(cluster)
# Map cells to clusters
# TODO: Remove, postprocess should take care of it anyway.
for cell in page.cells:
for cluster in clusters:
if not cell.bbox.area() > 0:
overlap_frac = 0.0
else:
overlap_frac = (
cell.bbox.intersection_area_with(cluster.bbox)
/ cell.bbox.area()
)
if overlap_frac > 0.5:
cluster.cells.append(cell)
# Pre-sort clusters
# clusters = self.sort_clusters_by_cell_order(clusters)
# DEBUG code:
def draw_clusters_and_cells():
image = copy.deepcopy(page.image)
draw = ImageDraw.Draw(image)
for c in clusters:
x0, y0, x1, y1 = c.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline="green")
cell_color = (
random.randint(30, 140),
random.randint(30, 140),
random.randint(30, 140),
) )
for tc in c.cells: # [:1]: clusters.append(cluster)
x0, y0, x1, y1 = tc.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
image.show()
# draw_clusters_and_cells() # Map cells to clusters
# TODO: Remove, postprocess should take care of it anyway.
for cell in page.cells:
for cluster in clusters:
if not cell.bbox.area() > 0:
overlap_frac = 0.0
else:
overlap_frac = (
cell.bbox.intersection_area_with(cluster.bbox)
/ cell.bbox.area()
)
clusters, page.cells = self.postprocess( if overlap_frac > 0.5:
clusters, page.cells, page.size.height cluster.cells.append(cell)
)
# draw_clusters_and_cells() # Pre-sort clusters
# clusters = self.sort_clusters_by_cell_order(clusters)
page.predictions.layout = LayoutPrediction(clusters=clusters) # DEBUG code:
def draw_clusters_and_cells():
image = copy.deepcopy(page.image)
draw = ImageDraw.Draw(image)
for c in clusters:
x0, y0, x1, y1 = c.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline="green")
yield page cell_color = (
random.randint(30, 140),
random.randint(30, 140),
random.randint(30, 140),
)
for tc in c.cells: # [:1]:
x0, y0, x1, y1 = tc.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
image.show()
# draw_clusters_and_cells()
clusters, page.cells = self.postprocess(
clusters, page.cells, page.size.height
)
# draw_clusters_and_cells()
page.predictions.layout = LayoutPrediction(clusters=clusters)
yield page

View File

@ -54,111 +54,119 @@ class PageAssembleModel(BasePageModel):
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
for page in page_batch: for page in page_batch:
assert page._backend is not None assert page._backend is not None
assert page.predictions.layout is not None if not page._backend.is_valid():
# assembles some JSON output page by page. yield page
else:
assert page.predictions.layout is not None
elements: List[PageElement] = [] # assembles some JSON output page by page.
headers: List[PageElement] = []
body: List[PageElement] = []
for cluster in page.predictions.layout.clusters: elements: List[PageElement] = []
# _log.info("Cluster label seen:", cluster.label) headers: List[PageElement] = []
if cluster.label in LayoutModel.TEXT_ELEM_LABELS: body: List[PageElement] = []
textlines = [ for cluster in page.predictions.layout.clusters:
cell.text.replace("\x02", "-").strip() # _log.info("Cluster label seen:", cluster.label)
for cell in cluster.cells if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
if len(cell.text.strip()) > 0
]
text = self.sanitize_text(textlines)
text_el = TextElement(
label=cluster.label,
id=cluster.id,
text=text,
page_no=page.page_no,
cluster=cluster,
)
elements.append(text_el)
if cluster.label in LayoutModel.PAGE_HEADER_LABELS: textlines = [
headers.append(text_el) cell.text.replace("\x02", "-").strip()
else: for cell in cluster.cells
body.append(text_el) if len(cell.text.strip()) > 0
elif cluster.label == LayoutModel.TABLE_LABEL: ]
tbl = None text = self.sanitize_text(textlines)
if page.predictions.tablestructure: text_el = TextElement(
tbl = page.predictions.tablestructure.table_map.get(
cluster.id, None
)
if (
not tbl
): # fallback: add table without structure, if it isn't present
tbl = Table(
label=cluster.label, label=cluster.label,
id=cluster.id, id=cluster.id,
text="", text=text,
otsl_seq=[],
table_cells=[],
cluster=cluster,
page_no=page.page_no, page_no=page.page_no,
cluster=cluster,
) )
elements.append(text_el)
elements.append(tbl) if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
body.append(tbl) headers.append(text_el)
elif cluster.label == LayoutModel.FIGURE_LABEL: else:
fig = None body.append(text_el)
if page.predictions.figures_classification: elif cluster.label == LayoutModel.TABLE_LABEL:
fig = page.predictions.figures_classification.figure_map.get( tbl = None
cluster.id, None if page.predictions.tablestructure:
) tbl = page.predictions.tablestructure.table_map.get(
if (
not fig
): # fallback: add figure without classification, if it isn't present
fig = FigureElement(
label=cluster.label,
id=cluster.id,
text="",
data=None,
cluster=cluster,
page_no=page.page_no,
)
elements.append(fig)
body.append(fig)
elif cluster.label == LayoutModel.FORMULA_LABEL:
equation = None
if page.predictions.equations_prediction:
equation = (
page.predictions.equations_prediction.equation_map.get(
cluster.id, None cluster.id, None
) )
) if (
if not equation: # fallback: add empty formula, if it isn't present not tbl
text = self.sanitize_text( ): # fallback: add table without structure, if it isn't present
[ tbl = Table(
cell.text.replace("\x02", "-").strip() label=cluster.label,
for cell in cluster.cells id=cluster.id,
if len(cell.text.strip()) > 0 text="",
] otsl_seq=[],
) table_cells=[],
equation = TextElement( cluster=cluster,
label=cluster.label, page_no=page.page_no,
id=cluster.id, )
cluster=cluster,
page_no=page.page_no,
text=text,
)
elements.append(equation)
body.append(equation)
page.assembled = AssembledUnit( elements.append(tbl)
elements=elements, headers=headers, body=body body.append(tbl)
) elif cluster.label == LayoutModel.FIGURE_LABEL:
fig = None
if page.predictions.figures_classification:
fig = (
page.predictions.figures_classification.figure_map.get(
cluster.id, None
)
)
if (
not fig
): # fallback: add figure without classification, if it isn't present
fig = FigureElement(
label=cluster.label,
id=cluster.id,
text="",
data=None,
cluster=cluster,
page_no=page.page_no,
)
elements.append(fig)
body.append(fig)
elif cluster.label == LayoutModel.FORMULA_LABEL:
equation = None
if page.predictions.equations_prediction:
equation = (
page.predictions.equations_prediction.equation_map.get(
cluster.id, None
)
)
if (
not equation
): # fallback: add empty formula, if it isn't present
text = self.sanitize_text(
[
cell.text.replace("\x02", "-").strip()
for cell in cluster.cells
if len(cell.text.strip()) > 0
]
)
equation = TextElement(
label=cluster.label,
id=cluster.id,
cluster=cluster,
page_no=page.page_no,
text=text,
)
elements.append(equation)
body.append(equation)
# Remove page images (can be disabled) page.assembled = AssembledUnit(
if not self.options.keep_images: elements=elements, headers=headers, body=body
page._image_cache = {} )
# Unload backend # Remove page images (can be disabled)
page._backend.unload() if not self.options.keep_images:
page._image_cache = {}
yield page # Unload backend
page._backend.unload()
yield page

View File

@ -17,9 +17,13 @@ class PagePreprocessingModel(BasePageModel):
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
for page in page_batch: for page in page_batch:
page = self._populate_page_images(page) assert page._backend is not None
page = self._parse_page_cells(page) if not page._backend.is_valid():
yield page yield page
else:
page = self._populate_page_images(page)
page = self._parse_page_cells(page)
yield page
# Generate the page image and store it in the page object # Generate the page image and store it in the page object
def _populate_page_images(self, page: Page) -> Page: def _populate_page_images(self, page: Page) -> Page:

View File

@ -71,92 +71,101 @@ class TableStructureModel(BasePageModel):
for page in page_batch: for page in page_batch:
assert page._backend is not None assert page._backend is not None
assert page.predictions.layout is not None if not page._backend.is_valid():
assert page.size is not None
page.predictions.tablestructure = TableStructurePrediction() # dummy
in_tables = [
(
cluster,
[
round(cluster.bbox.l) * self.scale,
round(cluster.bbox.t) * self.scale,
round(cluster.bbox.r) * self.scale,
round(cluster.bbox.b) * self.scale,
],
)
for cluster in page.predictions.layout.clusters
if cluster.label == DocItemLabel.TABLE
]
if not len(in_tables):
yield page yield page
continue else:
tokens = [] assert page.predictions.layout is not None
for c in page.cells: assert page.size is not None
for cluster, _ in in_tables:
if c.bbox.area() > 0:
if (
c.bbox.intersection_area_with(cluster.bbox) / c.bbox.area()
> 0.2
):
# Only allow non empty strings (spaces) into the cells of a table if len(c.text.strip()) > 0:
if len(c.text.strip()) > 0:
new_cell = copy.deepcopy(c)
new_cell.bbox = new_cell.bbox.scaled(scale=self.scale)
tokens.append(new_cell.model_dump()) page.predictions.tablestructure = TableStructurePrediction() # dummy
page_input = { in_tables = [
"tokens": tokens, (
"width": page.size.width * self.scale, cluster,
"height": page.size.height * self.scale, [
} round(cluster.bbox.l) * self.scale,
page_input["image"] = numpy.asarray(page.get_image(scale=self.scale)) round(cluster.bbox.t) * self.scale,
round(cluster.bbox.r) * self.scale,
round(cluster.bbox.b) * self.scale,
],
)
for cluster in page.predictions.layout.clusters
if cluster.label == DocItemLabel.TABLE
]
if not len(in_tables):
yield page
continue
table_clusters, table_bboxes = zip(*in_tables) tokens = []
for c in page.cells:
for cluster, _ in in_tables:
if c.bbox.area() > 0:
if (
c.bbox.intersection_area_with(cluster.bbox)
/ c.bbox.area()
> 0.2
):
# Only allow non empty strings (spaces) into the cells of a table
if len(c.text.strip()) > 0:
new_cell = copy.deepcopy(c)
new_cell.bbox = new_cell.bbox.scaled(
scale=self.scale
)
if len(table_bboxes): tokens.append(new_cell.model_dump())
tf_output = self.tf_predictor.multi_table_predict(
page_input, table_bboxes, do_matching=self.do_cell_matching
)
for table_cluster, table_out in zip(table_clusters, tf_output): page_input = {
table_cells = [] "tokens": tokens,
for element in table_out["tf_responses"]: "width": page.size.width * self.scale,
"height": page.size.height * self.scale,
}
page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
if not self.do_cell_matching: table_clusters, table_bboxes = zip(*in_tables)
the_bbox = BoundingBox.model_validate(
element["bbox"]
).scaled(1 / self.scale)
text_piece = page._backend.get_text_in_rect(the_bbox)
element["bbox"]["token"] = text_piece
tc = TableCell.model_validate(element) if len(table_bboxes):
if self.do_cell_matching and tc.bbox is not None: tf_output = self.tf_predictor.multi_table_predict(
tc.bbox = tc.bbox.scaled(1 / self.scale) page_input, table_bboxes, do_matching=self.do_cell_matching
table_cells.append(tc)
# Retrieving cols/rows, after post processing:
num_rows = table_out["predict_details"]["num_rows"]
num_cols = table_out["predict_details"]["num_cols"]
otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
tbl = Table(
otsl_seq=otsl_seq,
table_cells=table_cells,
num_rows=num_rows,
num_cols=num_cols,
id=table_cluster.id,
page_no=page.page_no,
cluster=table_cluster,
label=DocItemLabel.TABLE,
) )
page.predictions.tablestructure.table_map[table_cluster.id] = tbl for table_cluster, table_out in zip(table_clusters, tf_output):
table_cells = []
for element in table_out["tf_responses"]:
# For debugging purposes: if not self.do_cell_matching:
# self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values()) the_bbox = BoundingBox.model_validate(
element["bbox"]
).scaled(1 / self.scale)
text_piece = page._backend.get_text_in_rect(the_bbox)
element["bbox"]["token"] = text_piece
yield page tc = TableCell.model_validate(element)
if self.do_cell_matching and tc.bbox is not None:
tc.bbox = tc.bbox.scaled(1 / self.scale)
table_cells.append(tc)
# Retrieving cols/rows, after post processing:
num_rows = table_out["predict_details"]["num_rows"]
num_cols = table_out["predict_details"]["num_cols"]
otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
tbl = Table(
otsl_seq=otsl_seq,
table_cells=table_cells,
num_rows=num_rows,
num_cols=num_cols,
id=table_cluster.id,
page_no=page.page_no,
cluster=table_cluster,
label=DocItemLabel.TABLE,
)
page.predictions.tablestructure.table_map[table_cluster.id] = (
tbl
)
# For debugging purposes:
# self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values())
yield page

View File

@ -110,61 +110,65 @@ class TesseractOcrCliModel(BaseOcrModel):
for page in page_batch: for page in page_batch:
assert page._backend is not None assert page._backend is not None
if not page._backend.is_valid():
yield page
else:
ocr_rects = self.get_ocr_rects(page)
ocr_rects = self.get_ocr_rects(page) all_ocr_cells = []
for ocr_rect in ocr_rects:
all_ocr_cells = [] # Skip zero area boxes
for ocr_rect in ocr_rects: if ocr_rect.area() == 0:
# Skip zero area boxes continue
if ocr_rect.area() == 0: high_res_image = page._backend.get_page_image(
continue scale=self.scale, cropbox=ocr_rect
high_res_image = page._backend.get_page_image(
scale=self.scale, cropbox=ocr_rect
)
with tempfile.NamedTemporaryFile(suffix=".png", mode="w") as image_file:
fname = image_file.name
high_res_image.save(fname)
df = self._run_tesseract(fname)
# _log.info(df)
# Print relevant columns (bounding box and text)
for ix, row in df.iterrows():
text = row["text"]
conf = row["conf"]
l = float(row["left"])
b = float(row["top"])
w = float(row["width"])
h = float(row["height"])
t = b + h
r = l + w
cell = OcrCell(
id=ix,
text=text,
confidence=conf / 100.0,
bbox=BoundingBox.from_tuple(
coord=(
(l / self.scale) + ocr_rect.l,
(b / self.scale) + ocr_rect.t,
(r / self.scale) + ocr_rect.l,
(t / self.scale) + ocr_rect.t,
),
origin=CoordOrigin.TOPLEFT,
),
) )
all_ocr_cells.append(cell)
## Remove OCR cells which overlap with programmatic cells. with tempfile.NamedTemporaryFile(
filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells) suffix=".png", mode="w"
) as image_file:
fname = image_file.name
high_res_image.save(fname)
page.cells.extend(filtered_ocr_cells) df = self._run_tesseract(fname)
# DEBUG code: # _log.info(df)
# self.draw_ocr_rects_and_cells(page, ocr_rects)
yield page # Print relevant columns (bounding box and text)
for ix, row in df.iterrows():
text = row["text"]
conf = row["conf"]
l = float(row["left"])
b = float(row["top"])
w = float(row["width"])
h = float(row["height"])
t = b + h
r = l + w
cell = OcrCell(
id=ix,
text=text,
confidence=conf / 100.0,
bbox=BoundingBox.from_tuple(
coord=(
(l / self.scale) + ocr_rect.l,
(b / self.scale) + ocr_rect.t,
(r / self.scale) + ocr_rect.l,
(t / self.scale) + ocr_rect.t,
),
origin=CoordOrigin.TOPLEFT,
),
)
all_ocr_cells.append(cell)
## Remove OCR cells which overlap with programmatic cells.
filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
page.cells.extend(filtered_ocr_cells)
# DEBUG code:
# self.draw_ocr_rects_and_cells(page, ocr_rects)
yield page

View File

@ -69,57 +69,62 @@ class TesseractOcrModel(BaseOcrModel):
for page in page_batch: for page in page_batch:
assert page._backend is not None assert page._backend is not None
assert self.reader is not None if not page._backend.is_valid():
yield page
else:
assert self.reader is not None
ocr_rects = self.get_ocr_rects(page) ocr_rects = self.get_ocr_rects(page)
all_ocr_cells = [] all_ocr_cells = []
for ocr_rect in ocr_rects: for ocr_rect in ocr_rects:
# Skip zero area boxes # Skip zero area boxes
if ocr_rect.area() == 0: if ocr_rect.area() == 0:
continue continue
high_res_image = page._backend.get_page_image( high_res_image = page._backend.get_page_image(
scale=self.scale, cropbox=ocr_rect scale=self.scale, cropbox=ocr_rect
)
# Retrieve text snippets with their bounding boxes
self.reader.SetImage(high_res_image)
boxes = self.reader.GetComponentImages(self.reader_RIL.TEXTLINE, True)
cells = []
for ix, (im, box, _, _) in enumerate(boxes):
# Set the area of interest. Tesseract uses Bottom-Left for the origin
self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
# Extract text within the bounding box
text = self.reader.GetUTF8Text().strip()
confidence = self.reader.MeanTextConf()
left = box["x"] / self.scale
bottom = box["y"] / self.scale
right = (box["x"] + box["w"]) / self.scale
top = (box["y"] + box["h"]) / self.scale
cells.append(
OcrCell(
id=ix,
text=text,
confidence=confidence,
bbox=BoundingBox.from_tuple(
coord=(left, top, right, bottom),
origin=CoordOrigin.TOPLEFT,
),
)
) )
# del high_res_image # Retrieve text snippets with their bounding boxes
all_ocr_cells.extend(cells) self.reader.SetImage(high_res_image)
boxes = self.reader.GetComponentImages(
self.reader_RIL.TEXTLINE, True
)
## Remove OCR cells which overlap with programmatic cells. cells = []
filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells) for ix, (im, box, _, _) in enumerate(boxes):
# Set the area of interest. Tesseract uses Bottom-Left for the origin
self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
page.cells.extend(filtered_ocr_cells) # Extract text within the bounding box
text = self.reader.GetUTF8Text().strip()
confidence = self.reader.MeanTextConf()
left = box["x"] / self.scale
bottom = box["y"] / self.scale
right = (box["x"] + box["w"]) / self.scale
top = (box["y"] + box["h"]) / self.scale
# DEBUG code: cells.append(
# self.draw_ocr_rects_and_cells(page, ocr_rects) OcrCell(
id=ix,
text=text,
confidence=confidence,
bbox=BoundingBox.from_tuple(
coord=(left, top, right, bottom),
origin=CoordOrigin.TOPLEFT,
),
)
)
yield page # del high_res_image
all_ocr_cells.extend(cells)
## Remove OCR cells which overlap with programmatic cells.
filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
page.cells.extend(filtered_ocr_cells)
# DEBUG code:
# self.draw_ocr_rects_and_cells(page, ocr_rects)
yield page

View File

@ -134,13 +134,13 @@ class StandardPdfPipeline(PaginatedPipeline):
all_body = [] all_body = []
for p in conv_res.pages: for p in conv_res.pages:
assert p.assembled is not None if p.assembled is not None:
for el in p.assembled.body: for el in p.assembled.body:
all_body.append(el) all_body.append(el)
for el in p.assembled.headers: for el in p.assembled.headers:
all_headers.append(el) all_headers.append(el)
for el in p.assembled.elements: for el in p.assembled.elements:
all_elements.append(el) all_elements.append(el)
conv_res.assembled = AssembledUnit( conv_res.assembled = AssembledUnit(
elements=all_elements, headers=all_headers, body=all_body elements=all_elements, headers=all_headers, body=all_body

View File

@ -126,7 +126,7 @@ input_files = [
] ]
# Directly pass list of files or streams to `convert_all` # Directly pass list of files or streams to `convert_all`
conv_results_iter = doc_converter.convert_all(input_files) # previously `convert_batch` conv_results_iter = doc_converter.convert_all(input_files) # previously `convert`
``` ```
Through the `raises_on_error` argument, you can also control if the conversion should raise exceptions when first Through the `raises_on_error` argument, you can also control if the conversion should raise exceptions when first
@ -135,7 +135,7 @@ By default, any error is immediately raised and the conversion aborts (previousl
```python ```python
... ...
conv_results_iter = doc_converter.convert_all(input_files, raises_on_error=False) # previously `convert_batch` conv_results_iter = doc_converter.convert_all(input_files, raises_on_error=False) # previously `convert`
``` ```