Ensure all models work only on valid pages (#158)

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2024-10-18 08:54:06 +02:00
parent 034a411057
commit a00c937e19
10 changed files with 413 additions and 376 deletions
@@ -202,6 +202,7 @@ class GlmModel:
        page_dimensions = [
            PageDimensions(page=p.page_no + 1, height=p.size.height, width=p.size.width)
            for p in conv_res.pages
+            if p.size is not None
        ]

        ds_doc: DsDocument = DsDocument(
@@ -41,48 +41,50 @@ class EasyOcrModel(BaseOcrModel):

        for page in page_batch:
            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                ocr_rects = self.get_ocr_rects(page)

-            ocr_rects = self.get_ocr_rects(page)
-
-            all_ocr_cells = []
-            for ocr_rect in ocr_rects:
-                # Skip zero area boxes
-                if ocr_rect.area() == 0:
-                    continue
-                high_res_image = page._backend.get_page_image(
-                    scale=self.scale, cropbox=ocr_rect
-                )
-                im = numpy.array(high_res_image)
-                result = self.reader.readtext(im)
-
-                del high_res_image
-                del im
-
-                cells = [
-                    OcrCell(
-                        id=ix,
-                        text=line[1],
-                        confidence=line[2],
-                        bbox=BoundingBox.from_tuple(
-                            coord=(
-                                (line[0][0][0] / self.scale) + ocr_rect.l,
-                                (line[0][0][1] / self.scale) + ocr_rect.t,
-                                (line[0][2][0] / self.scale) + ocr_rect.l,
-                                (line[0][2][1] / self.scale) + ocr_rect.t,
-                            ),
-                            origin=CoordOrigin.TOPLEFT,
-                        ),
+                all_ocr_cells = []
+                for ocr_rect in ocr_rects:
+                    # Skip zero area boxes
+                    if ocr_rect.area() == 0:
+                        continue
+                    high_res_image = page._backend.get_page_image(
+                        scale=self.scale, cropbox=ocr_rect
                    )
-                    for ix, line in enumerate(result)
-                ]
-                all_ocr_cells.extend(cells)
+                    im = numpy.array(high_res_image)
+                    result = self.reader.readtext(im)

-            ## Remove OCR cells which overlap with programmatic cells.
-            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+                    del high_res_image
+                    del im

-            page.cells.extend(filtered_ocr_cells)
+                    cells = [
+                        OcrCell(
+                            id=ix,
+                            text=line[1],
+                            confidence=line[2],
+                            bbox=BoundingBox.from_tuple(
+                                coord=(
+                                    (line[0][0][0] / self.scale) + ocr_rect.l,
+                                    (line[0][0][1] / self.scale) + ocr_rect.t,
+                                    (line[0][2][0] / self.scale) + ocr_rect.l,
+                                    (line[0][2][1] / self.scale) + ocr_rect.t,
+                                ),
+                                origin=CoordOrigin.TOPLEFT,
+                            ),
+                        )
+                        for ix, line in enumerate(result)
+                    ]
+                    all_ocr_cells.extend(cells)

-            # DEBUG code:
-            # self.draw_ocr_rects_and_cells(page, ocr_rects)
+                ## Remove OCR cells which overlap with programmatic cells.
+                filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)

-            yield page
+                page.cells.extend(filtered_ocr_cells)
+
+                # DEBUG code:
+                # self.draw_ocr_rects_and_cells(page, ocr_rects)
+
+                yield page
@@ -273,68 +273,72 @@ class LayoutModel(BasePageModel):

    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        for page in page_batch:
-            assert page.size is not None
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                assert page.size is not None

-            clusters = []
-            for ix, pred_item in enumerate(
-                self.layout_predictor.predict(page.get_image(scale=1.0))
-            ):
-                label = DocItemLabel(
-                    pred_item["label"].lower().replace(" ", "_").replace("-", "_")
-                )  # Temporary, until docling-ibm-model uses docling-core types
-                cluster = Cluster(
-                    id=ix,
-                    label=label,
-                    confidence=pred_item["confidence"],
-                    bbox=BoundingBox.model_validate(pred_item),
-                    cells=[],
-                )
-                clusters.append(cluster)
-
-            # Map cells to clusters
-            # TODO: Remove, postprocess should take care of it anyway.
-            for cell in page.cells:
-                for cluster in clusters:
-                    if not cell.bbox.area() > 0:
-                        overlap_frac = 0.0
-                    else:
-                        overlap_frac = (
-                            cell.bbox.intersection_area_with(cluster.bbox)
-                            / cell.bbox.area()
-                        )
-
-                    if overlap_frac > 0.5:
-                        cluster.cells.append(cell)
-
-            # Pre-sort clusters
-            # clusters = self.sort_clusters_by_cell_order(clusters)
-
-            # DEBUG code:
-            def draw_clusters_and_cells():
-                image = copy.deepcopy(page.image)
-                draw = ImageDraw.Draw(image)
-                for c in clusters:
-                    x0, y0, x1, y1 = c.bbox.as_tuple()
-                    draw.rectangle([(x0, y0), (x1, y1)], outline="green")
-
-                    cell_color = (
-                        random.randint(30, 140),
-                        random.randint(30, 140),
-                        random.randint(30, 140),
+                clusters = []
+                for ix, pred_item in enumerate(
+                    self.layout_predictor.predict(page.get_image(scale=1.0))
+                ):
+                    label = DocItemLabel(
+                        pred_item["label"].lower().replace(" ", "_").replace("-", "_")
+                    )  # Temporary, until docling-ibm-model uses docling-core types
+                    cluster = Cluster(
+                        id=ix,
+                        label=label,
+                        confidence=pred_item["confidence"],
+                        bbox=BoundingBox.model_validate(pred_item),
+                        cells=[],
                    )
-                    for tc in c.cells:  # [:1]:
-                        x0, y0, x1, y1 = tc.bbox.as_tuple()
-                        draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
-                image.show()
+                    clusters.append(cluster)

-            # draw_clusters_and_cells()
+                # Map cells to clusters
+                # TODO: Remove, postprocess should take care of it anyway.
+                for cell in page.cells:
+                    for cluster in clusters:
+                        if not cell.bbox.area() > 0:
+                            overlap_frac = 0.0
+                        else:
+                            overlap_frac = (
+                                cell.bbox.intersection_area_with(cluster.bbox)
+                                / cell.bbox.area()
+                            )

-            clusters, page.cells = self.postprocess(
-                clusters, page.cells, page.size.height
-            )
+                        if overlap_frac > 0.5:
+                            cluster.cells.append(cell)

-            # draw_clusters_and_cells()
+                # Pre-sort clusters
+                # clusters = self.sort_clusters_by_cell_order(clusters)

-            page.predictions.layout = LayoutPrediction(clusters=clusters)
+                # DEBUG code:
+                def draw_clusters_and_cells():
+                    image = copy.deepcopy(page.image)
+                    draw = ImageDraw.Draw(image)
+                    for c in clusters:
+                        x0, y0, x1, y1 = c.bbox.as_tuple()
+                        draw.rectangle([(x0, y0), (x1, y1)], outline="green")

-            yield page
+                        cell_color = (
+                            random.randint(30, 140),
+                            random.randint(30, 140),
+                            random.randint(30, 140),
+                        )
+                        for tc in c.cells:  # [:1]:
+                            x0, y0, x1, y1 = tc.bbox.as_tuple()
+                            draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
+                    image.show()
+
+                # draw_clusters_and_cells()
+
+                clusters, page.cells = self.postprocess(
+                    clusters, page.cells, page.size.height
+                )
+
+                # draw_clusters_and_cells()
+
+                page.predictions.layout = LayoutPrediction(clusters=clusters)
+
+                yield page
@@ -54,111 +54,119 @@ class PageAssembleModel(BasePageModel):
    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        for page in page_batch:
            assert page._backend is not None
-            assert page.predictions.layout is not None
-            # assembles some JSON output page by page.
+            if not page._backend.is_valid():
+                yield page
+            else:
+                assert page.predictions.layout is not None

-            elements: List[PageElement] = []
-            headers: List[PageElement] = []
-            body: List[PageElement] = []
+                # assembles some JSON output page by page.

-            for cluster in page.predictions.layout.clusters:
-                # _log.info("Cluster label seen:", cluster.label)
-                if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
+                elements: List[PageElement] = []
+                headers: List[PageElement] = []
+                body: List[PageElement] = []

-                    textlines = [
-                        cell.text.replace("\x02", "-").strip()
-                        for cell in cluster.cells
-                        if len(cell.text.strip()) > 0
-                    ]
-                    text = self.sanitize_text(textlines)
-                    text_el = TextElement(
-                        label=cluster.label,
-                        id=cluster.id,
-                        text=text,
-                        page_no=page.page_no,
-                        cluster=cluster,
-                    )
-                    elements.append(text_el)
+                for cluster in page.predictions.layout.clusters:
+                    # _log.info("Cluster label seen:", cluster.label)
+                    if cluster.label in LayoutModel.TEXT_ELEM_LABELS:

-                    if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
-                        headers.append(text_el)
-                    else:
-                        body.append(text_el)
-                elif cluster.label == LayoutModel.TABLE_LABEL:
-                    tbl = None
-                    if page.predictions.tablestructure:
-                        tbl = page.predictions.tablestructure.table_map.get(
-                            cluster.id, None
-                        )
-                    if (
-                        not tbl
-                    ):  # fallback: add table without structure, if it isn't present
-                        tbl = Table(
+                        textlines = [
+                            cell.text.replace("\x02", "-").strip()
+                            for cell in cluster.cells
+                            if len(cell.text.strip()) > 0
+                        ]
+                        text = self.sanitize_text(textlines)
+                        text_el = TextElement(
                            label=cluster.label,
                            id=cluster.id,
-                            text="",
-                            otsl_seq=[],
-                            table_cells=[],
-                            cluster=cluster,
+                            text=text,
                            page_no=page.page_no,
+                            cluster=cluster,
                        )
+                        elements.append(text_el)

-                    elements.append(tbl)
-                    body.append(tbl)
-                elif cluster.label == LayoutModel.FIGURE_LABEL:
-                    fig = None
-                    if page.predictions.figures_classification:
-                        fig = page.predictions.figures_classification.figure_map.get(
-                            cluster.id, None
-                        )
-                    if (
-                        not fig
-                    ):  # fallback: add figure without classification, if it isn't present
-                        fig = FigureElement(
-                            label=cluster.label,
-                            id=cluster.id,
-                            text="",
-                            data=None,
-                            cluster=cluster,
-                            page_no=page.page_no,
-                        )
-                    elements.append(fig)
-                    body.append(fig)
-                elif cluster.label == LayoutModel.FORMULA_LABEL:
-                    equation = None
-                    if page.predictions.equations_prediction:
-                        equation = (
-                            page.predictions.equations_prediction.equation_map.get(
+                        if cluster.label in LayoutModel.PAGE_HEADER_LABELS:
+                            headers.append(text_el)
+                        else:
+                            body.append(text_el)
+                    elif cluster.label == LayoutModel.TABLE_LABEL:
+                        tbl = None
+                        if page.predictions.tablestructure:
+                            tbl = page.predictions.tablestructure.table_map.get(
                                cluster.id, None
                            )
-                        )
-                    if not equation:  # fallback: add empty formula, if it isn't present
-                        text = self.sanitize_text(
-                            [
-                                cell.text.replace("\x02", "-").strip()
-                                for cell in cluster.cells
-                                if len(cell.text.strip()) > 0
-                            ]
-                        )
-                        equation = TextElement(
-                            label=cluster.label,
-                            id=cluster.id,
-                            cluster=cluster,
-                            page_no=page.page_no,
-                            text=text,
-                        )
-                    elements.append(equation)
-                    body.append(equation)
+                        if (
+                            not tbl
+                        ):  # fallback: add table without structure, if it isn't present
+                            tbl = Table(
+                                label=cluster.label,
+                                id=cluster.id,
+                                text="",
+                                otsl_seq=[],
+                                table_cells=[],
+                                cluster=cluster,
+                                page_no=page.page_no,
+                            )

-            page.assembled = AssembledUnit(
-                elements=elements, headers=headers, body=body
-            )
+                        elements.append(tbl)
+                        body.append(tbl)
+                    elif cluster.label == LayoutModel.FIGURE_LABEL:
+                        fig = None
+                        if page.predictions.figures_classification:
+                            fig = (
+                                page.predictions.figures_classification.figure_map.get(
+                                    cluster.id, None
+                                )
+                            )
+                        if (
+                            not fig
+                        ):  # fallback: add figure without classification, if it isn't present
+                            fig = FigureElement(
+                                label=cluster.label,
+                                id=cluster.id,
+                                text="",
+                                data=None,
+                                cluster=cluster,
+                                page_no=page.page_no,
+                            )
+                        elements.append(fig)
+                        body.append(fig)
+                    elif cluster.label == LayoutModel.FORMULA_LABEL:
+                        equation = None
+                        if page.predictions.equations_prediction:
+                            equation = (
+                                page.predictions.equations_prediction.equation_map.get(
+                                    cluster.id, None
+                                )
+                            )
+                        if (
+                            not equation
+                        ):  # fallback: add empty formula, if it isn't present
+                            text = self.sanitize_text(
+                                [
+                                    cell.text.replace("\x02", "-").strip()
+                                    for cell in cluster.cells
+                                    if len(cell.text.strip()) > 0
+                                ]
+                            )
+                            equation = TextElement(
+                                label=cluster.label,
+                                id=cluster.id,
+                                cluster=cluster,
+                                page_no=page.page_no,
+                                text=text,
+                            )
+                        elements.append(equation)
+                        body.append(equation)

-            # Remove page images (can be disabled)
-            if not self.options.keep_images:
-                page._image_cache = {}
+                page.assembled = AssembledUnit(
+                    elements=elements, headers=headers, body=body
+                )

-            # Unload backend
-            page._backend.unload()
+                # Remove page images (can be disabled)
+                if not self.options.keep_images:
+                    page._image_cache = {}

-            yield page
+                # Unload backend
+                page._backend.unload()
+
+                yield page
@@ -17,9 +17,13 @@ class PagePreprocessingModel(BasePageModel):

    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        for page in page_batch:
-            page = self._populate_page_images(page)
-            page = self._parse_page_cells(page)
-            yield page
+            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                page = self._populate_page_images(page)
+                page = self._parse_page_cells(page)
+                yield page

    # Generate the page image and store it in the page object
    def _populate_page_images(self, page: Page) -> Page:
@@ -71,92 +71,101 @@ class TableStructureModel(BasePageModel):

        for page in page_batch:
            assert page._backend is not None
-            assert page.predictions.layout is not None
-            assert page.size is not None
-
-            page.predictions.tablestructure = TableStructurePrediction()  # dummy
-
-            in_tables = [
-                (
-                    cluster,
-                    [
-                        round(cluster.bbox.l) * self.scale,
-                        round(cluster.bbox.t) * self.scale,
-                        round(cluster.bbox.r) * self.scale,
-                        round(cluster.bbox.b) * self.scale,
-                    ],
-                )
-                for cluster in page.predictions.layout.clusters
-                if cluster.label == DocItemLabel.TABLE
-            ]
-            if not len(in_tables):
+            if not page._backend.is_valid():
                yield page
-                continue
+            else:

-            tokens = []
-            for c in page.cells:
-                for cluster, _ in in_tables:
-                    if c.bbox.area() > 0:
-                        if (
-                            c.bbox.intersection_area_with(cluster.bbox) / c.bbox.area()
-                            > 0.2
-                        ):
-                            # Only allow non empty stings (spaces) into the cells of a table
-                            if len(c.text.strip()) > 0:
-                                new_cell = copy.deepcopy(c)
-                                new_cell.bbox = new_cell.bbox.scaled(scale=self.scale)
+                assert page.predictions.layout is not None
+                assert page.size is not None

-                                tokens.append(new_cell.model_dump())
+                page.predictions.tablestructure = TableStructurePrediction()  # dummy

-            page_input = {
-                "tokens": tokens,
-                "width": page.size.width * self.scale,
-                "height": page.size.height * self.scale,
-            }
-            page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))
+                in_tables = [
+                    (
+                        cluster,
+                        [
+                            round(cluster.bbox.l) * self.scale,
+                            round(cluster.bbox.t) * self.scale,
+                            round(cluster.bbox.r) * self.scale,
+                            round(cluster.bbox.b) * self.scale,
+                        ],
+                    )
+                    for cluster in page.predictions.layout.clusters
+                    if cluster.label == DocItemLabel.TABLE
+                ]
+                if not len(in_tables):
+                    yield page
+                    continue

-            table_clusters, table_bboxes = zip(*in_tables)
+                tokens = []
+                for c in page.cells:
+                    for cluster, _ in in_tables:
+                        if c.bbox.area() > 0:
+                            if (
+                                c.bbox.intersection_area_with(cluster.bbox)
+                                / c.bbox.area()
+                                > 0.2
+                            ):
+                                # Only allow non-empty strings (not whitespace-only) into the cells of a table
+                                if len(c.text.strip()) > 0:
+                                    new_cell = copy.deepcopy(c)
+                                    new_cell.bbox = new_cell.bbox.scaled(
+                                        scale=self.scale
+                                    )

-            if len(table_bboxes):
-                tf_output = self.tf_predictor.multi_table_predict(
-                    page_input, table_bboxes, do_matching=self.do_cell_matching
-                )
+                                    tokens.append(new_cell.model_dump())

-                for table_cluster, table_out in zip(table_clusters, tf_output):
-                    table_cells = []
-                    for element in table_out["tf_responses"]:
+                page_input = {
+                    "tokens": tokens,
+                    "width": page.size.width * self.scale,
+                    "height": page.size.height * self.scale,
+                }
+                page_input["image"] = numpy.asarray(page.get_image(scale=self.scale))

-                        if not self.do_cell_matching:
-                            the_bbox = BoundingBox.model_validate(
-                                element["bbox"]
-                            ).scaled(1 / self.scale)
-                            text_piece = page._backend.get_text_in_rect(the_bbox)
-                            element["bbox"]["token"] = text_piece
+                table_clusters, table_bboxes = zip(*in_tables)

-                        tc = TableCell.model_validate(element)
-                        if self.do_cell_matching and tc.bbox is not None:
-                            tc.bbox = tc.bbox.scaled(1 / self.scale)
-                        table_cells.append(tc)
-
-                    # Retrieving cols/rows, after post processing:
-                    num_rows = table_out["predict_details"]["num_rows"]
-                    num_cols = table_out["predict_details"]["num_cols"]
-                    otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
-
-                    tbl = Table(
-                        otsl_seq=otsl_seq,
-                        table_cells=table_cells,
-                        num_rows=num_rows,
-                        num_cols=num_cols,
-                        id=table_cluster.id,
-                        page_no=page.page_no,
-                        cluster=table_cluster,
-                        label=DocItemLabel.TABLE,
+                if len(table_bboxes):
+                    tf_output = self.tf_predictor.multi_table_predict(
+                        page_input, table_bboxes, do_matching=self.do_cell_matching
                    )

-                    page.predictions.tablestructure.table_map[table_cluster.id] = tbl
+                    for table_cluster, table_out in zip(table_clusters, tf_output):
+                        table_cells = []
+                        for element in table_out["tf_responses"]:

-                # For debugging purposes:
-                # self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values())
+                            if not self.do_cell_matching:
+                                the_bbox = BoundingBox.model_validate(
+                                    element["bbox"]
+                                ).scaled(1 / self.scale)
+                                text_piece = page._backend.get_text_in_rect(the_bbox)
+                                element["bbox"]["token"] = text_piece

-            yield page
+                            tc = TableCell.model_validate(element)
+                            if self.do_cell_matching and tc.bbox is not None:
+                                tc.bbox = tc.bbox.scaled(1 / self.scale)
+                            table_cells.append(tc)
+
+                        # Retrieving cols/rows, after post processing:
+                        num_rows = table_out["predict_details"]["num_rows"]
+                        num_cols = table_out["predict_details"]["num_cols"]
+                        otsl_seq = table_out["predict_details"]["prediction"]["rs_seq"]
+
+                        tbl = Table(
+                            otsl_seq=otsl_seq,
+                            table_cells=table_cells,
+                            num_rows=num_rows,
+                            num_cols=num_cols,
+                            id=table_cluster.id,
+                            page_no=page.page_no,
+                            cluster=table_cluster,
+                            label=DocItemLabel.TABLE,
+                        )
+
+                        page.predictions.tablestructure.table_map[table_cluster.id] = (
+                            tbl
+                        )
+
+                    # For debugging purposes:
+                    # self.draw_table_and_cells(page, page.predictions.tablestructure.table_map.values())
+
+                yield page
@@ -110,61 +110,65 @@ class TesseractOcrCliModel(BaseOcrModel):

        for page in page_batch:
            assert page._backend is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                ocr_rects = self.get_ocr_rects(page)

-            ocr_rects = self.get_ocr_rects(page)
-
-            all_ocr_cells = []
-            for ocr_rect in ocr_rects:
-                # Skip zero area boxes
-                if ocr_rect.area() == 0:
-                    continue
-                high_res_image = page._backend.get_page_image(
-                    scale=self.scale, cropbox=ocr_rect
-                )
-
-                with tempfile.NamedTemporaryFile(suffix=".png", mode="w") as image_file:
-                    fname = image_file.name
-                    high_res_image.save(fname)
-
-                    df = self._run_tesseract(fname)
-
-                # _log.info(df)
-
-                # Print relevant columns (bounding box and text)
-                for ix, row in df.iterrows():
-                    text = row["text"]
-                    conf = row["conf"]
-
-                    l = float(row["left"])
-                    b = float(row["top"])
-                    w = float(row["width"])
-                    h = float(row["height"])
-
-                    t = b + h
-                    r = l + w
-
-                    cell = OcrCell(
-                        id=ix,
-                        text=text,
-                        confidence=conf / 100.0,
-                        bbox=BoundingBox.from_tuple(
-                            coord=(
-                                (l / self.scale) + ocr_rect.l,
-                                (b / self.scale) + ocr_rect.t,
-                                (r / self.scale) + ocr_rect.l,
-                                (t / self.scale) + ocr_rect.t,
-                            ),
-                            origin=CoordOrigin.TOPLEFT,
-                        ),
+                all_ocr_cells = []
+                for ocr_rect in ocr_rects:
+                    # Skip zero area boxes
+                    if ocr_rect.area() == 0:
+                        continue
+                    high_res_image = page._backend.get_page_image(
+                        scale=self.scale, cropbox=ocr_rect
                    )
-                    all_ocr_cells.append(cell)

-            ## Remove OCR cells which overlap with programmatic cells.
-            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+                    with tempfile.NamedTemporaryFile(
+                        suffix=".png", mode="w"
+                    ) as image_file:
+                        fname = image_file.name
+                        high_res_image.save(fname)

-            page.cells.extend(filtered_ocr_cells)
+                        df = self._run_tesseract(fname)

-            # DEBUG code:
-            # self.draw_ocr_rects_and_cells(page, ocr_rects)
+                    # _log.info(df)

-            yield page
+                    # Print relevant columns (bounding box and text)
+                    for ix, row in df.iterrows():
+                        text = row["text"]
+                        conf = row["conf"]
+
+                        l = float(row["left"])
+                        b = float(row["top"])
+                        w = float(row["width"])
+                        h = float(row["height"])
+
+                        t = b + h
+                        r = l + w
+
+                        cell = OcrCell(
+                            id=ix,
+                            text=text,
+                            confidence=conf / 100.0,
+                            bbox=BoundingBox.from_tuple(
+                                coord=(
+                                    (l / self.scale) + ocr_rect.l,
+                                    (b / self.scale) + ocr_rect.t,
+                                    (r / self.scale) + ocr_rect.l,
+                                    (t / self.scale) + ocr_rect.t,
+                                ),
+                                origin=CoordOrigin.TOPLEFT,
+                            ),
+                        )
+                        all_ocr_cells.append(cell)
+
+                ## Remove OCR cells which overlap with programmatic cells.
+                filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+
+                page.cells.extend(filtered_ocr_cells)
+
+                # DEBUG code:
+                # self.draw_ocr_rects_and_cells(page, ocr_rects)
+
+                yield page
@@ -69,57 +69,62 @@ class TesseractOcrModel(BaseOcrModel):

        for page in page_batch:
            assert page._backend is not None
-            assert self.reader is not None
+            if not page._backend.is_valid():
+                yield page
+            else:
+                assert self.reader is not None

-            ocr_rects = self.get_ocr_rects(page)
+                ocr_rects = self.get_ocr_rects(page)

-            all_ocr_cells = []
-            for ocr_rect in ocr_rects:
-                # Skip zero area boxes
-                if ocr_rect.area() == 0:
-                    continue
-                high_res_image = page._backend.get_page_image(
-                    scale=self.scale, cropbox=ocr_rect
-                )
-
-                # Retrieve text snippets with their bounding boxes
-                self.reader.SetImage(high_res_image)
-                boxes = self.reader.GetComponentImages(self.reader_RIL.TEXTLINE, True)
-
-                cells = []
-                for ix, (im, box, _, _) in enumerate(boxes):
-                    # Set the area of interest. Tesseract uses Bottom-Left for the origin
-                    self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
-
-                    # Extract text within the bounding box
-                    text = self.reader.GetUTF8Text().strip()
-                    confidence = self.reader.MeanTextConf()
-                    left = box["x"] / self.scale
-                    bottom = box["y"] / self.scale
-                    right = (box["x"] + box["w"]) / self.scale
-                    top = (box["y"] + box["h"]) / self.scale
-
-                    cells.append(
-                        OcrCell(
-                            id=ix,
-                            text=text,
-                            confidence=confidence,
-                            bbox=BoundingBox.from_tuple(
-                                coord=(left, top, right, bottom),
-                                origin=CoordOrigin.TOPLEFT,
-                            ),
-                        )
+                all_ocr_cells = []
+                for ocr_rect in ocr_rects:
+                    # Skip zero area boxes
+                    if ocr_rect.area() == 0:
+                        continue
+                    high_res_image = page._backend.get_page_image(
+                        scale=self.scale, cropbox=ocr_rect
                    )

-                # del high_res_image
-                all_ocr_cells.extend(cells)
+                    # Retrieve text snippets with their bounding boxes
+                    self.reader.SetImage(high_res_image)
+                    boxes = self.reader.GetComponentImages(
+                        self.reader_RIL.TEXTLINE, True
+                    )

-            ## Remove OCR cells which overlap with programmatic cells.
-            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+                    cells = []
+                    for ix, (im, box, _, _) in enumerate(boxes):
+                        # Set the area of interest. Tesseract uses Bottom-Left for the origin
+                        self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])

-            page.cells.extend(filtered_ocr_cells)
+                        # Extract text within the bounding box
+                        text = self.reader.GetUTF8Text().strip()
+                        confidence = self.reader.MeanTextConf()
+                        left = box["x"] / self.scale
+                        bottom = box["y"] / self.scale
+                        right = (box["x"] + box["w"]) / self.scale
+                        top = (box["y"] + box["h"]) / self.scale

-            # DEBUG code:
-            # self.draw_ocr_rects_and_cells(page, ocr_rects)
+                        cells.append(
+                            OcrCell(
+                                id=ix,
+                                text=text,
+                                confidence=confidence,
+                                bbox=BoundingBox.from_tuple(
+                                    coord=(left, top, right, bottom),
+                                    origin=CoordOrigin.TOPLEFT,
+                                ),
+                            )
+                        )

-            yield page
+                    # del high_res_image
+                    all_ocr_cells.extend(cells)
+
+                ## Remove OCR cells which overlap with programmatic cells.
+                filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+
+                page.cells.extend(filtered_ocr_cells)
+
+                # DEBUG code:
+                # self.draw_ocr_rects_and_cells(page, ocr_rects)
+
+                yield page
@@ -134,13 +134,13 @@ class StandardPdfPipeline(PaginatedPipeline):
        all_body = []

        for p in conv_res.pages:
-            assert p.assembled is not None
-            for el in p.assembled.body:
-                all_body.append(el)
-            for el in p.assembled.headers:
-                all_headers.append(el)
-            for el in p.assembled.elements:
-                all_elements.append(el)
+            if p.assembled is not None:
+                for el in p.assembled.body:
+                    all_body.append(el)
+                for el in p.assembled.headers:
+                    all_headers.append(el)
+                for el in p.assembled.elements:
+                    all_elements.append(el)

        conv_res.assembled = AssembledUnit(
            elements=all_elements, headers=all_headers, body=all_body
@@ -126,7 +126,7 @@ input_files = [
 ]

 # Directly pass list of files or streams to `convert_all`
-conv_results_iter = doc_converter.convert_all(input_files) # previously `convert_batch`
+conv_results_iter = doc_converter.convert_all(input_files) # previously `convert`

 ```
 Through the `raises_on_error` argument, you can also control if the conversion should raise exceptions when first
@@ -135,7 +135,7 @@ By default, any error is immediately raised and the conversion aborts (previousl

 ```python
 ...
-conv_results_iter = doc_converter.convert_all(input_files, raises_on_error=False) # previously `convert_batch`
+conv_results_iter = doc_converter.convert_all(input_files, raises_on_error=False) # previously `convert`

 ```