feat: add options for choosing OCR engines (#118)

--------- Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> Signed-off-by: Peter Staar <taa@zurich.ibm.com> Co-authored-by: Nikos Livathinos <nli@zurich.ibm.com> Co-authored-by: Peter Staar <taa@zurich.ibm.com>
2024-10-08 19:07:08 +02:00 · 2024-10-08 19:07:08 +02:00 · f96ea86a00
commit f96ea86a00
parent d412c363d7
20 changed files with 699 additions and 32 deletions
--- a/.github/workflows/checks.yml
+++ b/.github/workflows/checks.yml
@ -9,6 +9,11 @@ jobs:
        python-version: ['3.10', '3.11', '3.12']
    steps:
      - uses: actions/checkout@v3
+      - name: Install tesseract
+        run: sudo apt-get install -y tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa libleptonica-dev libtesseract-dev pkg-config
+      - name: Set TESSDATA_PREFIX
+        run: |
+          echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
      - uses: ./.github/actions/setup-poetry
        with:
          python-version: ${{ matrix.python-version }}
@ -32,4 +37,4 @@ jobs:
            poetry run python "$file" || exit 1
          done
      - name: Build with poetry
-        run: poetry build
+        run: poetry build
--- a/README.md
+++ b/README.md
@ -52,6 +52,79 @@ Works on macOS, Linux and Windows environments. Both x86_64 and arm64 architectu
  ```
 </details>

+<details>
+  <summary><b>Alternative OCR engines</b></summary>
+
+  Docling supports multiple OCR engines for processing scanned documents. The current version provides
+  the following engines.
+
+  | Engine | Installation | Usage |
+  | ------ | ------------ | ----- |
+  | [EasyOCR](https://github.com/JaidedAI/EasyOCR) | Default in Docling or via `pip install easyocr`. | `EasyOcrOptions` |
+  | Tesseract | System dependency. See description for Tesseract and Tesserocr below.  | `TesseractOcrOptions` |
+  | Tesseract CLI | System dependency. See description below. | `TesseractCliOcrOptions` |
+
+  The Docling `DocumentConverter` lets you choose the OCR engine via the `ocr_options` setting. For example:
+
+  ```python
+    from docling.datamodel.base_models import ConversionStatus, PipelineOptions
+    from docling.datamodel.pipeline_options import PipelineOptions, EasyOcrOptions, TesseractOcrOptions
+    from docling.document_converter import DocumentConverter
+
+    pipeline_options = PipelineOptions()
+    pipeline_options.do_ocr = True
+    pipeline_options.ocr_options = TesseractOcrOptions()  # Use Tesseract
+
+    doc_converter = DocumentConverter(
+        pipeline_options=pipeline_options,
+    )
+  ```
+
+  #### Tesseract installation
+
+  [Tesseract](https://github.com/tesseract-ocr/tesseract) is a popular OCR engine which is available
+  on most operating systems. For using this engine with Docling, Tesseract must be installed on your
+  system, using the packaging tool of your choice. Below we provide example commands.
+  After installing Tesseract you are expected to provide the path to its language files using the
+  `TESSDATA_PREFIX` environment variable (note that it must terminate with a slash `/`).
+
+  For macOS, we recommend using [Homebrew](https://brew.sh/).
+
+  ```console
+  brew install tesseract leptonica pkg-config
+  TESSDATA_PREFIX=/opt/homebrew/share/tessdata/
+  echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
+  ```
+
+  For Debian-based systems.
+
+  ```console
+  apt-get install tesseract-ocr tesseract-ocr-eng libtesseract-dev libleptonica-dev pkg-config
+  TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)
+  echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
+  ```
+
+  For RHEL systems.
+
+  ```console
+  dnf install tesseract tesseract-devel tesseract-langpack-eng leptonica-devel
+  TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
+  echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
+  ```
+
+  #### Linking to Tesseract
+  The most efficient way to use the Tesseract library is via linking. Docling uses
+  the [Tesserocr](https://github.com/sirfz/tesserocr) package for this.
+
+  If you run into installation issues with Tesserocr, we suggest using the following
+  installation options:
+
+  ```console
+  pip uninstall tesserocr
+  pip install --no-binary :all: tesserocr
+  ```
+</details>
+
 <details>
  <summary><b>Docling development setup</b></summary>

--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@ -14,7 +14,12 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import ConversionStatus
 from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.datamodel.pipeline_options import PipelineOptions
+from docling.datamodel.pipeline_options import (
+    EasyOcrOptions,
+    PipelineOptions,
+    TesseractCliOcrOptions,
+    TesseractOcrOptions,
+)
 from docling.document_converter import DocumentConverter

 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@ -53,6 +58,13 @@ class Backend(str, Enum):
    DOCLING = "docling"


+# Define an enum for the ocr engines
+class OcrEngine(str, Enum):
+    """OCR engines selectable through the `ocr_engine` CLI option."""
+
+    # Each member is mapped to an options class inside `convert`.
+    EASYOCR = "easyocr"  # -> EasyOcrOptions (the CLI default)
+    TESSERACT_CLI = "tesseract_cli"  # -> TesseractCliOcrOptions
+    TESSERACT = "tesseract"  # -> TesseractOcrOptions
+
+
 def export_documents(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
@ -152,6 +164,9 @@ def convert(
    backend: Annotated[
        Backend, typer.Option(..., help="The PDF backend to use.")
    ] = Backend.DOCLING,
+    ocr_engine: Annotated[
+        OcrEngine, typer.Option(..., help="The OCR engine to use.")
+    ] = OcrEngine.EASYOCR,
    output: Annotated[
        Path, typer.Option(..., help="Output directory where results are saved.")
    ] = Path("."),
@ -191,8 +206,19 @@ def convert(
        case _:
            raise RuntimeError(f"Unexpected backend type {backend}")

+    # Translate the CLI engine choice into the matching OCR options object.
+    match ocr_engine:
+        case OcrEngine.EASYOCR:
+            ocr_options = EasyOcrOptions()
+        case OcrEngine.TESSERACT_CLI:
+            ocr_options = TesseractCliOcrOptions()
+        case OcrEngine.TESSERACT:
+            ocr_options = TesseractOcrOptions()
+        case _:
+            # Report the OCR engine here, not the PDF backend matched earlier.
+            raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
+
    pipeline_options = PipelineOptions(
        do_ocr=ocr,
+        ocr_options=ocr_options,
        do_table_structure=True,
    )
    pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@ -110,7 +110,10 @@ class BoundingBox(BaseModel):
            return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)

    def area(self) -> float:
-        return (self.r - self.l) * (self.b - self.t)
+        """Return the area of the box.
+
+        The raw product is negated for boxes with a bottom-left origin.
+        """
+        area = (self.r - self.l) * (self.b - self.t)
+        # presumably `b - t` is negative for bottom-left boxes, so the sign
+        # flip yields a positive area -- confirm against `from_tuple`.
+        if self.coord_origin == CoordOrigin.BOTTOMLEFT:
+            area = -area
+        return area

    def intersection_area_with(self, other: "BoundingBox") -> float:
        # Calculate intersection coordinates
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@ -1,6 +1,7 @@
 from enum import Enum, auto
+from typing import List, Literal, Optional, Union

-from pydantic import BaseModel
+from pydantic import BaseModel, ConfigDict, Field


 class TableFormerMode(str, Enum):
@ -18,8 +19,49 @@ class TableStructureOptions(BaseModel):
    mode: TableFormerMode = TableFormerMode.FAST


+class OcrOptions(BaseModel):
+    """Base class for the options of an OCR engine."""
+
+    # Tag naming the engine; the subclasses pin it to a fixed literal.
+    kind: str
+
+
+class EasyOcrOptions(OcrOptions):
+    """Options for the EasyOCR engine."""
+
+    kind: Literal["easyocr"] = "easyocr"
+    # Languages handed to the EasyOCR reader as its `lang_list`.
+    lang: List[str] = ["fr", "de", "es", "en"]
+    use_gpu: bool = True  # same default as easyocr.Reader
+    # Forwarded to the EasyOCR reader under the same name.
+    model_storage_directory: Optional[str] = None
+    download_enabled: bool = True  # same default as easyocr.Reader
+
+    model_config = ConfigDict(
+        # Reject unknown option names instead of silently ignoring them.
+        extra="forbid",
+        # presumably lifts pydantic's reserved "model_" prefix so that
+        # `model_storage_directory` does not trigger a warning -- confirm.
+        protected_namespaces=(),
+    )
+
+
+class TesseractCliOcrOptions(OcrOptions):
+    """Options for running Tesseract through its command-line binary."""
+
+    kind: Literal["tesseract"] = "tesseract"
+    # Language codes; joined with "+" for the `-l` argument.
+    lang: List[str] = ["fra", "deu", "spa", "eng"]
+    # Executable that is invoked.
+    tesseract_cmd: str = "tesseract"
+    # Passed as `--tessdata-dir` when set.
+    path: Optional[str] = None
+
+    model_config = ConfigDict(
+        # Reject unknown option names instead of silently ignoring them.
+        extra="forbid",
+    )
+
+
+class TesseractOcrOptions(OcrOptions):
+    """Options for using Tesseract through the tesserocr bindings."""
+
+    kind: Literal["tesserocr"] = "tesserocr"
+    # Language codes; joined with "+" before being given to tesserocr.
+    lang: List[str] = ["fra", "deu", "spa", "eng"]
+    # Passed as `path` to the tesserocr API when set.
+    path: Optional[str] = None
+
+    model_config = ConfigDict(
+        # Reject unknown option names instead of silently ignoring them.
+        extra="forbid",
+    )
+
+
 class PipelineOptions(BaseModel):
+    """Switches and per-stage options for the conversion pipeline."""
+
    do_table_structure: bool = True  # True: perform table structure extraction
    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text

    table_structure_options: TableStructureOptions = TableStructureOptions()
+    # Engine-specific OCR settings. The `kind` field discriminates between the
+    # three option classes; EasyOCR is the default.
+    ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
+        Field(EasyOcrOptions(), discriminator="kind")
+    )
--- a/docling/models/base_ocr_model.py
+++ b/docling/models/base_ocr_model.py
@ -3,21 +3,21 @@ import logging
 from abc import abstractmethod
 from typing import Iterable, List, Tuple

-import numpy
 import numpy as np
 from PIL import Image, ImageDraw
 from rtree import index
 from scipy.ndimage import find_objects, label

 from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
+from docling.datamodel.pipeline_options import OcrOptions

 _log = logging.getLogger(__name__)


 class BaseOcrModel:
-    def __init__(self, config):
-        self.config = config
-        self.enabled = config["enabled"]
+    def __init__(self, enabled: bool, options: OcrOptions):
+        """Store the OCR switch and the engine options on the instance.
+
+        Args:
+            enabled: Whether OCR is switched on.
+            options: Engine options, kept as `self.options`.
+        """
+        self.enabled = enabled
+        self.options = options

    # Computes the optimum amount and coordinates of rectangles to OCR on a given page
    def get_ocr_rects(self, page: Page) -> Tuple[bool, List[BoundingBox]]:
--- a/docling/models/easyocr_model.py
+++ b/docling/models/easyocr_model.py
@ -4,21 +4,33 @@ from typing import Iterable
 import numpy

 from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
+from docling.datamodel.pipeline_options import EasyOcrOptions
 from docling.models.base_ocr_model import BaseOcrModel

 _log = logging.getLogger(__name__)


 class EasyOcrModel(BaseOcrModel):
-    def __init__(self, config):
-        super().__init__(config)
+    def __init__(self, enabled: bool, options: EasyOcrOptions):
+        """Set up the model and, when enabled, create the EasyOCR reader.
+
+        Args:
+            enabled: Whether OCR is switched on; EasyOCR is only imported
+                when this is true.
+            options: Languages, GPU switch, model directory and download
+                switch handed to `easyocr.Reader`.
+
+        Raises:
+            ImportError: If enabled and the `easyocr` package is missing.
+        """
+        super().__init__(enabled=enabled, options=options)
+        self.options: EasyOcrOptions

        self.scale = 3  # multiplier for 72 dpi == 216 dpi.

        if self.enabled:
-            import easyocr
+            try:
+                import easyocr
+            except ImportError as exc:
+                raise ImportError(
+                    "EasyOCR is not installed. Please install it via `pip install easyocr` to use this OCR engine. "
+                    "Alternatively, Docling has support for other OCR engines. See the documentation."
+                ) from exc

-            self.reader = easyocr.Reader(config["lang"])
+            self.reader = easyocr.Reader(
+                lang_list=self.options.lang,
+                # Honour the `use_gpu` option instead of always taking the
+                # reader's own default.
+                gpu=self.options.use_gpu,
+                model_storage_directory=self.options.model_storage_directory,
+                download_enabled=self.options.download_enabled,
+            )

    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:

@ -31,6 +43,9 @@ class EasyOcrModel(BaseOcrModel):

            all_ocr_cells = []
            for ocr_rect in ocr_rects:
+                # Skip zero area boxes
+                if ocr_rect.area() == 0:
+                    continue
                high_res_image = page._backend.get_page_image(
                    scale=self.scale, cropbox=ocr_rect
                )
--- a/docling/models/tesseract_ocr_cli_model.py
+++ b/docling/models/tesseract_ocr_cli_model.py
@ -0,0 +1,167 @@
+import io
+import logging
+import tempfile
+from subprocess import PIPE, Popen
+from typing import Iterable, Tuple
+
+import pandas as pd
+
+from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
+from docling.datamodel.pipeline_options import TesseractCliOcrOptions
+from docling.models.base_ocr_model import BaseOcrModel
+
+_log = logging.getLogger(__name__)
+
+
+class TesseractOcrCliModel(BaseOcrModel):
+    """OCR model which shells out to the Tesseract command-line binary."""
+
+    def __init__(self, enabled: bool, options: TesseractCliOcrOptions):
+        """Set up the model and, when enabled, probe the Tesseract binary.
+
+        Args:
+            enabled: Whether OCR is switched on; nothing is probed otherwise.
+            options: Command name, languages and optional tessdata directory.
+
+        Raises:
+            RuntimeError: If enabled and the Tesseract version cannot be
+                queried.
+        """
+        super().__init__(enabled=enabled, options=options)
+        self.options: TesseractCliOcrOptions
+
+        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
+
+        self._name = None
+        self._version = None
+
+        if self.enabled:
+            try:
+                self._get_name_and_version()
+
+            except Exception as exc:
+                raise RuntimeError(
+                    f"Tesseract is not available, aborting: {exc} "
+                    "Install tesseract on your system and make sure the tesseract binary is discoverable. "
+                    "The actual command for Tesseract can be specified in `pipeline_options.ocr_options.tesseract_cmd='tesseract'`. "
+                    "Alternatively, Docling has support for other OCR engines. See the documentation."
+                ) from exc
+
+    def _get_name_and_version(self) -> Tuple[str, str]:
+        """Return the program name and version printed by `--version`.
+
+        The pair is cached on the instance after the first successful call.
+        """
+        if self._name is not None and self._version is not None:
+            return self._name, self._version
+
+        cmd = [self.options.tesseract_cmd, "--version"]
+
+        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
+        # communicate() already waits for the process to terminate.
+        stdout, stderr = proc.communicate()
+
+        # HACK: Windows versions of Tesseract output the version to stdout, Linux versions
+        # to stderr, so check both.
+        version_line = (
+            (stdout.decode("utf8").strip() or stderr.decode("utf8").strip())
+            .split("\n")[0]
+            .strip()
+        )
+
+        # If everything else fails...
+        if not version_line:
+            version_line = "tesseract XXX"
+
+        # Split on the first space only, so that extra tokens on the version
+        # line do not break the unpacking.
+        name, version = version_line.split(" ", 1)
+
+        self._name = name
+        self._version = version
+
+        return name, version
+
+    def _run_tesseract(self, ifilename: str) -> pd.DataFrame:
+        """Run Tesseract on an image file and return the rows carrying text.
+
+        Args:
+            ifilename: Path of the image to recognise.
+
+        Returns:
+            The rows of Tesseract's TSV output whose `text` is non-blank.
+        """
+        import csv
+
+        cmd = [self.options.tesseract_cmd]
+
+        if self.options.lang:
+            cmd += ["-l", "+".join(self.options.lang)]
+        if self.options.path is not None:
+            cmd += ["--tessdata-dir", self.options.path]
+
+        cmd += [ifilename, "stdout", "tsv"]
+        _log.info("command: %s", " ".join(cmd))
+
+        proc = Popen(cmd, stdout=PIPE)
+        output, _ = proc.communicate()
+
+        # Decode the byte string to a regular string
+        decoded_data = output.decode("utf-8")
+
+        # Read the TSV data generated by Tesseract. The text column is forced
+        # to `str` so that purely numeric text does not turn it into a numeric
+        # column (which has no `.str` accessor), and quoting is disabled so
+        # that a quote character in the recognised text is kept literally.
+        df = pd.read_csv(
+            io.StringIO(decoded_data),
+            sep="\t",
+            dtype={"text": str},
+            quoting=csv.QUOTE_NONE,
+        )
+
+        # Filter rows that contain actual text (ignore header or empty rows)
+        has_text = df["text"].notnull() & (df["text"].str.strip() != "")
+        return df[has_text]
+
+    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+        """Add OCR cells to every page of the batch and yield the pages.
+
+        Pages are passed through untouched when the model is disabled.
+        """
+        if not self.enabled:
+            yield from page_batch
+            return
+
+        for page in page_batch:
+            ocr_rects = self.get_ocr_rects(page)
+
+            all_ocr_cells = []
+            for ocr_rect in ocr_rects:
+                # Skip zero area boxes
+                if ocr_rect.area() == 0:
+                    continue
+                high_res_image = page._backend.get_page_image(
+                    scale=self.scale, cropbox=ocr_rect
+                )
+
+                # Save the crop inside a private temporary directory rather
+                # than into a still-open NamedTemporaryFile: a named temporary
+                # file cannot be opened a second time by name on Windows.
+                with tempfile.TemporaryDirectory() as tmp_dir:
+                    fname = f"{tmp_dir}/ocr_rect.png"
+                    high_res_image.save(fname)
+
+                    df = self._run_tesseract(fname)
+
+                for ix, row in df.iterrows():
+                    text = row["text"]
+                    conf = row["conf"]
+
+                    # The TSV gives left/top/width/height in pixels of the
+                    # scaled crop; the box is built with a top-left origin.
+                    left = float(row["left"])
+                    top = float(row["top"])
+                    right = left + float(row["width"])
+                    bottom = top + float(row["height"])
+
+                    cell = OcrCell(
+                        id=ix,
+                        text=text,
+                        confidence=conf / 100.0,
+                        bbox=BoundingBox.from_tuple(
+                            # Undo the render scale, then shift by the crop
+                            # rectangle's own left/top coordinates.
+                            coord=(
+                                (left / self.scale) + ocr_rect.l,
+                                (top / self.scale) + ocr_rect.t,
+                                (right / self.scale) + ocr_rect.l,
+                                (bottom / self.scale) + ocr_rect.t,
+                            ),
+                            origin=CoordOrigin.TOPLEFT,
+                        ),
+                    )
+                    all_ocr_cells.append(cell)
+
+            ## Remove OCR cells which overlap with programmatic cells.
+            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+
+            page.cells.extend(filtered_ocr_cells)
+
+            # DEBUG code:
+            # self.draw_ocr_rects_and_cells(page, ocr_rects)
+
+            yield page
--- a/docling/models/tesseract_ocr_model.py
+++ b/docling/models/tesseract_ocr_model.py
@ -0,0 +1,122 @@
+import logging
+from typing import Iterable
+
+import numpy
+
+from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
+from docling.datamodel.pipeline_options import (
+    TesseractCliOcrOptions,
+    TesseractOcrOptions,
+)
+from docling.models.base_ocr_model import BaseOcrModel
+
+_log = logging.getLogger(__name__)
+
+
+class TesseractOcrModel(BaseOcrModel):
+    """OCR model which drives Tesseract in-process through tesserocr."""
+
+    def __init__(self, enabled: bool, options: TesseractOcrOptions):
+        """Set up the model and, when enabled, create the tesserocr API.
+
+        Args:
+            enabled: Whether OCR is switched on; tesserocr is only imported
+                when this is true.
+            options: Languages and optional data path for the API.
+
+        Raises:
+            ImportError: If enabled and tesserocr cannot be imported or
+                cannot report its Tesseract version.
+        """
+        super().__init__(enabled=enabled, options=options)
+        self.options: TesseractOcrOptions
+
+        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
+        self.reader = None
+
+        if self.enabled:
+            setup_errmsg = (
+                "tesserocr is not correctly installed. "
+                "Please install it via `pip install tesserocr` to use this OCR engine. "
+                "Note that tesserocr might have to be manually compiled for working with "
+                "your Tesseract installation. The Docling documentation provides examples for it. "
+                "Alternatively, Docling has support for other OCR engines. See the documentation."
+            )
+            try:
+                import tesserocr
+            except ImportError as exc:
+                raise ImportError(setup_errmsg) from exc
+
+            try:
+                tesseract_version = tesserocr.tesseract_version()
+                _log.debug("Initializing TesserOCR: %s", tesseract_version)
+            except Exception as exc:
+                raise ImportError(setup_errmsg) from exc
+
+            # Initialize the tesseractAPI; `path` is only given when configured.
+            api_kwargs = {
+                "lang": "+".join(self.options.lang),
+                "psm": tesserocr.PSM.AUTO,
+                "init": True,
+                "oem": tesserocr.OEM.DEFAULT,
+            }
+            if self.options.path is not None:
+                api_kwargs["path"] = self.options.path
+            self.reader = tesserocr.PyTessBaseAPI(**api_kwargs)
+            self.reader_RIL = tesserocr.RIL
+
+    def __del__(self):
+        # `reader` is missing when __init__ failed before assigning it.
+        reader = getattr(self, "reader", None)
+        if reader is not None:
+            # Finalize the tesseractAPI
+            reader.End()
+
+    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+        """Add OCR cells to every page of the batch and yield the pages.
+
+        Pages are passed through untouched when the model is disabled.
+        """
+        if not self.enabled:
+            yield from page_batch
+            return
+
+        for page in page_batch:
+            ocr_rects = self.get_ocr_rects(page)
+
+            all_ocr_cells = []
+            for ocr_rect in ocr_rects:
+                # Skip zero area boxes
+                if ocr_rect.area() == 0:
+                    continue
+                high_res_image = page._backend.get_page_image(
+                    scale=self.scale, cropbox=ocr_rect
+                )
+
+                # Retrieve text snippets with their bounding boxes
+                self.reader.SetImage(high_res_image)
+                boxes = self.reader.GetComponentImages(self.reader_RIL.TEXTLINE, True)
+
+                cells = []
+                for ix, (_, box, _, _) in enumerate(boxes):
+                    # Restrict recognition to the current text line.
+                    self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])
+
+                    # Extract text within the bounding box
+                    text = self.reader.GetUTF8Text().strip()
+                    confidence = self.reader.MeanTextConf()
+                    left = box["x"] / self.scale
+                    bottom = box["y"] / self.scale
+                    right = (box["x"] + box["w"]) / self.scale
+                    top = (box["y"] + box["h"]) / self.scale
+
+                    # NOTE(review): `top` is y + h and `bottom` is y although
+                    # the box is declared TOPLEFT; no `ocr_rect` offset is
+                    # added and the confidence is not divided by 100, unlike
+                    # in the Tesseract CLI model -- confirm this is intended.
+                    cells.append(
+                        OcrCell(
+                            id=ix,
+                            text=text,
+                            confidence=confidence,
+                            bbox=BoundingBox.from_tuple(
+                                coord=(left, top, right, bottom),
+                                origin=CoordOrigin.TOPLEFT,
+                            ),
+                        )
+                    )
+
+                all_ocr_cells.extend(cells)
+
+            ## Remove OCR cells which overlap with programmatic cells.
+            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)
+
+            page.cells.extend(filtered_ocr_cells)
+
+            # DEBUG code:
+            # self.draw_ocr_rects_and_cells(page, ocr_rects)
+
+            yield page
--- a/docling/pipeline/standard_model_pipeline.py
+++ b/docling/pipeline/standard_model_pipeline.py
@ -1,9 +1,17 @@
 from pathlib import Path

-from docling.datamodel.pipeline_options import PipelineOptions
+from docling.datamodel.pipeline_options import (
+    EasyOcrOptions,
+    PipelineOptions,
+    TesseractCliOcrOptions,
+    TesseractOcrOptions,
+)
+from docling.models.base_ocr_model import BaseOcrModel
 from docling.models.easyocr_model import EasyOcrModel
 from docling.models.layout_model import LayoutModel
 from docling.models.table_structure_model import TableStructureModel
+from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
+from docling.models.tesseract_ocr_model import TesseractOcrModel
 from docling.pipeline.base_model_pipeline import BaseModelPipeline


@ -14,19 +22,38 @@ class StandardModelPipeline(BaseModelPipeline):
    def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
        super().__init__(artifacts_path, pipeline_options)

+        ocr_model: BaseOcrModel
+        if isinstance(pipeline_options.ocr_options, EasyOcrOptions):
+            ocr_model = EasyOcrModel(
+                enabled=pipeline_options.do_ocr,
+                options=pipeline_options.ocr_options,
+            )
+        elif isinstance(pipeline_options.ocr_options, TesseractCliOcrOptions):
+            ocr_model = TesseractOcrCliModel(
+                enabled=pipeline_options.do_ocr,
+                options=pipeline_options.ocr_options,
+            )
+        elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions):
+            ocr_model = TesseractOcrModel(
+                enabled=pipeline_options.do_ocr,
+                options=pipeline_options.ocr_options,
+            )
+        else:
+            raise RuntimeError(
+                f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
+            )
+
        self.model_pipe = [
-            EasyOcrModel(
-                config={
-                    "lang": ["fr", "de", "es", "en"],
-                    "enabled": pipeline_options.do_ocr,
-                }
-            ),
+            # OCR
+            ocr_model,
+            # Layout
            LayoutModel(
                config={
                    "artifacts_path": artifacts_path
                    / StandardModelPipeline._layout_model_path
                }
            ),
+            # Table structure
            TableStructureModel(
                config={
                    "artifacts_path": artifacts_path
--- a/examples/custom_convert.py
+++ b/examples/custom_convert.py
@ -8,6 +8,10 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import ConversionStatus, PipelineOptions
 from docling.datamodel.document import ConversionResult, DocumentConversionInput
+from docling.datamodel.pipeline_options import (
+    TesseractCliOcrOptions,
+    TesseractOcrOptions,
+)
 from docling.document_converter import DocumentConverter

 _log = logging.getLogger(__name__)
@ -71,7 +75,7 @@ def main():
    # and PDF Backends for various configurations.
    # Uncomment one section at the time to see the differences in the output.

-    # PyPdfium without OCR
+    # PyPdfium without EasyOCR
    # --------------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr=False
@ -83,7 +87,7 @@ def main():
    #     pdf_backend=PyPdfiumDocumentBackend,
    # )

-    # PyPdfium with OCR
+    # PyPdfium with EasyOCR
    # -----------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr=True
@ -95,7 +99,7 @@ def main():
    #     pdf_backend=PyPdfiumDocumentBackend,
    # )

-    # Docling Parse without OCR
+    # Docling Parse without EasyOCR
    # -------------------------
    pipeline_options = PipelineOptions()
    pipeline_options.do_ocr = False
@ -107,7 +111,7 @@ def main():
        pdf_backend=DoclingParseDocumentBackend,
    )

-    # Docling Parse with OCR
+    # Docling Parse with EasyOCR
    # ----------------------
    # pipeline_options = PipelineOptions()
    # pipeline_options.do_ocr=True
@ -119,6 +123,32 @@ def main():
    #     pdf_backend=DoclingParseDocumentBackend,
    # )

+    # Docling Parse with Tesseract
+    # ----------------------
+    # pipeline_options = PipelineOptions()
+    # pipeline_options.do_ocr = True
+    # pipeline_options.do_table_structure = True
+    # pipeline_options.table_structure_options.do_cell_matching = True
+    # pipeline_options.ocr_options = TesseractOcrOptions()
+
+    # doc_converter = DocumentConverter(
+    #     pipeline_options=pipeline_options,
+    #     pdf_backend=DoclingParseDocumentBackend,
+    # )
+
+    # Docling Parse with Tesseract CLI
+    # ----------------------
+    # pipeline_options = PipelineOptions()
+    # pipeline_options.do_ocr = True
+    # pipeline_options.do_table_structure = True
+    # pipeline_options.table_structure_options.do_cell_matching = True
+    # pipeline_options.ocr_options = TesseractCliOcrOptions()
+
+    # doc_converter = DocumentConverter(
+    #     pipeline_options=pipeline_options,
+    #     pdf_backend=DoclingParseDocumentBackend,
+    # )
+
    ###########################################################################

    # Define input files
--- a/poetry.lock
+++ b/poetry.lock
@ -5929,6 +5929,41 @@ files = [
 doc = ["reno", "sphinx"]
 test = ["pytest", "tornado (>=4.5)", "typeguard"]

+[[package]]
+name = "tesserocr"
+version = "2.7.1"
+description = "A simple, Pillow-friendly, Python wrapper around tesseract-ocr API using Cython"
+optional = true
+python-versions = "*"
+files = [
+    {file = "tesserocr-2.7.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1b8c4828f970af7bcfca83a1fb228aa68a2587299387bc875d0dfad8b6baf8ed"},
+    {file = "tesserocr-2.7.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3bb5d336ebf2cc47cd0d117cadc8b25b2e558f54fb9a2dedaa28a14cb5a6b437"},
+    {file = "tesserocr-2.7.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:3ff7f6d6b5c12dd31b80842eb0892b661a41ca3edf0e6cc1e54ec2c14552ceef"},
+    {file = "tesserocr-2.7.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:ae794c5434373f4afa4c7f8b59f19fde810f8caf096d8bb701a4b2f3a6739460"},
+    {file = "tesserocr-2.7.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0a0895a4d9ff6a34f5a6f203fe0c9899f31d6f2378ae99be80605637b622687b"},
+    {file = "tesserocr-2.7.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c3187d14b95c866aa1d34cc374a53d583e2168742eefe33347e4790af70338e"},
+    {file = "tesserocr-2.7.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ec52be3d82136430081427062ad0211a52fc38fa28fe58e216b89f840354f216"},
+    {file = "tesserocr-2.7.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:44e71b3e8da36b2567760309398689ea9785ee62db3ff21140a9ea6941a233c4"},
+    {file = "tesserocr-2.7.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e31a49d7784e7e52fe656719145c3a872856d67daa9bfb340c2990db00e023e9"},
+    {file = "tesserocr-2.7.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:37abde15c1c940d691305fd87836e4cad25a1434799729c324bbcd2277bcae44"},
+    {file = "tesserocr-2.7.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:1b6349d35d333d420d24acf1953ad6f1d5613ffcde462c62126b68bdfca12753"},
+    {file = "tesserocr-2.7.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:42f009cde8479f3b339da12a8e419fd9559b64b13bc08a248bd0833c6ae94331"},
+    {file = "tesserocr-2.7.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:6e13204b3b92fac76ece6e33f55eba6335b30e379f4a7b75e285c2ad05762027"},
+    {file = "tesserocr-2.7.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:65afdec0c5dc09a4a23a62e65524989cd940af41be1603e251a64ac10de9babf"},
+    {file = "tesserocr-2.7.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4c5f59fb072c90bff8aa6a365fc82b747c2668b7b48233901728b155860d1ff9"},
+    {file = "tesserocr-2.7.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f62d662e3002868384e14e8cd620bdedf34ab9f9fc3ebbce527cfe032a7485ee"},
+    {file = "tesserocr-2.7.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e80051812685bd521bc17cb70cf1480ffbb3e54ccc2883e90d5bcda15f8278ea"},
+    {file = "tesserocr-2.7.1-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:2690cb2330fc9349d68ff027cbdac09693fdda36470836b196c04f16dcc99e9d"},
+    {file = "tesserocr-2.7.1-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d01ebd094103451ecb77b6510ade2f6bb064c51413ff35b135f649f3d6067a67"},
+    {file = "tesserocr-2.7.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f8069ae6cd9ea3c056b6a596bc99f501ee9f95d6fd2928fcaffb9777071c210d"},
+    {file = "tesserocr-2.7.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b2d3d23223d0a448877fb91af83c46ce95ff0a497a82fa93e93068148c9712e5"},
+    {file = "tesserocr-2.7.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ef8a09a44c2e96bab0f40dbf0633767d063680d86b79365b43fc4e1234219694"},
+    {file = "tesserocr-2.7.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:6e613213ea5b64db06f2cba0b93c3656b7e6aec2d9b2d2e929edf49da7143225"},
+    {file = "tesserocr-2.7.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:4a8888b765e26680a6e34b8ec09b7bb85a17e08cea76f0661eafe2a84254562a"},
+    {file = "tesserocr-2.7.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:64f25763e56c4c29b808e59b485c930cac46b6a1ac8eadd994086dc40a29d3a1"},
+    {file = "tesserocr-2.7.1.tar.gz", hash = "sha256:3744c5c8bbabf18172849c7731be00dc2e5e44f8c556d37c850e788794ae0af4"},
+]
+
 [[package]]
 name = "threadpoolctl"
 version = "3.5.0"
@ -6514,6 +6549,11 @@ files = [
    {file = "triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34e509deb77f1c067d8640725ef00c5cbfcb2052a1a3cb6a6d343841f92624eb"},
    {file = "triton-3.0.0-1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bcbf3b1c48af6a28011a5c40a5b3b9b5330530c3827716b5fbf6d7adcc1e53e9"},
    {file = "triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6e5727202f7078c56f91ff13ad0c1abab14a0e7f2c87e91b12b6f64f3e8ae609"},
+    {file = "triton-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39b052da883351fdf6be3d93cedae6db3b8e3988d3b09ed221bccecfa9612230"},
+    {file = "triton-3.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd34f19a8582af96e6291d4afce25dac08cb2a5d218c599163761e8e0827208e"},
+    {file = "triton-3.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d5e10de8c011adeb7c878c6ce0dd6073b14367749e34467f1cff2bde1b78253"},
+    {file = "triton-3.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8903767951bf86ec960b4fe4e21bc970055afc65e9d57e916d79ae3c93665e3"},
+    {file = "triton-3.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41004fb1ae9a53fcb3e970745feb87f0e3c94c6ce1ba86e95fa3b8537894bef7"},
 ]

 [package.dependencies]
@ -7121,7 +7161,10 @@ enabler = ["pytest-enabler (>=2.2)"]
 test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"]
 type = ["pytest-mypy"]

+[extras]
+tesserocr = ["tesserocr"]
+
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "7c5fb235944009b74193d045f36c1be2a8e168393012bf952541e6e7dea08072"
+content-hash = "a9bfb36209f3a9140b6923c51bae8c1e23af5be34e52d9622119a5683f125b2c"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -46,6 +46,7 @@ pydantic-settings = "^2.3.0"
 huggingface_hub = ">=0.23,<1"
 requests = "^2.32.3"
 easyocr = "^1.7"
+tesserocr = { version = "^2.7.1", optional = true }
 docling-parse = "^1.4.1"
 certifi = ">=2024.7.4"
 rtree = "^1.3.0"
@ -81,6 +82,9 @@ langchain-huggingface = "^0.0.3"
 langchain-milvus = "^0.1.4"
 langchain-text-splitters = "^0.2.4"

+[tool.poetry.extras]
+tesserocr = ["tesserocr"]
+
 [tool.poetry.scripts]
 docling = "docling.cli.main:app"

--- a/tests/data_scanned/ocr_test.doctags.txt
+++ b/tests/data_scanned/ocr_test.doctags.txt
@ -0,0 +1,3 @@
+<document>
+<paragraph><location><page_1><loc_12><loc_82><loc_86><loc_91></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</paragraph>
+</document>
--- a/tests/data_scanned/ocr_test.json
+++ b/tests/data_scanned/ocr_test.json
@ -0,0 +1 @@
+{"_name": "", "type": "pdf-document", "description": {"logs": []}, "file-info": {"filename": "ocr_test_8.pdf", "document-hash": "73f23122e9edbdb0a115b448e03c8064a0ea8bdc21d02917ce220cf032454f31", "#-pages": 1, "page-hashes": [{"hash": "8c5c5b766c1bdb92242142ca37260089b02380f9c57729703350f646cdf4771e", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [69.0, 688.58837890625, 509.4446716308594, 767.422119140625], "page": 1, "span": [0, 94]}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "name": "Text"}], "figures": [], "tables": [], "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": []}
--- a/tests/data_scanned/ocr_test.md
+++ b/tests/data_scanned/ocr_test.md
@ -0,0 +1 @@
+Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package
--- a/tests/data_scanned/ocr_test.pages.json
+++ b/tests/data_scanned/ocr_test.pages.json
@ -0,0 +1 @@
+[{"page_no": 0, "page_hash": "8c5c5b766c1bdb92242142ca37260089b02380f9c57729703350f646cdf4771e", "size": {"width": 595.201171875, "height": 841.9216918945312}, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 71.33333333333333, "t": 74.66666666666663, "r": 506.6666666666667, "b": 99.33333333333337, "coord_origin": "1"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.0, "t": 100.66666666666663, "r": 506.6666666666667, "b": 126.66666666666663, "coord_origin": "1"}}, {"id": 2, "text": "package", "bbox": {"l": 70.66666666666667, "t": 128.66666666666663, "r": 154.0, "b": 153.33333333333337, "coord_origin": "1"}}], "predictions": {"layout": {"clusters": [{"id": 0, "label": "Text", "bbox": {"l": 69.0, "t": 74.49958801269531, "r": 509.4446716308594, "b": 153.33333333333337, "coord_origin": "1"}, "confidence": 0.923837423324585, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 71.33333333333333, "t": 74.66666666666663, "r": 506.6666666666667, "b": 99.33333333333337, "coord_origin": "1"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.0, "t": 100.66666666666663, "r": 506.6666666666667, "b": 126.66666666666663, "coord_origin": "1"}}, {"id": 2, "text": "package", "bbox": {"l": 70.66666666666667, "t": 128.66666666666663, "r": 154.0, "b": 153.33333333333337, "coord_origin": "1"}}]}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null}, "assembled": {"elements": [{"label": "Text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "Text", "bbox": {"l": 69.0, "t": 74.49958801269531, "r": 509.4446716308594, "b": 153.33333333333337, "coord_origin": "1"}, "confidence": 0.923837423324585, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 71.33333333333333, "t": 74.66666666666663, "r": 506.6666666666667, "b": 99.33333333333337, "coord_origin": "1"}}, 
{"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.0, "t": 100.66666666666663, "r": 506.6666666666667, "b": 126.66666666666663, "coord_origin": "1"}}, {"id": 2, "text": "package", "bbox": {"l": 70.66666666666667, "t": 128.66666666666663, "r": 154.0, "b": 153.33333333333337, "coord_origin": "1"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "body": [{"label": "Text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "Text", "bbox": {"l": 69.0, "t": 74.49958801269531, "r": 509.4446716308594, "b": 153.33333333333337, "coord_origin": "1"}, "confidence": 0.923837423324585, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 71.33333333333333, "t": 74.66666666666663, "r": 506.6666666666667, "b": 99.33333333333337, "coord_origin": "1"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.0, "t": 100.66666666666663, "r": 506.6666666666667, "b": 126.66666666666663, "coord_origin": "1"}}, {"id": 2, "text": "package", "bbox": {"l": 70.66666666666667, "t": 128.66666666666663, "r": 154.0, "b": 153.33333333333337, "coord_origin": "1"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "headers": []}}]
--- a/tests/data_scanned/ocr_test.pdf
+++ b/tests/data_scanned/ocr_test.pdf
--- a/tests/test_e2e_ocr_conversion.py
+++ b/tests/test_e2e_ocr_conversion.py
@ -0,0 +1,98 @@
+from pathlib import Path
+from typing import List
+
+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import (
+    EasyOcrOptions,
+    OcrOptions,
+    PipelineOptions,
+    TesseractCliOcrOptions,
+    TesseractOcrOptions,
+)
+from docling.document_converter import DocumentConverter
+
+from .verify_utils import verify_conversion_result
+
+GENERATE = False
+
+
+# Debug helper: writes a conversion result to disk for manual inspection.
+def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str | None):
+    """Write the rendered outputs of *doc_result* next to *pdf_path*.
+
+    Four files are written into ``pdf_path.parent``, each named
+    ``<pdf stem>[.<engine>]<ext>`` where ``<ext>`` is one of ``.json``,
+    ``.pages.json``, ``.doctags.txt`` and ``.md``.
+
+    Args:
+        pdf_path: Path of the converted PDF; supplies the directory and stem.
+        doc_result: Conversion result whose renderings and pages are dumped.
+        engine: Optional infix for the file names; ``None`` leaves it out.
+    """
+    import json
+
+    infix = "" if engine is None else f".{engine}"
+
+    def target(ext: str) -> Path:
+        # Every output shares the same directory and "<stem><infix>" prefix.
+        return pdf_path.parent / f"{pdf_path.stem}{infix}{ext}"
+
+    with open(target(".json"), "w") as fd:
+        json.dump(doc_result.render_as_dict(), fd)
+
+    # json.dump receives the model_dump() output of every page.
+    pages = [p.model_dump() for p in doc_result.pages]
+    with open(target(".pages.json"), "w") as fd:
+        json.dump(pages, fd)
+
+    with open(target(".doctags.txt"), "w") as fd:
+        fd.write(doc_result.render_as_doctags())
+
+    with open(target(".md"), "w") as fd:
+        fd.write(doc_result.render_as_markdown())
+
+
+def get_pdf_paths():
+    """Return every ``*.pdf`` found under ``./tests/data_scanned``, sorted.
+
+    The search is recursive, so PDFs in nested folders are included.
+    """
+    scan_root = Path("./tests/data_scanned")
+    return sorted(scan_root.rglob("*.pdf"))
+
+
+def get_converter(ocr_options: OcrOptions):
+    """Build a ``DocumentConverter`` that runs OCR with *ocr_options*.
+
+    OCR, table structure and table cell matching are all switched on, and
+    ``DoclingParseDocumentBackend`` is passed as the PDF backend.
+    """
+    opts = PipelineOptions()
+    opts.do_ocr = True
+    opts.do_table_structure = True
+    opts.table_structure_options.do_cell_matching = True
+    opts.ocr_options = ocr_options
+
+    return DocumentConverter(
+        pdf_backend=DoclingParseDocumentBackend,
+        pipeline_options=opts,
+    )
+
+
+def test_e2e_conversions():
+    """Convert every scanned PDF with each OCR engine and verify the result.
+
+    One converter is built per engine option set and reused for all PDFs.
+    Cell-level comparison is skipped (``skip_cells=True``).
+    """
+    scanned_pdfs = get_pdf_paths()
+
+    engine_options: List[OcrOptions] = [
+        EasyOcrOptions(),
+        TesseractOcrOptions(),
+        TesseractCliOcrOptions(),
+    ]
+
+    for ocr_options in engine_options:
+        print(f"Converting with ocr_engine: {ocr_options.kind}")
+        converter = get_converter(ocr_options=ocr_options)
+
+        for pdf_path in scanned_pdfs:
+            print(f"converting {pdf_path}")
+            doc_result: ConversionResult = converter.convert_single(pdf_path)
+
+            # To dump the outputs for manual inspection, call:
+            #   save_output(pdf_path, doc_result, None)
+
+            # ocr_engine is not passed, so every engine is checked against
+            # the same reference files.
+            verify_conversion_result(
+                input_path=pdf_path,
+                doc_result=doc_result,
+                generate=GENERATE,
+                skip_cells=True,
+            )
--- a/tests/verify_utils.py
+++ b/tests/verify_utils.py
@ -130,7 +130,11 @@ def verify_dt(doc_pred_dt, doc_true_dt):


 def verify_conversion_result(
-    input_path: Path, doc_result: ConversionResult, generate=False
+    input_path: Path,
+    doc_result: ConversionResult,
+    generate: bool = False,
+    ocr_engine: str = None,
+    skip_cells: bool = False,
 ):
    PageList = TypeAdapter(List[Page])

@ -143,10 +147,11 @@ def verify_conversion_result(
    doc_pred_md = doc_result.render_as_markdown()
    doc_pred_dt = doc_result.render_as_doctags()

-    pages_path = input_path.with_suffix(".pages.json")
-    json_path = input_path.with_suffix(".json")
-    md_path = input_path.with_suffix(".md")
-    dt_path = input_path.with_suffix(".doctags.txt")
+    engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"
+    pages_path = input_path.with_suffix(f"{engine_suffix}.pages.json")
+    json_path = input_path.with_suffix(f"{engine_suffix}.json")
+    md_path = input_path.with_suffix(f"{engine_suffix}.md")
+    dt_path = input_path.with_suffix(f"{engine_suffix}.doctags.txt")

    if generate:  # only used when re-generating truth
        with open(pages_path, "w") as fw:
@ -173,9 +178,10 @@ def verify_conversion_result(
        with open(dt_path, "r") as fr:
            doc_true_dt = fr.read()

-        assert verify_cells(
-            doc_pred_pages, doc_true_pages
-        ), f"Mismatch in PDF cell prediction for {input_path}"
+        if not skip_cells:
+            assert verify_cells(
+                doc_pred_pages, doc_true_pages
+            ), f"Mismatch in PDF cell prediction for {input_path}"

        # assert verify_output(
        #    doc_pred, doc_true
				`@ -0,0 +1 @@`
				{"_name": "", "type": "pdf-document", "description": {"logs": []}, "file-info": {"filename": "ocr_test_8.pdf", "document-hash": "73f23122e9edbdb0a115b448e03c8064a0ea8bdc21d02917ce220cf032454f31", "#-pages": 1, "page-hashes": [{"hash": "8c5c5b766c1bdb92242142ca37260089b02380f9c57729703350f646cdf4771e", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [69.0, 688.58837890625, 509.4446716308594, 767.422119140625], "page": 1, "span": [0, 94]}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "name": "Text"}], "figures": [], "tables": [], "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": []}
				`@ -0,0 +1 @@`
				`Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package`
				`@ -0,0 +1 @@`
				[{"page_no": 0, "page_hash": "8c5c5b766c1bdb92242142ca37260089b02380f9c57729703350f646cdf4771e", "size": {"width": 595.201171875, "height": 841.9216918945312}, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 71.33333333333333, "t": 74.66666666666663, "r": 506.6666666666667, "b": 99.33333333333337, "coord_origin": "1"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.0, "t": 100.66666666666663, "r": 506.6666666666667, "b": 126.66666666666663, "coord_origin": "1"}}, {"id": 2, "text": "package", "bbox": {"l": 70.66666666666667, "t": 128.66666666666663, "r": 154.0, "b": 153.33333333333337, "coord_origin": "1"}}], "predictions": {"layout": {"clusters": [{"id": 0, "label": "Text", "bbox": {"l": 69.0, "t": 74.49958801269531, "r": 509.4446716308594, "b": 153.33333333333337, "coord_origin": "1"}, "confidence": 0.923837423324585, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 71.33333333333333, "t": 74.66666666666663, "r": 506.6666666666667, "b": 99.33333333333337, "coord_origin": "1"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.0, "t": 100.66666666666663, "r": 506.6666666666667, "b": 126.66666666666663, "coord_origin": "1"}}, {"id": 2, "text": "package", "bbox": {"l": 70.66666666666667, "t": 128.66666666666663, "r": 154.0, "b": 153.33333333333337, "coord_origin": "1"}}]}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null}, "assembled": {"elements": [{"label": "Text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "Text", "bbox": {"l": 69.0, "t": 74.49958801269531, "r": 509.4446716308594, "b": 153.33333333333337, "coord_origin": "1"}, "confidence": 0.923837423324585, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 71.33333333333333, "t": 74.66666666666663, "r": 506.6666666666667, "b": 99.33333333333337, "coord_origin": 
"1"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.0, "t": 100.66666666666663, "r": 506.6666666666667, "b": 126.66666666666663, "coord_origin": "1"}}, {"id": 2, "text": "package", "bbox": {"l": 70.66666666666667, "t": 128.66666666666663, "r": 154.0, "b": 153.33333333333337, "coord_origin": "1"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "body": [{"label": "Text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "Text", "bbox": {"l": 69.0, "t": 74.49958801269531, "r": 509.4446716308594, "b": 153.33333333333337, "coord_origin": "1"}, "confidence": 0.923837423324585, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 71.33333333333333, "t": 74.66666666666663, "r": 506.6666666666667, "b": 99.33333333333337, "coord_origin": "1"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.0, "t": 100.66666666666663, "r": 506.6666666666667, "b": 126.66666666666663, "coord_origin": "1"}}, {"id": 2, "text": "package", "bbox": {"l": 70.66666666666667, "t": 128.66666666666663, "r": 154.0, "b": 153.33333333333337, "coord_origin": "1"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "headers": []}}]