diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 8e92e76..8c88acc 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -9,6 +9,11 @@ jobs: python-version: ['3.10', '3.11', '3.12'] steps: - uses: actions/checkout@v3 + - name: Install tesseract + run: sudo apt-get install -y tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa libleptonica-dev libtesseract-dev pkg-config + - name: Set TESSDATA_PREFIX + run: | + echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV" - uses: ./.github/actions/setup-poetry with: python-version: ${{ matrix.python-version }} @@ -32,4 +37,4 @@ jobs: poetry run python "$file" || exit 1 done - name: Build with poetry - run: poetry build \ No newline at end of file + run: poetry build diff --git a/README.md b/README.md index f390245..882f3a8 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,79 @@ Works on macOS, Linux and Windows environments. Both x86_64 and arm64 architectu ``` +
+ Alternative OCR engines + + Docling supports multiple OCR engines for processing scanned documents. The current version provides + the following engines. + + | Engine | Installation | Usage | + | ------ | ------------ | ----- | + | [EasyOCR](https://github.com/JaidedAI/EasyOCR) | Default in Docling or via `pip install easyocr`. | `EasyOcrOptions` | + | Tesseract | System dependency. See description for Tesseract and Tesserocr below. | `TesseractOcrOptions` | + | Tesseract CLI | System dependency. See description below. | `TesseractCliOcrOptions` | + + The Docling `DocumentConverter` lets you choose the OCR engine with the `ocr_options` setting. For example: + + ```python + from docling.datamodel.base_models import ConversionStatus, PipelineOptions + from docling.datamodel.pipeline_options import PipelineOptions, EasyOcrOptions, TesseractOcrOptions + from docling.document_converter import DocumentConverter + + pipeline_options = PipelineOptions() + pipeline_options.do_ocr = True + pipeline_options.ocr_options = TesseractOcrOptions() # Use Tesseract + + doc_converter = DocumentConverter( + pipeline_options=pipeline_options, + ) + ``` + + #### Tesseract installation + + [Tesseract](https://github.com/tesseract-ocr/tesseract) is a popular OCR engine which is available + on most operating systems. To use this engine with Docling, Tesseract must be installed on your + system, using the packaging tool of your choice. Below we provide example commands. + After installing Tesseract you are expected to provide the path to its language files using the + `TESSDATA_PREFIX` environment variable (note that it must terminate with a slash `/`). + + For macOS, we recommend using [Homebrew](https://brew.sh/). + + ```console + brew install tesseract leptonica pkg-config + TESSDATA_PREFIX=/opt/homebrew/share/tessdata/ + echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}" + ``` + + For Debian-based systems. 
+ + ```console + apt-get install tesseract-ocr tesseract-ocr-eng libtesseract-dev libleptonica-dev pkg-config + TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$) + echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}" + ``` + + For RHEL systems. + + ```console + dnf install tesseract tesseract-devel tesseract-langpack-eng leptonica-devel + TESSDATA_PREFIX=/usr/share/tesseract/tessdata/ + echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}" + ``` + + #### Linking to Tesseract + The most efficient way to use the Tesseract library is via linking. Docling uses + the [Tesserocr](https://github.com/sirfz/tesserocr) package for this. + + If you run into issues installing Tesserocr, we suggest using the following + installation options: + + ```console + pip uninstall tesserocr + pip install --no-binary :all: tesserocr + ``` +
+
Docling development setup diff --git a/docling/cli/main.py b/docling/cli/main.py index b942d51..e27026d 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -14,7 +14,12 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.base_models import ConversionStatus from docling.datamodel.document import ConversionResult, DocumentConversionInput -from docling.datamodel.pipeline_options import PipelineOptions +from docling.datamodel.pipeline_options import ( + EasyOcrOptions, + PipelineOptions, + TesseractCliOcrOptions, + TesseractOcrOptions, +) from docling.document_converter import DocumentConverter warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch") @@ -53,6 +58,13 @@ class Backend(str, Enum): DOCLING = "docling" +# Define an enum for the ocr engines +class OcrEngine(str, Enum): + EASYOCR = "easyocr" + TESSERACT_CLI = "tesseract_cli" + TESSERACT = "tesseract" + + def export_documents( conv_results: Iterable[ConversionResult], output_dir: Path, @@ -152,6 +164,9 @@ def convert( backend: Annotated[ Backend, typer.Option(..., help="The PDF backend to use.") ] = Backend.DOCLING, + ocr_engine: Annotated[ + OcrEngine, typer.Option(..., help="The OCR engine to use.") + ] = OcrEngine.EASYOCR, output: Annotated[ Path, typer.Option(..., help="Output directory where results are saved.") ] = Path("."), @@ -191,8 +206,19 @@ def convert( case _: raise RuntimeError(f"Unexpected backend type {backend}") + match ocr_engine: + case OcrEngine.EASYOCR: + ocr_options = EasyOcrOptions() + case OcrEngine.TESSERACT_CLI: + ocr_options = TesseractCliOcrOptions() + case OcrEngine.TESSERACT: + ocr_options = TesseractOcrOptions() + case _: + raise RuntimeError(f"Unexpected backend type {backend}") + pipeline_options = PipelineOptions( do_ocr=ocr, + ocr_options=ocr_options, do_table_structure=True, ) 
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index f18dbd7..752e264 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -110,7 +110,10 @@ class BoundingBox(BaseModel): return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin) def area(self) -> float: - return (self.r - self.l) * (self.b - self.t) + area = (self.r - self.l) * (self.b - self.t) + if self.coord_origin == CoordOrigin.BOTTOMLEFT: + area = -area + return area def intersection_area_with(self, other: "BoundingBox") -> float: # Calculate intersection coordinates diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 9ea7a77..2ebff48 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -1,6 +1,7 @@ from enum import Enum, auto +from typing import List, Literal, Optional, Union -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict, Field class TableFormerMode(str, Enum): @@ -18,8 +19,49 @@ class TableStructureOptions(BaseModel): mode: TableFormerMode = TableFormerMode.FAST +class OcrOptions(BaseModel): + kind: str + + +class EasyOcrOptions(OcrOptions): + kind: Literal["easyocr"] = "easyocr" + lang: List[str] = ["fr", "de", "es", "en"] + use_gpu: bool = True # same default as easyocr.Reader + model_storage_directory: Optional[str] = None + download_enabled: bool = True # same default as easyocr.Reader + + model_config = ConfigDict( + extra="forbid", + protected_namespaces=(), + ) + + +class TesseractCliOcrOptions(OcrOptions): + kind: Literal["tesseract"] = "tesseract" + lang: List[str] = ["fra", "deu", "spa", "eng"] + tesseract_cmd: str = "tesseract" + path: Optional[str] = None + + model_config = ConfigDict( + extra="forbid", + ) + + +class TesseractOcrOptions(OcrOptions): + kind: Literal["tesserocr"] = "tesserocr" + lang: List[str] = ["fra", "deu", 
"spa", "eng"] + path: Optional[str] = None + + model_config = ConfigDict( + extra="forbid", + ) + + class PipelineOptions(BaseModel): do_table_structure: bool = True # True: perform table structure extraction do_ocr: bool = True # True: perform OCR, replace programmatic PDF text table_structure_options: TableStructureOptions = TableStructureOptions() + ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = ( + Field(EasyOcrOptions(), discriminator="kind") + ) diff --git a/docling/models/base_ocr_model.py b/docling/models/base_ocr_model.py index 3b3c261..4139d68 100644 --- a/docling/models/base_ocr_model.py +++ b/docling/models/base_ocr_model.py @@ -3,21 +3,21 @@ import logging from abc import abstractmethod from typing import Iterable, List, Tuple -import numpy import numpy as np from PIL import Image, ImageDraw from rtree import index from scipy.ndimage import find_objects, label from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page +from docling.datamodel.pipeline_options import OcrOptions _log = logging.getLogger(__name__) class BaseOcrModel: - def __init__(self, config): - self.config = config - self.enabled = config["enabled"] + def __init__(self, enabled: bool, options: OcrOptions): + self.enabled = enabled + self.options = options # Computes the optimum amount and coordinates of rectangles to OCR on a given page def get_ocr_rects(self, page: Page) -> Tuple[bool, List[BoundingBox]]: diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py index 5fb4066..a4c64a7 100644 --- a/docling/models/easyocr_model.py +++ b/docling/models/easyocr_model.py @@ -4,21 +4,33 @@ from typing import Iterable import numpy from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page +from docling.datamodel.pipeline_options import EasyOcrOptions from docling.models.base_ocr_model import BaseOcrModel _log = logging.getLogger(__name__) class EasyOcrModel(BaseOcrModel): - def 
__init__(self, config): - super().__init__(config) + def __init__(self, enabled: bool, options: EasyOcrOptions): + super().__init__(enabled=enabled, options=options) + self.options: EasyOcrOptions self.scale = 3 # multiplier for 72 dpi == 216 dpi. if self.enabled: - import easyocr + try: + import easyocr + except ImportError: + raise ImportError( + "EasyOCR is not installed. Please install it via `pip install easyocr` to use this OCR engine. " + "Alternatively, Docling has support for other OCR engines. See the documentation." + ) - self.reader = easyocr.Reader(config["lang"]) + self.reader = easyocr.Reader( + lang_list=self.options.lang, + model_storage_directory=self.options.model_storage_directory, + download_enabled=self.options.download_enabled, + ) def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: @@ -31,6 +43,9 @@ class EasyOcrModel(BaseOcrModel): all_ocr_cells = [] for ocr_rect in ocr_rects: + # Skip zero area boxes + if ocr_rect.area() == 0: + continue high_res_image = page._backend.get_page_image( scale=self.scale, cropbox=ocr_rect ) diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py new file mode 100644 index 0000000..c3c1999 --- /dev/null +++ b/docling/models/tesseract_ocr_cli_model.py @@ -0,0 +1,167 @@ +import io +import logging +import tempfile +from subprocess import PIPE, Popen +from typing import Iterable, Tuple + +import pandas as pd + +from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page +from docling.datamodel.pipeline_options import TesseractCliOcrOptions +from docling.models.base_ocr_model import BaseOcrModel + +_log = logging.getLogger(__name__) + + +class TesseractOcrCliModel(BaseOcrModel): + + def __init__(self, enabled: bool, options: TesseractCliOcrOptions): + super().__init__(enabled=enabled, options=options) + self.options: TesseractCliOcrOptions + + self.scale = 3 # multiplier for 72 dpi == 216 dpi. 
+ + self._name = None + self._version = None + + if self.enabled: + try: + self._get_name_and_version() + + except Exception as exc: + raise RuntimeError( + f"Tesseract is not available, aborting: {exc} " + "Install tesseract on your system and the tesseract binary is discoverable. " + "The actual command for Tesseract can be specified in `pipeline_options.ocr_options.tesseract_cmd='tesseract'`. " + "Alternatively, Docling has support for other OCR engines. See the documentation." + ) + + def _get_name_and_version(self) -> Tuple[str, str]: + + if self._name != None and self._version != None: + return self._name, self._version + + cmd = [self.options.tesseract_cmd, "--version"] + + proc = Popen(cmd, stdout=PIPE, stderr=PIPE) + stdout, stderr = proc.communicate() + + proc.wait() + + # HACK: Windows versions of Tesseract output the version to stdout, Linux versions + # to stderr, so check both. + version_line = ( + (stdout.decode("utf8").strip() or stderr.decode("utf8").strip()) + .split("\n")[0] + .strip() + ) + + # If everything else fails... 
+ if not version_line: + version_line = "tesseract XXX" + + name, version = version_line.split(" ") + + self._name = name + self._version = version + + return name, version + + def _run_tesseract(self, ifilename: str): + + cmd = [self.options.tesseract_cmd] + + if self.options.lang is not None and len(self.options.lang) > 0: + cmd.append("-l") + cmd.append("+".join(self.options.lang)) + if self.options.path is not None: + cmd.append("--tessdata-dir") + cmd.append(self.options.path) + + cmd += [ifilename, "stdout", "tsv"] + _log.info("command: {}".format(" ".join(cmd))) + + proc = Popen(cmd, stdout=PIPE) + output, _ = proc.communicate() + + # _log.info(output) + + # Decode the byte string to a regular string + decoded_data = output.decode("utf-8") + # _log.info(decoded_data) + + # Read the TSV file generated by Tesseract + df = pd.read_csv(io.StringIO(decoded_data), sep="\t") + + # Display the dataframe (optional) + # _log.info("df: ", df.head()) + + # Filter rows that contain actual text (ignore header or empty rows) + df_filtered = df[df["text"].notnull() & (df["text"].str.strip() != "")] + + return df_filtered + + def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: + + if not self.enabled: + yield from page_batch + return + + for page in page_batch: + ocr_rects = self.get_ocr_rects(page) + + all_ocr_cells = [] + for ocr_rect in ocr_rects: + # Skip zero area boxes + if ocr_rect.area() == 0: + continue + high_res_image = page._backend.get_page_image( + scale=self.scale, cropbox=ocr_rect + ) + + with tempfile.NamedTemporaryFile(suffix=".png", mode="w") as image_file: + fname = image_file.name + high_res_image.save(fname) + + df = self._run_tesseract(fname) + + # _log.info(df) + + # Print relevant columns (bounding box and text) + for ix, row in df.iterrows(): + text = row["text"] + conf = row["conf"] + + l = float(row["left"]) + b = float(row["top"]) + w = float(row["width"]) + h = float(row["height"]) + + t = b + h + r = l + w + + cell = OcrCell( + 
id=ix, + text=text, + confidence=conf / 100.0, + bbox=BoundingBox.from_tuple( + coord=( + (l / self.scale) + ocr_rect.l, + (b / self.scale) + ocr_rect.t, + (r / self.scale) + ocr_rect.l, + (t / self.scale) + ocr_rect.t, + ), + origin=CoordOrigin.TOPLEFT, + ), + ) + all_ocr_cells.append(cell) + + ## Remove OCR cells which overlap with programmatic cells. + filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells) + + page.cells.extend(filtered_ocr_cells) + + # DEBUG code: + # self.draw_ocr_rects_and_cells(page, ocr_rects) + + yield page diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py new file mode 100644 index 0000000..1b4f6f7 --- /dev/null +++ b/docling/models/tesseract_ocr_model.py @@ -0,0 +1,122 @@ +import logging +from typing import Iterable + +import numpy + +from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page +from docling.datamodel.pipeline_options import TesseractCliOcrOptions +from docling.models.base_ocr_model import BaseOcrModel + +_log = logging.getLogger(__name__) + + +class TesseractOcrModel(BaseOcrModel): + def __init__(self, enabled: bool, options: TesseractCliOcrOptions): + super().__init__(enabled=enabled, options=options) + self.options: TesseractCliOcrOptions + + self.scale = 3 # multiplier for 72 dpi == 216 dpi. + self.reader = None + + if self.enabled: + setup_errmsg = ( + "tesserocr is not correctly installed. " + "Please install it via `pip install tesserocr` to use this OCR engine. " + "Note that tesserocr might have to be manually compiled for working with" + "your Tesseract installation. The Docling documentation provides examples for it. " + "Alternatively, Docling has support for other OCR engines. See the documentation." 
+ ) + try: + import tesserocr + except ImportError: + raise ImportError(setup_errmsg) + + try: + tesseract_version = tesserocr.tesseract_version() + _log.debug("Initializing TesserOCR: %s", tesseract_version) + except: + raise ImportError(setup_errmsg) + + # Initialize the tesseractAPI + lang = "+".join(self.options.lang) + if self.options.path is not None: + self.reader = tesserocr.PyTessBaseAPI( + path=self.options.path, + lang=lang, + psm=tesserocr.PSM.AUTO, + init=True, + oem=tesserocr.OEM.DEFAULT, + ) + else: + self.reader = tesserocr.PyTessBaseAPI( + lang=lang, + psm=tesserocr.PSM.AUTO, + init=True, + oem=tesserocr.OEM.DEFAULT, + ) + self.reader_RIL = tesserocr.RIL + + def __del__(self): + if self.reader is not None: + # Finalize the tesseractAPI + self.reader.End() + + def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]: + + if not self.enabled: + yield from page_batch + return + + for page in page_batch: + ocr_rects = self.get_ocr_rects(page) + + all_ocr_cells = [] + for ocr_rect in ocr_rects: + # Skip zero area boxes + if ocr_rect.area() == 0: + continue + high_res_image = page._backend.get_page_image( + scale=self.scale, cropbox=ocr_rect + ) + + # Retrieve text snippets with their bounding boxes + self.reader.SetImage(high_res_image) + boxes = self.reader.GetComponentImages(self.reader_RIL.TEXTLINE, True) + + cells = [] + for ix, (im, box, _, _) in enumerate(boxes): + # Set the area of interest. 
Tesseract uses Bottom-Left for the origin + self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"]) + + # Extract text within the bounding box + text = self.reader.GetUTF8Text().strip() + confidence = self.reader.MeanTextConf() + left = box["x"] / self.scale + bottom = box["y"] / self.scale + right = (box["x"] + box["w"]) / self.scale + top = (box["y"] + box["h"]) / self.scale + + cells.append( + OcrCell( + id=ix, + text=text, + confidence=confidence, + bbox=BoundingBox.from_tuple( + coord=(left, top, right, bottom), + origin=CoordOrigin.TOPLEFT, + ), + ) + ) + + # del high_res_image + all_ocr_cells.extend(cells) + + ## Remove OCR cells which overlap with programmatic cells. + filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells) + + page.cells.extend(filtered_ocr_cells) + + # DEBUG code: + # self.draw_ocr_rects_and_cells(page, ocr_rects) + + yield page diff --git a/docling/pipeline/standard_model_pipeline.py b/docling/pipeline/standard_model_pipeline.py index a68318b..3cbd87d 100644 --- a/docling/pipeline/standard_model_pipeline.py +++ b/docling/pipeline/standard_model_pipeline.py @@ -1,9 +1,17 @@ from pathlib import Path -from docling.datamodel.pipeline_options import PipelineOptions +from docling.datamodel.pipeline_options import ( + EasyOcrOptions, + PipelineOptions, + TesseractCliOcrOptions, + TesseractOcrOptions, +) +from docling.models.base_ocr_model import BaseOcrModel from docling.models.easyocr_model import EasyOcrModel from docling.models.layout_model import LayoutModel from docling.models.table_structure_model import TableStructureModel +from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel +from docling.models.tesseract_ocr_model import TesseractOcrModel from docling.pipeline.base_model_pipeline import BaseModelPipeline @@ -14,19 +22,38 @@ class StandardModelPipeline(BaseModelPipeline): def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions): super().__init__(artifacts_path, 
pipeline_options) + ocr_model: BaseOcrModel + if isinstance(pipeline_options.ocr_options, EasyOcrOptions): + ocr_model = EasyOcrModel( + enabled=pipeline_options.do_ocr, + options=pipeline_options.ocr_options, + ) + elif isinstance(pipeline_options.ocr_options, TesseractCliOcrOptions): + ocr_model = TesseractOcrCliModel( + enabled=pipeline_options.do_ocr, + options=pipeline_options.ocr_options, + ) + elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions): + ocr_model = TesseractOcrModel( + enabled=pipeline_options.do_ocr, + options=pipeline_options.ocr_options, + ) + else: + raise RuntimeError( + f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}." + ) + self.model_pipe = [ - EasyOcrModel( - config={ - "lang": ["fr", "de", "es", "en"], - "enabled": pipeline_options.do_ocr, - } - ), + # OCR + ocr_model, + # Layout LayoutModel( config={ "artifacts_path": artifacts_path / StandardModelPipeline._layout_model_path } ), + # Table structure TableStructureModel( config={ "artifacts_path": artifacts_path diff --git a/examples/custom_convert.py b/examples/custom_convert.py index 63c8beb..e386bb3 100644 --- a/examples/custom_convert.py +++ b/examples/custom_convert.py @@ -8,6 +8,10 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.base_models import ConversionStatus, PipelineOptions from docling.datamodel.document import ConversionResult, DocumentConversionInput +from docling.datamodel.pipeline_options import ( + TesseractCliOcrOptions, + TesseractOcrOptions, +) from docling.document_converter import DocumentConverter _log = logging.getLogger(__name__) @@ -71,7 +75,7 @@ def main(): # and PDF Backends for various configurations. # Uncomment one section at the time to see the differences in the output. 
- # PyPdfium without OCR + # PyPdfium without EasyOCR # -------------------- # pipeline_options = PipelineOptions() # pipeline_options.do_ocr=False @@ -83,7 +87,7 @@ def main(): # pdf_backend=PyPdfiumDocumentBackend, # ) - # PyPdfium with OCR + # PyPdfium with EasyOCR # ----------------- # pipeline_options = PipelineOptions() # pipeline_options.do_ocr=True @@ -95,7 +99,7 @@ def main(): # pdf_backend=PyPdfiumDocumentBackend, # ) - # Docling Parse without OCR + # Docling Parse without EasyOCR # ------------------------- pipeline_options = PipelineOptions() pipeline_options.do_ocr = False @@ -107,7 +111,7 @@ def main(): pdf_backend=DoclingParseDocumentBackend, ) - # Docling Parse with OCR + # Docling Parse with EasyOCR # ---------------------- # pipeline_options = PipelineOptions() # pipeline_options.do_ocr=True @@ -119,6 +123,32 @@ def main(): # pdf_backend=DoclingParseDocumentBackend, # ) + # Docling Parse with Tesseract + # ---------------------- + # pipeline_options = PipelineOptions() + # pipeline_options.do_ocr = True + # pipeline_options.do_table_structure = True + # pipeline_options.table_structure_options.do_cell_matching = True + # pipeline_options.ocr_options = TesseractOcrOptions() + + # doc_converter = DocumentConverter( + # pipeline_options=pipeline_options, + # pdf_backend=DoclingParseDocumentBackend, + # ) + + # Docling Parse with Tesseract CLI + # ---------------------- + # pipeline_options = PipelineOptions() + # pipeline_options.do_ocr = True + # pipeline_options.do_table_structure = True + # pipeline_options.table_structure_options.do_cell_matching = True + # pipeline_options.ocr_options = TesseractCliOcrOptions() + + # doc_converter = DocumentConverter( + # pipeline_options=pipeline_options, + # pdf_backend=DoclingParseDocumentBackend, + # ) + ########################################################################### # Define input files diff --git a/poetry.lock b/poetry.lock index 7733ecb..27fac6b 100644 --- a/poetry.lock +++ b/poetry.lock @@ 
-5929,6 +5929,41 @@ files = [ doc = ["reno", "sphinx"] test = ["pytest", "tornado (>=4.5)", "typeguard"] +[[package]] +name = "tesserocr" +version = "2.7.1" +description = "A simple, Pillow-friendly, Python wrapper around tesseract-ocr API using Cython" +optional = true +python-versions = "*" +files = [ + {file = "tesserocr-2.7.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1b8c4828f970af7bcfca83a1fb228aa68a2587299387bc875d0dfad8b6baf8ed"}, + {file = "tesserocr-2.7.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3bb5d336ebf2cc47cd0d117cadc8b25b2e558f54fb9a2dedaa28a14cb5a6b437"}, + {file = "tesserocr-2.7.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:3ff7f6d6b5c12dd31b80842eb0892b661a41ca3edf0e6cc1e54ec2c14552ceef"}, + {file = "tesserocr-2.7.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:ae794c5434373f4afa4c7f8b59f19fde810f8caf096d8bb701a4b2f3a6739460"}, + {file = "tesserocr-2.7.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0a0895a4d9ff6a34f5a6f203fe0c9899f31d6f2378ae99be80605637b622687b"}, + {file = "tesserocr-2.7.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c3187d14b95c866aa1d34cc374a53d583e2168742eefe33347e4790af70338e"}, + {file = "tesserocr-2.7.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ec52be3d82136430081427062ad0211a52fc38fa28fe58e216b89f840354f216"}, + {file = "tesserocr-2.7.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:44e71b3e8da36b2567760309398689ea9785ee62db3ff21140a9ea6941a233c4"}, + {file = "tesserocr-2.7.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e31a49d7784e7e52fe656719145c3a872856d67daa9bfb340c2990db00e023e9"}, + {file = "tesserocr-2.7.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:37abde15c1c940d691305fd87836e4cad25a1434799729c324bbcd2277bcae44"}, + {file = "tesserocr-2.7.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:1b6349d35d333d420d24acf1953ad6f1d5613ffcde462c62126b68bdfca12753"}, + {file = 
"tesserocr-2.7.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:42f009cde8479f3b339da12a8e419fd9559b64b13bc08a248bd0833c6ae94331"}, + {file = "tesserocr-2.7.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:6e13204b3b92fac76ece6e33f55eba6335b30e379f4a7b75e285c2ad05762027"}, + {file = "tesserocr-2.7.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:65afdec0c5dc09a4a23a62e65524989cd940af41be1603e251a64ac10de9babf"}, + {file = "tesserocr-2.7.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4c5f59fb072c90bff8aa6a365fc82b747c2668b7b48233901728b155860d1ff9"}, + {file = "tesserocr-2.7.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f62d662e3002868384e14e8cd620bdedf34ab9f9fc3ebbce527cfe032a7485ee"}, + {file = "tesserocr-2.7.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e80051812685bd521bc17cb70cf1480ffbb3e54ccc2883e90d5bcda15f8278ea"}, + {file = "tesserocr-2.7.1-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:2690cb2330fc9349d68ff027cbdac09693fdda36470836b196c04f16dcc99e9d"}, + {file = "tesserocr-2.7.1-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d01ebd094103451ecb77b6510ade2f6bb064c51413ff35b135f649f3d6067a67"}, + {file = "tesserocr-2.7.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f8069ae6cd9ea3c056b6a596bc99f501ee9f95d6fd2928fcaffb9777071c210d"}, + {file = "tesserocr-2.7.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b2d3d23223d0a448877fb91af83c46ce95ff0a497a82fa93e93068148c9712e5"}, + {file = "tesserocr-2.7.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ef8a09a44c2e96bab0f40dbf0633767d063680d86b79365b43fc4e1234219694"}, + {file = "tesserocr-2.7.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:6e613213ea5b64db06f2cba0b93c3656b7e6aec2d9b2d2e929edf49da7143225"}, + {file = "tesserocr-2.7.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:4a8888b765e26680a6e34b8ec09b7bb85a17e08cea76f0661eafe2a84254562a"}, + {file = "tesserocr-2.7.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = 
"sha256:64f25763e56c4c29b808e59b485c930cac46b6a1ac8eadd994086dc40a29d3a1"}, + {file = "tesserocr-2.7.1.tar.gz", hash = "sha256:3744c5c8bbabf18172849c7731be00dc2e5e44f8c556d37c850e788794ae0af4"}, +] + [[package]] name = "threadpoolctl" version = "3.5.0" @@ -6514,6 +6549,11 @@ files = [ {file = "triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34e509deb77f1c067d8640725ef00c5cbfcb2052a1a3cb6a6d343841f92624eb"}, {file = "triton-3.0.0-1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bcbf3b1c48af6a28011a5c40a5b3b9b5330530c3827716b5fbf6d7adcc1e53e9"}, {file = "triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6e5727202f7078c56f91ff13ad0c1abab14a0e7f2c87e91b12b6f64f3e8ae609"}, + {file = "triton-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39b052da883351fdf6be3d93cedae6db3b8e3988d3b09ed221bccecfa9612230"}, + {file = "triton-3.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd34f19a8582af96e6291d4afce25dac08cb2a5d218c599163761e8e0827208e"}, + {file = "triton-3.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d5e10de8c011adeb7c878c6ce0dd6073b14367749e34467f1cff2bde1b78253"}, + {file = "triton-3.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8903767951bf86ec960b4fe4e21bc970055afc65e9d57e916d79ae3c93665e3"}, + {file = "triton-3.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41004fb1ae9a53fcb3e970745feb87f0e3c94c6ce1ba86e95fa3b8537894bef7"}, ] [package.dependencies] @@ -7121,7 +7161,10 @@ enabler = ["pytest-enabler (>=2.2)"] test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"] type = ["pytest-mypy"] +[extras] +tesserocr = ["tesserocr"] + [metadata] lock-version = "2.0" python-versions = "^3.10" 
-content-hash = "7c5fb235944009b74193d045f36c1be2a8e168393012bf952541e6e7dea08072" +content-hash = "a9bfb36209f3a9140b6923c51bae8c1e23af5be34e52d9622119a5683f125b2c" diff --git a/pyproject.toml b/pyproject.toml index f512a19..41d21cf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,7 @@ pydantic-settings = "^2.3.0" huggingface_hub = ">=0.23,<1" requests = "^2.32.3" easyocr = "^1.7" +tesserocr = { version = "^2.7.1", optional = true } docling-parse = "^1.4.1" certifi = ">=2024.7.4" rtree = "^1.3.0" @@ -81,6 +82,9 @@ langchain-huggingface = "^0.0.3" langchain-milvus = "^0.1.4" langchain-text-splitters = "^0.2.4" +[tool.poetry.extras] +tesserocr = ["tesserocr"] + [tool.poetry.scripts] docling = "docling.cli.main:app" diff --git a/tests/data_scanned/ocr_test.doctags.txt b/tests/data_scanned/ocr_test.doctags.txt new file mode 100644 index 0000000..7cd5351 --- /dev/null +++ b/tests/data_scanned/ocr_test.doctags.txt @@ -0,0 +1,3 @@ + +Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package + \ No newline at end of file diff --git a/tests/data_scanned/ocr_test.json b/tests/data_scanned/ocr_test.json new file mode 100644 index 0000000..bf0fb86 --- /dev/null +++ b/tests/data_scanned/ocr_test.json @@ -0,0 +1 @@ +{"_name": "", "type": "pdf-document", "description": {"logs": []}, "file-info": {"filename": "ocr_test_8.pdf", "document-hash": "73f23122e9edbdb0a115b448e03c8064a0ea8bdc21d02917ce220cf032454f31", "#-pages": 1, "page-hashes": [{"hash": "8c5c5b766c1bdb92242142ca37260089b02380f9c57729703350f646cdf4771e", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [69.0, 688.58837890625, 509.4446716308594, 767.422119140625], "page": 1, "span": [0, 94]}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "name": "Text"}], "figures": [], "tables": [], "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, 
"page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": []} \ No newline at end of file diff --git a/tests/data_scanned/ocr_test.md b/tests/data_scanned/ocr_test.md new file mode 100644 index 0000000..4289654 --- /dev/null +++ b/tests/data_scanned/ocr_test.md @@ -0,0 +1 @@ +Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package \ No newline at end of file diff --git a/tests/data_scanned/ocr_test.pages.json b/tests/data_scanned/ocr_test.pages.json new file mode 100644 index 0000000..de3f5f5 --- /dev/null +++ b/tests/data_scanned/ocr_test.pages.json @@ -0,0 +1 @@ +[{"page_no": 0, "page_hash": "8c5c5b766c1bdb92242142ca37260089b02380f9c57729703350f646cdf4771e", "size": {"width": 595.201171875, "height": 841.9216918945312}, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 71.33333333333333, "t": 74.66666666666663, "r": 506.6666666666667, "b": 99.33333333333337, "coord_origin": "1"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.0, "t": 100.66666666666663, "r": 506.6666666666667, "b": 126.66666666666663, "coord_origin": "1"}}, {"id": 2, "text": "package", "bbox": {"l": 70.66666666666667, "t": 128.66666666666663, "r": 154.0, "b": 153.33333333333337, "coord_origin": "1"}}], "predictions": {"layout": {"clusters": [{"id": 0, "label": "Text", "bbox": {"l": 69.0, "t": 74.49958801269531, "r": 509.4446716308594, "b": 153.33333333333337, "coord_origin": "1"}, "confidence": 0.923837423324585, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 71.33333333333333, "t": 74.66666666666663, "r": 506.6666666666667, "b": 99.33333333333337, "coord_origin": "1"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.0, "t": 100.66666666666663, "r": 506.6666666666667, "b": 126.66666666666663, "coord_origin": "1"}}, {"id": 2, "text": "package", "bbox": {"l": 70.66666666666667, "t": 
128.66666666666663, "r": 154.0, "b": 153.33333333333337, "coord_origin": "1"}}]}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null}, "assembled": {"elements": [{"label": "Text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "Text", "bbox": {"l": 69.0, "t": 74.49958801269531, "r": 509.4446716308594, "b": 153.33333333333337, "coord_origin": "1"}, "confidence": 0.923837423324585, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 71.33333333333333, "t": 74.66666666666663, "r": 506.6666666666667, "b": 99.33333333333337, "coord_origin": "1"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.0, "t": 100.66666666666663, "r": 506.6666666666667, "b": 126.66666666666663, "coord_origin": "1"}}, {"id": 2, "text": "package", "bbox": {"l": 70.66666666666667, "t": 128.66666666666663, "r": 154.0, "b": 153.33333333333337, "coord_origin": "1"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "body": [{"label": "Text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "Text", "bbox": {"l": 69.0, "t": 74.49958801269531, "r": 509.4446716308594, "b": 153.33333333333337, "coord_origin": "1"}, "confidence": 0.923837423324585, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 71.33333333333333, "t": 74.66666666666663, "r": 506.6666666666667, "b": 99.33333333333337, "coord_origin": "1"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.0, "t": 100.66666666666663, "r": 506.6666666666667, "b": 126.66666666666663, "coord_origin": "1"}}, {"id": 2, "text": "package", "bbox": {"l": 70.66666666666667, "t": 128.66666666666663, "r": 154.0, "b": 153.33333333333337, "coord_origin": "1"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "headers": []}}] \ No newline at end of 
file diff --git a/tests/data_scanned/ocr_test.pdf b/tests/data_scanned/ocr_test.pdf new file mode 100644 index 0000000..b79f3c2 Binary files /dev/null and b/tests/data_scanned/ocr_test.pdf differ diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py new file mode 100644 index 0000000..96bc087 --- /dev/null +++ b/tests/test_e2e_ocr_conversion.py @@ -0,0 +1,98 @@ +from pathlib import Path +from typing import List + +from docling.backend.docling_parse_backend import DoclingParseDocumentBackend +from docling.datamodel.document import ConversionResult +from docling.datamodel.pipeline_options import ( + EasyOcrOptions, + OcrOptions, + PipelineOptions, + TesseractCliOcrOptions, + TesseractOcrOptions, +) +from docling.document_converter import DocumentConverter + +from .verify_utils import verify_conversion_result + +GENERATE = False + + +# Debug +def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str): + r""" """ + import json + import os + + parent = pdf_path.parent + eng = "" if engine is None else f".{engine}" + + dict_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.json") + with open(dict_fn, "w") as fd: + json.dump(doc_result.render_as_dict(), fd) + + pages_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.pages.json") + pages = [p.model_dump() for p in doc_result.pages] + with open(pages_fn, "w") as fd: + json.dump(pages, fd) + + doctags_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.doctags.txt") + with open(doctags_fn, "w") as fd: + fd.write(doc_result.render_as_doctags()) + + md_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.md") + with open(md_fn, "w") as fd: + fd.write(doc_result.render_as_markdown()) + + +def get_pdf_paths(): + # Define the directory you want to search + directory = Path("./tests/data_scanned") + + # List all PDF files in the directory and its subdirectories + pdf_files = sorted(directory.rglob("*.pdf")) + return pdf_files + + +def get_converter(ocr_options: OcrOptions): + pipeline_options = 
PipelineOptions() + pipeline_options.do_ocr = True + pipeline_options.do_table_structure = True + pipeline_options.table_structure_options.do_cell_matching = True + pipeline_options.ocr_options = ocr_options + + converter = DocumentConverter( + pipeline_options=pipeline_options, + pdf_backend=DoclingParseDocumentBackend, + ) + + return converter + + +def test_e2e_conversions(): + + pdf_paths = get_pdf_paths() + + engines: List[OcrOptions] = [ + EasyOcrOptions(), + TesseractOcrOptions(), + TesseractCliOcrOptions(), + ] + + for ocr_options in engines: + print(f"Converting with ocr_engine: {ocr_options.kind}") + converter = get_converter(ocr_options=ocr_options) + for pdf_path in pdf_paths: + print(f"converting {pdf_path}") + + doc_result: ConversionResult = converter.convert_single(pdf_path) + + # Save conversions + # save_output(pdf_path, doc_result, None) + + # Debug + verify_conversion_result( + input_path=pdf_path, + doc_result=doc_result, + generate=GENERATE, + skip_cells=True, + ) diff --git a/tests/verify_utils.py b/tests/verify_utils.py index a0b0f0e..082b7c7 100644 --- a/tests/verify_utils.py +++ b/tests/verify_utils.py @@ -130,7 +130,11 @@ def verify_dt(doc_pred_dt, doc_true_dt): def verify_conversion_result( - input_path: Path, doc_result: ConversionResult, generate=False + input_path: Path, + doc_result: ConversionResult, + generate: bool = False, + ocr_engine: str = None, + skip_cells: bool = False, ): PageList = TypeAdapter(List[Page]) @@ -143,10 +147,11 @@ def verify_conversion_result( doc_pred_md = doc_result.render_as_markdown() doc_pred_dt = doc_result.render_as_doctags() - pages_path = input_path.with_suffix(".pages.json") - json_path = input_path.with_suffix(".json") - md_path = input_path.with_suffix(".md") - dt_path = input_path.with_suffix(".doctags.txt") + engine_suffix = "" if ocr_engine is None else f".{ocr_engine}" + pages_path = input_path.with_suffix(f"{engine_suffix}.pages.json") + json_path = 
input_path.with_suffix(f"{engine_suffix}.json") + md_path = input_path.with_suffix(f"{engine_suffix}.md") + dt_path = input_path.with_suffix(f"{engine_suffix}.doctags.txt") if generate: # only used when re-generating truth with open(pages_path, "w") as fw: @@ -173,9 +178,10 @@ def verify_conversion_result( with open(dt_path, "r") as fr: doc_true_dt = fr.read() - assert verify_cells( - doc_pred_pages, doc_true_pages - ), f"Mismatch in PDF cell prediction for {input_path}" + if not skip_cells: + assert verify_cells( + doc_pred_pages, doc_true_pages + ), f"Mismatch in PDF cell prediction for {input_path}" # assert verify_output( # doc_pred, doc_true