feat: add options for choosing OCR engines (#118)
--------- Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> Signed-off-by: Peter Staar <taa@zurich.ibm.com> Co-authored-by: Nikos Livathinos <nli@zurich.ibm.com> Co-authored-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
d412c363d7
commit
f96ea86a00
7
.github/workflows/checks.yml
vendored
7
.github/workflows/checks.yml
vendored
@ -9,6 +9,11 @@ jobs:
|
||||
python-version: ['3.10', '3.11', '3.12']
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Install tesseract
|
||||
run: sudo apt-get install -y tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa libleptonica-dev libtesseract-dev pkg-config
|
||||
- name: Set TESSDATA_PREFIX
|
||||
run: |
|
||||
echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
|
||||
- uses: ./.github/actions/setup-poetry
|
||||
with:
|
||||
python-version: ${{ matrix.python-version }}
|
||||
@ -32,4 +37,4 @@ jobs:
|
||||
poetry run python "$file" || exit 1
|
||||
done
|
||||
- name: Build with poetry
|
||||
run: poetry build
|
||||
run: poetry build
|
||||
|
73
README.md
73
README.md
@ -52,6 +52,79 @@ Works on macOS, Linux and Windows environments. Both x86_64 and arm64 architectu
|
||||
```
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary><b>Alternative OCR engines</b></summary>
|
||||
|
||||
Docling supports multiple OCR engines for processing scanned documents. The current version provides
|
||||
the following engines.
|
||||
|
||||
| Engine | Installation | Usage |
|
||||
| ------ | ------------ | ----- |
|
||||
| [EasyOCR](https://github.com/JaidedAI/EasyOCR) | Default in Docling or via `pip install easyocr`. | `EasyOcrOptions` |
|
||||
| Tesseract | System dependency. See description for Tesseract and Tesserocr below. | `TesseractOcrOptions` |
|
||||
| Tesseract CLI | System dependency. See description below. | `TesseractCliOcrOptions` |
|
||||
|
||||
The Docling `DocumentConverter` lets you choose the OCR engine via the `ocr_options` setting. For example:
|
||||
|
||||
```python
|
||||
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
|
||||
from docling.datamodel.pipeline_options import PipelineOptions, EasyOcrOptions, TesseractOcrOptions
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
pipeline_options = PipelineOptions()
|
||||
pipeline_options.do_ocr = True
|
||||
pipeline_options.ocr_options = TesseractOcrOptions() # Use Tesseract
|
||||
|
||||
doc_converter = DocumentConverter(
|
||||
pipeline_options=pipeline_options,
|
||||
)
|
||||
```
|
||||
|
||||
#### Tesseract installation
|
||||
|
||||
[Tesseract](https://github.com/tesseract-ocr/tesseract) is a popular OCR engine which is available
|
||||
on most operating systems. To use this engine with Docling, Tesseract must be installed on your
|
||||
system, using the packaging tool of your choice. Below we provide example commands.
|
||||
After installing Tesseract you are expected to provide the path to its language files using the
|
||||
`TESSDATA_PREFIX` environment variable (note that it must terminate with a slash `/`).
|
||||
|
||||
For macOS, we recommend using [Homebrew](https://brew.sh/).
|
||||
|
||||
```console
|
||||
brew install tesseract leptonica pkg-config
|
||||
TESSDATA_PREFIX=/opt/homebrew/share/tessdata/
|
||||
echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
|
||||
```
|
||||
|
||||
For Debian-based systems.
|
||||
|
||||
```console
|
||||
apt-get install tesseract-ocr tesseract-ocr-eng libtesseract-dev libleptonica-dev pkg-config
|
||||
TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)
|
||||
echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
|
||||
```
|
||||
|
||||
For RHEL systems.
|
||||
|
||||
```console
|
||||
dnf install tesseract tesseract-devel tesseract-langpack-eng leptonica-devel
|
||||
TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
|
||||
echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
|
||||
```
|
||||
|
||||
#### Linking to Tesseract
|
||||
The most efficient way to use the Tesseract library is via linking. Docling uses
|
||||
the [Tesserocr](https://github.com/sirfz/tesserocr) package for this.
|
||||
|
||||
If you run into issues installing Tesserocr, we suggest using the following
|
||||
installation options:
|
||||
|
||||
```console
|
||||
pip uninstall tesserocr
|
||||
pip install --no-binary :all: tesserocr
|
||||
```
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary><b>Docling development setup</b></summary>
|
||||
|
||||
|
@ -14,7 +14,12 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
PipelineOptions,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
||||
@ -53,6 +58,13 @@ class Backend(str, Enum):
|
||||
DOCLING = "docling"
|
||||
|
||||
|
||||
# Define an enum for the ocr engines
|
||||
class OcrEngine(str, Enum):
    """OCR engines that can be selected for a conversion run.

    The members are plain strings (the class mixes in ``str``), so each
    member compares equal to its value.
    """

    # EasyOCR engine.
    EASYOCR = "easyocr"
    # Tesseract invoked through its command-line binary.
    TESSERACT_CLI = "tesseract_cli"
    # Tesseract used through the tesserocr library bindings.
    TESSERACT = "tesseract"
|
||||
|
||||
|
||||
def export_documents(
|
||||
conv_results: Iterable[ConversionResult],
|
||||
output_dir: Path,
|
||||
@ -152,6 +164,9 @@ def convert(
|
||||
backend: Annotated[
|
||||
Backend, typer.Option(..., help="The PDF backend to use.")
|
||||
] = Backend.DOCLING,
|
||||
ocr_engine: Annotated[
|
||||
OcrEngine, typer.Option(..., help="The OCR engine to use.")
|
||||
] = OcrEngine.EASYOCR,
|
||||
output: Annotated[
|
||||
Path, typer.Option(..., help="Output directory where results are saved.")
|
||||
] = Path("."),
|
||||
@ -191,8 +206,19 @@ def convert(
|
||||
case _:
|
||||
raise RuntimeError(f"Unexpected backend type {backend}")
|
||||
|
||||
match ocr_engine:
|
||||
case OcrEngine.EASYOCR:
|
||||
ocr_options = EasyOcrOptions()
|
||||
case OcrEngine.TESSERACT_CLI:
|
||||
ocr_options = TesseractCliOcrOptions()
|
||||
case OcrEngine.TESSERACT:
|
||||
ocr_options = TesseractOcrOptions()
|
||||
case _:
|
||||
raise RuntimeError(f"Unexpected backend type {backend}")
|
||||
|
||||
pipeline_options = PipelineOptions(
|
||||
do_ocr=ocr,
|
||||
ocr_options=ocr_options,
|
||||
do_table_structure=True,
|
||||
)
|
||||
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
|
||||
|
@ -110,7 +110,10 @@ class BoundingBox(BaseModel):
|
||||
return BoundingBox(l=l, t=t, r=r, b=b, coord_origin=origin)
|
||||
|
||||
def area(self) -> float:
    """Return the area of the box, corrected for the coordinate origin.

    The raw product ``(r - l) * (b - t)`` is negated when the box uses a
    bottom-left origin, so that boxes in either convention report an area
    with the same sign (presumably because ``b < t`` in bottom-left
    coordinates -- verify against the ``CoordOrigin`` definition).

    Returns:
        The signed product of width and height after the origin correction.
    """
    area = (self.r - self.l) * (self.b - self.t)
    if self.coord_origin == CoordOrigin.BOTTOMLEFT:
        area = -area
    return area
|
||||
|
||||
def intersection_area_with(self, other: "BoundingBox") -> float:
|
||||
# Calculate intersection coordinates
|
||||
|
@ -1,6 +1,7 @@
|
||||
from enum import Enum, auto
|
||||
from typing import List, Literal, Optional, Union
|
||||
|
||||
from pydantic import BaseModel
|
||||
from pydantic import BaseModel, ConfigDict, Field
|
||||
|
||||
|
||||
class TableFormerMode(str, Enum):
|
||||
@ -18,8 +19,49 @@ class TableStructureOptions(BaseModel):
|
||||
mode: TableFormerMode = TableFormerMode.FAST
|
||||
|
||||
|
||||
class OcrOptions(BaseModel):
    """Base model shared by the option classes of every OCR engine."""

    # Identifier of the OCR engine; subclasses in this module narrow it to a
    # fixed ``Literal`` value.
    kind: str
|
||||
|
||||
|
||||
class EasyOcrOptions(OcrOptions):
    """Options for the EasyOCR engine."""

    # Unknown fields are rejected; pydantic's protected namespaces are
    # cleared (presumably so the ``model_``-prefixed field below is
    # accepted without a warning -- verify against pydantic's docs).
    model_config = ConfigDict(
        extra="forbid",
        protected_namespaces=(),
    )

    # Fixed identifier of this engine.
    kind: Literal["easyocr"] = "easyocr"
    # Language codes handed to the EasyOCR reader.
    lang: List[str] = ["fr", "de", "es", "en"]
    use_gpu: bool = True  # same default as easyocr.Reader
    # Optional directory where EasyOCR stores its models.
    model_storage_directory: Optional[str] = None
    download_enabled: bool = True  # same default as easyocr.Reader
|
||||
|
||||
|
||||
class TesseractCliOcrOptions(OcrOptions):
    """Options for running Tesseract through its command-line binary."""

    # Unknown fields are rejected.
    model_config = ConfigDict(
        extra="forbid",
    )

    # Fixed identifier of this engine.
    kind: Literal["tesseract"] = "tesseract"
    # Tesseract language codes.
    lang: List[str] = ["fra", "deu", "spa", "eng"]
    # Name or path of the tesseract executable to invoke.
    tesseract_cmd: str = "tesseract"
    # Optional tessdata directory.
    path: Optional[str] = None
|
||||
|
||||
|
||||
class TesseractOcrOptions(OcrOptions):
    """Options for using Tesseract through the tesserocr bindings."""

    # Unknown fields are rejected.
    model_config = ConfigDict(
        extra="forbid",
    )

    # Fixed identifier of this engine.
    kind: Literal["tesserocr"] = "tesserocr"
    # Tesseract language codes.
    lang: List[str] = ["fra", "deu", "spa", "eng"]
    # Optional tessdata directory.
    path: Optional[str] = None
|
||||
|
||||
|
||||
class PipelineOptions(BaseModel):
|
||||
do_table_structure: bool = True # True: perform table structure extraction
|
||||
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
||||
|
||||
table_structure_options: TableStructureOptions = TableStructureOptions()
|
||||
ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
|
||||
Field(EasyOcrOptions(), discriminator="kind")
|
||||
)
|
||||
|
@ -3,21 +3,21 @@ import logging
|
||||
from abc import abstractmethod
|
||||
from typing import Iterable, List, Tuple
|
||||
|
||||
import numpy
|
||||
import numpy as np
|
||||
from PIL import Image, ImageDraw
|
||||
from rtree import index
|
||||
from scipy.ndimage import find_objects, label
|
||||
|
||||
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
||||
from docling.datamodel.pipeline_options import OcrOptions
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BaseOcrModel:
|
||||
def __init__(self, enabled: bool, options: OcrOptions):
    """Store the shared OCR model settings.

    Args:
        enabled: Whether OCR should be performed by this model.
        options: Engine-specific OCR options.
    """
    self.enabled = enabled
    self.options = options
|
||||
|
||||
# Computes the optimum amount and coordinates of rectangles to OCR on a given page
|
||||
def get_ocr_rects(self, page: Page) -> Tuple[bool, List[BoundingBox]]:
|
||||
|
@ -4,21 +4,33 @@ from typing import Iterable
|
||||
import numpy
|
||||
|
||||
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
||||
from docling.datamodel.pipeline_options import EasyOcrOptions
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class EasyOcrModel(BaseOcrModel):
|
||||
def __init__(self, enabled: bool, options: EasyOcrOptions):
    """Set up the EasyOCR reader when OCR is enabled.

    Args:
        enabled: Whether OCR should be performed; when False, EasyOCR is
            neither imported nor initialised.
        options: EasyOCR settings (languages, GPU usage, model storage
            directory, download permission).

    Raises:
        ImportError: If OCR is enabled but the ``easyocr`` package cannot
            be imported.
    """
    super().__init__(enabled=enabled, options=options)
    self.options: EasyOcrOptions

    self.scale = 3  # multiplier for 72 dpi == 216 dpi.

    if self.enabled:
        # Imported lazily so that easyocr is only required when it is used.
        try:
            import easyocr
        except ImportError as exc:
            raise ImportError(
                "EasyOCR is not installed. Please install it via `pip install easyocr` to use this OCR engine. "
                "Alternatively, Docling has support for other OCR engines. See the documentation."
            ) from exc

        self.reader = easyocr.Reader(
            lang_list=self.options.lang,
            # Forward the GPU switch; it was declared in the options but
            # previously never reached the reader.
            gpu=self.options.use_gpu,
            model_storage_directory=self.options.model_storage_directory,
            download_enabled=self.options.download_enabled,
        )
|
||||
|
||||
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
|
||||
@ -31,6 +43,9 @@ class EasyOcrModel(BaseOcrModel):
|
||||
|
||||
all_ocr_cells = []
|
||||
for ocr_rect in ocr_rects:
|
||||
# Skip zero area boxes
|
||||
if ocr_rect.area() == 0:
|
||||
continue
|
||||
high_res_image = page._backend.get_page_image(
|
||||
scale=self.scale, cropbox=ocr_rect
|
||||
)
|
||||
|
167
docling/models/tesseract_ocr_cli_model.py
Normal file
167
docling/models/tesseract_ocr_cli_model.py
Normal file
@ -0,0 +1,167 @@
|
||||
import io
|
||||
import logging
|
||||
import tempfile
|
||||
from subprocess import PIPE, Popen
|
||||
from typing import Iterable, Tuple
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
||||
from docling.datamodel.pipeline_options import TesseractCliOcrOptions
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TesseractOcrCliModel(BaseOcrModel):
    """OCR model that runs the Tesseract command-line binary.

    Each OCR rectangle of a page is rendered to a PNG file, passed to the
    configured tesseract command, and the TSV output is turned into
    ``OcrCell`` objects in page coordinates.
    """

    def __init__(self, enabled: bool, options: TesseractCliOcrOptions):
        """Store the options and check that the tesseract binary is usable.

        Args:
            enabled: Whether OCR should be performed; when False the binary
                is not probed.
            options: Tesseract CLI settings (command, languages, tessdata
                path).

        Raises:
            RuntimeError: If OCR is enabled and the version probe of the
                tesseract command fails for any reason.
        """
        super().__init__(enabled=enabled, options=options)
        self.options: TesseractCliOcrOptions

        self.scale = 3  # multiplier for 72 dpi == 216 dpi.

        # Cached result of the ``--version`` probe.
        self._name = None
        self._version = None

        if self.enabled:
            try:
                self._get_name_and_version()
            except Exception as exc:
                raise RuntimeError(
                    f"Tesseract is not available, aborting: {exc} "
                    "Install tesseract on your system and the tesseract binary is discoverable. "
                    "The actual command for Tesseract can be specified in `pipeline_options.ocr_options.tesseract_cmd='tesseract'`. "
                    "Alternatively, Docling has support for other OCR engines. See the documentation."
                ) from exc

    def _get_name_and_version(self) -> Tuple[str, str]:
        """Return the program name and version reported by ``--version``.

        The result is cached on the instance after the first successful call.

        Raises:
            ValueError: If the first output line does not contain at least a
                name and a version token.
        """
        if self._name is not None and self._version is not None:
            return self._name, self._version

        cmd = [self.options.tesseract_cmd, "--version"]

        proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
        # communicate() already waits for the process to terminate.
        stdout, stderr = proc.communicate()

        # HACK: Windows versions of Tesseract output the version to stdout, Linux versions
        # to stderr, so check both.
        version_line = (
            (stdout.decode("utf8").strip() or stderr.decode("utf8").strip())
            .split("\n")[0]
            .strip()
        )

        # If everything else fails...
        if not version_line:
            version_line = "tesseract XXX"

        # Only the first two tokens matter; trailing tokens on the version
        # line must not make the probe fail.
        tokens = version_line.split()
        if len(tokens) < 2:
            raise ValueError(
                f"Unexpected tesseract version output: {version_line!r}"
            )
        name, version = tokens[0], tokens[1]

        self._name = name
        self._version = version

        return name, version

    def _run_tesseract(self, ifilename: str):
        """Run tesseract on an image file and return the recognised rows.

        Args:
            ifilename: Path of the image file to recognise.

        Returns:
            A pandas DataFrame built from tesseract's TSV output, restricted
            to rows whose ``text`` column is non-null and non-blank.
        """
        # Local import: the csv module is only needed for its quoting constant.
        import csv

        cmd = [self.options.tesseract_cmd]

        if self.options.lang is not None and len(self.options.lang) > 0:
            cmd += ["-l", "+".join(self.options.lang)]
        if self.options.path is not None:
            cmd += ["--tessdata-dir", self.options.path]

        cmd += [ifilename, "stdout", "tsv"]
        _log.info("command: {}".format(" ".join(cmd)))

        proc = Popen(cmd, stdout=PIPE)
        output, _ = proc.communicate()

        # Decode the byte string to a regular string
        decoded_data = output.decode("utf-8")

        # Read the TSV produced by Tesseract. Quote handling is disabled so
        # that a '"' inside recognised text is not taken as a field
        # delimiter, and the text column is forced to str so that purely
        # numeric words stay strings.
        df = pd.read_csv(
            io.StringIO(decoded_data),
            sep="\t",
            quoting=csv.QUOTE_NONE,
            dtype={"text": str},
        )

        # Filter rows that contain actual text (ignore header or empty rows)
        has_text = df["text"].notnull() & (df["text"].str.strip() != "")
        return df[has_text]

    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        """Add OCR cells to every page of the batch and yield the pages.

        When the model is disabled the pages are yielded unchanged.
        """
        # Local import: only needed to build the temporary image path.
        import os

        if not self.enabled:
            yield from page_batch
            return

        for page in page_batch:
            ocr_rects = self.get_ocr_rects(page)

            all_ocr_cells = []
            for ocr_rect in ocr_rects:
                # Skip zero area boxes
                if ocr_rect.area() == 0:
                    continue
                high_res_image = page._backend.get_page_image(
                    scale=self.scale, cropbox=ocr_rect
                )

                # Write the crop into a private temporary directory rather
                # than re-opening a NamedTemporaryFile by name, which is not
                # possible on every platform while the file is still open.
                with tempfile.TemporaryDirectory() as tmp_dir:
                    fname = os.path.join(tmp_dir, "ocr_input.png")
                    high_res_image.save(fname)

                    df = self._run_tesseract(fname)

                for ix, row in df.iterrows():
                    text = row["text"]
                    conf = row["conf"]

                    # Tesseract reports left/top/width/height in pixels of
                    # the rendered (scaled) crop.
                    left = float(row["left"])
                    top = float(row["top"])
                    right = left + float(row["width"])
                    bottom = top + float(row["height"])

                    cell = OcrCell(
                        id=ix,
                        text=text,
                        confidence=conf / 100.0,
                        bbox=BoundingBox.from_tuple(
                            # Undo the render scale and shift by the crop
                            # origin to get page coordinates.
                            coord=(
                                (left / self.scale) + ocr_rect.l,
                                (top / self.scale) + ocr_rect.t,
                                (right / self.scale) + ocr_rect.l,
                                (bottom / self.scale) + ocr_rect.t,
                            ),
                            origin=CoordOrigin.TOPLEFT,
                        ),
                    )
                    all_ocr_cells.append(cell)

            ## Remove OCR cells which overlap with programmatic cells.
            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)

            page.cells.extend(filtered_ocr_cells)

            # DEBUG code:
            # self.draw_ocr_rects_and_cells(page, ocr_rects)

            yield page
|
122
docling/models/tesseract_ocr_model.py
Normal file
122
docling/models/tesseract_ocr_model.py
Normal file
@ -0,0 +1,122 @@
|
||||
import logging
|
||||
from typing import Iterable
|
||||
|
||||
import numpy
|
||||
|
||||
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell, Page
|
||||
from docling.datamodel.pipeline_options import (
    TesseractCliOcrOptions,
    TesseractOcrOptions,
)
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TesseractOcrModel(BaseOcrModel):
    """OCR model that uses Tesseract through the ``tesserocr`` bindings.

    Each OCR rectangle of a page is rendered to an image, text lines are
    located and recognised with a ``PyTessBaseAPI`` instance, and the results
    are turned into ``OcrCell`` objects in page coordinates.
    """

    def __init__(self, enabled: bool, options: TesseractOcrOptions):
        """Store the options and initialise the Tesseract API when enabled.

        Args:
            enabled: Whether OCR should be performed; when False, tesserocr
                is neither imported nor initialised.
            options: Tesseract settings (languages, optional tessdata path).

        Raises:
            ImportError: If OCR is enabled and tesserocr cannot be imported
                or cannot report the Tesseract version.
        """
        super().__init__(enabled=enabled, options=options)
        self.options: TesseractOcrOptions

        self.scale = 3  # multiplier for 72 dpi == 216 dpi.
        self.reader = None

        if self.enabled:
            setup_errmsg = (
                "tesserocr is not correctly installed. "
                "Please install it via `pip install tesserocr` to use this OCR engine. "
                "Note that tesserocr might have to be manually compiled for working with "
                "your Tesseract installation. The Docling documentation provides examples for it. "
                "Alternatively, Docling has support for other OCR engines. See the documentation."
            )
            # Imported lazily so that tesserocr is only required when used.
            try:
                import tesserocr
            except ImportError as exc:
                raise ImportError(setup_errmsg) from exc

            # A broken build can import fine but fail on first use, so the
            # version call doubles as a sanity check.
            try:
                tesseract_version = tesserocr.tesseract_version()
                _log.debug("Initializing TesserOCR: %s", tesseract_version)
            except Exception as exc:
                raise ImportError(setup_errmsg) from exc

            # Initialize the tesseractAPI; the tessdata path is only passed
            # when it was configured explicitly.
            api_kwargs = {
                "lang": "+".join(self.options.lang),
                "psm": tesserocr.PSM.AUTO,
                "init": True,
                "oem": tesserocr.OEM.DEFAULT,
            }
            if self.options.path is not None:
                api_kwargs["path"] = self.options.path

            self.reader = tesserocr.PyTessBaseAPI(**api_kwargs)
            self.reader_RIL = tesserocr.RIL

    def __del__(self):
        """Release the Tesseract API handle, if one was created."""
        # getattr guards against __init__ having failed before the
        # attribute was assigned.
        reader = getattr(self, "reader", None)
        if reader is not None:
            # Finalize the tesseractAPI
            reader.End()

    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        """Add OCR cells to every page of the batch and yield the pages.

        When the model is disabled the pages are yielded unchanged.
        """
        if not self.enabled:
            yield from page_batch
            return

        for page in page_batch:
            ocr_rects = self.get_ocr_rects(page)

            all_ocr_cells = []
            for ocr_rect in ocr_rects:
                # Skip zero area boxes
                if ocr_rect.area() == 0:
                    continue
                high_res_image = page._backend.get_page_image(
                    scale=self.scale, cropbox=ocr_rect
                )

                # Retrieve text snippets with their bounding boxes
                self.reader.SetImage(high_res_image)
                boxes = self.reader.GetComponentImages(self.reader_RIL.TEXTLINE, True)

                cells = []
                for ix, (im, box, _, _) in enumerate(boxes):
                    # Restrict recognition to this text line. The box is
                    # given as x/y/w/h in pixels of the rendered crop.
                    # NOTE(review): treated here as top-left based image
                    # coordinates -- confirm against the tesserocr docs.
                    self.reader.SetRectangle(box["x"], box["y"], box["w"], box["h"])

                    # Extract text within the bounding box
                    text = self.reader.GetUTF8Text().strip()
                    confidence = self.reader.MeanTextConf()

                    # Undo the render scale and shift by the crop origin to
                    # get page coordinates, as the other OCR models do.
                    left = (box["x"] / self.scale) + ocr_rect.l
                    top = (box["y"] / self.scale) + ocr_rect.t
                    right = ((box["x"] + box["w"]) / self.scale) + ocr_rect.l
                    bottom = ((box["y"] + box["h"]) / self.scale) + ocr_rect.t

                    cells.append(
                        OcrCell(
                            id=ix,
                            text=text,
                            # Normalised to the 0..1 range, matching the
                            # Tesseract CLI model.
                            confidence=confidence / 100.0,
                            bbox=BoundingBox.from_tuple(
                                coord=(left, top, right, bottom),
                                origin=CoordOrigin.TOPLEFT,
                            ),
                        )
                    )

                # del high_res_image
                all_ocr_cells.extend(cells)

            ## Remove OCR cells which overlap with programmatic cells.
            filtered_ocr_cells = self.filter_ocr_cells(all_ocr_cells, page.cells)

            page.cells.extend(filtered_ocr_cells)

            # DEBUG code:
            # self.draw_ocr_rects_and_cells(page, ocr_rects)

            yield page
|
@ -1,9 +1,17 @@
|
||||
from pathlib import Path
|
||||
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
PipelineOptions,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.models.easyocr_model import EasyOcrModel
|
||||
from docling.models.layout_model import LayoutModel
|
||||
from docling.models.table_structure_model import TableStructureModel
|
||||
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
|
||||
from docling.models.tesseract_ocr_model import TesseractOcrModel
|
||||
from docling.pipeline.base_model_pipeline import BaseModelPipeline
|
||||
|
||||
|
||||
@ -14,19 +22,38 @@ class StandardModelPipeline(BaseModelPipeline):
|
||||
def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
|
||||
super().__init__(artifacts_path, pipeline_options)
|
||||
|
||||
ocr_model: BaseOcrModel
|
||||
if isinstance(pipeline_options.ocr_options, EasyOcrOptions):
|
||||
ocr_model = EasyOcrModel(
|
||||
enabled=pipeline_options.do_ocr,
|
||||
options=pipeline_options.ocr_options,
|
||||
)
|
||||
elif isinstance(pipeline_options.ocr_options, TesseractCliOcrOptions):
|
||||
ocr_model = TesseractOcrCliModel(
|
||||
enabled=pipeline_options.do_ocr,
|
||||
options=pipeline_options.ocr_options,
|
||||
)
|
||||
elif isinstance(pipeline_options.ocr_options, TesseractOcrOptions):
|
||||
ocr_model = TesseractOcrModel(
|
||||
enabled=pipeline_options.do_ocr,
|
||||
options=pipeline_options.ocr_options,
|
||||
)
|
||||
else:
|
||||
raise RuntimeError(
|
||||
f"The specified OCR kind is not supported: {pipeline_options.ocr_options.kind}."
|
||||
)
|
||||
|
||||
self.model_pipe = [
|
||||
EasyOcrModel(
|
||||
config={
|
||||
"lang": ["fr", "de", "es", "en"],
|
||||
"enabled": pipeline_options.do_ocr,
|
||||
}
|
||||
),
|
||||
# OCR
|
||||
ocr_model,
|
||||
# Layout
|
||||
LayoutModel(
|
||||
config={
|
||||
"artifacts_path": artifacts_path
|
||||
/ StandardModelPipeline._layout_model_path
|
||||
}
|
||||
),
|
||||
# Table structure
|
||||
TableStructureModel(
|
||||
config={
|
||||
"artifacts_path": artifacts_path
|
||||
|
@ -8,6 +8,10 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.datamodel.pipeline_options import (
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
@ -71,7 +75,7 @@ def main():
|
||||
# and PDF Backends for various configurations.
|
||||
# Uncomment one section at the time to see the differences in the output.
|
||||
|
||||
# PyPdfium without OCR
|
||||
# PyPdfium without EasyOCR
|
||||
# --------------------
|
||||
# pipeline_options = PipelineOptions()
|
||||
# pipeline_options.do_ocr=False
|
||||
@ -83,7 +87,7 @@ def main():
|
||||
# pdf_backend=PyPdfiumDocumentBackend,
|
||||
# )
|
||||
|
||||
# PyPdfium with OCR
|
||||
# PyPdfium with EasyOCR
|
||||
# -----------------
|
||||
# pipeline_options = PipelineOptions()
|
||||
# pipeline_options.do_ocr=True
|
||||
@ -95,7 +99,7 @@ def main():
|
||||
# pdf_backend=PyPdfiumDocumentBackend,
|
||||
# )
|
||||
|
||||
# Docling Parse without OCR
|
||||
# Docling Parse without EasyOCR
|
||||
# -------------------------
|
||||
pipeline_options = PipelineOptions()
|
||||
pipeline_options.do_ocr = False
|
||||
@ -107,7 +111,7 @@ def main():
|
||||
pdf_backend=DoclingParseDocumentBackend,
|
||||
)
|
||||
|
||||
# Docling Parse with OCR
|
||||
# Docling Parse with EasyOCR
|
||||
# ----------------------
|
||||
# pipeline_options = PipelineOptions()
|
||||
# pipeline_options.do_ocr=True
|
||||
@ -119,6 +123,32 @@ def main():
|
||||
# pdf_backend=DoclingParseDocumentBackend,
|
||||
# )
|
||||
|
||||
# Docling Parse with Tesseract
|
||||
# ----------------------
|
||||
# pipeline_options = PipelineOptions()
|
||||
# pipeline_options.do_ocr = True
|
||||
# pipeline_options.do_table_structure = True
|
||||
# pipeline_options.table_structure_options.do_cell_matching = True
|
||||
# pipeline_options.ocr_options = TesseractOcrOptions()
|
||||
|
||||
# doc_converter = DocumentConverter(
|
||||
# pipeline_options=pipeline_options,
|
||||
# pdf_backend=DoclingParseDocumentBackend,
|
||||
# )
|
||||
|
||||
# Docling Parse with Tesseract CLI
|
||||
# ----------------------
|
||||
# pipeline_options = PipelineOptions()
|
||||
# pipeline_options.do_ocr = True
|
||||
# pipeline_options.do_table_structure = True
|
||||
# pipeline_options.table_structure_options.do_cell_matching = True
|
||||
# pipeline_options.ocr_options = TesseractCliOcrOptions()
|
||||
|
||||
# doc_converter = DocumentConverter(
|
||||
# pipeline_options=pipeline_options,
|
||||
# pdf_backend=DoclingParseDocumentBackend,
|
||||
# )
|
||||
|
||||
###########################################################################
|
||||
|
||||
# Define input files
|
||||
|
45
poetry.lock
generated
45
poetry.lock
generated
@ -5929,6 +5929,41 @@ files = [
|
||||
doc = ["reno", "sphinx"]
|
||||
test = ["pytest", "tornado (>=4.5)", "typeguard"]
|
||||
|
||||
[[package]]
|
||||
name = "tesserocr"
|
||||
version = "2.7.1"
|
||||
description = "A simple, Pillow-friendly, Python wrapper around tesseract-ocr API using Cython"
|
||||
optional = true
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "tesserocr-2.7.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1b8c4828f970af7bcfca83a1fb228aa68a2587299387bc875d0dfad8b6baf8ed"},
|
||||
{file = "tesserocr-2.7.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3bb5d336ebf2cc47cd0d117cadc8b25b2e558f54fb9a2dedaa28a14cb5a6b437"},
|
||||
{file = "tesserocr-2.7.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:3ff7f6d6b5c12dd31b80842eb0892b661a41ca3edf0e6cc1e54ec2c14552ceef"},
|
||||
{file = "tesserocr-2.7.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:ae794c5434373f4afa4c7f8b59f19fde810f8caf096d8bb701a4b2f3a6739460"},
|
||||
{file = "tesserocr-2.7.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:0a0895a4d9ff6a34f5a6f203fe0c9899f31d6f2378ae99be80605637b622687b"},
|
||||
{file = "tesserocr-2.7.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4c3187d14b95c866aa1d34cc374a53d583e2168742eefe33347e4790af70338e"},
|
||||
{file = "tesserocr-2.7.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ec52be3d82136430081427062ad0211a52fc38fa28fe58e216b89f840354f216"},
|
||||
{file = "tesserocr-2.7.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:44e71b3e8da36b2567760309398689ea9785ee62db3ff21140a9ea6941a233c4"},
|
||||
{file = "tesserocr-2.7.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:e31a49d7784e7e52fe656719145c3a872856d67daa9bfb340c2990db00e023e9"},
|
||||
{file = "tesserocr-2.7.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:37abde15c1c940d691305fd87836e4cad25a1434799729c324bbcd2277bcae44"},
|
||||
{file = "tesserocr-2.7.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:1b6349d35d333d420d24acf1953ad6f1d5613ffcde462c62126b68bdfca12753"},
|
||||
{file = "tesserocr-2.7.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:42f009cde8479f3b339da12a8e419fd9559b64b13bc08a248bd0833c6ae94331"},
|
||||
{file = "tesserocr-2.7.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:6e13204b3b92fac76ece6e33f55eba6335b30e379f4a7b75e285c2ad05762027"},
|
||||
{file = "tesserocr-2.7.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:65afdec0c5dc09a4a23a62e65524989cd940af41be1603e251a64ac10de9babf"},
|
||||
{file = "tesserocr-2.7.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4c5f59fb072c90bff8aa6a365fc82b747c2668b7b48233901728b155860d1ff9"},
|
||||
{file = "tesserocr-2.7.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:f62d662e3002868384e14e8cd620bdedf34ab9f9fc3ebbce527cfe032a7485ee"},
|
||||
{file = "tesserocr-2.7.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e80051812685bd521bc17cb70cf1480ffbb3e54ccc2883e90d5bcda15f8278ea"},
|
||||
{file = "tesserocr-2.7.1-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:2690cb2330fc9349d68ff027cbdac09693fdda36470836b196c04f16dcc99e9d"},
|
||||
{file = "tesserocr-2.7.1-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:d01ebd094103451ecb77b6510ade2f6bb064c51413ff35b135f649f3d6067a67"},
|
||||
{file = "tesserocr-2.7.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f8069ae6cd9ea3c056b6a596bc99f501ee9f95d6fd2928fcaffb9777071c210d"},
|
||||
{file = "tesserocr-2.7.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b2d3d23223d0a448877fb91af83c46ce95ff0a497a82fa93e93068148c9712e5"},
|
||||
{file = "tesserocr-2.7.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ef8a09a44c2e96bab0f40dbf0633767d063680d86b79365b43fc4e1234219694"},
|
||||
{file = "tesserocr-2.7.1-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:6e613213ea5b64db06f2cba0b93c3656b7e6aec2d9b2d2e929edf49da7143225"},
|
||||
{file = "tesserocr-2.7.1-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:4a8888b765e26680a6e34b8ec09b7bb85a17e08cea76f0661eafe2a84254562a"},
|
||||
{file = "tesserocr-2.7.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:64f25763e56c4c29b808e59b485c930cac46b6a1ac8eadd994086dc40a29d3a1"},
|
||||
{file = "tesserocr-2.7.1.tar.gz", hash = "sha256:3744c5c8bbabf18172849c7731be00dc2e5e44f8c556d37c850e788794ae0af4"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "threadpoolctl"
|
||||
version = "3.5.0"
|
||||
@ -6514,6 +6549,11 @@ files = [
|
||||
{file = "triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34e509deb77f1c067d8640725ef00c5cbfcb2052a1a3cb6a6d343841f92624eb"},
|
||||
{file = "triton-3.0.0-1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bcbf3b1c48af6a28011a5c40a5b3b9b5330530c3827716b5fbf6d7adcc1e53e9"},
|
||||
{file = "triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6e5727202f7078c56f91ff13ad0c1abab14a0e7f2c87e91b12b6f64f3e8ae609"},
|
||||
{file = "triton-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39b052da883351fdf6be3d93cedae6db3b8e3988d3b09ed221bccecfa9612230"},
|
||||
{file = "triton-3.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd34f19a8582af96e6291d4afce25dac08cb2a5d218c599163761e8e0827208e"},
|
||||
{file = "triton-3.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d5e10de8c011adeb7c878c6ce0dd6073b14367749e34467f1cff2bde1b78253"},
|
||||
{file = "triton-3.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8903767951bf86ec960b4fe4e21bc970055afc65e9d57e916d79ae3c93665e3"},
|
||||
{file = "triton-3.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41004fb1ae9a53fcb3e970745feb87f0e3c94c6ce1ba86e95fa3b8537894bef7"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@ -7121,7 +7161,10 @@ enabler = ["pytest-enabler (>=2.2)"]
|
||||
test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"]
|
||||
type = ["pytest-mypy"]
|
||||
|
||||
[extras]
|
||||
tesserocr = ["tesserocr"]
|
||||
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.10"
|
||||
content-hash = "7c5fb235944009b74193d045f36c1be2a8e168393012bf952541e6e7dea08072"
|
||||
content-hash = "a9bfb36209f3a9140b6923c51bae8c1e23af5be34e52d9622119a5683f125b2c"
|
||||
|
@ -46,6 +46,7 @@ pydantic-settings = "^2.3.0"
|
||||
huggingface_hub = ">=0.23,<1"
|
||||
requests = "^2.32.3"
|
||||
easyocr = "^1.7"
|
||||
tesserocr = { version = "^2.7.1", optional = true }
|
||||
docling-parse = "^1.4.1"
|
||||
certifi = ">=2024.7.4"
|
||||
rtree = "^1.3.0"
|
||||
@ -81,6 +82,9 @@ langchain-huggingface = "^0.0.3"
|
||||
langchain-milvus = "^0.1.4"
|
||||
langchain-text-splitters = "^0.2.4"
|
||||
|
||||
[tool.poetry.extras]
|
||||
tesserocr = ["tesserocr"]
|
||||
|
||||
[tool.poetry.scripts]
|
||||
docling = "docling.cli.main:app"
|
||||
|
||||
|
3
tests/data_scanned/ocr_test.doctags.txt
Normal file
3
tests/data_scanned/ocr_test.doctags.txt
Normal file
@ -0,0 +1,3 @@
|
||||
<document>
|
||||
<paragraph><location><page_1><loc_12><loc_82><loc_86><loc_91></location>Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package</paragraph>
|
||||
</document>
|
1
tests/data_scanned/ocr_test.json
Normal file
1
tests/data_scanned/ocr_test.json
Normal file
@ -0,0 +1 @@
|
||||
{"_name": "", "type": "pdf-document", "description": {"logs": []}, "file-info": {"filename": "ocr_test_8.pdf", "document-hash": "73f23122e9edbdb0a115b448e03c8064a0ea8bdc21d02917ce220cf032454f31", "#-pages": 1, "page-hashes": [{"hash": "8c5c5b766c1bdb92242142ca37260089b02380f9c57729703350f646cdf4771e", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [69.0, 688.58837890625, 509.4446716308594, 767.422119140625], "page": 1, "span": [0, 94]}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "name": "Text"}], "figures": [], "tables": [], "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": []}
|
1
tests/data_scanned/ocr_test.md
Normal file
1
tests/data_scanned/ocr_test.md
Normal file
@ -0,0 +1 @@
|
||||
Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package
|
1
tests/data_scanned/ocr_test.pages.json
Normal file
1
tests/data_scanned/ocr_test.pages.json
Normal file
@ -0,0 +1 @@
|
||||
[{"page_no": 0, "page_hash": "8c5c5b766c1bdb92242142ca37260089b02380f9c57729703350f646cdf4771e", "size": {"width": 595.201171875, "height": 841.9216918945312}, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 71.33333333333333, "t": 74.66666666666663, "r": 506.6666666666667, "b": 99.33333333333337, "coord_origin": "1"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.0, "t": 100.66666666666663, "r": 506.6666666666667, "b": 126.66666666666663, "coord_origin": "1"}}, {"id": 2, "text": "package", "bbox": {"l": 70.66666666666667, "t": 128.66666666666663, "r": 154.0, "b": 153.33333333333337, "coord_origin": "1"}}], "predictions": {"layout": {"clusters": [{"id": 0, "label": "Text", "bbox": {"l": 69.0, "t": 74.49958801269531, "r": 509.4446716308594, "b": 153.33333333333337, "coord_origin": "1"}, "confidence": 0.923837423324585, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 71.33333333333333, "t": 74.66666666666663, "r": 506.6666666666667, "b": 99.33333333333337, "coord_origin": "1"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.0, "t": 100.66666666666663, "r": 506.6666666666667, "b": 126.66666666666663, "coord_origin": "1"}}, {"id": 2, "text": "package", "bbox": {"l": 70.66666666666667, "t": 128.66666666666663, "r": 154.0, "b": 153.33333333333337, "coord_origin": "1"}}]}]}, "tablestructure": {"table_map": {}}, "figures_classification": null, "equations_prediction": null}, "assembled": {"elements": [{"label": "Text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "Text", "bbox": {"l": 69.0, "t": 74.49958801269531, "r": 509.4446716308594, "b": 153.33333333333337, "coord_origin": "1"}, "confidence": 0.923837423324585, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 71.33333333333333, "t": 74.66666666666663, "r": 506.6666666666667, "b": 99.33333333333337, "coord_origin": "1"}}, 
{"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.0, "t": 100.66666666666663, "r": 506.6666666666667, "b": 126.66666666666663, "coord_origin": "1"}}, {"id": 2, "text": "package", "bbox": {"l": 70.66666666666667, "t": 128.66666666666663, "r": 154.0, "b": 153.33333333333337, "coord_origin": "1"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "body": [{"label": "Text", "id": 0, "page_no": 0, "cluster": {"id": 0, "label": "Text", "bbox": {"l": 69.0, "t": 74.49958801269531, "r": 509.4446716308594, "b": 153.33333333333337, "coord_origin": "1"}, "confidence": 0.923837423324585, "cells": [{"id": 0, "text": "Docling bundles PDF document conversion to", "bbox": {"l": 71.33333333333333, "t": 74.66666666666663, "r": 506.6666666666667, "b": 99.33333333333337, "coord_origin": "1"}}, {"id": 1, "text": "JSON and Markdown in an easy self contained", "bbox": {"l": 69.0, "t": 100.66666666666663, "r": 506.6666666666667, "b": 126.66666666666663, "coord_origin": "1"}}, {"id": 2, "text": "package", "bbox": {"l": 70.66666666666667, "t": 128.66666666666663, "r": 154.0, "b": 153.33333333333337, "coord_origin": "1"}}]}, "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package"}], "headers": []}}]
|
BIN
tests/data_scanned/ocr_test.pdf
Normal file
BIN
tests/data_scanned/ocr_test.pdf
Normal file
Binary file not shown.
98
tests/test_e2e_ocr_conversion.py
Normal file
98
tests/test_e2e_ocr_conversion.py
Normal file
@ -0,0 +1,98 @@
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
OcrOptions,
|
||||
PipelineOptions,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
from .verify_utils import verify_conversion_result
|
||||
|
||||
GENERATE = False
|
||||
|
||||
|
||||
# Debug
|
||||
def save_output(pdf_path: Path, doc_result: ConversionResult, engine: str):
|
||||
r""" """
|
||||
import json
|
||||
import os
|
||||
|
||||
parent = pdf_path.parent
|
||||
eng = "" if engine is None else f".{engine}"
|
||||
|
||||
dict_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.json")
|
||||
with open(dict_fn, "w") as fd:
|
||||
json.dump(doc_result.render_as_dict(), fd)
|
||||
|
||||
pages_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.pages.json")
|
||||
pages = [p.model_dump() for p in doc_result.pages]
|
||||
with open(pages_fn, "w") as fd:
|
||||
json.dump(pages, fd)
|
||||
|
||||
doctags_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.doctags.txt")
|
||||
with open(doctags_fn, "w") as fd:
|
||||
fd.write(doc_result.render_as_doctags())
|
||||
|
||||
md_fn = os.path.join(parent, f"{pdf_path.stem}{eng}.md")
|
||||
with open(md_fn, "w") as fd:
|
||||
fd.write(doc_result.render_as_markdown())
|
||||
|
||||
|
||||
def get_pdf_paths():
    """Return all PDF files below ``./tests/data_scanned`` in sorted order.

    The directory is interpreted relative to the current working directory
    and is searched recursively, so PDFs in nested folders are included.
    """
    scanned_dir = Path("./tests/data_scanned")
    return sorted(scanned_dir.rglob("*.pdf"))
|
||||
|
||||
|
||||
def get_converter(ocr_options: OcrOptions):
    """Build a ``DocumentConverter`` that runs OCR with the given engine options.

    The pipeline enables OCR and table-structure recognition (with cell
    matching) and uses ``DoclingParseDocumentBackend`` as the PDF backend.

    Args:
        ocr_options: Engine-specific OCR options stored on the pipeline.

    Returns:
        The configured ``DocumentConverter``.
    """
    options = PipelineOptions()
    options.do_ocr = True
    options.do_table_structure = True
    options.table_structure_options.do_cell_matching = True
    options.ocr_options = ocr_options

    return DocumentConverter(
        pipeline_options=options,
        pdf_backend=DoclingParseDocumentBackend,
    )
|
||||
|
||||
|
||||
def test_e2e_conversions():
    """Convert every scanned test PDF with each OCR engine and verify the result.

    One converter is built per engine (EasyOCR, Tesseract, Tesseract CLI) and
    reused for all PDFs returned by ``get_pdf_paths()``. Each result is passed
    to ``verify_conversion_result`` with ``skip_cells=True``.
    """
    scanned_pdfs = get_pdf_paths()

    engine_options: List[OcrOptions] = [
        EasyOcrOptions(),
        TesseractOcrOptions(),
        TesseractCliOcrOptions(),
    ]

    for options in engine_options:
        print(f"Converting with ocr_engine: {options.kind}")
        converter = get_converter(ocr_options=options)

        for scanned_pdf in scanned_pdfs:
            print(f"converting {scanned_pdf}")
            result: ConversionResult = converter.convert_single(scanned_pdf)

            # For debugging, the outputs can be dumped next to the PDF with
            # save_output(scanned_pdf, result, None).

            verify_conversion_result(
                input_path=scanned_pdf,
                doc_result=result,
                generate=GENERATE,
                skip_cells=True,
            )
|
@ -130,7 +130,11 @@ def verify_dt(doc_pred_dt, doc_true_dt):
|
||||
|
||||
|
||||
def verify_conversion_result(
|
||||
input_path: Path, doc_result: ConversionResult, generate=False
|
||||
input_path: Path,
|
||||
doc_result: ConversionResult,
|
||||
generate: bool = False,
|
||||
ocr_engine: str = None,
|
||||
skip_cells: bool = False,
|
||||
):
|
||||
PageList = TypeAdapter(List[Page])
|
||||
|
||||
@ -143,10 +147,11 @@ def verify_conversion_result(
|
||||
doc_pred_md = doc_result.render_as_markdown()
|
||||
doc_pred_dt = doc_result.render_as_doctags()
|
||||
|
||||
pages_path = input_path.with_suffix(".pages.json")
|
||||
json_path = input_path.with_suffix(".json")
|
||||
md_path = input_path.with_suffix(".md")
|
||||
dt_path = input_path.with_suffix(".doctags.txt")
|
||||
engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"
|
||||
pages_path = input_path.with_suffix(f"{engine_suffix}.pages.json")
|
||||
json_path = input_path.with_suffix(f"{engine_suffix}.json")
|
||||
md_path = input_path.with_suffix(f"{engine_suffix}.md")
|
||||
dt_path = input_path.with_suffix(f"{engine_suffix}.doctags.txt")
|
||||
|
||||
if generate: # only used when re-generating truth
|
||||
with open(pages_path, "w") as fw:
|
||||
@ -173,9 +178,10 @@ def verify_conversion_result(
|
||||
with open(dt_path, "r") as fr:
|
||||
doc_true_dt = fr.read()
|
||||
|
||||
assert verify_cells(
|
||||
doc_pred_pages, doc_true_pages
|
||||
), f"Mismatch in PDF cell prediction for {input_path}"
|
||||
if not skip_cells:
|
||||
assert verify_cells(
|
||||
doc_pred_pages, doc_true_pages
|
||||
), f"Mismatch in PDF cell prediction for {input_path}"
|
||||
|
||||
# assert verify_output(
|
||||
# doc_pred, doc_true
|
||||
|
Loading…
Reference in New Issue
Block a user