feat: add support for ocrmac OCR engine on macOS (#276)

* feat: add support for `ocrmac` OCR engine on macOS

- Integrates `ocrmac` as an OCR engine option for macOS users.
- Adds configuration options and dependencies for `ocrmac`.
- Updates documentation to reflect new engine support.

This change allows macOS users to utilize `ocrmac` for improved OCR performance and compatibility.

Signed-off-by: Suhwan Seo <nuridol@gmail.com>

* updated the poetry lock

Signed-off-by: Suhwan Seo <nuridol@gmail.com>

* Fix linting issues, update CLI docs, and add error for ocrmac use on non-Mac systems

- Resolved formatting and linting issues
- Updated `--ocr-engine` CLI option documentation for `ocrmac`
- Added RuntimeError for attempts to use `ocrmac` on non-Mac platforms

Signed-off-by: Suhwan Seo <nuridol@gmail.com>

* feat: add support for `ocrmac` OCR engine on macOS

- Integrates `ocrmac` as an OCR engine option for macOS users.
- Adds configuration options and dependencies for `ocrmac`.
- Updates documentation to reflect new engine support.

This change allows macOS users to utilize `ocrmac` for improved OCR performance and compatibility.

Signed-off-by: Suhwan Seo <nuridol@gmail.com>

* docs: update examples and installation for ocrmac support

- Added `OcrMacOptions` to `custom_convert.py` and `full_page_ocr.py` examples.
- Included usage comments and examples for `OcrMacOptions` in OCR pipelines.
- Updated installation guide to include instructions for installing `ocrmac`, noting macOS version requirements (10.15+).
- Highlighted that `ocrmac` leverages Apple's Vision framework as an OCR backend.

This enhances documentation for users working on macOS to leverage `ocrmac` effectively.

Signed-off-by: Suhwan Seo <nuridol@gmail.com>

* fix: update `ocrmac` dependency with macOS-specific marker

- Added `sys_platform == 'darwin'` marker to the `ocrmac` dependency in `pyproject.toml` to specify macOS compatibility.
- Updated the content hash in `poetry.lock` to reflect the changes.

This ensures the `ocrmac` dependency is only installed on macOS systems.

Signed-off-by: Suhwan Seo <nuridol@gmail.com>

---------

Signed-off-by: Suhwan Seo <nuridol@gmail.com>
Co-authored-by: Suhwan Seo <nuridol@gmail.com>
This commit is contained in:
nuridol 2024-11-20 20:51:19 +09:00 committed by GitHub
parent 32ebf55e33
commit 6efa96c983
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
10 changed files with 311 additions and 14 deletions

View File

@ -24,6 +24,7 @@ from docling.datamodel.base_models import (
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrMacOptions,
OcrOptions,
PdfPipelineOptions,
TableFormerMode,
@ -74,6 +75,7 @@ class OcrEngine(str, Enum):
EASYOCR = "easyocr"
TESSERACT_CLI = "tesseract_cli"
TESSERACT = "tesseract"
OCRMAC = "ocrmac"
def export_documents(
@ -259,6 +261,8 @@ def convert(
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
case OcrEngine.TESSERACT:
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
case OcrEngine.OCRMAC:
ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
case _:
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")

View File

@ -63,6 +63,17 @@ class TesseractOcrOptions(OcrOptions):
)
class OcrMacOptions(OcrOptions):
kind: Literal["ocrmac"] = "ocrmac"
lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
recognition: str = "accurate"
framework: str = "vision"
model_config = ConfigDict(
extra="forbid",
)
class PipelineOptions(BaseModel):
create_legacy_output: bool = (
True # This defautl will be set to False on a future version of docling
@ -75,9 +86,9 @@ class PdfPipelineOptions(PipelineOptions):
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
table_structure_options: TableStructureOptions = TableStructureOptions()
ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
Field(EasyOcrOptions(), discriminator="kind")
)
ocr_options: Union[
EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions
] = Field(EasyOcrOptions(), discriminator="kind")
images_scale: float = 1.0
generate_page_images: bool = False

View File

@ -0,0 +1,118 @@
import logging
import tempfile
from typing import Iterable, Optional, Tuple
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import OcrCell, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import OcrMacOptions
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
class OcrMacModel(BaseOcrModel):
def __init__(self, enabled: bool, options: OcrMacOptions):
super().__init__(enabled=enabled, options=options)
self.options: OcrMacOptions
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
if self.enabled:
install_errmsg = (
"ocrmac is not correctly installed. "
"Please install it via `pip install ocrmac` to use this OCR engine. "
"Alternatively, Docling has support for other OCR engines. See the documentation: "
"https://ds4sd.github.io/docling/installation/"
)
try:
from ocrmac import ocrmac
except ImportError:
raise ImportError(install_errmsg)
self.reader_RIL = ocrmac.OCR
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
if not self.enabled:
yield from page_batch
return
for page in page_batch:
assert page._backend is not None
if not page._backend.is_valid():
yield page
else:
with TimeRecorder(conv_res, "ocr"):
ocr_rects = self.get_ocr_rects(page)
all_ocr_cells = []
for ocr_rect in ocr_rects:
# Skip zero area boxes
if ocr_rect.area() == 0:
continue
high_res_image = page._backend.get_page_image(
scale=self.scale, cropbox=ocr_rect
)
with tempfile.NamedTemporaryFile(
suffix=".png", mode="w"
) as image_file:
fname = image_file.name
high_res_image.save(fname)
boxes = self.reader_RIL(
fname,
recognition_level=self.options.recognition,
framework=self.options.framework,
language_preference=self.options.lang,
).recognize()
im_width, im_height = high_res_image.size
cells = []
for ix, (text, confidence, box) in enumerate(boxes):
x = float(box[0])
y = float(box[1])
w = float(box[2])
h = float(box[3])
x1 = x * im_width
y2 = (1 - y) * im_height
x2 = x1 + w * im_width
y1 = y2 - h * im_height
left = x1 / self.scale
top = y1 / self.scale
right = x2 / self.scale
bottom = y2 / self.scale
cells.append(
OcrCell(
id=ix,
text=text,
confidence=confidence,
bbox=BoundingBox.from_tuple(
coord=(left, top, right, bottom),
origin=CoordOrigin.TOPLEFT,
),
)
)
# del high_res_image
all_ocr_cells.extend(cells)
# Post-process the cells
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
# DEBUG code:
if settings.debug.visualize_ocr:
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
yield page

View File

@ -1,4 +1,5 @@
import logging
import sys
from pathlib import Path
from typing import Optional
@ -10,6 +11,7 @@ from docling.datamodel.base_models import AssembledUnit, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrMacOptions,
PdfPipelineOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
@ -18,6 +20,7 @@ from docling.models.base_ocr_model import BaseOcrModel
from docling.models.ds_glm_model import GlmModel, GlmOptions
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.ocr_mac_model import OcrMacModel
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
from docling.models.page_preprocessing_model import (
PagePreprocessingModel,
@ -118,6 +121,15 @@ class StandardPdfPipeline(PaginatedPipeline):
enabled=self.pipeline_options.do_ocr,
options=self.pipeline_options.ocr_options,
)
elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
if "darwin" != sys.platform:
raise RuntimeError(
f"The specified OCR type is only supported on Mac: {self.pipeline_options.ocr_options.kind}."
)
return OcrMacModel(
enabled=self.pipeline_options.do_ocr,
options=self.pipeline_options.ocr_options,
)
return None
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:

View File

@ -7,6 +7,7 @@ from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.models.ocr_mac_model import OcrMacOptions
from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
from docling.models.tesseract_ocr_model import TesseractOcrOptions
@ -122,6 +123,20 @@ def main():
# }
# )
# Docling Parse with ocrmac(Mac only)
# ----------------------
# pipeline_options = PdfPipelineOptions()
# pipeline_options.do_ocr = True
# pipeline_options.do_table_structure = True
# pipeline_options.table_structure_options.do_cell_matching = True
# pipeline_options.ocr_options = OcrMacOptions()
# doc_converter = DocumentConverter(
# format_options={
# InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
# }
# )
###########################################################################
start_time = time.time()

View File

@ -4,6 +4,7 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrMacOptions,
PdfPipelineOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
@ -19,9 +20,10 @@ def main():
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
# Any of the OCR options can be used:EasyOcrOptions, TesseractOcrOptions, TesseractCliOcrOptions
# Any of the OCR options can be used:EasyOcrOptions, TesseractOcrOptions, TesseractCliOcrOptions, OcrMacOptions(Mac only)
# ocr_options = EasyOcrOptions(force_full_page_ocr=True)
# ocr_options = TesseractOcrOptions(force_full_page_ocr=True)
# ocr_options = OcrMacOptions(force_full_page_ocr=True)
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)
pipeline_options.ocr_options = ocr_options

View File

@ -30,6 +30,7 @@ Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 archi
| [EasyOCR](https://github.com/JaidedAI/EasyOCR) | Default in Docling or via `pip install easyocr`. | `EasyOcrOptions` |
| Tesseract | System dependency. See description for Tesseract and Tesserocr below. | `TesseractOcrOptions` |
| Tesseract CLI | System dependency. See description below. | `TesseractCliOcrOptions` |
| OcrMac | System dependency. See description below. | `OcrMacOptions` |
The Docling `DocumentConverter` allows to choose the OCR engine with the `ocr_options` settings. For example
@ -91,6 +92,17 @@ Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 archi
pip install --no-binary :all: tesserocr
```
<h3>ocrmac installation</h3>
[ocrmac](https://github.com/straussmaximilian/ocrmac) is using
Apple's vision(or livetext) framework as OCR backend.
For using this engine with Docling, ocrmac must be installed on your system.
This only works on macOS systems with newer macOS versions (10.15+).
```console
pip install ocrmac
```
## Development setup
To develop Docling features, bugfixes etc., install as follows from your local clone's root dir:

133
poetry.lock generated
View File

@ -182,8 +182,8 @@ files = [
lazy-object-proxy = ">=1.4.0"
typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""}
wrapt = [
{version = ">=1.14,<2", markers = "python_version >= \"3.11\""},
{version = ">=1.11,<2", markers = "python_version < \"3.11\""},
{version = ">=1.14,<2", markers = "python_version >= \"3.11\""},
]
[[package]]
@ -825,8 +825,8 @@ files = [
docling-core = ">=2.0,<3.0"
docutils = "!=0.21"
numpy = [
{version = ">=2.0.2,<3.0.0", markers = "python_version >= \"3.13\""},
{version = ">=1.26.4,<2.0.0", markers = "python_version >= \"3.9\" and python_version < \"3.13\""},
{version = ">=2.0.2,<3.0.0", markers = "python_version >= \"3.13\""},
]
pandas = {version = ">=2.1.4,<3.0.0", markers = "python_version >= \"3.9\""}
python-dotenv = ">=1.0.0,<2.0.0"
@ -912,8 +912,8 @@ huggingface_hub = ">=0.23,<1"
jsonlines = ">=3.1.0,<4.0.0"
mean_average_precision = ">=2021.4.26.0,<2022.0.0.0"
numpy = [
{version = ">=2.1.0,<3.0.0", markers = "python_version >= \"3.13\""},
{version = ">=1.24.4,<2.0.0", markers = "python_version < \"3.13\""},
{version = ">=2.1.0,<3.0.0", markers = "python_version >= \"3.13\""},
]
opencv-python-headless = ">=4.6.0.66,<5.0.0.0"
Pillow = ">=10.0.0,<11.0.0"
@ -2063,8 +2063,8 @@ jsonpatch = ">=1.33,<2.0"
langsmith = ">=0.1.112,<0.2.0"
packaging = ">=23.2,<25"
pydantic = [
{version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""},
{version = ">=1,<3", markers = "python_full_version < \"3.12.4\""},
{version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""},
]
PyYAML = ">=5.3"
tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<9.0.0"
@ -2132,8 +2132,8 @@ files = [
httpx = ">=0.23.0,<1"
orjson = ">=3.9.14,<4.0.0"
pydantic = [
{version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""},
{version = ">=1,<3", markers = "python_full_version < \"3.12.4\""},
{version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""},
]
requests = ">=2,<3"
requests-toolbelt = ">=1.0.0,<2.0.0"
@ -3548,6 +3548,22 @@ files = [
{file = "nvidia_nvtx_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:641dccaaa1139f3ffb0d3164b4b84f9d253397e38246a4f2f36728b48566d485"},
]
[[package]]
name = "ocrmac"
version = "1.0.0"
description = "A python wrapper to extract text from images on a mac system. Uses the vision framework from Apple."
optional = true
python-versions = ">=3.6"
files = [
{file = "ocrmac-1.0.0-py2.py3-none-any.whl", hash = "sha256:0b5a072aa23a9ead48132cb2d595b680aa6c3c5a6cb69525155e35ca95610c3a"},
{file = "ocrmac-1.0.0.tar.gz", hash = "sha256:5b299e9030c973d1f60f82db000d6c2e5ff271601878c7db0885e850597d1d2e"},
]
[package.dependencies]
Click = ">=7.0"
pillow = "*"
pyobjc-framework-Vision = "*"
[[package]]
name = "opencv-python-headless"
version = "4.10.0.84"
@ -3566,10 +3582,10 @@ files = [
[package.dependencies]
numpy = [
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
]
[[package]]
@ -3732,9 +3748,9 @@ files = [
[package.dependencies]
numpy = [
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.22.4", markers = "python_version < \"3.11\""},
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
]
python-dateutil = ">=2.8.2"
pytz = ">=2020.1"
@ -4331,8 +4347,8 @@ files = [
annotated-types = ">=0.6.0"
pydantic-core = "2.23.4"
typing-extensions = [
{version = ">=4.12.2", markers = "python_version >= \"3.13\""},
{version = ">=4.6.1", markers = "python_version < \"3.13\""},
{version = ">=4.12.2", markers = "python_version >= \"3.13\""},
]
[package.extras]
@ -4500,8 +4516,8 @@ files = [
astroid = ">=2.15.8,<=2.17.0-dev0"
colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""}
dill = [
{version = ">=0.3.6", markers = "python_version >= \"3.11\""},
{version = ">=0.2", markers = "python_version < \"3.11\""},
{version = ">=0.3.6", markers = "python_version >= \"3.11\""},
]
isort = ">=4.2.5,<6"
mccabe = ">=0.6,<0.8"
@ -4556,6 +4572,102 @@ bulk-writer = ["azure-storage-blob", "minio (>=7.0.0)", "pyarrow (>=12.0.0)", "r
dev = ["black", "grpcio (==1.62.2)", "grpcio-testing (==1.62.2)", "grpcio-tools (==1.62.2)", "pytest (>=5.3.4)", "pytest-cov (>=2.8.1)", "pytest-timeout (>=1.3.4)", "ruff (>0.4.0)"]
model = ["milvus-model (>=0.1.0)"]
[[package]]
name = "pyobjc-core"
version = "10.3.1"
description = "Python<->ObjC Interoperability Module"
optional = true
python-versions = ">=3.8"
files = [
{file = "pyobjc_core-10.3.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ea46d2cda17921e417085ac6286d43ae448113158afcf39e0abe484c58fb3d78"},
{file = "pyobjc_core-10.3.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:899d3c84d2933d292c808f385dc881a140cf08632907845043a333a9d7c899f9"},
{file = "pyobjc_core-10.3.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:6ff5823d13d0a534cdc17fa4ad47cf5bee4846ce0fd27fc40012e12b46db571b"},
{file = "pyobjc_core-10.3.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:2581e8e68885bcb0e11ec619e81ef28e08ee3fac4de20d8cc83bc5af5bcf4a90"},
{file = "pyobjc_core-10.3.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ea98d4c2ec39ca29e62e0327db21418696161fb138ee6278daf2acbedf7ce504"},
{file = "pyobjc_core-10.3.1-cp38-cp38-macosx_11_0_universal2.whl", hash = "sha256:4c179c26ee2123d0aabffb9dbc60324b62b6f8614fb2c2328b09386ef59ef6d8"},
{file = "pyobjc_core-10.3.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:cb901fce65c9be420c40d8a6ee6fff5ff27c6945f44fd7191989b982baa66dea"},
{file = "pyobjc_core-10.3.1.tar.gz", hash = "sha256:b204a80ccc070f9ab3f8af423a3a25a6fd787e228508d00c4c30f8ac538ba720"},
]
[[package]]
name = "pyobjc-framework-cocoa"
version = "10.3.1"
description = "Wrappers for the Cocoa frameworks on macOS"
optional = true
python-versions = ">=3.8"
files = [
{file = "pyobjc_framework_Cocoa-10.3.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:4cb4f8491ab4d9b59f5187e42383f819f7a46306a4fa25b84f126776305291d1"},
{file = "pyobjc_framework_Cocoa-10.3.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5f31021f4f8fdf873b57a97ee1f3c1620dbe285e0b4eaed73dd0005eb72fd773"},
{file = "pyobjc_framework_Cocoa-10.3.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:11b4e0bad4bbb44a4edda128612f03cdeab38644bbf174de0c13129715497296"},
{file = "pyobjc_framework_Cocoa-10.3.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:de5e62e5ccf2871a94acf3bf79646b20ea893cc9db78afa8d1fe1b0d0f7cbdb0"},
{file = "pyobjc_framework_Cocoa-10.3.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6c5af24610ab639bd1f521ce4500484b40787f898f691b7a23da3339e6bc8b90"},
{file = "pyobjc_framework_Cocoa-10.3.1-cp38-cp38-macosx_11_0_universal2.whl", hash = "sha256:a7151186bb7805deea434fae9a4423335e6371d105f29e73cc2036c6779a9dbc"},
{file = "pyobjc_framework_Cocoa-10.3.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:743d2a1ac08027fd09eab65814c79002a1d0421d7c0074ffd1217b6560889744"},
{file = "pyobjc_framework_cocoa-10.3.1.tar.gz", hash = "sha256:1cf20714daaa986b488fb62d69713049f635c9d41a60c8da97d835710445281a"},
]
[package.dependencies]
pyobjc-core = ">=10.3.1"
[[package]]
name = "pyobjc-framework-coreml"
version = "10.3.1"
description = "Wrappers for the framework CoreML on macOS"
optional = true
python-versions = ">=3.8"
files = [
{file = "pyobjc_framework_CoreML-10.3.1-cp36-abi3-macosx_10_13_universal2.whl", hash = "sha256:c1fdcc0487807afa9cd0f88f25697e0e2e093d0219e8e1aa42aa3674dd78c2cb"},
{file = "pyobjc_framework_CoreML-10.3.1-cp36-abi3-macosx_10_9_universal2.whl", hash = "sha256:21c87e84c807b5dbe61e0f016d9aefa32d3212f175cc4b976b5c08770be7a58c"},
{file = "pyobjc_framework_CoreML-10.3.1-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:a0877aed5d4cdbb63d1246cd5384c09d78a0667e83c435a1257d10017c11c1a4"},
{file = "pyobjc_framework_CoreML-10.3.1-cp36-abi3-macosx_11_0_universal2.whl", hash = "sha256:4bd3f1acfb3245727727b71cbcf7d21a33d7e00fa488e41ad01527764b969b92"},
{file = "pyobjc_framework_coreml-10.3.1.tar.gz", hash = "sha256:6b7091142cfaafee76f1a804329e7a4e3aeca921eea8644e9ceba4cc2751f705"},
]
[package.dependencies]
pyobjc-core = ">=10.3.1"
pyobjc-framework-Cocoa = ">=10.3.1"
[[package]]
name = "pyobjc-framework-quartz"
version = "10.3.1"
description = "Wrappers for the Quartz frameworks on macOS"
optional = true
python-versions = ">=3.8"
files = [
{file = "pyobjc_framework_Quartz-10.3.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5ef4fd315ed2bc42ef77fdeb2bae28a88ec986bd7b8079a87ba3b3475348f96e"},
{file = "pyobjc_framework_Quartz-10.3.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:96578d4a3e70164efe44ad7dc320ecd4e211758ffcde5dcd694de1bbdfe090a4"},
{file = "pyobjc_framework_Quartz-10.3.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:ca35f92486869a41847a1703bb176aab8a53dbfd8e678d1f4d68d8e6e1581c71"},
{file = "pyobjc_framework_Quartz-10.3.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:00a0933267e3a46ea4afcc35d117b2efb920f06de797fa66279c52e7057e3590"},
{file = "pyobjc_framework_Quartz-10.3.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a161bedb4c5257a02ad56a910cd7eefb28bdb0ea78607df0d70ed4efe4ea54c1"},
{file = "pyobjc_framework_Quartz-10.3.1-cp38-cp38-macosx_11_0_universal2.whl", hash = "sha256:d7a8028e117a94923a511944bfa9daf9744e212f06cf89010c60934a479863a5"},
{file = "pyobjc_framework_Quartz-10.3.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:de00c983b3267eb26fa42c6ed9f15e2bf006bde8afa7fe2b390646aa21a5d6fc"},
{file = "pyobjc_framework_quartz-10.3.1.tar.gz", hash = "sha256:b6d7e346d735c9a7f147cd78e6da79eeae416a0b7d3874644c83a23786c6f886"},
]
[package.dependencies]
pyobjc-core = ">=10.3.1"
pyobjc-framework-Cocoa = ">=10.3.1"
[[package]]
name = "pyobjc-framework-vision"
version = "10.3.1"
description = "Wrappers for the framework Vision on macOS"
optional = true
python-versions = ">=3.8"
files = [
{file = "pyobjc_framework_Vision-10.3.1-cp36-abi3-macosx_10_13_universal2.whl", hash = "sha256:dff3582678930461a0bb11bf070854d49f6944a851dc89edc63fac93c75ddf39"},
{file = "pyobjc_framework_Vision-10.3.1-cp36-abi3-macosx_10_9_universal2.whl", hash = "sha256:32626183c51674efb3b5738e2884c3fea37edca010117cf71bd72cb3c49c869a"},
{file = "pyobjc_framework_Vision-10.3.1-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:2473b346a112c51ac485184305bd13c402e0db45f2df3d277315bd49efba18e9"},
{file = "pyobjc_framework_Vision-10.3.1-cp36-abi3-macosx_11_0_universal2.whl", hash = "sha256:4302e2c5f68c9667ecd4273809cbc4611af6368b123d69596e5b088f1b1aa16b"},
{file = "pyobjc_framework_vision-10.3.1.tar.gz", hash = "sha256:aa071656d395afc2d624600a9f30d6a3344aa747bf37f613ff3972158c40881c"},
]
[package.dependencies]
pyobjc-core = ">=10.3.1"
pyobjc-framework-Cocoa = ">=10.3.1"
pyobjc-framework-CoreML = ">=10.3.1"
pyobjc-framework-Quartz = ">=10.3.1"
[[package]]
name = "pypdfium2"
version = "4.30.0"
@ -7248,9 +7360,10 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools",
type = ["pytest-mypy"]
[extras]
ocrmac = ["ocrmac"]
tesserocr = ["tesserocr"]
[metadata]
lock-version = "2.0"
python-versions = "^3.10"
content-hash = "a0f599090cfd9414c0e90fd611fd0b23166a45cd925904491eb0503a6f6bd1d8"
content-hash = "129137f8229158ac7672919df1684a260f74db22517d4d40c905f801f2950f46"

View File

@ -48,6 +48,7 @@ beautifulsoup4 = "^4.12.3"
pandas = "^2.1.4"
marko = "^2.1.2"
openpyxl = "^3.1.5"
ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true }
[tool.poetry.group.dev.dependencies]
black = {extras = ["jupyter"], version = "^24.4.2"}
@ -95,6 +96,7 @@ torchvision = [
[tool.poetry.extras]
tesserocr = ["tesserocr"]
ocrmac = ["ocrmac"]
[tool.poetry.scripts]
docling = "docling.cli.main:app"
@ -130,6 +132,7 @@ module = [
"tesserocr.*",
"docling_ibm_models.*",
"easyocr.*",
"ocrmac.*",
"deepsearch_glm.*",
"lxml.*",
"bs4.*",

View File

@ -1,3 +1,4 @@
import sys
from pathlib import Path
from typing import List
@ -6,6 +7,7 @@ from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrMacOptions,
OcrOptions,
PdfPipelineOptions,
TesseractCliOcrOptions,
@ -59,6 +61,11 @@ def test_e2e_conversions():
TesseractCliOcrOptions(force_full_page_ocr=True),
]
# only works on mac
if "darwin" == sys.platform:
engines.append(OcrMacOptions())
engines.append(OcrMacOptions(force_full_page_ocr=True))
for ocr_options in engines:
print(f"Converting with ocr_engine: {ocr_options.kind}")
converter = get_converter(ocr_options=ocr_options)