feat: add support for ocrmac
OCR engine on macOS (#276)
* feat: add support for `ocrmac` OCR engine on macOS - Integrates `ocrmac` as an OCR engine option for macOS users. - Adds configuration options and dependencies for `ocrmac`. - Updates documentation to reflect new engine support. This change allows macOS users to utilize `ocrmac` for improved OCR performance and compatibility. Signed-off-by: Suhwan Seo <nuridol@gmail.com> * updated the poetry lock Signed-off-by: Suhwan Seo <nuridol@gmail.com> * Fix linting issues, update CLI docs, and add error for ocrmac use on non-Mac systems - Resolved formatting and linting issues - Updated `--ocr-engine` CLI option documentation for `ocrmac` - Added RuntimeError for attempts to use `ocrmac` on non-Mac platforms Signed-off-by: Suhwan Seo <nuridol@gmail.com> * feat: add support for `ocrmac` OCR engine on macOS - Integrates `ocrmac` as an OCR engine option for macOS users. - Adds configuration options and dependencies for `ocrmac`. - Updates documentation to reflect new engine support. This change allows macOS users to utilize `ocrmac` for improved OCR performance and compatibility. Signed-off-by: Suhwan Seo <nuridol@gmail.com> * docs: update examples and installation for ocrmac support - Added `OcrMacOptions` to `custom_convert.py` and `full_page_ocr.py` examples. - Included usage comments and examples for `OcrMacOptions` in OCR pipelines. - Updated installation guide to include instructions for installing `ocrmac`, noting macOS version requirements (10.15+). - Highlighted that `ocrmac` leverages Apple's Vision framework as an OCR backend. This enhances documentation for users working on macOS to leverage `ocrmac` effectively. Signed-off-by: Suhwan Seo <nuridol@gmail.com> * fix: update `ocrmac` dependency with macOS-specific marker - Added `sys_platform == 'darwin'` marker to the `ocrmac` dependency in `pyproject.toml` to specify macOS compatibility. - Updated the content hash in `poetry.lock` to reflect the changes. This ensures the `ocrmac` dependency is only installed on macOS systems. Signed-off-by: Suhwan Seo <nuridol@gmail.com> --------- Signed-off-by: Suhwan Seo <nuridol@gmail.com> Co-authored-by: Suhwan Seo <nuridol@gmail.com>
This commit is contained in:
parent
32ebf55e33
commit
6efa96c983
@ -24,6 +24,7 @@ from docling.datamodel.base_models import (
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
OcrMacOptions,
|
||||
OcrOptions,
|
||||
PdfPipelineOptions,
|
||||
TableFormerMode,
|
||||
@ -74,6 +75,7 @@ class OcrEngine(str, Enum):
|
||||
EASYOCR = "easyocr"
|
||||
TESSERACT_CLI = "tesseract_cli"
|
||||
TESSERACT = "tesseract"
|
||||
OCRMAC = "ocrmac"
|
||||
|
||||
|
||||
def export_documents(
|
||||
@ -259,6 +261,8 @@ def convert(
|
||||
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
|
||||
case OcrEngine.TESSERACT:
|
||||
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
|
||||
case OcrEngine.OCRMAC:
|
||||
ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
|
||||
case _:
|
||||
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
||||
|
||||
|
@ -63,6 +63,17 @@ class TesseractOcrOptions(OcrOptions):
|
||||
)
|
||||
|
||||
|
||||
class OcrMacOptions(OcrOptions):
|
||||
kind: Literal["ocrmac"] = "ocrmac"
|
||||
lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
|
||||
recognition: str = "accurate"
|
||||
framework: str = "vision"
|
||||
|
||||
model_config = ConfigDict(
|
||||
extra="forbid",
|
||||
)
|
||||
|
||||
|
||||
class PipelineOptions(BaseModel):
|
||||
create_legacy_output: bool = (
|
||||
True # This defautl will be set to False on a future version of docling
|
||||
@ -75,9 +86,9 @@ class PdfPipelineOptions(PipelineOptions):
|
||||
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
||||
|
||||
table_structure_options: TableStructureOptions = TableStructureOptions()
|
||||
ocr_options: Union[EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions] = (
|
||||
Field(EasyOcrOptions(), discriminator="kind")
|
||||
)
|
||||
ocr_options: Union[
|
||||
EasyOcrOptions, TesseractCliOcrOptions, TesseractOcrOptions, OcrMacOptions
|
||||
] = Field(EasyOcrOptions(), discriminator="kind")
|
||||
|
||||
images_scale: float = 1.0
|
||||
generate_page_images: bool = False
|
||||
|
118
docling/models/ocr_mac_model.py
Normal file
118
docling/models/ocr_mac_model.py
Normal file
@ -0,0 +1,118 @@
|
||||
import logging
|
||||
import tempfile
|
||||
from typing import Iterable, Optional, Tuple
|
||||
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import OcrMacOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class OcrMacModel(BaseOcrModel):
|
||||
def __init__(self, enabled: bool, options: OcrMacOptions):
|
||||
super().__init__(enabled=enabled, options=options)
|
||||
self.options: OcrMacOptions
|
||||
|
||||
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
||||
|
||||
if self.enabled:
|
||||
install_errmsg = (
|
||||
"ocrmac is not correctly installed. "
|
||||
"Please install it via `pip install ocrmac` to use this OCR engine. "
|
||||
"Alternatively, Docling has support for other OCR engines. See the documentation: "
|
||||
"https://ds4sd.github.io/docling/installation/"
|
||||
)
|
||||
try:
|
||||
from ocrmac import ocrmac
|
||||
except ImportError:
|
||||
raise ImportError(install_errmsg)
|
||||
|
||||
self.reader_RIL = ocrmac.OCR
|
||||
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
|
||||
if not self.enabled:
|
||||
yield from page_batch
|
||||
return
|
||||
|
||||
for page in page_batch:
|
||||
assert page._backend is not None
|
||||
if not page._backend.is_valid():
|
||||
yield page
|
||||
else:
|
||||
with TimeRecorder(conv_res, "ocr"):
|
||||
|
||||
ocr_rects = self.get_ocr_rects(page)
|
||||
|
||||
all_ocr_cells = []
|
||||
for ocr_rect in ocr_rects:
|
||||
# Skip zero area boxes
|
||||
if ocr_rect.area() == 0:
|
||||
continue
|
||||
high_res_image = page._backend.get_page_image(
|
||||
scale=self.scale, cropbox=ocr_rect
|
||||
)
|
||||
|
||||
with tempfile.NamedTemporaryFile(
|
||||
suffix=".png", mode="w"
|
||||
) as image_file:
|
||||
fname = image_file.name
|
||||
high_res_image.save(fname)
|
||||
|
||||
boxes = self.reader_RIL(
|
||||
fname,
|
||||
recognition_level=self.options.recognition,
|
||||
framework=self.options.framework,
|
||||
language_preference=self.options.lang,
|
||||
).recognize()
|
||||
|
||||
im_width, im_height = high_res_image.size
|
||||
cells = []
|
||||
for ix, (text, confidence, box) in enumerate(boxes):
|
||||
x = float(box[0])
|
||||
y = float(box[1])
|
||||
w = float(box[2])
|
||||
h = float(box[3])
|
||||
|
||||
x1 = x * im_width
|
||||
y2 = (1 - y) * im_height
|
||||
|
||||
x2 = x1 + w * im_width
|
||||
y1 = y2 - h * im_height
|
||||
|
||||
left = x1 / self.scale
|
||||
top = y1 / self.scale
|
||||
right = x2 / self.scale
|
||||
bottom = y2 / self.scale
|
||||
|
||||
cells.append(
|
||||
OcrCell(
|
||||
id=ix,
|
||||
text=text,
|
||||
confidence=confidence,
|
||||
bbox=BoundingBox.from_tuple(
|
||||
coord=(left, top, right, bottom),
|
||||
origin=CoordOrigin.TOPLEFT,
|
||||
),
|
||||
)
|
||||
)
|
||||
|
||||
# del high_res_image
|
||||
all_ocr_cells.extend(cells)
|
||||
|
||||
# Post-process the cells
|
||||
page.cells = self.post_process_cells(all_ocr_cells, page.cells)
|
||||
|
||||
# DEBUG code:
|
||||
if settings.debug.visualize_ocr:
|
||||
self.draw_ocr_rects_and_cells(conv_res, page, ocr_rects)
|
||||
|
||||
yield page
|
@ -1,4 +1,5 @@
|
||||
import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
@ -10,6 +11,7 @@ from docling.datamodel.base_models import AssembledUnit, Page
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
OcrMacOptions,
|
||||
PdfPipelineOptions,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
@ -18,6 +20,7 @@ from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.models.ds_glm_model import GlmModel, GlmOptions
|
||||
from docling.models.easyocr_model import EasyOcrModel
|
||||
from docling.models.layout_model import LayoutModel
|
||||
from docling.models.ocr_mac_model import OcrMacModel
|
||||
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
|
||||
from docling.models.page_preprocessing_model import (
|
||||
PagePreprocessingModel,
|
||||
@ -118,6 +121,15 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
enabled=self.pipeline_options.do_ocr,
|
||||
options=self.pipeline_options.ocr_options,
|
||||
)
|
||||
elif isinstance(self.pipeline_options.ocr_options, OcrMacOptions):
|
||||
if "darwin" != sys.platform:
|
||||
raise RuntimeError(
|
||||
f"The specified OCR type is only supported on Mac: {self.pipeline_options.ocr_options.kind}."
|
||||
)
|
||||
return OcrMacModel(
|
||||
enabled=self.pipeline_options.do_ocr,
|
||||
options=self.pipeline_options.ocr_options,
|
||||
)
|
||||
return None
|
||||
|
||||
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
|
||||
|
@ -7,6 +7,7 @@ from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.models.ocr_mac_model import OcrMacOptions
|
||||
from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
|
||||
from docling.models.tesseract_ocr_model import TesseractOcrOptions
|
||||
|
||||
@ -122,6 +123,20 @@ def main():
|
||||
# }
|
||||
# )
|
||||
|
||||
# Docling Parse with ocrmac(Mac only)
|
||||
# ----------------------
|
||||
# pipeline_options = PdfPipelineOptions()
|
||||
# pipeline_options.do_ocr = True
|
||||
# pipeline_options.do_table_structure = True
|
||||
# pipeline_options.table_structure_options.do_cell_matching = True
|
||||
# pipeline_options.ocr_options = OcrMacOptions()
|
||||
|
||||
# doc_converter = DocumentConverter(
|
||||
# format_options={
|
||||
# InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
|
||||
# }
|
||||
# )
|
||||
|
||||
###########################################################################
|
||||
|
||||
start_time = time.time()
|
||||
|
@ -4,6 +4,7 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
OcrMacOptions,
|
||||
PdfPipelineOptions,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
@ -19,9 +20,10 @@ def main():
|
||||
pipeline_options.do_table_structure = True
|
||||
pipeline_options.table_structure_options.do_cell_matching = True
|
||||
|
||||
# Any of the OCR options can be used:EasyOcrOptions, TesseractOcrOptions, TesseractCliOcrOptions
|
||||
# Any of the OCR options can be used:EasyOcrOptions, TesseractOcrOptions, TesseractCliOcrOptions, OcrMacOptions(Mac only)
|
||||
# ocr_options = EasyOcrOptions(force_full_page_ocr=True)
|
||||
# ocr_options = TesseractOcrOptions(force_full_page_ocr=True)
|
||||
# ocr_options = OcrMacOptions(force_full_page_ocr=True)
|
||||
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=True)
|
||||
pipeline_options.ocr_options = ocr_options
|
||||
|
||||
|
@ -30,6 +30,7 @@ Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 archi
|
||||
| [EasyOCR](https://github.com/JaidedAI/EasyOCR) | Default in Docling or via `pip install easyocr`. | `EasyOcrOptions` |
|
||||
| Tesseract | System dependency. See description for Tesseract and Tesserocr below. | `TesseractOcrOptions` |
|
||||
| Tesseract CLI | System dependency. See description below. | `TesseractCliOcrOptions` |
|
||||
| OcrMac | System dependency. See description below. | `OcrMacOptions` |
|
||||
|
||||
The Docling `DocumentConverter` allows to choose the OCR engine with the `ocr_options` settings. For example
|
||||
|
||||
@ -91,6 +92,17 @@ Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 archi
|
||||
pip install --no-binary :all: tesserocr
|
||||
```
|
||||
|
||||
<h3>ocrmac installation</h3>
|
||||
|
||||
[ocrmac](https://github.com/straussmaximilian/ocrmac) is using
|
||||
Apple's vision(or livetext) framework as OCR backend.
|
||||
For using this engine with Docling, ocrmac must be installed on your system.
|
||||
This only works on macOS systems with newer macOS versions (10.15+).
|
||||
|
||||
```console
|
||||
pip install ocrmac
|
||||
```
|
||||
|
||||
## Development setup
|
||||
|
||||
To develop Docling features, bugfixes etc., install as follows from your local clone's root dir:
|
||||
|
133
poetry.lock
generated
133
poetry.lock
generated
@ -182,8 +182,8 @@ files = [
|
||||
lazy-object-proxy = ">=1.4.0"
|
||||
typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""}
|
||||
wrapt = [
|
||||
{version = ">=1.14,<2", markers = "python_version >= \"3.11\""},
|
||||
{version = ">=1.11,<2", markers = "python_version < \"3.11\""},
|
||||
{version = ">=1.14,<2", markers = "python_version >= \"3.11\""},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -825,8 +825,8 @@ files = [
|
||||
docling-core = ">=2.0,<3.0"
|
||||
docutils = "!=0.21"
|
||||
numpy = [
|
||||
{version = ">=2.0.2,<3.0.0", markers = "python_version >= \"3.13\""},
|
||||
{version = ">=1.26.4,<2.0.0", markers = "python_version >= \"3.9\" and python_version < \"3.13\""},
|
||||
{version = ">=2.0.2,<3.0.0", markers = "python_version >= \"3.13\""},
|
||||
]
|
||||
pandas = {version = ">=2.1.4,<3.0.0", markers = "python_version >= \"3.9\""}
|
||||
python-dotenv = ">=1.0.0,<2.0.0"
|
||||
@ -912,8 +912,8 @@ huggingface_hub = ">=0.23,<1"
|
||||
jsonlines = ">=3.1.0,<4.0.0"
|
||||
mean_average_precision = ">=2021.4.26.0,<2022.0.0.0"
|
||||
numpy = [
|
||||
{version = ">=2.1.0,<3.0.0", markers = "python_version >= \"3.13\""},
|
||||
{version = ">=1.24.4,<2.0.0", markers = "python_version < \"3.13\""},
|
||||
{version = ">=2.1.0,<3.0.0", markers = "python_version >= \"3.13\""},
|
||||
]
|
||||
opencv-python-headless = ">=4.6.0.66,<5.0.0.0"
|
||||
Pillow = ">=10.0.0,<11.0.0"
|
||||
@ -2063,8 +2063,8 @@ jsonpatch = ">=1.33,<2.0"
|
||||
langsmith = ">=0.1.112,<0.2.0"
|
||||
packaging = ">=23.2,<25"
|
||||
pydantic = [
|
||||
{version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""},
|
||||
{version = ">=1,<3", markers = "python_full_version < \"3.12.4\""},
|
||||
{version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""},
|
||||
]
|
||||
PyYAML = ">=5.3"
|
||||
tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<9.0.0"
|
||||
@ -2132,8 +2132,8 @@ files = [
|
||||
httpx = ">=0.23.0,<1"
|
||||
orjson = ">=3.9.14,<4.0.0"
|
||||
pydantic = [
|
||||
{version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""},
|
||||
{version = ">=1,<3", markers = "python_full_version < \"3.12.4\""},
|
||||
{version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""},
|
||||
]
|
||||
requests = ">=2,<3"
|
||||
requests-toolbelt = ">=1.0.0,<2.0.0"
|
||||
@ -3548,6 +3548,22 @@ files = [
|
||||
{file = "nvidia_nvtx_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:641dccaaa1139f3ffb0d3164b4b84f9d253397e38246a4f2f36728b48566d485"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ocrmac"
|
||||
version = "1.0.0"
|
||||
description = "A python wrapper to extract text from images on a mac system. Uses the vision framework from Apple."
|
||||
optional = true
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
{file = "ocrmac-1.0.0-py2.py3-none-any.whl", hash = "sha256:0b5a072aa23a9ead48132cb2d595b680aa6c3c5a6cb69525155e35ca95610c3a"},
|
||||
{file = "ocrmac-1.0.0.tar.gz", hash = "sha256:5b299e9030c973d1f60f82db000d6c2e5ff271601878c7db0885e850597d1d2e"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
Click = ">=7.0"
|
||||
pillow = "*"
|
||||
pyobjc-framework-Vision = "*"
|
||||
|
||||
[[package]]
|
||||
name = "opencv-python-headless"
|
||||
version = "4.10.0.84"
|
||||
@ -3566,10 +3582,10 @@ files = [
|
||||
|
||||
[package.dependencies]
|
||||
numpy = [
|
||||
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
||||
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
|
||||
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
|
||||
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
|
||||
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -3732,9 +3748,9 @@ files = [
|
||||
|
||||
[package.dependencies]
|
||||
numpy = [
|
||||
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
||||
{version = ">=1.22.4", markers = "python_version < \"3.11\""},
|
||||
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
|
||||
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
||||
]
|
||||
python-dateutil = ">=2.8.2"
|
||||
pytz = ">=2020.1"
|
||||
@ -4331,8 +4347,8 @@ files = [
|
||||
annotated-types = ">=0.6.0"
|
||||
pydantic-core = "2.23.4"
|
||||
typing-extensions = [
|
||||
{version = ">=4.12.2", markers = "python_version >= \"3.13\""},
|
||||
{version = ">=4.6.1", markers = "python_version < \"3.13\""},
|
||||
{version = ">=4.12.2", markers = "python_version >= \"3.13\""},
|
||||
]
|
||||
|
||||
[package.extras]
|
||||
@ -4500,8 +4516,8 @@ files = [
|
||||
astroid = ">=2.15.8,<=2.17.0-dev0"
|
||||
colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""}
|
||||
dill = [
|
||||
{version = ">=0.3.6", markers = "python_version >= \"3.11\""},
|
||||
{version = ">=0.2", markers = "python_version < \"3.11\""},
|
||||
{version = ">=0.3.6", markers = "python_version >= \"3.11\""},
|
||||
]
|
||||
isort = ">=4.2.5,<6"
|
||||
mccabe = ">=0.6,<0.8"
|
||||
@ -4556,6 +4572,102 @@ bulk-writer = ["azure-storage-blob", "minio (>=7.0.0)", "pyarrow (>=12.0.0)", "r
|
||||
dev = ["black", "grpcio (==1.62.2)", "grpcio-testing (==1.62.2)", "grpcio-tools (==1.62.2)", "pytest (>=5.3.4)", "pytest-cov (>=2.8.1)", "pytest-timeout (>=1.3.4)", "ruff (>0.4.0)"]
|
||||
model = ["milvus-model (>=0.1.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "pyobjc-core"
|
||||
version = "10.3.1"
|
||||
description = "Python<->ObjC Interoperability Module"
|
||||
optional = true
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "pyobjc_core-10.3.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ea46d2cda17921e417085ac6286d43ae448113158afcf39e0abe484c58fb3d78"},
|
||||
{file = "pyobjc_core-10.3.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:899d3c84d2933d292c808f385dc881a140cf08632907845043a333a9d7c899f9"},
|
||||
{file = "pyobjc_core-10.3.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:6ff5823d13d0a534cdc17fa4ad47cf5bee4846ce0fd27fc40012e12b46db571b"},
|
||||
{file = "pyobjc_core-10.3.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:2581e8e68885bcb0e11ec619e81ef28e08ee3fac4de20d8cc83bc5af5bcf4a90"},
|
||||
{file = "pyobjc_core-10.3.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:ea98d4c2ec39ca29e62e0327db21418696161fb138ee6278daf2acbedf7ce504"},
|
||||
{file = "pyobjc_core-10.3.1-cp38-cp38-macosx_11_0_universal2.whl", hash = "sha256:4c179c26ee2123d0aabffb9dbc60324b62b6f8614fb2c2328b09386ef59ef6d8"},
|
||||
{file = "pyobjc_core-10.3.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:cb901fce65c9be420c40d8a6ee6fff5ff27c6945f44fd7191989b982baa66dea"},
|
||||
{file = "pyobjc_core-10.3.1.tar.gz", hash = "sha256:b204a80ccc070f9ab3f8af423a3a25a6fd787e228508d00c4c30f8ac538ba720"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pyobjc-framework-cocoa"
|
||||
version = "10.3.1"
|
||||
description = "Wrappers for the Cocoa frameworks on macOS"
|
||||
optional = true
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "pyobjc_framework_Cocoa-10.3.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:4cb4f8491ab4d9b59f5187e42383f819f7a46306a4fa25b84f126776305291d1"},
|
||||
{file = "pyobjc_framework_Cocoa-10.3.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:5f31021f4f8fdf873b57a97ee1f3c1620dbe285e0b4eaed73dd0005eb72fd773"},
|
||||
{file = "pyobjc_framework_Cocoa-10.3.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:11b4e0bad4bbb44a4edda128612f03cdeab38644bbf174de0c13129715497296"},
|
||||
{file = "pyobjc_framework_Cocoa-10.3.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:de5e62e5ccf2871a94acf3bf79646b20ea893cc9db78afa8d1fe1b0d0f7cbdb0"},
|
||||
{file = "pyobjc_framework_Cocoa-10.3.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6c5af24610ab639bd1f521ce4500484b40787f898f691b7a23da3339e6bc8b90"},
|
||||
{file = "pyobjc_framework_Cocoa-10.3.1-cp38-cp38-macosx_11_0_universal2.whl", hash = "sha256:a7151186bb7805deea434fae9a4423335e6371d105f29e73cc2036c6779a9dbc"},
|
||||
{file = "pyobjc_framework_Cocoa-10.3.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:743d2a1ac08027fd09eab65814c79002a1d0421d7c0074ffd1217b6560889744"},
|
||||
{file = "pyobjc_framework_cocoa-10.3.1.tar.gz", hash = "sha256:1cf20714daaa986b488fb62d69713049f635c9d41a60c8da97d835710445281a"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
pyobjc-core = ">=10.3.1"
|
||||
|
||||
[[package]]
|
||||
name = "pyobjc-framework-coreml"
|
||||
version = "10.3.1"
|
||||
description = "Wrappers for the framework CoreML on macOS"
|
||||
optional = true
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "pyobjc_framework_CoreML-10.3.1-cp36-abi3-macosx_10_13_universal2.whl", hash = "sha256:c1fdcc0487807afa9cd0f88f25697e0e2e093d0219e8e1aa42aa3674dd78c2cb"},
|
||||
{file = "pyobjc_framework_CoreML-10.3.1-cp36-abi3-macosx_10_9_universal2.whl", hash = "sha256:21c87e84c807b5dbe61e0f016d9aefa32d3212f175cc4b976b5c08770be7a58c"},
|
||||
{file = "pyobjc_framework_CoreML-10.3.1-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:a0877aed5d4cdbb63d1246cd5384c09d78a0667e83c435a1257d10017c11c1a4"},
|
||||
{file = "pyobjc_framework_CoreML-10.3.1-cp36-abi3-macosx_11_0_universal2.whl", hash = "sha256:4bd3f1acfb3245727727b71cbcf7d21a33d7e00fa488e41ad01527764b969b92"},
|
||||
{file = "pyobjc_framework_coreml-10.3.1.tar.gz", hash = "sha256:6b7091142cfaafee76f1a804329e7a4e3aeca921eea8644e9ceba4cc2751f705"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
pyobjc-core = ">=10.3.1"
|
||||
pyobjc-framework-Cocoa = ">=10.3.1"
|
||||
|
||||
[[package]]
|
||||
name = "pyobjc-framework-quartz"
|
||||
version = "10.3.1"
|
||||
description = "Wrappers for the Quartz frameworks on macOS"
|
||||
optional = true
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "pyobjc_framework_Quartz-10.3.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5ef4fd315ed2bc42ef77fdeb2bae28a88ec986bd7b8079a87ba3b3475348f96e"},
|
||||
{file = "pyobjc_framework_Quartz-10.3.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:96578d4a3e70164efe44ad7dc320ecd4e211758ffcde5dcd694de1bbdfe090a4"},
|
||||
{file = "pyobjc_framework_Quartz-10.3.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:ca35f92486869a41847a1703bb176aab8a53dbfd8e678d1f4d68d8e6e1581c71"},
|
||||
{file = "pyobjc_framework_Quartz-10.3.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:00a0933267e3a46ea4afcc35d117b2efb920f06de797fa66279c52e7057e3590"},
|
||||
{file = "pyobjc_framework_Quartz-10.3.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a161bedb4c5257a02ad56a910cd7eefb28bdb0ea78607df0d70ed4efe4ea54c1"},
|
||||
{file = "pyobjc_framework_Quartz-10.3.1-cp38-cp38-macosx_11_0_universal2.whl", hash = "sha256:d7a8028e117a94923a511944bfa9daf9744e212f06cf89010c60934a479863a5"},
|
||||
{file = "pyobjc_framework_Quartz-10.3.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:de00c983b3267eb26fa42c6ed9f15e2bf006bde8afa7fe2b390646aa21a5d6fc"},
|
||||
{file = "pyobjc_framework_quartz-10.3.1.tar.gz", hash = "sha256:b6d7e346d735c9a7f147cd78e6da79eeae416a0b7d3874644c83a23786c6f886"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
pyobjc-core = ">=10.3.1"
|
||||
pyobjc-framework-Cocoa = ">=10.3.1"
|
||||
|
||||
[[package]]
|
||||
name = "pyobjc-framework-vision"
|
||||
version = "10.3.1"
|
||||
description = "Wrappers for the framework Vision on macOS"
|
||||
optional = true
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "pyobjc_framework_Vision-10.3.1-cp36-abi3-macosx_10_13_universal2.whl", hash = "sha256:dff3582678930461a0bb11bf070854d49f6944a851dc89edc63fac93c75ddf39"},
|
||||
{file = "pyobjc_framework_Vision-10.3.1-cp36-abi3-macosx_10_9_universal2.whl", hash = "sha256:32626183c51674efb3b5738e2884c3fea37edca010117cf71bd72cb3c49c869a"},
|
||||
{file = "pyobjc_framework_Vision-10.3.1-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:2473b346a112c51ac485184305bd13c402e0db45f2df3d277315bd49efba18e9"},
|
||||
{file = "pyobjc_framework_Vision-10.3.1-cp36-abi3-macosx_11_0_universal2.whl", hash = "sha256:4302e2c5f68c9667ecd4273809cbc4611af6368b123d69596e5b088f1b1aa16b"},
|
||||
{file = "pyobjc_framework_vision-10.3.1.tar.gz", hash = "sha256:aa071656d395afc2d624600a9f30d6a3344aa747bf37f613ff3972158c40881c"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
pyobjc-core = ">=10.3.1"
|
||||
pyobjc-framework-Cocoa = ">=10.3.1"
|
||||
pyobjc-framework-CoreML = ">=10.3.1"
|
||||
pyobjc-framework-Quartz = ">=10.3.1"
|
||||
|
||||
[[package]]
|
||||
name = "pypdfium2"
|
||||
version = "4.30.0"
|
||||
@ -7248,9 +7360,10 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools",
|
||||
type = ["pytest-mypy"]
|
||||
|
||||
[extras]
|
||||
ocrmac = ["ocrmac"]
|
||||
tesserocr = ["tesserocr"]
|
||||
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.10"
|
||||
content-hash = "a0f599090cfd9414c0e90fd611fd0b23166a45cd925904491eb0503a6f6bd1d8"
|
||||
content-hash = "129137f8229158ac7672919df1684a260f74db22517d4d40c905f801f2950f46"
|
||||
|
@ -48,6 +48,7 @@ beautifulsoup4 = "^4.12.3"
|
||||
pandas = "^2.1.4"
|
||||
marko = "^2.1.2"
|
||||
openpyxl = "^3.1.5"
|
||||
ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true }
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
black = {extras = ["jupyter"], version = "^24.4.2"}
|
||||
@ -95,6 +96,7 @@ torchvision = [
|
||||
|
||||
[tool.poetry.extras]
|
||||
tesserocr = ["tesserocr"]
|
||||
ocrmac = ["ocrmac"]
|
||||
|
||||
[tool.poetry.scripts]
|
||||
docling = "docling.cli.main:app"
|
||||
@ -130,6 +132,7 @@ module = [
|
||||
"tesserocr.*",
|
||||
"docling_ibm_models.*",
|
||||
"easyocr.*",
|
||||
"ocrmac.*",
|
||||
"deepsearch_glm.*",
|
||||
"lxml.*",
|
||||
"bs4.*",
|
||||
|
@ -1,3 +1,4 @@
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
@ -6,6 +7,7 @@ from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
OcrMacOptions,
|
||||
OcrOptions,
|
||||
PdfPipelineOptions,
|
||||
TesseractCliOcrOptions,
|
||||
@ -59,6 +61,11 @@ def test_e2e_conversions():
|
||||
TesseractCliOcrOptions(force_full_page_ocr=True),
|
||||
]
|
||||
|
||||
# only works on mac
|
||||
if "darwin" == sys.platform:
|
||||
engines.append(OcrMacOptions())
|
||||
engines.append(OcrMacOptions(force_full_page_ocr=True))
|
||||
|
||||
for ocr_options in engines:
|
||||
print(f"Converting with ocr_engine: {ocr_options.kind}")
|
||||
converter = get_converter(ocr_options=ocr_options)
|
||||
|
Loading…
Reference in New Issue
Block a user