feat: add simplified single-doc conversion (#20)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
Panos Vagenas 2024-07-26 16:55:33 +02:00 committed by GitHub
parent 3eca8b8485
commit d603137383
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 80 additions and 6 deletions

View File

@ -37,12 +37,28 @@ pip install docling
To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir: To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
```bash ```bash
poetry install poetry install --all-extras
``` ```
## Usage ## Usage
For basic usage, see the [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py) example module. Run with: ### Convert a single document
To convert individual PDF documents, use `convert_single()`, for example:
```python
from docling.document_converter import DocumentConverter
source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL
converter = DocumentConverter()
doc = converter.convert_single(source)
print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis [...]"
```
### Convert a batch of documents
For an example of converting multiple documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
From a local repo clone, you can run it with:
``` ```
python examples/convert.py python examples/convert.py

View File

@ -1,11 +1,15 @@
import functools import functools
import logging import logging
import tempfile
import time import time
import traceback import traceback
from pathlib import Path from pathlib import Path
from typing import Iterable, Optional, Type, Union from typing import Iterable, Optional, Type, Union
import requests
from docling_core.types import Document
from PIL import ImageDraw from PIL import ImageDraw
from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
from docling.backend.abstract_backend import PdfDocumentBackend from docling.backend.abstract_backend import PdfDocumentBackend
from docling.datamodel.base_models import ( from docling.datamodel.base_models import (
@ -32,6 +36,7 @@ _log = logging.getLogger(__name__)
class DocumentConverter: class DocumentConverter:
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5" _layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
_table_model_path = "model_artifacts/tableformer" _table_model_path = "model_artifacts/tableformer"
_default_download_filename = "file.pdf"
def __init__( def __init__(
self, self,
@ -80,6 +85,57 @@ class DocumentConverter:
# Note: Pdfium backend is not thread-safe, thread pool usage was disabled. # Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
yield from map(self.process_document, input_batch) yield from map(self.process_document, input_batch)
def convert_single(self, source: Path | AnyHttpUrl | str) -> Document:
    """Convert a single document.

    If ``source`` validates as an HTTP(S) URL, the document is first
    downloaded into a temporary directory (removed again before this method
    returns); otherwise ``source`` is interpreted as a local file path.

    Args:
        source (Path | AnyHttpUrl | str): The PDF input source. Can be a path or URL.

    Raises:
        ValueError: If source is of unexpected type.
        RuntimeError: If conversion fails or yields no result.
        requests.HTTPError: If the download responds with an error status.

    Returns:
        Document: The converted document object.
    """
    # Only the URL validation can raise ValidationError, so keep the `try`
    # minimal instead of wrapping the whole download in it.
    http_url: AnyHttpUrl | None
    try:
        http_url = TypeAdapter(AnyHttpUrl).validate_python(source)
    except ValidationError:
        http_url = None

    # The temporary directory must outlive the conversion, because the
    # downloaded file is read lazily by the conversion pipeline.
    with tempfile.TemporaryDirectory() as temp_dir:
        if http_url is not None:
            # NOTE(review): no timeout is passed, so an unresponsive server
            # can block this call indefinitely; consider adding one.
            # The context manager releases the streamed connection even if
            # the download fails midway.
            with requests.get(http_url, stream=True) as res:
                res.raise_for_status()

                fname = None
                # try to get filename from response header
                if cont_disp := res.headers.get("Content-Disposition"):
                    for par in cont_disp.strip().split(";"):
                        # currently only handling directive "filename" (not "*filename")
                        key, _, value = par.partition("=")
                        if key.strip() == "filename":
                            fname = value.strip().strip("'\"") or None
                            break
                # otherwise, use name from URL:
                if fname is None:
                    fname = http_url.path or ""

                # The name is controlled by the remote server: keep only the
                # final path component so that values such as "../../x" or
                # "/etc/x" cannot escape the temporary directory.
                fname = Path(fname.replace("\\", "/")).name
                if fname in {"", ".", ".."}:
                    fname = self._default_download_filename

                local_path = Path(temp_dir) / fname
                with open(local_path, "wb") as f:
                    for chunk in res.iter_content(chunk_size=1024):  # using 1-KB chunks
                        f.write(chunk)
        else:
            try:
                local_path = TypeAdapter(Path).validate_python(source)
            except ValidationError as err:
                raise ValueError(
                    f"Unexpected file path type encountered: {type(source)}"
                ) from err

        conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
        converted_docs_iter = self.convert(conv_inp)
        # Guard against an empty result so callers get the documented
        # RuntimeError rather than a bare StopIteration.
        converted_doc: ConvertedDocument | None = next(converted_docs_iter, None)
        if converted_doc is None:
            raise RuntimeError("Conversion failed: no document was produced")
        if converted_doc.status not in {
            ConversionStatus.SUCCESS,
            ConversionStatus.SUCCESS_WITH_ERRORS,
        }:
            raise RuntimeError(f"Conversion failed with status: {converted_doc.status}")
        doc = converted_doc.to_ds_document()
    return doc
def process_document(self, in_doc: InputDocument) -> ConvertedDocument: def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
start_doc_time = time.time() start_doc_time = time.time()
converted_doc = ConvertedDocument(input=in_doc) converted_doc = ConvertedDocument(input=in_doc)

3
poetry.lock generated
View File

@ -2510,6 +2510,7 @@ description = "Nvidia JIT LTO Library"
optional = false optional = false
python-versions = ">=3" python-versions = ">=3"
files = [ files = [
{file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_aarch64.whl", hash = "sha256:98103729cc5226e13ca319a10bbf9433bbbd44ef64fe72f45f067cacc14b8d27"},
{file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f9b37bc5c8cf7509665cb6ada5aaa0ce65618f2332b7d3e78e9790511f111212"}, {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f9b37bc5c8cf7509665cb6ada5aaa0ce65618f2332b7d3e78e9790511f111212"},
{file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-win_amd64.whl", hash = "sha256:e782564d705ff0bf61ac3e1bf730166da66dd2fe9012f111ede5fc49b64ae697"}, {file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-win_amd64.whl", hash = "sha256:e782564d705ff0bf61ac3e1bf730166da66dd2fe9012f111ede5fc49b64ae697"},
] ]
@ -4881,4 +4882,4 @@ ocr = ["easyocr"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.10" python-versions = "^3.10"
content-hash = "3ffc5161fd49fe2186ee2afbb3319922964661c769c434fc7386aae40f4aab19" content-hash = "dcb00c6601f61b087fd204d040149c20a7dcd72ab353e912e78dc265c86e4d00"

View File

@ -30,6 +30,7 @@ filetype = "^1.2.0"
pypdfium2 = "^4.30.0" pypdfium2 = "^4.30.0"
pydantic-settings = "^2.3.0" pydantic-settings = "^2.3.0"
huggingface_hub = ">=0.23,<1" huggingface_hub = ">=0.23,<1"
requests = "^2.32.3"
easyocr = { version = "^1.7", optional = true } easyocr = { version = "^1.7", optional = true }
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]