feat: add simplified single-doc conversion (#20)
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
parent
3eca8b8485
commit
d603137383
20
README.md
20
README.md
@ -37,12 +37,28 @@ pip install docling
|
|||||||
|
|
||||||
To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
|
To develop for Docling, you need Python 3.10 / 3.11 / 3.12 and Poetry. You can then install from your local clone's root dir:
|
||||||
```bash
|
```bash
|
||||||
poetry install
|
poetry install --all-extras
|
||||||
```
|
```
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
For basic usage, see the [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py) example module. Run with:
|
### Convert a single document
|
||||||
|
|
||||||
|
To convert individual PDF documents, use `convert_single()`, for example:
|
||||||
|
```python
|
||||||
|
from docling.document_converter import DocumentConverter
|
||||||
|
|
||||||
|
source = "https://arxiv.org/pdf/2206.01062" # PDF path or URL
|
||||||
|
converter = DocumentConverter()
|
||||||
|
doc = converter.convert_single(source)
|
||||||
|
print(doc.export_to_markdown()) # output: "## DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis [...]"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Convert a batch of documents
|
||||||
|
|
||||||
|
For an example of converting multiple documents, see [convert.py](https://github.com/DS4SD/docling/blob/main/examples/convert.py).
|
||||||
|
|
||||||
|
From a local repo clone, you can run it with:
|
||||||
|
|
||||||
```
|
```
|
||||||
python examples/convert.py
|
python examples/convert.py
|
||||||
|
@ -1,11 +1,15 @@
|
|||||||
import functools
|
import functools
|
||||||
import logging
|
import logging
|
||||||
|
import tempfile
|
||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, Optional, Type, Union
|
from typing import Iterable, Optional, Type, Union
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from docling_core.types import Document
|
||||||
from PIL import ImageDraw
|
from PIL import ImageDraw
|
||||||
|
from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
|
||||||
|
|
||||||
from docling.backend.abstract_backend import PdfDocumentBackend
|
from docling.backend.abstract_backend import PdfDocumentBackend
|
||||||
from docling.datamodel.base_models import (
|
from docling.datamodel.base_models import (
|
||||||
@ -32,6 +36,7 @@ _log = logging.getLogger(__name__)
|
|||||||
class DocumentConverter:
|
class DocumentConverter:
|
||||||
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
|
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
|
||||||
_table_model_path = "model_artifacts/tableformer"
|
_table_model_path = "model_artifacts/tableformer"
|
||||||
|
_default_download_filename = "file.pdf"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@ -80,6 +85,57 @@ class DocumentConverter:
|
|||||||
# Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
|
# Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
|
||||||
yield from map(self.process_document, input_batch)
|
yield from map(self.process_document, input_batch)
|
||||||
|
|
||||||
|
def convert_single(self, source: Path | AnyHttpUrl | str) -> Document:
    """Convert a single document.

    If ``source`` validates as an HTTP(S) URL, the file is first downloaded
    into a temporary directory that is removed again before this method
    returns. Otherwise ``source`` is interpreted as a local file path.

    Args:
        source (Path | AnyHttpUrl | str): The PDF input source. Can be a path or URL.

    Raises:
        ValueError: If source is of unexpected type.
        RuntimeError: If conversion fails or produces no document.
        requests.HTTPError: If the download responds with an error status.

    Returns:
        Document: The converted document object.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        # Only the URL validation can raise ValidationError, so keep the
        # ``try`` body minimal and do the download in the ``else`` branch.
        try:
            http_url: AnyHttpUrl = TypeAdapter(AnyHttpUrl).validate_python(source)
        except ValidationError:
            # Not a URL: fall back to treating the source as a local path.
            try:
                local_path = TypeAdapter(Path).validate_python(source)
            except ValidationError as err:
                raise ValueError(
                    f"Unexpected file path type encountered: {type(source)}"
                ) from err
        else:
            # NOTE(review): no timeout is set, so a stalled server blocks
            # this call indefinitely -- consider passing ``timeout=``.
            # The context manager releases the streamed connection even if
            # the download fails half-way.
            with requests.get(str(http_url), stream=True) as res:
                res.raise_for_status()

                fname = None
                # try to get filename from response header
                if cont_disp := res.headers.get("Content-Disposition"):
                    for par in cont_disp.strip().split(";"):
                        # currently only handling directive "filename" (not "*filename")
                        key, _, value = par.partition("=")
                        if key.strip().lower() == "filename":
                            fname = value.strip().strip("'\"") or None
                            break

                if fname is not None:
                    # The header is server-controlled: keep only the final
                    # path component so the file cannot be written outside
                    # of temp_dir (absolute paths, "../" segments).
                    fname = Path(fname.replace("\\", "/")).name

                # otherwise, use name from URL:
                if fname in (None, "", ".", ".."):
                    fname = Path(http_url.path or "").name

                # last resort: a fixed default name
                if fname in ("", ".", ".."):
                    fname = self._default_download_filename

                local_path = Path(temp_dir) / fname
                with open(local_path, "wb") as f:
                    for chunk in res.iter_content(chunk_size=1024):  # using 1-KB chunks
                        f.write(chunk)

        # The conversion must run while temp_dir still exists, because a
        # downloaded file lives inside it.
        conv_inp = DocumentConversionInput.from_paths(paths=[local_path])
        converted_doc: ConvertedDocument | None = next(self.convert(conv_inp), None)
        if converted_doc is None:
            # An empty result iterator would otherwise leak StopIteration.
            raise RuntimeError("Conversion failed: no document was produced")
        if converted_doc.status not in {
            ConversionStatus.SUCCESS,
            ConversionStatus.SUCCESS_WITH_ERRORS,
        }:
            raise RuntimeError(f"Conversion failed with status: {converted_doc.status}")
        return converted_doc.to_ds_document()
|
||||||
|
|
||||||
def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
|
def process_document(self, in_doc: InputDocument) -> ConvertedDocument:
|
||||||
start_doc_time = time.time()
|
start_doc_time = time.time()
|
||||||
converted_doc = ConvertedDocument(input=in_doc)
|
converted_doc = ConvertedDocument(input=in_doc)
|
||||||
|
3
poetry.lock
generated
3
poetry.lock
generated
@ -2510,6 +2510,7 @@ description = "Nvidia JIT LTO Library"
|
|||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3"
|
python-versions = ">=3"
|
||||||
files = [
|
files = [
|
||||||
|
{file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_aarch64.whl", hash = "sha256:98103729cc5226e13ca319a10bbf9433bbbd44ef64fe72f45f067cacc14b8d27"},
|
||||||
{file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f9b37bc5c8cf7509665cb6ada5aaa0ce65618f2332b7d3e78e9790511f111212"},
|
{file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f9b37bc5c8cf7509665cb6ada5aaa0ce65618f2332b7d3e78e9790511f111212"},
|
||||||
{file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-win_amd64.whl", hash = "sha256:e782564d705ff0bf61ac3e1bf730166da66dd2fe9012f111ede5fc49b64ae697"},
|
{file = "nvidia_nvjitlink_cu12-12.5.82-py3-none-win_amd64.whl", hash = "sha256:e782564d705ff0bf61ac3e1bf730166da66dd2fe9012f111ede5fc49b64ae697"},
|
||||||
]
|
]
|
||||||
@ -4881,4 +4882,4 @@ ocr = ["easyocr"]
|
|||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.10"
|
python-versions = "^3.10"
|
||||||
content-hash = "3ffc5161fd49fe2186ee2afbb3319922964661c769c434fc7386aae40f4aab19"
|
content-hash = "dcb00c6601f61b087fd204d040149c20a7dcd72ab353e912e78dc265c86e4d00"
|
||||||
|
@ -30,6 +30,7 @@ filetype = "^1.2.0"
|
|||||||
pypdfium2 = "^4.30.0"
|
pypdfium2 = "^4.30.0"
|
||||||
pydantic-settings = "^2.3.0"
|
pydantic-settings = "^2.3.0"
|
||||||
huggingface_hub = ">=0.23,<1"
|
huggingface_hub = ">=0.23,<1"
|
||||||
|
requests = "^2.32.3"
|
||||||
easyocr = { version = "^1.7", optional = true }
|
easyocr = { version = "^1.7", optional = true }
|
||||||
|
|
||||||
[tool.poetry.group.dev.dependencies]
|
[tool.poetry.group.dev.dependencies]
|
||||||
|
Loading…
Reference in New Issue
Block a user