docs: add automatic api reference (#475)
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
8ccb3c6db6
commit
d4872103b8
@ -6,11 +6,15 @@ from pydantic import BaseModel, ConfigDict, Field
|
||||
|
||||
|
||||
class TableFormerMode(str, Enum):
|
||||
"""Modes for the TableFormer model."""
|
||||
|
||||
FAST = "fast"
|
||||
ACCURATE = "accurate"
|
||||
|
||||
|
||||
class TableStructureOptions(BaseModel):
|
||||
"""Options for the table structure."""
|
||||
|
||||
do_cell_matching: bool = (
|
||||
True
|
||||
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
|
||||
@ -21,6 +25,8 @@ class TableStructureOptions(BaseModel):
|
||||
|
||||
|
||||
class OcrOptions(BaseModel):
|
||||
"""OCR options."""
|
||||
|
||||
kind: str
|
||||
lang: List[str]
|
||||
force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
|
||||
@ -30,6 +36,8 @@ class OcrOptions(BaseModel):
|
||||
|
||||
|
||||
class RapidOcrOptions(OcrOptions):
|
||||
"""Options for the RapidOCR engine."""
|
||||
|
||||
kind: Literal["rapidocr"] = "rapidocr"
|
||||
|
||||
# English and Chinese are the most commonly used models and have been tested with RapidOCR.
|
||||
@ -66,6 +74,8 @@ class RapidOcrOptions(OcrOptions):
|
||||
|
||||
|
||||
class EasyOcrOptions(OcrOptions):
|
||||
"""Options for the EasyOCR engine."""
|
||||
|
||||
kind: Literal["easyocr"] = "easyocr"
|
||||
lang: List[str] = ["fr", "de", "es", "en"]
|
||||
use_gpu: bool = True # same default as easyocr.Reader
|
||||
@ -79,6 +89,8 @@ class EasyOcrOptions(OcrOptions):
|
||||
|
||||
|
||||
class TesseractCliOcrOptions(OcrOptions):
|
||||
"""Options for the TesseractCli engine."""
|
||||
|
||||
kind: Literal["tesseract"] = "tesseract"
|
||||
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
||||
tesseract_cmd: str = "tesseract"
|
||||
@ -90,6 +102,8 @@ class TesseractCliOcrOptions(OcrOptions):
|
||||
|
||||
|
||||
class TesseractOcrOptions(OcrOptions):
|
||||
"""Options for the Tesseract engine."""
|
||||
|
||||
kind: Literal["tesserocr"] = "tesserocr"
|
||||
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
||||
path: Optional[str] = None
|
||||
@ -100,6 +114,8 @@ class TesseractOcrOptions(OcrOptions):
|
||||
|
||||
|
||||
class OcrMacOptions(OcrOptions):
|
||||
"""Options for the Mac OCR engine."""
|
||||
|
||||
kind: Literal["ocrmac"] = "ocrmac"
|
||||
lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
|
||||
recognition: str = "accurate"
|
||||
@ -111,12 +127,16 @@ class OcrMacOptions(OcrOptions):
|
||||
|
||||
|
||||
class PipelineOptions(BaseModel):
|
||||
"""Base pipeline options."""
|
||||
|
||||
create_legacy_output: bool = (
|
||||
True # This default will be set to False in a future version of docling
|
||||
)
|
||||
|
||||
|
||||
class PdfPipelineOptions(PipelineOptions):
|
||||
"""Options for the PDF pipeline."""
|
||||
|
||||
artifacts_path: Optional[Union[Path, str]] = None
|
||||
do_table_structure: bool = True # True: perform table structure extraction
|
||||
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
||||
|
52
docs/api_reference/docling_document.md
Normal file
52
docs/api_reference/docling_document.md
Normal file
@ -0,0 +1,52 @@
|
||||
# Docling Document
|
||||
|
||||
This is an automatically generated API reference of the DoclingDocument type.
|
||||
|
||||
::: docling_core.types.doc
|
||||
handler: python
|
||||
options:
|
||||
members:
|
||||
- DoclingDocument
|
||||
- DocumentOrigin
|
||||
- DocItem
|
||||
- DocItemLabel
|
||||
- ProvenanceItem
|
||||
- GroupItem
|
||||
- GroupLabel
|
||||
- NodeItem
|
||||
- PageItem
|
||||
- FloatingItem
|
||||
- TextItem
|
||||
- TableItem
|
||||
- TableCell
|
||||
- TableData
|
||||
- TableCellLabel
|
||||
- KeyValueItem
|
||||
- SectionHeaderItem
|
||||
- PictureItem
|
||||
- ImageRef
|
||||
- PictureClassificationClass
|
||||
- PictureClassificationData
|
||||
- RefItem
|
||||
- BoundingBox
|
||||
- CoordOrigin
|
||||
- ImageRefMode
|
||||
- Size
|
||||
show_if_no_docstring: true
|
||||
show_submodules: true
|
||||
docstring_section_style: list
|
||||
filters: ["!^_"]
|
||||
heading_level: 2
|
||||
show_root_toc_entry: true
|
||||
inherited_members: true
|
||||
merge_init_into_class: true
|
||||
separate_signature: true
|
||||
show_root_heading: true
|
||||
show_root_full_path: false
|
||||
show_signature_annotations: true
|
||||
show_source: false
|
||||
show_symbol_type_heading: true
|
||||
show_symbol_type_toc: true
|
||||
show_labels: false
|
||||
signature_crossrefs: true
|
||||
summary: true
|
38
docs/api_reference/document_converter.md
Normal file
38
docs/api_reference/document_converter.md
Normal file
@ -0,0 +1,38 @@
|
||||
# Document converter
|
||||
|
||||
This is an automatically generated API reference of the main components of Docling.
|
||||
|
||||
::: docling.document_converter
|
||||
handler: python
|
||||
options:
|
||||
members:
|
||||
- DocumentConverter
|
||||
- ConversionResult
|
||||
- ConversionStatus
|
||||
- FormatOption
|
||||
- InputFormat
|
||||
- PdfFormatOption
|
||||
- ImageFormatOption
|
||||
- StandardPdfPipeline
|
||||
- WordFormatOption
|
||||
- PowerpointFormatOption
|
||||
- MarkdownFormatOption
|
||||
- AsciiDocFormatOption
|
||||
- HTMLFormatOption
|
||||
- SimplePipeline
|
||||
show_if_no_docstring: true
|
||||
show_submodules: true
|
||||
docstring_section_style: list
|
||||
filters: ["!^_"]
|
||||
heading_level: 2
|
||||
inherited_members: true
|
||||
merge_init_into_class: true
|
||||
separate_signature: true
|
||||
show_root_heading: true
|
||||
show_root_full_path: false
|
||||
show_signature_annotations: true
|
||||
show_source: false
|
||||
show_symbol_type_heading: true
|
||||
show_symbol_type_toc: true
|
||||
signature_crossrefs: true
|
||||
summary: true
|
36
docs/api_reference/pipeline_options.md
Normal file
36
docs/api_reference/pipeline_options.md
Normal file
@ -0,0 +1,36 @@
|
||||
# Pipeline options
|
||||
|
||||
Pipeline options allow customizing the execution of the models during the conversion pipeline.
|
||||
This includes options for the OCR engines, the table model as well as enrichment options which
|
||||
can be enabled with `do_xyz = True`.
|
||||
|
||||
|
||||
This is an automatically generated API reference of all the pipeline options available in Docling.
|
||||
|
||||
|
||||
::: docling.datamodel.pipeline_options
|
||||
handler: python
|
||||
options:
|
||||
show_if_no_docstring: true
|
||||
show_submodules: true
|
||||
docstring_section_style: list
|
||||
filters: ["!^_"]
|
||||
heading_level: 2
|
||||
inherited_members: true
|
||||
merge_init_into_class: true
|
||||
separate_signature: true
|
||||
show_root_heading: true
|
||||
show_root_full_path: false
|
||||
show_signature_annotations: true
|
||||
show_source: false
|
||||
show_symbol_type_heading: true
|
||||
show_symbol_type_toc: true
|
||||
signature_crossrefs: true
|
||||
summary: true
|
||||
|
||||
<!-- ::: docling.document_converter.DocumentConverter
|
||||
handler: python
|
||||
options:
|
||||
show_if_no_docstring: true
|
||||
show_submodules: true -->
|
||||
|
21
mkdocs.yml
21
mkdocs.yml
@ -95,8 +95,10 @@ nav:
|
||||
- "Prodigy": integrations/prodigy.md
|
||||
- "spaCy": integrations/spacy.md
|
||||
# - "LangChain 🦜🔗": integrations/langchain.md
|
||||
# - API reference:
|
||||
# - API reference: api_reference/index.md
|
||||
- API reference:
|
||||
- Document Converter: api_reference/document_converter.md
|
||||
- Pipeline options: api_reference/pipeline_options.md
|
||||
- Docling Document: api_reference/docling_document.md
|
||||
|
||||
markdown_extensions:
|
||||
- pymdownx.superfences
|
||||
@ -112,12 +114,15 @@ markdown_extensions:
|
||||
plugins:
|
||||
- search
|
||||
- mkdocs-jupyter
|
||||
# - mkdocstrings:
|
||||
# default_handler: python
|
||||
# options:
|
||||
# preload_modules:
|
||||
# - docling
|
||||
# - docling_core
|
||||
- mkdocstrings:
|
||||
default_handler: python
|
||||
options:
|
||||
extensions:
|
||||
- griffe_pydantic:
|
||||
schema: true
|
||||
preload_modules:
|
||||
- docling
|
||||
- docling_core
|
||||
|
||||
extra_css:
|
||||
- stylesheets/extra.css
|
||||
|
95
poetry.lock
generated
95
poetry.lock
generated
@ -1365,6 +1365,34 @@ gitdb = ">=4.0.1,<5"
|
||||
doc = ["sphinx (==4.3.2)", "sphinx-autodoc-typehints", "sphinx-rtd-theme", "sphinxcontrib-applehelp (>=1.0.2,<=1.0.4)", "sphinxcontrib-devhelp (==1.0.2)", "sphinxcontrib-htmlhelp (>=2.0.0,<=2.0.1)", "sphinxcontrib-qthelp (==1.0.3)", "sphinxcontrib-serializinghtml (==1.1.5)"]
|
||||
test = ["coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit", "pytest (>=7.3.1)", "pytest-cov", "pytest-instafail", "pytest-mock", "pytest-sugar", "typing-extensions"]
|
||||
|
||||
[[package]]
|
||||
name = "griffe"
|
||||
version = "1.5.1"
|
||||
description = "Signatures for entire Python programs. Extract the structure, the frame, the skeleton of your project, to generate API documentation or find breaking changes in your API."
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
files = [
|
||||
{file = "griffe-1.5.1-py3-none-any.whl", hash = "sha256:ad6a7980f8c424c9102160aafa3bcdf799df0e75f7829d75af9ee5aef656f860"},
|
||||
{file = "griffe-1.5.1.tar.gz", hash = "sha256:72964f93e08c553257706d6cd2c42d1c172213feb48b2be386f243380b405d4b"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
colorama = ">=0.4"
|
||||
|
||||
[[package]]
|
||||
name = "griffe-pydantic"
|
||||
version = "1.1.0"
|
||||
description = "Griffe extension for Pydantic."
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
files = [
|
||||
{file = "griffe_pydantic-1.1.0-py3-none-any.whl", hash = "sha256:ac9cc2d9b016cf302d8d9f577c9b3ca2793d88060f500d0b2a65f33a4a785cf1"},
|
||||
{file = "griffe_pydantic-1.1.0.tar.gz", hash = "sha256:9c5a701cc485dab087857c1ac960b44671acee5008aaae0752f610b2aa82b068"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
griffe = ">=0.49"
|
||||
|
||||
[[package]]
|
||||
name = "grpcio"
|
||||
version = "1.67.1"
|
||||
@ -2640,6 +2668,22 @@ watchdog = ">=2.0"
|
||||
i18n = ["babel (>=2.9.0)"]
|
||||
min-versions = ["babel (==2.9.0)", "click (==7.0)", "colorama (==0.4)", "ghp-import (==1.0)", "importlib-metadata (==4.4)", "jinja2 (==2.11.1)", "markdown (==3.3.6)", "markupsafe (==2.0.1)", "mergedeep (==1.3.4)", "mkdocs-get-deps (==0.2.0)", "packaging (==20.5)", "pathspec (==0.11.1)", "pyyaml (==5.1)", "pyyaml-env-tag (==0.1)", "watchdog (==2.0)"]
|
||||
|
||||
[[package]]
|
||||
name = "mkdocs-autorefs"
|
||||
version = "1.2.0"
|
||||
description = "Automatically link across pages in MkDocs."
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "mkdocs_autorefs-1.2.0-py3-none-any.whl", hash = "sha256:d588754ae89bd0ced0c70c06f58566a4ee43471eeeee5202427da7de9ef85a2f"},
|
||||
{file = "mkdocs_autorefs-1.2.0.tar.gz", hash = "sha256:a86b93abff653521bda71cf3fc5596342b7a23982093915cb74273f67522190f"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
Markdown = ">=3.3"
|
||||
markupsafe = ">=2.0.1"
|
||||
mkdocs = ">=1.1"
|
||||
|
||||
[[package]]
|
||||
name = "mkdocs-click"
|
||||
version = "0.8.1"
|
||||
@ -2731,6 +2775,51 @@ files = [
|
||||
{file = "mkdocs_material_extensions-1.3.1.tar.gz", hash = "sha256:10c9511cea88f568257f960358a467d12b970e1f7b2c0e5fb2bb48cab1928443"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "mkdocstrings"
|
||||
version = "0.27.0"
|
||||
description = "Automatic documentation from sources, for MkDocs."
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
files = [
|
||||
{file = "mkdocstrings-0.27.0-py3-none-any.whl", hash = "sha256:6ceaa7ea830770959b55a16203ac63da24badd71325b96af950e59fd37366332"},
|
||||
{file = "mkdocstrings-0.27.0.tar.gz", hash = "sha256:16adca6d6b0a1f9e0c07ff0b02ced8e16f228a9d65a37c063ec4c14d7b76a657"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
click = ">=7.0"
|
||||
importlib-metadata = {version = ">=4.6", markers = "python_version < \"3.10\""}
|
||||
Jinja2 = ">=2.11.1"
|
||||
Markdown = ">=3.6"
|
||||
MarkupSafe = ">=1.1"
|
||||
mkdocs = ">=1.4"
|
||||
mkdocs-autorefs = ">=1.2"
|
||||
mkdocstrings-python = {version = ">=0.5.2", optional = true, markers = "extra == \"python\""}
|
||||
platformdirs = ">=2.2"
|
||||
pymdown-extensions = ">=6.3"
|
||||
typing-extensions = {version = ">=4.1", markers = "python_version < \"3.10\""}
|
||||
|
||||
[package.extras]
|
||||
crystal = ["mkdocstrings-crystal (>=0.3.4)"]
|
||||
python = ["mkdocstrings-python (>=0.5.2)"]
|
||||
python-legacy = ["mkdocstrings-python-legacy (>=0.2.1)"]
|
||||
|
||||
[[package]]
|
||||
name = "mkdocstrings-python"
|
||||
version = "1.12.2"
|
||||
description = "A Python handler for mkdocstrings."
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
files = [
|
||||
{file = "mkdocstrings_python-1.12.2-py3-none-any.whl", hash = "sha256:7f7d40d6db3cb1f5d19dbcd80e3efe4d0ba32b073272c0c0de9de2e604eda62a"},
|
||||
{file = "mkdocstrings_python-1.12.2.tar.gz", hash = "sha256:7a1760941c0b52a2cd87b960a9e21112ffe52e7df9d0b9583d04d47ed2e186f3"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
griffe = ">=0.49"
|
||||
mkdocs-autorefs = ">=1.2"
|
||||
mkdocstrings = ">=0.26"
|
||||
|
||||
[[package]]
|
||||
name = "more-itertools"
|
||||
version = "10.5.0"
|
||||
@ -3676,9 +3765,9 @@ numpy = [
|
||||
{version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
|
||||
{version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
|
||||
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
||||
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
|
||||
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
|
||||
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
|
||||
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -3702,9 +3791,9 @@ numpy = [
|
||||
{version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
|
||||
{version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
|
||||
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
||||
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
|
||||
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
|
||||
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
|
||||
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -7557,4 +7646,4 @@ tesserocr = ["tesserocr"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "3be886856c0f11033cfb7cb8bc30e5d59d7bb9804df9da9572b3cfbc2f6c3c56"
|
||||
content-hash = "2e7c27ffe32d556a66cc1008a7147a90c17f63b01d2a6cde3e7b941ba7e268d7"
|
||||
|
@ -80,6 +80,8 @@ types-openpyxl = "^3.1.5.20241114"
|
||||
mkdocs-material = "^9.5.40"
|
||||
mkdocs-jupyter = "^0.25.0"
|
||||
mkdocs-click = "^0.8.1"
|
||||
mkdocstrings = {extras = ["python"], version = "^0.27.0"}
|
||||
griffe-pydantic = "^1.1.0"
|
||||
|
||||
[tool.poetry.group.examples.dependencies]
|
||||
datasets = "^2.21.0"
|
||||
|
Loading…
Reference in New Issue
Block a user