docs: add automatic api reference (#475)

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2024-12-02 09:55:52 +01:00 committed by GitHub
parent 8ccb3c6db6
commit d4872103b8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 253 additions and 11 deletions

View File

@ -6,11 +6,15 @@ from pydantic import BaseModel, ConfigDict, Field
class TableFormerMode(str, Enum):
"""Modes for the TableFormer model."""
FAST = "fast"
ACCURATE = "accurate"
class TableStructureOptions(BaseModel):
"""Options for the table structure."""
do_cell_matching: bool = (
True
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
@ -21,6 +25,8 @@ class TableStructureOptions(BaseModel):
class OcrOptions(BaseModel):
"""OCR options."""
kind: str
lang: List[str]
force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
@ -30,6 +36,8 @@ class OcrOptions(BaseModel):
class RapidOcrOptions(OcrOptions):
"""Options for the RapidOCR engine."""
kind: Literal["rapidocr"] = "rapidocr"
# English and Chinese are the most commonly used models and have been tested with RapidOCR.
@ -66,6 +74,8 @@ class RapidOcrOptions(OcrOptions):
class EasyOcrOptions(OcrOptions):
"""Options for the EasyOCR engine."""
kind: Literal["easyocr"] = "easyocr"
lang: List[str] = ["fr", "de", "es", "en"]
use_gpu: bool = True # same default as easyocr.Reader
@ -79,6 +89,8 @@ class EasyOcrOptions(OcrOptions):
class TesseractCliOcrOptions(OcrOptions):
"""Options for the TesseractCli engine."""
kind: Literal["tesseract"] = "tesseract"
lang: List[str] = ["fra", "deu", "spa", "eng"]
tesseract_cmd: str = "tesseract"
@ -90,6 +102,8 @@ class TesseractCliOcrOptions(OcrOptions):
class TesseractOcrOptions(OcrOptions):
"""Options for the Tesseract engine."""
kind: Literal["tesserocr"] = "tesserocr"
lang: List[str] = ["fra", "deu", "spa", "eng"]
path: Optional[str] = None
@ -100,6 +114,8 @@ class TesseractOcrOptions(OcrOptions):
class OcrMacOptions(OcrOptions):
"""Options for the Mac OCR engine."""
kind: Literal["ocrmac"] = "ocrmac"
lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
recognition: str = "accurate"
@ -111,12 +127,16 @@ class OcrMacOptions(OcrOptions):
class PipelineOptions(BaseModel):
"""Base pipeline options."""
create_legacy_output: bool = (
True # This default will be set to False in a future version of docling
)
class PdfPipelineOptions(PipelineOptions):
"""Options for the PDF pipeline."""
artifacts_path: Optional[Union[Path, str]] = None
do_table_structure: bool = True # True: perform table structure extraction
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text

View File

@ -0,0 +1,52 @@
# Docling Document
This is an automatically generated API reference of the DoclingDocument type.
::: docling_core.types.doc
handler: python
options:
members:
- DoclingDocument
- DocumentOrigin
- DocItem
- DocItemLabel
- ProvenanceItem
- GroupItem
- GroupLabel
- NodeItem
- PageItem
- FloatingItem
- TextItem
- TableItem
- TableCell
- TableData
- TableCellLabel
- KeyValueItem
- SectionHeaderItem
- PictureItem
- ImageRef
- PictureClassificationClass
- PictureClassificationData
- RefItem
- BoundingBox
- CoordOrigin
- ImageRefMode
- Size
show_if_no_docstring: true
show_submodules: true
docstring_section_style: list
filters: ["!^_"]
heading_level: 2
show_root_toc_entry: true
inherited_members: true
merge_init_into_class: true
separate_signature: true
show_root_heading: true
show_root_full_path: false
show_signature_annotations: true
show_source: false
show_symbol_type_heading: true
show_symbol_type_toc: true
show_labels: false
signature_crossrefs: true
summary: true

View File

@ -0,0 +1,38 @@
# Document converter
This is an automatically generated API reference of the main components of Docling.
::: docling.document_converter
handler: python
options:
members:
- DocumentConverter
- ConversionResult
- ConversionStatus
- FormatOption
- InputFormat
- PdfFormatOption
- ImageFormatOption
- StandardPdfPipeline
- WordFormatOption
- PowerpointFormatOption
- MarkdownFormatOption
- AsciiDocFormatOption
- HTMLFormatOption
- SimplePipeline
show_if_no_docstring: true
show_submodules: true
docstring_section_style: list
filters: ["!^_"]
heading_level: 2
inherited_members: true
merge_init_into_class: true
separate_signature: true
show_root_heading: true
show_root_full_path: false
show_signature_annotations: true
show_source: false
show_symbol_type_heading: true
show_symbol_type_toc: true
signature_crossrefs: true
summary: true

View File

@ -0,0 +1,36 @@
# Pipeline options
Pipeline options allow customizing the execution of the models during the conversion pipeline.
This includes options for the OCR engines, the table model as well as enrichment options which
can be enabled with `do_xyz = True`.
This is an automatically generated API reference of all the pipeline options available in Docling.
::: docling.datamodel.pipeline_options
handler: python
options:
show_if_no_docstring: true
show_submodules: true
docstring_section_style: list
filters: ["!^_"]
heading_level: 2
inherited_members: true
merge_init_into_class: true
separate_signature: true
show_root_heading: true
show_root_full_path: false
show_signature_annotations: true
show_source: false
show_symbol_type_heading: true
show_symbol_type_toc: true
signature_crossrefs: true
summary: true
<!-- ::: docling.document_converter.DocumentConverter
handler: python
options:
show_if_no_docstring: true
show_submodules: true -->

View File

@ -95,8 +95,10 @@ nav:
- "Prodigy": integrations/prodigy.md
- "spaCy": integrations/spacy.md
# - "LangChain 🦜🔗": integrations/langchain.md
# - API reference:
# - API reference: api_reference/index.md
- API reference:
- Document Converter: api_reference/document_converter.md
- Pipeline options: api_reference/pipeline_options.md
- Docling Document: api_reference/docling_document.md
markdown_extensions:
- pymdownx.superfences
@ -112,12 +114,15 @@ markdown_extensions:
plugins:
- search
- mkdocs-jupyter
# - mkdocstrings:
# default_handler: python
# options:
# preload_modules:
# - docling
# - docling_core
- mkdocstrings:
default_handler: python
options:
extensions:
- griffe_pydantic:
schema: true
preload_modules:
- docling
- docling_core
extra_css:
- stylesheets/extra.css

95
poetry.lock generated
View File

@ -1365,6 +1365,34 @@ gitdb = ">=4.0.1,<5"
doc = ["sphinx (==4.3.2)", "sphinx-autodoc-typehints", "sphinx-rtd-theme", "sphinxcontrib-applehelp (>=1.0.2,<=1.0.4)", "sphinxcontrib-devhelp (==1.0.2)", "sphinxcontrib-htmlhelp (>=2.0.0,<=2.0.1)", "sphinxcontrib-qthelp (==1.0.3)", "sphinxcontrib-serializinghtml (==1.1.5)"]
test = ["coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit", "pytest (>=7.3.1)", "pytest-cov", "pytest-instafail", "pytest-mock", "pytest-sugar", "typing-extensions"]
[[package]]
name = "griffe"
version = "1.5.1"
description = "Signatures for entire Python programs. Extract the structure, the frame, the skeleton of your project, to generate API documentation or find breaking changes in your API."
optional = false
python-versions = ">=3.9"
files = [
{file = "griffe-1.5.1-py3-none-any.whl", hash = "sha256:ad6a7980f8c424c9102160aafa3bcdf799df0e75f7829d75af9ee5aef656f860"},
{file = "griffe-1.5.1.tar.gz", hash = "sha256:72964f93e08c553257706d6cd2c42d1c172213feb48b2be386f243380b405d4b"},
]
[package.dependencies]
colorama = ">=0.4"
[[package]]
name = "griffe-pydantic"
version = "1.1.0"
description = "Griffe extension for Pydantic."
optional = false
python-versions = ">=3.9"
files = [
{file = "griffe_pydantic-1.1.0-py3-none-any.whl", hash = "sha256:ac9cc2d9b016cf302d8d9f577c9b3ca2793d88060f500d0b2a65f33a4a785cf1"},
{file = "griffe_pydantic-1.1.0.tar.gz", hash = "sha256:9c5a701cc485dab087857c1ac960b44671acee5008aaae0752f610b2aa82b068"},
]
[package.dependencies]
griffe = ">=0.49"
[[package]]
name = "grpcio"
version = "1.67.1"
@ -2640,6 +2668,22 @@ watchdog = ">=2.0"
i18n = ["babel (>=2.9.0)"]
min-versions = ["babel (==2.9.0)", "click (==7.0)", "colorama (==0.4)", "ghp-import (==1.0)", "importlib-metadata (==4.4)", "jinja2 (==2.11.1)", "markdown (==3.3.6)", "markupsafe (==2.0.1)", "mergedeep (==1.3.4)", "mkdocs-get-deps (==0.2.0)", "packaging (==20.5)", "pathspec (==0.11.1)", "pyyaml (==5.1)", "pyyaml-env-tag (==0.1)", "watchdog (==2.0)"]
[[package]]
name = "mkdocs-autorefs"
version = "1.2.0"
description = "Automatically link across pages in MkDocs."
optional = false
python-versions = ">=3.8"
files = [
{file = "mkdocs_autorefs-1.2.0-py3-none-any.whl", hash = "sha256:d588754ae89bd0ced0c70c06f58566a4ee43471eeeee5202427da7de9ef85a2f"},
{file = "mkdocs_autorefs-1.2.0.tar.gz", hash = "sha256:a86b93abff653521bda71cf3fc5596342b7a23982093915cb74273f67522190f"},
]
[package.dependencies]
Markdown = ">=3.3"
markupsafe = ">=2.0.1"
mkdocs = ">=1.1"
[[package]]
name = "mkdocs-click"
version = "0.8.1"
@ -2731,6 +2775,51 @@ files = [
{file = "mkdocs_material_extensions-1.3.1.tar.gz", hash = "sha256:10c9511cea88f568257f960358a467d12b970e1f7b2c0e5fb2bb48cab1928443"},
]
[[package]]
name = "mkdocstrings"
version = "0.27.0"
description = "Automatic documentation from sources, for MkDocs."
optional = false
python-versions = ">=3.9"
files = [
{file = "mkdocstrings-0.27.0-py3-none-any.whl", hash = "sha256:6ceaa7ea830770959b55a16203ac63da24badd71325b96af950e59fd37366332"},
{file = "mkdocstrings-0.27.0.tar.gz", hash = "sha256:16adca6d6b0a1f9e0c07ff0b02ced8e16f228a9d65a37c063ec4c14d7b76a657"},
]
[package.dependencies]
click = ">=7.0"
importlib-metadata = {version = ">=4.6", markers = "python_version < \"3.10\""}
Jinja2 = ">=2.11.1"
Markdown = ">=3.6"
MarkupSafe = ">=1.1"
mkdocs = ">=1.4"
mkdocs-autorefs = ">=1.2"
mkdocstrings-python = {version = ">=0.5.2", optional = true, markers = "extra == \"python\""}
platformdirs = ">=2.2"
pymdown-extensions = ">=6.3"
typing-extensions = {version = ">=4.1", markers = "python_version < \"3.10\""}
[package.extras]
crystal = ["mkdocstrings-crystal (>=0.3.4)"]
python = ["mkdocstrings-python (>=0.5.2)"]
python-legacy = ["mkdocstrings-python-legacy (>=0.2.1)"]
[[package]]
name = "mkdocstrings-python"
version = "1.12.2"
description = "A Python handler for mkdocstrings."
optional = false
python-versions = ">=3.9"
files = [
{file = "mkdocstrings_python-1.12.2-py3-none-any.whl", hash = "sha256:7f7d40d6db3cb1f5d19dbcd80e3efe4d0ba32b073272c0c0de9de2e604eda62a"},
{file = "mkdocstrings_python-1.12.2.tar.gz", hash = "sha256:7a1760941c0b52a2cd87b960a9e21112ffe52e7df9d0b9583d04d47ed2e186f3"},
]
[package.dependencies]
griffe = ">=0.49"
mkdocs-autorefs = ">=1.2"
mkdocstrings = ">=0.26"
[[package]]
name = "more-itertools"
version = "10.5.0"
@ -3676,9 +3765,9 @@ numpy = [
{version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
{version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
]
[[package]]
@ -3702,9 +3791,9 @@ numpy = [
{version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
{version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
]
[[package]]
@ -7557,4 +7646,4 @@ tesserocr = ["tesserocr"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "3be886856c0f11033cfb7cb8bc30e5d59d7bb9804df9da9572b3cfbc2f6c3c56"
content-hash = "2e7c27ffe32d556a66cc1008a7147a90c17f63b01d2a6cde3e7b941ba7e268d7"

View File

@ -80,6 +80,8 @@ types-openpyxl = "^3.1.5.20241114"
mkdocs-material = "^9.5.40"
mkdocs-jupyter = "^0.25.0"
mkdocs-click = "^0.8.1"
mkdocstrings = {extras = ["python"], version = "^0.27.0"}
griffe-pydantic = "^1.1.0"
[tool.poetry.group.examples.dependencies]
datasets = "^2.21.0"