diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml index 31a6e95..622338b 100644 --- a/.github/workflows/cd.yml +++ b/.github/workflows/cd.yml @@ -10,19 +10,14 @@ env: PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring jobs: - # To be enabled when we add docs - # docs: - # permissions: - # contents: write - # runs-on: ubuntu-latest - # steps: - # - uses: actions/checkout@v3 - # - uses: ./.github/actions/setup-poetry - # - name: Build and push docs - # run: poetry run mkdocs gh-deploy --force - code-checks: uses: ./.github/workflows/checks.yml + build-deploy-docs: + uses: ./.github/workflows/docs.yml + with: + deploy: true + permissions: + contents: write pre-release-check: runs-on: ubuntu-latest outputs: diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 81a3174..a1ceca8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -16,13 +16,7 @@ env: jobs: code-checks: uses: ./.github/workflows/checks.yml - - # To enable when we add the ./docs - # build-docs: - # runs-on: ubuntu-latest - # steps: - # - uses: actions/checkout@v3 - # - uses: ./.github/actions/setup-poetry - # - name: Build docs - # run: poetry run mkdocs build --verbose --clean - + build-docs: + uses: ./.github/workflows/docs.yml + with: + deploy: false diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 0000000..d2b9bdd --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,28 @@ +on: + workflow_call: + inputs: + deploy: + type: boolean + description: "If true, the docs will be deployed." 
+ default: false + +jobs: + run-docs: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install poetry + run: pipx install poetry==1.8.3 + shell: bash + - uses: actions/setup-python@v5 + with: + cache: 'poetry' + - name: Install dependencies + run: poetry install --only docs + shell: bash + - name: Build docs + run: poetry run mkdocs build --verbose --clean + - name: Build and push docs + if: inputs.deploy + run: poetry run mkdocs gh-deploy --force + \ No newline at end of file diff --git a/README.md b/README.md index 099a09e..33598dd 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,6 @@ source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL converter = DocumentConverter() result = converter.convert(source) print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]" -print(result.document.export_to_document_tokens()) # output: "<page_1><loc_20>..." ``` diff --git a/docs/concepts/docling_format.md b/docs/concepts/docling_document.md similarity index 75% rename from docs/concepts/docling_format.md rename to docs/concepts/docling_document.md index 0e84e44..00b5452 100644 --- a/docs/concepts/docling_format.md +++ b/docs/concepts/docling_document.md @@ -1,4 +1,4 @@ -With Docling v2, we introduce a unified document representation format called `DoclingDocument`. It is defined as a +With Docling v2, we introduce a unified document representation format called `DoclingDocument`. It is defined as a pydantic datatype, which can express several features common to documents, such as: * Text, Tables, Pictures, and more @@ -9,15 +9,16 @@ pydantic datatype, which can express several features common to documents, such It also brings a set of document construction APIs to build up a `DoclingDocument` from scratch. 
-# Example document structures +## Example document structures -To illustrate the features of the `DoclingDocument` format, consider the following side-by-side comparison of a -`DoclingDocument` converted from `test/data/word_sample.docx`. Left side shows snippets from the converted document -serialized as YAML, right side shows the corresponding visual parts in MS Word. +To illustrate the features of the `DoclingDocument` format, in the subsections below we consider the +`DoclingDocument` converted from `tests/data/word_sample.docx` and we present some side-by-side comparisons, +where the left side shows snippets from the converted document +serialized as YAML and the right one shows the corresponding parts of the original MS Word. -## Basic structure +### Basic structure -A `DoclingDocument` exposes top-level fields for the document content, organized in two categories. +A `DoclingDocument` exposes top-level fields for the document content, organized in two categories. The first category is the _content items_, which are stored in these fields: - `texts`: All items that have a text representation (paragraph, section heading, equation, ...). Base class is `TextItem`. @@ -34,32 +35,34 @@ The second category is _content structure_, which is encapsualted in: - `furniture`: The root node of a tree-structure for all items that don't belong into the body (headers, footers, ...) - `groups`: A set of items that don't represent content, but act as containers for other content items (e.g. a list, a chapter) -All of the above fields are only storing `NodeItem` instances, which reference children and parents -through JSON pointers. +All of the above fields are only storing `NodeItem` instances, which reference children and parents +through JSON pointers. The reading order of the document is encapsulated through the `body` tree and the order of _children_ in each item in the tree. 
+Below example shows how all items in the first page are nested below the `title` item (`#/texts/1`). ![doc_hierarchy_1](../assets/docling_doc_hierarchy_1.png) -## Grouping +### Grouping Below example shows how all items under the heading "Let's swim" (`#/texts/5`) are nested as chilrden. The children of -"Let's swim" are both text items and groups, which contain the list elements. The group items are stored in the +"Let's swim" are both text items and groups, which contain the list elements. The group items are stored in the top-level `groups` field. ![doc_hierarchy_2](../assets/docling_doc_hierarchy_2.png) -## Tables +<!-- +### Tables TBD -## Pictures +### Pictures TBD -## Provenance +### Provenance -TBD \ No newline at end of file +TBD + --> diff --git a/docs/usage.md b/docs/usage.md new file mode 100644 index 0000000..3f9b679 --- /dev/null +++ b/docs/usage.md @@ -0,0 +1,171 @@ +## Conversion + +### Convert a single document + +To convert individual PDF documents, use `convert()`, for example: + +```python +from docling.document_converter import DocumentConverter + +source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL +converter = DocumentConverter() +result = converter.convert(source) +print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]" +``` + +### CLI + +You can also use Docling directly from your command line to convert individual files —be it local or by URL— or whole directories. + +A simple example would look like this: +```console +docling https://arxiv.org/pdf/2206.01062 +``` + +To see all available options (export formats etc.) run `docling --help`. 
+ +<details> + <summary><b>CLI reference</b></summary> + + Here are the available options as of this writing (for an up-to-date listing, run `docling --help`): + + ```console + $ docling --help + + Usage: docling [OPTIONS] source + +╭─ Arguments ───────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ * input_sources source PDF files to convert. Can be local file / directory paths or URL. [default: None] │ +│ [required] │ +╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ +╭─ Options ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ +│ --from [docx|pptx|html|image|pdf] Specify input formats to convert from. │ +│ Defaults to all formats. │ +│ [default: None] │ +│ --to [md|json|text|doctags] Specify output formats. Defaults to │ +│ Markdown. │ +│ [default: None] │ +│ --ocr --no-ocr If enabled, the bitmap content will be │ +│ processed using OCR. │ +│ [default: ocr] │ +│ --ocr-engine [easyocr|tesseract_cli|tesseract] The OCR engine to use. [default: easyocr] │ +│ --abort-on-error --no-abort-on-error If enabled, the bitmap content will be │ +│ processed using OCR. │ +│ [default: no-abort-on-error] │ +│ --output PATH Output directory where results are saved. │ +│ [default: .] │ +│ --version Show version information. │ +│ --help Show this message and exit. │ +╰───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯ + ``` +</details> + + + +### Advanced options + +#### Adjust pipeline features + +The example file [custom_convert.py](./examples/custom_convert.py) contains multiple ways +one can adjust the conversion pipeline and features. 
+ + +##### Control PDF table extraction options + +You can control if table structure recognition should map the recognized structure back to PDF cells (default) or use text cells from the structure prediction itself. +This can improve output quality if you find that multiple columns in extracted tables are erroneously merged into one. + + +```python +from docling.datamodel.base_models import InputFormat +from docling.document_converter import DocumentConverter, PdfFormatOption +from docling.datamodel.pipeline_options import PdfPipelineOptions + +pipeline_options = PdfPipelineOptions(do_table_structure=True) +pipeline_options.table_structure_options.do_cell_matching = False # uses text cells predicted from table structure model + +doc_converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) + } +) +``` + +Since docling 1.16.0: You can control which TableFormer mode you want to use. Choose between `TableFormerMode.FAST` (default) and `TableFormerMode.ACCURATE` (better, but slower) to receive better quality with difficult table structures. 
+ +```python +from docling.datamodel.base_models import InputFormat +from docling.document_converter import DocumentConverter, PdfFormatOption +from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode + +pipeline_options = PdfPipelineOptions(do_table_structure=True) +pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE # use more accurate TableFormer model + +doc_converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) + } +) +``` + +#### Impose limits on the document size + +You can limit the file size and number of pages which should be allowed to process per document: + +```python +from pathlib import Path +from docling.document_converter import DocumentConverter + +source = "https://arxiv.org/pdf/2408.09869" +converter = DocumentConverter() +result = converter.convert(source, max_num_pages=100, max_file_size=20971520) +``` + +#### Convert from binary PDF streams + +You can convert PDFs from a binary stream instead of from the filesystem as follows: + +```python +from io import BytesIO +from docling.datamodel.base_models import DocumentStream +from docling.document_converter import DocumentConverter + +buf = BytesIO(your_binary_stream) +source = DocumentStream(filename="my_doc.pdf", stream=buf) +converter = DocumentConverter() +result = converter.convert(source) +``` + +#### Limit resource usage + +You can limit the CPU threads used by Docling by setting the environment variable `OMP_NUM_THREADS` accordingly. The default setting is using 4 CPU threads. 
+ + +## Chunking + +You can perform a hierarchy-aware chunking of a Docling document as follows: + +```python +from docling.document_converter import DocumentConverter +from docling_core.transforms.chunker import HierarchicalChunker + +conv_res = DocumentConverter().convert("https://arxiv.org/pdf/2206.01062") +doc = conv_res.document +chunks = list(HierarchicalChunker().chunk(doc)) + +print(chunks[30]) +# { +# "text": "Lately, new types of ML models for document-layout analysis have emerged [...]", +# "meta": { +# "doc_items": [{ +# "self_ref": "#/texts/40", +# "label": "text", +# "prov": [{ +# "page_no": 2, +# "bbox": {"l": 317.06, "t": 325.81, "r": 559.18, "b": 239.97, ...}, +# }] +# }], +# "headings": ["2 RELATED WORK"], +# } +# } +``` diff --git a/docs/v2.md b/docs/v2.md index 0e513ec..319679d 100644 --- a/docs/v2.md +++ b/docs/v2.md @@ -2,7 +2,7 @@ Docling v2 introduces several new features: -- Understands and converts PDF, MS Word, MS Powerpoint, HTML and several image formats +- Understands and converts PDF, MS Word, MS Powerpoint, HTML and several image formats - Produces a new, universal document representation which can encapsulate document hierarchy - Comes with a fresh new API and CLI @@ -22,7 +22,7 @@ docling myfile.pdf --to json --to md --no-ocr docling ./input/dir --from pdf # Convert PDF and Word files in input directory to Markdown and JSON -docling ./input/dir --from pdf --from docx --to md --to json --output ./scratch +docling ./input/dir --from pdf --from docx --to md --to json --output ./scratch # Convert all supported files in input directory to Markdown, but abort on first error docling ./input/dir --output ./scratch --abort-on-error @@ -38,8 +38,8 @@ docling ./input/dir --output ./scratch --abort-on-error ### Setting up a `DocumentConverter` To accomodate many input formats, we changed the way you need to set up your `DocumentConverter` object. 
-You can now define a list of allowed formats on the `DocumentConverter` initialization, and specify custom options -per-format if desired. By default, all supported formats are allowed. If you don't provide `format_options`, defaults +You can now define a list of allowed formats on the `DocumentConverter` initialization, and specify custom options +per-format if desired. By default, all supported formats are allowed. If you don't provide `format_options`, defaults will be used for all `allowed_formats`. Format options can include the pipeline class to use, the options to provide to the pipeline, and the document backend. @@ -59,7 +59,7 @@ from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend ## Default initialization still works as before: -# doc_converter = DocumentConverter() +# doc_converter = DocumentConverter() # previous `PipelineOptions` is now `PdfPipelineOptions` @@ -68,7 +68,7 @@ pipeline_options.do_ocr = False pipeline_options.do_table_structure = True #... -## Custom options are now defined per format. +## Custom options are now defined per format. doc_converter = ( DocumentConverter( # all of the below is optional, has internal defaults. allowed_formats=[ @@ -100,8 +100,8 @@ More options are shown in the following example units: ### Converting documents -We have simplified the way you can feed input to the `DocumentConverter` and renamed the conversion methods for -better semantics. You can now call the conversion directly with a single file, or a list of input files, +We have simplified the way you can feed input to the `DocumentConverter` and renamed the conversion methods for +better semantics. You can now call the conversion directly with a single file, or a list of input files, or `DocumentStream` objects, without constructing a `DocumentConversionInput` object first. 
* `DocumentConverter.convert` now converts a single file input (previously `DocumentConverter.convert_single`). @@ -129,7 +129,7 @@ input_files = [ conv_results_iter = doc_converter.convert_all(input_files) # previously `convert_batch` ``` -Through the `raises_on_error` argument, you can also control if the conversion should raise exceptions when first +Through the `raises_on_error` argument, you can also control if the conversion should raise exceptions when first encountering a problem, or resiliently convert all files first and reflect errors in each file's conversion status. By default, any error is immediately raised and the conversion aborts (previously, exceptions were swallowed). @@ -139,7 +139,7 @@ conv_results_iter = doc_converter.convert_all(input_files, raises_on_error=False ``` -### Access document structures +### Access document structures We have simplified how you can access and export the converted document data, too. Our universal document representation is now available in conversion results as a `DoclingDocument` object. @@ -167,7 +167,7 @@ for item, level in conv_result.document.iterate_items: conv_result.legacy_document # provides the representation in previous ExportedCCSDocument type ``` -## Export into JSON, Markdown, Doctags +### Export into JSON, Markdown, Doctags **Note**: All `render_...` methods in `ConversionResult` have been removed in Docling v2, and are now available on `DoclingDocument` as: @@ -184,7 +184,7 @@ print(conv_res.document.export_to_markdown()) print(conv_res.document.export_to_document_tokens()) ``` -**Note**: While it is deprecated, you can _still_ export Docling v1 JSON format. This is available through the same +**Note**: While it is deprecated, you can _still_ export Docling v1 JSON format. 
This is available through the same methods as on the `DoclingDocument` type: ```shell ## Export legacy document representation to desired format, for v1 compatibility: @@ -193,7 +193,7 @@ print(conv_res.legacy_document.export_to_markdown()) print(conv_res.legacy_document.export_to_document_tokens()) ``` -## Reload a `DoclingDocument` stored as JSON +### Reload a `DoclingDocument` stored as JSON You can save and reload a `DoclingDocument` to disk in JSON format using the following codes: @@ -211,3 +211,19 @@ with Path("./doc.json").open("r") as fp: ``` +### Chunking + +Docling v2 defines new base classes for chunking: + +- `BaseMeta` for chunk metadata +- `BaseChunk` containing the chunk text and metadata, and +- `BaseChunker` for chunkers, producing chunks out of a `DoclingDocument`. + +Additionally, it provides an updated `HierarchicalChunker` implementation, which +leverages the new `DoclingDocument` and provides a new, richer chunk output format, including: + +- the respective doc items for grounding +- any applicable headings for context +- any applicable captions for context + +For an example, check out [Chunking usage](../usage/#chunking). 
diff --git a/mkdocs.yml b/mkdocs.yml index 5fd180a..1fef442 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -54,10 +54,10 @@ nav: - Get started: - Home: index.md - Installation: installation.md - - Use Docling: use_docling.md + - Usage: usage.md - Docling v2: v2.md - Concepts: - - The Docling Document format: concepts/docling_format.md + - Docling Document: concepts/docling_document.md # - Chunking: concepts/chunking.md - Examples: - Conversion: diff --git a/poetry.lock b/poetry.lock index 656cd50..d5dee06 100644 --- a/poetry.lock +++ b/poetry.lock @@ -196,8 +196,8 @@ files = [ lazy-object-proxy = ">=1.4.0" typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""} wrapt = [ - {version = ">=1.14,<2", markers = "python_version >= \"3.11\""}, {version = ">=1.11,<2", markers = "python_version < \"3.11\""}, + {version = ">=1.14,<2", markers = "python_version >= \"3.11\""}, ] [[package]] @@ -943,8 +943,8 @@ networkx = ">=3.1,<4.0" netwulf = ">=0.1.5,<0.2.0" numerize = ">=0.12,<0.13" numpy = [ - {version = ">=2.0.2,<3.0.0", markers = "python_version >= \"3.13\""}, {version = ">=1.26.4,<2.0.0", markers = "python_version >= \"3.9\" and python_version < \"3.13\""}, + {version = ">=2.0.2,<3.0.0", markers = "python_version >= \"3.13\""}, ] pandas = {version = ">=2.1.4,<3.0.0", markers = "python_version >= \"3.9\""} python-dotenv = ">=1.0.0,<2.0.0" @@ -1030,8 +1030,8 @@ jsonlines = ">=3.1.0,<4.0.0" lxml = ">=4.9.1,<5.0.0" mean_average_precision = ">=2021.4.26.0,<2022.0.0.0" numpy = [ - {version = ">=2.1.0,<3.0.0", markers = "python_version >= \"3.13\""}, {version = ">=1.24.4,<2.0.0", markers = "python_version < \"3.13\""}, + {version = ">=2.1.0,<3.0.0", markers = "python_version >= \"3.13\""}, ] opencv-python-headless = ">=4.6.0.66,<5.0.0.0" Pillow = ">=10.0.0,<11.0.0" @@ -2355,8 +2355,8 @@ jsonpatch = ">=1.33,<2.0" langsmith = ">=0.1.112,<0.2.0" packaging = ">=23.2,<25" pydantic = [ - {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= 
\"3.12.4\""}, {version = ">=1,<3", markers = "python_full_version < \"3.12.4\""}, + {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""}, ] PyYAML = ">=5.3" tenacity = ">=8.1.0,<8.4.0 || >8.4.0,<9.0.0" @@ -2424,8 +2424,8 @@ files = [ httpx = ">=0.23.0,<1" orjson = ">=3.9.14,<4.0.0" pydantic = [ - {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""}, {version = ">=1,<3", markers = "python_full_version < \"3.12.4\""}, + {version = ">=2.7.4,<3.0.0", markers = "python_full_version >= \"3.12.4\""}, ] requests = ">=2,<3" requests-toolbelt = ">=1.0.0,<2.0.0" @@ -3737,10 +3737,10 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, {version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] [[package]] @@ -3888,9 +3888,9 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.22.4", markers = "python_version < \"3.11\""}, {version = ">=1.23.2", markers = "python_version == \"3.11\""}, + {version = ">=1.26.0", markers = "python_version >= \"3.12\""}, ] python-dateutil = ">=2.8.2" pytz = ">=2020.1" @@ -4480,8 +4480,8 @@ files = [ annotated-types = ">=0.6.0" pydantic-core = "2.23.4" typing-extensions = [ - {version = ">=4.12.2", markers = "python_version >= \"3.13\""}, {version = ">=4.6.1", markers = "python_version < \"3.13\""}, + {version = ">=4.12.2", markers = "python_version >= \"3.13\""}, ] [package.extras] @@ -4649,8 +4649,8 @@ files = [ astroid = ">=2.15.8,<=2.17.0-dev0" colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""} 
dill = [ - {version = ">=0.3.6", markers = "python_version >= \"3.11\""}, {version = ">=0.2", markers = "python_version < \"3.11\""}, + {version = ">=0.3.6", markers = "python_version >= \"3.11\""}, ] isort = ">=4.2.5,<6" mccabe = ">=0.6,<0.8" @@ -7513,4 +7513,4 @@ tesserocr = ["tesserocr"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "8eb8024c32b37b2367c8d83e2833c3d118b3cfdd2f53966712c95dec8d830199" +content-hash = "ba5b52f1a318810bd363d2aa4f60fdfc2e5899e1729b0f0c51026082c93d23e0" diff --git a/pyproject.toml b/pyproject.toml index 684d9a8..71281a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,6 +75,8 @@ pandas-stubs = "^2.1.4.231227" ipykernel = "^6.29.5" ipywidgets = "^8.1.5" nbqa = "^1.9.0" + +[tool.poetry.group.docs.dependencies] mkdocs-material = "^9.5.40" mkdocs-jupyter = "^0.25.0"