diff --git a/.github/workflows/cd-docs.yml b/.github/workflows/cd-docs.yml new file mode 100644 index 0000000..1ff7c4f --- /dev/null +++ b/.github/workflows/cd-docs.yml @@ -0,0 +1,14 @@ +name: "Run Docs CD" + +on: + push: + branches: + - "main" + +jobs: + build-deploy-docs: + uses: ./.github/workflows/docs.yml + with: + deploy: true + permissions: + contents: write diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml index 7c92d38..9a2bf71 100644 --- a/.github/workflows/cd.yml +++ b/.github/workflows/cd.yml @@ -10,12 +10,6 @@ env: jobs: code-checks: uses: ./.github/workflows/checks.yml - build-deploy-docs: - uses: ./.github/workflows/docs.yml - with: - deploy: true - permissions: - contents: write pre-release-check: runs-on: ubuntu-latest outputs: diff --git a/.github/workflows/ci-docs.yml b/.github/workflows/ci-docs.yml new file mode 100644 index 0000000..6e9134d --- /dev/null +++ b/.github/workflows/ci-docs.yml @@ -0,0 +1,16 @@ +name: "Run Docs CI" + +on: + pull_request: + types: [opened, reopened, synchronize] + push: + branches: + - "**" + - "!gh-pages" + +jobs: + build-docs: + if: ${{ github.event_name == 'push' || (github.event.pull_request.head.repo.full_name != 'DS4SD/docling' && github.event.pull_request.head.repo.full_name != 'ds4sd/docling') }} + uses: ./.github/workflows/docs.yml + with: + deploy: false diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index aec5d34..e2b21ed 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -6,6 +6,7 @@ on: push: branches: - "**" + - "!main" - "!gh-pages" env: @@ -16,8 +17,3 @@ jobs: code-checks: if: ${{ github.event_name == 'push' || (github.event.pull_request.head.repo.full_name != 'DS4SD/docling' && github.event.pull_request.head.repo.full_name != 'ds4sd/docling') }} uses: ./.github/workflows/checks.yml - build-docs: - if: ${{ github.event_name == 'push' || (github.event.pull_request.head.repo.full_name != 'DS4SD/docling' && 
github.event.pull_request.head.repo.full_name != 'ds4sd/docling') }} - uses: ./.github/workflows/docs.yml - with: - deploy: false diff --git a/README.md b/README.md index 22c2e8d..c3af0f7 100644 --- a/README.md +++ b/README.md @@ -22,8 +22,9 @@ Docling parses documents and exports them to the desired format with ease and sp ## Features -* 🗂️ Multi-format support for input (PDF, DOCX etc.) & output (Markdown, JSON etc.) -* 📑 Advanced PDF document understanding incl. page layout, reading order & table structures +* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, Images, HTML, AsciiDoc, Markdown) and exports to Markdown and JSON +* 📑 Advanced PDF document understanding including page layout, reading order & table structures +* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format * 📝 Metadata extraction, including title, authors, references & language * 🤖 Seamless LlamaIndex 🦙 & LangChain 🦜🔗 integration for powerful RAG / QA applications * 🔍 OCR support for scanned PDFs diff --git a/docs/concepts/docling_document.md b/docs/concepts/docling_document.md index 00b5452..1ac46f5 100644 --- a/docs/concepts/docling_document.md +++ b/docs/concepts/docling_document.md @@ -7,6 +7,8 @@ pydantic datatype, which can express several features common to documents, such * Layout information (i.e. bounding boxes) for all items, if available * Provenance information +The Pydantic types are defined in the module `docling_core.types.doc`; see the [source code definitions](https://github.com/DS4SD/docling-core/tree/main/docling_core/types/doc) for more details. + It also brings a set of document construction APIs to build up a `DoclingDocument` from scratch. 
## Example document structures diff --git a/docs/index.md b/docs/index.md index 9db06e3..68cdd12 100644 --- a/docs/index.md +++ b/docs/index.md @@ -19,8 +19,9 @@ Docling parses documents and exports them to the desired format with ease and sp ## Features -* 🗂️ Multi-format support for input (PDF, DOCX etc.) & output (Markdown, JSON etc.) +* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, Images, HTML, AsciiDoc, Markdown) and exports to Markdown and JSON * 📑 Advanced PDF document understanding incl. page layout, reading order & table structures +* 🧩 Unified, expressive [DoclingDocument](./concepts/docling_document.md) representation format * 📝 Metadata extraction, including title, authors, references & language * 🤖 Seamless LlamaIndex 🦙 & LangChain 🦜🔗 integration for powerful RAG / QA applications * 🔍 OCR support for scanned PDFs