ci: add coverage and ruff (#1383)

* add coverage calculation and push

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* new codecov version and usage of token

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* enable ruff formatter instead of black and isort

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* apply ruff lint fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* apply ruff unsafe fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add removed imports

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* round 1 of linter fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* finalize linter fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* Update pyproject.toml

Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
commit 5458a88464 (parent 293c28ca7c)
Michele Dolfi, 2025-04-14 18:01:26 +02:00, committed by GitHub
104 changed files with 665 additions and 633 deletions

.github/codecov.yml (new file)

@@ -0,0 +1,17 @@
+codecov:
+  # https://docs.codecov.io/docs/comparing-commits
+  allow_coverage_offsets: true
+coverage:
+  status:
+    project:
+      default:
+        informational: true
+        target: auto  # auto compares coverage to the previous base commit
+        flags:
+          - docling
+comment:
+  layout: "reach, diff, flags, files"
+  behavior: default
+  require_changes: false  # if true: only post the comment if coverage changes
+  branches:  # branch names that can post comment
+    - "main"

@@ -10,6 +10,8 @@ env:
 jobs:
   code-checks:
     uses: ./.github/workflows/checks.yml
+    with:
+      push_coverage: false
   pre-release-check:
     runs-on: ubuntu-latest
     outputs:

@@ -1,5 +1,13 @@
 on:
   workflow_call:
+    inputs:
+      push_coverage:
+        type: boolean
+        description: "If true, the coverage results are pushed to codecov.io."
+        default: true
+    secrets:
+      CODECOV_TOKEN:
+        required: false

 env:
   HF_HUB_DOWNLOAD_TIMEOUT: "60"
@@ -32,7 +40,13 @@ jobs:
         run: poetry install --all-extras
       - name: Testing
         run: |
-          poetry run pytest -v tests
+          poetry run pytest -v --cov=docling --cov-report=xml tests
+      - name: Upload coverage to Codecov
+        if: inputs.push_coverage
+        uses: codecov/codecov-action@v5
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          file: ./coverage.xml
       - name: Run examples
         run: |
          for file in docs/examples/*.py; do

@@ -17,3 +17,5 @@ jobs:
   code-checks:
     if: ${{ github.event_name == 'push' || (github.event.pull_request.head.repo.full_name != 'docling-project/docling' && github.event.pull_request.head.repo.full_name != 'docling-project/docling') }}
     uses: ./.github/workflows/checks.yml
+    secrets:
+      CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

@@ -1,43 +1,26 @@
 fail_fast: true
 repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.11.5
+    hooks:
+      # Run the Ruff formatter.
+      - id: ruff-format
+        name: "Ruff formatter"
+        args: [--config=pyproject.toml]
+        files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
+      # Run the Ruff linter.
+      - id: ruff
+        name: "Ruff linter"
+        args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
+        files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
   - repo: local
     hooks:
-      - id: black
-        name: Black
-        entry: poetry run black docling docs/examples tests
-        pass_filenames: false
-        language: system
-        files: '\.py$'
-      - id: isort
-        name: isort
-        entry: poetry run isort docling docs/examples tests
-        pass_filenames: false
-        language: system
-        files: '\.py$'
-      # - id: flake8
-      #   name: flake8
-      #   entry: poetry run flake8 docling
-      #   pass_filenames: false
-      #   language: system
-      #   files: '\.py$'
      - id: mypy
        name: MyPy
        entry: poetry run mypy docling
        pass_filenames: false
        language: system
        files: '\.py$'
-      - id: nbqa_black
-        name: nbQA Black
-        entry: poetry run nbqa black docs/examples
-        pass_filenames: false
-        language: system
-        files: '\.ipynb$'
-      - id: nbqa_isort
-        name: nbQA isort
-        entry: poetry run nbqa isort docs/examples
-        pass_filenames: false
-        language: system
-        files: '\.ipynb$'
      - id: poetry
        name: Poetry check
        entry: poetry check --lock
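Both Ruff hooks point at pyproject.toml via --config, and the commit includes an "Update pyproject.toml" step whose diff is not shown in this view. A minimal sketch of what such a Ruff section could look like, assuming Ruff's standard pyproject keys (the values and rule selection below are illustrative assumptions, not the commit's actual settings):

    [tool.ruff]
    target-version = "py39"
    line-length = 88

    [tool.ruff.lint]
    # Hypothetical selection; enabling mccabe (C90) and Ruff-specific (RUF) rules
    # is consistent with the "# noqa: C901" and "# noqa: RUF001" markers added
    # elsewhere in this commit.
    select = ["E", "F", "I", "C90", "RUF"]

    [tool.ruff.lint.mccabe]
    max-complexity = 10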

@@ -34,7 +34,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                 text_stream = self.path_or_stream.getvalue().decode("utf-8")
                 self.lines = text_stream.split("\n")
             if isinstance(self.path_or_stream, Path):
-                with open(self.path_or_stream, "r", encoding="utf-8") as f:
+                with open(self.path_or_stream, encoding="utf-8") as f:
                     self.lines = f.readlines()

             self.valid = True
@@ -75,14 +75,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         return doc

-    def _parse(self, doc: DoclingDocument):
+    def _parse(self, doc: DoclingDocument):  # noqa: C901
         """
         Main function that orchestrates the parsing by yielding components:
         title, section headers, text, lists, and tables.
         """
-
-        content = ""
         in_list = False
         in_table = False
@@ -95,7 +93,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         # indents: dict[int, Union[DocItem, GroupItem, None]] = {}
         indents: dict[int, Union[GroupItem, None]] = {}

-        for i in range(0, 10):
+        for i in range(10):
             parents[i] = None
             indents[i] = None
@@ -125,7 +123,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
             # Lists
             elif self._is_list_item(line):
                 _log.debug(f"line: {line}")
-
                 item = self._parse_list_item(line)
                 _log.debug(f"parsed list-item: {item}")
@@ -147,7 +144,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                     indents[level + 1] = item["indent"]

                 elif in_list and item["indent"] < indents[level]:
                     # print(item["indent"], " => ", indents[level])
-
                     while item["indent"] < indents[level]:
                         # print(item["indent"], " => ", indents[level])
@@ -176,7 +172,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
             elif in_table and (
                 (not self._is_table_line(line)) or line.strip() == "|==="
             ):  # end of table
-
                 caption = None
                 if len(caption_data) > 0:
                     caption = doc.add_text(
@@ -195,7 +190,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
             # Picture
             elif self._is_picture(line):
-
                 caption = None
                 if len(caption_data) > 0:
                     caption = doc.add_text(
@@ -250,7 +244,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                 text_data = []

             elif len(line.strip()) > 0:  # allow multiline texts
-
                 item = self._parse_text(line)
                 text_data.append(item["text"])
@@ -273,14 +266,14 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
     def _get_current_level(self, parents):
         for k, v in parents.items():
-            if v == None and k > 0:
+            if v is None and k > 0:
                 return k - 1

         return 0

     def _get_current_parent(self, parents):
         for k, v in parents.items():
-            if v == None and k > 0:
+            if v is None and k > 0:
                 return parents[k - 1]

         return None
@@ -328,7 +321,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                 "marker": marker,
                 "text": text.strip(),
                 "numbered": False,
-                "indent": 0 if indent == None else len(indent),
+                "indent": 0 if indent is None else len(indent),
             }
         else:
             return {
@@ -336,7 +329,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                 "marker": marker,
                 "text": text.strip(),
                 "numbered": True,
-                "indent": 0 if indent == None else len(indent),
+                "indent": 0 if indent is None else len(indent),
             }
         else:
             # Fallback if no match
@@ -357,7 +350,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         return [cell.strip() for cell in line.split("|") if cell.strip()]

     def _populate_table_as_grid(self, table_data):
-
         num_rows = len(table_data)

         # Adjust the table data into a grid format

@@ -58,7 +58,7 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
         head = self.content.readline()
         dialect = csv.Sniffer().sniff(head, ",;\t|:")
         _log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"')
-        if not dialect.delimiter in {",", ";", "\t", "|", ":"}:
+        if dialect.delimiter not in {",", ";", "\t", "|", ":"}:
             raise RuntimeError(
                 f"Cannot convert csv with unknown delimiter {dialect.delimiter}."
             )

@@ -1,8 +1,9 @@
 import logging
 import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import Iterable, List, Optional, Union
+from typing import List, Optional, Union

 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin, Size
@@ -156,7 +157,6 @@ class DoclingParsePageBackend(PdfPageBackend):
     def get_page_image(
         self, scale: float = 1, cropbox: Optional[BoundingBox] = None
     ) -> Image.Image:
-
         page_size = self.get_size()
         if not cropbox:

@@ -1,8 +1,9 @@
 import logging
 import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+from typing import TYPE_CHECKING, List, Optional, Union

 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -172,7 +173,6 @@ class DoclingParseV2PageBackend(PdfPageBackend):
     def get_page_image(
         self, scale: float = 1, cropbox: Optional[BoundingBox] = None
     ) -> Image.Image:
-
         page_size = self.get_size()
         if not cropbox:

@@ -1,14 +1,14 @@
 import logging
-import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+from typing import TYPE_CHECKING, Optional, Union

 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell
 from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
-from PIL import Image, ImageDraw
+from PIL import Image
 from pypdfium2 import PdfPage

 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
@@ -93,7 +93,6 @@ class DoclingParseV4PageBackend(PdfPageBackend):
     def get_page_image(
         self, scale: float = 1, cropbox: Optional[BoundingBox] = None
     ) -> Image.Image:
-
         page_size = self.get_size()
         if not cropbox:

@@ -1,12 +1,8 @@
-# -*- coding: utf-8 -*-
 """
 Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
 On 23/01/2025
 """
-
-from __future__ import unicode_literals
-

 CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")
 BLANK = ""
@@ -79,7 +75,6 @@ CHR_BO = {
 }

 T = {
-    "\u2192": "\\rightarrow ",
     # Greek letters
     "\U0001d6fc": "\\alpha ",
     "\U0001d6fd": "\\beta ",

@@ -76,8 +76,7 @@ def get_val(key, default=None, store=CHR):
     return default


-class Tag2Method(object):
-
+class Tag2Method:
     def call_method(self, elm, stag=None):
         getmethod = self.tag2meth.get
         if stag is None:
@@ -130,7 +129,6 @@
 class Pr(Tag2Method):
-
     text = ""

     __val_tags = ("chr", "pos", "begChr", "endChr", "type")
@@ -159,7 +157,7 @@ class Pr(Tag2Method):
     def do_common(self, elm):
         stag = elm.tag.replace(OMML_NS, "")
         if stag in self.__val_tags:
-            t = elm.get("{0}val".format(OMML_NS))
+            t = elm.get(f"{OMML_NS}val")
             self.__innerdict[stag] = t
         return None
@@ -248,7 +246,6 @@ class oMath2Latex(Tag2Method):
         """
         the Pre-Sub-Superscript object -- Not support yet
         """
-        pass

     def do_sub(self, elm):
         text = self.process_children(elm)
@@ -331,7 +328,7 @@ class oMath2Latex(Tag2Method):
         t_dict = self.process_children_dict(elm, include=("e", "lim"))
         latex_s = LIM_FUNC.get(t_dict["e"])
         if not latex_s:
-            raise NotSupport("Not support lim %s" % t_dict["e"])
+            raise RuntimeError("Not support lim {}".format(t_dict["e"]))
         else:
             return latex_s.format(lim=t_dict.get("lim"))
@@ -413,7 +410,7 @@ class oMath2Latex(Tag2Method):
         """
         _str = []
         _base_str = []
-        found_text = elm.findtext("./{0}t".format(OMML_NS))
+        found_text = elm.findtext(f"./{OMML_NS}t")
         if found_text:
             for s in found_text:
                 out_latex_str = self.process_unicode(s)

@@ -55,7 +55,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         self.max_levels = 10
         self.level = 0
         self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
-        for i in range(0, self.max_levels):
+        for i in range(self.max_levels):
             self.parents[i] = None

         try:
@@ -126,7 +126,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         return doc

     def walk(self, tag: Tag, doc: DoclingDocument) -> None:
-
         # Iterate over elements in the body of the document
         text: str = ""
         for element in tag.children:
@@ -135,7 +134,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                     self.analyze_tag(cast(Tag, element), doc)
                 except Exception as exc_child:
                     _log.error(
-                        f"Error processing child from tag {tag.name}: {repr(exc_child)}"
+                        f"Error processing child from tag {tag.name}: {exc_child!r}"
                     )
                     raise exc_child
             elif isinstance(element, NavigableString) and not isinstance(
@@ -147,7 +146,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                     item for item in element.next_siblings if isinstance(item, Tag)
                 ]
                 if element.next_sibling is None or any(
-                    [item.name in TAGS_FOR_NODE_ITEMS for item in siblings]
+                    item.name in TAGS_FOR_NODE_ITEMS for item in siblings
                 ):
                     text = text.strip()
                     if text and tag.name in ["div"]:
@@ -222,7 +221,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             )
         else:
             if hlevel > self.level:
-
                 # add invisible group
                 for i in range(self.level + 1, hlevel):
                     self.parents[i] = doc.add_group(
@@ -234,7 +232,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 self.level = hlevel
             elif hlevel < self.level:
-
                 # remove the tail
                 for key in self.parents.keys():
                     if key > hlevel:
@@ -360,7 +357,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             marker = ""
             enumerated = False
             if parent_label == GroupLabel.ORDERED_LIST:
-                marker = f"{str(index_in_list)}."
+                marker = f"{index_in_list!s}."
                 enumerated = True
             doc.add_list_item(
                 text=text,

@@ -83,7 +83,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             # otherwise they represent emphasis (bold or italic)
             self.markdown = self._shorten_underscore_sequences(text_stream)
         if isinstance(self.path_or_stream, Path):
-            with open(self.path_or_stream, "r", encoding="utf-8") as f:
+            with open(self.path_or_stream, encoding="utf-8") as f:
                 md_content = f.read()
                 # remove invalid sequences
                 # very long sequences of underscores will lead to unnecessary long processing times.
@@ -168,7 +168,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             )
             self.inline_texts = []

-    def _iterate_elements(
+    def _iterate_elements(  # noqa: C901
         self,
         element: marko.element.Element,
         depth: int,
@@ -176,7 +176,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         visited: Set[marko.element.Element],
         parent_item: Optional[NodeItem] = None,
     ):
-
         if element in visited:
             return
@@ -236,7 +235,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             if has_non_empty_list_items:
                 label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
                 parent_item = doc.add_group(
-                    label=label, name=f"list", parent=parent_item
+                    label=label, name="list", parent=parent_item
                 )

         elif (
@@ -320,7 +319,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             self._html_blocks += 1
             self._process_inline_text(parent_item, doc)
             self._close_table(doc)
-            _log.debug("HTML Block: {}".format(element))
+            _log.debug(f"HTML Block: {element}")
             if (
                 len(element.body) > 0
             ):  # If Marko doesn't return any content for HTML block, skip it
@@ -332,7 +331,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         else:
             if not isinstance(element, str):
                 self._close_table(doc)
-                _log.debug("Some other element: {}".format(element))
+                _log.debug(f"Some other element: {element}")

         processed_block_types = (
             marko.block.Heading,
@@ -398,7 +397,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         # if HTML blocks were detected, export to HTML and delegate to HTML backend
         if self._html_blocks > 0:
-
             # export to HTML
             html_backend_cls = HTMLDocumentBackend
             html_str = doc.export_to_html()

@@ -184,7 +184,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
         """
         if self.workbook is not None:
-
             # Iterate over all sheets
             for sheet_name in self.workbook.sheetnames:
                 _log.info(f"Processing sheet: {sheet_name}")
@@ -253,7 +252,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
             )

             for excel_cell in excel_table.data:
-
                 cell = TableCell(
                     text=excel_cell.text,
                     row_span=excel_cell.row_span,
@@ -303,7 +301,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
         # Iterate over all cells in the sheet
         for ri, row in enumerate(sheet.iter_rows(values_only=False)):
             for rj, cell in enumerate(row):
-
                 # Skip empty or already visited cells
                 if cell.value is None or (ri, rj) in visited:
                     continue
@@ -342,7 +339,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
         visited_cells: set[tuple[int, int]] = set()
         for ri in range(start_row, max_row + 1):
             for rj in range(start_col, max_col + 1):
-
                 cell = sheet.cell(row=ri + 1, column=rj + 1)  # 1-based indexing

                 # Check if the cell belongs to a merged range
@@ -350,14 +346,12 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
                 col_span = 1

                 for merged_range in sheet.merged_cells.ranges:
-
                     if (
                         merged_range.min_row <= ri + 1
                         and ri + 1 <= merged_range.max_row
                         and merged_range.min_col <= rj + 1
                         and rj + 1 <= merged_range.max_col
                     ):
-
                         row_span = merged_range.max_row - merged_range.min_row + 1
                         col_span = merged_range.max_col - merged_range.min_col + 1
                         break
@@ -499,7 +493,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
                         ),
                     ),
                 )
-            except:
+            except Exception:
                 _log.error("could not extract the image from excel sheets")

         return doc

@@ -120,13 +120,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
         return prov

-    def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
+    def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):  # noqa: C901
         is_a_list = False
         is_list_group_created = False
         enum_list_item_value = 0
         new_list = None
         bullet_type = "None"
-        list_text = ""
         list_label = GroupLabel.LIST
         doc_label = DocItemLabel.LIST_ITEM
         prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
@@ -243,7 +242,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
                     enum_marker = str(enum_list_item_value) + "."
                 if not is_list_group_created:
                     new_list = doc.add_group(
-                        label=list_label, name=f"list", parent=parent_slide
+                        label=list_label, name="list", parent=parent_slide
                     )
                     is_list_group_created = True
                 doc.add_list_item(
@@ -368,11 +367,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
         slide_width = pptx_obj.slide_width
         slide_height = pptx_obj.slide_height

-        text_content = []  # type: ignore
-
         max_levels = 10
         parents = {}  # type: ignore
-        for i in range(0, max_levels):
+        for i in range(max_levels):
             parents[i] = None

         # Loop through each slide
@@ -383,7 +380,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
             )
             slide_size = Size(width=slide_width, height=slide_height)
-            parent_page = doc.add_page(page_no=slide_ind + 1, size=slide_size)
+            doc.add_page(page_no=slide_ind + 1, size=slide_size)

             def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):
                 handle_groups(shape, parent_slide, slide_ind, doc, slide_size)

@@ -158,7 +158,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
     def _get_level(self) -> int:
         """Return the first None index."""
         for k, v in self.parents.items():
-            if k >= 0 and v == None:
+            if k >= 0 and v is None:
                 return k
         return 0
@@ -418,7 +418,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             else prev_parent
         )

-    def _handle_text_elements(
+    def _handle_text_elements(  # noqa: C901
         self,
         element: BaseOxmlElement,
         docx_obj: DocxDocument,
@@ -812,7 +812,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                     f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
                 )
                 if cell is None or cell._tc in cell_set:
-                    _log.debug(f" skipped since repeated content")
+                    _log.debug(" skipped since repeated content")
                     col_idx += cell.grid_span
                     continue
                 else:
@@ -879,7 +879,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                     image=ImageRef.from_pil(image=pil_image, dpi=72),
                     caption=None,
                 )
-            except (UnidentifiedImageError, OSError) as e:
+            except (UnidentifiedImageError, OSError):
                 _log.warning("Warning: image cannot be loaded by Pillow")
                 doc.add_picture(
                     parent=self.parents[level - 1],

@@ -1,7 +1,8 @@
 from abc import ABC, abstractmethod
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import Iterable, Optional, Set, Union
+from typing import Optional, Set, Union

 from docling_core.types.doc import BoundingBox, Size
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell

@@ -1,8 +1,9 @@
 import logging
 import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+from typing import TYPE_CHECKING, List, Optional, Union

 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
@@ -29,7 +30,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
         self.valid = True  # No better way to tell from pypdfium.
         try:
             self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
-        except PdfiumError as e:
+        except PdfiumError:
             _log.info(
                 f"An exception occurred when loading page {page_no} of document {document_hash}.",
                 exc_info=True,
@@ -225,7 +226,6 @@ class PyPdfiumPageBackend(PdfPageBackend):
     def get_page_image(
         self, scale: float = 1, cropbox: Optional[BoundingBox] = None
     ) -> Image.Image:
-
         page_size = self.get_size()
         if not cropbox:

@@ -102,13 +102,13 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
         doc_info: etree.DocInfo = self.tree.docinfo
         if doc_info.system_url and any(
-            [kwd in doc_info.system_url for kwd in JATS_DTD_URL]
+            kwd in doc_info.system_url for kwd in JATS_DTD_URL
         ):
             self.valid = True
             return
         for ent in doc_info.internalDTD.iterentities():
             if ent.system_url and any(
-                [kwd in ent.system_url for kwd in JATS_DTD_URL]
+                kwd in ent.system_url for kwd in JATS_DTD_URL
             ):
                 self.valid = True
                 return
@@ -232,10 +232,9 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
                 # TODO: once superscript is supported, add label with formatting
                 aff = aff.removeprefix(f"{label[0].text}, ")
             affiliation_names.append(aff)
-        affiliation_ids_names = {
-            id: name
-            for id, name in zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
-        }
+        affiliation_ids_names = dict(
+            zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
+        )

         # Get author names and affiliation names
         for author_node in meta.xpath(
@@ -300,7 +299,6 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
     def _add_abstract(
         self, doc: DoclingDocument, xml_components: XMLComponents
     ) -> None:
-
         for abstract in xml_components["abstract"]:
             text: str = abstract["content"]
             title: str = abstract["label"] or DEFAULT_HEADER_ABSTRACT
@@ -349,7 +347,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
         return

-    def _parse_element_citation(self, node: etree._Element) -> str:
+    def _parse_element_citation(self, node: etree._Element) -> str:  # noqa: C901
         citation: Citation = {
             "author_names": "",
             "title": "",
@@ -440,7 +438,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
             citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip()
             if len(node.xpath("lpage")) > 0:
                 citation["page"] += (
-                    "–" + node.xpath("lpage")[0].text.replace("\n", " ").strip()
+                    "–" + node.xpath("lpage")[0].text.replace("\n", " ").strip()  # noqa: RUF001
                 )

         # Flatten the citation to string
@@ -595,9 +593,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
             try:
                 self._add_table(doc, parent, table)
-            except Exception as e:
-                _log.warning(f"Skipping unsupported table in {str(self.file)}")
-                pass
+            except Exception:
+                _log.warning(f"Skipping unsupported table in {self.file!s}")

         return
@@ -609,7 +606,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
         )
         return

-    def _walk_linear(
+    def _walk_linear(  # noqa: C901
         self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
     ) -> str:
         skip_tags = ["term"]

@@ -122,7 +122,6 @@ class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):
     @override
     def convert(self) -> DoclingDocument:
         if self.parser is not None:
-
             doc = self.parser.parse(self.patent_content)
             if doc is None:
@@ -163,7 +162,6 @@ class PatentUspto(ABC):
         Returns:
             The patent parsed as a docling document.
         """
-        pass


 class PatentUsptoIce(PatentUspto):
@@ -265,7 +263,7 @@ class PatentUsptoIce(PatentUspto):
             self.style_html = HtmlEntity()

         @override
-        def startElement(self, tag, attributes):  # noqa: N802
+        def startElement(self, tag, attributes):
             """Signal the start of an element.

             Args:
@@ -281,7 +279,7 @@ class PatentUsptoIce(PatentUspto):
             self._start_registered_elements(tag, attributes)

         @override
-        def skippedEntity(self, name):  # noqa: N802
+        def skippedEntity(self, name):
             """Receive notification of a skipped entity.

             HTML entities will be skipped by the parser. This method will unescape them
@@ -315,7 +313,7 @@ class PatentUsptoIce(PatentUspto):
             self.text += unescaped

         @override
-        def endElement(self, tag):  # noqa: N802
+        def endElement(self, tag):
             """Signal the end of an element.

             Args:
@@ -603,7 +601,7 @@ class PatentUsptoGrantV2(PatentUspto):
             self.style_html = HtmlEntity()

         @override
-        def startElement(self, tag, attributes):  # noqa: N802
+        def startElement(self, tag, attributes):
             """Signal the start of an element.

             Args:
@@ -616,7 +614,7 @@ class PatentUsptoGrantV2(PatentUspto):
             self._start_registered_elements(tag, attributes)

         @override
-        def skippedEntity(self, name):  # noqa: N802
+        def skippedEntity(self, name):
             """Receive notification of a skipped entity.

             HTML entities will be skipped by the parser. This method will unescape them
@@ -650,7 +648,7 @@ class PatentUsptoGrantV2(PatentUspto):
             self.text += unescaped

         @override
-        def endElement(self, tag):  # noqa: N802
+        def endElement(self, tag):
             """Signal the end of an element.

             Args:
@@ -691,7 +689,7 @@ class PatentUsptoGrantV2(PatentUspto):
             if tag in [member.value for member in self.Element]:
                 if (
                     tag == self.Element.HEADING.value
-                    and not self.Element.SDOCL.value in self.property
+                    and self.Element.SDOCL.value not in self.property
                 ):
                     level_attr: str = attributes.get("LVL", "")
                     new_level: int = int(level_attr) if level_attr.isnumeric() else 1
@@ -743,7 +741,7 @@ class PatentUsptoGrantV2(PatentUspto):
                 # headers except claims statement
                 elif (
                     self.Element.HEADING.value in self.property
-                    and not self.Element.SDOCL.value in self.property
+                    and self.Element.SDOCL.value not in self.property
                     and text.strip()
                 ):
                     self.parents[self.level + 1] = self.doc.add_heading(
@@ -1164,7 +1162,7 @@ class PatentUsptoAppV1(PatentUspto):
             self.style_html = HtmlEntity()

         @override
-        def startElement(self, tag, attributes):  # noqa: N802
+        def startElement(self, tag, attributes):
             """Signal the start of an element.

             Args:
@@ -1177,7 +1175,7 @@ class PatentUsptoAppV1(PatentUspto):
             self._start_registered_elements(tag, attributes)

         @override
-        def skippedEntity(self, name):  # noqa: N802
+        def skippedEntity(self, name):
             """Receive notification of a skipped entity.

             HTML entities will be skipped by the parser. This method will unescape them
@@ -1211,7 +1209,7 @@ class PatentUsptoAppV1(PatentUspto):
             self.text += unescaped

         @override
-        def endElement(self, tag):  # noqa: N802
+        def endElement(self, tag):
             """Signal the end of an element.

             Args:
@@ -1474,9 +1472,7 @@ class XmlTable:
                 if cw == 0:
                     offset_w0.append(col["offset"][ic])

-            min_colinfo["offset"] = sorted(
-                list(set(col["offset"] + min_colinfo["offset"]))
-            )
+            min_colinfo["offset"] = sorted(set(col["offset"] + min_colinfo["offset"]))

         # add back the 0 width cols to offset list
         offset_w0 = list(set(offset_w0))
@@ -1527,7 +1523,7 @@ class XmlTable:
         return ncols_max

-    def _parse_table(self, table: Tag) -> TableData:
+    def _parse_table(self, table: Tag) -> TableData:  # noqa: C901
         """Parse the content of a table tag.

         Args:
@@ -1722,7 +1718,7 @@ class HtmlEntity:
         "0": "&#8304;",
         "+": "&#8314;",
         "-": "&#8315;",
-        "−": "&#8315;",
+        "−": "&#8315;",  # noqa: RUF001
         "=": "&#8316;",
         "(": "&#8317;",
         ")": "&#8318;",
@@ -1746,7 +1742,7 @@ class HtmlEntity:
         "0": "&#8320;",
         "+": "&#8330;",
         "-": "&#8331;",
-        "−": "&#8331;",
+        "−": "&#8331;",  # noqa: RUF001
         "=": "&#8332;",
         "(": "&#8333;",
         ")": "&#8334;",

@@ -6,14 +6,16 @@ import sys
 import tempfile
 import time
 import warnings
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Annotated, Dict, Iterable, List, Optional, Type
+from typing import Annotated, Dict, List, Optional, Type

 import rich.table
 import typer
 from docling_core.types.doc import ImageRefMode
 from docling_core.utils.file import resolve_source_to_path
 from pydantic import TypeAdapter
+from rich.console import Console

 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
@@ -53,7 +55,6 @@ warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|
 warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")

 _log = logging.getLogger(__name__)
-from rich.console import Console

 console = Console()
 err_console = Console(stderr=True)
@@ -160,7 +161,6 @@
     export_doctags: bool,
     image_export_mode: ImageRefMode,
 ):
-
     success_count = 0
     failure_count = 0
@@ -233,7 +233,7 @@
 @app.command(no_args_is_help=True)
-def convert(
+def convert(  # noqa: C901
     input_sources: Annotated[
         List[str],
         typer.Argument(
@@ -289,7 +289,7 @@ def convert(
             ...,
             help=(
                 f"The OCR engine to use. When --allow-external-plugins is *not* set, the available values are: "
-                f"{', '.join((o.value for o in ocr_engines_enum_internal))}. "
+                f"{', '.join(o.value for o in ocr_engines_enum_internal)}. "
                 f"Use the option --show-external-plugins to see the options allowed with external plugins."
             ),
         ),
@@ -430,7 +430,7 @@ def convert(
         settings.debug.visualize_ocr = debug_visualize_ocr

     if from_formats is None:
-        from_formats = [e for e in InputFormat]
+        from_formats = list(InputFormat)

     parsed_headers: Optional[Dict[str, str]] = None
     if headers is not None:

@@ -62,7 +62,7 @@ def download(
     models: Annotated[
         Optional[list[_AvailableModels]],
         typer.Argument(
-            help=f"Models to download (default behavior: a predefined set of models will be downloaded).",
+            help="Models to download (default behavior: a predefined set of models will be downloaded).",
         ),
     ] = None,
     all: Annotated[
@@ -89,14 +89,13 @@ def download(
             "Cannot simultaneously set 'all' parameter and specify models to download."
         )
     if not quiet:
-        FORMAT = "%(message)s"
         logging.basicConfig(
             level=logging.INFO,
             format="[blue]%(message)s[/blue]",
             datefmt="[%X]",
             handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
         )
-    to_download = models or ([m for m in _AvailableModels] if all else _default_models)
+    to_download = models or (list(_AvailableModels) if all else _default_models)
     output_dir = download_models(
         output_dir=output_dir,
         force=force,

@@ -10,7 +10,9 @@ from docling_core.types.doc import (
     TableCell,
 )
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell
-from docling_core.types.io import (  # DO ΝΟΤ REMOVE; explicitly exposed from this location
+
+# DO NOT REMOVE; explicitly exposed from this location
+from docling_core.types.io import (
     DocumentStream,
 )
 from PIL.Image import Image
@@ -233,9 +235,9 @@ class Page(BaseModel):
         None  # Internal PDF backend. By default it is cleared during assembling.
     )
     _default_image_scale: float = 1.0  # Default image scale for external usage.
-    _image_cache: Dict[float, Image] = (
-        {}
-    )  # Cache of images in different scales. By default it is cleared during assembling.
+    _image_cache: Dict[
+        float, Image
+    ] = {}  # Cache of images in different scales. By default it is cleared during assembling.

     def get_image(
         self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
@@ -243,7 +245,7 @@ class Page(BaseModel):
         if self._backend is None:
             return self._image_cache.get(scale, None)

-        if not scale in self._image_cache:
+        if scale not in self._image_cache:
             if cropbox is None:
                 self._image_cache[scale] = self._backend.get_page_image(scale=scale)
             else:

@@ -1,13 +1,13 @@
 import csv
 import logging
 import re
+from collections.abc import Iterable
 from enum import Enum
 from io import BytesIO
 from pathlib import Path, PurePath
 from typing import (
     TYPE_CHECKING,
     Dict,
-    Iterable,
     List,
     Literal,
     Optional,
@@ -17,6 +17,8 @@ from typing import (
 )

 import filetype
+
+# DO NOT REMOVE; explicitly exposed from this location
 from docling_core.types.doc import (
     DocItem,
     DocItemLabel,
@@ -35,14 +37,14 @@ from docling_core.types.legacy_doc.base import (
     PageReference,
     Prov,
     Ref,
+    Table as DsSchemaTable,
+    TableCell,
 )
-from docling_core.types.legacy_doc.base import Table as DsSchemaTable
-from docling_core.types.legacy_doc.base import TableCell
 from docling_core.types.legacy_doc.document import (
     CCSDocumentDescription as DsDocumentDescription,
+    CCSFileInfoObject as DsFileInfoObject,
+    ExportedCCSDocument as DsDocument,
 )
-from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
-from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
 from docling_core.utils.file import resolve_source_to_stream
 from docling_core.utils.legacy import docling_document_to_legacy
 from pydantic import BaseModel
@@ -65,7 +67,7 @@ from docling.datamodel.base_models import (
 )
 from docling.datamodel.settings import DocumentLimits
 from docling.utils.profiling import ProfilingItem
-from docling.utils.utils import create_file_hash, create_hash
+from docling.utils.utils import create_file_hash

 if TYPE_CHECKING:
     from docling.document_converter import FormatOption
@@ -134,9 +136,9 @@ class InputDocument(BaseModel):
                 self._init_doc(backend, path_or_stream)

             elif isinstance(path_or_stream, BytesIO):
-                assert (
-                    filename is not None
-                ), "Can't construct InputDocument from stream without providing filename arg."
+                assert filename is not None, (
+                    "Can't construct InputDocument from stream without providing filename arg."
+                )
                 self.file = PurePath(filename)
                 self.filesize = path_or_stream.getbuffer().nbytes
@@ -228,7 +230,6 @@ class _DummyBackend(AbstractDocumentBackend):
 class _DocumentConversionInput(BaseModel):
     path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
     headers: Optional[Dict[str, str]] = None
-
     limits: Optional[DocumentLimits] = DocumentLimits()

@@ -380,7 +380,6 @@ class PaginatedPipelineOptions(PipelineOptions):

 class VlmPipelineOptions(PaginatedPipelineOptions):
     generate_page_images: bool = True
-
     force_backend_text: bool = (
         False  # (To be used with vlms, or other generative models)

@@ -1,11 +1,11 @@
 import hashlib
 import logging
-import math
 import sys
 import time
+from collections.abc import Iterable, Iterator
 from functools import partial
 from pathlib import Path
-from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union
+from typing import Dict, List, Optional, Tuple, Type, Union

 from pydantic import BaseModel, ConfigDict, model_validator, validate_call
@@ -172,7 +172,7 @@ class DocumentConverter:
         format_options: Optional[Dict[InputFormat, FormatOption]] = None,
     ):
         self.allowed_formats = (
-            allowed_formats if allowed_formats is not None else [e for e in InputFormat]
+            allowed_formats if allowed_formats is not None else list(InputFormat)
         )
         self.format_to_options = {
             format: (
@@ -254,7 +254,7 @@ class DocumentConverter:
         if not had_result and raises_on_error:
             raise ConversionError(
-                f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
+                "Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
             )

     def _convert(
@@ -266,7 +266,7 @@ class DocumentConverter:
             conv_input.docs(self.format_to_options),
             settings.perf.doc_batch_size,  # pass format_options
         ):
-            _log.info(f"Going to convert document batch...")
+            _log.info("Going to convert document batch...")

             # parallel processing only within input_batch
             # with ThreadPoolExecutor(

@@ -1,4 +1,4 @@
-from typing import Iterable
+from collections.abc import Iterable

 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
@@ -10,7 +10,6 @@ from docling.utils.profiling import TimeRecorder
 class ApiVlmModel(BasePageModel):
-
     def __init__(
         self,
         enabled: bool,

@@ -1,5 +1,6 @@
 from abc import ABC, abstractmethod
-from typing import Any, Generic, Iterable, Optional, Protocol, Type
+from collections.abc import Iterable
+from typing import Generic, Optional, Protocol, Type

 from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
 from typing_extensions import TypeVar
@@ -29,7 +30,6 @@ EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)
 class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
-
     elements_batch_size: int = settings.perf.elements_batch_size

     @abstractmethod
@@ -50,7 +50,6 @@ class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
 class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]):
-
     def prepare_element(
         self, conv_res: ConversionResult, element: NodeItem
     ) -> Optional[NodeItem]:
@@ -62,7 +61,6 @@ class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]):
 class BaseItemAndImageEnrichmentModel(
     GenericEnrichmentModel[ItemAndImageEnrichmentElement]
 ):
-
     images_scale: float
     expansion_factor: float = 0.0

@@ -1,12 +1,12 @@
 import copy
 import logging
 from abc import abstractmethod
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, List, Optional, Type
+from typing import List, Optional, Type

 import numpy as np
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_core.types.doc.page import BoundingRectangle, PdfTextCell, TextCell
 from PIL import Image, ImageDraw
 from rtree import index
 from scipy.ndimage import binary_dilation, find_objects, label

@@ -1,7 +1,8 @@
 import re
 from collections import Counter
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, List, Literal, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union

 import numpy as np
 from docling_core.types.doc import (

@@ -1,5 +1,6 @@
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, List, Literal, Optional, Tuple, Union
+from typing import List, Literal, Optional, Union

 import numpy as np
 from docling_core.types.doc import (

View File

@ -1,8 +1,9 @@
import logging import logging
import warnings import warnings
import zipfile import zipfile
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Iterable, List, Optional, Type from typing import List, Optional, Type
import numpy import numpy
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
@ -58,12 +59,10 @@ class EasyOcrModel(BaseOcrModel):
device = decide_device(accelerator_options.device) device = decide_device(accelerator_options.device)
# Enable easyocr GPU if running on CUDA, MPS # Enable easyocr GPU if running on CUDA, MPS
use_gpu = any( use_gpu = any(
[ device.startswith(x)
device.startswith(x) for x in [
for x in [ AcceleratorDevice.CUDA.value,
AcceleratorDevice.CUDA.value, AcceleratorDevice.MPS.value,
AcceleratorDevice.MPS.value,
]
] ]
) )
else: else:
@ -98,8 +97,10 @@ class EasyOcrModel(BaseOcrModel):
progress: bool = False, progress: bool = False,
) -> Path: ) -> Path:
# Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py # Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
from easyocr.config import detection_models as det_models_dict from easyocr.config import (
from easyocr.config import recognition_models as rec_models_dict detection_models as det_models_dict,
recognition_models as rec_models_dict,
)
if local_dir is None: if local_dir is None:
local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder
@ -126,13 +127,11 @@ class EasyOcrModel(BaseOcrModel):
def __call__( def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page] self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]: ) -> Iterable[Page]:
if not self.enabled: if not self.enabled:
yield from page_batch yield from page_batch
return return
for page in page_batch: for page in page_batch:
assert page._backend is not None assert page._backend is not None
if not page._backend.is_valid(): if not page._backend.is_valid():
yield page yield page
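
The `use_gpu` hunk above swaps a list comprehension inside `any()` for a bare generator expression (ruff C419): the list version materializes every element before checking, while the generator short-circuits at the first truthy result. A standalone sketch with a made-up device string:

device = "cuda:0"  # hypothetical value; normally from decide_device()
prefixes = ["cuda", "mps"]

# Before (C419): any([...]) builds the whole list first
use_gpu_old = any([device.startswith(p) for p in prefixes])

# After: the generator stops as soon as one prefix matches
use_gpu = any(device.startswith(p) for p in prefixes)

assert use_gpu == use_gpu_old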

View File

@ -9,7 +9,7 @@ from docling.models.factories.picture_description_factory import (
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@lru_cache() @lru_cache
def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory: def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
factory = OcrFactory() factory = OcrFactory()
factory.load_from_plugins(allow_external_plugins=allow_external_plugins) factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
@ -17,7 +17,7 @@ def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
return factory return factory
@lru_cache() @lru_cache
def get_picture_description_factory( def get_picture_description_factory(
allow_external_plugins: bool = False, allow_external_plugins: bool = False,
) -> PictureDescriptionFactory: ) -> PictureDescriptionFactory:
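
`functools.lru_cache` can be used as a bare decorator since Python 3.8, so the empty parentheses removed above were redundant; ruff's pyupgrade fix (UP011) drops them. A quick sketch:

from functools import lru_cache

@lru_cache  # equivalent to @lru_cache() when no arguments are passed
def fib(n: int) -> int:
    return n if n < 2 else fib(n - 1) + fib(n - 2)

print(fib(30))           # 832040, each fib(n) computed only once
print(fib.cache_info())  # hits/misses recorded by the cache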

View File

@ -33,7 +33,7 @@ class BaseFactory(Generic[A], metaclass=ABCMeta):
@property @property
def registered_kind(self) -> list[str]: def registered_kind(self) -> list[str]:
return list(opt.kind for opt in self._classes.keys()) return [opt.kind for opt in self._classes.keys()]
def get_enum(self) -> enum.Enum: def get_enum(self) -> enum.Enum:
return enum.Enum( return enum.Enum(

View File

@ -1,25 +1,22 @@
import logging import logging
import time import time
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Iterable, List, Optional from typing import Optional
from docling.datamodel.base_models import Page, VlmPrediction from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions, AcceleratorOptions,
HuggingFaceVlmOptions, HuggingFaceVlmOptions,
) )
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel from docling.models.base_model import BasePageModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
class HuggingFaceMlxModel(BasePageModel): class HuggingFaceMlxModel(BasePageModel):
def __init__( def __init__(
self, self,
enabled: bool, enabled: bool,
@ -32,7 +29,6 @@ class HuggingFaceMlxModel(BasePageModel):
self.vlm_options = vlm_options self.vlm_options = vlm_options
if self.enabled: if self.enabled:
try: try:
from mlx_vlm import generate, load # type: ignore from mlx_vlm import generate, load # type: ignore
from mlx_vlm.prompt_utils import apply_chat_template # type: ignore from mlx_vlm.prompt_utils import apply_chat_template # type: ignore
@ -125,6 +121,8 @@ class HuggingFaceMlxModel(BasePageModel):
generation_time = time.time() - start_time generation_time = time.time() - start_time
page_tags = output page_tags = output
_log.debug(f"Generation time {generation_time:.2f} seconds.")
# inference_time = time.time() - start_time # inference_time = time.time() - start_time
# tokens_per_second = num_tokens / generation_time # tokens_per_second = num_tokens / generation_time
# print("") # print("")

View File

@ -1,16 +1,15 @@
import logging import logging
import time import time
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Iterable, List, Optional from typing import Optional
from docling.datamodel.base_models import Page, VlmPrediction from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions, AcceleratorOptions,
HuggingFaceVlmOptions, HuggingFaceVlmOptions,
) )
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel from docling.models.base_model import BasePageModel
from docling.utils.accelerator_utils import decide_device from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder from docling.utils.profiling import TimeRecorder
@ -19,7 +18,6 @@ _log = logging.getLogger(__name__)
class HuggingFaceVlmModel(BasePageModel): class HuggingFaceVlmModel(BasePageModel):
def __init__( def __init__(
self, self,
enabled: bool, enabled: bool,
@ -42,7 +40,7 @@ class HuggingFaceVlmModel(BasePageModel):
device = decide_device(accelerator_options.device) device = decide_device(accelerator_options.device)
self.device = device self.device = device
_log.debug("Available device for HuggingFace VLM: {}".format(device)) _log.debug(f"Available device for HuggingFace VLM: {device}")
repo_cache_folder = vlm_options.repo_id.replace("/", "--") repo_cache_folder = vlm_options.repo_id.replace("/", "--")
@ -168,6 +166,10 @@ class HuggingFaceVlmModel(BasePageModel):
num_tokens = len(generated_ids[0]) num_tokens = len(generated_ids[0])
page_tags = generated_texts page_tags = generated_texts
_log.debug(
f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
)
# inference_time = time.time() - start_time # inference_time = time.time() - start_time
# tokens_per_second = num_tokens / generation_time # tokens_per_second = num_tokens / generation_time
# print("") # print("")

View File

@ -1,8 +1,9 @@
import copy import copy
import logging import logging
import warnings import warnings
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Iterable, Optional, Union from typing import Optional
from docling_core.types.doc import DocItemLabel from docling_core.types.doc import DocItemLabel
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
@ -142,7 +143,6 @@ class LayoutModel(BasePageModel):
def __call__( def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page] self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]: ) -> Iterable[Page]:
for page in page_batch: for page in page_batch:
assert page._backend is not None assert page._backend is not None
if not page._backend.is_valid(): if not page._backend.is_valid():

View File

@ -1,8 +1,9 @@
import logging import logging
import sys import sys
import tempfile import tempfile
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Iterable, Optional, Tuple, Type from typing import Optional, Type
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell from docling_core.types.doc.page import BoundingRectangle, TextCell
@ -41,7 +42,7 @@ class OcrMacModel(BaseOcrModel):
if self.enabled: if self.enabled:
if "darwin" != sys.platform: if "darwin" != sys.platform:
raise RuntimeError(f"OcrMac is only supported on Mac.") raise RuntimeError("OcrMac is only supported on Mac.")
install_errmsg = ( install_errmsg = (
"ocrmac is not correctly installed. " "ocrmac is not correctly installed. "
"Please install it via `pip install ocrmac` to use this OCR engine. " "Please install it via `pip install ocrmac` to use this OCR engine. "
@ -58,7 +59,6 @@ class OcrMacModel(BaseOcrModel):
def __call__( def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page] self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]: ) -> Iterable[Page]:
if not self.enabled: if not self.enabled:
yield from page_batch yield from page_batch
return return
@ -69,7 +69,6 @@ class OcrMacModel(BaseOcrModel):
yield page yield page
else: else:
with TimeRecorder(conv_res, "ocr"): with TimeRecorder(conv_res, "ocr"):
ocr_rects = self.get_ocr_rects(page) ocr_rects = self.get_ocr_rects(page)
all_ocr_cells = [] all_ocr_cells = []

View File

@ -1,6 +1,7 @@
import logging import logging
import re import re
from typing import Iterable, List from collections.abc import Iterable
from typing import List
from pydantic import BaseModel from pydantic import BaseModel
@ -53,9 +54,9 @@ class PageAssembleModel(BasePageModel):
sanitized_text = "".join(lines) sanitized_text = "".join(lines)
# Text normalization # Text normalization
sanitized_text = sanitized_text.replace("", "/") sanitized_text = sanitized_text.replace("", "/") # noqa: RUF001
sanitized_text = sanitized_text.replace("", "'") sanitized_text = sanitized_text.replace("", "'") # noqa: RUF001
sanitized_text = sanitized_text.replace("", "'") sanitized_text = sanitized_text.replace("", "'") # noqa: RUF001
sanitized_text = sanitized_text.replace("", '"') sanitized_text = sanitized_text.replace("", '"')
sanitized_text = sanitized_text.replace("", '"') sanitized_text = sanitized_text.replace("", '"')
sanitized_text = sanitized_text.replace("", "·") sanitized_text = sanitized_text.replace("", "·")
@ -71,7 +72,6 @@ class PageAssembleModel(BasePageModel):
yield page yield page
else: else:
with TimeRecorder(conv_res, "page_assemble"): with TimeRecorder(conv_res, "page_assemble"):
assert page.predictions.layout is not None assert page.predictions.layout is not None
# assembles some JSON output page by page. # assembles some JSON output page by page.
@ -83,7 +83,6 @@ class PageAssembleModel(BasePageModel):
for cluster in page.predictions.layout.clusters: for cluster in page.predictions.layout.clusters:
# _log.info("Cluster label seen:", cluster.label) # _log.info("Cluster label seen:", cluster.label)
if cluster.label in LayoutModel.TEXT_ELEM_LABELS: if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
textlines = [ textlines = [
cell.text.replace("\x02", "-").strip() cell.text.replace("\x02", "-").strip()
for cell in cluster.cells for cell in cluster.cells
@ -109,9 +108,7 @@ class PageAssembleModel(BasePageModel):
tbl = page.predictions.tablestructure.table_map.get( tbl = page.predictions.tablestructure.table_map.get(
cluster.id, None cluster.id, None
) )
if ( if not tbl: # fallback: add table without structure, if it isn't present
not tbl
): # fallback: add table without structure, if it isn't present
tbl = Table( tbl = Table(
label=cluster.label, label=cluster.label,
id=cluster.id, id=cluster.id,
@ -130,9 +127,7 @@ class PageAssembleModel(BasePageModel):
fig = page.predictions.figures_classification.figure_map.get( fig = page.predictions.figures_classification.figure_map.get(
cluster.id, None cluster.id, None
) )
if ( if not fig: # fallback: add figure without classification, if it isn't present
not fig
): # fallback: add figure without classification, if it isn't present
fig = FigureElement( fig = FigureElement(
label=cluster.label, label=cluster.label,
id=cluster.id, id=cluster.id,
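
The `# noqa: RUF001` markers added above silence ruff's ambiguous-unicode-character check: lookalike characters (fraction slash, curly quotes) are exactly what this normalization step is meant to replace, so the warning is suppressed per line rather than "fixed". A standalone sketch of the idea:

def normalize(text: str) -> str:
    # RUF001 warns that these characters look like ASCII but are not;
    # here the lookalikes are intentional, so the check is silenced.
    text = text.replace("⁄", "/")  # noqa: RUF001  (U+2044 FRACTION SLASH)
    text = text.replace("’", "'")  # noqa: RUF001  (U+2019 RIGHT SINGLE QUOTE)
    return text

print(normalize("3⁄4 of John’s notes"))  # -> 3/4 of John's notes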

View File

@ -1,5 +1,6 @@
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Iterable, Optional from typing import Optional
from PIL import ImageDraw from PIL import ImageDraw
from pydantic import BaseModel from pydantic import BaseModel

View File

@ -1,5 +1,6 @@
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Iterable, Optional, Type, Union from typing import Optional, Type, Union
from PIL import Image from PIL import Image

View File

@ -1,12 +1,11 @@
import logging
from abc import abstractmethod from abc import abstractmethod
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Any, Iterable, List, Optional, Type, Union from typing import List, Optional, Type, Union
from docling_core.types.doc import ( from docling_core.types.doc import (
DoclingDocument, DoclingDocument,
NodeItem, NodeItem,
PictureClassificationClass,
PictureItem, PictureItem,
) )
from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc

View File

@ -1,5 +1,6 @@
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Iterable, Optional, Type, Union from typing import Optional, Type, Union
from PIL import Image from PIL import Image
@ -13,7 +14,6 @@ from docling.utils.accelerator_utils import decide_device
class PictureDescriptionVlmModel(PictureDescriptionBaseModel): class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
@classmethod @classmethod
def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]: def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
return PictureDescriptionVlmOptions return PictureDescriptionVlmOptions
@ -36,7 +36,6 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
self.options: PictureDescriptionVlmOptions self.options: PictureDescriptionVlmOptions
if self.enabled: if self.enabled:
if artifacts_path is None: if artifacts_path is None:
artifacts_path = self.download_models(repo_id=self.options.repo_id) artifacts_path = self.download_models(repo_id=self.options.repo_id)
else: else:

View File

@ -1,6 +1,7 @@
import logging import logging
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Iterable, Optional, Type from typing import Optional, Type
import numpy import numpy
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
@ -74,13 +75,11 @@ class RapidOcrModel(BaseOcrModel):
def __call__( def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page] self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]: ) -> Iterable[Page]:
if not self.enabled: if not self.enabled:
yield from page_batch yield from page_batch
return return
for page in page_batch: for page in page_batch:
assert page._backend is not None assert page._backend is not None
if not page._backend.is_valid(): if not page._backend.is_valid():
yield page yield page

View File

@ -1,12 +1,7 @@
import copy
import random
from pathlib import Path from pathlib import Path
from typing import Dict, List from typing import Dict, List
from docling_core.types.doc import ( from docling_core.types.doc import (
BoundingBox,
CoordOrigin,
DocItem,
DocItemLabel, DocItemLabel,
DoclingDocument, DoclingDocument,
DocumentOrigin, DocumentOrigin,
@ -17,13 +12,10 @@ from docling_core.types.doc import (
TableData, TableData,
) )
from docling_core.types.doc.document import ContentLayer from docling_core.types.doc.document import ContentLayer
from docling_core.types.legacy_doc.base import Ref
from docling_core.types.legacy_doc.document import BaseText
from docling_ibm_models.reading_order.reading_order_rb import ( from docling_ibm_models.reading_order.reading_order_rb import (
PageElement as ReadingOrderPageElement, PageElement as ReadingOrderPageElement,
ReadingOrderPredictor,
) )
from docling_ibm_models.reading_order.reading_order_rb import ReadingOrderPredictor
from PIL import ImageDraw
from pydantic import BaseModel, ConfigDict from pydantic import BaseModel, ConfigDict
from docling.datamodel.base_models import ( from docling.datamodel.base_models import (
@ -35,7 +27,6 @@ from docling.datamodel.base_models import (
TextElement, TextElement,
) )
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.settings import settings
from docling.utils.profiling import ProfilingScope, TimeRecorder from docling.utils.profiling import ProfilingScope, TimeRecorder
@ -53,12 +44,10 @@ class ReadingOrderModel:
def _assembled_to_readingorder_elements( def _assembled_to_readingorder_elements(
self, conv_res: ConversionResult self, conv_res: ConversionResult
) -> List[ReadingOrderPageElement]: ) -> List[ReadingOrderPageElement]:
elements: List[ReadingOrderPageElement] = [] elements: List[ReadingOrderPageElement] = []
page_no_to_pages = {p.page_no: p for p in conv_res.pages} page_no_to_pages = {p.page_no: p for p in conv_res.pages}
for element in conv_res.assembled.elements: for element in conv_res.assembled.elements:
page_height = page_no_to_pages[element.page_no].size.height # type: ignore page_height = page_no_to_pages[element.page_no].size.height # type: ignore
bbox = element.cluster.bbox.to_bottom_left_origin(page_height) bbox = element.cluster.bbox.to_bottom_left_origin(page_height)
text = element.text or "" text = element.text or ""
@ -84,7 +73,6 @@ class ReadingOrderModel:
def _add_child_elements( def _add_child_elements(
self, element: BasePageElement, doc_item: NodeItem, doc: DoclingDocument self, element: BasePageElement, doc_item: NodeItem, doc: DoclingDocument
): ):
child: Cluster child: Cluster
for child in element.cluster.children: for child in element.cluster.children:
c_label = child.label c_label = child.label
@ -110,7 +98,7 @@ class ReadingOrderModel:
else: else:
doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov) doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)
def _readingorder_elements_to_docling_doc( def _readingorder_elements_to_docling_doc( # noqa: C901
self, self,
conv_res: ConversionResult, conv_res: ConversionResult,
ro_elements: List[ReadingOrderPageElement], ro_elements: List[ReadingOrderPageElement],
@ -118,7 +106,6 @@ class ReadingOrderModel:
el_to_footnotes_mapping: Dict[int, List[int]], el_to_footnotes_mapping: Dict[int, List[int]],
el_merges_mapping: Dict[int, List[int]], el_merges_mapping: Dict[int, List[int]],
) -> DoclingDocument: ) -> DoclingDocument:
id_to_elem = { id_to_elem = {
RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem
for elem in conv_res.assembled.elements for elem in conv_res.assembled.elements
@ -192,7 +179,6 @@ class ReadingOrderModel:
code_item.footnotes.append(new_footnote_item.get_ref()) code_item.footnotes.append(new_footnote_item.get_ref())
else: else:
new_item, current_list = self._handle_text_element( new_item, current_list = self._handle_text_element(
element, out_doc, current_list, page_height element, out_doc, current_list, page_height
) )
@ -206,7 +192,6 @@ class ReadingOrderModel:
) )
elif isinstance(element, Table): elif isinstance(element, Table):
tbl_data = TableData( tbl_data = TableData(
num_rows=element.num_rows, num_rows=element.num_rows,
num_cols=element.num_cols, num_cols=element.num_cols,
@ -342,12 +327,12 @@ class ReadingOrderModel:
return new_item, current_list return new_item, current_list
def _merge_elements(self, element, merged_elem, new_item, page_height): def _merge_elements(self, element, merged_elem, new_item, page_height):
assert isinstance( assert isinstance(merged_elem, type(element)), (
merged_elem, type(element) "Merged element must be of same type as element."
), "Merged element must be of same type as element." )
assert ( assert merged_elem.label == new_item.label, (
merged_elem.label == new_item.label "Labels of merged elements must match."
), "Labels of merged elements must match." )
prov = ProvenanceItem( prov = ProvenanceItem(
page_no=element.page_no + 1, page_no=element.page_no + 1,
charspan=( charspan=(
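
The two `assert` rewrites above are pure formatter output: where black wrapped the condition in parentheses to fit the line, the ruff formatter keeps the condition intact and parenthesizes the message instead. Same statement, different wrapping:

x = 5  # placeholder

# black-era wrapping:
# assert isinstance(
#     x, int
# ), "x must be an int."

# ruff formatter style: condition stays on one line, message is wrapped
assert isinstance(x, int), (
    "x must be an int."
)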

View File

@ -1,13 +1,13 @@
import copy import copy
import warnings import warnings
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Iterable, Optional, Union from typing import Optional
import numpy import numpy
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
from docling_core.types.doc.page import ( from docling_core.types.doc.page import (
BoundingRectangle, BoundingRectangle,
SegmentedPdfPage,
TextCellUnit, TextCellUnit,
) )
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
@ -44,7 +44,6 @@ class TableStructureModel(BasePageModel):
self.enabled = enabled self.enabled = enabled
if self.enabled: if self.enabled:
if artifacts_path is None: if artifacts_path is None:
artifacts_path = self.download_models() / self._model_path artifacts_path = self.download_models() / self._model_path
else: else:
@ -175,7 +174,6 @@ class TableStructureModel(BasePageModel):
def __call__( def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page] self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]: ) -> Iterable[Page]:
if not self.enabled: if not self.enabled:
yield from page_batch yield from page_batch
return return
@ -186,7 +184,6 @@ class TableStructureModel(BasePageModel):
yield page yield page
else: else:
with TimeRecorder(conv_res, "table_structure"): with TimeRecorder(conv_res, "table_structure"):
assert page.predictions.layout is not None assert page.predictions.layout is not None
assert page.size is not None assert page.size is not None
@ -260,7 +257,6 @@ class TableStructureModel(BasePageModel):
table_out = tf_output[0] table_out = tf_output[0]
table_cells = [] table_cells = []
for element in table_out["tf_responses"]: for element in table_out["tf_responses"]:
if not self.do_cell_matching: if not self.do_cell_matching:
the_bbox = BoundingBox.model_validate( the_bbox = BoundingBox.model_validate(
element["bbox"] element["bbox"]

View File

@ -3,9 +3,10 @@ import io
import logging import logging
import os import os
import tempfile import tempfile
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from subprocess import DEVNULL, PIPE, Popen from subprocess import DEVNULL, PIPE, Popen
from typing import Iterable, List, Optional, Tuple, Type from typing import List, Optional, Tuple, Type
import pandas as pd import pandas as pd
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
@ -63,8 +64,7 @@ class TesseractOcrCliModel(BaseOcrModel):
) )
def _get_name_and_version(self) -> Tuple[str, str]: def _get_name_and_version(self) -> Tuple[str, str]:
if self._name is not None and self._version is not None:
if self._name != None and self._version != None:
return self._name, self._version # type: ignore return self._name, self._version # type: ignore
cmd = [self.options.tesseract_cmd, "--version"] cmd = [self.options.tesseract_cmd, "--version"]
@ -125,14 +125,16 @@ class TesseractOcrCliModel(BaseOcrModel):
# _log.info(decoded_data) # _log.info(decoded_data)
# Read the TSV file generated by Tesseract # Read the TSV file generated by Tesseract
df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t") df_result = pd.read_csv(
io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t"
)
# Display the dataframe (optional) # Display the dataframe (optional)
# _log.info("df: ", df.head()) # _log.info("df: ", df.head())
# Filter rows that contain actual text (ignore header or empty rows) # Filter rows that contain actual text (ignore header or empty rows)
df_filtered = df[ df_filtered = df_result[
df["text"].notnull() & (df["text"].apply(str).str.strip() != "") df_result["text"].notna() & (df_result["text"].apply(str).str.strip() != "")
] ]
return df_filtered return df_filtered
@ -149,10 +151,10 @@ class TesseractOcrCliModel(BaseOcrModel):
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL) proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
output, _ = proc.communicate() output, _ = proc.communicate()
decoded_data = output.decode("utf-8") decoded_data = output.decode("utf-8")
df = pd.read_csv( df_detected = pd.read_csv(
io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"] io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
) )
scripts = df.loc[df["key"] == "Script"].value.tolist() scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
if len(scripts) == 0: if len(scripts) == 0:
_log.warning("Tesseract cannot detect the script of the page") _log.warning("Tesseract cannot detect the script of the page")
return None return None
@ -183,11 +185,11 @@ class TesseractOcrCliModel(BaseOcrModel):
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL) proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
output, _ = proc.communicate() output, _ = proc.communicate()
decoded_data = output.decode("utf-8") decoded_data = output.decode("utf-8")
df = pd.read_csv(io.StringIO(decoded_data), header=None) df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
self._tesseract_languages = df[0].tolist()[1:] self._tesseract_languages = df_list[0].tolist()[1:]
# Decide the script prefix # Decide the script prefix
if any([l.startswith("script/") for l in self._tesseract_languages]): if any(lang.startswith("script/") for lang in self._tesseract_languages):
script_prefix = "script/" script_prefix = "script/"
else: else:
script_prefix = "" script_prefix = ""
@ -197,7 +199,6 @@ class TesseractOcrCliModel(BaseOcrModel):
def __call__( def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page] self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]: ) -> Iterable[Page]:
if not self.enabled: if not self.enabled:
yield from page_batch yield from page_batch
return return
@ -225,19 +226,19 @@ class TesseractOcrCliModel(BaseOcrModel):
fname = image_file.name fname = image_file.name
high_res_image.save(image_file) high_res_image.save(image_file)
df = self._run_tesseract(fname) df_result = self._run_tesseract(fname)
finally: finally:
if os.path.exists(fname): if os.path.exists(fname):
os.remove(fname) os.remove(fname)
# _log.info(df) # _log.info(df_result)
# Print relevant columns (bounding box and text) # Print relevant columns (bounding box and text)
for ix, row in df.iterrows(): for ix, row in df_result.iterrows():
text = row["text"] text = row["text"]
conf = row["conf"] conf = row["conf"]
l = float(row["left"]) l = float(row["left"]) # noqa: E741
b = float(row["top"]) b = float(row["top"])
w = float(row["width"]) w = float(row["width"])
h = float(row["height"]) h = float(row["height"])

View File

@ -1,6 +1,7 @@
import logging import logging
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Iterable, Optional, Type from typing import Optional, Type
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell from docling_core.types.doc.page import BoundingRectangle, TextCell
@ -37,9 +38,6 @@ class TesseractOcrModel(BaseOcrModel):
self.options: TesseractOcrOptions self.options: TesseractOcrOptions
self.scale = 3 # multiplier for 72 dpi == 216 dpi. self.scale = 3 # multiplier for 72 dpi == 216 dpi.
self.reader = None
self.osd_reader = None
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
if self.enabled: if self.enabled:
install_errmsg = ( install_errmsg = (
@ -64,7 +62,7 @@ class TesseractOcrModel(BaseOcrModel):
raise ImportError(install_errmsg) raise ImportError(install_errmsg)
try: try:
tesseract_version = tesserocr.tesseract_version() tesseract_version = tesserocr.tesseract_version()
except: except Exception:
raise ImportError(install_errmsg) raise ImportError(install_errmsg)
_, self._tesserocr_languages = tesserocr.get_languages() _, self._tesserocr_languages = tesserocr.get_languages()
@ -75,7 +73,7 @@ class TesseractOcrModel(BaseOcrModel):
_log.debug("Initializing TesserOCR: %s", tesseract_version) _log.debug("Initializing TesserOCR: %s", tesseract_version)
lang = "+".join(self.options.lang) lang = "+".join(self.options.lang)
if any([l.startswith("script/") for l in self._tesserocr_languages]): if any(lang.startswith("script/") for lang in self._tesserocr_languages):
self.script_prefix = "script/" self.script_prefix = "script/"
else: else:
self.script_prefix = "" self.script_prefix = ""
@ -86,6 +84,10 @@ class TesseractOcrModel(BaseOcrModel):
"oem": tesserocr.OEM.DEFAULT, "oem": tesserocr.OEM.DEFAULT,
} }
self.reader = None
self.osd_reader = None
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
if self.options.path is not None: if self.options.path is not None:
tesserocr_kwargs["path"] = self.options.path tesserocr_kwargs["path"] = self.options.path

View File

@ -3,9 +3,10 @@ import logging
import time import time
import traceback import traceback
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from typing import Any, Callable, Iterable, List from collections.abc import Iterable
from typing import Any, Callable, List
from docling_core.types.doc import DoclingDocument, NodeItem from docling_core.types.doc import NodeItem
from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend
@ -64,7 +65,6 @@ class BasePipeline(ABC):
return conv_res return conv_res
def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult: def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
def _prepare_elements( def _prepare_elements(
conv_res: ConversionResult, model: GenericEnrichmentModel[Any] conv_res: ConversionResult, model: GenericEnrichmentModel[Any]
) -> Iterable[NodeItem]: ) -> Iterable[NodeItem]:
@ -113,7 +113,6 @@ class BasePipeline(ABC):
class PaginatedPipeline(BasePipeline): # TODO this is a bad name. class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
def __init__(self, pipeline_options: PipelineOptions): def __init__(self, pipeline_options: PipelineOptions):
super().__init__(pipeline_options) super().__init__(pipeline_options)
self.keep_backend = False self.keep_backend = False
@ -127,7 +126,6 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
yield from page_batch yield from page_batch
def _build_document(self, conv_res: ConversionResult) -> ConversionResult: def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
if not isinstance(conv_res.input._backend, PdfDocumentBackend): if not isinstance(conv_res.input._backend, PdfDocumentBackend):
raise RuntimeError( raise RuntimeError(
f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. " f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. "
@ -139,8 +137,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
total_elapsed_time = 0.0 total_elapsed_time = 0.0
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT): with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
for i in range(conv_res.input.page_count):
for i in range(0, conv_res.input.page_count):
start_page, end_page = conv_res.input.limits.page_range start_page, end_page = conv_res.input.limits.page_range
if (start_page - 1) <= i <= (end_page - 1): if (start_page - 1) <= i <= (end_page - 1):
conv_res.pages.append(Page(page_no=i)) conv_res.pages.append(Page(page_no=i))
@ -161,7 +158,6 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
pipeline_pages = self._apply_on_pages(conv_res, init_pages) pipeline_pages = self._apply_on_pages(conv_res, init_pages)
for p in pipeline_pages: # Must exhaust! for p in pipeline_pages: # Must exhaust!
# Cleanup cached images # Cleanup cached images
if not self.keep_images: if not self.keep_images:
p._image_cache = {} p._image_cache = {}
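
The page loop above drops the redundant start argument: `range(0, n)` and `range(n)` are identical, and ruff (flake8-pie, PIE808) removes the explicit zero. Trivial, but worth seeing once:

page_count = 3  # placeholder

assert list(range(0, page_count)) == list(range(page_count)) == [0, 1, 2]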

View File

@ -24,7 +24,6 @@ class SimplePipeline(BasePipeline):
super().__init__(pipeline_options) super().__init__(pipeline_options)
def _build_document(self, conv_res: ConversionResult) -> ConversionResult: def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
if not isinstance(conv_res.input._backend, DeclarativeDocumentBackend): if not isinstance(conv_res.input._backend, DeclarativeDocumentBackend):
raise RuntimeError( raise RuntimeError(
f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a declarative backend. " f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a declarative backend. "

View File

@ -1,5 +1,4 @@
import logging import logging
import sys
import warnings import warnings
from pathlib import Path from pathlib import Path
from typing import Optional, cast from typing import Optional, cast

View File

@ -1,5 +1,4 @@
import logging import logging
import warnings
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from typing import List, Optional, Union, cast from typing import List, Optional, Union, cast
@ -32,7 +31,6 @@ _log = logging.getLogger(__name__)
class VlmPipeline(PaginatedPipeline): class VlmPipeline(PaginatedPipeline):
def __init__(self, pipeline_options: VlmPipelineOptions): def __init__(self, pipeline_options: VlmPipelineOptions):
super().__init__(pipeline_options) super().__init__(pipeline_options)
self.keep_backend = True self.keep_backend = True
@ -114,7 +112,6 @@ class VlmPipeline(PaginatedPipeline):
def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult: def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT): with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
if ( if (
self.pipeline_options.vlm_options.response_format self.pipeline_options.vlm_options.response_format
== ResponseFormat.DOCTAGS == ResponseFormat.DOCTAGS

View File

@ -1,8 +1,8 @@
import logging import logging
from typing import Any, Dict, Iterable, List, Tuple, Union from collections.abc import Iterable
from typing import Any, Dict, List, Tuple, Union
from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import TextCell
from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
from docling.datamodel.document import ConversionResult, Page from docling.datamodel.document import ConversionResult, Page
@ -13,7 +13,6 @@ _log = logging.getLogger(__name__)
def generate_multimodal_pages( def generate_multimodal_pages(
doc_result: ConversionResult, doc_result: ConversionResult,
) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]: ) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
label_to_doclaynet = { label_to_doclaynet = {
"title": "title", "title": "title",
"table-of-contents": "document_index", "table-of-contents": "document_index",
@ -122,7 +121,6 @@ def generate_multimodal_pages(
if doc.main_text is None: if doc.main_text is None:
return return
for ix, orig_item in enumerate(doc.main_text): for ix, orig_item in enumerate(doc.main_text):
item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item
if item is None or item.prov is None or len(item.prov) == 0: if item is None or item.prov is None or len(item.prov) == 0:
_log.debug(f"Skipping item {orig_item}") _log.debug(f"Skipping item {orig_item}")

View File

@ -29,7 +29,7 @@ def resolve_item(paths, obj):
try: try:
key = int(paths[0]) key = int(paths[0])
except: except Exception:
key = paths[0] key = paths[0]
if len(paths) == 1: if len(paths) == 1:
@ -67,7 +67,7 @@ def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:
return unique_objects return unique_objects
def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument: def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument: # noqa: C901
origin = DocumentOrigin( origin = DocumentOrigin(
mimetype="application/pdf", mimetype="application/pdf",
filename=doc_glm["file-info"]["filename"], filename=doc_glm["file-info"]["filename"],

View File

@ -18,7 +18,7 @@ class UnionFind:
def __init__(self, elements): def __init__(self, elements):
self.parent = {elem: elem for elem in elements} self.parent = {elem: elem for elem in elements}
self.rank = {elem: 0 for elem in elements} self.rank = dict.fromkeys(elements, 0)
def find(self, x): def find(self, x):
if self.parent[x] != x: if self.parent[x] != x:
@ -484,7 +484,9 @@ class LayoutPostprocessor:
spatial_index = ( spatial_index = (
self.regular_index self.regular_index
if cluster_type == "regular" if cluster_type == "regular"
else self.picture_index if cluster_type == "picture" else self.wrapper_index else self.picture_index
if cluster_type == "picture"
else self.wrapper_index
) )
# Map of currently valid clusters # Map of currently valid clusters
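
In `UnionFind.__init__` above, a comprehension that maps every key to the same constant is replaced with `dict.fromkeys`, which ruff prefers for this shape. Safe here because the shared value is an immutable `0`; with a mutable default every key would share one object. Sketch:

elements = ["a", "b", "c"]

rank_old = {elem: 0 for elem in elements}  # before
rank = dict.fromkeys(elements, 0)          # after: same mapping, clearer intent

assert rank == rank_old
# Caveat: dict.fromkeys(elements, []) would share ONE list across all keys.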

View File

@ -37,7 +37,7 @@ def download_models(
output_dir.mkdir(exist_ok=True, parents=True) output_dir.mkdir(exist_ok=True, parents=True)
if with_layout: if with_layout:
_log.info(f"Downloading layout model...") _log.info("Downloading layout model...")
LayoutModel.download_models( LayoutModel.download_models(
local_dir=output_dir / LayoutModel._model_repo_folder, local_dir=output_dir / LayoutModel._model_repo_folder,
force=force, force=force,
@ -45,7 +45,7 @@ def download_models(
) )
if with_tableformer: if with_tableformer:
_log.info(f"Downloading tableformer model...") _log.info("Downloading tableformer model...")
TableStructureModel.download_models( TableStructureModel.download_models(
local_dir=output_dir / TableStructureModel._model_repo_folder, local_dir=output_dir / TableStructureModel._model_repo_folder,
force=force, force=force,
@ -53,7 +53,7 @@ def download_models(
) )
if with_picture_classifier: if with_picture_classifier:
_log.info(f"Downloading picture classifier model...") _log.info("Downloading picture classifier model...")
DocumentPictureClassifier.download_models( DocumentPictureClassifier.download_models(
local_dir=output_dir / DocumentPictureClassifier._model_repo_folder, local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
force=force, force=force,
@ -61,7 +61,7 @@ def download_models(
) )
if with_code_formula: if with_code_formula:
_log.info(f"Downloading code formula model...") _log.info("Downloading code formula model...")
CodeFormulaModel.download_models( CodeFormulaModel.download_models(
local_dir=output_dir / CodeFormulaModel._model_repo_folder, local_dir=output_dir / CodeFormulaModel._model_repo_folder,
force=force, force=force,
@ -69,7 +69,7 @@ def download_models(
) )
if with_smolvlm: if with_smolvlm:
_log.info(f"Downloading SmolVlm model...") _log.info("Downloading SmolVlm model...")
PictureDescriptionVlmModel.download_models( PictureDescriptionVlmModel.download_models(
repo_id=smolvlm_picture_description.repo_id, repo_id=smolvlm_picture_description.repo_id,
local_dir=output_dir / smolvlm_picture_description.repo_cache_folder, local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
@ -78,7 +78,7 @@ def download_models(
) )
if with_granite_vision: if with_granite_vision:
_log.info(f"Downloading Granite Vision model...") _log.info("Downloading Granite Vision model...")
PictureDescriptionVlmModel.download_models( PictureDescriptionVlmModel.download_models(
repo_id=granite_picture_description.repo_id, repo_id=granite_picture_description.repo_id,
local_dir=output_dir / granite_picture_description.repo_cache_folder, local_dir=output_dir / granite_picture_description.repo_cache_folder,
@ -87,7 +87,7 @@ def download_models(
) )
if with_easyocr: if with_easyocr:
_log.info(f"Downloading easyocr models...") _log.info("Downloading easyocr models...")
EasyOcrModel.download_models( EasyOcrModel.download_models(
local_dir=output_dir / EasyOcrModel._model_repo_folder, local_dir=output_dir / EasyOcrModel._model_repo_folder,
force=force, force=force,

View File

@ -13,7 +13,7 @@ def chunkify(iterator, chunk_size):
if isinstance(iterator, List): if isinstance(iterator, List):
iterator = iter(iterator) iterator = iter(iterator)
for first in iterator: # Take the first element from the iterator for first in iterator: # Take the first element from the iterator
yield [first] + list(islice(iterator, chunk_size - 1)) yield [first, *list(islice(iterator, chunk_size - 1))]
def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str: def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:
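
`chunkify` above trades list concatenation for iterable unpacking (RUF005): `[first, *rest]` avoids building and then merging an intermediate list. The helper in full, runnable on its own:

from itertools import islice

def chunkify(iterator, chunk_size):
    iterator = iter(iterator)
    for first in iterator:  # take one element, then up to chunk_size - 1 more
        yield [first, *list(islice(iterator, chunk_size - 1))]

print(list(chunkify(range(7), 3)))  # -> [[0, 1, 2], [3, 4, 5], [6]]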

View File

@ -383,7 +383,7 @@
"\n", "\n",
"print(f\"Downloading {url}...\")\n", "print(f\"Downloading {url}...\")\n",
"buf = BytesIO(requests.get(url).content)\n", "buf = BytesIO(requests.get(url).content)\n",
"print(f\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n", "print(\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n",
"with zipfile.ZipFile(buf) as zf:\n", "with zipfile.ZipFile(buf) as zf:\n",
" res = zf.testzip()\n", " res = zf.testzip()\n",
" if res:\n", " if res:\n",
@ -544,7 +544,7 @@
"source": [ "source": [
"doc = backend.convert()\n", "doc = backend.convert()\n",
"\n", "\n",
"claims_sec = [item for item in doc.texts if item.text == \"CLAIMS\"][0]\n", "claims_sec = next(item for item in doc.texts if item.text == \"CLAIMS\")\n",
"print(f'Patent \"{doc.texts[0].text}\" has {len(claims_sec.children)} claims')" "print(f'Patent \"{doc.texts[0].text}\" has {len(claims_sec.children)} claims')"
] ]
}, },
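
The notebook cell above replaces `[...][0]` with `next(...)` (RUF015): the comprehension builds the entire list just to take its first element, while `next` on a generator stops at the first match. Sketch with a stand-in for `doc.texts`:

texts = ["ABSTRACT", "CLAIMS", "DESCRIPTION"]  # stand-in for doc.texts

first_old = [t for t in texts if t == "CLAIMS"][0]  # scans everything
first = next(t for t in texts if t == "CLAIMS")     # stops at the match

assert first == first_old == "CLAIMS"
# Note: next() raises StopIteration where [...][0] would raise IndexError.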

View File

@ -1,8 +1,8 @@
import json import json
import logging import logging
import time import time
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Iterable
import yaml import yaml
from docling_core.types.doc import ImageRefMode from docling_core.types.doc import ImageRefMode
@ -11,7 +11,6 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
from docling.datamodel.base_models import ConversionStatus, InputFormat from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -3,7 +3,6 @@ import logging
import time import time
from pathlib import Path from pathlib import Path
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (
AcceleratorDevice, AcceleratorDevice,
@ -11,9 +10,6 @@ from docling.datamodel.pipeline_options import (
PdfPipelineOptions, PdfPipelineOptions,
) )
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.models.ocr_mac_model import OcrMacOptions
from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
from docling.models.tesseract_ocr_model import TesseractOcrOptions
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)

View File

@ -3,8 +3,8 @@
# It does not run the actual formula understanding model. # It does not run the actual formula understanding model.
import logging import logging
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Iterable
from docling_core.types.doc import DocItemLabel, DoclingDocument, NodeItem, TextItem from docling_core.types.doc import DocItemLabel, DoclingDocument, NodeItem, TextItem
@ -49,7 +49,6 @@ class ExampleFormulaUnderstandingEnrichmentModel(BaseItemAndImageEnrichmentModel
# How the pipeline can be extended. # How the pipeline can be extended.
class ExampleFormulaUnderstandingPipeline(StandardPdfPipeline): class ExampleFormulaUnderstandingPipeline(StandardPdfPipeline):
def __init__(self, pipeline_options: ExampleFormulaUnderstandingPipelineOptions): def __init__(self, pipeline_options: ExampleFormulaUnderstandingPipelineOptions):
super().__init__(pipeline_options) super().__init__(pipeline_options)
self.pipeline_options: ExampleFormulaUnderstandingPipelineOptions self.pipeline_options: ExampleFormulaUnderstandingPipelineOptions
@ -85,7 +84,7 @@ def main():
) )
} }
) )
result = doc_converter.convert(input_doc_path) doc_converter.convert(input_doc_path)
if __name__ == "__main__": if __name__ == "__main__":

View File

@ -3,8 +3,9 @@
# It does not run the actual picture classifier model. # It does not run the actual picture classifier model.
import logging import logging
from collections.abc import Iterable
from pathlib import Path from pathlib import Path
from typing import Any, Iterable from typing import Any
from docling_core.types.doc import ( from docling_core.types.doc import (
DoclingDocument, DoclingDocument,

View File

@ -4,7 +4,7 @@ from pathlib import Path
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.base_models import FigureElement, InputFormat, Table from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption

View File

@ -51,7 +51,6 @@ def main():
page_segments, page_segments,
page, page,
) in generate_multimodal_pages(conv_res): ) in generate_multimodal_pages(conv_res):
dpi = page._default_image_scale * 72 dpi = page._default_image_scale * 72
rows.append( rows.append(
@ -81,10 +80,10 @@ def main():
) )
# Generate one parquet from all documents # Generate one parquet from all documents
df = pd.json_normalize(rows) df_result = pd.json_normalize(rows)
now = datetime.datetime.now() now = datetime.datetime.now()
output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet" output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
df.to_parquet(output_filename) df_result.to_parquet(output_filename)
end_time = time.time() - start_time end_time = time.time() - start_time

View File

@ -32,12 +32,12 @@ def main():
print(table_df.to_markdown()) print(table_df.to_markdown())
# Save the table as csv # Save the table as csv
element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv" element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.csv"
_log.info(f"Saving CSV table to {element_csv_filename}") _log.info(f"Saving CSV table to {element_csv_filename}")
table_df.to_csv(element_csv_filename) table_df.to_csv(element_csv_filename)
# Save the table as html # Save the table as html
element_html_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.html" element_html_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.html"
_log.info(f"Saving HTML table to {element_html_filename}") _log.info(f"Saving HTML table to {element_html_filename}")
with element_html_filename.open("w") as fp: with element_html_filename.open("w") as fp:
fp.write(table.export_to_html(doc=conv_res.document)) fp.write(table.export_to_html(doc=conv_res.document))

View File

@ -1,14 +1,9 @@
from pathlib import Path from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrMacOptions,
PdfPipelineOptions, PdfPipelineOptions,
RapidOcrOptions,
TesseractCliOcrOptions, TesseractCliOcrOptions,
TesseractOcrOptions,
) )
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption

View File

@ -153,10 +153,10 @@
"source": [ "source": [
"for i, chunk in enumerate(chunk_iter):\n", "for i, chunk in enumerate(chunk_iter):\n",
" print(f\"=== {i} ===\")\n", " print(f\"=== {i} ===\")\n",
" print(f\"chunk.text:\\n{repr(f'{chunk.text[:300]}…')}\")\n", " print(f\"chunk.text:\\n{f'{chunk.text[:300]}…'!r}\")\n",
"\n", "\n",
" enriched_text = chunker.serialize(chunk=chunk)\n", " enriched_text = chunker.serialize(chunk=chunk)\n",
" print(f\"chunker.serialize(chunk):\\n{repr(f'{enriched_text[:300]}…')}\")\n", " print(f\"chunker.serialize(chunk):\\n{f'{enriched_text[:300]}…'!r}\")\n",
"\n", "\n",
" print()" " print()"
] ]
@ -353,11 +353,11 @@
"for i, chunk in enumerate(chunks):\n", "for i, chunk in enumerate(chunks):\n",
" print(f\"=== {i} ===\")\n", " print(f\"=== {i} ===\")\n",
" txt_tokens = len(tokenizer.tokenize(chunk.text))\n", " txt_tokens = len(tokenizer.tokenize(chunk.text))\n",
" print(f\"chunk.text ({txt_tokens} tokens):\\n{repr(chunk.text)}\")\n", " print(f\"chunk.text ({txt_tokens} tokens):\\n{chunk.text!r}\")\n",
"\n", "\n",
" ser_txt = chunker.serialize(chunk=chunk)\n", " ser_txt = chunker.serialize(chunk=chunk)\n",
" ser_tokens = len(tokenizer.tokenize(ser_txt))\n", " ser_tokens = len(tokenizer.tokenize(ser_txt))\n",
" print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{repr(ser_txt)}\")\n", " print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{ser_txt!r}\")\n",
"\n", "\n",
" print()" " print()"
] ]
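
The chunking cells above swap `repr(...)` calls inside f-strings for the `!r` conversion flag (RUF010); the output is byte-for-byte identical. Sketch:

chunk_text = "Docling converts documents…"  # placeholder chunk text

old = f"chunk.text:\n{repr(chunk_text)}"  # explicit repr() call
new = f"chunk.text:\n{chunk_text!r}"      # conversion flag, same result

assert old == new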

View File

@ -2,17 +2,14 @@ import json
import time import time
from pathlib import Path from pathlib import Path
import yaml from docling_core.types.doc import DocItemLabel, ImageRefMode
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (
AcceleratorDevice,
VlmPipelineOptions, VlmPipelineOptions,
granite_vision_vlm_conversion_options,
smoldocling_vlm_conversion_options,
smoldocling_vlm_mlx_conversion_options, smoldocling_vlm_mlx_conversion_options,
) )
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline from docling.pipeline.vlm_pipeline import VlmPipeline
@ -39,9 +36,6 @@ pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
## Alternative VLM models: ## Alternative VLM models:
# pipeline_options.vlm_options = granite_vision_vlm_conversion_options # pipeline_options.vlm_options = granite_vision_vlm_conversion_options
from docling_core.types.doc import DocItemLabel, ImageRefMode
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
## Set up pipeline for PDF or image inputs ## Set up pipeline for PDF or image inputs
converter = DocumentConverter( converter = DocumentConverter(
format_options={ format_options={
@ -62,7 +56,7 @@ out_path.mkdir(parents=True, exist_ok=True)
for source in sources: for source in sources:
start_time = time.time() start_time = time.time()
print("================================================") print("================================================")
print("Processing... {}".format(source)) print(f"Processing... {source}")
print("================================================") print("================================================")
print("") print("")
@ -77,7 +71,7 @@ for source in sources:
print(page.predictions.vlm_response.text) print(page.predictions.vlm_response.text)
res.document.save_as_html( res.document.save_as_html(
filename=Path("{}/{}.html".format(out_path, res.input.file.stem)), filename=Path(f"{out_path}/{res.input.file.stem}.html"),
image_mode=ImageRefMode.REFERENCED, image_mode=ImageRefMode.REFERENCED,
labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE], labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
) )

View File

@ -144,7 +144,7 @@
"for pic in doc.pictures[:5]:\n", "for pic in doc.pictures[:5]:\n",
" html_item = (\n", " html_item = (\n",
" f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n", " f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n",
" f'<img src=\"{str(pic.image.uri)}\" /><br />'\n", " f'<img src=\"{pic.image.uri!s}\" /><br />'\n",
" f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n", " f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n",
" )\n", " )\n",
" for annotation in pic.annotations:\n", " for annotation in pic.annotations:\n",
@ -252,7 +252,7 @@
"for pic in doc.pictures[:5]:\n", "for pic in doc.pictures[:5]:\n",
" html_item = (\n", " html_item = (\n",
" f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n", " f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n",
" f'<img src=\"{str(pic.image.uri)}\" /><br />'\n", " f'<img src=\"{pic.image.uri!s}\" /><br />'\n",
" f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n", " f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n",
" )\n", " )\n",
" for annotation in pic.annotations:\n", " for annotation in pic.annotations:\n",

View File

@@ -283,7 +283,7 @@
},
{
"cell_type": "code",
-"execution_count": 23,
+"execution_count": null,
"metadata": {},
"outputs": [
{
@@ -369,7 +369,7 @@
"    new_index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)\n",
"    try:\n",
"        index_client.delete_index(index_name)\n",
-"    except:\n",
+"    except Exception:\n",
"        pass\n",
"\n",
"    index_client.create_or_update_index(new_index)\n",
@@ -487,7 +487,7 @@
"\n",
"    all_succeeded = all(r.succeeded for r in resp)\n",
"    console.print(\n",
-"        f\"Uploaded batch {i} -> {i+len(subset)}; all_succeeded: {all_succeeded}, \"\n",
+"        f\"Uploaded batch {i} -> {i + len(subset)}; all_succeeded: {all_succeeded}, \"\n",
"        f\"first_doc_status_code: {resp[0].status_code}\"\n",
"    )\n",
"\n",
@@ -807,10 +807,12 @@
}
],
"source": [
+"from typing import Optional\n",
+"\n",
"from azure.search.documents.models import VectorizableTextQuery\n",
"\n",
"\n",
-"def generate_chat_response(prompt: str, system_message: str = None):\n",
+"def generate_chat_response(prompt: str, system_message: Optional[str] = None):\n",
"    \"\"\"\n",
"    Generates a single-turn chat response using Azure OpenAI Chat.\n",
"    If you need multi-turn conversation or follow-up queries, you'll have to\n",

View File

@@ -351,7 +351,7 @@
"for source in sources:\n",
"    if EXPORT_TYPE == ExportType.DOC_CHUNKS:\n",
"        doc_chunk = DocChunk.model_validate(source.meta[\"dl_meta\"])\n",
-"        print(f\"- text: {repr(doc_chunk.text)}\")\n",
+"        print(f\"- text: {doc_chunk.text!r}\")\n",
"        if doc_chunk.meta.origin:\n",
"            print(f\"  file: {doc_chunk.meta.origin.filename}\")\n",
"        if doc_chunk.meta.headings:\n",

View File

@@ -341,7 +341,7 @@
"print(f\"Question:\\n{resp_dict['input']}\\n\\nAnswer:\\n{clipped_answer}\")\n",
"for i, doc in enumerate(resp_dict[\"context\"]):\n",
"    print()\n",
-"    print(f\"Source {i+1}:\")\n",
+"    print(f\"Source {i + 1}:\")\n",
"    print(f\"  text: {json.dumps(clip_text(doc.page_content, threshold=350))}\")\n",
"    for key in doc.metadata:\n",
"        if key != \"pk\":\n",

View File

@@ -59,7 +59,7 @@
},
{
"cell_type": "code",
-"execution_count": 1,
+"execution_count": null,
"metadata": {
"collapsed": true,
"id": "u076oUSF_YUG"
@@ -72,12 +72,11 @@
"%pip install rich\n",
"%pip install torch\n",
"\n",
+"import logging\n",
"import warnings\n",
"\n",
"warnings.filterwarnings(\"ignore\")\n",
"\n",
-"import logging\n",
-"\n",
"# Suppress Weaviate client logs\n",
"logging.getLogger(\"weaviate\").setLevel(logging.ERROR)"
]
@@ -119,7 +118,7 @@
"    device = torch.device(\"mps\")\n",
"    print(\"MPS GPU is enabled.\")\n",
"else:\n",
-"    raise EnvironmentError(\n",
+"    raise OSError(\n",
"        \"No GPU or MPS device found. Please check your environment and ensure GPU or MPS support is configured.\"\n",
"    )"
]
@@ -226,7 +225,6 @@
}
],
"source": [
-"from docling.datamodel.document import ConversionResult\n",
"from docling.document_converter import DocumentConverter\n",
"\n",
"# Instantiate the doc converter\n",
@@ -345,7 +343,7 @@
"\n",
"    openai_api_key = os.getenv(openai_api_key_var)\n",
"    if not openai_api_key:\n",
-"        raise EnvironmentError(\n",
+"        raise OSError(\n",
"            f\"Environment variable '{openai_api_key_var}' is not set. \"\n",
"            \"Please define it before running this script.\"\n",
"        )"
@@ -387,7 +385,6 @@
"outputs": [],
"source": [
"import weaviate.classes.config as wc\n",
-"from weaviate.classes.config import DataType, Property\n",
"\n",
"# Define the collection name\n",
"collection_name = \"docling\"\n",

View File

@@ -25,9 +25,7 @@ def main():
    document = mdb.convert()

    out_path = Path("scratch")
-    print(
-        f"Document {path} converted." f"\nSaved markdown output to: {str(out_path)}"
-    )
+    print(f"Document {path} converted.\nSaved markdown output to: {out_path!s}")

    # Export Docling document format to markdowndoc:
    fn = os.path.basename(path)

View File

@@ -1,13 +1,10 @@
from pathlib import Path

-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    PdfPipelineOptions,
-    TesseractCliOcrOptions,
-    TesseractOcrOptions,
)
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption

View File

@@ -63,7 +63,7 @@ def main():
    out_path = Path("scratch")
    print(
        f"Document {res.input.file.name} converted."
-        f"\nSaved markdown output to: {str(out_path)}"
+        f"\nSaved markdown output to: {out_path!s}"
    )
    _log.debug(res.document._export_to_indented_text(max_text_len=16))
    # Export Docling document format to markdowndoc:

View File

@@ -4,7 +4,6 @@ from docling.datamodel.base_models import InputFormat

from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    TesseractCliOcrOptions,
-    TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

View File

@@ -2,9 +2,9 @@ import logging
import time
from pathlib import Path

-from docling_core.types.doc import ImageRefMode, PictureItem, TableItem, TextItem
+from docling_core.types.doc import ImageRefMode, TableItem, TextItem

-from docling.datamodel.base_models import FigureElement, InputFormat, Table
+from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
@@ -15,7 +15,6 @@ IMAGE_RESOLUTION_SCALE = 2.0

-
# FIXME: put in your favorite translation code ....
def translate(text: str, src: str = "en", dest: str = "de"):
    _log.warning("!!! IMPLEMENT HERE YOUR FAVORITE TRANSLATION CODE!!!")
    # from googletrans import Translator
@@ -52,10 +51,9 @@ def main():
        }
    )

-    start_time = time.time()
    conv_res = doc_converter.convert(input_doc_path)
    conv_doc = conv_res.document
    doc_filename = conv_res.input.file

    # Save markdown with embedded pictures in original text
    md_filename = output_dir / f"{doc_filename}-with-images-orig.md"

View File

@@ -432,7 +432,7 @@
"\n",
"for i, doc in enumerate(resp_dict[\"context\"][:]):\n",
"    image_by_page = {}\n",
-"    print(f\"Source {i+1}:\")\n",
+"    print(f\"Source {i + 1}:\")\n",
"    print(f\"  text: {json.dumps(clip_text(doc.page_content, threshold=350))}\")\n",
"    meta = DocMeta.model_validate(doc.metadata[\"dl_meta\"])\n",
"\n",

View File

@@ -10,7 +10,6 @@ from docling.datamodel.pipeline_options import (
    ApiVlmOptions,
    ResponseFormat,
    VlmPipelineOptions,
-    granite_vision_vlm_ollama_conversion_options,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

98
poetry.lock generated
View File

@@ -692,6 +692,84 @@ traitlets = ">=4"
[package.extras]
test = ["pytest"]
[[package]]
name = "coverage"
version = "7.8.0"
description = "Code coverage measurement for Python"
optional = false
python-versions = ">=3.9"
files = [
{file = "coverage-7.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2931f66991175369859b5fd58529cd4b73582461877ecfd859b6549869287ffe"},
{file = "coverage-7.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:52a523153c568d2c0ef8826f6cc23031dc86cffb8c6aeab92c4ff776e7951b28"},
{file = "coverage-7.8.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c8a5c139aae4c35cbd7cadca1df02ea8cf28a911534fc1b0456acb0b14234f3"},
{file = "coverage-7.8.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5a26c0c795c3e0b63ec7da6efded5f0bc856d7c0b24b2ac84b4d1d7bc578d676"},
{file = "coverage-7.8.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:821f7bcbaa84318287115d54becb1915eece6918136c6f91045bb84e2f88739d"},
{file = "coverage-7.8.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a321c61477ff8ee705b8a5fed370b5710c56b3a52d17b983d9215861e37b642a"},
{file = "coverage-7.8.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:ed2144b8a78f9d94d9515963ed273d620e07846acd5d4b0a642d4849e8d91a0c"},
{file = "coverage-7.8.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:042e7841a26498fff7a37d6fda770d17519982f5b7d8bf5278d140b67b61095f"},
{file = "coverage-7.8.0-cp310-cp310-win32.whl", hash = "sha256:f9983d01d7705b2d1f7a95e10bbe4091fabc03a46881a256c2787637b087003f"},
{file = "coverage-7.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:5a570cd9bd20b85d1a0d7b009aaf6c110b52b5755c17be6962f8ccd65d1dbd23"},
{file = "coverage-7.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e7ac22a0bb2c7c49f441f7a6d46c9c80d96e56f5a8bc6972529ed43c8b694e27"},
{file = "coverage-7.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bf13d564d310c156d1c8e53877baf2993fb3073b2fc9f69790ca6a732eb4bfea"},
{file = "coverage-7.8.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5761c70c017c1b0d21b0815a920ffb94a670c8d5d409d9b38857874c21f70d7"},
{file = "coverage-7.8.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e5ff52d790c7e1628241ffbcaeb33e07d14b007b6eb00a19320c7b8a7024c040"},
{file = "coverage-7.8.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d39fc4817fd67b3915256af5dda75fd4ee10621a3d484524487e33416c6f3543"},
{file = "coverage-7.8.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b44674870709017e4b4036e3d0d6c17f06a0e6d4436422e0ad29b882c40697d2"},
{file = "coverage-7.8.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8f99eb72bf27cbb167b636eb1726f590c00e1ad375002230607a844d9e9a2318"},
{file = "coverage-7.8.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b571bf5341ba8c6bc02e0baeaf3b061ab993bf372d982ae509807e7f112554e9"},
{file = "coverage-7.8.0-cp311-cp311-win32.whl", hash = "sha256:e75a2ad7b647fd8046d58c3132d7eaf31b12d8a53c0e4b21fa9c4d23d6ee6d3c"},
{file = "coverage-7.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:3043ba1c88b2139126fc72cb48574b90e2e0546d4c78b5299317f61b7f718b78"},
{file = "coverage-7.8.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:bbb5cc845a0292e0c520656d19d7ce40e18d0e19b22cb3e0409135a575bf79fc"},
{file = "coverage-7.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4dfd9a93db9e78666d178d4f08a5408aa3f2474ad4d0e0378ed5f2ef71640cb6"},
{file = "coverage-7.8.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f017a61399f13aa6d1039f75cd467be388d157cd81f1a119b9d9a68ba6f2830d"},
{file = "coverage-7.8.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0915742f4c82208ebf47a2b154a5334155ed9ef9fe6190674b8a46c2fb89cb05"},
{file = "coverage-7.8.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a40fcf208e021eb14b0fac6bdb045c0e0cab53105f93ba0d03fd934c956143a"},
{file = "coverage-7.8.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a1f406a8e0995d654b2ad87c62caf6befa767885301f3b8f6f73e6f3c31ec3a6"},
{file = "coverage-7.8.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:77af0f6447a582fdc7de5e06fa3757a3ef87769fbb0fdbdeba78c23049140a47"},
{file = "coverage-7.8.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:f2d32f95922927186c6dbc8bc60df0d186b6edb828d299ab10898ef3f40052fe"},
{file = "coverage-7.8.0-cp312-cp312-win32.whl", hash = "sha256:769773614e676f9d8e8a0980dd7740f09a6ea386d0f383db6821df07d0f08545"},
{file = "coverage-7.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:e5d2b9be5b0693cf21eb4ce0ec8d211efb43966f6657807f6859aab3814f946b"},
{file = "coverage-7.8.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5ac46d0c2dd5820ce93943a501ac5f6548ea81594777ca585bf002aa8854cacd"},
{file = "coverage-7.8.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:771eb7587a0563ca5bb6f622b9ed7f9d07bd08900f7589b4febff05f469bea00"},
{file = "coverage-7.8.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42421e04069fb2cbcbca5a696c4050b84a43b05392679d4068acbe65449b5c64"},
{file = "coverage-7.8.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:554fec1199d93ab30adaa751db68acec2b41c5602ac944bb19187cb9a41a8067"},
{file = "coverage-7.8.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5aaeb00761f985007b38cf463b1d160a14a22c34eb3f6a39d9ad6fc27cb73008"},
{file = "coverage-7.8.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:581a40c7b94921fffd6457ffe532259813fc68eb2bdda60fa8cc343414ce3733"},
{file = "coverage-7.8.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:f319bae0321bc838e205bf9e5bc28f0a3165f30c203b610f17ab5552cff90323"},
{file = "coverage-7.8.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:04bfec25a8ef1c5f41f5e7e5c842f6b615599ca8ba8391ec33a9290d9d2db3a3"},
{file = "coverage-7.8.0-cp313-cp313-win32.whl", hash = "sha256:dd19608788b50eed889e13a5d71d832edc34fc9dfce606f66e8f9f917eef910d"},
{file = "coverage-7.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:a9abbccd778d98e9c7e85038e35e91e67f5b520776781d9a1e2ee9d400869487"},
{file = "coverage-7.8.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:18c5ae6d061ad5b3e7eef4363fb27a0576012a7447af48be6c75b88494c6cf25"},
{file = "coverage-7.8.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:95aa6ae391a22bbbce1b77ddac846c98c5473de0372ba5c463480043a07bff42"},
{file = "coverage-7.8.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e013b07ba1c748dacc2a80e69a46286ff145935f260eb8c72df7185bf048f502"},
{file = "coverage-7.8.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d766a4f0e5aa1ba056ec3496243150698dc0481902e2b8559314368717be82b1"},
{file = "coverage-7.8.0-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad80e6b4a0c3cb6f10f29ae4c60e991f424e6b14219d46f1e7d442b938ee68a4"},
{file = "coverage-7.8.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:b87eb6fc9e1bb8f98892a2458781348fa37e6925f35bb6ceb9d4afd54ba36c73"},
{file = "coverage-7.8.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:d1ba00ae33be84066cfbe7361d4e04dec78445b2b88bdb734d0d1cbab916025a"},
{file = "coverage-7.8.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f3c38e4e5ccbdc9198aecc766cedbb134b2d89bf64533973678dfcf07effd883"},
{file = "coverage-7.8.0-cp313-cp313t-win32.whl", hash = "sha256:379fe315e206b14e21db5240f89dc0774bdd3e25c3c58c2c733c99eca96f1ada"},
{file = "coverage-7.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:2e4b6b87bb0c846a9315e3ab4be2d52fac905100565f4b92f02c445c8799e257"},
{file = "coverage-7.8.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fa260de59dfb143af06dcf30c2be0b200bed2a73737a8a59248fcb9fa601ef0f"},
{file = "coverage-7.8.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:96121edfa4c2dfdda409877ea8608dd01de816a4dc4a0523356067b305e4e17a"},
{file = "coverage-7.8.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b8af63b9afa1031c0ef05b217faa598f3069148eeee6bb24b79da9012423b82"},
{file = "coverage-7.8.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:89b1f4af0d4afe495cd4787a68e00f30f1d15939f550e869de90a86efa7e0814"},
{file = "coverage-7.8.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94ec0be97723ae72d63d3aa41961a0b9a6f5a53ff599813c324548d18e3b9e8c"},
{file = "coverage-7.8.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:8a1d96e780bdb2d0cbb297325711701f7c0b6f89199a57f2049e90064c29f6bd"},
{file = "coverage-7.8.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:f1d8a2a57b47142b10374902777e798784abf400a004b14f1b0b9eaf1e528ba4"},
{file = "coverage-7.8.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:cf60dd2696b457b710dd40bf17ad269d5f5457b96442f7f85722bdb16fa6c899"},
{file = "coverage-7.8.0-cp39-cp39-win32.whl", hash = "sha256:be945402e03de47ba1872cd5236395e0f4ad635526185a930735f66710e1bd3f"},
{file = "coverage-7.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:90e7fbc6216ecaffa5a880cdc9c77b7418c1dcb166166b78dbc630d07f278cc3"},
{file = "coverage-7.8.0-pp39.pp310.pp311-none-any.whl", hash = "sha256:b8194fb8e50d556d5849753de991d390c5a1edeeba50f68e3a9253fbd8bf8ccd"},
{file = "coverage-7.8.0-py3-none-any.whl", hash = "sha256:dbf364b4c5e7bae9250528167dfe40219b62e2d573c854d74be213e1e52069f7"},
{file = "coverage-7.8.0.tar.gz", hash = "sha256:7a3d62b3b03b4b6fd41a085f3574874cf946cb4604d2b4d3e8dca8cd570ca501"},
]
[package.dependencies]
tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.11.0a6\" and extra == \"toml\""}
[package.extras]
toml = ["tomli"]
[[package]]
name = "cryptography"
version = "43.0.3"
@@ -5073,6 +5151,24 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
[package.extras]
testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
[[package]]
name = "pytest-cov"
version = "6.1.1"
description = "Pytest plugin for measuring coverage."
optional = false
python-versions = ">=3.9"
files = [
{file = "pytest_cov-6.1.1-py3-none-any.whl", hash = "sha256:bddf29ed2d0ab6f4df17b4c55b0a657287db8684af9c42ea546b21b1041b3dde"},
{file = "pytest_cov-6.1.1.tar.gz", hash = "sha256:46935f7aaefba760e716c2ebfbe1c216240b9592966e7da99ea8292d4d3e2a0a"},
]
[package.dependencies]
coverage = {version = ">=7.5", extras = ["toml"]}
pytest = ">=4.6"
[package.extras]
testing = ["fields", "hunter", "process-tests", "pytest-xdist", "virtualenv"]
[[package]]
name = "pytest-xdist"
version = "3.6.1"
@@ -7882,4 +7978,4 @@ vlm = ["accelerate", "transformers", "transformers"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
-content-hash = "d2a8f7997b9ffb249ad26ba492b766d580bdb0072d50e76b0afd92496e983e96"
+content-hash = "b36037ec17dc4b6d5197a2f63a1367e05bf888b4fa97e2e2e8c29c217741d69c"

View File

@@ -110,6 +110,8 @@ ipywidgets = "^8.1.5"
nbqa = "^1.9.0"
types-openpyxl = "^3.1.5.20241114"
types-tqdm = "^4.67.0.20241221"
+coverage = "^7.6.2"
+pytest-cov = "^6.0.0"

[tool.poetry.group.docs.dependencies]
mkdocs-material = "^9.5.40"
@@ -164,15 +166,82 @@ docling-tools = "docling.cli.tools:app"
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

-[tool.black]
-line-length = 88
-target-version = ["py39"]
-include = '\.pyi?$'
-
-[tool.isort]
-profile = "black"
-line_length = 88
-py_version = 39
+[tool.ruff]
+target-version = "py39"
+line-length = 88
+respect-gitignore = true
+
+# extend-exclude = [
+#     "tests",
+# ]
+
+[tool.ruff.format]
+skip-magic-trailing-comma = false
+
+[tool.ruff.lint]
+select = [
+    # "B", # flake8-bugbear
+    "C", # flake8-comprehensions
+    "C9", # mccabe
+    # "D", # flake8-docstrings
+    "E", # pycodestyle errors (default)
+    "F", # pyflakes (default)
+    "I", # isort
+    "PD", # pandas-vet
+    "PIE", # pie
+    # "PTH", # pathlib
+    "Q", # flake8-quotes
+    # "RET", # return
+    "RUF", # Enable all ruff-specific checks
+    # "SIM", # simplify
+    "S307", # eval
+    # "T20", # (disallow print statements) keep debugging statements out of the codebase
+    "W", # pycodestyle warnings
+    "ASYNC", # async
+    "UP", # pyupgrade
+]
+
+ignore = [
+    "C408", # Unnecessary `dict()` call (rewrite as a literal)
+    "E501", # Line too long, handled by ruff formatter
+    "D107", # "Missing docstring in __init__",
+    "F401", # imported but unused; consider using `importlib.util.find_spec` to test for "
+    "F811", # "redefinition of the same function"
+    "PL", # Pylint
+    "RUF012", # Mutable Class Attributes
+    "UP006", # List vs list, etc
+    "UP007", # Option and Union
+    "UP035", # `typing.Set` is deprecated, use `set` instead"
+]
+
+#extend-select = []
+
+[tool.ruff.lint.pep8-naming]
+classmethod-decorators = [
+    # Allow Pydantic's `@validator` decorator to trigger class method treatment.
+    "pydantic.validator",
+]
+
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["E402", "F401"]
+"tests/*.py" = ["ASYNC"] # Disable ASYNC check for tests
+
+[tool.ruff.lint.mccabe]
+max-complexity = 20
+
+# [tool.ruff.lint.isort.sections]
+# "docling" = ["docling_core", "docling_ibm_models", "docling_parse"]
+
+[tool.ruff.lint.isort]
+combine-as-imports = true
+# section-order = [
+#     "future",
+#     "standard-library",
+#     "third-party",
+#     "docling",
+#     "first-party",
+#     "local-folder",
+# ]

[tool.mypy]
pretty = true
@@ -200,10 +269,6 @@ module = [
]
ignore_missing_imports = true

-[tool.flake8]
-max-line-length = 88
-extend-ignore = ["E203", "E501"]
-
[tool.semantic_release]
# for default values check:
# https://github.com/python-semantic-release/python-semantic-release/blob/v7.32.2/semantic_release/defaults.cfg

View File

@@ -19,7 +19,6 @@ def _get_backend(fname):

-
def test_asciidocs_examples():
    fnames = sorted(glob.glob("./tests/data/asciidoc/*.asciidoc"))

    for fname in fnames:
@@ -38,8 +37,8 @@ def test_asciidocs_examples():
        print("\n\n", pred_mddoc)

        if os.path.exists(gname):
-            with open(gname, "r") as fr:
-                true_mddoc = fr.read()
+            with open(gname) as fr:
+                fr.read()

            # assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
        else:

View File

@@ -1,5 +1,3 @@
-import json
-import os
from pathlib import Path

from pytest import warns
@@ -15,22 +13,19 @@ GENERATE = GEN_TEST_DATA

-
def get_csv_paths():
    # Define the directory you want to search
-    directory = Path(f"./tests/data/csv/")
+    directory = Path("./tests/data/csv/")

    # List all CSV files in the directory and its subdirectories
    return sorted(directory.rglob("*.csv"))


-
def get_csv_path(name: str):
    # Return the matching CSV file path
    return Path(f"./tests/data/csv/{name}.csv")


-
def get_converter():
    converter = DocumentConverter(allowed_formats=[InputFormat.CSV])

    return converter
@@ -55,9 +50,9 @@ def test_e2e_valid_csv_conversions():
        pred_itxt: str = doc._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
-        assert verify_export(
-            pred_itxt, str(gt_path) + ".itxt"
-        ), "export to indented-text"
+        assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
+            "export to indented-text"
+        )

        assert verify_document(
            pred_doc=doc,

View File

@@ -32,7 +32,7 @@ def test_text_cell_counts():
    doc_backend = _get_backend(pdf_doc)

-    for page_index in range(0, doc_backend.page_count()):
+    for page_index in range(doc_backend.page_count()):
        last_cell_count = None
        for i in range(10):
            page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
@@ -42,9 +42,9 @@ def test_text_cell_counts():
                last_cell_count = len(cells)

            if len(cells) != last_cell_count:
-                assert (
-                    False
-                ), "Loading page multiple times yielded non-identical text cell counts"
+                assert False, (
+                    "Loading page multiple times yielded non-identical text cell counts"
+                )

            last_cell_count = len(cells)
@@ -66,7 +66,7 @@ def test_crop_page_image(test_doc_path):
    page_backend: DoclingParsePageBackend = doc_backend.load_page(0)

    # Crop out "Figure 1" from the DocLayNet paper
-    im = page_backend.get_page_image(
+    page_backend.get_page_image(
        scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
    )
    # im.show()

View File

@@ -31,7 +31,7 @@ def test_text_cell_counts():
    doc_backend = _get_backend(pdf_doc)

-    for page_index in range(0, doc_backend.page_count()):
+    for page_index in range(doc_backend.page_count()):
        last_cell_count = None
        for i in range(10):
            page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0)
@@ -41,9 +41,9 @@ def test_text_cell_counts():
                last_cell_count = len(cells)

            if len(cells) != last_cell_count:
-                assert (
-                    False
-                ), "Loading page multiple times yielded non-identical text cell counts"
+                assert False, (
+                    "Loading page multiple times yielded non-identical text cell counts"
+                )

            last_cell_count = len(cells)
@@ -65,7 +65,7 @@ def test_crop_page_image(test_doc_path):
    page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0)

    # Crop out "Figure 1" from the DocLayNet paper
-    im = page_backend.get_page_image(
+    page_backend.get_page_image(
        scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
    )
    # im.show()

View File

@@ -31,7 +31,7 @@ def test_text_cell_counts():
    doc_backend = _get_backend(pdf_doc)

-    for page_index in range(0, doc_backend.page_count()):
+    for page_index in range(doc_backend.page_count()):
        last_cell_count = None
        for i in range(10):
            page_backend: DoclingParseV4PageBackend = doc_backend.load_page(0)
@@ -41,9 +41,9 @@ def test_text_cell_counts():
                last_cell_count = len(cells)

            if len(cells) != last_cell_count:
-                assert (
-                    False
-                ), "Loading page multiple times yielded non-identical text cell counts"
+                assert False, (
+                    "Loading page multiple times yielded non-identical text cell counts"
+                )

            last_cell_count = len(cells)
@@ -65,7 +65,7 @@ def test_crop_page_image(test_doc_path):
    page_backend: DoclingParseV4PageBackend = doc_backend.load_page(0)

    # Crop out "Figure 1" from the DocLayNet paper
-    im = page_backend.get_page_image(
+    page_backend.get_page_image(
        scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
    )
    # im.show()

View File

@@ -105,7 +105,6 @@ def test_ordered_lists():

-
def get_html_paths():
    # Define the directory you want to search
    directory = Path("./tests/data/html/")
@@ -115,14 +114,12 @@ def get_html_paths():

-
def get_converter():
    converter = DocumentConverter(allowed_formats=[InputFormat.HTML])

    return converter


-
def test_e2e_html_conversions():
    html_paths = get_html_paths()
    converter = get_converter()
@@ -138,15 +135,15 @@ def test_e2e_html_conversions():
        doc: DoclingDocument = conv_result.document

        pred_md: str = doc.export_to_markdown()
-        assert verify_export(
-            pred_md, str(gt_path) + ".md", generate=GENERATE
-        ), "export to md"
+        assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
+            "export to md"
+        )

        pred_itxt: str = doc._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
-        assert verify_export(
-            pred_itxt, str(gt_path) + ".itxt", generate=GENERATE
-        ), "export to indented-text"
+        assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
+            "export to indented-text"
+        )

        assert verify_document(doc, str(gt_path) + ".json", GENERATE)

View File

@@ -15,7 +15,7 @@ GENERATE = GEN_TEST_DATA


def get_pubmed_paths():
-    directory = Path(os.path.dirname(__file__) + f"/data/pubmed/")
+    directory = Path(os.path.dirname(__file__) + "/data/pubmed/")
    xml_files = sorted(directory.rglob("*.xml"))
    return xml_files
@@ -47,9 +47,9 @@ def test_e2e_pubmed_conversions(use_stream=False):
        pred_itxt: str = doc._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
-        assert verify_export(
-            pred_itxt, str(gt_path) + ".itxt"
-        ), "export to indented-text"
+        assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
+            "export to indented-text"
+        )
        assert verify_document(doc, str(gt_path) + ".json", GENERATE), "export to json"

View File

@@ -17,7 +17,6 @@ GENERATE = GEN_TEST_DATA

-
def get_xlsx_paths():
    # Define the directory you want to search
    directory = Path("./tests/data/xlsx/")
@@ -27,7 +26,6 @@ def get_xlsx_paths():

-
def get_converter():
    converter = DocumentConverter(allowed_formats=[InputFormat.XLSX])

    return converter
@@ -65,13 +63,13 @@ def test_e2e_xlsx_conversions(documents) -> None:
        pred_itxt: str = doc._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
-        assert verify_export(
-            pred_itxt, str(gt_path) + ".itxt"
-        ), "export to indented-text"
+        assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
+            "export to indented-text"
+        )

-        assert verify_document(
-            doc, str(gt_path) + ".json", GENERATE
-        ), "document document"
+        assert verify_document(doc, str(gt_path) + ".json", GENERATE), (
+            "document document"
+        )


def test_pages(documents) -> None:
@@ -81,7 +79,7 @@ def test_pages(documents) -> None:
        documents: The paths and converted documents.
    """
    # number of pages from the backend method
-    path = [item for item in get_xlsx_paths() if item.stem == "test-01"][0]
+    path = next(item for item in get_xlsx_paths() if item.stem == "test-01")
    in_doc = InputDocument(
        path_or_stream=path,
        format=InputFormat.XLSX,
@@ -92,7 +90,7 @@ def test_pages(documents) -> None:
    assert backend.page_count() == 3

    # number of pages from the converted document
-    doc = [item for path, item in documents if path.stem == "test-01"][0]
+    doc = next(item for path, item in documents if path.stem == "test-01")
    assert len(doc.pages) == 3

    # page sizes as number of cells

View File

@@ -1,4 +1,3 @@
-import os
from pathlib import Path

from docling.backend.msword_backend import MsWordDocumentBackend
@@ -43,7 +42,6 @@ def test_heading_levels():

-
def get_docx_paths():
    # Define the directory you want to search
    directory = Path("./tests/data/docx/")
@@ -53,14 +51,12 @@ def get_docx_paths():

-
def get_converter():
    converter = DocumentConverter(allowed_formats=[InputFormat.DOCX])

    return converter


-
def test_e2e_docx_conversions():
    docx_paths = get_docx_paths()
    converter = get_converter()
@@ -76,20 +72,20 @@ def test_e2e_docx_conversions():
        doc: DoclingDocument = conv_result.document

        pred_md: str = doc.export_to_markdown()
-        assert verify_export(
-            pred_md, str(gt_path) + ".md", generate=GENERATE
-        ), "export to md"
+        assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
+            "export to md"
+        )

        pred_itxt: str = doc._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
-        assert verify_export(
-            pred_itxt, str(gt_path) + ".itxt", generate=GENERATE
-        ), "export to indented-text"
+        assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
+            "export to indented-text"
+        )

-        assert verify_document(
-            doc, str(gt_path) + ".json", generate=GENERATE
-        ), "document document"
+        assert verify_document(doc, str(gt_path) + ".json", generate=GENERATE), (
+            "document document"
+        )

        if docx_path.name == "word_tables.docx":
            pred_html: str = doc.export_to_html()

View File

@@ -109,27 +109,27 @@ def test_patent_groundtruth(patents, groundtruth):
        md_name = path.stem + ".md"
        if md_name in gt_names:
            pred_md = doc.export_to_markdown()
-            assert (
-                pred_md == gt_names[md_name]
-            ), f"Markdown file mismatch against groundtruth {md_name}"
+            assert pred_md == gt_names[md_name], (
+                f"Markdown file mismatch against groundtruth {md_name}"
+            )
        json_path = path.with_suffix(".json")
        if json_path.stem in gt_names:
-            assert verify_document(
-                doc, str(json_path), GENERATE
-            ), f"JSON file mismatch against groundtruth {json_path}"
+            assert verify_document(doc, str(json_path), GENERATE), (
+                f"JSON file mismatch against groundtruth {json_path}"
+            )
        itxt_name = path.stem + ".itxt"
        if itxt_name in gt_names:
            pred_itxt = doc._export_to_indented_text()
-            assert (
-                pred_itxt == gt_names[itxt_name]
-            ), f"Indented text file mismatch against groundtruth {itxt_name}"
+            assert pred_itxt == gt_names[itxt_name], (
+                f"Indented text file mismatch against groundtruth {itxt_name}"
+            )


def test_tables(tables):
    """Test the table parser."""
    # CHECK table in file tables_20180000016.xml
    file_name = "tables_ipa20180000016.xml"
-    file_table = [item[1] for item in tables if item[0].name == file_name][0]
+    file_table = next(item[1] for item in tables if item[0].name == file_name)
    assert file_table.num_rows == 13
    assert file_table.num_cols == 10
    assert len(file_table.table_cells) == 130
@@ -140,7 +140,7 @@ def test_patent_uspto_ice(patents):

    # CHECK application doc number 20200022300
    file_name = "ipa20200022300.xml"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
    if GENERATE:
        _generate_groundtruth(doc, Path(file_name).stem)
@@ -278,7 +278,7 @@

    # CHECK application doc number 20180000016 for HTML entities, level 2 headings, tables
    file_name = "ipa20180000016.xml"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
    if GENERATE:
        _generate_groundtruth(doc, Path(file_name).stem)
@@ -348,7 +348,7 @@

    # CHECK application doc number 20110039701 for complex long tables
    file_name = "ipa20110039701.xml"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
    assert doc.name == file_name
    assert len(doc.tables) == 17
@@ -358,7 +358,7 @@ def test_patent_uspto_grant_v2(patents):

    # CHECK application doc number 06442728
    file_name = "pg06442728.xml"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
    if GENERATE:
        _generate_groundtruth(doc, Path(file_name).stem)
@@ -376,12 +376,12 @@ def test_patent_uspto_grant_v2(patents):
    assert isinstance(texts[2], TextItem)
    assert texts[2].text == (
        "An interleaver receives incoming data frames of size N. The interleaver "
-        "indexes the elements of the frame with an N₁×N₂ index array. The interleaver "
+        "indexes the elements of the frame with an N₁×N₂ index array. The interleaver "  # noqa: RUF001
        "then effectively rearranges (permutes) the data by permuting the rows of the "
-        "index array. The interleaver employs the equation I(j,k)=I(j,αjk+βj)modP) to "
+        "index array. The interleaver employs the equation I(j,k)=I(j,αjk+βj)modP) to "  # noqa: RUF001
        "permute the columns (indexed by k) of each row (indexed by j). P is at least "
        "equal to N₂, βj is a constant which may be different for each row, and each "
-        "αj is a relative prime number relative to P. After permuting, the "
+        "αj is a relative prime number relative to P. After permuting, the "  # noqa: RUF001
        "interleaver outputs the data in a different order than received (e.g., "
        "receives sequentially row by row, outputs sequentially each column by column)."
    )
@@ -402,7 +402,7 @@ def test_patent_uspto_app_v1(patents):

    # CHECK application doc number 20010031492
    file_name = "pa20010031492.xml"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
    if GENERATE:
        _generate_groundtruth(doc, Path(file_name).stem)
@@ -432,7 +432,7 @@ def test_patent_uspto_grant_aps(patents):

    # CHECK application doc number 057006474
    file_name = "pftaps057006474.txt"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
    if GENERATE:
        _generate_groundtruth(doc, Path(file_name).stem)

View File

@@ -32,7 +32,7 @@ def test_text_cell_counts():
    doc_backend = _get_backend(pdf_doc)

-    for page_index in range(0, doc_backend.page_count()):
+    for page_index in range(doc_backend.page_count()):
        last_cell_count = None
        for i in range(10):
            page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
@@ -42,9 +42,9 @@ def test_text_cell_counts():
                last_cell_count = len(cells)

            if len(cells) != last_cell_count:
-                assert (
-                    False
-                ), "Loading page multiple times yielded non-identical text cell counts"
+                assert False, (
+                    "Loading page multiple times yielded non-identical text cell counts"
+                )

            last_cell_count = len(cells)
@@ -66,7 +66,7 @@ def test_crop_page_image(test_doc_path):
    page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)

    # Crop out "Figure 1" from the DocLayNet paper
-    im = page_backend.get_page_image(
+    page_backend.get_page_image(
        scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
    )
    # im.show()

View File

@@ -1,4 +1,3 @@
-import os
from pathlib import Path

from docling.datamodel.base_models import InputFormat
@@ -12,7 +11,6 @@ GENERATE = GEN_TEST_DATA

-
def get_pptx_paths():
    # Define the directory you want to search
    directory = Path("./tests/data/pptx/")
@@ -22,14 +20,12 @@ def get_pptx_paths():

-
def get_converter():
    converter = DocumentConverter(allowed_formats=[InputFormat.PPTX])

    return converter


-
def test_e2e_pptx_conversions():
    pptx_paths = get_pptx_paths()
    converter = get_converter()
@@ -50,10 +46,10 @@ def test_e2e_pptx_conversions():
        pred_itxt: str = doc._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
-        assert verify_export(
-            pred_itxt, str(gt_path) + ".itxt"
-        ), "export to indented-text"
+        assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
+            "export to indented-text"
+        )

-        assert verify_document(
-            doc, str(gt_path) + ".json", GENERATE
-        ), "document document"
+        assert verify_document(doc, str(gt_path) + ".json", GENERATE), (
+            "document document"
+        )

View File

@@ -3,7 +3,6 @@ from pathlib import Path
from docling_core.types.doc import CodeItem, TextItem
from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel

-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions
@@ -12,7 +11,6 @@ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline

-
def get_converter():
    pipeline_options = PdfPipelineOptions()
    pipeline_options.generate_page_images = True

View File

@@ -2,7 +2,6 @@ from pathlib import Path
from docling_core.types.doc import PictureClassificationData

-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions
@@ -11,7 +10,6 @@ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline

-
def get_converter():
    pipeline_options = PdfPipelineOptions()
    pipeline_options.generate_page_images = True
@@ -49,32 +47,32 @@ def test_picture_classifier():
    res = results[0]
    assert len(res.annotations) == 1
-    assert type(res.annotations[0]) == PictureClassificationData
+    assert isinstance(res.annotations[0], PictureClassificationData)
    classification_data = res.annotations[0]
    assert classification_data.provenance == "DocumentPictureClassifier"
-    assert (
-        len(classification_data.predicted_classes) == 16
-    ), "Number of predicted classes is not equal to 16"
+    assert len(classification_data.predicted_classes) == 16, (
+        "Number of predicted classes is not equal to 16"
+    )
    confidences = [pred.confidence for pred in classification_data.predicted_classes]
-    assert confidences == sorted(
-        confidences, reverse=True
-    ), "Predictions are not sorted in descending order of confidence"
+    assert confidences == sorted(confidences, reverse=True), (
+        "Predictions are not sorted in descending order of confidence"
+    )
-    assert (
-        classification_data.predicted_classes[0].class_name == "bar_chart"
-    ), "The prediction is wrong for the bar chart image."
+    assert classification_data.predicted_classes[0].class_name == "bar_chart", (
+        "The prediction is wrong for the bar chart image."
+    )

    res = results[1]
    assert len(res.annotations) == 1
-    assert type(res.annotations[0]) == PictureClassificationData
+    assert isinstance(res.annotations[0], PictureClassificationData)
    classification_data = res.annotations[0]
    assert classification_data.provenance == "DocumentPictureClassifier"
-    assert (
-        len(classification_data.predicted_classes) == 16
-    ), "Number of predicted classes is not equal to 16"
+    assert len(classification_data.predicted_classes) == 16, (
+        "Number of predicted classes is not equal to 16"
+    )
    confidences = [pred.confidence for pred in classification_data.predicted_classes]
-    assert confidences == sorted(
-        confidences, reverse=True
-    ), "Predictions are not sorted in descending order of confidence"
+    assert confidences == sorted(confidences, reverse=True), (
+        "Predictions are not sorted in descending order of confidence"
+    )
-    assert (
-        classification_data.predicted_classes[0].class_name == "map"
-    ), "The prediction is wrong for the bar chart image."
+    assert classification_data.predicted_classes[0].class_name == "map", (
+        "The prediction is wrong for the bar chart image."
+    )

View File

@@ -1,7 +1,6 @@
from pathlib import Path

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
-from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import AcceleratorDevice, PdfPipelineOptions
@@ -15,7 +14,6 @@ GENERATE_V2 = GEN_TEST_DATA

-
def get_pdf_paths():
    # Define the directory you want to search
    directory = Path("./tests/data/pdf/")
@@ -25,7 +23,6 @@ def get_pdf_paths():

-
def get_converter():
    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = True
@@ -45,7 +42,6 @@ def get_converter():

-
def test_e2e_pdfs_conversions():
    pdf_paths = get_pdf_paths()
    converter = get_converter()

View File

@@ -3,7 +3,6 @@ from pathlib import Path
from typing import List

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
-from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (

View File

@@ -12,10 +12,9 @@ from docling.document_converter import PdfFormatOption


def test_in_doc_from_valid_path():
    test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
    doc = _make_input_doc(test_doc_path)
-    assert doc.valid == True
+    assert doc.valid is True


-
def test_in_doc_from_invalid_path():
@@ -23,29 +22,26 @@ def test_in_doc_from_invalid_path():
    doc = _make_input_doc(test_doc_path)
-    assert doc.valid == False
+    assert doc.valid is False


-
def test_in_doc_from_valid_buf():
    buf = BytesIO(Path("./tests/data/pdf/2206.01062.pdf").open("rb").read())
    stream = DocumentStream(name="my_doc.pdf", stream=buf)
    doc = _make_input_doc_from_stream(stream)
-    assert doc.valid == True
+    assert doc.valid is True


-
def test_in_doc_from_invalid_buf():
    buf = BytesIO(b"")
    stream = DocumentStream(name="my_doc.pdf", stream=buf)
    doc = _make_input_doc_from_stream(stream)
-    assert doc.valid == False
+    assert doc.valid is False


-
def test_image_in_pdf_backend():
    in_doc = InputDocument(
        path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
        format=InputFormat.IMAGE,
@@ -76,7 +72,6 @@ def test_image_in_pdf_backend():

-
def test_in_doc_with_page_range():
    test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
    limits = DocumentLimits()
    limits.page_range = (1, 10)
@@ -87,7 +82,7 @@ def test_in_doc_with_page_range():
        backend=PyPdfiumDocumentBackend,
        limits=limits,
    )
-    assert doc.valid == True
+    assert doc.valid is True

    limits.page_range = (9, 9)
@@ -97,7 +92,7 @@ def test_in_doc_with_page_range():
        backend=PyPdfiumDocumentBackend,
        limits=limits,
    )
-    assert doc.valid == True
+    assert doc.valid is True

    limits.page_range = (11, 12)
@@ -107,7 +102,7 @@ def test_in_doc_with_page_range():
        backend=PyPdfiumDocumentBackend,
        limits=limits,
    )
-    assert doc.valid == False
+    assert doc.valid is False


def test_guess_format(tmp_path):
@@ -192,17 +187,17 @@ def test_guess_format(tmp_path):
    )
    doc_path = temp_dir / "docling_test.xml"
    doc_path.write_text(xml_content, encoding="utf-8")
-    assert dci._guess_format(doc_path) == None
+    assert dci._guess_format(doc_path) is None
    buf = BytesIO(Path(doc_path).open("rb").read())
    stream = DocumentStream(name="docling_test.xml", stream=buf)
-    assert dci._guess_format(stream) == None
+    assert dci._guess_format(stream) is None

    # Invalid USPTO patent (as plain text)
    stream = DocumentStream(name="pftaps057006474.txt", stream=BytesIO(b"xyz"))
-    assert dci._guess_format(stream) == None
+    assert dci._guess_format(stream) is None
    doc_path = temp_dir / "pftaps_wrong.txt"
    doc_path.write_text("xyz", encoding="utf-8")
-    assert dci._guess_format(doc_path) == None
+    assert dci._guess_format(doc_path) is None

    # Valid Docling JSON
    test_str = '{"name": ""}'

Some files were not shown because too many files have changed in this diff.