ci: add coverage and ruff (#1383)

* add coverage calculation and push

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* new codecov version and usage of token

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* enable ruff formatter instead of black and isort

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* apply ruff lint fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* apply ruff unsafe fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add removed imports

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* round 1 of linter fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* finalize linter fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* Update pyproject.toml

Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Michele Dolfi authored on 2025-04-14 18:01:26 +02:00; committed by GitHub
parent 293c28ca7c
commit 5458a88464
104 changed files with 665 additions and 633 deletions

.github/codecov.yml (new file)

@ -0,0 +1,17 @@
codecov:
# https://docs.codecov.io/docs/comparing-commits
allow_coverage_offsets: true
coverage:
status:
project:
default:
informational: true
target: auto # auto compares coverage to the previous base commit
flags:
- docling
comment:
layout: "reach, diff, flags, files"
behavior: default
require_changes: false # if true: only post the comment if coverage changes
branches: # branch names that can post comment
- "main"


@ -10,6 +10,8 @@ env:
jobs:
code-checks:
uses: ./.github/workflows/checks.yml
with:
push_coverage: false
pre-release-check:
runs-on: ubuntu-latest
outputs:


@ -1,5 +1,13 @@
on:
workflow_call:
inputs:
push_coverage:
type: boolean
description: "If true, the coverage results are pushed to codecov.io."
default: true
secrets:
CODECOV_TOKEN:
required: false
env:
HF_HUB_DOWNLOAD_TIMEOUT: "60"
@ -32,7 +40,13 @@ jobs:
run: poetry install --all-extras
- name: Testing
run: |
poetry run pytest -v tests
poetry run pytest -v --cov=docling --cov-report=xml tests
- name: Upload coverage to Codecov
if: inputs.push_coverage
uses: codecov/codecov-action@v5
with:
token: ${{ secrets.CODECOV_TOKEN }}
file: ./coverage.xml
- name: Run examples
run: |
for file in docs/examples/*.py; do


@ -17,3 +17,5 @@ jobs:
code-checks:
if: ${{ github.event_name == 'push' || (github.event.pull_request.head.repo.full_name != 'docling-project/docling' && github.event.pull_request.head.repo.full_name != 'docling-project/docling') }}
uses: ./.github/workflows/checks.yml
secrets:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}


@ -1,43 +1,26 @@
fail_fast: true
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.5
hooks:
# Run the Ruff formatter.
- id: ruff-format
name: "Ruff formatter"
args: [--config=pyproject.toml]
files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
# Run the Ruff linter.
- id: ruff
name: "Ruff linter"
args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
- repo: local
hooks:
- id: black
name: Black
entry: poetry run black docling docs/examples tests
pass_filenames: false
language: system
files: '\.py$'
- id: isort
name: isort
entry: poetry run isort docling docs/examples tests
pass_filenames: false
language: system
files: '\.py$'
# - id: flake8
# name: flake8
# entry: poetry run flake8 docling
# pass_filenames: false
# language: system
# files: '\.py$'
- id: mypy
name: MyPy
entry: poetry run mypy docling
pass_filenames: false
language: system
files: '\.py$'
- id: nbqa_black
name: nbQA Black
entry: poetry run nbqa black docs/examples
pass_filenames: false
language: system
files: '\.ipynb$'
- id: nbqa_isort
name: nbQA isort
entry: poetry run nbqa isort docs/examples
pass_filenames: false
language: system
files: '\.ipynb$'
- id: poetry
name: Poetry check
entry: poetry check --lock


@ -34,7 +34,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
text_stream = self.path_or_stream.getvalue().decode("utf-8")
self.lines = text_stream.split("\n")
if isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, "r", encoding="utf-8") as f:
with open(self.path_or_stream, encoding="utf-8") as f:
self.lines = f.readlines()
self.valid = True
@ -75,14 +75,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
return doc
def _parse(self, doc: DoclingDocument):
def _parse(self, doc: DoclingDocument): # noqa: C901
"""
Main function that orchestrates the parsing by yielding components:
title, section headers, text, lists, and tables.
"""
content = ""
in_list = False
in_table = False
@ -95,7 +93,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
# indents: dict[int, Union[DocItem, GroupItem, None]] = {}
indents: dict[int, Union[GroupItem, None]] = {}
for i in range(0, 10):
for i in range(10):
parents[i] = None
indents[i] = None
@ -125,7 +123,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
# Lists
elif self._is_list_item(line):
_log.debug(f"line: {line}")
item = self._parse_list_item(line)
_log.debug(f"parsed list-item: {item}")
@ -147,7 +144,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
indents[level + 1] = item["indent"]
elif in_list and item["indent"] < indents[level]:
# print(item["indent"], " => ", indents[level])
while item["indent"] < indents[level]:
# print(item["indent"], " => ", indents[level])
@ -176,7 +172,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
elif in_table and (
(not self._is_table_line(line)) or line.strip() == "|==="
): # end of table
caption = None
if len(caption_data) > 0:
caption = doc.add_text(
@ -195,7 +190,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
# Picture
elif self._is_picture(line):
caption = None
if len(caption_data) > 0:
caption = doc.add_text(
@ -250,7 +244,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
text_data = []
elif len(line.strip()) > 0: # allow multiline texts
item = self._parse_text(line)
text_data.append(item["text"])
@ -273,14 +266,14 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
def _get_current_level(self, parents):
for k, v in parents.items():
if v == None and k > 0:
if v is None and k > 0:
return k - 1
return 0
def _get_current_parent(self, parents):
for k, v in parents.items():
if v == None and k > 0:
if v is None and k > 0:
return parents[k - 1]
return None
@ -328,7 +321,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
"marker": marker,
"text": text.strip(),
"numbered": False,
"indent": 0 if indent == None else len(indent),
"indent": 0 if indent is None else len(indent),
}
else:
return {
@ -336,7 +329,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
"marker": marker,
"text": text.strip(),
"numbered": True,
"indent": 0 if indent == None else len(indent),
"indent": 0 if indent is None else len(indent),
}
else:
# Fallback if no match
@ -357,7 +350,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
return [cell.strip() for cell in line.split("|") if cell.strip()]
def _populate_table_as_grid(self, table_data):
num_rows = len(table_data)
# Adjust the table data into a grid format


@ -58,7 +58,7 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
head = self.content.readline()
dialect = csv.Sniffer().sniff(head, ",;\t|:")
_log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"')
if not dialect.delimiter in {",", ";", "\t", "|", ":"}:
if dialect.delimiter not in {",", ";", "\t", "|", ":"}:
raise RuntimeError(
f"Cannot convert csv with unknown delimiter {dialect.delimiter}."
)


@ -1,8 +1,9 @@
import logging
import random
from collections.abc import Iterable
from io import BytesIO
from pathlib import Path
from typing import Iterable, List, Optional, Union
from typing import List, Optional, Union
import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
@ -156,7 +157,6 @@ class DoclingParsePageBackend(PdfPageBackend):
def get_page_image(
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
) -> Image.Image:
page_size = self.get_size()
if not cropbox:


@ -1,8 +1,9 @@
import logging
import random
from collections.abc import Iterable
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
from typing import TYPE_CHECKING, List, Optional, Union
import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin
@ -172,7 +173,6 @@ class DoclingParseV2PageBackend(PdfPageBackend):
def get_page_image(
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
) -> Image.Image:
page_size = self.get_size()
if not cropbox:


@ -1,14 +1,14 @@
import logging
import random
from collections.abc import Iterable
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
from typing import TYPE_CHECKING, Optional, Union
import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
from PIL import Image, ImageDraw
from PIL import Image
from pypdfium2 import PdfPage
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
@ -93,7 +93,6 @@ class DoclingParseV4PageBackend(PdfPageBackend):
def get_page_image(
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
) -> Image.Image:
page_size = self.get_size()
if not cropbox:


@ -1,12 +1,8 @@
# -*- coding: utf-8 -*-
"""
Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
On 23/01/2025
"""
from __future__ import unicode_literals
CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")
BLANK = ""
@ -79,7 +75,6 @@ CHR_BO = {
}
T = {
"\u2192": "\\rightarrow ",
# Greek letters
"\U0001d6fc": "\\alpha ",
"\U0001d6fd": "\\beta ",


@ -76,8 +76,7 @@ def get_val(key, default=None, store=CHR):
return default
class Tag2Method(object):
class Tag2Method:
def call_method(self, elm, stag=None):
getmethod = self.tag2meth.get
if stag is None:
@ -130,7 +129,6 @@ class Tag2Method(object):
class Pr(Tag2Method):
text = ""
__val_tags = ("chr", "pos", "begChr", "endChr", "type")
@ -159,7 +157,7 @@ class Pr(Tag2Method):
def do_common(self, elm):
stag = elm.tag.replace(OMML_NS, "")
if stag in self.__val_tags:
t = elm.get("{0}val".format(OMML_NS))
t = elm.get(f"{OMML_NS}val")
self.__innerdict[stag] = t
return None
@ -248,7 +246,6 @@ class oMath2Latex(Tag2Method):
"""
the Pre-Sub-Superscript object -- Not support yet
"""
pass
def do_sub(self, elm):
text = self.process_children(elm)
@ -331,7 +328,7 @@ class oMath2Latex(Tag2Method):
t_dict = self.process_children_dict(elm, include=("e", "lim"))
latex_s = LIM_FUNC.get(t_dict["e"])
if not latex_s:
raise NotSupport("Not support lim %s" % t_dict["e"])
raise RuntimeError("Not support lim {}".format(t_dict["e"]))
else:
return latex_s.format(lim=t_dict.get("lim"))
@ -413,7 +410,7 @@ class oMath2Latex(Tag2Method):
"""
_str = []
_base_str = []
found_text = elm.findtext("./{0}t".format(OMML_NS))
found_text = elm.findtext(f"./{OMML_NS}t")
if found_text:
for s in found_text:
out_latex_str = self.process_unicode(s)


@ -55,7 +55,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.max_levels = 10
self.level = 0
self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
for i in range(0, self.max_levels):
for i in range(self.max_levels):
self.parents[i] = None
try:
@ -126,7 +126,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
return doc
def walk(self, tag: Tag, doc: DoclingDocument) -> None:
# Iterate over elements in the body of the document
text: str = ""
for element in tag.children:
@ -135,7 +134,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.analyze_tag(cast(Tag, element), doc)
except Exception as exc_child:
_log.error(
f"Error processing child from tag {tag.name}: {repr(exc_child)}"
f"Error processing child from tag {tag.name}: {exc_child!r}"
)
raise exc_child
elif isinstance(element, NavigableString) and not isinstance(
@ -147,7 +146,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
item for item in element.next_siblings if isinstance(item, Tag)
]
if element.next_sibling is None or any(
[item.name in TAGS_FOR_NODE_ITEMS for item in siblings]
item.name in TAGS_FOR_NODE_ITEMS for item in siblings
):
text = text.strip()
if text and tag.name in ["div"]:
@ -222,7 +221,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
)
else:
if hlevel > self.level:
# add invisible group
for i in range(self.level + 1, hlevel):
self.parents[i] = doc.add_group(
@ -234,7 +232,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.level = hlevel
elif hlevel < self.level:
# remove the tail
for key in self.parents.keys():
if key > hlevel:
@ -360,7 +357,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
marker = ""
enumerated = False
if parent_label == GroupLabel.ORDERED_LIST:
marker = f"{str(index_in_list)}."
marker = f"{index_in_list!s}."
enumerated = True
doc.add_list_item(
text=text,


@ -83,7 +83,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# otherwise they represent emphasis (bold or italic)
self.markdown = self._shorten_underscore_sequences(text_stream)
if isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, "r", encoding="utf-8") as f:
with open(self.path_or_stream, encoding="utf-8") as f:
md_content = f.read()
# remove invalid sequences
# very long sequences of underscores will lead to unnecessary long processing times.
@ -168,7 +168,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
)
self.inline_texts = []
def _iterate_elements(
def _iterate_elements( # noqa: C901
self,
element: marko.element.Element,
depth: int,
@ -176,7 +176,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
visited: Set[marko.element.Element],
parent_item: Optional[NodeItem] = None,
):
if element in visited:
return
@ -236,7 +235,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
if has_non_empty_list_items:
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
parent_item = doc.add_group(
label=label, name=f"list", parent=parent_item
label=label, name="list", parent=parent_item
)
elif (
@ -320,7 +319,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self._html_blocks += 1
self._process_inline_text(parent_item, doc)
self._close_table(doc)
_log.debug("HTML Block: {}".format(element))
_log.debug(f"HTML Block: {element}")
if (
len(element.body) > 0
): # If Marko doesn't return any content for HTML block, skip it
@ -332,7 +331,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
else:
if not isinstance(element, str):
self._close_table(doc)
_log.debug("Some other element: {}".format(element))
_log.debug(f"Some other element: {element}")
processed_block_types = (
marko.block.Heading,
@ -398,7 +397,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# if HTML blocks were detected, export to HTML and delegate to HTML backend
if self._html_blocks > 0:
# export to HTML
html_backend_cls = HTMLDocumentBackend
html_str = doc.export_to_html()


@ -184,7 +184,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
"""
if self.workbook is not None:
# Iterate over all sheets
for sheet_name in self.workbook.sheetnames:
_log.info(f"Processing sheet: {sheet_name}")
@ -253,7 +252,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
)
for excel_cell in excel_table.data:
cell = TableCell(
text=excel_cell.text,
row_span=excel_cell.row_span,
@ -303,7 +301,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
# Iterate over all cells in the sheet
for ri, row in enumerate(sheet.iter_rows(values_only=False)):
for rj, cell in enumerate(row):
# Skip empty or already visited cells
if cell.value is None or (ri, rj) in visited:
continue
@ -342,7 +339,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
visited_cells: set[tuple[int, int]] = set()
for ri in range(start_row, max_row + 1):
for rj in range(start_col, max_col + 1):
cell = sheet.cell(row=ri + 1, column=rj + 1) # 1-based indexing
# Check if the cell belongs to a merged range
@ -350,14 +346,12 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
col_span = 1
for merged_range in sheet.merged_cells.ranges:
if (
merged_range.min_row <= ri + 1
and ri + 1 <= merged_range.max_row
and merged_range.min_col <= rj + 1
and rj + 1 <= merged_range.max_col
):
row_span = merged_range.max_row - merged_range.min_row + 1
col_span = merged_range.max_col - merged_range.min_col + 1
break
@ -499,7 +493,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
),
),
)
except:
except Exception:
_log.error("could not extract the image from excel sheets")
return doc


@ -120,13 +120,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
return prov
def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size): # noqa: C901
is_a_list = False
is_list_group_created = False
enum_list_item_value = 0
new_list = None
bullet_type = "None"
list_text = ""
list_label = GroupLabel.LIST
doc_label = DocItemLabel.LIST_ITEM
prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
@ -243,7 +242,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
enum_marker = str(enum_list_item_value) + "."
if not is_list_group_created:
new_list = doc.add_group(
label=list_label, name=f"list", parent=parent_slide
label=list_label, name="list", parent=parent_slide
)
is_list_group_created = True
doc.add_list_item(
@ -368,11 +367,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
slide_width = pptx_obj.slide_width
slide_height = pptx_obj.slide_height
text_content = [] # type: ignore
max_levels = 10
parents = {} # type: ignore
for i in range(0, max_levels):
for i in range(max_levels):
parents[i] = None
# Loop through each slide
@ -383,7 +380,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
)
slide_size = Size(width=slide_width, height=slide_height)
parent_page = doc.add_page(page_no=slide_ind + 1, size=slide_size)
doc.add_page(page_no=slide_ind + 1, size=slide_size)
def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):
handle_groups(shape, parent_slide, slide_ind, doc, slide_size)


@ -158,7 +158,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
def _get_level(self) -> int:
"""Return the first None index."""
for k, v in self.parents.items():
if k >= 0 and v == None:
if k >= 0 and v is None:
return k
return 0
@ -418,7 +418,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
else prev_parent
)
def _handle_text_elements(
def _handle_text_elements( # noqa: C901
self,
element: BaseOxmlElement,
docx_obj: DocxDocument,
@ -812,7 +812,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
)
if cell is None or cell._tc in cell_set:
_log.debug(f" skipped since repeated content")
_log.debug(" skipped since repeated content")
col_idx += cell.grid_span
continue
else:
@ -879,7 +879,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
image=ImageRef.from_pil(image=pil_image, dpi=72),
caption=None,
)
except (UnidentifiedImageError, OSError) as e:
except (UnidentifiedImageError, OSError):
_log.warning("Warning: image cannot be loaded by Pillow")
doc.add_picture(
parent=self.parents[level - 1],


@ -1,7 +1,8 @@
from abc import ABC, abstractmethod
from collections.abc import Iterable
from io import BytesIO
from pathlib import Path
from typing import Iterable, Optional, Set, Union
from typing import Optional, Set, Union
from docling_core.types.doc import BoundingBox, Size
from docling_core.types.doc.page import SegmentedPdfPage, TextCell


@ -1,8 +1,9 @@
import logging
import random
from collections.abc import Iterable
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
from typing import TYPE_CHECKING, List, Optional, Union
import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c
@ -29,7 +30,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
self.valid = True # No better way to tell from pypdfium.
try:
self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
except PdfiumError as e:
except PdfiumError:
_log.info(
f"An exception occurred when loading page {page_no} of document {document_hash}.",
exc_info=True,
@ -225,7 +226,6 @@ class PyPdfiumPageBackend(PdfPageBackend):
def get_page_image(
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
) -> Image.Image:
page_size = self.get_size()
if not cropbox:


@ -102,13 +102,13 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
doc_info: etree.DocInfo = self.tree.docinfo
if doc_info.system_url and any(
[kwd in doc_info.system_url for kwd in JATS_DTD_URL]
kwd in doc_info.system_url for kwd in JATS_DTD_URL
):
self.valid = True
return
for ent in doc_info.internalDTD.iterentities():
if ent.system_url and any(
[kwd in ent.system_url for kwd in JATS_DTD_URL]
kwd in ent.system_url for kwd in JATS_DTD_URL
):
self.valid = True
return
@ -232,10 +232,9 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
# TODO: once superscript is supported, add label with formatting
aff = aff.removeprefix(f"{label[0].text}, ")
affiliation_names.append(aff)
affiliation_ids_names = {
id: name
for id, name in zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
}
affiliation_ids_names = dict(
zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
)
# Get author names and affiliation names
for author_node in meta.xpath(
@ -300,7 +299,6 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
def _add_abstract(
self, doc: DoclingDocument, xml_components: XMLComponents
) -> None:
for abstract in xml_components["abstract"]:
text: str = abstract["content"]
title: str = abstract["label"] or DEFAULT_HEADER_ABSTRACT
@ -349,7 +347,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
return
def _parse_element_citation(self, node: etree._Element) -> str:
def _parse_element_citation(self, node: etree._Element) -> str: # noqa: C901
citation: Citation = {
"author_names": "",
"title": "",
@ -440,7 +438,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip()
if len(node.xpath("lpage")) > 0:
citation["page"] += (
"" + node.xpath("lpage")[0].text.replace("\n", " ").strip()
"" + node.xpath("lpage")[0].text.replace("\n", " ").strip() # noqa: RUF001
)
# Flatten the citation to string
@ -595,9 +593,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
try:
self._add_table(doc, parent, table)
except Exception as e:
_log.warning(f"Skipping unsupported table in {str(self.file)}")
pass
except Exception:
_log.warning(f"Skipping unsupported table in {self.file!s}")
return
@ -609,7 +606,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
)
return
def _walk_linear(
def _walk_linear( # noqa: C901
self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
) -> str:
skip_tags = ["term"]


@ -122,7 +122,6 @@ class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):
@override
def convert(self) -> DoclingDocument:
if self.parser is not None:
doc = self.parser.parse(self.patent_content)
if doc is None:
@ -163,7 +162,6 @@ class PatentUspto(ABC):
Returns:
The patent parsed as a docling document.
"""
pass
class PatentUsptoIce(PatentUspto):
@ -265,7 +263,7 @@ class PatentUsptoIce(PatentUspto):
self.style_html = HtmlEntity()
@override
def startElement(self, tag, attributes): # noqa: N802
def startElement(self, tag, attributes):
"""Signal the start of an element.
Args:
@ -281,7 +279,7 @@ class PatentUsptoIce(PatentUspto):
self._start_registered_elements(tag, attributes)
@override
def skippedEntity(self, name): # noqa: N802
def skippedEntity(self, name):
"""Receive notification of a skipped entity.
HTML entities will be skipped by the parser. This method will unescape them
@ -315,7 +313,7 @@ class PatentUsptoIce(PatentUspto):
self.text += unescaped
@override
def endElement(self, tag): # noqa: N802
def endElement(self, tag):
"""Signal the end of an element.
Args:
@ -603,7 +601,7 @@ class PatentUsptoGrantV2(PatentUspto):
self.style_html = HtmlEntity()
@override
def startElement(self, tag, attributes): # noqa: N802
def startElement(self, tag, attributes):
"""Signal the start of an element.
Args:
@ -616,7 +614,7 @@ class PatentUsptoGrantV2(PatentUspto):
self._start_registered_elements(tag, attributes)
@override
def skippedEntity(self, name): # noqa: N802
def skippedEntity(self, name):
"""Receive notification of a skipped entity.
HTML entities will be skipped by the parser. This method will unescape them
@ -650,7 +648,7 @@ class PatentUsptoGrantV2(PatentUspto):
self.text += unescaped
@override
def endElement(self, tag): # noqa: N802
def endElement(self, tag):
"""Signal the end of an element.
Args:
@ -691,7 +689,7 @@ class PatentUsptoGrantV2(PatentUspto):
if tag in [member.value for member in self.Element]:
if (
tag == self.Element.HEADING.value
and not self.Element.SDOCL.value in self.property
and self.Element.SDOCL.value not in self.property
):
level_attr: str = attributes.get("LVL", "")
new_level: int = int(level_attr) if level_attr.isnumeric() else 1
@ -743,7 +741,7 @@ class PatentUsptoGrantV2(PatentUspto):
# headers except claims statement
elif (
self.Element.HEADING.value in self.property
and not self.Element.SDOCL.value in self.property
and self.Element.SDOCL.value not in self.property
and text.strip()
):
self.parents[self.level + 1] = self.doc.add_heading(
@ -1164,7 +1162,7 @@ class PatentUsptoAppV1(PatentUspto):
self.style_html = HtmlEntity()
@override
def startElement(self, tag, attributes): # noqa: N802
def startElement(self, tag, attributes):
"""Signal the start of an element.
Args:
@ -1177,7 +1175,7 @@ class PatentUsptoAppV1(PatentUspto):
self._start_registered_elements(tag, attributes)
@override
def skippedEntity(self, name): # noqa: N802
def skippedEntity(self, name):
"""Receive notification of a skipped entity.
HTML entities will be skipped by the parser. This method will unescape them
@ -1211,7 +1209,7 @@ class PatentUsptoAppV1(PatentUspto):
self.text += unescaped
@override
def endElement(self, tag): # noqa: N802
def endElement(self, tag):
"""Signal the end of an element.
Args:
@ -1474,9 +1472,7 @@ class XmlTable:
if cw == 0:
offset_w0.append(col["offset"][ic])
min_colinfo["offset"] = sorted(
list(set(col["offset"] + min_colinfo["offset"]))
)
min_colinfo["offset"] = sorted(set(col["offset"] + min_colinfo["offset"]))
# add back the 0 width cols to offset list
offset_w0 = list(set(offset_w0))
@ -1527,7 +1523,7 @@ class XmlTable:
return ncols_max
def _parse_table(self, table: Tag) -> TableData:
def _parse_table(self, table: Tag) -> TableData: # noqa: C901
"""Parse the content of a table tag.
Args:
@ -1722,7 +1718,7 @@ class HtmlEntity:
"0": "&#8304;",
"+": "&#8314;",
"-": "&#8315;",
"": "&#8315;",
"": "&#8315;", # noqa: RUF001
"=": "&#8316;",
"(": "&#8317;",
")": "&#8318;",
@ -1746,7 +1742,7 @@ class HtmlEntity:
"0": "&#8320;",
"+": "&#8330;",
"-": "&#8331;",
"": "&#8331;",
"": "&#8331;", # noqa: RUF001
"=": "&#8332;",
"(": "&#8333;",
")": "&#8334;",


@ -6,14 +6,16 @@ import sys
import tempfile
import time
import warnings
from collections.abc import Iterable
from pathlib import Path
from typing import Annotated, Dict, Iterable, List, Optional, Type
from typing import Annotated, Dict, List, Optional, Type
import rich.table
import typer
from docling_core.types.doc import ImageRefMode
from docling_core.utils.file import resolve_source_to_path
from pydantic import TypeAdapter
from rich.console import Console
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
@ -53,7 +55,6 @@ warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
_log = logging.getLogger(__name__)
from rich.console import Console
console = Console()
err_console = Console(stderr=True)
@ -160,7 +161,6 @@ def export_documents(
export_doctags: bool,
image_export_mode: ImageRefMode,
):
success_count = 0
failure_count = 0
@ -233,7 +233,7 @@ def _split_list(raw: Optional[str]) -> Optional[List[str]]:
@app.command(no_args_is_help=True)
def convert(
def convert( # noqa: C901
input_sources: Annotated[
List[str],
typer.Argument(
@ -289,7 +289,7 @@ def convert(
...,
help=(
f"The OCR engine to use. When --allow-external-plugins is *not* set, the available values are: "
f"{', '.join((o.value for o in ocr_engines_enum_internal))}. "
f"{', '.join(o.value for o in ocr_engines_enum_internal)}. "
f"Use the option --show-external-plugins to see the options allowed with external plugins."
),
),
@ -430,7 +430,7 @@ def convert(
settings.debug.visualize_ocr = debug_visualize_ocr
if from_formats is None:
from_formats = [e for e in InputFormat]
from_formats = list(InputFormat)
parsed_headers: Optional[Dict[str, str]] = None
if headers is not None:


@ -62,7 +62,7 @@ def download(
models: Annotated[
Optional[list[_AvailableModels]],
typer.Argument(
help=f"Models to download (default behavior: a predefined set of models will be downloaded).",
help="Models to download (default behavior: a predefined set of models will be downloaded).",
),
] = None,
all: Annotated[
@ -89,14 +89,13 @@ def download(
"Cannot simultaneously set 'all' parameter and specify models to download."
)
if not quiet:
FORMAT = "%(message)s"
logging.basicConfig(
level=logging.INFO,
format="[blue]%(message)s[/blue]",
datefmt="[%X]",
handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
)
to_download = models or ([m for m in _AvailableModels] if all else _default_models)
to_download = models or (list(_AvailableModels) if all else _default_models)
output_dir = download_models(
output_dir=output_dir,
force=force,


@ -10,7 +10,9 @@ from docling_core.types.doc import (
TableCell,
)
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
from docling_core.types.io import ( # DO ΝΟΤ REMOVE; explicitly exposed from this location
# DO NOT REMOVE; explicitly exposed from this location
from docling_core.types.io import (
DocumentStream,
)
from PIL.Image import Image
@ -233,9 +235,9 @@ class Page(BaseModel):
None # Internal PDF backend. By default it is cleared during assembling.
)
_default_image_scale: float = 1.0 # Default image scale for external usage.
_image_cache: Dict[float, Image] = (
{}
) # Cache of images in different scales. By default it is cleared during assembling.
_image_cache: Dict[
float, Image
] = {} # Cache of images in different scales. By default it is cleared during assembling.
def get_image(
self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
@ -243,7 +245,7 @@ class Page(BaseModel):
if self._backend is None:
return self._image_cache.get(scale, None)
if not scale in self._image_cache:
if scale not in self._image_cache:
if cropbox is None:
self._image_cache[scale] = self._backend.get_page_image(scale=scale)
else:

View File

@ -1,13 +1,13 @@
import csv
import logging
import re
from collections.abc import Iterable
from enum import Enum
from io import BytesIO
from pathlib import Path, PurePath
from typing import (
TYPE_CHECKING,
Dict,
Iterable,
List,
Literal,
Optional,
@ -17,6 +17,8 @@ from typing import (
)
import filetype
# DO NOT REMOVE; explicitly exposed from this location
from docling_core.types.doc import (
DocItem,
DocItemLabel,
@ -35,14 +37,14 @@ from docling_core.types.legacy_doc.base import (
PageReference,
Prov,
Ref,
Table as DsSchemaTable,
TableCell,
)
from docling_core.types.legacy_doc.base import Table as DsSchemaTable
from docling_core.types.legacy_doc.base import TableCell
from docling_core.types.legacy_doc.document import (
CCSDocumentDescription as DsDocumentDescription,
CCSFileInfoObject as DsFileInfoObject,
ExportedCCSDocument as DsDocument,
)
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
from docling_core.utils.file import resolve_source_to_stream
from docling_core.utils.legacy import docling_document_to_legacy
from pydantic import BaseModel
@ -65,7 +67,7 @@ from docling.datamodel.base_models import (
)
from docling.datamodel.settings import DocumentLimits
from docling.utils.profiling import ProfilingItem
from docling.utils.utils import create_file_hash, create_hash
from docling.utils.utils import create_file_hash
if TYPE_CHECKING:
from docling.document_converter import FormatOption
@ -134,9 +136,9 @@ class InputDocument(BaseModel):
self._init_doc(backend, path_or_stream)
elif isinstance(path_or_stream, BytesIO):
assert (
filename is not None
), "Can't construct InputDocument from stream without providing filename arg."
assert filename is not None, (
"Can't construct InputDocument from stream without providing filename arg."
)
self.file = PurePath(filename)
self.filesize = path_or_stream.getbuffer().nbytes
@ -228,7 +230,6 @@ class _DummyBackend(AbstractDocumentBackend):
class _DocumentConversionInput(BaseModel):
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
headers: Optional[Dict[str, str]] = None
limits: Optional[DocumentLimits] = DocumentLimits()


@ -380,7 +380,6 @@ class PaginatedPipelineOptions(PipelineOptions):
class VlmPipelineOptions(PaginatedPipelineOptions):
generate_page_images: bool = True
force_backend_text: bool = (
False # (To be used with vlms, or other generative models)


@ -1,11 +1,11 @@
import hashlib
import logging
import math
import sys
import time
from collections.abc import Iterable, Iterator
from functools import partial
from pathlib import Path
from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union
from typing import Dict, List, Optional, Tuple, Type, Union
from pydantic import BaseModel, ConfigDict, model_validator, validate_call
@ -172,7 +172,7 @@ class DocumentConverter:
format_options: Optional[Dict[InputFormat, FormatOption]] = None,
):
self.allowed_formats = (
allowed_formats if allowed_formats is not None else [e for e in InputFormat]
allowed_formats if allowed_formats is not None else list(InputFormat)
)
self.format_to_options = {
format: (
@ -254,7 +254,7 @@ class DocumentConverter:
if not had_result and raises_on_error:
raise ConversionError(
f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
)
def _convert(
@ -266,7 +266,7 @@ class DocumentConverter:
conv_input.docs(self.format_to_options),
settings.perf.doc_batch_size, # pass format_options
):
_log.info(f"Going to convert document batch...")
_log.info("Going to convert document batch...")
# parallel processing only within input_batch
# with ThreadPoolExecutor(


@ -1,4 +1,4 @@
from typing import Iterable
from collections.abc import Iterable
from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult
@ -10,7 +10,6 @@ from docling.utils.profiling import TimeRecorder
class ApiVlmModel(BasePageModel):
def __init__(
self,
enabled: bool,


@ -1,5 +1,6 @@
from abc import ABC, abstractmethod
from typing import Any, Generic, Iterable, Optional, Protocol, Type
from collections.abc import Iterable
from typing import Generic, Optional, Protocol, Type
from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
from typing_extensions import TypeVar
@ -29,7 +30,6 @@ EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)
class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
elements_batch_size: int = settings.perf.elements_batch_size
@abstractmethod
@ -50,7 +50,6 @@ class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]):
def prepare_element(
self, conv_res: ConversionResult, element: NodeItem
) -> Optional[NodeItem]:
@ -62,7 +61,6 @@ class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]):
class BaseItemAndImageEnrichmentModel(
GenericEnrichmentModel[ItemAndImageEnrichmentElement]
):
images_scale: float
expansion_factor: float = 0.0


@ -1,12 +1,12 @@
import copy
import logging
from abc import abstractmethod
from collections.abc import Iterable
from pathlib import Path
from typing import Iterable, List, Optional, Type
from typing import List, Optional, Type
import numpy as np
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, PdfTextCell, TextCell
from PIL import Image, ImageDraw
from rtree import index
from scipy.ndimage import binary_dilation, find_objects, label


@ -1,7 +1,8 @@
import re
from collections import Counter
from collections.abc import Iterable
from pathlib import Path
from typing import Iterable, List, Literal, Optional, Tuple, Union
from typing import List, Literal, Optional, Tuple, Union
import numpy as np
from docling_core.types.doc import (


@ -1,5 +1,6 @@
from collections.abc import Iterable
from pathlib import Path
from typing import Iterable, List, Literal, Optional, Tuple, Union
from typing import List, Literal, Optional, Union
import numpy as np
from docling_core.types.doc import (


@ -1,8 +1,9 @@
import logging
import warnings
import zipfile
from collections.abc import Iterable
from pathlib import Path
from typing import Iterable, List, Optional, Type
from typing import List, Optional, Type
import numpy
from docling_core.types.doc import BoundingBox, CoordOrigin
@ -58,12 +59,10 @@ class EasyOcrModel(BaseOcrModel):
device = decide_device(accelerator_options.device)
# Enable easyocr GPU if running on CUDA, MPS
use_gpu = any(
[
device.startswith(x)
for x in [
AcceleratorDevice.CUDA.value,
AcceleratorDevice.MPS.value,
]
device.startswith(x)
for x in [
AcceleratorDevice.CUDA.value,
AcceleratorDevice.MPS.value,
]
)
else:
@ -98,8 +97,10 @@ class EasyOcrModel(BaseOcrModel):
progress: bool = False,
) -> Path:
# Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
from easyocr.config import detection_models as det_models_dict
from easyocr.config import recognition_models as rec_models_dict
from easyocr.config import (
detection_models as det_models_dict,
recognition_models as rec_models_dict,
)
if local_dir is None:
local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder
@ -126,13 +127,11 @@ class EasyOcrModel(BaseOcrModel):
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
if not self.enabled:
yield from page_batch
return
for page in page_batch:
assert page._backend is not None
if not page._backend.is_valid():
yield page


@ -9,7 +9,7 @@ from docling.models.factories.picture_description_factory import (
logger = logging.getLogger(__name__)
@lru_cache()
@lru_cache
def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
factory = OcrFactory()
factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
@ -17,7 +17,7 @@ def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
return factory
@lru_cache()
@lru_cache
def get_picture_description_factory(
allow_external_plugins: bool = False,
) -> PictureDescriptionFactory:


@ -33,7 +33,7 @@ class BaseFactory(Generic[A], metaclass=ABCMeta):
@property
def registered_kind(self) -> list[str]:
return list(opt.kind for opt in self._classes.keys())
return [opt.kind for opt in self._classes.keys()]
def get_enum(self) -> enum.Enum:
return enum.Enum(


@ -1,25 +1,22 @@
import logging
import time
from collections.abc import Iterable
from pathlib import Path
from typing import Iterable, List, Optional
from typing import Optional
from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
HuggingFaceVlmOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
class HuggingFaceMlxModel(BasePageModel):
def __init__(
self,
enabled: bool,
@ -32,7 +29,6 @@ class HuggingFaceMlxModel(BasePageModel):
self.vlm_options = vlm_options
if self.enabled:
try:
from mlx_vlm import generate, load # type: ignore
from mlx_vlm.prompt_utils import apply_chat_template # type: ignore
@ -125,6 +121,8 @@ class HuggingFaceMlxModel(BasePageModel):
generation_time = time.time() - start_time
page_tags = output
_log.debug(f"Generation time {generation_time:.2f} seconds.")
# inference_time = time.time() - start_time
# tokens_per_second = num_tokens / generation_time
# print("")


@ -1,16 +1,15 @@
import logging
import time
from collections.abc import Iterable
from pathlib import Path
from typing import Iterable, List, Optional
from typing import Optional
from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
HuggingFaceVlmOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder
@ -19,7 +18,6 @@ _log = logging.getLogger(__name__)
class HuggingFaceVlmModel(BasePageModel):
def __init__(
self,
enabled: bool,
@ -42,7 +40,7 @@ class HuggingFaceVlmModel(BasePageModel):
device = decide_device(accelerator_options.device)
self.device = device
_log.debug("Available device for HuggingFace VLM: {}".format(device))
_log.debug(f"Available device for HuggingFace VLM: {device}")
repo_cache_folder = vlm_options.repo_id.replace("/", "--")
@ -168,6 +166,10 @@ class HuggingFaceVlmModel(BasePageModel):
num_tokens = len(generated_ids[0])
page_tags = generated_texts
_log.debug(
f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
)
# inference_time = time.time() - start_time
# tokens_per_second = num_tokens / generation_time
# print("")


@ -1,8 +1,9 @@
import copy
import logging
import warnings
from collections.abc import Iterable
from pathlib import Path
from typing import Iterable, Optional, Union
from typing import Optional
from docling_core.types.doc import DocItemLabel
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
@ -142,7 +143,6 @@ class LayoutModel(BasePageModel):
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
for page in page_batch:
assert page._backend is not None
if not page._backend.is_valid():


@ -1,8 +1,9 @@
import logging
import sys
import tempfile
from collections.abc import Iterable
from pathlib import Path
from typing import Iterable, Optional, Tuple, Type
from typing import Optional, Type
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell
@ -41,7 +42,7 @@ class OcrMacModel(BaseOcrModel):
if self.enabled:
if "darwin" != sys.platform:
raise RuntimeError(f"OcrMac is only supported on Mac.")
raise RuntimeError("OcrMac is only supported on Mac.")
install_errmsg = (
"ocrmac is not correctly installed. "
"Please install it via `pip install ocrmac` to use this OCR engine. "
@ -58,7 +59,6 @@ class OcrMacModel(BaseOcrModel):
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
if not self.enabled:
yield from page_batch
return
@ -69,7 +69,6 @@ class OcrMacModel(BaseOcrModel):
yield page
else:
with TimeRecorder(conv_res, "ocr"):
ocr_rects = self.get_ocr_rects(page)
all_ocr_cells = []


@ -1,6 +1,7 @@
import logging
import re
from typing import Iterable, List
from collections.abc import Iterable
from typing import List
from pydantic import BaseModel
@ -53,9 +54,9 @@ class PageAssembleModel(BasePageModel):
sanitized_text = "".join(lines)
# Text normalization
sanitized_text = sanitized_text.replace("⁄", "/")
sanitized_text = sanitized_text.replace("'", "'")
sanitized_text = sanitized_text.replace("'", "'")
sanitized_text = sanitized_text.replace("⁄", "/") # noqa: RUF001
sanitized_text = sanitized_text.replace("'", "'") # noqa: RUF001
sanitized_text = sanitized_text.replace("'", "'") # noqa: RUF001
sanitized_text = sanitized_text.replace("“", '"')
sanitized_text = sanitized_text.replace("”", '"')
sanitized_text = sanitized_text.replace("•", "·")
@ -71,7 +72,6 @@ class PageAssembleModel(BasePageModel):
yield page
else:
with TimeRecorder(conv_res, "page_assemble"):
assert page.predictions.layout is not None
# assembles some JSON output page by page.
@ -83,7 +83,6 @@ class PageAssembleModel(BasePageModel):
for cluster in page.predictions.layout.clusters:
# _log.info("Cluster label seen:", cluster.label)
if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
textlines = [
cell.text.replace("\x02", "-").strip()
for cell in cluster.cells
@ -109,9 +108,7 @@ class PageAssembleModel(BasePageModel):
tbl = page.predictions.tablestructure.table_map.get(
cluster.id, None
)
if (
not tbl
): # fallback: add table without structure, if it isn't present
if not tbl: # fallback: add table without structure, if it isn't present
tbl = Table(
label=cluster.label,
id=cluster.id,
@ -130,9 +127,7 @@ class PageAssembleModel(BasePageModel):
fig = page.predictions.figures_classification.figure_map.get(
cluster.id, None
)
if (
not fig
): # fallback: add figure without classification, if it isn't present
if not fig: # fallback: add figure without classification, if it isn't present
fig = FigureElement(
label=cluster.label,
id=cluster.id,


@ -1,5 +1,6 @@
from collections.abc import Iterable
from pathlib import Path
from typing import Iterable, Optional
from typing import Optional
from PIL import ImageDraw
from pydantic import BaseModel


@ -1,5 +1,6 @@
from collections.abc import Iterable
from pathlib import Path
from typing import Iterable, Optional, Type, Union
from typing import Optional, Type, Union
from PIL import Image


@ -1,12 +1,11 @@
import logging
from abc import abstractmethod
from collections.abc import Iterable
from pathlib import Path
from typing import Any, Iterable, List, Optional, Type, Union
from typing import List, Optional, Type, Union
from docling_core.types.doc import (
DoclingDocument,
NodeItem,
PictureClassificationClass,
PictureItem,
)
from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc


@ -1,5 +1,6 @@
from collections.abc import Iterable
from pathlib import Path
from typing import Iterable, Optional, Type, Union
from typing import Optional, Type, Union
from PIL import Image
@ -13,7 +14,6 @@ from docling.utils.accelerator_utils import decide_device
class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
@classmethod
def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
return PictureDescriptionVlmOptions
@ -36,7 +36,6 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
self.options: PictureDescriptionVlmOptions
if self.enabled:
if artifacts_path is None:
artifacts_path = self.download_models(repo_id=self.options.repo_id)
else:


@ -1,6 +1,7 @@
import logging
from collections.abc import Iterable
from pathlib import Path
from typing import Iterable, Optional, Type
from typing import Optional, Type
import numpy
from docling_core.types.doc import BoundingBox, CoordOrigin
@ -74,13 +75,11 @@ class RapidOcrModel(BaseOcrModel):
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
if not self.enabled:
yield from page_batch
return
for page in page_batch:
assert page._backend is not None
if not page._backend.is_valid():
yield page


@ -1,12 +1,7 @@
import copy
import random
from pathlib import Path
from typing import Dict, List
from docling_core.types.doc import (
BoundingBox,
CoordOrigin,
DocItem,
DocItemLabel,
DoclingDocument,
DocumentOrigin,
@ -17,13 +12,10 @@ from docling_core.types.doc import (
TableData,
)
from docling_core.types.doc.document import ContentLayer
from docling_core.types.legacy_doc.base import Ref
from docling_core.types.legacy_doc.document import BaseText
from docling_ibm_models.reading_order.reading_order_rb import (
PageElement as ReadingOrderPageElement,
ReadingOrderPredictor,
)
from docling_ibm_models.reading_order.reading_order_rb import ReadingOrderPredictor
from PIL import ImageDraw
from pydantic import BaseModel, ConfigDict
from docling.datamodel.base_models import (
@ -35,7 +27,6 @@ from docling.datamodel.base_models import (
TextElement,
)
from docling.datamodel.document import ConversionResult
from docling.datamodel.settings import settings
from docling.utils.profiling import ProfilingScope, TimeRecorder
@ -53,12 +44,10 @@ class ReadingOrderModel:
def _assembled_to_readingorder_elements(
self, conv_res: ConversionResult
) -> List[ReadingOrderPageElement]:
elements: List[ReadingOrderPageElement] = []
page_no_to_pages = {p.page_no: p for p in conv_res.pages}
for element in conv_res.assembled.elements:
page_height = page_no_to_pages[element.page_no].size.height # type: ignore
bbox = element.cluster.bbox.to_bottom_left_origin(page_height)
text = element.text or ""
@ -84,7 +73,6 @@ class ReadingOrderModel:
def _add_child_elements(
self, element: BasePageElement, doc_item: NodeItem, doc: DoclingDocument
):
child: Cluster
for child in element.cluster.children:
c_label = child.label
@ -110,7 +98,7 @@ class ReadingOrderModel:
else:
doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)
def _readingorder_elements_to_docling_doc(
def _readingorder_elements_to_docling_doc( # noqa: C901
self,
conv_res: ConversionResult,
ro_elements: List[ReadingOrderPageElement],
@ -118,7 +106,6 @@ class ReadingOrderModel:
el_to_footnotes_mapping: Dict[int, List[int]],
el_merges_mapping: Dict[int, List[int]],
) -> DoclingDocument:
id_to_elem = {
RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem
for elem in conv_res.assembled.elements
@ -192,7 +179,6 @@ class ReadingOrderModel:
code_item.footnotes.append(new_footnote_item.get_ref())
else:
new_item, current_list = self._handle_text_element(
element, out_doc, current_list, page_height
)
@ -206,7 +192,6 @@ class ReadingOrderModel:
)
elif isinstance(element, Table):
tbl_data = TableData(
num_rows=element.num_rows,
num_cols=element.num_cols,
@ -342,12 +327,12 @@ class ReadingOrderModel:
return new_item, current_list
def _merge_elements(self, element, merged_elem, new_item, page_height):
assert isinstance(
merged_elem, type(element)
), "Merged element must be of same type as element."
assert (
merged_elem.label == new_item.label
), "Labels of merged elements must match."
assert isinstance(merged_elem, type(element)), (
"Merged element must be of same type as element."
)
assert merged_elem.label == new_item.label, (
"Labels of merged elements must match."
)
prov = ProvenanceItem(
page_no=element.page_no + 1,
charspan=(


@ -1,13 +1,13 @@
import copy
import warnings
from collections.abc import Iterable
from pathlib import Path
from typing import Iterable, Optional, Union
from typing import Optional
import numpy
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
from docling_core.types.doc.page import (
BoundingRectangle,
SegmentedPdfPage,
TextCellUnit,
)
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
@ -44,7 +44,6 @@ class TableStructureModel(BasePageModel):
self.enabled = enabled
if self.enabled:
if artifacts_path is None:
artifacts_path = self.download_models() / self._model_path
else:
@ -175,7 +174,6 @@ class TableStructureModel(BasePageModel):
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
if not self.enabled:
yield from page_batch
return
@ -186,7 +184,6 @@ class TableStructureModel(BasePageModel):
yield page
else:
with TimeRecorder(conv_res, "table_structure"):
assert page.predictions.layout is not None
assert page.size is not None
@ -260,7 +257,6 @@ class TableStructureModel(BasePageModel):
table_out = tf_output[0]
table_cells = []
for element in table_out["tf_responses"]:
if not self.do_cell_matching:
the_bbox = BoundingBox.model_validate(
element["bbox"]


@ -3,9 +3,10 @@ import io
import logging
import os
import tempfile
from collections.abc import Iterable
from pathlib import Path
from subprocess import DEVNULL, PIPE, Popen
from typing import Iterable, List, Optional, Tuple, Type
from typing import List, Optional, Tuple, Type
import pandas as pd
from docling_core.types.doc import BoundingBox, CoordOrigin
@ -63,8 +64,7 @@ class TesseractOcrCliModel(BaseOcrModel):
)
def _get_name_and_version(self) -> Tuple[str, str]:
if self._name != None and self._version != None:
if self._name is not None and self._version is not None:
return self._name, self._version # type: ignore
cmd = [self.options.tesseract_cmd, "--version"]
@ -125,14 +125,16 @@ class TesseractOcrCliModel(BaseOcrModel):
# _log.info(decoded_data)
# Read the TSV file generated by Tesseract
df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t")
df_result = pd.read_csv(
io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t"
)
# Display the dataframe (optional)
# _log.info("df: ", df.head())
# Filter rows that contain actual text (ignore header or empty rows)
df_filtered = df[
df["text"].notnull() & (df["text"].apply(str).str.strip() != "")
df_filtered = df_result[
df_result["text"].notna() & (df_result["text"].apply(str).str.strip() != "")
]
return df_filtered
@ -149,10 +151,10 @@ class TesseractOcrCliModel(BaseOcrModel):
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
output, _ = proc.communicate()
decoded_data = output.decode("utf-8")
df = pd.read_csv(
df_detected = pd.read_csv(
io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
)
scripts = df.loc[df["key"] == "Script"].value.tolist()
scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
if len(scripts) == 0:
_log.warning("Tesseract cannot detect the script of the page")
return None
@ -183,11 +185,11 @@ class TesseractOcrCliModel(BaseOcrModel):
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
output, _ = proc.communicate()
decoded_data = output.decode("utf-8")
df = pd.read_csv(io.StringIO(decoded_data), header=None)
self._tesseract_languages = df[0].tolist()[1:]
df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
self._tesseract_languages = df_list[0].tolist()[1:]
# Decide the script prefix
if any([l.startswith("script/") for l in self._tesseract_languages]):
if any(lang.startswith("script/") for lang in self._tesseract_languages):
script_prefix = "script/"
else:
script_prefix = ""
@ -197,7 +199,6 @@ class TesseractOcrCliModel(BaseOcrModel):
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
if not self.enabled:
yield from page_batch
return
@ -225,19 +226,19 @@ class TesseractOcrCliModel(BaseOcrModel):
fname = image_file.name
high_res_image.save(image_file)
df = self._run_tesseract(fname)
df_result = self._run_tesseract(fname)
finally:
if os.path.exists(fname):
os.remove(fname)
# _log.info(df)
# _log.info(df_result)
# Print relevant columns (bounding box and text)
for ix, row in df.iterrows():
for ix, row in df_result.iterrows():
text = row["text"]
conf = row["conf"]
l = float(row["left"])
l = float(row["left"]) # noqa: E741
b = float(row["top"])
w = float(row["width"])
h = float(row["height"])


@ -1,6 +1,7 @@
import logging
from collections.abc import Iterable
from pathlib import Path
from typing import Iterable, Optional, Type
from typing import Optional, Type
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import BoundingRectangle, TextCell
@ -37,9 +38,6 @@ class TesseractOcrModel(BaseOcrModel):
self.options: TesseractOcrOptions
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
self.reader = None
self.osd_reader = None
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
if self.enabled:
install_errmsg = (
@ -64,7 +62,7 @@ class TesseractOcrModel(BaseOcrModel):
raise ImportError(install_errmsg)
try:
tesseract_version = tesserocr.tesseract_version()
except:
except Exception:
raise ImportError(install_errmsg)
_, self._tesserocr_languages = tesserocr.get_languages()
@ -75,7 +73,7 @@ class TesseractOcrModel(BaseOcrModel):
_log.debug("Initializing TesserOCR: %s", tesseract_version)
lang = "+".join(self.options.lang)
if any([l.startswith("script/") for l in self._tesserocr_languages]):
if any(lang.startswith("script/") for lang in self._tesserocr_languages):
self.script_prefix = "script/"
else:
self.script_prefix = ""
@ -86,6 +84,10 @@ class TesseractOcrModel(BaseOcrModel):
"oem": tesserocr.OEM.DEFAULT,
}
self.reader = None
self.osd_reader = None
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
if self.options.path is not None:
tesserocr_kwargs["path"] = self.options.path
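The list comprehension inside any() above becomes a generator expression, matching the comprehension checks (C) selected in the new ruff configuration. A standalone sketch of the pattern with throwaway language codes rather than the model's real list:

langs = ["eng", "deu", "script/Latin"]
# A generator expression avoids building an intermediate list and lets any()
# short-circuit as soon as a match is found.
has_script_models = any(lang.startswith("script/") for lang in langs)
assert has_script_models is True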

View File

@ -3,9 +3,10 @@ import logging
import time
import traceback
from abc import ABC, abstractmethod
from typing import Any, Callable, Iterable, List
from collections.abc import Iterable
from typing import Any, Callable, List
from docling_core.types.doc import DoclingDocument, NodeItem
from docling_core.types.doc import NodeItem
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
@ -64,7 +65,6 @@ class BasePipeline(ABC):
return conv_res
def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
def _prepare_elements(
conv_res: ConversionResult, model: GenericEnrichmentModel[Any]
) -> Iterable[NodeItem]:
@ -113,7 +113,6 @@ class BasePipeline(ABC):
class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
def __init__(self, pipeline_options: PipelineOptions):
super().__init__(pipeline_options)
self.keep_backend = False
@ -127,7 +126,6 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
yield from page_batch
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
if not isinstance(conv_res.input._backend, PdfDocumentBackend):
raise RuntimeError(
f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. "
@ -139,8 +137,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
total_elapsed_time = 0.0
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
for i in range(0, conv_res.input.page_count):
for i in range(conv_res.input.page_count):
start_page, end_page = conv_res.input.limits.page_range
if (start_page - 1) <= i <= (end_page - 1):
conv_res.pages.append(Page(page_no=i))
@ -161,7 +158,6 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
pipeline_pages = self._apply_on_pages(conv_res, init_pages)
for p in pipeline_pages: # Must exhaust!
# Cleanup cached images
if not self.keep_images:
p._image_cache = {}

View File

@ -24,7 +24,6 @@ class SimplePipeline(BasePipeline):
super().__init__(pipeline_options)
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
if not isinstance(conv_res.input._backend, DeclarativeDocumentBackend):
raise RuntimeError(
f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a declarative backend. "

View File

@ -1,5 +1,4 @@
import logging
import sys
import warnings
from pathlib import Path
from typing import Optional, cast

View File

@ -1,5 +1,4 @@
import logging
import warnings
from io import BytesIO
from pathlib import Path
from typing import List, Optional, Union, cast
@ -32,7 +31,6 @@ _log = logging.getLogger(__name__)
class VlmPipeline(PaginatedPipeline):
def __init__(self, pipeline_options: VlmPipelineOptions):
super().__init__(pipeline_options)
self.keep_backend = True
@ -114,7 +112,6 @@ class VlmPipeline(PaginatedPipeline):
def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
if (
self.pipeline_options.vlm_options.response_format
== ResponseFormat.DOCTAGS

View File

@ -1,8 +1,8 @@
import logging
from typing import Any, Dict, Iterable, List, Tuple, Union
from collections.abc import Iterable
from typing import Any, Dict, List, Tuple, Union
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import TextCell
from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
from docling.datamodel.document import ConversionResult, Page
@ -13,7 +13,6 @@ _log = logging.getLogger(__name__)
def generate_multimodal_pages(
doc_result: ConversionResult,
) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
label_to_doclaynet = {
"title": "title",
"table-of-contents": "document_index",
@ -122,7 +121,6 @@ def generate_multimodal_pages(
if doc.main_text is None:
return
for ix, orig_item in enumerate(doc.main_text):
item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item
if item is None or item.prov is None or len(item.prov) == 0:
_log.debug(f"Skipping item {orig_item}")

View File

@ -29,7 +29,7 @@ def resolve_item(paths, obj):
try:
key = int(paths[0])
except:
except Exception:
key = paths[0]
if len(paths) == 1:
@ -67,7 +67,7 @@ def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:
return unique_objects
def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument: # noqa: C901
origin = DocumentOrigin(
mimetype="application/pdf",
filename=doc_glm["file-info"]["filename"],
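The bare except above is narrowed to except Exception, in line with pycodestyle's E722 that the new lint setup enforces. A small self-contained sketch of the same pattern with a hypothetical helper, not the project's resolve_item:

def parse_key(raw: str):
    # Catching Exception instead of using a bare except still handles the
    # parsing failure but lets KeyboardInterrupt and SystemExit propagate.
    try:
        return int(raw)
    except Exception:
        return raw

assert parse_key("3") == 3
assert parse_key("name") == "name"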

View File

@ -18,7 +18,7 @@ class UnionFind:
def __init__(self, elements):
self.parent = {elem: elem for elem in elements}
self.rank = {elem: 0 for elem in elements}
self.rank = dict.fromkeys(elements, 0)
def find(self, x):
if self.parent[x] != x:
@ -484,7 +484,9 @@ class LayoutPostprocessor:
spatial_index = (
self.regular_index
if cluster_type == "regular"
else self.picture_index if cluster_type == "picture" else self.wrapper_index
else self.picture_index
if cluster_type == "picture"
else self.wrapper_index
)
# Map of currently valid clusters
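The dict.fromkeys rewrite above is behavior-preserving because the shared default value is an immutable int. A minimal illustration with placeholder element names:

elements = ["header", "body", "footer"]
rank_comprehension = {elem: 0 for elem in elements}
rank_fromkeys = dict.fromkeys(elements, 0)
# Both build the same mapping; fromkeys is only safe here because 0 is
# immutable (a mutable default would be shared across every key).
assert rank_comprehension == rank_fromkeys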

View File

@ -37,7 +37,7 @@ def download_models(
output_dir.mkdir(exist_ok=True, parents=True)
if with_layout:
_log.info(f"Downloading layout model...")
_log.info("Downloading layout model...")
LayoutModel.download_models(
local_dir=output_dir / LayoutModel._model_repo_folder,
force=force,
@ -45,7 +45,7 @@ def download_models(
)
if with_tableformer:
_log.info(f"Downloading tableformer model...")
_log.info("Downloading tableformer model...")
TableStructureModel.download_models(
local_dir=output_dir / TableStructureModel._model_repo_folder,
force=force,
@ -53,7 +53,7 @@ def download_models(
)
if with_picture_classifier:
_log.info(f"Downloading picture classifier model...")
_log.info("Downloading picture classifier model...")
DocumentPictureClassifier.download_models(
local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
force=force,
@ -61,7 +61,7 @@ def download_models(
)
if with_code_formula:
_log.info(f"Downloading code formula model...")
_log.info("Downloading code formula model...")
CodeFormulaModel.download_models(
local_dir=output_dir / CodeFormulaModel._model_repo_folder,
force=force,
@ -69,7 +69,7 @@ def download_models(
)
if with_smolvlm:
_log.info(f"Downloading SmolVlm model...")
_log.info("Downloading SmolVlm model...")
PictureDescriptionVlmModel.download_models(
repo_id=smolvlm_picture_description.repo_id,
local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
@ -78,7 +78,7 @@ def download_models(
)
if with_granite_vision:
_log.info(f"Downloading Granite Vision model...")
_log.info("Downloading Granite Vision model...")
PictureDescriptionVlmModel.download_models(
repo_id=granite_picture_description.repo_id,
local_dir=output_dir / granite_picture_description.repo_cache_folder,
@ -87,7 +87,7 @@ def download_models(
)
if with_easyocr:
_log.info(f"Downloading easyocr models...")
_log.info("Downloading easyocr models...")
EasyOcrModel.download_models(
local_dir=output_dir / EasyOcrModel._model_repo_folder,
force=force,
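The logging calls above drop the f-prefix from strings that contain no placeholders (pyflakes' F541). A tiny sketch of the distinction, with a made-up model name:

model_name = "layout"
plain_message = "Downloading layout model..."             # no placeholders, so no f-prefix
formatted_message = f"Downloading {model_name} model..."  # a placeholder justifies the f-string
assert formatted_message == plain_message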

View File

@ -13,7 +13,7 @@ def chunkify(iterator, chunk_size):
if isinstance(iterator, List):
iterator = iter(iterator)
for first in iterator: # Take the first element from the iterator
yield [first] + list(islice(iterator, chunk_size - 1))
yield [first, *list(islice(iterator, chunk_size - 1))]
def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:
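The chunkify change above swaps list concatenation for iterable unpacking, the rewrite ruff suggests when building a list from a leading element plus the rest. A self-contained sketch of the helper under assumed names:

from itertools import islice

def chunkify(iterable, chunk_size):
    # Take one element, then up to chunk_size - 1 more from the same iterator,
    # yielding lists of at most chunk_size items.
    iterator = iter(iterable)
    for first in iterator:
        yield [first, *islice(iterator, chunk_size - 1)]

assert list(chunkify(range(7), 3)) == [[0, 1, 2], [3, 4, 5], [6]]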

View File

@ -383,7 +383,7 @@
"\n",
"print(f\"Downloading {url}...\")\n",
"buf = BytesIO(requests.get(url).content)\n",
"print(f\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n",
"print(\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n",
"with zipfile.ZipFile(buf) as zf:\n",
" res = zf.testzip()\n",
" if res:\n",
@ -544,7 +544,7 @@
"source": [
"doc = backend.convert()\n",
"\n",
"claims_sec = [item for item in doc.texts if item.text == \"CLAIMS\"][0]\n",
"claims_sec = next(item for item in doc.texts if item.text == \"CLAIMS\")\n",
"print(f'Patent \"{doc.texts[0].text}\" has {len(claims_sec.children)} claims')"
]
},

View File

@ -1,8 +1,8 @@
import json
import logging
import time
from collections.abc import Iterable
from pathlib import Path
from typing import Iterable
import yaml
from docling_core.types.doc import ImageRefMode
@ -11,7 +11,6 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption
_log = logging.getLogger(__name__)

View File

@ -3,7 +3,6 @@ import logging
import time
from pathlib import Path
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
@ -11,9 +10,6 @@ from docling.datamodel.pipeline_options import (
PdfPipelineOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.models.ocr_mac_model import OcrMacOptions
from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
from docling.models.tesseract_ocr_model import TesseractOcrOptions
_log = logging.getLogger(__name__)

View File

@ -3,8 +3,8 @@
# It does not run the actual formula understanding model.
import logging
from collections.abc import Iterable
from pathlib import Path
from typing import Iterable
from docling_core.types.doc import DocItemLabel, DoclingDocument, NodeItem, TextItem
@ -49,7 +49,6 @@ class ExampleFormulaUnderstandingEnrichmentModel(BaseItemAndImageEnrichmentModel
# How the pipeline can be extended.
class ExampleFormulaUnderstandingPipeline(StandardPdfPipeline):
def __init__(self, pipeline_options: ExampleFormulaUnderstandingPipelineOptions):
super().__init__(pipeline_options)
self.pipeline_options: ExampleFormulaUnderstandingPipelineOptions
@ -85,7 +84,7 @@ def main():
)
}
)
result = doc_converter.convert(input_doc_path)
doc_converter.convert(input_doc_path)
if __name__ == "__main__":

View File

@ -3,8 +3,9 @@
# It does not run the actual picture classifier model.
import logging
from collections.abc import Iterable
from pathlib import Path
from typing import Any, Iterable
from typing import Any
from docling_core.types.doc import (
DoclingDocument,

View File

@ -4,7 +4,7 @@ from pathlib import Path
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

View File

@ -51,7 +51,6 @@ def main():
page_segments,
page,
) in generate_multimodal_pages(conv_res):
dpi = page._default_image_scale * 72
rows.append(
@ -81,10 +80,10 @@ def main():
)
# Generate one parquet from all documents
df = pd.json_normalize(rows)
df_result = pd.json_normalize(rows)
now = datetime.datetime.now()
output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
df.to_parquet(output_filename)
df_result.to_parquet(output_filename)
end_time = time.time() - start_time

View File

@ -32,12 +32,12 @@ def main():
print(table_df.to_markdown())
# Save the table as csv
element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.csv"
_log.info(f"Saving CSV table to {element_csv_filename}")
table_df.to_csv(element_csv_filename)
# Save the table as html
element_html_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.html"
element_html_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.html"
_log.info(f"Saving HTML table to {element_html_filename}")
with element_html_filename.open("w") as fp:
fp.write(table.export_to_html(doc=conv_res.document))

View File

@ -1,14 +1,9 @@
from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrMacOptions,
PdfPipelineOptions,
RapidOcrOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

View File

@ -153,10 +153,10 @@
"source": [
"for i, chunk in enumerate(chunk_iter):\n",
" print(f\"=== {i} ===\")\n",
" print(f\"chunk.text:\\n{repr(f'{chunk.text[:300]}…')}\")\n",
" print(f\"chunk.text:\\n{f'{chunk.text[:300]}…'!r}\")\n",
"\n",
" enriched_text = chunker.serialize(chunk=chunk)\n",
" print(f\"chunker.serialize(chunk):\\n{repr(f'{enriched_text[:300]}…')}\")\n",
" print(f\"chunker.serialize(chunk):\\n{f'{enriched_text[:300]}…'!r}\")\n",
"\n",
" print()"
]
@ -353,11 +353,11 @@
"for i, chunk in enumerate(chunks):\n",
" print(f\"=== {i} ===\")\n",
" txt_tokens = len(tokenizer.tokenize(chunk.text))\n",
" print(f\"chunk.text ({txt_tokens} tokens):\\n{repr(chunk.text)}\")\n",
" print(f\"chunk.text ({txt_tokens} tokens):\\n{chunk.text!r}\")\n",
"\n",
" ser_txt = chunker.serialize(chunk=chunk)\n",
" ser_tokens = len(tokenizer.tokenize(ser_txt))\n",
" print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{repr(ser_txt)}\")\n",
" print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{ser_txt!r}\")\n",
"\n",
" print()"
]

View File

@ -2,17 +2,14 @@ import json
import time
from pathlib import Path
import yaml
from docling_core.types.doc import DocItemLabel, ImageRefMode
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
VlmPipelineOptions,
granite_vision_vlm_conversion_options,
smoldocling_vlm_conversion_options,
smoldocling_vlm_mlx_conversion_options,
)
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline
@ -39,9 +36,6 @@ pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
## Alternative VLM models:
# pipeline_options.vlm_options = granite_vision_vlm_conversion_options
from docling_core.types.doc import DocItemLabel, ImageRefMode
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
## Set up pipeline for PDF or image inputs
converter = DocumentConverter(
format_options={
@ -62,7 +56,7 @@ out_path.mkdir(parents=True, exist_ok=True)
for source in sources:
start_time = time.time()
print("================================================")
print("Processing... {}".format(source))
print(f"Processing... {source}")
print("================================================")
print("")
@ -77,7 +71,7 @@ for source in sources:
print(page.predictions.vlm_response.text)
res.document.save_as_html(
filename=Path("{}/{}.html".format(out_path, res.input.file.stem)),
filename=Path(f"{out_path}/{res.input.file.stem}.html"),
image_mode=ImageRefMode.REFERENCED,
labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
)

View File

@ -144,7 +144,7 @@
"for pic in doc.pictures[:5]:\n",
" html_item = (\n",
" f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n",
" f'<img src=\"{str(pic.image.uri)}\" /><br />'\n",
" f'<img src=\"{pic.image.uri!s}\" /><br />'\n",
" f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n",
" )\n",
" for annotation in pic.annotations:\n",
@ -252,7 +252,7 @@
"for pic in doc.pictures[:5]:\n",
" html_item = (\n",
" f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n",
" f'<img src=\"{str(pic.image.uri)}\" /><br />'\n",
" f'<img src=\"{pic.image.uri!s}\" /><br />'\n",
" f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n",
" )\n",
" for annotation in pic.annotations:\n",

View File

@ -283,7 +283,7 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": null,
"metadata": {},
"outputs": [
{
@ -369,7 +369,7 @@
" new_index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)\n",
" try:\n",
" index_client.delete_index(index_name)\n",
" except:\n",
" except Exception:\n",
" pass\n",
"\n",
" index_client.create_or_update_index(new_index)\n",
@ -487,7 +487,7 @@
"\n",
" all_succeeded = all(r.succeeded for r in resp)\n",
" console.print(\n",
" f\"Uploaded batch {i} -> {i+len(subset)}; all_succeeded: {all_succeeded}, \"\n",
" f\"Uploaded batch {i} -> {i + len(subset)}; all_succeeded: {all_succeeded}, \"\n",
" f\"first_doc_status_code: {resp[0].status_code}\"\n",
" )\n",
"\n",
@ -807,10 +807,12 @@
}
],
"source": [
"from typing import Optional\n",
"\n",
"from azure.search.documents.models import VectorizableTextQuery\n",
"\n",
"\n",
"def generate_chat_response(prompt: str, system_message: str = None):\n",
"def generate_chat_response(prompt: str, system_message: Optional[str] = None):\n",
" \"\"\"\n",
" Generates a single-turn chat response using Azure OpenAI Chat.\n",
" If you need multi-turn conversation or follow-up queries, you'll have to\n",

View File

@ -351,7 +351,7 @@
"for source in sources:\n",
" if EXPORT_TYPE == ExportType.DOC_CHUNKS:\n",
" doc_chunk = DocChunk.model_validate(source.meta[\"dl_meta\"])\n",
" print(f\"- text: {repr(doc_chunk.text)}\")\n",
" print(f\"- text: {doc_chunk.text!r}\")\n",
" if doc_chunk.meta.origin:\n",
" print(f\" file: {doc_chunk.meta.origin.filename}\")\n",
" if doc_chunk.meta.headings:\n",

View File

@ -341,7 +341,7 @@
"print(f\"Question:\\n{resp_dict['input']}\\n\\nAnswer:\\n{clipped_answer}\")\n",
"for i, doc in enumerate(resp_dict[\"context\"]):\n",
" print()\n",
" print(f\"Source {i+1}:\")\n",
" print(f\"Source {i + 1}:\")\n",
" print(f\" text: {json.dumps(clip_text(doc.page_content, threshold=350))}\")\n",
" for key in doc.metadata:\n",
" if key != \"pk\":\n",

View File

@ -59,7 +59,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {
"collapsed": true,
"id": "u076oUSF_YUG"
@ -72,12 +72,11 @@
"%pip install rich\n",
"%pip install torch\n",
"\n",
"import logging\n",
"import warnings\n",
"\n",
"warnings.filterwarnings(\"ignore\")\n",
"\n",
"import logging\n",
"\n",
"# Suppress Weaviate client logs\n",
"logging.getLogger(\"weaviate\").setLevel(logging.ERROR)"
]
@ -119,7 +118,7 @@
" device = torch.device(\"mps\")\n",
" print(\"MPS GPU is enabled.\")\n",
"else:\n",
" raise EnvironmentError(\n",
" raise OSError(\n",
" \"No GPU or MPS device found. Please check your environment and ensure GPU or MPS support is configured.\"\n",
" )"
]
@ -226,7 +225,6 @@
}
],
"source": [
"from docling.datamodel.document import ConversionResult\n",
"from docling.document_converter import DocumentConverter\n",
"\n",
"# Instantiate the doc converter\n",
@ -345,7 +343,7 @@
"\n",
" openai_api_key = os.getenv(openai_api_key_var)\n",
" if not openai_api_key:\n",
" raise EnvironmentError(\n",
" raise OSError(\n",
" f\"Environment variable '{openai_api_key_var}' is not set. \"\n",
" \"Please define it before running this script.\"\n",
" )"
@ -387,7 +385,6 @@
"outputs": [],
"source": [
"import weaviate.classes.config as wc\n",
"from weaviate.classes.config import DataType, Property\n",
"\n",
"# Define the collection name\n",
"collection_name = \"docling\"\n",

View File

@ -25,9 +25,7 @@ def main():
document = mdb.convert()
out_path = Path("scratch")
print(
f"Document {path} converted." f"\nSaved markdown output to: {str(out_path)}"
)
print(f"Document {path} converted.\nSaved markdown output to: {out_path!s}")
# Export Docling document format to markdowndoc:
fn = os.path.basename(path)

View File

@ -1,13 +1,10 @@
from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
PdfPipelineOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption

View File

@ -63,7 +63,7 @@ def main():
out_path = Path("scratch")
print(
f"Document {res.input.file.name} converted."
f"\nSaved markdown output to: {str(out_path)}"
f"\nSaved markdown output to: {out_path!s}"
)
_log.debug(res.document._export_to_indented_text(max_text_len=16))
# Export Docling document format to markdowndoc:

View File

@ -4,7 +4,6 @@ from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
PdfPipelineOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

View File

@ -2,9 +2,9 @@ import logging
import time
from pathlib import Path
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem, TextItem
from docling_core.types.doc import ImageRefMode, TableItem, TextItem
from docling.datamodel.base_models import FigureElement, InputFormat, Table
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
@ -15,7 +15,6 @@ IMAGE_RESOLUTION_SCALE = 2.0
# FIXME: put in your favorite translation code ....
def translate(text: str, src: str = "en", dest: str = "de"):
_log.warning("!!! IMPLEMENT HERE YOUR FAVORITE TRANSLATION CODE!!!")
# from googletrans import Translator
@ -52,10 +51,9 @@ def main():
}
)
start_time = time.time()
conv_res = doc_converter.convert(input_doc_path)
conv_doc = conv_res.document
doc_filename = conv_res.input.file
# Save markdown with embedded pictures in original text
md_filename = output_dir / f"{doc_filename}-with-images-orig.md"

View File

@ -432,7 +432,7 @@
"\n",
"for i, doc in enumerate(resp_dict[\"context\"][:]):\n",
" image_by_page = {}\n",
" print(f\"Source {i+1}:\")\n",
" print(f\"Source {i + 1}:\")\n",
" print(f\" text: {json.dumps(clip_text(doc.page_content, threshold=350))}\")\n",
" meta = DocMeta.model_validate(doc.metadata[\"dl_meta\"])\n",
"\n",

View File

@ -10,7 +10,6 @@ from docling.datamodel.pipeline_options import (
ApiVlmOptions,
ResponseFormat,
VlmPipelineOptions,
granite_vision_vlm_ollama_conversion_options,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

poetry.lock generated
View File

@ -692,6 +692,84 @@ traitlets = ">=4"
[package.extras]
test = ["pytest"]
[[package]]
name = "coverage"
version = "7.8.0"
description = "Code coverage measurement for Python"
optional = false
python-versions = ">=3.9"
files = [
{file = "coverage-7.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2931f66991175369859b5fd58529cd4b73582461877ecfd859b6549869287ffe"},
{file = "coverage-7.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:52a523153c568d2c0ef8826f6cc23031dc86cffb8c6aeab92c4ff776e7951b28"},
{file = "coverage-7.8.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c8a5c139aae4c35cbd7cadca1df02ea8cf28a911534fc1b0456acb0b14234f3"},
{file = "coverage-7.8.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5a26c0c795c3e0b63ec7da6efded5f0bc856d7c0b24b2ac84b4d1d7bc578d676"},
{file = "coverage-7.8.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:821f7bcbaa84318287115d54becb1915eece6918136c6f91045bb84e2f88739d"},
{file = "coverage-7.8.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a321c61477ff8ee705b8a5fed370b5710c56b3a52d17b983d9215861e37b642a"},
{file = "coverage-7.8.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:ed2144b8a78f9d94d9515963ed273d620e07846acd5d4b0a642d4849e8d91a0c"},
{file = "coverage-7.8.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:042e7841a26498fff7a37d6fda770d17519982f5b7d8bf5278d140b67b61095f"},
{file = "coverage-7.8.0-cp310-cp310-win32.whl", hash = "sha256:f9983d01d7705b2d1f7a95e10bbe4091fabc03a46881a256c2787637b087003f"},
{file = "coverage-7.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:5a570cd9bd20b85d1a0d7b009aaf6c110b52b5755c17be6962f8ccd65d1dbd23"},
{file = "coverage-7.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e7ac22a0bb2c7c49f441f7a6d46c9c80d96e56f5a8bc6972529ed43c8b694e27"},
{file = "coverage-7.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bf13d564d310c156d1c8e53877baf2993fb3073b2fc9f69790ca6a732eb4bfea"},
{file = "coverage-7.8.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5761c70c017c1b0d21b0815a920ffb94a670c8d5d409d9b38857874c21f70d7"},
{file = "coverage-7.8.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e5ff52d790c7e1628241ffbcaeb33e07d14b007b6eb00a19320c7b8a7024c040"},
{file = "coverage-7.8.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d39fc4817fd67b3915256af5dda75fd4ee10621a3d484524487e33416c6f3543"},
{file = "coverage-7.8.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b44674870709017e4b4036e3d0d6c17f06a0e6d4436422e0ad29b882c40697d2"},
{file = "coverage-7.8.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8f99eb72bf27cbb167b636eb1726f590c00e1ad375002230607a844d9e9a2318"},
{file = "coverage-7.8.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b571bf5341ba8c6bc02e0baeaf3b061ab993bf372d982ae509807e7f112554e9"},
{file = "coverage-7.8.0-cp311-cp311-win32.whl", hash = "sha256:e75a2ad7b647fd8046d58c3132d7eaf31b12d8a53c0e4b21fa9c4d23d6ee6d3c"},
{file = "coverage-7.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:3043ba1c88b2139126fc72cb48574b90e2e0546d4c78b5299317f61b7f718b78"},
{file = "coverage-7.8.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:bbb5cc845a0292e0c520656d19d7ce40e18d0e19b22cb3e0409135a575bf79fc"},
{file = "coverage-7.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4dfd9a93db9e78666d178d4f08a5408aa3f2474ad4d0e0378ed5f2ef71640cb6"},
{file = "coverage-7.8.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f017a61399f13aa6d1039f75cd467be388d157cd81f1a119b9d9a68ba6f2830d"},
{file = "coverage-7.8.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0915742f4c82208ebf47a2b154a5334155ed9ef9fe6190674b8a46c2fb89cb05"},
{file = "coverage-7.8.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a40fcf208e021eb14b0fac6bdb045c0e0cab53105f93ba0d03fd934c956143a"},
{file = "coverage-7.8.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a1f406a8e0995d654b2ad87c62caf6befa767885301f3b8f6f73e6f3c31ec3a6"},
{file = "coverage-7.8.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:77af0f6447a582fdc7de5e06fa3757a3ef87769fbb0fdbdeba78c23049140a47"},
{file = "coverage-7.8.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:f2d32f95922927186c6dbc8bc60df0d186b6edb828d299ab10898ef3f40052fe"},
{file = "coverage-7.8.0-cp312-cp312-win32.whl", hash = "sha256:769773614e676f9d8e8a0980dd7740f09a6ea386d0f383db6821df07d0f08545"},
{file = "coverage-7.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:e5d2b9be5b0693cf21eb4ce0ec8d211efb43966f6657807f6859aab3814f946b"},
{file = "coverage-7.8.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5ac46d0c2dd5820ce93943a501ac5f6548ea81594777ca585bf002aa8854cacd"},
{file = "coverage-7.8.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:771eb7587a0563ca5bb6f622b9ed7f9d07bd08900f7589b4febff05f469bea00"},
{file = "coverage-7.8.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42421e04069fb2cbcbca5a696c4050b84a43b05392679d4068acbe65449b5c64"},
{file = "coverage-7.8.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:554fec1199d93ab30adaa751db68acec2b41c5602ac944bb19187cb9a41a8067"},
{file = "coverage-7.8.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5aaeb00761f985007b38cf463b1d160a14a22c34eb3f6a39d9ad6fc27cb73008"},
{file = "coverage-7.8.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:581a40c7b94921fffd6457ffe532259813fc68eb2bdda60fa8cc343414ce3733"},
{file = "coverage-7.8.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:f319bae0321bc838e205bf9e5bc28f0a3165f30c203b610f17ab5552cff90323"},
{file = "coverage-7.8.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:04bfec25a8ef1c5f41f5e7e5c842f6b615599ca8ba8391ec33a9290d9d2db3a3"},
{file = "coverage-7.8.0-cp313-cp313-win32.whl", hash = "sha256:dd19608788b50eed889e13a5d71d832edc34fc9dfce606f66e8f9f917eef910d"},
{file = "coverage-7.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:a9abbccd778d98e9c7e85038e35e91e67f5b520776781d9a1e2ee9d400869487"},
{file = "coverage-7.8.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:18c5ae6d061ad5b3e7eef4363fb27a0576012a7447af48be6c75b88494c6cf25"},
{file = "coverage-7.8.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:95aa6ae391a22bbbce1b77ddac846c98c5473de0372ba5c463480043a07bff42"},
{file = "coverage-7.8.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e013b07ba1c748dacc2a80e69a46286ff145935f260eb8c72df7185bf048f502"},
{file = "coverage-7.8.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d766a4f0e5aa1ba056ec3496243150698dc0481902e2b8559314368717be82b1"},
{file = "coverage-7.8.0-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad80e6b4a0c3cb6f10f29ae4c60e991f424e6b14219d46f1e7d442b938ee68a4"},
{file = "coverage-7.8.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:b87eb6fc9e1bb8f98892a2458781348fa37e6925f35bb6ceb9d4afd54ba36c73"},
{file = "coverage-7.8.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:d1ba00ae33be84066cfbe7361d4e04dec78445b2b88bdb734d0d1cbab916025a"},
{file = "coverage-7.8.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f3c38e4e5ccbdc9198aecc766cedbb134b2d89bf64533973678dfcf07effd883"},
{file = "coverage-7.8.0-cp313-cp313t-win32.whl", hash = "sha256:379fe315e206b14e21db5240f89dc0774bdd3e25c3c58c2c733c99eca96f1ada"},
{file = "coverage-7.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:2e4b6b87bb0c846a9315e3ab4be2d52fac905100565f4b92f02c445c8799e257"},
{file = "coverage-7.8.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fa260de59dfb143af06dcf30c2be0b200bed2a73737a8a59248fcb9fa601ef0f"},
{file = "coverage-7.8.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:96121edfa4c2dfdda409877ea8608dd01de816a4dc4a0523356067b305e4e17a"},
{file = "coverage-7.8.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b8af63b9afa1031c0ef05b217faa598f3069148eeee6bb24b79da9012423b82"},
{file = "coverage-7.8.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:89b1f4af0d4afe495cd4787a68e00f30f1d15939f550e869de90a86efa7e0814"},
{file = "coverage-7.8.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94ec0be97723ae72d63d3aa41961a0b9a6f5a53ff599813c324548d18e3b9e8c"},
{file = "coverage-7.8.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:8a1d96e780bdb2d0cbb297325711701f7c0b6f89199a57f2049e90064c29f6bd"},
{file = "coverage-7.8.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:f1d8a2a57b47142b10374902777e798784abf400a004b14f1b0b9eaf1e528ba4"},
{file = "coverage-7.8.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:cf60dd2696b457b710dd40bf17ad269d5f5457b96442f7f85722bdb16fa6c899"},
{file = "coverage-7.8.0-cp39-cp39-win32.whl", hash = "sha256:be945402e03de47ba1872cd5236395e0f4ad635526185a930735f66710e1bd3f"},
{file = "coverage-7.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:90e7fbc6216ecaffa5a880cdc9c77b7418c1dcb166166b78dbc630d07f278cc3"},
{file = "coverage-7.8.0-pp39.pp310.pp311-none-any.whl", hash = "sha256:b8194fb8e50d556d5849753de991d390c5a1edeeba50f68e3a9253fbd8bf8ccd"},
{file = "coverage-7.8.0-py3-none-any.whl", hash = "sha256:dbf364b4c5e7bae9250528167dfe40219b62e2d573c854d74be213e1e52069f7"},
{file = "coverage-7.8.0.tar.gz", hash = "sha256:7a3d62b3b03b4b6fd41a085f3574874cf946cb4604d2b4d3e8dca8cd570ca501"},
]
[package.dependencies]
tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.11.0a6\" and extra == \"toml\""}
[package.extras]
toml = ["tomli"]
[[package]]
name = "cryptography"
version = "43.0.3"
@ -5073,6 +5151,24 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
[package.extras]
testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
[[package]]
name = "pytest-cov"
version = "6.1.1"
description = "Pytest plugin for measuring coverage."
optional = false
python-versions = ">=3.9"
files = [
{file = "pytest_cov-6.1.1-py3-none-any.whl", hash = "sha256:bddf29ed2d0ab6f4df17b4c55b0a657287db8684af9c42ea546b21b1041b3dde"},
{file = "pytest_cov-6.1.1.tar.gz", hash = "sha256:46935f7aaefba760e716c2ebfbe1c216240b9592966e7da99ea8292d4d3e2a0a"},
]
[package.dependencies]
coverage = {version = ">=7.5", extras = ["toml"]}
pytest = ">=4.6"
[package.extras]
testing = ["fields", "hunter", "process-tests", "pytest-xdist", "virtualenv"]
[[package]]
name = "pytest-xdist"
version = "3.6.1"
@ -7882,4 +7978,4 @@ vlm = ["accelerate", "transformers", "transformers"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "d2a8f7997b9ffb249ad26ba492b766d580bdb0072d50e76b0afd92496e983e96"
content-hash = "b36037ec17dc4b6d5197a2f63a1367e05bf888b4fa97e2e2e8c29c217741d69c"

View File

@ -110,6 +110,8 @@ ipywidgets = "^8.1.5"
nbqa = "^1.9.0"
types-openpyxl = "^3.1.5.20241114"
types-tqdm = "^4.67.0.20241221"
coverage = "^7.6.2"
pytest-cov = "^6.0.0"
[tool.poetry.group.docs.dependencies]
mkdocs-material = "^9.5.40"
@ -164,15 +166,82 @@ docling-tools = "docling.cli.tools:app"
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
[tool.black]
[tool.ruff]
target-version = "py39"
line-length = 88
target-version = ["py39"]
include = '\.pyi?$'
respect-gitignore = true
[tool.isort]
profile = "black"
line_length = 88
py_version = 39
# extend-exclude = [
# "tests",
# ]
[tool.ruff.format]
skip-magic-trailing-comma = false
[tool.ruff.lint]
select = [
# "B", # flake8-bugbear
"C", # flake8-comprehensions
"C9", # mccabe
# "D", # flake8-docstrings
"E", # pycodestyle errors (default)
"F", # pyflakes (default)
"I", # isort
"PD", # pandas-vet
"PIE", # pie
# "PTH", # pathlib
"Q", # flake8-quotes
# "RET", # return
"RUF", # Enable all ruff-specific checks
# "SIM", # simplify
"S307", # eval
# "T20", # (disallow print statements) keep debugging statements out of the codebase
"W", # pycodestyle warnings
"ASYNC", # async
"UP", # pyupgrade
]
ignore = [
"C408", # Unnecessary `dict()` call (rewrite as a literal)
"E501", # Line too long, handled by ruff formatter
"D107", # "Missing docstring in __init__",
"F401", # imported but unused; consider using `importlib.util.find_spec` to test for "
"F811", # "redefinition of the same function"
"PL", # Pylint
"RUF012", # Mutable Class Attributes
"UP006", # List vs list, etc
"UP007", # Option and Union
"UP035", # `typing.Set` is deprecated, use `set` instead"
]
#extend-select = []
[tool.ruff.lint.pep8-naming]
classmethod-decorators = [
# Allow Pydantic's `@validator` decorator to trigger class method treatment.
"pydantic.validator",
]
[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["E402", "F401"]
"tests/*.py" = ["ASYNC"] # Disable ASYNC check for tests
[tool.ruff.lint.mccabe]
max-complexity = 20
# [tool.ruff.lint.isort.sections]
# "docling" = ["docling_core", "docling_ibm_models", "docling_parse"]
[tool.ruff.lint.isort]
combine-as-imports = true
# section-order = [
# "future",
# "standard-library",
# "third-party",
# "docling",
# "first-party",
# "local-folder",
# ]
[tool.mypy]
pretty = true
@ -200,10 +269,6 @@ module = [
]
ignore_missing_imports = true
[tool.flake8]
max-line-length = 88
extend-ignore = ["E203", "E501"]
[tool.semantic_release]
# for default values check:
# https://github.com/python-semantic-release/python-semantic-release/blob/v7.32.2/semantic_release/defaults.cfg
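Among the rules selected above, E741 (ambiguous single-letter names) is the one several hunks in this diff silence inline with noqa rather than renaming. A minimal illustration of that suppression pattern, independent of the project's code:

# Without the trailing comment, ruff flags the single-letter name as E741.
l = 42.0  # noqa: E741
# Renaming is usually the better fix; the inline suppression is reserved for
# spots where the short name mirrors an external convention, such as a
# bounding-box "left" coordinate.
left = 42.0
assert l == left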

View File

@ -19,7 +19,6 @@ def _get_backend(fname):
def test_asciidocs_examples():
fnames = sorted(glob.glob("./tests/data/asciidoc/*.asciidoc"))
for fname in fnames:
@ -38,8 +37,8 @@ def test_asciidocs_examples():
print("\n\n", pred_mddoc)
if os.path.exists(gname):
with open(gname, "r") as fr:
true_mddoc = fr.read()
with open(gname) as fr:
fr.read()
# assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
else:

View File

@ -1,5 +1,3 @@
import json
import os
from pathlib import Path
from pytest import warns
@ -15,22 +13,19 @@ GENERATE = GEN_TEST_DATA
def get_csv_paths():
# Define the directory you want to search
directory = Path(f"./tests/data/csv/")
directory = Path("./tests/data/csv/")
# List all CSV files in the directory and its subdirectories
return sorted(directory.rglob("*.csv"))
def get_csv_path(name: str):
# Return the matching CSV file path
return Path(f"./tests/data/csv/{name}.csv")
def get_converter():
converter = DocumentConverter(allowed_formats=[InputFormat.CSV])
return converter
@ -55,9 +50,9 @@ def test_e2e_valid_csv_conversions():
pred_itxt: str = doc._export_to_indented_text(
max_text_len=70, explicit_tables=False
)
assert verify_export(
pred_itxt, str(gt_path) + ".itxt"
), "export to indented-text"
assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
"export to indented-text"
)
assert verify_document(
pred_doc=doc,

View File

@ -32,7 +32,7 @@ def test_text_cell_counts():
doc_backend = _get_backend(pdf_doc)
for page_index in range(0, doc_backend.page_count()):
for page_index in range(doc_backend.page_count()):
last_cell_count = None
for i in range(10):
page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
@ -42,9 +42,9 @@ def test_text_cell_counts():
last_cell_count = len(cells)
if len(cells) != last_cell_count:
assert (
False
), "Loading page multiple times yielded non-identical text cell counts"
assert False, (
"Loading page multiple times yielded non-identical text cell counts"
)
last_cell_count = len(cells)
@ -66,7 +66,7 @@ def test_crop_page_image(test_doc_path):
page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
# Crop out "Figure 1" from the DocLayNet paper
im = page_backend.get_page_image(
page_backend.get_page_image(
scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
)
# im.show()

View File

@ -31,7 +31,7 @@ def test_text_cell_counts():
doc_backend = _get_backend(pdf_doc)
for page_index in range(0, doc_backend.page_count()):
for page_index in range(doc_backend.page_count()):
last_cell_count = None
for i in range(10):
page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0)
@ -41,9 +41,9 @@ def test_text_cell_counts():
last_cell_count = len(cells)
if len(cells) != last_cell_count:
assert (
False
), "Loading page multiple times yielded non-identical text cell counts"
assert False, (
"Loading page multiple times yielded non-identical text cell counts"
)
last_cell_count = len(cells)
@ -65,7 +65,7 @@ def test_crop_page_image(test_doc_path):
page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0)
# Crop out "Figure 1" from the DocLayNet paper
im = page_backend.get_page_image(
page_backend.get_page_image(
scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
)
# im.show()

View File

@ -31,7 +31,7 @@ def test_text_cell_counts():
doc_backend = _get_backend(pdf_doc)
for page_index in range(0, doc_backend.page_count()):
for page_index in range(doc_backend.page_count()):
last_cell_count = None
for i in range(10):
page_backend: DoclingParseV4PageBackend = doc_backend.load_page(0)
@ -41,9 +41,9 @@ def test_text_cell_counts():
last_cell_count = len(cells)
if len(cells) != last_cell_count:
assert (
False
), "Loading page multiple times yielded non-identical text cell counts"
assert False, (
"Loading page multiple times yielded non-identical text cell counts"
)
last_cell_count = len(cells)
@ -65,7 +65,7 @@ def test_crop_page_image(test_doc_path):
page_backend: DoclingParseV4PageBackend = doc_backend.load_page(0)
# Crop out "Figure 1" from the DocLayNet paper
im = page_backend.get_page_image(
page_backend.get_page_image(
scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
)
# im.show()

View File

@ -105,7 +105,6 @@ def test_ordered_lists():
def get_html_paths():
# Define the directory you want to search
directory = Path("./tests/data/html/")
@ -115,14 +114,12 @@ def get_html_paths():
def get_converter():
converter = DocumentConverter(allowed_formats=[InputFormat.HTML])
return converter
def test_e2e_html_conversions():
html_paths = get_html_paths()
converter = get_converter()
@ -138,15 +135,15 @@ def test_e2e_html_conversions():
doc: DoclingDocument = conv_result.document
pred_md: str = doc.export_to_markdown()
assert verify_export(
pred_md, str(gt_path) + ".md", generate=GENERATE
), "export to md"
assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
"export to md"
)
pred_itxt: str = doc._export_to_indented_text(
max_text_len=70, explicit_tables=False
)
assert verify_export(
pred_itxt, str(gt_path) + ".itxt", generate=GENERATE
), "export to indented-text"
assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
"export to indented-text"
)
assert verify_document(doc, str(gt_path) + ".json", GENERATE)

View File

@ -15,7 +15,7 @@ GENERATE = GEN_TEST_DATA
def get_pubmed_paths():
directory = Path(os.path.dirname(__file__) + f"/data/pubmed/")
directory = Path(os.path.dirname(__file__) + "/data/pubmed/")
xml_files = sorted(directory.rglob("*.xml"))
return xml_files
@ -47,9 +47,9 @@ def test_e2e_pubmed_conversions(use_stream=False):
pred_itxt: str = doc._export_to_indented_text(
max_text_len=70, explicit_tables=False
)
assert verify_export(
pred_itxt, str(gt_path) + ".itxt"
), "export to indented-text"
assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
"export to indented-text"
)
assert verify_document(doc, str(gt_path) + ".json", GENERATE), "export to json"

View File

@ -17,7 +17,6 @@ GENERATE = GEN_TEST_DATA
def get_xlsx_paths():
# Define the directory you want to search
directory = Path("./tests/data/xlsx/")
@ -27,7 +26,6 @@ def get_xlsx_paths():
def get_converter():
converter = DocumentConverter(allowed_formats=[InputFormat.XLSX])
return converter
@ -65,13 +63,13 @@ def test_e2e_xlsx_conversions(documents) -> None:
pred_itxt: str = doc._export_to_indented_text(
max_text_len=70, explicit_tables=False
)
assert verify_export(
pred_itxt, str(gt_path) + ".itxt"
), "export to indented-text"
assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
"export to indented-text"
)
assert verify_document(
doc, str(gt_path) + ".json", GENERATE
), "document document"
assert verify_document(doc, str(gt_path) + ".json", GENERATE), (
"document document"
)
def test_pages(documents) -> None:
@ -81,7 +79,7 @@ def test_pages(documents) -> None:
documents: The paths and converted documents.
"""
# number of pages from the backend method
path = [item for item in get_xlsx_paths() if item.stem == "test-01"][0]
path = next(item for item in get_xlsx_paths() if item.stem == "test-01")
in_doc = InputDocument(
path_or_stream=path,
format=InputFormat.XLSX,
@ -92,7 +90,7 @@ def test_pages(documents) -> None:
assert backend.page_count() == 3
# number of pages from the converted document
doc = [item for path, item in documents if path.stem == "test-01"][0]
doc = next(item for path, item in documents if path.stem == "test-01")
assert len(doc.pages) == 3
# page sizes as number of cells

View File

@ -1,4 +1,3 @@
import os
from pathlib import Path
from docling.backend.msword_backend import MsWordDocumentBackend
@ -43,7 +42,6 @@ def test_heading_levels():
def get_docx_paths():
# Define the directory you want to search
directory = Path("./tests/data/docx/")
@ -53,14 +51,12 @@ def get_docx_paths():
def get_converter():
converter = DocumentConverter(allowed_formats=[InputFormat.DOCX])
return converter
def test_e2e_docx_conversions():
docx_paths = get_docx_paths()
converter = get_converter()
@ -76,20 +72,20 @@ def test_e2e_docx_conversions():
doc: DoclingDocument = conv_result.document
pred_md: str = doc.export_to_markdown()
assert verify_export(
pred_md, str(gt_path) + ".md", generate=GENERATE
), "export to md"
assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
"export to md"
)
pred_itxt: str = doc._export_to_indented_text(
max_text_len=70, explicit_tables=False
)
assert verify_export(
pred_itxt, str(gt_path) + ".itxt", generate=GENERATE
), "export to indented-text"
assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
"export to indented-text"
)
assert verify_document(
doc, str(gt_path) + ".json", generate=GENERATE
), "document document"
assert verify_document(doc, str(gt_path) + ".json", generate=GENERATE), (
"document document"
)
if docx_path.name == "word_tables.docx":
pred_html: str = doc.export_to_html()

View File

@ -109,27 +109,27 @@ def test_patent_groundtruth(patents, groundtruth):
md_name = path.stem + ".md"
if md_name in gt_names:
pred_md = doc.export_to_markdown()
assert (
pred_md == gt_names[md_name]
), f"Markdown file mismatch against groundtruth {md_name}"
assert pred_md == gt_names[md_name], (
f"Markdown file mismatch against groundtruth {md_name}"
)
json_path = path.with_suffix(".json")
if json_path.stem in gt_names:
assert verify_document(
doc, str(json_path), GENERATE
), f"JSON file mismatch against groundtruth {json_path}"
assert verify_document(doc, str(json_path), GENERATE), (
f"JSON file mismatch against groundtruth {json_path}"
)
itxt_name = path.stem + ".itxt"
if itxt_name in gt_names:
pred_itxt = doc._export_to_indented_text()
assert (
pred_itxt == gt_names[itxt_name]
), f"Indented text file mismatch against groundtruth {itxt_name}"
assert pred_itxt == gt_names[itxt_name], (
f"Indented text file mismatch against groundtruth {itxt_name}"
)
def test_tables(tables):
"""Test the table parser."""
# CHECK table in file tables_20180000016.xml
file_name = "tables_ipa20180000016.xml"
file_table = [item[1] for item in tables if item[0].name == file_name][0]
file_table = next(item[1] for item in tables if item[0].name == file_name)
assert file_table.num_rows == 13
assert file_table.num_cols == 10
assert len(file_table.table_cells) == 130
@ -140,7 +140,7 @@ def test_patent_uspto_ice(patents):
# CHECK application doc number 20200022300
file_name = "ipa20200022300.xml"
doc = [item[1] for item in patents if item[0].name == file_name][0]
doc = next(item[1] for item in patents if item[0].name == file_name)
if GENERATE:
_generate_groundtruth(doc, Path(file_name).stem)
@ -278,7 +278,7 @@ def test_patent_uspto_ice(patents):
# CHECK application doc number 20180000016 for HTML entities, level 2 headings, tables
file_name = "ipa20180000016.xml"
doc = [item[1] for item in patents if item[0].name == file_name][0]
doc = next(item[1] for item in patents if item[0].name == file_name)
if GENERATE:
_generate_groundtruth(doc, Path(file_name).stem)
@ -348,7 +348,7 @@ def test_patent_uspto_ice(patents):
# CHECK application doc number 20110039701 for complex long tables
file_name = "ipa20110039701.xml"
doc = [item[1] for item in patents if item[0].name == file_name][0]
doc = next(item[1] for item in patents if item[0].name == file_name)
assert doc.name == file_name
assert len(doc.tables) == 17
@ -358,7 +358,7 @@ def test_patent_uspto_grant_v2(patents):
# CHECK application doc number 06442728
file_name = "pg06442728.xml"
doc = [item[1] for item in patents if item[0].name == file_name][0]
doc = next(item[1] for item in patents if item[0].name == file_name)
if GENERATE:
_generate_groundtruth(doc, Path(file_name).stem)
@ -376,12 +376,12 @@ def test_patent_uspto_grant_v2(patents):
assert isinstance(texts[2], TextItem)
assert texts[2].text == (
"An interleaver receives incoming data frames of size N. The interleaver "
"indexes the elements of the frame with an N₁×N₂ index array. The interleaver "
"indexes the elements of the frame with an N₁×N₂ index array. The interleaver " # noqa: RUF001
"then effectively rearranges (permutes) the data by permuting the rows of the "
"index array. The interleaver employs the equation I(j,k)=I(j,αjk+βj)modP) to "
"index array. The interleaver employs the equation I(j,k)=I(j,αjk+βj)modP) to " # noqa: RUF001
"permute the columns (indexed by k) of each row (indexed by j). P is at least "
"equal to N₂, βj is a constant which may be different for each row, and each "
"αj is a relative prime number relative to P. After permuting, the "
"αj is a relative prime number relative to P. After permuting, the " # noqa: RUF001
"interleaver outputs the data in a different order than received (e.g., "
"receives sequentially row by row, outputs sequentially each column by column)."
)
@ -402,7 +402,7 @@ def test_patent_uspto_app_v1(patents):
# CHECK application doc number 20010031492
file_name = "pa20010031492.xml"
doc = [item[1] for item in patents if item[0].name == file_name][0]
doc = next(item[1] for item in patents if item[0].name == file_name)
if GENERATE:
_generate_groundtruth(doc, Path(file_name).stem)
@ -432,7 +432,7 @@ def test_patent_uspto_grant_aps(patents):
# CHECK application doc number 057006474
file_name = "pftaps057006474.txt"
doc = [item[1] for item in patents if item[0].name == file_name][0]
doc = next(item[1] for item in patents if item[0].name == file_name)
if GENERATE:
_generate_groundtruth(doc, Path(file_name).stem)

View File

@ -32,7 +32,7 @@ def test_text_cell_counts():
doc_backend = _get_backend(pdf_doc)
for page_index in range(0, doc_backend.page_count()):
for page_index in range(doc_backend.page_count()):
last_cell_count = None
for i in range(10):
page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
@ -42,9 +42,9 @@ def test_text_cell_counts():
last_cell_count = len(cells)
if len(cells) != last_cell_count:
assert (
False
), "Loading page multiple times yielded non-identical text cell counts"
assert False, (
"Loading page multiple times yielded non-identical text cell counts"
)
last_cell_count = len(cells)
@ -66,7 +66,7 @@ def test_crop_page_image(test_doc_path):
page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
# Crop out "Figure 1" from the DocLayNet paper
im = page_backend.get_page_image(
page_backend.get_page_image(
scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
)
# im.show()

View File

@ -1,4 +1,3 @@
import os
from pathlib import Path
from docling.datamodel.base_models import InputFormat
@ -12,7 +11,6 @@ GENERATE = GEN_TEST_DATA
def get_pptx_paths():
# Define the directory you want to search
directory = Path("./tests/data/pptx/")
@ -22,14 +20,12 @@ def get_pptx_paths():
def get_converter():
converter = DocumentConverter(allowed_formats=[InputFormat.PPTX])
return converter
def test_e2e_pptx_conversions():
pptx_paths = get_pptx_paths()
converter = get_converter()
@ -50,10 +46,10 @@ def test_e2e_pptx_conversions():
pred_itxt: str = doc._export_to_indented_text(
max_text_len=70, explicit_tables=False
)
assert verify_export(
pred_itxt, str(gt_path) + ".itxt"
), "export to indented-text"
assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
"export to indented-text"
)
assert verify_document(
doc, str(gt_path) + ".json", GENERATE
), "document document"
assert verify_document(doc, str(gt_path) + ".json", GENERATE), (
"document document"
)

View File

@ -3,7 +3,6 @@ from pathlib import Path
from docling_core.types.doc import CodeItem, TextItem
from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions
@ -12,7 +11,6 @@ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
def get_converter():
pipeline_options = PdfPipelineOptions()
pipeline_options.generate_page_images = True

View File

@ -2,7 +2,6 @@ from pathlib import Path
from docling_core.types.doc import PictureClassificationData
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions
@ -11,7 +10,6 @@ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
def get_converter():
pipeline_options = PdfPipelineOptions()
pipeline_options.generate_page_images = True
@ -49,32 +47,32 @@ def test_picture_classifier():
res = results[0]
assert len(res.annotations) == 1
assert type(res.annotations[0]) == PictureClassificationData
assert isinstance(res.annotations[0], PictureClassificationData)
classification_data = res.annotations[0]
assert classification_data.provenance == "DocumentPictureClassifier"
assert (
len(classification_data.predicted_classes) == 16
), "Number of predicted classes is not equal to 16"
assert len(classification_data.predicted_classes) == 16, (
"Number of predicted classes is not equal to 16"
)
confidences = [pred.confidence for pred in classification_data.predicted_classes]
assert confidences == sorted(
confidences, reverse=True
), "Predictions are not sorted in descending order of confidence"
assert (
classification_data.predicted_classes[0].class_name == "bar_chart"
), "The prediction is wrong for the bar chart image."
assert confidences == sorted(confidences, reverse=True), (
"Predictions are not sorted in descending order of confidence"
)
assert classification_data.predicted_classes[0].class_name == "bar_chart", (
"The prediction is wrong for the bar chart image."
)
res = results[1]
assert len(res.annotations) == 1
assert type(res.annotations[0]) == PictureClassificationData
assert isinstance(res.annotations[0], PictureClassificationData)
classification_data = res.annotations[0]
assert classification_data.provenance == "DocumentPictureClassifier"
assert (
len(classification_data.predicted_classes) == 16
), "Number of predicted classes is not equal to 16"
assert len(classification_data.predicted_classes) == 16, (
"Number of predicted classes is not equal to 16"
)
confidences = [pred.confidence for pred in classification_data.predicted_classes]
assert confidences == sorted(
confidences, reverse=True
), "Predictions are not sorted in descending order of confidence"
assert (
classification_data.predicted_classes[0].class_name == "map"
), "The prediction is wrong for the bar chart image."
assert confidences == sorted(confidences, reverse=True), (
"Predictions are not sorted in descending order of confidence"
)
assert classification_data.predicted_classes[0].class_name == "map", (
"The prediction is wrong for the bar chart image."
)

View File

@ -1,7 +1,6 @@
from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import AcceleratorDevice, PdfPipelineOptions
@@ -15,7 +14,6 @@ GENERATE_V2 = GEN_TEST_DATA
def get_pdf_paths():
# Define the directory you want to search
directory = Path("./tests/data/pdf/")
@@ -25,7 +23,6 @@ def get_pdf_paths():
def get_converter():
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
@@ -45,7 +42,6 @@ def get_converter():
def test_e2e_pdfs_conversions():
pdf_paths = get_pdf_paths()
converter = get_converter()
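
Every hunk header in the file above shows the old range one line longer than the new one (for example -1,7 +1,6 or -25,7 +23,6), so a single line was deleted in each hunk, although the deleted line itself is not visible in this extract. In a lint-driven cleanup such removals are typically unused imports or stray blank lines; the snippet below is a hypothetical unused-import (F401-style) fix and does not reproduce the actual deleted lines.

# Hypothetical before: List is imported but never referenced, so a linter flags it.
# from pathlib import Path
# from typing import List
#
# def get_pdf_paths():
#     directory = Path("./tests/data/pdf/")
#     return sorted(directory.rglob("*.pdf"))

# Hypothetical after: the unused import is dropped; runtime behaviour is unchanged.
from pathlib import Path

def get_pdf_paths():
    directory = Path("./tests/data/pdf/")
    return sorted(directory.rglob("*.pdf"))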


@@ -3,7 +3,6 @@ from pathlib import Path
from typing import List
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (


@@ -12,10 +12,9 @@ from docling.document_converter import PdfFormatOption
def test_in_doc_from_valid_path():
test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
doc = _make_input_doc(test_doc_path)
-assert doc.valid == True
+assert doc.valid is True
def test_in_doc_from_invalid_path():
@@ -23,29 +22,26 @@ def test_in_doc_from_invalid_path():
doc = _make_input_doc(test_doc_path)
-assert doc.valid == False
+assert doc.valid is False
def test_in_doc_from_valid_buf():
buf = BytesIO(Path("./tests/data/pdf/2206.01062.pdf").open("rb").read())
stream = DocumentStream(name="my_doc.pdf", stream=buf)
doc = _make_input_doc_from_stream(stream)
-assert doc.valid == True
+assert doc.valid is True
def test_in_doc_from_invalid_buf():
buf = BytesIO(b"")
stream = DocumentStream(name="my_doc.pdf", stream=buf)
doc = _make_input_doc_from_stream(stream)
-assert doc.valid == False
+assert doc.valid is False
def test_image_in_pdf_backend():
in_doc = InputDocument(
path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
format=InputFormat.IMAGE,
@@ -76,7 +72,6 @@ def test_image_in_pdf_backend():
def test_in_doc_with_page_range():
test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
limits = DocumentLimits()
limits.page_range = (1, 10)
@@ -87,7 +82,7 @@ def test_in_doc_with_page_range():
backend=PyPdfiumDocumentBackend,
limits=limits,
)
-assert doc.valid == True
+assert doc.valid is True
limits.page_range = (9, 9)
@@ -97,7 +92,7 @@ def test_in_doc_with_page_range():
backend=PyPdfiumDocumentBackend,
limits=limits,
)
-assert doc.valid == True
+assert doc.valid is True
limits.page_range = (11, 12)
@@ -107,7 +102,7 @@ def test_in_doc_with_page_range():
backend=PyPdfiumDocumentBackend,
limits=limits,
)
-assert doc.valid == False
+assert doc.valid is False
def test_guess_format(tmp_path):
@@ -192,17 +187,17 @@ def test_guess_format(tmp_path):
)
doc_path = temp_dir / "docling_test.xml"
doc_path.write_text(xml_content, encoding="utf-8")
-assert dci._guess_format(doc_path) == None
+assert dci._guess_format(doc_path) is None
buf = BytesIO(Path(doc_path).open("rb").read())
stream = DocumentStream(name="docling_test.xml", stream=buf)
-assert dci._guess_format(stream) == None
+assert dci._guess_format(stream) is None
# Invalid USPTO patent (as plain text)
stream = DocumentStream(name="pftaps057006474.txt", stream=BytesIO(b"xyz"))
-assert dci._guess_format(stream) == None
+assert dci._guess_format(stream) is None
doc_path = temp_dir / "pftaps_wrong.txt"
doc_path.write_text("xyz", encoding="utf-8")
-assert dci._guess_format(doc_path) == None
+assert dci._guess_format(doc_path) is None
# Valid Docling JSON
test_str = '{"name": ""}'
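
The assertion changes in this last file replace equality comparisons against singletons with identity checks: == True / == False become is True / is False (pycodestyle E712) and == None becomes is None (E711). A minimal self-contained sketch of the rewritten pattern, using a dummy dataclass instead of docling's InputDocument:

from dataclasses import dataclass
from typing import Optional

@dataclass
class Doc:
    valid: bool
    fmt: Optional[str] = None

doc = Doc(valid=True)

# Old style (removed above): equality comparison against True/None.
# assert doc.valid == True
# assert doc.fmt == None

# New style (added above): identity checks, as E712/E711 require.
assert doc.valid is True
assert doc.fmt is None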

Some files were not shown because too many files have changed in this diff.