diff --git a/.github/codecov.yml b/.github/codecov.yml new file mode 100644 index 0000000..944d4a0 --- /dev/null +++ b/.github/codecov.yml @@ -0,0 +1,17 @@ +codecov: + # https://docs.codecov.io/docs/comparing-commits + allow_coverage_offsets: true +coverage: + status: + project: + default: + informational: true + target: auto # auto compares coverage to the previous base commit + flags: + - docling + comment: + layout: "reach, diff, flags, files" + behavior: default + require_changes: false # if true: only post the comment if coverage changes + branches: # branch names that can post comment + - "main" diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml index 1f0502d..f569130 100644 --- a/.github/workflows/cd.yml +++ b/.github/workflows/cd.yml @@ -10,6 +10,8 @@ env: jobs: code-checks: uses: ./.github/workflows/checks.yml + with: + push_coverage: false pre-release-check: runs-on: ubuntu-latest outputs: diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index ee5ba79..8b93a56 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -1,5 +1,13 @@ on: workflow_call: + inputs: + push_coverage: + type: boolean + description: "If true, the coverage results are pushed to codecov.io." + default: true + secrets: + CODECOV_TOKEN: + required: false env: HF_HUB_DOWNLOAD_TIMEOUT: "60" @@ -32,7 +40,13 @@ jobs: run: poetry install --all-extras - name: Testing run: | - poetry run pytest -v tests + poetry run pytest -v --cov=docling --cov-report=xml tests + - name: Upload coverage to Codecov + if: inputs.push_coverage + uses: codecov/codecov-action@v5 + with: + token: ${{ secrets.CODECOV_TOKEN }} + file: ./coverage.xml - name: Run examples run: | for file in docs/examples/*.py; do diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 0bf45ce..e2cf18a 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,3 +17,5 @@ jobs: code-checks: if: ${{ github.event_name == 'push' || (github.event.pull_request.head.repo.full_name != 'docling-project/docling' && github.event.pull_request.head.repo.full_name != 'docling-project/docling') }} uses: ./.github/workflows/checks.yml + secrets: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 19bb27c..041a100 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,43 +1,26 @@ fail_fast: true repos: + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.11.5 + hooks: + # Run the Ruff formatter. + - id: ruff-format + name: "Ruff formatter" + args: [--config=pyproject.toml] + files: '^(docling|tests|docs/examples).*\.(py|ipynb)$' + # Run the Ruff linter. 
+ - id: ruff + name: "Ruff linter" + args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml] + files: '^(docling|tests|docs/examples).*\.(py|ipynb)$' - repo: local hooks: - - id: black - name: Black - entry: poetry run black docling docs/examples tests - pass_filenames: false - language: system - files: '\.py$' - - id: isort - name: isort - entry: poetry run isort docling docs/examples tests - pass_filenames: false - language: system - files: '\.py$' -# - id: flake8 -# name: flake8 -# entry: poetry run flake8 docling -# pass_filenames: false -# language: system -# files: '\.py$' - id: mypy name: MyPy entry: poetry run mypy docling pass_filenames: false language: system files: '\.py$' - - id: nbqa_black - name: nbQA Black - entry: poetry run nbqa black docs/examples - pass_filenames: false - language: system - files: '\.ipynb$' - - id: nbqa_isort - name: nbQA isort - entry: poetry run nbqa isort docs/examples - pass_filenames: false - language: system - files: '\.ipynb$' - id: poetry name: Poetry check entry: poetry check --lock diff --git a/docling/backend/asciidoc_backend.py b/docling/backend/asciidoc_backend.py index 09891eb..3c41810 100644 --- a/docling/backend/asciidoc_backend.py +++ b/docling/backend/asciidoc_backend.py @@ -34,7 +34,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend): text_stream = self.path_or_stream.getvalue().decode("utf-8") self.lines = text_stream.split("\n") if isinstance(self.path_or_stream, Path): - with open(self.path_or_stream, "r", encoding="utf-8") as f: + with open(self.path_or_stream, encoding="utf-8") as f: self.lines = f.readlines() self.valid = True @@ -75,14 +75,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend): return doc - def _parse(self, doc: DoclingDocument): + def _parse(self, doc: DoclingDocument): # noqa: C901 """ Main function that orchestrates the parsing by yielding components: title, section headers, text, lists, and tables. 
""" - content = "" - in_list = False in_table = False @@ -95,7 +93,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend): # indents: dict[int, Union[DocItem, GroupItem, None]] = {} indents: dict[int, Union[GroupItem, None]] = {} - for i in range(0, 10): + for i in range(10): parents[i] = None indents[i] = None @@ -125,7 +123,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend): # Lists elif self._is_list_item(line): - _log.debug(f"line: {line}") item = self._parse_list_item(line) _log.debug(f"parsed list-item: {item}") @@ -147,7 +144,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend): indents[level + 1] = item["indent"] elif in_list and item["indent"] < indents[level]: - # print(item["indent"], " => ", indents[level]) while item["indent"] < indents[level]: # print(item["indent"], " => ", indents[level]) @@ -176,7 +172,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend): elif in_table and ( (not self._is_table_line(line)) or line.strip() == "|===" ): # end of table - caption = None if len(caption_data) > 0: caption = doc.add_text( @@ -195,7 +190,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend): # Picture elif self._is_picture(line): - caption = None if len(caption_data) > 0: caption = doc.add_text( @@ -250,7 +244,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend): text_data = [] elif len(line.strip()) > 0: # allow multiline texts - item = self._parse_text(line) text_data.append(item["text"]) @@ -273,14 +266,14 @@ class AsciiDocBackend(DeclarativeDocumentBackend): def _get_current_level(self, parents): for k, v in parents.items(): - if v == None and k > 0: + if v is None and k > 0: return k - 1 return 0 def _get_current_parent(self, parents): for k, v in parents.items(): - if v == None and k > 0: + if v is None and k > 0: return parents[k - 1] return None @@ -328,7 +321,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend): "marker": marker, "text": text.strip(), "numbered": False, - "indent": 0 if indent == None else len(indent), + "indent": 0 if indent is None else len(indent), } else: return { @@ -336,7 +329,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend): "marker": marker, "text": text.strip(), "numbered": True, - "indent": 0 if indent == None else len(indent), + "indent": 0 if indent is None else len(indent), } else: # Fallback if no match @@ -357,7 +350,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend): return [cell.strip() for cell in line.split("|") if cell.strip()] def _populate_table_as_grid(self, table_data): - num_rows = len(table_data) # Adjust the table data into a grid format diff --git a/docling/backend/csv_backend.py b/docling/backend/csv_backend.py index 9159bd4..94d37d0 100644 --- a/docling/backend/csv_backend.py +++ b/docling/backend/csv_backend.py @@ -58,7 +58,7 @@ class CsvDocumentBackend(DeclarativeDocumentBackend): head = self.content.readline() dialect = csv.Sniffer().sniff(head, ",;\t|:") _log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"') - if not dialect.delimiter in {",", ";", "\t", "|", ":"}: + if dialect.delimiter not in {",", ";", "\t", "|", ":"}: raise RuntimeError( f"Cannot convert csv with unknown delimiter {dialect.delimiter}." 
) diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py index 533ed42..33e7792 100644 --- a/docling/backend/docling_parse_backend.py +++ b/docling/backend/docling_parse_backend.py @@ -1,8 +1,9 @@ import logging import random +from collections.abc import Iterable from io import BytesIO from pathlib import Path -from typing import Iterable, List, Optional, Union +from typing import List, Optional, Union import pypdfium2 as pdfium from docling_core.types.doc import BoundingBox, CoordOrigin, Size @@ -156,7 +157,6 @@ class DoclingParsePageBackend(PdfPageBackend): def get_page_image( self, scale: float = 1, cropbox: Optional[BoundingBox] = None ) -> Image.Image: - page_size = self.get_size() if not cropbox: diff --git a/docling/backend/docling_parse_v2_backend.py b/docling/backend/docling_parse_v2_backend.py index f7475aa..6c12b66 100644 --- a/docling/backend/docling_parse_v2_backend.py +++ b/docling/backend/docling_parse_v2_backend.py @@ -1,8 +1,9 @@ import logging import random +from collections.abc import Iterable from io import BytesIO from pathlib import Path -from typing import TYPE_CHECKING, Iterable, List, Optional, Union +from typing import TYPE_CHECKING, List, Optional, Union import pypdfium2 as pdfium from docling_core.types.doc import BoundingBox, CoordOrigin @@ -172,7 +173,6 @@ class DoclingParseV2PageBackend(PdfPageBackend): def get_page_image( self, scale: float = 1, cropbox: Optional[BoundingBox] = None ) -> Image.Image: - page_size = self.get_size() if not cropbox: diff --git a/docling/backend/docling_parse_v4_backend.py b/docling/backend/docling_parse_v4_backend.py index e1e7430..3e59f12 100644 --- a/docling/backend/docling_parse_v4_backend.py +++ b/docling/backend/docling_parse_v4_backend.py @@ -1,14 +1,14 @@ import logging -import random +from collections.abc import Iterable from io import BytesIO from pathlib import Path -from typing import TYPE_CHECKING, Iterable, List, Optional, Union +from typing import TYPE_CHECKING, Optional, Union import pypdfium2 as pdfium from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc.page import SegmentedPdfPage, TextCell from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument -from PIL import Image, ImageDraw +from PIL import Image from pypdfium2 import PdfPage from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend @@ -93,7 +93,6 @@ class DoclingParseV4PageBackend(PdfPageBackend): def get_page_image( self, scale: float = 1, cropbox: Optional[BoundingBox] = None ) -> Image.Image: - page_size = self.get_size() if not cropbox: diff --git a/docling/backend/docx/latex/latex_dict.py b/docling/backend/docx/latex/latex_dict.py index 0323478..1348647 100644 --- a/docling/backend/docx/latex/latex_dict.py +++ b/docling/backend/docx/latex/latex_dict.py @@ -1,12 +1,8 @@ -# -*- coding: utf-8 -*- - """ Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py On 23/01/2025 """ -from __future__ import unicode_literals - CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~") BLANK = "" @@ -79,7 +75,6 @@ CHR_BO = { } T = { - "\u2192": "\\rightarrow ", # Greek letters "\U0001d6fc": "\\alpha ", "\U0001d6fd": "\\beta ", diff --git a/docling/backend/docx/latex/omml.py b/docling/backend/docx/latex/omml.py index b2d5f90..f927885 100644 --- a/docling/backend/docx/latex/omml.py +++ b/docling/backend/docx/latex/omml.py @@ -76,8 +76,7 @@ def get_val(key, default=None, store=CHR): return default -class Tag2Method(object): - +class Tag2Method: def 
call_method(self, elm, stag=None): getmethod = self.tag2meth.get if stag is None: @@ -130,7 +129,6 @@ class Tag2Method(object): class Pr(Tag2Method): - text = "" __val_tags = ("chr", "pos", "begChr", "endChr", "type") @@ -159,7 +157,7 @@ class Pr(Tag2Method): def do_common(self, elm): stag = elm.tag.replace(OMML_NS, "") if stag in self.__val_tags: - t = elm.get("{0}val".format(OMML_NS)) + t = elm.get(f"{OMML_NS}val") self.__innerdict[stag] = t return None @@ -248,7 +246,6 @@ class oMath2Latex(Tag2Method): """ the Pre-Sub-Superscript object -- Not support yet """ - pass def do_sub(self, elm): text = self.process_children(elm) @@ -331,7 +328,7 @@ class oMath2Latex(Tag2Method): t_dict = self.process_children_dict(elm, include=("e", "lim")) latex_s = LIM_FUNC.get(t_dict["e"]) if not latex_s: - raise NotSupport("Not support lim %s" % t_dict["e"]) + raise RuntimeError("Not support lim {}".format(t_dict["e"])) else: return latex_s.format(lim=t_dict.get("lim")) @@ -413,7 +410,7 @@ class oMath2Latex(Tag2Method): """ _str = [] _base_str = [] - found_text = elm.findtext("./{0}t".format(OMML_NS)) + found_text = elm.findtext(f"./{OMML_NS}t") if found_text: for s in found_text: out_latex_str = self.process_unicode(s) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 5889429..aa2637f 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -55,7 +55,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.max_levels = 10 self.level = 0 self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {} - for i in range(0, self.max_levels): + for i in range(self.max_levels): self.parents[i] = None try: @@ -126,7 +126,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return doc def walk(self, tag: Tag, doc: DoclingDocument) -> None: - # Iterate over elements in the body of the document text: str = "" for element in tag.children: @@ -135,7 +134,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.analyze_tag(cast(Tag, element), doc) except Exception as exc_child: _log.error( - f"Error processing child from tag {tag.name}: {repr(exc_child)}" + f"Error processing child from tag {tag.name}: {exc_child!r}" ) raise exc_child elif isinstance(element, NavigableString) and not isinstance( @@ -147,7 +146,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): item for item in element.next_siblings if isinstance(item, Tag) ] if element.next_sibling is None or any( - [item.name in TAGS_FOR_NODE_ITEMS for item in siblings] + item.name in TAGS_FOR_NODE_ITEMS for item in siblings ): text = text.strip() if text and tag.name in ["div"]: @@ -222,7 +221,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): ) else: if hlevel > self.level: - # add invisible group for i in range(self.level + 1, hlevel): self.parents[i] = doc.add_group( @@ -234,7 +232,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.level = hlevel elif hlevel < self.level: - # remove the tail for key in self.parents.keys(): if key > hlevel: @@ -360,7 +357,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): marker = "" enumerated = False if parent_label == GroupLabel.ORDERED_LIST: - marker = f"{str(index_in_list)}." + marker = f"{index_in_list!s}." 
enumerated = True doc.add_list_item( text=text, diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py index f83dd2d..0c6b306 100644 --- a/docling/backend/md_backend.py +++ b/docling/backend/md_backend.py @@ -83,7 +83,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): # otherwise they represent emphasis (bold or italic) self.markdown = self._shorten_underscore_sequences(text_stream) if isinstance(self.path_or_stream, Path): - with open(self.path_or_stream, "r", encoding="utf-8") as f: + with open(self.path_or_stream, encoding="utf-8") as f: md_content = f.read() # remove invalid sequences # very long sequences of underscores will lead to unnecessary long processing times. @@ -168,7 +168,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): ) self.inline_texts = [] - def _iterate_elements( + def _iterate_elements( # noqa: C901 self, element: marko.element.Element, depth: int, @@ -176,7 +176,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): visited: Set[marko.element.Element], parent_item: Optional[NodeItem] = None, ): - if element in visited: return @@ -236,7 +235,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): if has_non_empty_list_items: label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST parent_item = doc.add_group( - label=label, name=f"list", parent=parent_item + label=label, name="list", parent=parent_item ) elif ( @@ -320,7 +319,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): self._html_blocks += 1 self._process_inline_text(parent_item, doc) self._close_table(doc) - _log.debug("HTML Block: {}".format(element)) + _log.debug(f"HTML Block: {element}") if ( len(element.body) > 0 ): # If Marko doesn't return any content for HTML block, skip it @@ -332,7 +331,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): else: if not isinstance(element, str): self._close_table(doc) - _log.debug("Some other element: {}".format(element)) + _log.debug(f"Some other element: {element}") processed_block_types = ( marko.block.Heading, @@ -398,7 +397,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): # if HTML blocks were detected, export to HTML and delegate to HTML backend if self._html_blocks > 0: - # export to HTML html_backend_cls = HTMLDocumentBackend html_str = doc.export_to_html() diff --git a/docling/backend/msexcel_backend.py b/docling/backend/msexcel_backend.py index 971b93c..0ae9083 100644 --- a/docling/backend/msexcel_backend.py +++ b/docling/backend/msexcel_backend.py @@ -184,7 +184,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken """ if self.workbook is not None: - # Iterate over all sheets for sheet_name in self.workbook.sheetnames: _log.info(f"Processing sheet: {sheet_name}") @@ -253,7 +252,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken ) for excel_cell in excel_table.data: - cell = TableCell( text=excel_cell.text, row_span=excel_cell.row_span, @@ -303,7 +301,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken # Iterate over all cells in the sheet for ri, row in enumerate(sheet.iter_rows(values_only=False)): for rj, cell in enumerate(row): - # Skip empty or already visited cells if cell.value is None or (ri, rj) in visited: continue @@ -342,7 +339,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken visited_cells: set[tuple[int, int]] = set() for ri in range(start_row, max_row + 1): for rj in range(start_col, max_col + 
1): - cell = sheet.cell(row=ri + 1, column=rj + 1) # 1-based indexing # Check if the cell belongs to a merged range @@ -350,14 +346,12 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken col_span = 1 for merged_range in sheet.merged_cells.ranges: - if ( merged_range.min_row <= ri + 1 and ri + 1 <= merged_range.max_row and merged_range.min_col <= rj + 1 and rj + 1 <= merged_range.max_col ): - row_span = merged_range.max_row - merged_range.min_row + 1 col_span = merged_range.max_col - merged_range.min_col + 1 break @@ -499,7 +493,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken ), ), ) - except: + except Exception: _log.error("could not extract the image from excel sheets") return doc diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index 2de0da1..3b9a6bb 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -120,13 +120,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB return prov - def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size): + def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size): # noqa: C901 is_a_list = False is_list_group_created = False enum_list_item_value = 0 new_list = None bullet_type = "None" - list_text = "" list_label = GroupLabel.LIST doc_label = DocItemLabel.LIST_ITEM prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size) @@ -243,7 +242,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB enum_marker = str(enum_list_item_value) + "." if not is_list_group_created: new_list = doc.add_group( - label=list_label, name=f"list", parent=parent_slide + label=list_label, name="list", parent=parent_slide ) is_list_group_created = True doc.add_list_item( @@ -368,11 +367,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB slide_width = pptx_obj.slide_width slide_height = pptx_obj.slide_height - text_content = [] # type: ignore - max_levels = 10 parents = {} # type: ignore - for i in range(0, max_levels): + for i in range(max_levels): parents[i] = None # Loop through each slide @@ -383,7 +380,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB ) slide_size = Size(width=slide_width, height=slide_height) - parent_page = doc.add_page(page_no=slide_ind + 1, size=slide_size) + doc.add_page(page_no=slide_ind + 1, size=slide_size) def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size): handle_groups(shape, parent_slide, slide_ind, doc, slide_size) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 5915c0a..a108361 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -158,7 +158,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): def _get_level(self) -> int: """Return the first None index.""" for k, v in self.parents.items(): - if k >= 0 and v == None: + if k >= 0 and v is None: return k return 0 @@ -418,7 +418,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): else prev_parent ) - def _handle_text_elements( + def _handle_text_elements( # noqa: C901 self, element: BaseOxmlElement, docx_obj: DocxDocument, @@ -812,7 +812,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}" ) if cell is None or cell._tc in cell_set: - 
_log.debug(f" skipped since repeated content") + _log.debug(" skipped since repeated content") col_idx += cell.grid_span continue else: @@ -879,7 +879,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): image=ImageRef.from_pil(image=pil_image, dpi=72), caption=None, ) - except (UnidentifiedImageError, OSError) as e: + except (UnidentifiedImageError, OSError): _log.warning("Warning: image cannot be loaded by Pillow") doc.add_picture( parent=self.parents[level - 1], diff --git a/docling/backend/pdf_backend.py b/docling/backend/pdf_backend.py index cfecc7e..3d07578 100644 --- a/docling/backend/pdf_backend.py +++ b/docling/backend/pdf_backend.py @@ -1,7 +1,8 @@ from abc import ABC, abstractmethod +from collections.abc import Iterable from io import BytesIO from pathlib import Path -from typing import Iterable, Optional, Set, Union +from typing import Optional, Set, Union from docling_core.types.doc import BoundingBox, Size from docling_core.types.doc.page import SegmentedPdfPage, TextCell diff --git a/docling/backend/pypdfium2_backend.py b/docling/backend/pypdfium2_backend.py index 0fce0f8..67e1f05 100644 --- a/docling/backend/pypdfium2_backend.py +++ b/docling/backend/pypdfium2_backend.py @@ -1,8 +1,9 @@ import logging import random +from collections.abc import Iterable from io import BytesIO from pathlib import Path -from typing import TYPE_CHECKING, Iterable, List, Optional, Union +from typing import TYPE_CHECKING, List, Optional, Union import pypdfium2 as pdfium import pypdfium2.raw as pdfium_c @@ -29,7 +30,7 @@ class PyPdfiumPageBackend(PdfPageBackend): self.valid = True # No better way to tell from pypdfium. try: self._ppage: pdfium.PdfPage = pdfium_doc[page_no] - except PdfiumError as e: + except PdfiumError: _log.info( f"An exception occurred when loading page {page_no} of document {document_hash}.", exc_info=True, @@ -225,7 +226,6 @@ class PyPdfiumPageBackend(PdfPageBackend): def get_page_image( self, scale: float = 1, cropbox: Optional[BoundingBox] = None ) -> Image.Image: - page_size = self.get_size() if not cropbox: diff --git a/docling/backend/xml/jats_backend.py b/docling/backend/xml/jats_backend.py index 2409961..23560d3 100755 --- a/docling/backend/xml/jats_backend.py +++ b/docling/backend/xml/jats_backend.py @@ -102,13 +102,13 @@ class JatsDocumentBackend(DeclarativeDocumentBackend): doc_info: etree.DocInfo = self.tree.docinfo if doc_info.system_url and any( - [kwd in doc_info.system_url for kwd in JATS_DTD_URL] + kwd in doc_info.system_url for kwd in JATS_DTD_URL ): self.valid = True return for ent in doc_info.internalDTD.iterentities(): if ent.system_url and any( - [kwd in ent.system_url for kwd in JATS_DTD_URL] + kwd in ent.system_url for kwd in JATS_DTD_URL ): self.valid = True return @@ -232,10 +232,9 @@ class JatsDocumentBackend(DeclarativeDocumentBackend): # TODO: once superscript is supported, add label with formatting aff = aff.removeprefix(f"{label[0].text}, ") affiliation_names.append(aff) - affiliation_ids_names = { - id: name - for id, name in zip(meta.xpath(".//aff[@id]/@id"), affiliation_names) - } + affiliation_ids_names = dict( + zip(meta.xpath(".//aff[@id]/@id"), affiliation_names) + ) # Get author names and affiliation names for author_node in meta.xpath( @@ -300,7 +299,6 @@ class JatsDocumentBackend(DeclarativeDocumentBackend): def _add_abstract( self, doc: DoclingDocument, xml_components: XMLComponents ) -> None: - for abstract in xml_components["abstract"]: text: str = abstract["content"] title: str = abstract["label"] or DEFAULT_HEADER_ABSTRACT 
@@ -349,7 +347,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend): return - def _parse_element_citation(self, node: etree._Element) -> str: + def _parse_element_citation(self, node: etree._Element) -> str: # noqa: C901 citation: Citation = { "author_names": "", "title": "", @@ -440,7 +438,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend): citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip() if len(node.xpath("lpage")) > 0: citation["page"] += ( - "–" + node.xpath("lpage")[0].text.replace("\n", " ").strip() + "–" + node.xpath("lpage")[0].text.replace("\n", " ").strip() # noqa: RUF001 ) # Flatten the citation to string @@ -595,9 +593,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend): try: self._add_table(doc, parent, table) - except Exception as e: - _log.warning(f"Skipping unsupported table in {str(self.file)}") - pass + except Exception: + _log.warning(f"Skipping unsupported table in {self.file!s}") return @@ -609,7 +606,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend): ) return - def _walk_linear( + def _walk_linear( # noqa: C901 self, doc: DoclingDocument, parent: NodeItem, node: etree._Element ) -> str: skip_tags = ["term"] diff --git a/docling/backend/xml/uspto_backend.py b/docling/backend/xml/uspto_backend.py index f3fb1ca..b0f8031 100644 --- a/docling/backend/xml/uspto_backend.py +++ b/docling/backend/xml/uspto_backend.py @@ -122,7 +122,6 @@ class PatentUsptoDocumentBackend(DeclarativeDocumentBackend): @override def convert(self) -> DoclingDocument: - if self.parser is not None: doc = self.parser.parse(self.patent_content) if doc is None: @@ -163,7 +162,6 @@ class PatentUspto(ABC): Returns: The patent parsed as a docling document. """ - pass class PatentUsptoIce(PatentUspto): @@ -265,7 +263,7 @@ class PatentUsptoIce(PatentUspto): self.style_html = HtmlEntity() @override - def startElement(self, tag, attributes): # noqa: N802 + def startElement(self, tag, attributes): """Signal the start of an element. Args: @@ -281,7 +279,7 @@ class PatentUsptoIce(PatentUspto): self._start_registered_elements(tag, attributes) @override - def skippedEntity(self, name): # noqa: N802 + def skippedEntity(self, name): """Receive notification of a skipped entity. HTML entities will be skipped by the parser. This method will unescape them @@ -315,7 +313,7 @@ class PatentUsptoIce(PatentUspto): self.text += unescaped @override - def endElement(self, tag): # noqa: N802 + def endElement(self, tag): """Signal the end of an element. Args: @@ -603,7 +601,7 @@ class PatentUsptoGrantV2(PatentUspto): self.style_html = HtmlEntity() @override - def startElement(self, tag, attributes): # noqa: N802 + def startElement(self, tag, attributes): """Signal the start of an element. Args: @@ -616,7 +614,7 @@ class PatentUsptoGrantV2(PatentUspto): self._start_registered_elements(tag, attributes) @override - def skippedEntity(self, name): # noqa: N802 + def skippedEntity(self, name): """Receive notification of a skipped entity. HTML entities will be skipped by the parser. This method will unescape them @@ -650,7 +648,7 @@ class PatentUsptoGrantV2(PatentUspto): self.text += unescaped @override - def endElement(self, tag): # noqa: N802 + def endElement(self, tag): """Signal the end of an element. 
Args: @@ -691,7 +689,7 @@ class PatentUsptoGrantV2(PatentUspto): if tag in [member.value for member in self.Element]: if ( tag == self.Element.HEADING.value - and not self.Element.SDOCL.value in self.property + and self.Element.SDOCL.value not in self.property ): level_attr: str = attributes.get("LVL", "") new_level: int = int(level_attr) if level_attr.isnumeric() else 1 @@ -743,7 +741,7 @@ class PatentUsptoGrantV2(PatentUspto): # headers except claims statement elif ( self.Element.HEADING.value in self.property - and not self.Element.SDOCL.value in self.property + and self.Element.SDOCL.value not in self.property and text.strip() ): self.parents[self.level + 1] = self.doc.add_heading( @@ -1164,7 +1162,7 @@ class PatentUsptoAppV1(PatentUspto): self.style_html = HtmlEntity() @override - def startElement(self, tag, attributes): # noqa: N802 + def startElement(self, tag, attributes): """Signal the start of an element. Args: @@ -1177,7 +1175,7 @@ class PatentUsptoAppV1(PatentUspto): self._start_registered_elements(tag, attributes) @override - def skippedEntity(self, name): # noqa: N802 + def skippedEntity(self, name): """Receive notification of a skipped entity. HTML entities will be skipped by the parser. This method will unescape them @@ -1211,7 +1209,7 @@ class PatentUsptoAppV1(PatentUspto): self.text += unescaped @override - def endElement(self, tag): # noqa: N802 + def endElement(self, tag): """Signal the end of an element. Args: @@ -1474,9 +1472,7 @@ class XmlTable: if cw == 0: offset_w0.append(col["offset"][ic]) - min_colinfo["offset"] = sorted( - list(set(col["offset"] + min_colinfo["offset"])) - ) + min_colinfo["offset"] = sorted(set(col["offset"] + min_colinfo["offset"])) # add back the 0 width cols to offset list offset_w0 = list(set(offset_w0)) @@ -1527,7 +1523,7 @@ class XmlTable: return ncols_max - def _parse_table(self, table: Tag) -> TableData: + def _parse_table(self, table: Tag) -> TableData: # noqa: C901 """Parse the content of a table tag. 
Args: @@ -1722,7 +1718,7 @@ class HtmlEntity: "0": "⁰", "+": "⁺", "-": "⁻", - "−": "⁻", + "−": "⁻", # noqa: RUF001 "=": "⁼", "(": "⁽", ")": "⁾", @@ -1746,7 +1742,7 @@ class HtmlEntity: "0": "₀", "+": "₊", "-": "₋", - "−": "₋", + "−": "₋", # noqa: RUF001 "=": "₌", "(": "₍", ")": "₎", diff --git a/docling/cli/main.py b/docling/cli/main.py index 6830c7f..c0718c8 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -6,14 +6,16 @@ import sys import tempfile import time import warnings +from collections.abc import Iterable from pathlib import Path -from typing import Annotated, Dict, Iterable, List, Optional, Type +from typing import Annotated, Dict, List, Optional, Type import rich.table import typer from docling_core.types.doc import ImageRefMode from docling_core.utils.file import resolve_source_to_path from pydantic import TypeAdapter +from rich.console import Console from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend @@ -53,7 +55,6 @@ warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic| warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr") _log = logging.getLogger(__name__) -from rich.console import Console console = Console() err_console = Console(stderr=True) @@ -160,7 +161,6 @@ def export_documents( export_doctags: bool, image_export_mode: ImageRefMode, ): - success_count = 0 failure_count = 0 @@ -233,7 +233,7 @@ def _split_list(raw: Optional[str]) -> Optional[List[str]]: @app.command(no_args_is_help=True) -def convert( +def convert( # noqa: C901 input_sources: Annotated[ List[str], typer.Argument( @@ -289,7 +289,7 @@ def convert( ..., help=( f"The OCR engine to use. When --allow-external-plugins is *not* set, the available values are: " - f"{', '.join((o.value for o in ocr_engines_enum_internal))}. " + f"{', '.join(o.value for o in ocr_engines_enum_internal)}. " f"Use the option --show-external-plugins to see the options allowed with external plugins." ), ), @@ -430,7 +430,7 @@ def convert( settings.debug.visualize_ocr = debug_visualize_ocr if from_formats is None: - from_formats = [e for e in InputFormat] + from_formats = list(InputFormat) parsed_headers: Optional[Dict[str, str]] = None if headers is not None: diff --git a/docling/cli/models.py b/docling/cli/models.py index 7bc313c..982bbdd 100644 --- a/docling/cli/models.py +++ b/docling/cli/models.py @@ -62,7 +62,7 @@ def download( models: Annotated[ Optional[list[_AvailableModels]], typer.Argument( - help=f"Models to download (default behavior: a predefined set of models will be downloaded).", + help="Models to download (default behavior: a predefined set of models will be downloaded).", ), ] = None, all: Annotated[ @@ -89,14 +89,13 @@ def download( "Cannot simultaneously set 'all' parameter and specify models to download." 
) if not quiet: - FORMAT = "%(message)s" logging.basicConfig( level=logging.INFO, format="[blue]%(message)s[/blue]", datefmt="[%X]", handlers=[RichHandler(show_level=False, show_time=False, markup=True)], ) - to_download = models or ([m for m in _AvailableModels] if all else _default_models) + to_download = models or (list(_AvailableModels) if all else _default_models) output_dir = download_models( output_dir=output_dir, force=force, diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 8ee53d6..95dcfe7 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -10,7 +10,9 @@ from docling_core.types.doc import ( TableCell, ) from docling_core.types.doc.page import SegmentedPdfPage, TextCell -from docling_core.types.io import ( # DO ΝΟΤ REMOVE; explicitly exposed from this location + +# DO NOT REMOVE; explicitly exposed from this location +from docling_core.types.io import ( DocumentStream, ) from PIL.Image import Image @@ -233,9 +235,9 @@ class Page(BaseModel): None # Internal PDF backend. By default it is cleared during assembling. ) _default_image_scale: float = 1.0 # Default image scale for external usage. - _image_cache: Dict[float, Image] = ( - {} - ) # Cache of images in different scales. By default it is cleared during assembling. + _image_cache: Dict[ + float, Image + ] = {} # Cache of images in different scales. By default it is cleared during assembling. def get_image( self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None @@ -243,7 +245,7 @@ class Page(BaseModel): if self._backend is None: return self._image_cache.get(scale, None) - if not scale in self._image_cache: + if scale not in self._image_cache: if cropbox is None: self._image_cache[scale] = self._backend.get_page_image(scale=scale) else: diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index 93dfd1a..668e824 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -1,13 +1,13 @@ import csv import logging import re +from collections.abc import Iterable from enum import Enum from io import BytesIO from pathlib import Path, PurePath from typing import ( TYPE_CHECKING, Dict, - Iterable, List, Literal, Optional, @@ -17,6 +17,8 @@ from typing import ( ) import filetype + +# DO NOT REMOVE; explicitly exposed from this location from docling_core.types.doc import ( DocItem, DocItemLabel, @@ -35,14 +37,14 @@ from docling_core.types.legacy_doc.base import ( PageReference, Prov, Ref, + Table as DsSchemaTable, + TableCell, ) -from docling_core.types.legacy_doc.base import Table as DsSchemaTable -from docling_core.types.legacy_doc.base import TableCell from docling_core.types.legacy_doc.document import ( CCSDocumentDescription as DsDocumentDescription, + CCSFileInfoObject as DsFileInfoObject, + ExportedCCSDocument as DsDocument, ) -from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject -from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument from docling_core.utils.file import resolve_source_to_stream from docling_core.utils.legacy import docling_document_to_legacy from pydantic import BaseModel @@ -65,7 +67,7 @@ from docling.datamodel.base_models import ( ) from docling.datamodel.settings import DocumentLimits from docling.utils.profiling import ProfilingItem -from docling.utils.utils import create_file_hash, create_hash +from docling.utils.utils import create_file_hash if TYPE_CHECKING: from docling.document_converter import FormatOption @@ -134,9 
+136,9 @@ class InputDocument(BaseModel): self._init_doc(backend, path_or_stream) elif isinstance(path_or_stream, BytesIO): - assert ( - filename is not None - ), "Can't construct InputDocument from stream without providing filename arg." + assert filename is not None, ( + "Can't construct InputDocument from stream without providing filename arg." + ) self.file = PurePath(filename) self.filesize = path_or_stream.getbuffer().nbytes @@ -228,7 +230,6 @@ class _DummyBackend(AbstractDocumentBackend): class _DocumentConversionInput(BaseModel): - path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]] headers: Optional[Dict[str, str]] = None limits: Optional[DocumentLimits] = DocumentLimits() diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 8e99cd0..a24df89 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -380,7 +380,6 @@ class PaginatedPipelineOptions(PipelineOptions): class VlmPipelineOptions(PaginatedPipelineOptions): - generate_page_images: bool = True force_backend_text: bool = ( False # (To be used with vlms, or other generative models) diff --git a/docling/document_converter.py b/docling/document_converter.py index 7489f49..125681f 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -1,11 +1,11 @@ import hashlib import logging -import math import sys import time +from collections.abc import Iterable, Iterator from functools import partial from pathlib import Path -from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union +from typing import Dict, List, Optional, Tuple, Type, Union from pydantic import BaseModel, ConfigDict, model_validator, validate_call @@ -172,7 +172,7 @@ class DocumentConverter: format_options: Optional[Dict[InputFormat, FormatOption]] = None, ): self.allowed_formats = ( - allowed_formats if allowed_formats is not None else [e for e in InputFormat] + allowed_formats if allowed_formats is not None else list(InputFormat) ) self.format_to_options = { format: ( @@ -254,7 +254,7 @@ class DocumentConverter: if not had_result and raises_on_error: raise ConversionError( - f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats." + "Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats." 
) def _convert( @@ -266,7 +266,7 @@ class DocumentConverter: conv_input.docs(self.format_to_options), settings.perf.doc_batch_size, # pass format_options ): - _log.info(f"Going to convert document batch...") + _log.info("Going to convert document batch...") # parallel processing only within input_batch # with ThreadPoolExecutor( diff --git a/docling/models/api_vlm_model.py b/docling/models/api_vlm_model.py index 9520122..f7e82b5 100644 --- a/docling/models/api_vlm_model.py +++ b/docling/models/api_vlm_model.py @@ -1,4 +1,4 @@ -from typing import Iterable +from collections.abc import Iterable from docling.datamodel.base_models import Page, VlmPrediction from docling.datamodel.document import ConversionResult @@ -10,7 +10,6 @@ from docling.utils.profiling import TimeRecorder class ApiVlmModel(BasePageModel): - def __init__( self, enabled: bool, diff --git a/docling/models/base_model.py b/docling/models/base_model.py index 712d329..04df812 100644 --- a/docling/models/base_model.py +++ b/docling/models/base_model.py @@ -1,5 +1,6 @@ from abc import ABC, abstractmethod -from typing import Any, Generic, Iterable, Optional, Protocol, Type +from collections.abc import Iterable +from typing import Generic, Optional, Protocol, Type from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem from typing_extensions import TypeVar @@ -29,7 +30,6 @@ EnrichElementT = TypeVar("EnrichElementT", default=NodeItem) class GenericEnrichmentModel(ABC, Generic[EnrichElementT]): - elements_batch_size: int = settings.perf.elements_batch_size @abstractmethod @@ -50,7 +50,6 @@ class GenericEnrichmentModel(ABC, Generic[EnrichElementT]): class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]): - def prepare_element( self, conv_res: ConversionResult, element: NodeItem ) -> Optional[NodeItem]: @@ -62,7 +61,6 @@ class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]): class BaseItemAndImageEnrichmentModel( GenericEnrichmentModel[ItemAndImageEnrichmentElement] ): - images_scale: float expansion_factor: float = 0.0 diff --git a/docling/models/base_ocr_model.py b/docling/models/base_ocr_model.py index c823580..9f05aed 100644 --- a/docling/models/base_ocr_model.py +++ b/docling/models/base_ocr_model.py @@ -1,12 +1,12 @@ import copy import logging from abc import abstractmethod +from collections.abc import Iterable from pathlib import Path -from typing import Iterable, List, Optional, Type +from typing import List, Optional, Type import numpy as np from docling_core.types.doc import BoundingBox, CoordOrigin -from docling_core.types.doc.page import BoundingRectangle, PdfTextCell, TextCell from PIL import Image, ImageDraw from rtree import index from scipy.ndimage import binary_dilation, find_objects, label diff --git a/docling/models/code_formula_model.py b/docling/models/code_formula_model.py index 10426c2..bf747c5 100644 --- a/docling/models/code_formula_model.py +++ b/docling/models/code_formula_model.py @@ -1,7 +1,8 @@ import re from collections import Counter +from collections.abc import Iterable from pathlib import Path -from typing import Iterable, List, Literal, Optional, Tuple, Union +from typing import List, Literal, Optional, Tuple, Union import numpy as np from docling_core.types.doc import ( diff --git a/docling/models/document_picture_classifier.py b/docling/models/document_picture_classifier.py index f51d735..6a57a74 100644 --- a/docling/models/document_picture_classifier.py +++ b/docling/models/document_picture_classifier.py @@ -1,5 +1,6 @@ +from collections.abc import Iterable from 
pathlib import Path -from typing import Iterable, List, Literal, Optional, Tuple, Union +from typing import List, Literal, Optional, Union import numpy as np from docling_core.types.doc import ( diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py index 13eb33c..b40ca50 100644 --- a/docling/models/easyocr_model.py +++ b/docling/models/easyocr_model.py @@ -1,8 +1,9 @@ import logging import warnings import zipfile +from collections.abc import Iterable from pathlib import Path -from typing import Iterable, List, Optional, Type +from typing import List, Optional, Type import numpy from docling_core.types.doc import BoundingBox, CoordOrigin @@ -58,12 +59,10 @@ class EasyOcrModel(BaseOcrModel): device = decide_device(accelerator_options.device) # Enable easyocr GPU if running on CUDA, MPS use_gpu = any( - [ - device.startswith(x) - for x in [ - AcceleratorDevice.CUDA.value, - AcceleratorDevice.MPS.value, - ] + device.startswith(x) + for x in [ + AcceleratorDevice.CUDA.value, + AcceleratorDevice.MPS.value, ] ) else: @@ -98,8 +97,10 @@ class EasyOcrModel(BaseOcrModel): progress: bool = False, ) -> Path: # Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py - from easyocr.config import detection_models as det_models_dict - from easyocr.config import recognition_models as rec_models_dict + from easyocr.config import ( + detection_models as det_models_dict, + recognition_models as rec_models_dict, + ) if local_dir is None: local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder @@ -126,13 +127,11 @@ class EasyOcrModel(BaseOcrModel): def __call__( self, conv_res: ConversionResult, page_batch: Iterable[Page] ) -> Iterable[Page]: - if not self.enabled: yield from page_batch return for page in page_batch: - assert page._backend is not None if not page._backend.is_valid(): yield page diff --git a/docling/models/factories/__init__.py b/docling/models/factories/__init__.py index 9a3308e..a6adb3f 100644 --- a/docling/models/factories/__init__.py +++ b/docling/models/factories/__init__.py @@ -9,7 +9,7 @@ from docling.models.factories.picture_description_factory import ( logger = logging.getLogger(__name__) -@lru_cache() +@lru_cache def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory: factory = OcrFactory() factory.load_from_plugins(allow_external_plugins=allow_external_plugins) @@ -17,7 +17,7 @@ def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory: return factory -@lru_cache() +@lru_cache def get_picture_description_factory( allow_external_plugins: bool = False, ) -> PictureDescriptionFactory: diff --git a/docling/models/factories/base_factory.py b/docling/models/factories/base_factory.py index 542fc7e..208f0ca 100644 --- a/docling/models/factories/base_factory.py +++ b/docling/models/factories/base_factory.py @@ -33,7 +33,7 @@ class BaseFactory(Generic[A], metaclass=ABCMeta): @property def registered_kind(self) -> list[str]: - return list(opt.kind for opt in self._classes.keys()) + return [opt.kind for opt in self._classes.keys()] def get_enum(self) -> enum.Enum: return enum.Enum( diff --git a/docling/models/hf_mlx_model.py b/docling/models/hf_mlx_model.py index 762a655..63f8fc9 100644 --- a/docling/models/hf_mlx_model.py +++ b/docling/models/hf_mlx_model.py @@ -1,25 +1,22 @@ import logging import time +from collections.abc import Iterable from pathlib import Path -from typing import Iterable, List, Optional +from typing import Optional from docling.datamodel.base_models import Page, 
VlmPrediction from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( - AcceleratorDevice, AcceleratorOptions, HuggingFaceVlmOptions, ) -from docling.datamodel.settings import settings from docling.models.base_model import BasePageModel -from docling.utils.accelerator_utils import decide_device from docling.utils.profiling import TimeRecorder _log = logging.getLogger(__name__) class HuggingFaceMlxModel(BasePageModel): - def __init__( self, enabled: bool, @@ -32,7 +29,6 @@ class HuggingFaceMlxModel(BasePageModel): self.vlm_options = vlm_options if self.enabled: - try: from mlx_vlm import generate, load # type: ignore from mlx_vlm.prompt_utils import apply_chat_template # type: ignore @@ -125,6 +121,8 @@ class HuggingFaceMlxModel(BasePageModel): generation_time = time.time() - start_time page_tags = output + _log.debug(f"Generation time {generation_time:.2f} seconds.") + # inference_time = time.time() - start_time # tokens_per_second = num_tokens / generation_time # print("") diff --git a/docling/models/hf_vlm_model.py b/docling/models/hf_vlm_model.py index 2acbe29..29276fc 100644 --- a/docling/models/hf_vlm_model.py +++ b/docling/models/hf_vlm_model.py @@ -1,16 +1,15 @@ import logging import time +from collections.abc import Iterable from pathlib import Path -from typing import Iterable, List, Optional +from typing import Optional from docling.datamodel.base_models import Page, VlmPrediction from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( - AcceleratorDevice, AcceleratorOptions, HuggingFaceVlmOptions, ) -from docling.datamodel.settings import settings from docling.models.base_model import BasePageModel from docling.utils.accelerator_utils import decide_device from docling.utils.profiling import TimeRecorder @@ -19,7 +18,6 @@ _log = logging.getLogger(__name__) class HuggingFaceVlmModel(BasePageModel): - def __init__( self, enabled: bool, @@ -42,7 +40,7 @@ class HuggingFaceVlmModel(BasePageModel): device = decide_device(accelerator_options.device) self.device = device - _log.debug("Available device for HuggingFace VLM: {}".format(device)) + _log.debug(f"Available device for HuggingFace VLM: {device}") repo_cache_folder = vlm_options.repo_id.replace("/", "--") @@ -168,6 +166,10 @@ class HuggingFaceVlmModel(BasePageModel): num_tokens = len(generated_ids[0]) page_tags = generated_texts + _log.debug( + f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds." 
+ ) + # inference_time = time.time() - start_time # tokens_per_second = num_tokens / generation_time # print("") diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py index b3cbd95..ae37301 100644 --- a/docling/models/layout_model.py +++ b/docling/models/layout_model.py @@ -1,8 +1,9 @@ import copy import logging import warnings +from collections.abc import Iterable from pathlib import Path -from typing import Iterable, Optional, Union +from typing import Optional from docling_core.types.doc import DocItemLabel from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor @@ -142,7 +143,6 @@ class LayoutModel(BasePageModel): def __call__( self, conv_res: ConversionResult, page_batch: Iterable[Page] ) -> Iterable[Page]: - for page in page_batch: assert page._backend is not None if not page._backend.is_valid(): diff --git a/docling/models/ocr_mac_model.py b/docling/models/ocr_mac_model.py index 98ca3f1..a8ff55b 100644 --- a/docling/models/ocr_mac_model.py +++ b/docling/models/ocr_mac_model.py @@ -1,8 +1,9 @@ import logging import sys import tempfile +from collections.abc import Iterable from pathlib import Path -from typing import Iterable, Optional, Tuple, Type +from typing import Optional, Type from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc.page import BoundingRectangle, TextCell @@ -41,7 +42,7 @@ class OcrMacModel(BaseOcrModel): if self.enabled: if "darwin" != sys.platform: - raise RuntimeError(f"OcrMac is only supported on Mac.") + raise RuntimeError("OcrMac is only supported on Mac.") install_errmsg = ( "ocrmac is not correctly installed. " "Please install it via `pip install ocrmac` to use this OCR engine. " @@ -58,7 +59,6 @@ class OcrMacModel(BaseOcrModel): def __call__( self, conv_res: ConversionResult, page_batch: Iterable[Page] ) -> Iterable[Page]: - if not self.enabled: yield from page_batch return @@ -69,7 +69,6 @@ class OcrMacModel(BaseOcrModel): yield page else: with TimeRecorder(conv_res, "ocr"): - ocr_rects = self.get_ocr_rects(page) all_ocr_cells = [] diff --git a/docling/models/page_assemble_model.py b/docling/models/page_assemble_model.py index 4712abd..7153181 100644 --- a/docling/models/page_assemble_model.py +++ b/docling/models/page_assemble_model.py @@ -1,6 +1,7 @@ import logging import re -from typing import Iterable, List +from collections.abc import Iterable +from typing import List from pydantic import BaseModel @@ -53,9 +54,9 @@ class PageAssembleModel(BasePageModel): sanitized_text = "".join(lines) # Text normalization - sanitized_text = sanitized_text.replace("⁄", "/") - sanitized_text = sanitized_text.replace("’", "'") - sanitized_text = sanitized_text.replace("‘", "'") + sanitized_text = sanitized_text.replace("⁄", "/") # noqa: RUF001 + sanitized_text = sanitized_text.replace("’", "'") # noqa: RUF001 + sanitized_text = sanitized_text.replace("‘", "'") # noqa: RUF001 sanitized_text = sanitized_text.replace("“", '"') sanitized_text = sanitized_text.replace("”", '"') sanitized_text = sanitized_text.replace("•", "·") @@ -71,7 +72,6 @@ class PageAssembleModel(BasePageModel): yield page else: with TimeRecorder(conv_res, "page_assemble"): - assert page.predictions.layout is not None # assembles some JSON output page by page. 
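Reviewer note on the sanitization block above: the added # noqa: RUF001 markers tell Ruff that the fraction slash and the curly quotes are visually ambiguous Unicode characters being replaced on purpose, not typos for their ASCII look-alikes. If this replace() chain keeps growing, a single-pass str.translate table would be an equivalent alternative; a sketch under the assumption that only one-to-one character mappings are needed, with illustrative names that are not part of this patch:

    # Sketch: one-pass equivalent of the .replace() chain in PageAssembleModel.
    # Escape sequences keep the mapping itself unambiguous to linters.
    NORMALIZATION_TABLE = str.maketrans({
        "\u2044": "/",        # fraction slash
        "\u2019": "'",        # right single quotation mark
        "\u2018": "'",        # left single quotation mark
        "\u201c": '"',        # left double quotation mark
        "\u201d": '"',        # right double quotation mark
        "\u2022": "\u00b7",   # bullet -> middle dot
    })

    def sanitize(text: str) -> str:
        return text.translate(NORMALIZATION_TABLE)

    print(sanitize("\u2018quoted\u2019 \u2022 1\u20442"))  # prints: 'quoted' · 1/2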
@@ -83,7 +83,6 @@ class PageAssembleModel(BasePageModel): for cluster in page.predictions.layout.clusters: # _log.info("Cluster label seen:", cluster.label) if cluster.label in LayoutModel.TEXT_ELEM_LABELS: - textlines = [ cell.text.replace("\x02", "-").strip() for cell in cluster.cells @@ -109,9 +108,7 @@ class PageAssembleModel(BasePageModel): tbl = page.predictions.tablestructure.table_map.get( cluster.id, None ) - if ( - not tbl - ): # fallback: add table without structure, if it isn't present + if not tbl: # fallback: add table without structure, if it isn't present tbl = Table( label=cluster.label, id=cluster.id, @@ -130,9 +127,7 @@ class PageAssembleModel(BasePageModel): fig = page.predictions.figures_classification.figure_map.get( cluster.id, None ) - if ( - not fig - ): # fallback: add figure without classification, if it isn't present + if not fig: # fallback: add figure without classification, if it isn't present fig = FigureElement( label=cluster.label, id=cluster.id, diff --git a/docling/models/page_preprocessing_model.py b/docling/models/page_preprocessing_model.py index d1b29e3..b45b189 100644 --- a/docling/models/page_preprocessing_model.py +++ b/docling/models/page_preprocessing_model.py @@ -1,5 +1,6 @@ +from collections.abc import Iterable from pathlib import Path -from typing import Iterable, Optional +from typing import Optional from PIL import ImageDraw from pydantic import BaseModel diff --git a/docling/models/picture_description_api_model.py b/docling/models/picture_description_api_model.py index 1aa7351..44bb5e2 100644 --- a/docling/models/picture_description_api_model.py +++ b/docling/models/picture_description_api_model.py @@ -1,5 +1,6 @@ +from collections.abc import Iterable from pathlib import Path -from typing import Iterable, Optional, Type, Union +from typing import Optional, Type, Union from PIL import Image diff --git a/docling/models/picture_description_base_model.py b/docling/models/picture_description_base_model.py index 9616922..2f6e647 100644 --- a/docling/models/picture_description_base_model.py +++ b/docling/models/picture_description_base_model.py @@ -1,12 +1,11 @@ -import logging from abc import abstractmethod +from collections.abc import Iterable from pathlib import Path -from typing import Any, Iterable, List, Optional, Type, Union +from typing import List, Optional, Type, Union from docling_core.types.doc import ( DoclingDocument, NodeItem, - PictureClassificationClass, PictureItem, ) from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc diff --git a/docling/models/picture_description_vlm_model.py b/docling/models/picture_description_vlm_model.py index fc5c51e..374f575 100644 --- a/docling/models/picture_description_vlm_model.py +++ b/docling/models/picture_description_vlm_model.py @@ -1,5 +1,6 @@ +from collections.abc import Iterable from pathlib import Path -from typing import Iterable, Optional, Type, Union +from typing import Optional, Type, Union from PIL import Image @@ -13,7 +14,6 @@ from docling.utils.accelerator_utils import decide_device class PictureDescriptionVlmModel(PictureDescriptionBaseModel): - @classmethod def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]: return PictureDescriptionVlmOptions @@ -36,7 +36,6 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel): self.options: PictureDescriptionVlmOptions if self.enabled: - if artifacts_path is None: artifacts_path = self.download_models(repo_id=self.options.repo_id) else: diff --git 
a/docling/models/rapid_ocr_model.py b/docling/models/rapid_ocr_model.py index e21974d..2c7f435 100644 --- a/docling/models/rapid_ocr_model.py +++ b/docling/models/rapid_ocr_model.py @@ -1,6 +1,7 @@ import logging +from collections.abc import Iterable from pathlib import Path -from typing import Iterable, Optional, Type +from typing import Optional, Type import numpy from docling_core.types.doc import BoundingBox, CoordOrigin @@ -74,13 +75,11 @@ class RapidOcrModel(BaseOcrModel): def __call__( self, conv_res: ConversionResult, page_batch: Iterable[Page] ) -> Iterable[Page]: - if not self.enabled: yield from page_batch return for page in page_batch: - assert page._backend is not None if not page._backend.is_valid(): yield page diff --git a/docling/models/readingorder_model.py b/docling/models/readingorder_model.py index e7bdd1a..4373536 100644 --- a/docling/models/readingorder_model.py +++ b/docling/models/readingorder_model.py @@ -1,12 +1,7 @@ -import copy -import random from pathlib import Path from typing import Dict, List from docling_core.types.doc import ( - BoundingBox, - CoordOrigin, - DocItem, DocItemLabel, DoclingDocument, DocumentOrigin, @@ -17,13 +12,10 @@ from docling_core.types.doc import ( TableData, ) from docling_core.types.doc.document import ContentLayer -from docling_core.types.legacy_doc.base import Ref -from docling_core.types.legacy_doc.document import BaseText from docling_ibm_models.reading_order.reading_order_rb import ( PageElement as ReadingOrderPageElement, + ReadingOrderPredictor, ) -from docling_ibm_models.reading_order.reading_order_rb import ReadingOrderPredictor -from PIL import ImageDraw from pydantic import BaseModel, ConfigDict from docling.datamodel.base_models import ( @@ -35,7 +27,6 @@ from docling.datamodel.base_models import ( TextElement, ) from docling.datamodel.document import ConversionResult -from docling.datamodel.settings import settings from docling.utils.profiling import ProfilingScope, TimeRecorder @@ -53,12 +44,10 @@ class ReadingOrderModel: def _assembled_to_readingorder_elements( self, conv_res: ConversionResult ) -> List[ReadingOrderPageElement]: - elements: List[ReadingOrderPageElement] = [] page_no_to_pages = {p.page_no: p for p in conv_res.pages} for element in conv_res.assembled.elements: - page_height = page_no_to_pages[element.page_no].size.height # type: ignore bbox = element.cluster.bbox.to_bottom_left_origin(page_height) text = element.text or "" @@ -84,7 +73,6 @@ class ReadingOrderModel: def _add_child_elements( self, element: BasePageElement, doc_item: NodeItem, doc: DoclingDocument ): - child: Cluster for child in element.cluster.children: c_label = child.label @@ -110,7 +98,7 @@ class ReadingOrderModel: else: doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov) - def _readingorder_elements_to_docling_doc( + def _readingorder_elements_to_docling_doc( # noqa: C901 self, conv_res: ConversionResult, ro_elements: List[ReadingOrderPageElement], @@ -118,7 +106,6 @@ class ReadingOrderModel: el_to_footnotes_mapping: Dict[int, List[int]], el_merges_mapping: Dict[int, List[int]], ) -> DoclingDocument: - id_to_elem = { RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem for elem in conv_res.assembled.elements @@ -192,7 +179,6 @@ class ReadingOrderModel: code_item.footnotes.append(new_footnote_item.get_ref()) else: - new_item, current_list = self._handle_text_element( element, out_doc, current_list, page_height ) @@ -206,7 +192,6 @@ class ReadingOrderModel: ) elif isinstance(element, Table): - tbl_data = 
TableData( num_rows=element.num_rows, num_cols=element.num_cols, @@ -342,12 +327,12 @@ class ReadingOrderModel: return new_item, current_list def _merge_elements(self, element, merged_elem, new_item, page_height): - assert isinstance( - merged_elem, type(element) - ), "Merged element must be of same type as element." - assert ( - merged_elem.label == new_item.label - ), "Labels of merged elements must match." + assert isinstance(merged_elem, type(element)), ( + "Merged element must be of same type as element." + ) + assert merged_elem.label == new_item.label, ( + "Labels of merged elements must match." + ) prov = ProvenanceItem( page_no=element.page_no + 1, charspan=( diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py index 34a7d9d..44579b9 100644 --- a/docling/models/table_structure_model.py +++ b/docling/models/table_structure_model.py @@ -1,13 +1,13 @@ import copy import warnings +from collections.abc import Iterable from pathlib import Path -from typing import Iterable, Optional, Union +from typing import Optional import numpy from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell from docling_core.types.doc.page import ( BoundingRectangle, - SegmentedPdfPage, TextCellUnit, ) from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor @@ -44,7 +44,6 @@ class TableStructureModel(BasePageModel): self.enabled = enabled if self.enabled: - if artifacts_path is None: artifacts_path = self.download_models() / self._model_path else: @@ -175,7 +174,6 @@ class TableStructureModel(BasePageModel): def __call__( self, conv_res: ConversionResult, page_batch: Iterable[Page] ) -> Iterable[Page]: - if not self.enabled: yield from page_batch return @@ -186,7 +184,6 @@ class TableStructureModel(BasePageModel): yield page else: with TimeRecorder(conv_res, "table_structure"): - assert page.predictions.layout is not None assert page.size is not None @@ -260,7 +257,6 @@ class TableStructureModel(BasePageModel): table_out = tf_output[0] table_cells = [] for element in table_out["tf_responses"]: - if not self.do_cell_matching: the_bbox = BoundingBox.model_validate( element["bbox"] diff --git a/docling/models/tesseract_ocr_cli_model.py b/docling/models/tesseract_ocr_cli_model.py index 1e7fe03..91b4555 100644 --- a/docling/models/tesseract_ocr_cli_model.py +++ b/docling/models/tesseract_ocr_cli_model.py @@ -3,9 +3,10 @@ import io import logging import os import tempfile +from collections.abc import Iterable from pathlib import Path from subprocess import DEVNULL, PIPE, Popen -from typing import Iterable, List, Optional, Tuple, Type +from typing import List, Optional, Tuple, Type import pandas as pd from docling_core.types.doc import BoundingBox, CoordOrigin @@ -63,8 +64,7 @@ class TesseractOcrCliModel(BaseOcrModel): ) def _get_name_and_version(self) -> Tuple[str, str]: - - if self._name != None and self._version != None: + if self._name is not None and self._version is not None: return self._name, self._version # type: ignore cmd = [self.options.tesseract_cmd, "--version"] @@ -125,14 +125,16 @@ class TesseractOcrCliModel(BaseOcrModel): # _log.info(decoded_data) # Read the TSV file generated by Tesseract - df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t") + df_result = pd.read_csv( + io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t" + ) # Display the dataframe (optional) # _log.info("df: ", df.head()) # Filter rows that contain actual text (ignore header or empty rows) - df_filtered = df[ - 
df["text"].notnull() & (df["text"].apply(str).str.strip() != "") + df_filtered = df_result[ + df_result["text"].notna() & (df_result["text"].apply(str).str.strip() != "") ] return df_filtered @@ -149,10 +151,10 @@ class TesseractOcrCliModel(BaseOcrModel): proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL) output, _ = proc.communicate() decoded_data = output.decode("utf-8") - df = pd.read_csv( + df_detected = pd.read_csv( io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"] ) - scripts = df.loc[df["key"] == "Script"].value.tolist() + scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist() if len(scripts) == 0: _log.warning("Tesseract cannot detect the script of the page") return None @@ -183,11 +185,11 @@ class TesseractOcrCliModel(BaseOcrModel): proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL) output, _ = proc.communicate() decoded_data = output.decode("utf-8") - df = pd.read_csv(io.StringIO(decoded_data), header=None) - self._tesseract_languages = df[0].tolist()[1:] + df_list = pd.read_csv(io.StringIO(decoded_data), header=None) + self._tesseract_languages = df_list[0].tolist()[1:] # Decide the script prefix - if any([l.startswith("script/") for l in self._tesseract_languages]): + if any(lang.startswith("script/") for lang in self._tesseract_languages): script_prefix = "script/" else: script_prefix = "" @@ -197,7 +199,6 @@ class TesseractOcrCliModel(BaseOcrModel): def __call__( self, conv_res: ConversionResult, page_batch: Iterable[Page] ) -> Iterable[Page]: - if not self.enabled: yield from page_batch return @@ -225,19 +226,19 @@ class TesseractOcrCliModel(BaseOcrModel): fname = image_file.name high_res_image.save(image_file) - df = self._run_tesseract(fname) + df_result = self._run_tesseract(fname) finally: if os.path.exists(fname): os.remove(fname) - # _log.info(df) + # _log.info(df_result) # Print relevant columns (bounding box and text) - for ix, row in df.iterrows(): + for ix, row in df_result.iterrows(): text = row["text"] conf = row["conf"] - l = float(row["left"]) + l = float(row["left"]) # noqa: E741 b = float(row["top"]) w = float(row["width"]) h = float(row["height"]) diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py index 84a02a3..fbe907c 100644 --- a/docling/models/tesseract_ocr_model.py +++ b/docling/models/tesseract_ocr_model.py @@ -1,6 +1,7 @@ import logging +from collections.abc import Iterable from pathlib import Path -from typing import Iterable, Optional, Type +from typing import Optional, Type from docling_core.types.doc import BoundingBox, CoordOrigin from docling_core.types.doc.page import BoundingRectangle, TextCell @@ -37,9 +38,6 @@ class TesseractOcrModel(BaseOcrModel): self.options: TesseractOcrOptions self.scale = 3 # multiplier for 72 dpi == 216 dpi. 
- self.reader = None - self.osd_reader = None - self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {} if self.enabled: install_errmsg = ( @@ -64,7 +62,7 @@ class TesseractOcrModel(BaseOcrModel): raise ImportError(install_errmsg) try: tesseract_version = tesserocr.tesseract_version() - except: + except Exception: raise ImportError(install_errmsg) _, self._tesserocr_languages = tesserocr.get_languages() @@ -75,7 +73,7 @@ class TesseractOcrModel(BaseOcrModel): _log.debug("Initializing TesserOCR: %s", tesseract_version) lang = "+".join(self.options.lang) - if any([l.startswith("script/") for l in self._tesserocr_languages]): + if any(lang.startswith("script/") for lang in self._tesserocr_languages): self.script_prefix = "script/" else: self.script_prefix = "" @@ -86,6 +84,10 @@ class TesseractOcrModel(BaseOcrModel): "oem": tesserocr.OEM.DEFAULT, } + self.reader = None + self.osd_reader = None + self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {} + if self.options.path is not None: tesserocr_kwargs["path"] = self.options.path diff --git a/docling/pipeline/base_pipeline.py b/docling/pipeline/base_pipeline.py index 1bf48ef..29475d6 100644 --- a/docling/pipeline/base_pipeline.py +++ b/docling/pipeline/base_pipeline.py @@ -3,9 +3,10 @@ import logging import time import traceback from abc import ABC, abstractmethod -from typing import Any, Callable, Iterable, List +from collections.abc import Iterable +from typing import Any, Callable, List -from docling_core.types.doc import DoclingDocument, NodeItem +from docling_core.types.doc import NodeItem from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend @@ -64,7 +65,6 @@ class BasePipeline(ABC): return conv_res def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult: - def _prepare_elements( conv_res: ConversionResult, model: GenericEnrichmentModel[Any] ) -> Iterable[NodeItem]: @@ -113,7 +113,6 @@ class BasePipeline(ABC): class PaginatedPipeline(BasePipeline): # TODO this is a bad name. - def __init__(self, pipeline_options: PipelineOptions): super().__init__(pipeline_options) self.keep_backend = False @@ -127,7 +126,6 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name. yield from page_batch def _build_document(self, conv_res: ConversionResult) -> ConversionResult: - if not isinstance(conv_res.input._backend, PdfDocumentBackend): raise RuntimeError( f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. " @@ -139,8 +137,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name. total_elapsed_time = 0.0 with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT): - - for i in range(0, conv_res.input.page_count): + for i in range(conv_res.input.page_count): start_page, end_page = conv_res.input.limits.page_range if (start_page - 1) <= i <= (end_page - 1): conv_res.pages.append(Page(page_no=i)) @@ -161,7 +158,6 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name. pipeline_pages = self._apply_on_pages(conv_res, init_pages) for p in pipeline_pages: # Must exhaust! 
- # Cleanup cached images if not self.keep_images: p._image_cache = {} diff --git a/docling/pipeline/simple_pipeline.py b/docling/pipeline/simple_pipeline.py index fb98523..2e8f0ea 100644 --- a/docling/pipeline/simple_pipeline.py +++ b/docling/pipeline/simple_pipeline.py @@ -24,7 +24,6 @@ class SimplePipeline(BasePipeline): super().__init__(pipeline_options) def _build_document(self, conv_res: ConversionResult) -> ConversionResult: - if not isinstance(conv_res.input._backend, DeclarativeDocumentBackend): raise RuntimeError( f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a declarative backend. " diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index ae2d918..fe93c6c 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -1,5 +1,4 @@ import logging -import sys import warnings from pathlib import Path from typing import Optional, cast diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py index 79279fd..9a7b51e 100644 --- a/docling/pipeline/vlm_pipeline.py +++ b/docling/pipeline/vlm_pipeline.py @@ -1,5 +1,4 @@ import logging -import warnings from io import BytesIO from pathlib import Path from typing import List, Optional, Union, cast @@ -32,7 +31,6 @@ _log = logging.getLogger(__name__) class VlmPipeline(PaginatedPipeline): - def __init__(self, pipeline_options: VlmPipelineOptions): super().__init__(pipeline_options) self.keep_backend = True @@ -114,7 +112,6 @@ class VlmPipeline(PaginatedPipeline): def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult: with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT): - if ( self.pipeline_options.vlm_options.response_format == ResponseFormat.DOCTAGS diff --git a/docling/utils/export.py b/docling/utils/export.py index 2c0077e..debf09f 100644 --- a/docling/utils/export.py +++ b/docling/utils/export.py @@ -1,8 +1,8 @@ import logging -from typing import Any, Dict, Iterable, List, Tuple, Union +from collections.abc import Iterable +from typing import Any, Dict, List, Tuple, Union from docling_core.types.doc import BoundingBox, CoordOrigin -from docling_core.types.doc.page import TextCell from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table from docling.datamodel.document import ConversionResult, Page @@ -13,7 +13,6 @@ _log = logging.getLogger(__name__) def generate_multimodal_pages( doc_result: ConversionResult, ) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]: - label_to_doclaynet = { "title": "title", "table-of-contents": "document_index", @@ -122,7 +121,6 @@ def generate_multimodal_pages( if doc.main_text is None: return for ix, orig_item in enumerate(doc.main_text): - item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item if item is None or item.prov is None or len(item.prov) == 0: _log.debug(f"Skipping item {orig_item}") diff --git a/docling/utils/glm_utils.py b/docling/utils/glm_utils.py index c3c4353..b67281f 100644 --- a/docling/utils/glm_utils.py +++ b/docling/utils/glm_utils.py @@ -29,7 +29,7 @@ def resolve_item(paths, obj): try: key = int(paths[0]) - except: + except Exception: key = paths[0] if len(paths) == 1: @@ -67,7 +67,7 @@ def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]: return unique_objects -def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument: +def to_docling_document(doc_glm, update_name_label=False) -> 
DoclingDocument: # noqa: C901 origin = DocumentOrigin( mimetype="application/pdf", filename=doc_glm["file-info"]["filename"], diff --git a/docling/utils/layout_postprocessor.py b/docling/utils/layout_postprocessor.py index 17d8f8b..4c25655 100644 --- a/docling/utils/layout_postprocessor.py +++ b/docling/utils/layout_postprocessor.py @@ -18,7 +18,7 @@ class UnionFind: def __init__(self, elements): self.parent = {elem: elem for elem in elements} - self.rank = {elem: 0 for elem in elements} + self.rank = dict.fromkeys(elements, 0) def find(self, x): if self.parent[x] != x: @@ -484,7 +484,9 @@ class LayoutPostprocessor: spatial_index = ( self.regular_index if cluster_type == "regular" - else self.picture_index if cluster_type == "picture" else self.wrapper_index + else self.picture_index + if cluster_type == "picture" + else self.wrapper_index ) # Map of currently valid clusters diff --git a/docling/utils/model_downloader.py b/docling/utils/model_downloader.py index 694fe04..6a1eb83 100644 --- a/docling/utils/model_downloader.py +++ b/docling/utils/model_downloader.py @@ -37,7 +37,7 @@ def download_models( output_dir.mkdir(exist_ok=True, parents=True) if with_layout: - _log.info(f"Downloading layout model...") + _log.info("Downloading layout model...") LayoutModel.download_models( local_dir=output_dir / LayoutModel._model_repo_folder, force=force, @@ -45,7 +45,7 @@ def download_models( ) if with_tableformer: - _log.info(f"Downloading tableformer model...") + _log.info("Downloading tableformer model...") TableStructureModel.download_models( local_dir=output_dir / TableStructureModel._model_repo_folder, force=force, @@ -53,7 +53,7 @@ def download_models( ) if with_picture_classifier: - _log.info(f"Downloading picture classifier model...") + _log.info("Downloading picture classifier model...") DocumentPictureClassifier.download_models( local_dir=output_dir / DocumentPictureClassifier._model_repo_folder, force=force, @@ -61,7 +61,7 @@ def download_models( ) if with_code_formula: - _log.info(f"Downloading code formula model...") + _log.info("Downloading code formula model...") CodeFormulaModel.download_models( local_dir=output_dir / CodeFormulaModel._model_repo_folder, force=force, @@ -69,7 +69,7 @@ def download_models( ) if with_smolvlm: - _log.info(f"Downloading SmolVlm model...") + _log.info("Downloading SmolVlm model...") PictureDescriptionVlmModel.download_models( repo_id=smolvlm_picture_description.repo_id, local_dir=output_dir / smolvlm_picture_description.repo_cache_folder, @@ -78,7 +78,7 @@ def download_models( ) if with_granite_vision: - _log.info(f"Downloading Granite Vision model...") + _log.info("Downloading Granite Vision model...") PictureDescriptionVlmModel.download_models( repo_id=granite_picture_description.repo_id, local_dir=output_dir / granite_picture_description.repo_cache_folder, @@ -87,7 +87,7 @@ def download_models( ) if with_easyocr: - _log.info(f"Downloading easyocr models...") + _log.info("Downloading easyocr models...") EasyOcrModel.download_models( local_dir=output_dir / EasyOcrModel._model_repo_folder, force=force, diff --git a/docling/utils/utils.py b/docling/utils/utils.py index 1261f86..11b9fdd 100644 --- a/docling/utils/utils.py +++ b/docling/utils/utils.py @@ -13,7 +13,7 @@ def chunkify(iterator, chunk_size): if isinstance(iterator, List): iterator = iter(iterator) for first in iterator: # Take the first element from the iterator - yield [first] + list(islice(iterator, chunk_size - 1)) + yield [first, *list(islice(iterator, chunk_size - 1))] def 
create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str: diff --git a/docs/examples/backend_xml_rag.ipynb b/docs/examples/backend_xml_rag.ipynb index 091f116..60872c3 100644 --- a/docs/examples/backend_xml_rag.ipynb +++ b/docs/examples/backend_xml_rag.ipynb @@ -383,7 +383,7 @@ "\n", "print(f\"Downloading {url}...\")\n", "buf = BytesIO(requests.get(url).content)\n", - "print(f\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n", + "print(\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n", "with zipfile.ZipFile(buf) as zf:\n", " res = zf.testzip()\n", " if res:\n", @@ -544,7 +544,7 @@ "source": [ "doc = backend.convert()\n", "\n", - "claims_sec = [item for item in doc.texts if item.text == \"CLAIMS\"][0]\n", + "claims_sec = next(item for item in doc.texts if item.text == \"CLAIMS\")\n", "print(f'Patent \"{doc.texts[0].text}\" has {len(claims_sec.children)} claims')" ] }, diff --git a/docs/examples/batch_convert.py b/docs/examples/batch_convert.py index fd68e62..25eb2ba 100644 --- a/docs/examples/batch_convert.py +++ b/docs/examples/batch_convert.py @@ -1,8 +1,8 @@ import json import logging import time +from collections.abc import Iterable from pathlib import Path -from typing import Iterable import yaml from docling_core.types.doc import ImageRefMode @@ -11,7 +11,6 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke from docling.datamodel.base_models import ConversionStatus, InputFormat from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import PdfPipelineOptions -from docling.datamodel.settings import settings from docling.document_converter import DocumentConverter, PdfFormatOption _log = logging.getLogger(__name__) diff --git a/docs/examples/custom_convert.py b/docs/examples/custom_convert.py index ddc1921..3b8ae6d 100644 --- a/docs/examples/custom_convert.py +++ b/docs/examples/custom_convert.py @@ -3,7 +3,6 @@ import logging import time from pathlib import Path -from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import ( AcceleratorDevice, @@ -11,9 +10,6 @@ from docling.datamodel.pipeline_options import ( PdfPipelineOptions, ) from docling.document_converter import DocumentConverter, PdfFormatOption -from docling.models.ocr_mac_model import OcrMacOptions -from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions -from docling.models.tesseract_ocr_model import TesseractOcrOptions _log = logging.getLogger(__name__) diff --git a/docs/examples/develop_formula_understanding.py b/docs/examples/develop_formula_understanding.py index ca24d95..beb1575 100644 --- a/docs/examples/develop_formula_understanding.py +++ b/docs/examples/develop_formula_understanding.py @@ -3,8 +3,8 @@ # It does not run the actual formula understanding model. import logging +from collections.abc import Iterable from pathlib import Path -from typing import Iterable from docling_core.types.doc import DocItemLabel, DoclingDocument, NodeItem, TextItem @@ -49,7 +49,6 @@ class ExampleFormulaUnderstandingEnrichmentModel(BaseItemAndImageEnrichmentModel # How the pipeline can be extended. 
class ExampleFormulaUnderstandingPipeline(StandardPdfPipeline): - def __init__(self, pipeline_options: ExampleFormulaUnderstandingPipelineOptions): super().__init__(pipeline_options) self.pipeline_options: ExampleFormulaUnderstandingPipelineOptions @@ -85,7 +84,7 @@ def main(): ) } ) - result = doc_converter.convert(input_doc_path) + doc_converter.convert(input_doc_path) if __name__ == "__main__": diff --git a/docs/examples/develop_picture_enrichment.py b/docs/examples/develop_picture_enrichment.py index 9991afe..9e3d306 100644 --- a/docs/examples/develop_picture_enrichment.py +++ b/docs/examples/develop_picture_enrichment.py @@ -3,8 +3,9 @@ # It does not run the actual picture classifier model. import logging +from collections.abc import Iterable from pathlib import Path -from typing import Any, Iterable +from typing import Any from docling_core.types.doc import ( DoclingDocument, diff --git a/docs/examples/export_figures.py b/docs/examples/export_figures.py index c218666..8ed14a7 100644 --- a/docs/examples/export_figures.py +++ b/docs/examples/export_figures.py @@ -4,7 +4,7 @@ from pathlib import Path from docling_core.types.doc import ImageRefMode, PictureItem, TableItem -from docling.datamodel.base_models import FigureElement, InputFormat, Table +from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.document_converter import DocumentConverter, PdfFormatOption diff --git a/docs/examples/export_multimodal.py b/docs/examples/export_multimodal.py index e7ea3df..bef74bf 100644 --- a/docs/examples/export_multimodal.py +++ b/docs/examples/export_multimodal.py @@ -51,7 +51,6 @@ def main(): page_segments, page, ) in generate_multimodal_pages(conv_res): - dpi = page._default_image_scale * 72 rows.append( @@ -81,10 +80,10 @@ def main(): ) # Generate one parquet from all documents - df = pd.json_normalize(rows) + df_result = pd.json_normalize(rows) now = datetime.datetime.now() output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet" - df.to_parquet(output_filename) + df_result.to_parquet(output_filename) end_time = time.time() - start_time diff --git a/docs/examples/export_tables.py b/docs/examples/export_tables.py index 8f09292..9a911d8 100644 --- a/docs/examples/export_tables.py +++ b/docs/examples/export_tables.py @@ -32,12 +32,12 @@ def main(): print(table_df.to_markdown()) # Save the table as csv - element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv" + element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.csv" _log.info(f"Saving CSV table to {element_csv_filename}") table_df.to_csv(element_csv_filename) # Save the table as html - element_html_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.html" + element_html_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.html" _log.info(f"Saving HTML table to {element_html_filename}") with element_html_filename.open("w") as fp: fp.write(table.export_to_html(doc=conv_res.document)) diff --git a/docs/examples/full_page_ocr.py b/docs/examples/full_page_ocr.py index 8390d5f..5525e87 100644 --- a/docs/examples/full_page_ocr.py +++ b/docs/examples/full_page_ocr.py @@ -1,14 +1,9 @@ from pathlib import Path -from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import ( - EasyOcrOptions, - OcrMacOptions, PdfPipelineOptions, - RapidOcrOptions, TesseractCliOcrOptions, - 
TesseractOcrOptions, ) from docling.document_converter import DocumentConverter, PdfFormatOption diff --git a/docs/examples/hybrid_chunking.ipynb b/docs/examples/hybrid_chunking.ipynb index 2f6d945..c8a8f42 100644 --- a/docs/examples/hybrid_chunking.ipynb +++ b/docs/examples/hybrid_chunking.ipynb @@ -153,10 +153,10 @@ "source": [ "for i, chunk in enumerate(chunk_iter):\n", " print(f\"=== {i} ===\")\n", - " print(f\"chunk.text:\\n{repr(f'{chunk.text[:300]}…')}\")\n", + " print(f\"chunk.text:\\n{f'{chunk.text[:300]}…'!r}\")\n", "\n", " enriched_text = chunker.serialize(chunk=chunk)\n", - " print(f\"chunker.serialize(chunk):\\n{repr(f'{enriched_text[:300]}…')}\")\n", + " print(f\"chunker.serialize(chunk):\\n{f'{enriched_text[:300]}…'!r}\")\n", "\n", " print()" ] @@ -353,11 +353,11 @@ "for i, chunk in enumerate(chunks):\n", " print(f\"=== {i} ===\")\n", " txt_tokens = len(tokenizer.tokenize(chunk.text))\n", - " print(f\"chunk.text ({txt_tokens} tokens):\\n{repr(chunk.text)}\")\n", + " print(f\"chunk.text ({txt_tokens} tokens):\\n{chunk.text!r}\")\n", "\n", " ser_txt = chunker.serialize(chunk=chunk)\n", " ser_tokens = len(tokenizer.tokenize(ser_txt))\n", - " print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{repr(ser_txt)}\")\n", + " print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{ser_txt!r}\")\n", "\n", " print()" ] diff --git a/docs/examples/minimal_vlm_pipeline.py b/docs/examples/minimal_vlm_pipeline.py index 6a15fe4..fab6342 100644 --- a/docs/examples/minimal_vlm_pipeline.py +++ b/docs/examples/minimal_vlm_pipeline.py @@ -2,17 +2,14 @@ import json import time from pathlib import Path -import yaml +from docling_core.types.doc import DocItemLabel, ImageRefMode +from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import ( - AcceleratorDevice, VlmPipelineOptions, - granite_vision_vlm_conversion_options, - smoldocling_vlm_conversion_options, smoldocling_vlm_mlx_conversion_options, ) -from docling.datamodel.settings import settings from docling.document_converter import DocumentConverter, PdfFormatOption from docling.pipeline.vlm_pipeline import VlmPipeline @@ -39,9 +36,6 @@ pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options ## Alternative VLM models: # pipeline_options.vlm_options = granite_vision_vlm_conversion_options -from docling_core.types.doc import DocItemLabel, ImageRefMode -from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS - ## Set up pipeline for PDF or image inputs converter = DocumentConverter( format_options={ @@ -62,7 +56,7 @@ out_path.mkdir(parents=True, exist_ok=True) for source in sources: start_time = time.time() print("================================================") - print("Processing... {}".format(source)) + print(f"Processing... 
{source}") print("================================================") print("") @@ -77,7 +71,7 @@ for source in sources: print(page.predictions.vlm_response.text) res.document.save_as_html( - filename=Path("{}/{}.html".format(out_path, res.input.file.stem)), + filename=Path(f"{out_path}/{res.input.file.stem}.html"), image_mode=ImageRefMode.REFERENCED, labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE], ) diff --git a/docs/examples/pictures_description.ipynb b/docs/examples/pictures_description.ipynb index feeb00b..a40a73a 100644 --- a/docs/examples/pictures_description.ipynb +++ b/docs/examples/pictures_description.ipynb @@ -144,7 +144,7 @@ "for pic in doc.pictures[:5]:\n", " html_item = (\n", " f\"

Picture {pic.self_ref}

\"\n", - " f'
'\n", + " f'
'\n", " f\"

Caption

{pic.caption_text(doc=doc)}
\"\n", " )\n", " for annotation in pic.annotations:\n", @@ -252,7 +252,7 @@ "for pic in doc.pictures[:5]:\n", " html_item = (\n", " f\"

Picture {pic.self_ref}

\"\n", - " f'
'\n", + " f'
'\n", " f\"

Caption

{pic.caption_text(doc=doc)}
\"\n", " )\n", " for annotation in pic.annotations:\n", diff --git a/docs/examples/rag_azuresearch.ipynb b/docs/examples/rag_azuresearch.ipynb index 9f867b1..b206069 100644 --- a/docs/examples/rag_azuresearch.ipynb +++ b/docs/examples/rag_azuresearch.ipynb @@ -283,7 +283,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -369,7 +369,7 @@ " new_index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)\n", " try:\n", " index_client.delete_index(index_name)\n", - " except:\n", + " except Exception:\n", " pass\n", "\n", " index_client.create_or_update_index(new_index)\n", @@ -487,7 +487,7 @@ "\n", " all_succeeded = all(r.succeeded for r in resp)\n", " console.print(\n", - " f\"Uploaded batch {i} -> {i+len(subset)}; all_succeeded: {all_succeeded}, \"\n", + " f\"Uploaded batch {i} -> {i + len(subset)}; all_succeeded: {all_succeeded}, \"\n", " f\"first_doc_status_code: {resp[0].status_code}\"\n", " )\n", "\n", @@ -807,10 +807,12 @@ } ], "source": [ + "from typing import Optional\n", + "\n", "from azure.search.documents.models import VectorizableTextQuery\n", "\n", "\n", - "def generate_chat_response(prompt: str, system_message: str = None):\n", + "def generate_chat_response(prompt: str, system_message: Optional[str] = None):\n", " \"\"\"\n", " Generates a single-turn chat response using Azure OpenAI Chat.\n", " If you need multi-turn conversation or follow-up queries, you'll have to\n", diff --git a/docs/examples/rag_haystack.ipynb b/docs/examples/rag_haystack.ipynb index b954115..2861c96 100644 --- a/docs/examples/rag_haystack.ipynb +++ b/docs/examples/rag_haystack.ipynb @@ -351,7 +351,7 @@ "for source in sources:\n", " if EXPORT_TYPE == ExportType.DOC_CHUNKS:\n", " doc_chunk = DocChunk.model_validate(source.meta[\"dl_meta\"])\n", - " print(f\"- text: {repr(doc_chunk.text)}\")\n", + " print(f\"- text: {doc_chunk.text!r}\")\n", " if doc_chunk.meta.origin:\n", " print(f\" file: {doc_chunk.meta.origin.filename}\")\n", " if doc_chunk.meta.headings:\n", diff --git a/docs/examples/rag_langchain.ipynb b/docs/examples/rag_langchain.ipynb index 37c4170..17fe8e6 100644 --- a/docs/examples/rag_langchain.ipynb +++ b/docs/examples/rag_langchain.ipynb @@ -341,7 +341,7 @@ "print(f\"Question:\\n{resp_dict['input']}\\n\\nAnswer:\\n{clipped_answer}\")\n", "for i, doc in enumerate(resp_dict[\"context\"]):\n", " print()\n", - " print(f\"Source {i+1}:\")\n", + " print(f\"Source {i + 1}:\")\n", " print(f\" text: {json.dumps(clip_text(doc.page_content, threshold=350))}\")\n", " for key in doc.metadata:\n", " if key != \"pk\":\n", diff --git a/docs/examples/rag_weaviate.ipynb b/docs/examples/rag_weaviate.ipynb index 7c020f4..627e892 100644 --- a/docs/examples/rag_weaviate.ipynb +++ b/docs/examples/rag_weaviate.ipynb @@ -59,7 +59,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "collapsed": true, "id": "u076oUSF_YUG" @@ -72,12 +72,11 @@ "%pip install rich\n", "%pip install torch\n", "\n", + "import logging\n", "import warnings\n", "\n", "warnings.filterwarnings(\"ignore\")\n", "\n", - "import logging\n", - "\n", "# Suppress Weaviate client logs\n", "logging.getLogger(\"weaviate\").setLevel(logging.ERROR)" ] @@ -119,7 +118,7 @@ " device = torch.device(\"mps\")\n", " print(\"MPS GPU is enabled.\")\n", "else:\n", - " raise EnvironmentError(\n", + " raise OSError(\n", " \"No GPU or MPS device found. 
Please check your environment and ensure GPU or MPS support is configured.\"\n", " )" ] @@ -226,7 +225,6 @@ } ], "source": [ - "from docling.datamodel.document import ConversionResult\n", "from docling.document_converter import DocumentConverter\n", "\n", "# Instantiate the doc converter\n", @@ -345,7 +343,7 @@ "\n", " openai_api_key = os.getenv(openai_api_key_var)\n", " if not openai_api_key:\n", - " raise EnvironmentError(\n", + " raise OSError(\n", " f\"Environment variable '{openai_api_key_var}' is not set. \"\n", " \"Please define it before running this script.\"\n", " )" @@ -387,7 +385,6 @@ "outputs": [], "source": [ "import weaviate.classes.config as wc\n", - "from weaviate.classes.config import DataType, Property\n", "\n", "# Define the collection name\n", "collection_name = \"docling\"\n", diff --git a/docs/examples/run_md.py b/docs/examples/run_md.py index 46be97e..94de14b 100644 --- a/docs/examples/run_md.py +++ b/docs/examples/run_md.py @@ -25,9 +25,7 @@ def main(): document = mdb.convert() out_path = Path("scratch") - print( - f"Document {path} converted." f"\nSaved markdown output to: {str(out_path)}" - ) + print(f"Document {path} converted.\nSaved markdown output to: {out_path!s}") # Export Docling document format to markdowndoc: fn = os.path.basename(path) diff --git a/docs/examples/run_with_accelerator.py b/docs/examples/run_with_accelerator.py index 6e81e85..a538074 100644 --- a/docs/examples/run_with_accelerator.py +++ b/docs/examples/run_with_accelerator.py @@ -1,13 +1,10 @@ from pathlib import Path -from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import ( AcceleratorDevice, AcceleratorOptions, PdfPipelineOptions, - TesseractCliOcrOptions, - TesseractOcrOptions, ) from docling.datamodel.settings import settings from docling.document_converter import DocumentConverter, PdfFormatOption diff --git a/docs/examples/run_with_formats.py b/docs/examples/run_with_formats.py index 0eff248..38d7fff 100644 --- a/docs/examples/run_with_formats.py +++ b/docs/examples/run_with_formats.py @@ -63,7 +63,7 @@ def main(): out_path = Path("scratch") print( f"Document {res.input.file.name} converted." 
- f"\nSaved markdown output to: {str(out_path)}" + f"\nSaved markdown output to: {out_path!s}" ) _log.debug(res.document._export_to_indented_text(max_text_len=16)) # Export Docling document format to markdowndoc: diff --git a/docs/examples/tesseract_lang_detection.py b/docs/examples/tesseract_lang_detection.py index 0de0dd6..37859b9 100644 --- a/docs/examples/tesseract_lang_detection.py +++ b/docs/examples/tesseract_lang_detection.py @@ -4,7 +4,6 @@ from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import ( PdfPipelineOptions, TesseractCliOcrOptions, - TesseractOcrOptions, ) from docling.document_converter import DocumentConverter, PdfFormatOption diff --git a/docs/examples/translate.py b/docs/examples/translate.py index fa39b6d..229d545 100644 --- a/docs/examples/translate.py +++ b/docs/examples/translate.py @@ -2,9 +2,9 @@ import logging import time from pathlib import Path -from docling_core.types.doc import ImageRefMode, PictureItem, TableItem, TextItem +from docling_core.types.doc import ImageRefMode, TableItem, TextItem -from docling.datamodel.base_models import FigureElement, InputFormat, Table +from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.document_converter import DocumentConverter, PdfFormatOption @@ -15,7 +15,6 @@ IMAGE_RESOLUTION_SCALE = 2.0 # FIXME: put in your favorite translation code .... def translate(text: str, src: str = "en", dest: str = "de"): - _log.warning("!!! IMPLEMENT HERE YOUR FAVORITE TRANSLATION CODE!!!") # from googletrans import Translator @@ -52,10 +51,9 @@ def main(): } ) - start_time = time.time() - conv_res = doc_converter.convert(input_doc_path) conv_doc = conv_res.document + doc_filename = conv_res.input.file # Save markdown with embedded pictures in original text md_filename = output_dir / f"{doc_filename}-with-images-orig.md" diff --git a/docs/examples/visual_grounding.ipynb b/docs/examples/visual_grounding.ipynb index 4d091da..63200ed 100644 --- a/docs/examples/visual_grounding.ipynb +++ b/docs/examples/visual_grounding.ipynb @@ -432,7 +432,7 @@ "\n", "for i, doc in enumerate(resp_dict[\"context\"][:]):\n", " image_by_page = {}\n", - " print(f\"Source {i+1}:\")\n", + " print(f\"Source {i + 1}:\")\n", " print(f\" text: {json.dumps(clip_text(doc.page_content, threshold=350))}\")\n", " meta = DocMeta.model_validate(doc.metadata[\"dl_meta\"])\n", "\n", diff --git a/docs/examples/vlm_pipeline_api_model.py b/docs/examples/vlm_pipeline_api_model.py index 33fb72a..504cecc 100644 --- a/docs/examples/vlm_pipeline_api_model.py +++ b/docs/examples/vlm_pipeline_api_model.py @@ -10,7 +10,6 @@ from docling.datamodel.pipeline_options import ( ApiVlmOptions, ResponseFormat, VlmPipelineOptions, - granite_vision_vlm_ollama_conversion_options, ) from docling.document_converter import DocumentConverter, PdfFormatOption from docling.pipeline.vlm_pipeline import VlmPipeline diff --git a/poetry.lock b/poetry.lock index 2f142b8..22514b6 100644 --- a/poetry.lock +++ b/poetry.lock @@ -692,6 +692,84 @@ traitlets = ">=4" [package.extras] test = ["pytest"] +[[package]] +name = "coverage" +version = "7.8.0" +description = "Code coverage measurement for Python" +optional = false +python-versions = ">=3.9" +files = [ + {file = "coverage-7.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2931f66991175369859b5fd58529cd4b73582461877ecfd859b6549869287ffe"}, + {file = "coverage-7.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:52a523153c568d2c0ef8826f6cc23031dc86cffb8c6aeab92c4ff776e7951b28"}, + {file = "coverage-7.8.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c8a5c139aae4c35cbd7cadca1df02ea8cf28a911534fc1b0456acb0b14234f3"}, + {file = "coverage-7.8.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5a26c0c795c3e0b63ec7da6efded5f0bc856d7c0b24b2ac84b4d1d7bc578d676"}, + {file = "coverage-7.8.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:821f7bcbaa84318287115d54becb1915eece6918136c6f91045bb84e2f88739d"}, + {file = "coverage-7.8.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a321c61477ff8ee705b8a5fed370b5710c56b3a52d17b983d9215861e37b642a"}, + {file = "coverage-7.8.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:ed2144b8a78f9d94d9515963ed273d620e07846acd5d4b0a642d4849e8d91a0c"}, + {file = "coverage-7.8.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:042e7841a26498fff7a37d6fda770d17519982f5b7d8bf5278d140b67b61095f"}, + {file = "coverage-7.8.0-cp310-cp310-win32.whl", hash = "sha256:f9983d01d7705b2d1f7a95e10bbe4091fabc03a46881a256c2787637b087003f"}, + {file = "coverage-7.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:5a570cd9bd20b85d1a0d7b009aaf6c110b52b5755c17be6962f8ccd65d1dbd23"}, + {file = "coverage-7.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e7ac22a0bb2c7c49f441f7a6d46c9c80d96e56f5a8bc6972529ed43c8b694e27"}, + {file = "coverage-7.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bf13d564d310c156d1c8e53877baf2993fb3073b2fc9f69790ca6a732eb4bfea"}, + {file = "coverage-7.8.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5761c70c017c1b0d21b0815a920ffb94a670c8d5d409d9b38857874c21f70d7"}, + {file = "coverage-7.8.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e5ff52d790c7e1628241ffbcaeb33e07d14b007b6eb00a19320c7b8a7024c040"}, + {file = "coverage-7.8.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d39fc4817fd67b3915256af5dda75fd4ee10621a3d484524487e33416c6f3543"}, + {file = "coverage-7.8.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b44674870709017e4b4036e3d0d6c17f06a0e6d4436422e0ad29b882c40697d2"}, + {file = "coverage-7.8.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8f99eb72bf27cbb167b636eb1726f590c00e1ad375002230607a844d9e9a2318"}, + {file = "coverage-7.8.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b571bf5341ba8c6bc02e0baeaf3b061ab993bf372d982ae509807e7f112554e9"}, + {file = "coverage-7.8.0-cp311-cp311-win32.whl", hash = "sha256:e75a2ad7b647fd8046d58c3132d7eaf31b12d8a53c0e4b21fa9c4d23d6ee6d3c"}, + {file = "coverage-7.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:3043ba1c88b2139126fc72cb48574b90e2e0546d4c78b5299317f61b7f718b78"}, + {file = "coverage-7.8.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:bbb5cc845a0292e0c520656d19d7ce40e18d0e19b22cb3e0409135a575bf79fc"}, + {file = "coverage-7.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4dfd9a93db9e78666d178d4f08a5408aa3f2474ad4d0e0378ed5f2ef71640cb6"}, + {file = "coverage-7.8.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f017a61399f13aa6d1039f75cd467be388d157cd81f1a119b9d9a68ba6f2830d"}, + {file = "coverage-7.8.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = 
"sha256:0915742f4c82208ebf47a2b154a5334155ed9ef9fe6190674b8a46c2fb89cb05"}, + {file = "coverage-7.8.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a40fcf208e021eb14b0fac6bdb045c0e0cab53105f93ba0d03fd934c956143a"}, + {file = "coverage-7.8.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a1f406a8e0995d654b2ad87c62caf6befa767885301f3b8f6f73e6f3c31ec3a6"}, + {file = "coverage-7.8.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:77af0f6447a582fdc7de5e06fa3757a3ef87769fbb0fdbdeba78c23049140a47"}, + {file = "coverage-7.8.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:f2d32f95922927186c6dbc8bc60df0d186b6edb828d299ab10898ef3f40052fe"}, + {file = "coverage-7.8.0-cp312-cp312-win32.whl", hash = "sha256:769773614e676f9d8e8a0980dd7740f09a6ea386d0f383db6821df07d0f08545"}, + {file = "coverage-7.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:e5d2b9be5b0693cf21eb4ce0ec8d211efb43966f6657807f6859aab3814f946b"}, + {file = "coverage-7.8.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5ac46d0c2dd5820ce93943a501ac5f6548ea81594777ca585bf002aa8854cacd"}, + {file = "coverage-7.8.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:771eb7587a0563ca5bb6f622b9ed7f9d07bd08900f7589b4febff05f469bea00"}, + {file = "coverage-7.8.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42421e04069fb2cbcbca5a696c4050b84a43b05392679d4068acbe65449b5c64"}, + {file = "coverage-7.8.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:554fec1199d93ab30adaa751db68acec2b41c5602ac944bb19187cb9a41a8067"}, + {file = "coverage-7.8.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5aaeb00761f985007b38cf463b1d160a14a22c34eb3f6a39d9ad6fc27cb73008"}, + {file = "coverage-7.8.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:581a40c7b94921fffd6457ffe532259813fc68eb2bdda60fa8cc343414ce3733"}, + {file = "coverage-7.8.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:f319bae0321bc838e205bf9e5bc28f0a3165f30c203b610f17ab5552cff90323"}, + {file = "coverage-7.8.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:04bfec25a8ef1c5f41f5e7e5c842f6b615599ca8ba8391ec33a9290d9d2db3a3"}, + {file = "coverage-7.8.0-cp313-cp313-win32.whl", hash = "sha256:dd19608788b50eed889e13a5d71d832edc34fc9dfce606f66e8f9f917eef910d"}, + {file = "coverage-7.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:a9abbccd778d98e9c7e85038e35e91e67f5b520776781d9a1e2ee9d400869487"}, + {file = "coverage-7.8.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:18c5ae6d061ad5b3e7eef4363fb27a0576012a7447af48be6c75b88494c6cf25"}, + {file = "coverage-7.8.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:95aa6ae391a22bbbce1b77ddac846c98c5473de0372ba5c463480043a07bff42"}, + {file = "coverage-7.8.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e013b07ba1c748dacc2a80e69a46286ff145935f260eb8c72df7185bf048f502"}, + {file = "coverage-7.8.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d766a4f0e5aa1ba056ec3496243150698dc0481902e2b8559314368717be82b1"}, + {file = "coverage-7.8.0-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad80e6b4a0c3cb6f10f29ae4c60e991f424e6b14219d46f1e7d442b938ee68a4"}, + {file = "coverage-7.8.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = 
"sha256:b87eb6fc9e1bb8f98892a2458781348fa37e6925f35bb6ceb9d4afd54ba36c73"}, + {file = "coverage-7.8.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:d1ba00ae33be84066cfbe7361d4e04dec78445b2b88bdb734d0d1cbab916025a"}, + {file = "coverage-7.8.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f3c38e4e5ccbdc9198aecc766cedbb134b2d89bf64533973678dfcf07effd883"}, + {file = "coverage-7.8.0-cp313-cp313t-win32.whl", hash = "sha256:379fe315e206b14e21db5240f89dc0774bdd3e25c3c58c2c733c99eca96f1ada"}, + {file = "coverage-7.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:2e4b6b87bb0c846a9315e3ab4be2d52fac905100565f4b92f02c445c8799e257"}, + {file = "coverage-7.8.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fa260de59dfb143af06dcf30c2be0b200bed2a73737a8a59248fcb9fa601ef0f"}, + {file = "coverage-7.8.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:96121edfa4c2dfdda409877ea8608dd01de816a4dc4a0523356067b305e4e17a"}, + {file = "coverage-7.8.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b8af63b9afa1031c0ef05b217faa598f3069148eeee6bb24b79da9012423b82"}, + {file = "coverage-7.8.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:89b1f4af0d4afe495cd4787a68e00f30f1d15939f550e869de90a86efa7e0814"}, + {file = "coverage-7.8.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94ec0be97723ae72d63d3aa41961a0b9a6f5a53ff599813c324548d18e3b9e8c"}, + {file = "coverage-7.8.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:8a1d96e780bdb2d0cbb297325711701f7c0b6f89199a57f2049e90064c29f6bd"}, + {file = "coverage-7.8.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:f1d8a2a57b47142b10374902777e798784abf400a004b14f1b0b9eaf1e528ba4"}, + {file = "coverage-7.8.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:cf60dd2696b457b710dd40bf17ad269d5f5457b96442f7f85722bdb16fa6c899"}, + {file = "coverage-7.8.0-cp39-cp39-win32.whl", hash = "sha256:be945402e03de47ba1872cd5236395e0f4ad635526185a930735f66710e1bd3f"}, + {file = "coverage-7.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:90e7fbc6216ecaffa5a880cdc9c77b7418c1dcb166166b78dbc630d07f278cc3"}, + {file = "coverage-7.8.0-pp39.pp310.pp311-none-any.whl", hash = "sha256:b8194fb8e50d556d5849753de991d390c5a1edeeba50f68e3a9253fbd8bf8ccd"}, + {file = "coverage-7.8.0-py3-none-any.whl", hash = "sha256:dbf364b4c5e7bae9250528167dfe40219b62e2d573c854d74be213e1e52069f7"}, + {file = "coverage-7.8.0.tar.gz", hash = "sha256:7a3d62b3b03b4b6fd41a085f3574874cf946cb4604d2b4d3e8dca8cd570ca501"}, +] + +[package.dependencies] +tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.11.0a6\" and extra == \"toml\""} + +[package.extras] +toml = ["tomli"] + [[package]] name = "cryptography" version = "43.0.3" @@ -5073,6 +5151,24 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} [package.extras] testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] +[[package]] +name = "pytest-cov" +version = "6.1.1" +description = "Pytest plugin for measuring coverage." 
+optional = false +python-versions = ">=3.9" +files = [ + {file = "pytest_cov-6.1.1-py3-none-any.whl", hash = "sha256:bddf29ed2d0ab6f4df17b4c55b0a657287db8684af9c42ea546b21b1041b3dde"}, + {file = "pytest_cov-6.1.1.tar.gz", hash = "sha256:46935f7aaefba760e716c2ebfbe1c216240b9592966e7da99ea8292d4d3e2a0a"}, +] + +[package.dependencies] +coverage = {version = ">=7.5", extras = ["toml"]} +pytest = ">=4.6" + +[package.extras] +testing = ["fields", "hunter", "process-tests", "pytest-xdist", "virtualenv"] + [[package]] name = "pytest-xdist" version = "3.6.1" @@ -7882,4 +7978,4 @@ vlm = ["accelerate", "transformers", "transformers"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "d2a8f7997b9ffb249ad26ba492b766d580bdb0072d50e76b0afd92496e983e96" +content-hash = "b36037ec17dc4b6d5197a2f63a1367e05bf888b4fa97e2e2e8c29c217741d69c" diff --git a/pyproject.toml b/pyproject.toml index 5091afc..148f52b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -110,6 +110,8 @@ ipywidgets = "^8.1.5" nbqa = "^1.9.0" types-openpyxl = "^3.1.5.20241114" types-tqdm = "^4.67.0.20241221" +coverage = "^7.6.2" +pytest-cov = "^6.0.0" [tool.poetry.group.docs.dependencies] mkdocs-material = "^9.5.40" @@ -164,15 +166,82 @@ docling-tools = "docling.cli.tools:app" requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" -[tool.black] +[tool.ruff] +target-version = "py39" line-length = 88 -target-version = ["py39"] -include = '\.pyi?$' +respect-gitignore = true -[tool.isort] -profile = "black" -line_length = 88 -py_version = 39 +# extend-exclude = [ +# "tests", +# ] + +[tool.ruff.format] +skip-magic-trailing-comma = false + +[tool.ruff.lint] +select = [ + # "B", # flake8-bugbear + "C", # flake8-comprehensions + "C9", # mccabe + # "D", # flake8-docstrings + "E", # pycodestyle errors (default) + "F", # pyflakes (default) + "I", # isort + "PD", # pandas-vet + "PIE", # pie + # "PTH", # pathlib + "Q", # flake8-quotes + # "RET", # return + "RUF", # Enable all ruff-specific checks + # "SIM", # simplify + "S307", # eval + # "T20", # (disallow print statements) keep debugging statements out of the codebase + "W", # pycodestyle warnings + "ASYNC", # async + "UP", # pyupgrade +] + +ignore = [ + "C408", # Unnecessary `dict()` call (rewrite as a literal) + "E501", # Line too long, handled by ruff formatter + "D107", # "Missing docstring in __init__", + "F401", # imported but unused; consider using `importlib.util.find_spec` to test for " + "F811", # "redefinition of the same function" + "PL", # Pylint + "RUF012", # Mutable Class Attributes + "UP006", # List vs list, etc + "UP007", # Option and Union + "UP035", # `typing.Set` is deprecated, use `set` instead" +] + +#extend-select = [] + +[tool.ruff.lint.pep8-naming] +classmethod-decorators = [ + # Allow Pydantic's `@validator` decorator to trigger class method treatment. 
+ "pydantic.validator", +] + +[tool.ruff.lint.per-file-ignores] +"__init__.py" = ["E402", "F401"] +"tests/*.py" = ["ASYNC"] # Disable ASYNC check for tests + +[tool.ruff.lint.mccabe] +max-complexity = 20 + +# [tool.ruff.lint.isort.sections] +# "docling" = ["docling_core", "docling_ibm_models", "docling_parse"] + +[tool.ruff.lint.isort] +combine-as-imports = true +# section-order = [ +# "future", +# "standard-library", +# "third-party", +# "docling", +# "first-party", +# "local-folder", +# ] [tool.mypy] pretty = true @@ -200,10 +269,6 @@ module = [ ] ignore_missing_imports = true -[tool.flake8] -max-line-length = 88 -extend-ignore = ["E203", "E501"] - [tool.semantic_release] # for default values check: # https://github.com/python-semantic-release/python-semantic-release/blob/v7.32.2/semantic_release/defaults.cfg diff --git a/tests/test_backend_asciidoc.py b/tests/test_backend_asciidoc.py index 4574a22..fc047ba 100644 --- a/tests/test_backend_asciidoc.py +++ b/tests/test_backend_asciidoc.py @@ -19,7 +19,6 @@ def _get_backend(fname): def test_asciidocs_examples(): - fnames = sorted(glob.glob("./tests/data/asciidoc/*.asciidoc")) for fname in fnames: @@ -38,8 +37,8 @@ def test_asciidocs_examples(): print("\n\n", pred_mddoc) if os.path.exists(gname): - with open(gname, "r") as fr: - true_mddoc = fr.read() + with open(gname) as fr: + fr.read() # assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc" else: diff --git a/tests/test_backend_csv.py b/tests/test_backend_csv.py index 2eee27b..d929ae1 100644 --- a/tests/test_backend_csv.py +++ b/tests/test_backend_csv.py @@ -1,5 +1,3 @@ -import json -import os from pathlib import Path from pytest import warns @@ -15,22 +13,19 @@ GENERATE = GEN_TEST_DATA def get_csv_paths(): - # Define the directory you want to search - directory = Path(f"./tests/data/csv/") + directory = Path("./tests/data/csv/") # List all CSV files in the directory and its subdirectories return sorted(directory.rglob("*.csv")) def get_csv_path(name: str): - # Return the matching CSV file path return Path(f"./tests/data/csv/{name}.csv") def get_converter(): - converter = DocumentConverter(allowed_formats=[InputFormat.CSV]) return converter @@ -55,9 +50,9 @@ def test_e2e_valid_csv_conversions(): pred_itxt: str = doc._export_to_indented_text( max_text_len=70, explicit_tables=False ) - assert verify_export( - pred_itxt, str(gt_path) + ".itxt" - ), "export to indented-text" + assert verify_export(pred_itxt, str(gt_path) + ".itxt"), ( + "export to indented-text" + ) assert verify_document( pred_doc=doc, diff --git a/tests/test_backend_docling_parse.py b/tests/test_backend_docling_parse.py index 3c21479..d6f804c 100644 --- a/tests/test_backend_docling_parse.py +++ b/tests/test_backend_docling_parse.py @@ -32,7 +32,7 @@ def test_text_cell_counts(): doc_backend = _get_backend(pdf_doc) - for page_index in range(0, doc_backend.page_count()): + for page_index in range(doc_backend.page_count()): last_cell_count = None for i in range(10): page_backend: DoclingParsePageBackend = doc_backend.load_page(0) @@ -42,9 +42,9 @@ def test_text_cell_counts(): last_cell_count = len(cells) if len(cells) != last_cell_count: - assert ( - False - ), "Loading page multiple times yielded non-identical text cell counts" + assert False, ( + "Loading page multiple times yielded non-identical text cell counts" + ) last_cell_count = len(cells) @@ -66,7 +66,7 @@ def test_crop_page_image(test_doc_path): page_backend: DoclingParsePageBackend = doc_backend.load_page(0) # Crop out "Figure 1" from the DocLayNet 
paper
-    im = page_backend.get_page_image(
+    page_backend.get_page_image(
         scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
     )
     # im.show()
diff --git a/tests/test_backend_docling_parse_v2.py b/tests/test_backend_docling_parse_v2.py
index ee0e5c7..972f3b9 100644
--- a/tests/test_backend_docling_parse_v2.py
+++ b/tests/test_backend_docling_parse_v2.py
@@ -31,7 +31,7 @@ def test_text_cell_counts():
     doc_backend = _get_backend(pdf_doc)
-    for page_index in range(0, doc_backend.page_count()):
+    for page_index in range(doc_backend.page_count()):
         last_cell_count = None
         for i in range(10):
             page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0)
@@ -41,9 +41,9 @@ def test_text_cell_counts():
                 last_cell_count = len(cells)
             if len(cells) != last_cell_count:
-                assert (
-                    False
-                ), "Loading page multiple times yielded non-identical text cell counts"
+                assert False, (
+                    "Loading page multiple times yielded non-identical text cell counts"
+                )
                 last_cell_count = len(cells)
@@ -65,7 +65,7 @@ def test_crop_page_image(test_doc_path):
     page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0)
     # Crop out "Figure 1" from the DocLayNet paper
-    im = page_backend.get_page_image(
+    page_backend.get_page_image(
         scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
     )
     # im.show()
diff --git a/tests/test_backend_docling_parse_v4.py b/tests/test_backend_docling_parse_v4.py
index fcb551e..35c4eab 100644
--- a/tests/test_backend_docling_parse_v4.py
+++ b/tests/test_backend_docling_parse_v4.py
@@ -31,7 +31,7 @@ def test_text_cell_counts():
     doc_backend = _get_backend(pdf_doc)
-    for page_index in range(0, doc_backend.page_count()):
+    for page_index in range(doc_backend.page_count()):
         last_cell_count = None
         for i in range(10):
             page_backend: DoclingParseV4PageBackend = doc_backend.load_page(0)
@@ -41,9 +41,9 @@ def test_text_cell_counts():
                 last_cell_count = len(cells)
             if len(cells) != last_cell_count:
-                assert (
-                    False
-                ), "Loading page multiple times yielded non-identical text cell counts"
+                assert False, (
+                    "Loading page multiple times yielded non-identical text cell counts"
+                )
                 last_cell_count = len(cells)
@@ -65,7 +65,7 @@ def test_crop_page_image(test_doc_path):
     page_backend: DoclingParseV4PageBackend = doc_backend.load_page(0)
     # Crop out "Figure 1" from the DocLayNet paper
-    im = page_backend.get_page_image(
+    page_backend.get_page_image(
         scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
     )
     # im.show()
diff --git a/tests/test_backend_html.py b/tests/test_backend_html.py
index 5f5e740..18254a7 100644
--- a/tests/test_backend_html.py
+++ b/tests/test_backend_html.py
@@ -105,7 +105,6 @@ def test_ordered_lists():
 def get_html_paths():
-
     # Define the directory you want to search
     directory = Path("./tests/data/html/")
@@ -115,14 +114,12 @@
 def get_converter():
-
     converter = DocumentConverter(allowed_formats=[InputFormat.HTML])
     return converter
 def test_e2e_html_conversions():
-
     html_paths = get_html_paths()
     converter = get_converter()
@@ -138,15 +135,15 @@
         doc: DoclingDocument = conv_result.document
         pred_md: str = doc.export_to_markdown()
-        assert verify_export(
-            pred_md, str(gt_path) + ".md", generate=GENERATE
-        ), "export to md"
+        assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
+            "export to md"
+        )
         pred_itxt: str = doc._export_to_indented_text(
             max_text_len=70, explicit_tables=False
         )
-        assert verify_export(
-            pred_itxt, str(gt_path) + ".itxt", generate=GENERATE
-        ), "export to indented-text"
+        assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
+            "export to indented-text"
+        )
         assert verify_document(doc, str(gt_path) + ".json", GENERATE)
diff --git a/tests/test_backend_jats.py b/tests/test_backend_jats.py
index d209431..a4373be 100644
--- a/tests/test_backend_jats.py
+++ b/tests/test_backend_jats.py
@@ -15,7 +15,7 @@
 GENERATE = GEN_TEST_DATA
 def get_pubmed_paths():
-    directory = Path(os.path.dirname(__file__) + f"/data/pubmed/")
+    directory = Path(os.path.dirname(__file__) + "/data/pubmed/")
     xml_files = sorted(directory.rglob("*.xml"))
     return xml_files
@@ -47,9 +47,9 @@ def test_e2e_pubmed_conversions(use_stream=False):
         pred_itxt: str = doc._export_to_indented_text(
             max_text_len=70, explicit_tables=False
         )
-        assert verify_export(
-            pred_itxt, str(gt_path) + ".itxt"
-        ), "export to indented-text"
+        assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
+            "export to indented-text"
+        )
         assert verify_document(doc, str(gt_path) + ".json", GENERATE), "export to json"
diff --git a/tests/test_backend_msexcel.py b/tests/test_backend_msexcel.py
index 0604429..65f636e 100644
--- a/tests/test_backend_msexcel.py
+++ b/tests/test_backend_msexcel.py
@@ -17,7 +17,6 @@
 GENERATE = GEN_TEST_DATA
 def get_xlsx_paths():
-
     # Define the directory you want to search
     directory = Path("./tests/data/xlsx/")
@@ -27,7 +26,6 @@
 def get_converter():
-
     converter = DocumentConverter(allowed_formats=[InputFormat.XLSX])
     return converter
@@ -65,13 +63,13 @@ def test_e2e_xlsx_conversions(documents) -> None:
         pred_itxt: str = doc._export_to_indented_text(
             max_text_len=70, explicit_tables=False
         )
-        assert verify_export(
-            pred_itxt, str(gt_path) + ".itxt"
-        ), "export to indented-text"
+        assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
+            "export to indented-text"
+        )
-        assert verify_document(
-            doc, str(gt_path) + ".json", GENERATE
-        ), "document document"
+        assert verify_document(doc, str(gt_path) + ".json", GENERATE), (
+            "document document"
+        )
 def test_pages(documents) -> None:
@@ -81,7 +79,7 @@
     documents: The paths and converted documents.
     """
     # number of pages from the backend method
-    path = [item for item in get_xlsx_paths() if item.stem == "test-01"][0]
+    path = next(item for item in get_xlsx_paths() if item.stem == "test-01")
     in_doc = InputDocument(
         path_or_stream=path,
         format=InputFormat.XLSX,
@@ -92,7 +90,7 @@
     assert backend.page_count() == 3
     # number of pages from the converted document
-    doc = [item for path, item in documents if path.stem == "test-01"][0]
+    doc = next(item for path, item in documents if path.stem == "test-01")
     assert len(doc.pages) == 3
     # page sizes as number of cells
diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py
index 5c43ccf..c50e071 100644
--- a/tests/test_backend_msword.py
+++ b/tests/test_backend_msword.py
@@ -1,4 +1,3 @@
-import os
 from pathlib import Path
 from docling.backend.msword_backend import MsWordDocumentBackend
@@ -43,7 +42,6 @@ def test_heading_levels():
 def get_docx_paths():
-
     # Define the directory you want to search
     directory = Path("./tests/data/docx/")
@@ -53,14 +51,12 @@
 def get_converter():
-
     converter = DocumentConverter(allowed_formats=[InputFormat.DOCX])
     return converter
 def test_e2e_docx_conversions():
-
     docx_paths = get_docx_paths()
     converter = get_converter()
@@ -76,20 +72,20 @@
         doc: DoclingDocument = conv_result.document
         pred_md: str = doc.export_to_markdown()
-        assert verify_export(
-            pred_md, str(gt_path) + ".md", generate=GENERATE
-        ), "export to md"
+        assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
+            "export to md"
+        )
         pred_itxt: str = doc._export_to_indented_text(
             max_text_len=70, explicit_tables=False
         )
-        assert verify_export(
-            pred_itxt, str(gt_path) + ".itxt", generate=GENERATE
-        ), "export to indented-text"
+        assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
+            "export to indented-text"
+        )
-        assert verify_document(
-            doc, str(gt_path) + ".json", generate=GENERATE
-        ), "document document"
+        assert verify_document(doc, str(gt_path) + ".json", generate=GENERATE), (
+            "document document"
+        )
         if docx_path.name == "word_tables.docx":
             pred_html: str = doc.export_to_html()
diff --git a/tests/test_backend_patent_uspto.py b/tests/test_backend_patent_uspto.py
index aebc01d..ace6d3a 100644
--- a/tests/test_backend_patent_uspto.py
+++ b/tests/test_backend_patent_uspto.py
@@ -109,27 +109,27 @@ def test_patent_groundtruth(patents, groundtruth):
         md_name = path.stem + ".md"
         if md_name in gt_names:
             pred_md = doc.export_to_markdown()
-            assert (
-                pred_md == gt_names[md_name]
-            ), f"Markdown file mismatch against groundtruth {md_name}"
+            assert pred_md == gt_names[md_name], (
+                f"Markdown file mismatch against groundtruth {md_name}"
+            )
         json_path = path.with_suffix(".json")
         if json_path.stem in gt_names:
-            assert verify_document(
-                doc, str(json_path), GENERATE
-            ), f"JSON file mismatch against groundtruth {json_path}"
+            assert verify_document(doc, str(json_path), GENERATE), (
+                f"JSON file mismatch against groundtruth {json_path}"
+            )
         itxt_name = path.stem + ".itxt"
         if itxt_name in gt_names:
             pred_itxt = doc._export_to_indented_text()
-            assert (
-                pred_itxt == gt_names[itxt_name]
-            ), f"Indented text file mismatch against groundtruth {itxt_name}"
+            assert pred_itxt == gt_names[itxt_name], (
+                f"Indented text file mismatch against groundtruth {itxt_name}"
+            )
 def test_tables(tables):
     """Test the table parser."""
     # CHECK table in file tables_20180000016.xml
     file_name = "tables_ipa20180000016.xml"
-    file_table = [item[1] for item in tables if item[0].name == file_name][0]
+    file_table = next(item[1] for item in tables if item[0].name == file_name)
     assert file_table.num_rows == 13
     assert file_table.num_cols == 10
     assert len(file_table.table_cells) == 130
@@ -140,7 +140,7 @@ def test_patent_uspto_ice(patents):
     # CHECK application doc number 20200022300
     file_name = "ipa20200022300.xml"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
     if GENERATE:
         _generate_groundtruth(doc, Path(file_name).stem)
@@ -278,7 +278,7 @@ def test_patent_uspto_ice(patents):
     # CHECK application doc number 20180000016 for HTML entities, level 2 headings, tables
     file_name = "ipa20180000016.xml"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
     if GENERATE:
         _generate_groundtruth(doc, Path(file_name).stem)
@@ -348,7 +348,7 @@ def test_patent_uspto_ice(patents):
     # CHECK application doc number 20110039701 for complex long tables
     file_name = "ipa20110039701.xml"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
     assert doc.name == file_name
     assert len(doc.tables) == 17
@@ -358,7 +358,7 @@ def test_patent_uspto_grant_v2(patents):
     # CHECK application doc number 06442728
     file_name = "pg06442728.xml"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
     if GENERATE:
         _generate_groundtruth(doc, Path(file_name).stem)
@@ -376,12 +376,12 @@ def test_patent_uspto_grant_v2(patents):
     assert isinstance(texts[2], TextItem)
     assert texts[2].text == (
         "An interleaver receives incoming data frames of size N. The interleaver "
-        "indexes the elements of the frame with an N₁×N₂ index array. The interleaver "
+        "indexes the elements of the frame with an N₁×N₂ index array. The interleaver "  # noqa: RUF001
         "then effectively rearranges (permutes) the data by permuting the rows of the "
-        "index array. The interleaver employs the equation I(j,k)=I(j,αjk+βj)modP) to "
+        "index array. The interleaver employs the equation I(j,k)=I(j,αjk+βj)modP) to "  # noqa: RUF001
         "permute the columns (indexed by k) of each row (indexed by j). P is at least "
         "equal to N₂, βj is a constant which may be different for each row, and each "
-        "αj is a relative prime number relative to P. After permuting, the "
+        "αj is a relative prime number relative to P. After permuting, the "  # noqa: RUF001
         "interleaver outputs the data in a different order than received (e.g., "
         "receives sequentially row by row, outputs sequentially each column by column)."
     )
@@ -402,7 +402,7 @@ def test_patent_uspto_app_v1(patents):
     # CHECK application doc number 20010031492
     file_name = "pa20010031492.xml"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
     if GENERATE:
         _generate_groundtruth(doc, Path(file_name).stem)
@@ -432,7 +432,7 @@ def test_patent_uspto_grant_aps(patents):
     # CHECK application doc number 057006474
     file_name = "pftaps057006474.txt"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
     if GENERATE:
         _generate_groundtruth(doc, Path(file_name).stem)
diff --git a/tests/test_backend_pdfium.py b/tests/test_backend_pdfium.py
index 10a2b9e..317cdee 100644
--- a/tests/test_backend_pdfium.py
+++ b/tests/test_backend_pdfium.py
@@ -32,7 +32,7 @@ def test_text_cell_counts():
     doc_backend = _get_backend(pdf_doc)
-    for page_index in range(0, doc_backend.page_count()):
+    for page_index in range(doc_backend.page_count()):
         last_cell_count = None
         for i in range(10):
             page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
@@ -42,9 +42,9 @@ def test_text_cell_counts():
                 last_cell_count = len(cells)
             if len(cells) != last_cell_count:
-                assert (
-                    False
-                ), "Loading page multiple times yielded non-identical text cell counts"
+                assert False, (
+                    "Loading page multiple times yielded non-identical text cell counts"
+                )
                 last_cell_count = len(cells)
@@ -66,7 +66,7 @@ def test_crop_page_image(test_doc_path):
     page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
     # Crop out "Figure 1" from the DocLayNet paper
-    im = page_backend.get_page_image(
+    page_backend.get_page_image(
         scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
     )
     # im.show()
diff --git a/tests/test_backend_pptx.py b/tests/test_backend_pptx.py
index 947e9e6..4f73c87 100644
--- a/tests/test_backend_pptx.py
+++ b/tests/test_backend_pptx.py
@@ -1,4 +1,3 @@
-import os
 from pathlib import Path
 from docling.datamodel.base_models import InputFormat
@@ -12,7 +11,6 @@
 GENERATE = GEN_TEST_DATA
 def get_pptx_paths():
-
     # Define the directory you want to search
     directory = Path("./tests/data/pptx/")
@@ -22,14 +20,12 @@
 def get_converter():
-
     converter = DocumentConverter(allowed_formats=[InputFormat.PPTX])
     return converter
 def test_e2e_pptx_conversions():
-
     pptx_paths = get_pptx_paths()
     converter = get_converter()
@@ -50,10 +46,10 @@
         pred_itxt: str = doc._export_to_indented_text(
             max_text_len=70, explicit_tables=False
         )
-        assert verify_export(
-            pred_itxt, str(gt_path) + ".itxt"
-        ), "export to indented-text"
+        assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
+            "export to indented-text"
+        )
-        assert verify_document(
-            doc, str(gt_path) + ".json", GENERATE
-        ), "document document"
+        assert verify_document(doc, str(gt_path) + ".json", GENERATE), (
+            "document document"
+        )
diff --git a/tests/test_code_formula.py b/tests/test_code_formula.py
index 085e094..e5d52da 100644
--- a/tests/test_code_formula.py
+++ b/tests/test_code_formula.py
@@ -3,7 +3,6 @@
 from pathlib import Path
 from docling_core.types.doc import CodeItem, TextItem
 from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import PdfPipelineOptions
@@ -12,7 +11,6 @@
 from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
 def get_converter():
-
     pipeline_options = PdfPipelineOptions()
     pipeline_options.generate_page_images = True
diff --git a/tests/test_document_picture_classifier.py b/tests/test_document_picture_classifier.py
index 2ac1da9..5dc5e92 100644
--- a/tests/test_document_picture_classifier.py
+++ b/tests/test_document_picture_classifier.py
@@ -2,7 +2,6 @@
 from pathlib import Path
 from docling_core.types.doc import PictureClassificationData
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import PdfPipelineOptions
@@ -11,7 +10,6 @@
 from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
 def get_converter():
-
     pipeline_options = PdfPipelineOptions()
     pipeline_options.generate_page_images = True
@@ -49,32 +47,32 @@ def test_picture_classifier():
     res = results[0]
     assert len(res.annotations) == 1
-    assert type(res.annotations[0]) == PictureClassificationData
+    assert isinstance(res.annotations[0], PictureClassificationData)
     classification_data = res.annotations[0]
     assert classification_data.provenance == "DocumentPictureClassifier"
-    assert (
-        len(classification_data.predicted_classes) == 16
-    ), "Number of predicted classes is not equal to 16"
+    assert len(classification_data.predicted_classes) == 16, (
+        "Number of predicted classes is not equal to 16"
+    )
     confidences = [pred.confidence for pred in classification_data.predicted_classes]
-    assert confidences == sorted(
-        confidences, reverse=True
-    ), "Predictions are not sorted in descending order of confidence"
-    assert (
-        classification_data.predicted_classes[0].class_name == "bar_chart"
-    ), "The prediction is wrong for the bar chart image."
+    assert confidences == sorted(confidences, reverse=True), (
+        "Predictions are not sorted in descending order of confidence"
+    )
+    assert classification_data.predicted_classes[0].class_name == "bar_chart", (
+        "The prediction is wrong for the bar chart image."
+    )
     res = results[1]
     assert len(res.annotations) == 1
-    assert type(res.annotations[0]) == PictureClassificationData
+    assert isinstance(res.annotations[0], PictureClassificationData)
     classification_data = res.annotations[0]
     assert classification_data.provenance == "DocumentPictureClassifier"
-    assert (
-        len(classification_data.predicted_classes) == 16
-    ), "Number of predicted classes is not equal to 16"
+    assert len(classification_data.predicted_classes) == 16, (
+        "Number of predicted classes is not equal to 16"
+    )
     confidences = [pred.confidence for pred in classification_data.predicted_classes]
-    assert confidences == sorted(
-        confidences, reverse=True
-    ), "Predictions are not sorted in descending order of confidence"
-    assert (
-        classification_data.predicted_classes[0].class_name == "map"
-    ), "The prediction is wrong for the bar chart image."
+    assert confidences == sorted(confidences, reverse=True), (
+        "Predictions are not sorted in descending order of confidence"
+    )
+    assert classification_data.predicted_classes[0].class_name == "map", (
+        "The prediction is wrong for the bar chart image."
+    )
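The classifier test above also swaps `type(x) == T` for `isinstance(x, T)` (pycodestyle rule E721). `isinstance` accepts subclasses, which is almost always the intended semantics for such checks. A small sketch with toy classes (both class names are invented for illustration):

class Annotation: ...
class PictureAnnotation(Annotation): ...

item = PictureAnnotation()

print(type(item) == Annotation)      # False: exact type match only
print(isinstance(item, Annotation))  # True: subclasses are accepted too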
diff --git a/tests/test_e2e_conversion.py b/tests/test_e2e_conversion.py
index 590558f..5dc2e89 100644
--- a/tests/test_e2e_conversion.py
+++ b/tests/test_e2e_conversion.py
@@ -1,7 +1,6 @@
 from pathlib import Path
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
-from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import AcceleratorDevice, PdfPipelineOptions
@@ -15,7 +14,6 @@
 GENERATE_V2 = GEN_TEST_DATA
 def get_pdf_paths():
-
     # Define the directory you want to search
     directory = Path("./tests/data/pdf/")
@@ -25,7 +23,6 @@
 def get_converter():
-
     pipeline_options = PdfPipelineOptions()
     pipeline_options.do_ocr = False
     pipeline_options.do_table_structure = True
@@ -45,7 +42,6 @@ def get_converter():
 def test_e2e_pdfs_conversions():
-
     pdf_paths = get_pdf_paths()
     converter = get_converter()
diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py
index 985a625..63570d0 100644
--- a/tests/test_e2e_ocr_conversion.py
+++ b/tests/test_e2e_ocr_conversion.py
@@ -3,7 +3,6 @@
 from pathlib import Path
 from typing import List
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
-from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
diff --git a/tests/test_input_doc.py b/tests/test_input_doc.py
index 946ad06..94a6887 100644
--- a/tests/test_input_doc.py
+++ b/tests/test_input_doc.py
@@ -12,10 +12,9 @@
 from docling.document_converter import PdfFormatOption
 def test_in_doc_from_valid_path():
-
     test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
     doc = _make_input_doc(test_doc_path)
-    assert doc.valid == True
+    assert doc.valid is True
 def test_in_doc_from_invalid_path():
@@ -23,29 +22,26 @@
     doc = _make_input_doc(test_doc_path)
-    assert doc.valid == False
+    assert doc.valid is False
 def test_in_doc_from_valid_buf():
-
     buf = BytesIO(Path("./tests/data/pdf/2206.01062.pdf").open("rb").read())
     stream = DocumentStream(name="my_doc.pdf", stream=buf)
     doc = _make_input_doc_from_stream(stream)
-    assert doc.valid == True
+    assert doc.valid is True
 def test_in_doc_from_invalid_buf():
-
     buf = BytesIO(b"")
     stream = DocumentStream(name="my_doc.pdf", stream=buf)
     doc = _make_input_doc_from_stream(stream)
-    assert doc.valid == False
+    assert doc.valid is False
 def test_image_in_pdf_backend():
-
     in_doc = InputDocument(
         path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
         format=InputFormat.IMAGE,
@@ -76,7 +72,6 @@
 def test_in_doc_with_page_range():
-
     test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
     limits = DocumentLimits()
     limits.page_range = (1, 10)
@@ -87,7 +82,7 @@
         backend=PyPdfiumDocumentBackend,
         limits=limits,
     )
-    assert doc.valid == True
+    assert doc.valid is True
     limits.page_range = (9, 9)
@@ -97,7 +92,7 @@
         backend=PyPdfiumDocumentBackend,
         limits=limits,
     )
-    assert doc.valid == True
+    assert doc.valid is True
     limits.page_range = (11, 12)
@@ -107,7 +102,7 @@
         backend=PyPdfiumDocumentBackend,
         limits=limits,
     )
-    assert doc.valid == False
+    assert doc.valid is False
 def test_guess_format(tmp_path):
@@ -192,17 +187,17 @@
     )
     doc_path = temp_dir / "docling_test.xml"
     doc_path.write_text(xml_content, encoding="utf-8")
-    assert dci._guess_format(doc_path) == None
+    assert dci._guess_format(doc_path) is None
     buf = BytesIO(Path(doc_path).open("rb").read())
     stream = DocumentStream(name="docling_test.xml", stream=buf)
-    assert dci._guess_format(stream) == None
+    assert dci._guess_format(stream) is None
     # Invalid USPTO patent (as plain text)
     stream = DocumentStream(name="pftaps057006474.txt", stream=BytesIO(b"xyz"))
-    assert dci._guess_format(stream) == None
+    assert dci._guess_format(stream) is None
     doc_path = temp_dir / "pftaps_wrong.txt"
     doc_path.write_text("xyz", encoding="utf-8")
-    assert dci._guess_format(doc_path) == None
+    assert dci._guess_format(doc_path) is None
     # Valid Docling JSON
     test_str = '{"name": ""}'
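test_input_doc.py above replaces `== True` / `== False` / `== None` with identity checks (pycodestyle E712/E711). Equality dispatches to `__eq__`, which arbitrary objects may override, while `is` compares against the singleton directly. A short sketch (the variables are stand-ins):

doc_valid = True
fmt = None

# Flagged style: routed through __eq__, which custom types can intercept.
assert doc_valid == True   # noqa: E712
assert fmt == None         # noqa: E711

# Preferred style: identity check against the singleton.
assert doc_valid is True
assert fmt is None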
diff --git a/tests/test_interfaces.py b/tests/test_interfaces.py
index 29130c5..8d68f29 100644
--- a/tests/test_interfaces.py
+++ b/tests/test_interfaces.py
@@ -4,7 +4,6 @@
 from pathlib import Path
 import pytest
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
-from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
 from docling.datamodel.base_models import DocumentStream, InputFormat
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption
@@ -16,14 +15,12 @@
 GENERATE = GEN_TEST_DATA
 def get_pdf_path():
-
     pdf_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf")
     return pdf_path
 @pytest.fixture
 def converter():
-
     pipeline_options = PdfPipelineOptions()
     pipeline_options.do_ocr = False
     pipeline_options.do_table_structure = True
@@ -42,7 +39,6 @@ def converter():
 def test_convert_path(converter: DocumentConverter):
-
     pdf_path = get_pdf_path()
     print(f"converting {pdf_path}")
@@ -56,7 +52,6 @@ def test_convert_path(converter: DocumentConverter):
 def test_convert_stream(converter: DocumentConverter):
-
     pdf_path = get_pdf_path()
     print(f"converting {pdf_path}")
diff --git a/tests/test_invalid_input.py b/tests/test_invalid_input.py
index 68716cb..3cc7a63 100644
--- a/tests/test_invalid_input.py
+++ b/tests/test_invalid_input.py
@@ -8,7 +8,6 @@
 from docling.document_converter import ConversionError, DocumentConverter
 def get_pdf_path():
-
     pdf_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf")
     return pdf_path
diff --git a/tests/test_legacy_format_transform.py b/tests/test_legacy_format_transform.py
index c46f899..caef8ff 100644
--- a/tests/test_legacy_format_transform.py
+++ b/tests/test_legacy_format_transform.py
@@ -3,8 +3,6 @@
 from pathlib import Path
 import pytest
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
-from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption
@@ -23,7 +21,6 @@ def test_doc_paths():
 def get_converter():
-
     pipeline_options = PdfPipelineOptions()
     pipeline_options.do_ocr = False
diff --git a/tests/verify_utils.py b/tests/verify_utils.py
index 02861a8..1a913c2 100644
--- a/tests/verify_utils.py
+++ b/tests/verify_utils.py
@@ -21,7 +21,6 @@
 from docling.datamodel.document import ConversionResult
 def levenshtein(str1: str, str2: str) -> int:
-
     # Ensure str1 is the shorter string to optimize memory usage
     if len(str1) > len(str2):
         str1, str2 = str2, str1
@@ -46,7 +45,6 @@ def levenshtein(str1: str, str2: str) -> int:
 def verify_text(gt: str, pred: str, fuzzy: bool, fuzzy_threshold: float = 0.4):
-
     if len(gt) == 0 or not fuzzy:
         assert gt == pred, f"{gt}!={pred}"
     else:
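The `levenshtein` helper whose first lines appear above swaps its arguments so that `str1` is the shorter string; that pays off because the classic dynamic program only needs the previous row, giving O(min(len(a), len(b))) memory instead of a full matrix. A sketch of that two-row formulation — an assumption about the elided function body, not a copy of tests/verify_utils.py:

def levenshtein(str1: str, str2: str) -> int:
    # Ensure str1 is the shorter string so the rows we keep stay small.
    if len(str1) > len(str2):
        str1, str2 = str2, str1

    # previous_row[j] = edit distance between the empty prefix and str1[:j]
    previous_row = list(range(len(str1) + 1))
    for i, ch2 in enumerate(str2, start=1):
        current_row = [i]
        for j, ch1 in enumerate(str1, start=1):
            insert_cost = current_row[j - 1] + 1
            delete_cost = previous_row[j] + 1
            replace_cost = previous_row[j - 1] + (ch1 != ch2)
            current_row.append(min(insert_cost, delete_cost, replace_cost))
        previous_row = current_row
    return previous_row[-1]

assert levenshtein("kitten", "sitting") == 3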
@@ -57,22 +55,19 @@
 def verify_cells(doc_pred_pages: List[Page], doc_true_pages: List[Page]):
-
-    assert len(doc_pred_pages) == len(
-        doc_true_pages
-    ), "pred- and true-doc do not have the same number of pages"
+    assert len(doc_pred_pages) == len(doc_true_pages), (
+        "pred- and true-doc do not have the same number of pages"
+    )
     for pid, page_true_item in enumerate(doc_true_pages):
-
         num_true_cells = len(page_true_item.cells)
         num_pred_cells = len(doc_pred_pages[pid].cells)
-        assert (
-            num_true_cells == num_pred_cells
-        ), f"num_true_cells!=num_pred_cells {num_true_cells}!={num_pred_cells}"
+        assert num_true_cells == num_pred_cells, (
+            f"num_true_cells!=num_pred_cells {num_true_cells}!={num_pred_cells}"
+        )
         for cid, cell_true_item in enumerate(page_true_item.cells):
-
             cell_pred_item = doc_pred_pages[pid].cells[cid]
             true_text = cell_true_item.text
@@ -81,9 +76,9 @@ def verify_cells(doc_pred_pages: List[Page], doc_true_pages: List[Page]):
             true_bbox = cell_true_item.rect.to_bounding_box().as_tuple()
             pred_bbox = cell_pred_item.rect.to_bounding_box().as_tuple()
-            assert (
-                true_bbox == pred_bbox
-            ), f"bbox is not the same: {true_bbox} != {pred_bbox}"
+            assert true_bbox == pred_bbox, (
+                f"bbox is not the same: {true_bbox} != {pred_bbox}"
+            )
     return True
@@ -123,19 +118,19 @@ def verify_tables_v1(doc_pred: DsDocument, doc_true: DsDocument, fuzzy: bool):
     # print("Expected number of tables: {}, result: {}".format(len(doc_true.tables), len(doc_pred.tables)))
-    assert len(doc_true.tables) == len(
-        doc_pred.tables
-    ), "document has different count of tables than expected."
+    assert len(doc_true.tables) == len(doc_pred.tables), (
+        "document has different count of tables than expected."
+    )
-    for l, true_item in enumerate(doc_true.tables):
-        pred_item = doc_pred.tables[l]
+    for ix, true_item in enumerate(doc_true.tables):
+        pred_item = doc_pred.tables[ix]
-        assert (
-            true_item.num_rows == pred_item.num_rows
-        ), "table does not have the same #-rows"
-        assert (
-            true_item.num_cols == pred_item.num_cols
-        ), "table does not have the same #-cols"
+        assert true_item.num_rows == pred_item.num_rows, (
+            "table does not have the same #-rows"
+        )
+        assert true_item.num_cols == pred_item.num_cols, (
+            "table does not have the same #-cols"
+        )
         assert true_item.data is not None, "documents are expected to have table data"
         assert pred_item.data is not None, "documents are expected to have table data"
@@ -145,7 +140,6 @@ def verify_tables_v1(doc_pred: DsDocument, doc_true: DsDocument, fuzzy: bool):
         for i, row in enumerate(true_item.data):
             for j, col in enumerate(true_item.data[i]):
-
                 # print("true: ", true_item.data[i][j].text)
                 # print("pred: ", pred_item.data[i][j].text)
                 # print("")
                 verify_text(
                     true_item.data[i][j].text, pred_item.data[i][j].text, fuzzy=fuzzy
                 )
@@ -154,20 +148,20 @@ def verify_tables_v1(doc_pred: DsDocument, doc_true: DsDocument, fuzzy: bool):
-                assert (
-                    true_item.data[i][j].obj_type == pred_item.data[i][j].obj_type
-                ), "table-cell does not have the same type"
+                assert true_item.data[i][j].obj_type == pred_item.data[i][j].obj_type, (
+                    "table-cell does not have the same type"
+                )
     return True
 def verify_table_v2(true_item: TableItem, pred_item: TableItem, fuzzy: bool):
-    assert (
-        true_item.data.num_rows == pred_item.data.num_rows
-    ), "table does not have the same #-rows"
-    assert (
-        true_item.data.num_cols == pred_item.data.num_cols
-    ), "table does not have the same #-cols"
+    assert true_item.data.num_rows == pred_item.data.num_rows, (
+        "table does not have the same #-rows"
+    )
+    assert true_item.data.num_cols == pred_item.data.num_cols, (
+        "table does not have the same #-cols"
+    )
     assert true_item.data is not None, "documents are expected to have table data"
     assert pred_item.data is not None, "documents are expected to have table data"
@@ -177,7 +171,6 @@ def verify_table_v2(true_item: TableItem, pred_item: TableItem, fuzzy: bool):
     for i, row in enumerate(true_item.data.grid):
         for j, col in enumerate(true_item.data.grid[i]):
-
             # print("true: ", true_item.data[i][j].text)
             # print("pred: ", pred_item.data[i][j].text)
             # print("")
@@ -223,11 +216,11 @@ def verify_docitems(doc_pred: DoclingDocument, doc_true: DoclingDocument, fuzzy: bool):
-    assert len(doc_pred.texts) == len(doc_true.texts), f"Text lengths do not match."
+    assert len(doc_pred.texts) == len(doc_true.texts), "Text lengths do not match."
-    assert len(doc_true.tables) == len(
-        doc_pred.tables
-    ), "document has different count of tables than expected."
+    assert len(doc_true.tables) == len(doc_pred.tables), (
+        "document has different count of tables than expected."
+    )
     for (true_item, _true_level), (pred_item, _pred_level) in zip(
         doc_true.iterate_items(), doc_pred.iterate_items()
@@ -237,7 +230,7 @@ def verify_docitems(doc_pred: DoclingDocument, doc_true: DoclingDocument, fuzzy: bool):
         assert isinstance(pred_item, DocItem), "Test item is not a DocItem"
         # Validate type
-        assert true_item.label == pred_item.label, f"Object label does not match."
+        assert true_item.label == pred_item.label, "Object label does not match."
         # Validate provenance
         assert len(true_item.prov) == len(pred_item.prov), "Length of prov mismatch"
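Nearly every hunk in this patch is the same mechanical rewrite: Black wrapped the asserted condition in parentheses and split it across lines, while ruff format keeps the condition on the `assert` line and parenthesizes the long message instead. The shape of the change, on a self-contained example (the values are stand-ins for illustration):

true_label = pred_label = "text"  # stand-in values

# Before (Black): the condition is parenthesized and split.
assert (
    true_label == pred_label
), "Object label does not match."

# After (ruff format): the condition stays inline; the message is
# wrapped in parentheses so the line fits within the length limit.
assert true_label == pred_label, (
    "Object label does not match."
)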
@@ -261,25 +254,25 @@ def verify_docitems(doc_pred: DoclingDocument, doc_true: DoclingDocument, fuzzy: bool):
         # Validate table content
         if isinstance(true_item, TableItem):
-            assert isinstance(
-                pred_item, TableItem
-            ), "Test item is not a TableItem as the expected one"
-            assert verify_table_v2(
-                true_item, pred_item, fuzzy=fuzzy
-            ), "Tables not matching"
+            assert isinstance(pred_item, TableItem), (
+                "Test item is not a TableItem as the expected one"
+            )
+            assert verify_table_v2(true_item, pred_item, fuzzy=fuzzy), (
+                "Tables not matching"
+            )
         # Validate picture content
         if isinstance(true_item, PictureItem):
-            assert isinstance(
-                pred_item, PictureItem
-            ), "Test item is not a PictureItem as the expected one"
+            assert isinstance(pred_item, PictureItem), (
+                "Test item is not a PictureItem as the expected one"
+            )
             true_image = true_item.get_image(doc=doc_true)
             pred_image = true_item.get_image(doc=doc_pred)
             if true_image is not None:
-                assert verify_picture_image_v2(
-                    true_image, pred_image
-                ), "Picture image mismatch"
+                assert verify_picture_image_v2(true_image, pred_image), (
+                    "Picture image mismatch"
+                )
             # TODO: check picture annotations
@@ -298,14 +291,14 @@
     input_path: Path,
     doc_result: ConversionResult,
     generate: bool = False,
-    ocr_engine: str = None,
+    ocr_engine: Optional[str] = None,
     fuzzy: bool = False,
 ):
     PageList = TypeAdapter(List[Page])
-    assert (
-        doc_result.status == ConversionStatus.SUCCESS
-    ), f"Doc {input_path} did not convert successfully."
+    assert doc_result.status == ConversionStatus.SUCCESS, (
+        f"Doc {input_path} did not convert successfully."
+    )
     doc_pred_pages: List[Page] = doc_result.pages
     doc_pred: DsDocument = doc_result.legacy_document
@@ -344,52 +337,52 @@ def verify_conversion_result_v1(
         with open(dt_path, "w") as fw:
             fw.write(doc_pred_dt)
     else:  # default branch in test
-        with open(pages_path, "r") as fr:
+        with open(pages_path) as fr:
             doc_true_pages = PageList.validate_json(fr.read())
-        with open(json_path, "r") as fr:
+        with open(json_path) as fr:
             doc_true: DsDocument = DsDocument.model_validate_json(fr.read())
-        with open(md_path, "r") as fr:
+        with open(md_path) as fr:
             doc_true_md = fr.read()
-        with open(dt_path, "r") as fr:
+        with open(dt_path) as fr:
             doc_true_dt = fr.read()
         if not fuzzy:
-            assert verify_cells(
-                doc_pred_pages, doc_true_pages
-            ), f"Mismatch in PDF cell prediction for {input_path}"
+            assert verify_cells(doc_pred_pages, doc_true_pages), (
+                f"Mismatch in PDF cell prediction for {input_path}"
+            )
         # assert verify_output(
         #     doc_pred, doc_true
         # ), f"Mismatch in JSON prediction for {input_path}"
-        assert verify_tables_v1(
-            doc_pred, doc_true, fuzzy=fuzzy
-        ), f"verify_tables(doc_pred, doc_true) mismatch for {input_path}"
+        assert verify_tables_v1(doc_pred, doc_true, fuzzy=fuzzy), (
+            f"verify_tables(doc_pred, doc_true) mismatch for {input_path}"
+        )
-        assert verify_md(
-            doc_pred_md, doc_true_md, fuzzy=fuzzy
-        ), f"Mismatch in Markdown prediction for {input_path}"
+        assert verify_md(doc_pred_md, doc_true_md, fuzzy=fuzzy), (
+            f"Mismatch in Markdown prediction for {input_path}"
+        )
-        assert verify_dt(
-            doc_pred_dt, doc_true_dt, fuzzy=fuzzy
-        ), f"Mismatch in DocTags prediction for {input_path}"
+        assert verify_dt(doc_pred_dt, doc_true_dt, fuzzy=fuzzy), (
+            f"Mismatch in DocTags prediction for {input_path}"
+        )
 def verify_conversion_result_v2(
     input_path: Path,
     doc_result: ConversionResult,
     generate: bool = False,
-    ocr_engine: str = None,
+    ocr_engine: Optional[str] = None,
     fuzzy: bool = False,
 ):
     PageList = TypeAdapter(List[Page])
-    assert (
-        doc_result.status == ConversionStatus.SUCCESS
-    ), f"Doc {input_path} did not convert successfully."
+    assert doc_result.status == ConversionStatus.SUCCESS, (
+        f"Doc {input_path} did not convert successfully."
+    )
     doc_pred_pages: List[Page] = doc_result.pages
     doc_pred: DoclingDocument = doc_result.document
@@ -426,42 +419,41 @@ def verify_conversion_result_v2(
         with open(dt_path, "w") as fw:
             fw.write(doc_pred_dt)
     else:  # default branch in test
-        with open(pages_path, "r") as fr:
+        with open(pages_path) as fr:
             doc_true_pages = PageList.validate_json(fr.read())
-        with open(json_path, "r") as fr:
+        with open(json_path) as fr:
             doc_true: DoclingDocument = DoclingDocument.model_validate_json(fr.read())
-        with open(md_path, "r") as fr:
+        with open(md_path) as fr:
             doc_true_md = fr.read()
-        with open(dt_path, "r") as fr:
+        with open(dt_path) as fr:
             doc_true_dt = fr.read()
         if not fuzzy:
-            assert verify_cells(
-                doc_pred_pages, doc_true_pages
-            ), f"Mismatch in PDF cell prediction for {input_path}"
+            assert verify_cells(doc_pred_pages, doc_true_pages), (
+                f"Mismatch in PDF cell prediction for {input_path}"
+            )
         # assert verify_output(
         #     doc_pred, doc_true
         # ), f"Mismatch in JSON prediction for {input_path}"
-        assert verify_docitems(
-            doc_pred, doc_true, fuzzy=fuzzy
-        ), f"verify_docling_document(doc_pred, doc_true) mismatch for {input_path}"
+        assert verify_docitems(doc_pred, doc_true, fuzzy=fuzzy), (
+            f"verify_docling_document(doc_pred, doc_true) mismatch for {input_path}"
+        )
-        assert verify_md(
-            doc_pred_md, doc_true_md, fuzzy=fuzzy
-        ), f"Mismatch in Markdown prediction for {input_path}"
+        assert verify_md(doc_pred_md, doc_true_md, fuzzy=fuzzy), (
+            f"Mismatch in Markdown prediction for {input_path}"
+        )
-        assert verify_dt(
-            doc_pred_dt, doc_true_dt, fuzzy=fuzzy
-        ), f"Mismatch in DocTags prediction for {input_path}"
+        assert verify_dt(doc_pred_dt, doc_true_dt, fuzzy=fuzzy), (
+            f"Mismatch in DocTags prediction for {input_path}"
+        )
 def verify_document(pred_doc: DoclingDocument, gtfile: str, generate: bool = False):
-
     if not os.path.exists(gtfile) or generate:
         with open(gtfile, "w") as fw:
             json.dump(pred_doc.export_to_dict(), fw, indent=2)
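Two smaller fixes recur in verify_utils.py above: a parameter defaulting to `None` must be annotated `Optional[str]` rather than `str` (implicit Optional is rejected by recent mypy), and the redundant `"r"` mode is dropped from `open()`, which reads text by default (Ruff rule UP015). Both together in a minimal, hypothetical helper (not part of the repo):

from pathlib import Path
from typing import Optional

def read_groundtruth(input_path: Path, ocr_engine: Optional[str] = None) -> str:
    # open() defaults to mode="r" (text, read-only), so no mode argument is needed.
    with open(input_path) as fr:
        content = fr.read()
    # `is not None` mirrors the identity-comparison style used throughout the patch.
    if ocr_engine is not None:
        content = f"[{ocr_engine}] {content}"
    return content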