ci: add coverage and ruff (#1383)
* add coverage calculation and push
* new codecov version and usage of token
* enable ruff formatter instead of black and isort
* apply ruff lint fixes
* apply ruff unsafe fixes
* add removed imports
* runs 1 on linter issues
* finalize linter fixes
* Update pyproject.toml

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
parent 293c28ca7c
commit 5458a88464
.github/codecov.yml (new file, 17 lines)

@@ -0,0 +1,17 @@
+codecov:
+  # https://docs.codecov.io/docs/comparing-commits
+  allow_coverage_offsets: true
+coverage:
+  status:
+    project:
+      default:
+        informational: true
+        target: auto # auto compares coverage to the previous base commit
+        flags:
+          - docling
+comment:
+  layout: "reach, diff, flags, files"
+  behavior: default
+  require_changes: false # if true: only post the comment if coverage changes
+  branches: # branch names that can post comment
+    - "main"
.github/workflows/cd.yml (2 changes)

@@ -10,6 +10,8 @@ env:
 jobs:
   code-checks:
     uses: ./.github/workflows/checks.yml
+    with:
+      push_coverage: false
   pre-release-check:
     runs-on: ubuntu-latest
     outputs:
.github/workflows/checks.yml (16 changes)

@@ -1,5 +1,13 @@
 on:
   workflow_call:
+    inputs:
+      push_coverage:
+        type: boolean
+        description: "If true, the coverage results are pushed to codecov.io."
+        default: true
+    secrets:
+      CODECOV_TOKEN:
+        required: false

 env:
   HF_HUB_DOWNLOAD_TIMEOUT: "60"

@@ -32,7 +40,13 @@ jobs:
         run: poetry install --all-extras
       - name: Testing
         run: |
-          poetry run pytest -v tests
+          poetry run pytest -v --cov=docling --cov-report=xml tests
+      - name: Upload coverage to Codecov
+        if: inputs.push_coverage
+        uses: codecov/codecov-action@v5
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          file: ./coverage.xml
       - name: Run examples
        run: |
          for file in docs/examples/*.py; do
.github/workflows/ci.yml (2 changes)

@@ -17,3 +17,5 @@ jobs:
   code-checks:
     if: ${{ github.event_name == 'push' || (github.event.pull_request.head.repo.full_name != 'docling-project/docling' && github.event.pull_request.head.repo.full_name != 'docling-project/docling') }}
     uses: ./.github/workflows/checks.yml
+    secrets:
+      CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
@@ -1,43 +1,26 @@
 fail_fast: true
 repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.11.5
+    hooks:
+      # Run the Ruff formatter.
+      - id: ruff-format
+        name: "Ruff formatter"
+        args: [--config=pyproject.toml]
+        files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
+      # Run the Ruff linter.
+      - id: ruff
+        name: "Ruff linter"
+        args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
+        files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
   - repo: local
     hooks:
-      - id: black
-        name: Black
-        entry: poetry run black docling docs/examples tests
-        pass_filenames: false
-        language: system
-        files: '\.py$'
-      - id: isort
-        name: isort
-        entry: poetry run isort docling docs/examples tests
-        pass_filenames: false
-        language: system
-        files: '\.py$'
-      # - id: flake8
-      #   name: flake8
-      #   entry: poetry run flake8 docling
-      #   pass_filenames: false
-      #   language: system
-      #   files: '\.py$'
       - id: mypy
         name: MyPy
         entry: poetry run mypy docling
         pass_filenames: false
         language: system
         files: '\.py$'
-      - id: nbqa_black
-        name: nbQA Black
-        entry: poetry run nbqa black docs/examples
-        pass_filenames: false
-        language: system
-        files: '\.ipynb$'
-      - id: nbqa_isort
-        name: nbQA isort
-        entry: poetry run nbqa isort docs/examples
-        pass_filenames: false
-        language: system
-        files: '\.ipynb$'
       - id: poetry
         name: Poetry check
         entry: poetry check --lock
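Note: the Ruff hooks above are pointed at --config=pyproject.toml, but the pyproject.toml changes themselves are not shown in this excerpt (the commit message only mentions "Update pyproject.toml"). As a rough, hypothetical sketch of what such a [tool.ruff] section can look like — the rule selection, line length, and complexity threshold below are assumptions for illustration, not the project's actual settings:

# Hypothetical example only; the real configuration lives in the repository's pyproject.toml.
[tool.ruff]
line-length = 88           # assumption: Black-compatible line length
target-version = "py39"    # assumption: minimum supported Python version

[tool.ruff.lint]
# "I" covers import sorting (replacing isort); "C90" provides the C901 complexity check
# referenced by the "# noqa: C901" markers added throughout this commit.
select = ["E", "F", "I", "C90", "RUF"]

[tool.ruff.lint.mccabe]
max-complexity = 15        # assumption: threshold for the C901 rule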
@@ -34,7 +34,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                 text_stream = self.path_or_stream.getvalue().decode("utf-8")
                 self.lines = text_stream.split("\n")
             if isinstance(self.path_or_stream, Path):
-                with open(self.path_or_stream, "r", encoding="utf-8") as f:
+                with open(self.path_or_stream, encoding="utf-8") as f:
                     self.lines = f.readlines()
             self.valid = True

@@ -75,14 +75,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):

         return doc

-    def _parse(self, doc: DoclingDocument):
+    def _parse(self, doc: DoclingDocument):  # noqa: C901
         """
         Main function that orchestrates the parsing by yielding components:
         title, section headers, text, lists, and tables.
         """

-        content = ""
-
         in_list = False
         in_table = False

@@ -95,7 +93,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         # indents: dict[int, Union[DocItem, GroupItem, None]] = {}
         indents: dict[int, Union[GroupItem, None]] = {}

-        for i in range(0, 10):
+        for i in range(10):
             parents[i] = None
             indents[i] = None

@@ -125,7 +123,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):

             # Lists
             elif self._is_list_item(line):
-
                 _log.debug(f"line: {line}")
                 item = self._parse_list_item(line)
                 _log.debug(f"parsed list-item: {item}")

@@ -147,7 +144,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                     indents[level + 1] = item["indent"]

                 elif in_list and item["indent"] < indents[level]:
-
                     # print(item["indent"], " => ", indents[level])
                     while item["indent"] < indents[level]:
                         # print(item["indent"], " => ", indents[level])

@@ -176,7 +172,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
             elif in_table and (
                 (not self._is_table_line(line)) or line.strip() == "|==="
             ):  # end of table
-
                 caption = None
                 if len(caption_data) > 0:
                     caption = doc.add_text(

@@ -195,7 +190,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):

             # Picture
             elif self._is_picture(line):
-
                 caption = None
                 if len(caption_data) > 0:
                     caption = doc.add_text(

@@ -250,7 +244,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                 text_data = []

             elif len(line.strip()) > 0:  # allow multiline texts
-
                 item = self._parse_text(line)
                 text_data.append(item["text"])

@@ -273,14 +266,14 @@ class AsciiDocBackend(DeclarativeDocumentBackend):

     def _get_current_level(self, parents):
         for k, v in parents.items():
-            if v == None and k > 0:
+            if v is None and k > 0:
                 return k - 1

         return 0

     def _get_current_parent(self, parents):
         for k, v in parents.items():
-            if v == None and k > 0:
+            if v is None and k > 0:
                 return parents[k - 1]

         return None

@@ -328,7 +321,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                     "marker": marker,
                     "text": text.strip(),
                     "numbered": False,
-                    "indent": 0 if indent == None else len(indent),
+                    "indent": 0 if indent is None else len(indent),
                 }
             else:
                 return {

@@ -336,7 +329,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                     "marker": marker,
                     "text": text.strip(),
                     "numbered": True,
-                    "indent": 0 if indent == None else len(indent),
+                    "indent": 0 if indent is None else len(indent),
                 }
         else:
             # Fallback if no match

@@ -357,7 +350,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         return [cell.strip() for cell in line.split("|") if cell.strip()]

     def _populate_table_as_grid(self, table_data):
-
         num_rows = len(table_data)

         # Adjust the table data into a grid format
@@ -58,7 +58,7 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
         head = self.content.readline()
         dialect = csv.Sniffer().sniff(head, ",;\t|:")
         _log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"')
-        if not dialect.delimiter in {",", ";", "\t", "|", ":"}:
+        if dialect.delimiter not in {",", ";", "\t", "|", ":"}:
             raise RuntimeError(
                 f"Cannot convert csv with unknown delimiter {dialect.delimiter}."
             )
@@ -1,8 +1,9 @@
 import logging
 import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import Iterable, List, Optional, Union
+from typing import List, Optional, Union

 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin, Size

@@ -156,7 +157,6 @@ class DoclingParsePageBackend(PdfPageBackend):
     def get_page_image(
         self, scale: float = 1, cropbox: Optional[BoundingBox] = None
     ) -> Image.Image:
-
         page_size = self.get_size()

         if not cropbox:
@@ -1,8 +1,9 @@
 import logging
 import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+from typing import TYPE_CHECKING, List, Optional, Union

 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin

@@ -172,7 +173,6 @@ class DoclingParseV2PageBackend(PdfPageBackend):
     def get_page_image(
         self, scale: float = 1, cropbox: Optional[BoundingBox] = None
     ) -> Image.Image:
-
         page_size = self.get_size()

         if not cropbox:
@@ -1,14 +1,14 @@
 import logging
-import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+from typing import TYPE_CHECKING, Optional, Union

 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell
 from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
-from PIL import Image, ImageDraw
+from PIL import Image
 from pypdfium2 import PdfPage

 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend

@@ -93,7 +93,6 @@ class DoclingParseV4PageBackend(PdfPageBackend):
     def get_page_image(
         self, scale: float = 1, cropbox: Optional[BoundingBox] = None
     ) -> Image.Image:
-
         page_size = self.get_size()

         if not cropbox:
@@ -1,12 +1,8 @@
-# -*- coding: utf-8 -*-
-
 """
 Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
 On 23/01/2025
 """

-from __future__ import unicode_literals
-
 CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")

 BLANK = ""

@@ -79,7 +75,6 @@ CHR_BO = {
 }

 T = {
-    "\u2192": "\\rightarrow ",
     # Greek letters
     "\U0001d6fc": "\\alpha ",
     "\U0001d6fd": "\\beta ",
@@ -76,8 +76,7 @@ def get_val(key, default=None, store=CHR):
     return default


-class Tag2Method(object):
-
+class Tag2Method:
     def call_method(self, elm, stag=None):
         getmethod = self.tag2meth.get
         if stag is None:

@@ -130,7 +129,6 @@ class Tag2Method(object):


 class Pr(Tag2Method):
-
     text = ""

     __val_tags = ("chr", "pos", "begChr", "endChr", "type")

@@ -159,7 +157,7 @@ class Pr(Tag2Method):
     def do_common(self, elm):
         stag = elm.tag.replace(OMML_NS, "")
         if stag in self.__val_tags:
-            t = elm.get("{0}val".format(OMML_NS))
+            t = elm.get(f"{OMML_NS}val")
             self.__innerdict[stag] = t
         return None

@@ -248,7 +246,6 @@ class oMath2Latex(Tag2Method):
         """
         the Pre-Sub-Superscript object -- Not support yet
         """
-        pass

     def do_sub(self, elm):
         text = self.process_children(elm)

@@ -331,7 +328,7 @@ class oMath2Latex(Tag2Method):
         t_dict = self.process_children_dict(elm, include=("e", "lim"))
         latex_s = LIM_FUNC.get(t_dict["e"])
         if not latex_s:
-            raise NotSupport("Not support lim %s" % t_dict["e"])
+            raise RuntimeError("Not support lim {}".format(t_dict["e"]))
         else:
             return latex_s.format(lim=t_dict.get("lim"))

@@ -413,7 +410,7 @@ class oMath2Latex(Tag2Method):
         """
         _str = []
         _base_str = []
-        found_text = elm.findtext("./{0}t".format(OMML_NS))
+        found_text = elm.findtext(f"./{OMML_NS}t")
         if found_text:
             for s in found_text:
                 out_latex_str = self.process_unicode(s)
@@ -55,7 +55,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         self.max_levels = 10
         self.level = 0
         self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
-        for i in range(0, self.max_levels):
+        for i in range(self.max_levels):
             self.parents[i] = None

         try:

@@ -126,7 +126,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         return doc

     def walk(self, tag: Tag, doc: DoclingDocument) -> None:
-
         # Iterate over elements in the body of the document
         text: str = ""
         for element in tag.children:

@@ -135,7 +134,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                     self.analyze_tag(cast(Tag, element), doc)
                 except Exception as exc_child:
                     _log.error(
-                        f"Error processing child from tag {tag.name}: {repr(exc_child)}"
+                        f"Error processing child from tag {tag.name}: {exc_child!r}"
                     )
                     raise exc_child
             elif isinstance(element, NavigableString) and not isinstance(

@@ -147,7 +146,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                     item for item in element.next_siblings if isinstance(item, Tag)
                 ]
                 if element.next_sibling is None or any(
-                    [item.name in TAGS_FOR_NODE_ITEMS for item in siblings]
+                    item.name in TAGS_FOR_NODE_ITEMS for item in siblings
                 ):
                     text = text.strip()
                     if text and tag.name in ["div"]:

@@ -222,7 +221,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 )
             else:
                 if hlevel > self.level:
-
                     # add invisible group
                     for i in range(self.level + 1, hlevel):
                         self.parents[i] = doc.add_group(

@@ -234,7 +232,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                     self.level = hlevel

                 elif hlevel < self.level:
-
                     # remove the tail
                     for key in self.parents.keys():
                         if key > hlevel:

@@ -360,7 +357,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             marker = ""
             enumerated = False
             if parent_label == GroupLabel.ORDERED_LIST:
-                marker = f"{str(index_in_list)}."
+                marker = f"{index_in_list!s}."
                 enumerated = True
             doc.add_list_item(
                 text=text,
@@ -83,7 +83,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 # otherwise they represent emphasis (bold or italic)
                 self.markdown = self._shorten_underscore_sequences(text_stream)
             if isinstance(self.path_or_stream, Path):
-                with open(self.path_or_stream, "r", encoding="utf-8") as f:
+                with open(self.path_or_stream, encoding="utf-8") as f:
                     md_content = f.read()
                     # remove invalid sequences
                     # very long sequences of underscores will lead to unnecessary long processing times.

@@ -168,7 +168,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             )
             self.inline_texts = []

-    def _iterate_elements(
+    def _iterate_elements(  # noqa: C901
         self,
         element: marko.element.Element,
         depth: int,

@@ -176,7 +176,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         visited: Set[marko.element.Element],
         parent_item: Optional[NodeItem] = None,
     ):
-
         if element in visited:
             return

@@ -236,7 +235,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             if has_non_empty_list_items:
                 label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
                 parent_item = doc.add_group(
-                    label=label, name=f"list", parent=parent_item
+                    label=label, name="list", parent=parent_item
                 )

         elif (

@@ -320,7 +319,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             self._html_blocks += 1
             self._process_inline_text(parent_item, doc)
             self._close_table(doc)
-            _log.debug("HTML Block: {}".format(element))
+            _log.debug(f"HTML Block: {element}")
             if (
                 len(element.body) > 0
             ):  # If Marko doesn't return any content for HTML block, skip it

@@ -332,7 +331,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         else:
             if not isinstance(element, str):
                 self._close_table(doc)
-                _log.debug("Some other element: {}".format(element))
+                _log.debug(f"Some other element: {element}")

         processed_block_types = (
             marko.block.Heading,

@@ -398,7 +397,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):

         # if HTML blocks were detected, export to HTML and delegate to HTML backend
         if self._html_blocks > 0:
-
             # export to HTML
             html_backend_cls = HTMLDocumentBackend
             html_str = doc.export_to_html()
@@ -184,7 +184,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
         """

         if self.workbook is not None:
-
             # Iterate over all sheets
             for sheet_name in self.workbook.sheetnames:
                 _log.info(f"Processing sheet: {sheet_name}")

@@ -253,7 +252,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
         )

         for excel_cell in excel_table.data:
-
             cell = TableCell(
                 text=excel_cell.text,
                 row_span=excel_cell.row_span,

@@ -303,7 +301,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
         # Iterate over all cells in the sheet
         for ri, row in enumerate(sheet.iter_rows(values_only=False)):
             for rj, cell in enumerate(row):
-
                 # Skip empty or already visited cells
                 if cell.value is None or (ri, rj) in visited:
                     continue

@@ -342,7 +339,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
         visited_cells: set[tuple[int, int]] = set()
         for ri in range(start_row, max_row + 1):
             for rj in range(start_col, max_col + 1):
-
                 cell = sheet.cell(row=ri + 1, column=rj + 1)  # 1-based indexing

                 # Check if the cell belongs to a merged range

@@ -350,14 +346,12 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
                 col_span = 1

                 for merged_range in sheet.merged_cells.ranges:
-
                     if (
                         merged_range.min_row <= ri + 1
                         and ri + 1 <= merged_range.max_row
                         and merged_range.min_col <= rj + 1
                         and rj + 1 <= merged_range.max_col
                     ):
-
                         row_span = merged_range.max_row - merged_range.min_row + 1
                         col_span = merged_range.max_col - merged_range.min_col + 1
                         break

@@ -499,7 +493,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
                         ),
                     ),
                 )
-            except:
+            except Exception:
                 _log.error("could not extract the image from excel sheets")

         return doc
@@ -120,13 +120,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):

         return prov

-    def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
+    def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):  # noqa: C901
         is_a_list = False
         is_list_group_created = False
         enum_list_item_value = 0
         new_list = None
         bullet_type = "None"
-        list_text = ""
         list_label = GroupLabel.LIST
         doc_label = DocItemLabel.LIST_ITEM
         prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)

@@ -243,7 +242,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
                     enum_marker = str(enum_list_item_value) + "."
                 if not is_list_group_created:
                     new_list = doc.add_group(
-                        label=list_label, name=f"list", parent=parent_slide
+                        label=list_label, name="list", parent=parent_slide
                     )
                     is_list_group_created = True
                 doc.add_list_item(

@@ -368,11 +367,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
         slide_width = pptx_obj.slide_width
         slide_height = pptx_obj.slide_height

-        text_content = []  # type: ignore
-
         max_levels = 10
         parents = {}  # type: ignore
-        for i in range(0, max_levels):
+        for i in range(max_levels):
             parents[i] = None

         # Loop through each slide

@@ -383,7 +380,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
             )

             slide_size = Size(width=slide_width, height=slide_height)
-            parent_page = doc.add_page(page_no=slide_ind + 1, size=slide_size)
+            doc.add_page(page_no=slide_ind + 1, size=slide_size)

             def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):
                 handle_groups(shape, parent_slide, slide_ind, doc, slide_size)
@@ -158,7 +158,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
     def _get_level(self) -> int:
         """Return the first None index."""
         for k, v in self.parents.items():
-            if k >= 0 and v == None:
+            if k >= 0 and v is None:
                 return k
         return 0

@@ -418,7 +418,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             else prev_parent
         )

-    def _handle_text_elements(
+    def _handle_text_elements(  # noqa: C901
         self,
         element: BaseOxmlElement,
         docx_obj: DocxDocument,

@@ -812,7 +812,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
             )
             if cell is None or cell._tc in cell_set:
-                _log.debug(f" skipped since repeated content")
+                _log.debug(" skipped since repeated content")
                 col_idx += cell.grid_span
                 continue
             else:

@@ -879,7 +879,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                     image=ImageRef.from_pil(image=pil_image, dpi=72),
                     caption=None,
                 )
-            except (UnidentifiedImageError, OSError) as e:
+            except (UnidentifiedImageError, OSError):
                 _log.warning("Warning: image cannot be loaded by Pillow")
                 doc.add_picture(
                     parent=self.parents[level - 1],
@@ -1,7 +1,8 @@
 from abc import ABC, abstractmethod
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import Iterable, Optional, Set, Union
+from typing import Optional, Set, Union

 from docling_core.types.doc import BoundingBox, Size
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell
@@ -1,8 +1,9 @@
 import logging
 import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+from typing import TYPE_CHECKING, List, Optional, Union

 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c

@@ -29,7 +30,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
         self.valid = True  # No better way to tell from pypdfium.
         try:
             self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
-        except PdfiumError as e:
+        except PdfiumError:
             _log.info(
                 f"An exception occurred when loading page {page_no} of document {document_hash}.",
                 exc_info=True,

@@ -225,7 +226,6 @@ class PyPdfiumPageBackend(PdfPageBackend):
     def get_page_image(
         self, scale: float = 1, cropbox: Optional[BoundingBox] = None
     ) -> Image.Image:
-
         page_size = self.get_size()

         if not cropbox:
@@ -102,13 +102,13 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):

         doc_info: etree.DocInfo = self.tree.docinfo
         if doc_info.system_url and any(
-            [kwd in doc_info.system_url for kwd in JATS_DTD_URL]
+            kwd in doc_info.system_url for kwd in JATS_DTD_URL
         ):
             self.valid = True
             return
         for ent in doc_info.internalDTD.iterentities():
             if ent.system_url and any(
-                [kwd in ent.system_url for kwd in JATS_DTD_URL]
+                kwd in ent.system_url for kwd in JATS_DTD_URL
             ):
                 self.valid = True
                 return

@@ -232,10 +232,9 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
                 # TODO: once superscript is supported, add label with formatting
                 aff = aff.removeprefix(f"{label[0].text}, ")
             affiliation_names.append(aff)
-        affiliation_ids_names = {
-            id: name
-            for id, name in zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
-        }
+        affiliation_ids_names = dict(
+            zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
+        )

         # Get author names and affiliation names
         for author_node in meta.xpath(

@@ -300,7 +299,6 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
     def _add_abstract(
         self, doc: DoclingDocument, xml_components: XMLComponents
     ) -> None:
-
         for abstract in xml_components["abstract"]:
             text: str = abstract["content"]
             title: str = abstract["label"] or DEFAULT_HEADER_ABSTRACT

@@ -349,7 +347,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):

         return

-    def _parse_element_citation(self, node: etree._Element) -> str:
+    def _parse_element_citation(self, node: etree._Element) -> str:  # noqa: C901
         citation: Citation = {
             "author_names": "",
             "title": "",

@@ -440,7 +438,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
             citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip()
             if len(node.xpath("lpage")) > 0:
                 citation["page"] += (
-                    "–" + node.xpath("lpage")[0].text.replace("\n", " ").strip()
+                    "–" + node.xpath("lpage")[0].text.replace("\n", " ").strip()  # noqa: RUF001
                 )

         # Flatten the citation to string

@@ -595,9 +593,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):

             try:
                 self._add_table(doc, parent, table)
-            except Exception as e:
-                _log.warning(f"Skipping unsupported table in {str(self.file)}")
-                pass
+            except Exception:
+                _log.warning(f"Skipping unsupported table in {self.file!s}")

         return

@@ -609,7 +606,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
         )
         return

-    def _walk_linear(
+    def _walk_linear(  # noqa: C901
         self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
     ) -> str:
         skip_tags = ["term"]
@@ -122,7 +122,6 @@ class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):

     @override
     def convert(self) -> DoclingDocument:
-
         if self.parser is not None:
             doc = self.parser.parse(self.patent_content)
             if doc is None:

@@ -163,7 +162,6 @@ class PatentUspto(ABC):
         Returns:
             The patent parsed as a docling document.
         """
-        pass


 class PatentUsptoIce(PatentUspto):

@@ -265,7 +263,7 @@ class PatentUsptoIce(PatentUspto):
             self.style_html = HtmlEntity()

         @override
-        def startElement(self, tag, attributes):  # noqa: N802
+        def startElement(self, tag, attributes):
            """Signal the start of an element.

             Args:

@@ -281,7 +279,7 @@ class PatentUsptoIce(PatentUspto):
             self._start_registered_elements(tag, attributes)

         @override
-        def skippedEntity(self, name):  # noqa: N802
+        def skippedEntity(self, name):
             """Receive notification of a skipped entity.

             HTML entities will be skipped by the parser. This method will unescape them

@@ -315,7 +313,7 @@ class PatentUsptoIce(PatentUspto):
             self.text += unescaped

         @override
-        def endElement(self, tag):  # noqa: N802
+        def endElement(self, tag):
             """Signal the end of an element.

             Args:

@@ -603,7 +601,7 @@ class PatentUsptoGrantV2(PatentUspto):
             self.style_html = HtmlEntity()

         @override
-        def startElement(self, tag, attributes):  # noqa: N802
+        def startElement(self, tag, attributes):
             """Signal the start of an element.

             Args:

@@ -616,7 +614,7 @@ class PatentUsptoGrantV2(PatentUspto):
             self._start_registered_elements(tag, attributes)

         @override
-        def skippedEntity(self, name):  # noqa: N802
+        def skippedEntity(self, name):
             """Receive notification of a skipped entity.

             HTML entities will be skipped by the parser. This method will unescape them

@@ -650,7 +648,7 @@ class PatentUsptoGrantV2(PatentUspto):
             self.text += unescaped

         @override
-        def endElement(self, tag):  # noqa: N802
+        def endElement(self, tag):
             """Signal the end of an element.

             Args:

@@ -691,7 +689,7 @@ class PatentUsptoGrantV2(PatentUspto):
             if tag in [member.value for member in self.Element]:
                 if (
                     tag == self.Element.HEADING.value
-                    and not self.Element.SDOCL.value in self.property
+                    and self.Element.SDOCL.value not in self.property
                 ):
                     level_attr: str = attributes.get("LVL", "")
                     new_level: int = int(level_attr) if level_attr.isnumeric() else 1

@@ -743,7 +741,7 @@ class PatentUsptoGrantV2(PatentUspto):
                 # headers except claims statement
                 elif (
                     self.Element.HEADING.value in self.property
-                    and not self.Element.SDOCL.value in self.property
+                    and self.Element.SDOCL.value not in self.property
                     and text.strip()
                 ):
                     self.parents[self.level + 1] = self.doc.add_heading(

@@ -1164,7 +1162,7 @@ class PatentUsptoAppV1(PatentUspto):
             self.style_html = HtmlEntity()

         @override
-        def startElement(self, tag, attributes):  # noqa: N802
+        def startElement(self, tag, attributes):
             """Signal the start of an element.

             Args:

@@ -1177,7 +1175,7 @@ class PatentUsptoAppV1(PatentUspto):
             self._start_registered_elements(tag, attributes)

         @override
-        def skippedEntity(self, name):  # noqa: N802
+        def skippedEntity(self, name):
             """Receive notification of a skipped entity.

             HTML entities will be skipped by the parser. This method will unescape them

@@ -1211,7 +1209,7 @@ class PatentUsptoAppV1(PatentUspto):
             self.text += unescaped

         @override
-        def endElement(self, tag):  # noqa: N802
+        def endElement(self, tag):
             """Signal the end of an element.

             Args:

@@ -1474,9 +1472,7 @@ class XmlTable:
                 if cw == 0:
                     offset_w0.append(col["offset"][ic])

-            min_colinfo["offset"] = sorted(
-                list(set(col["offset"] + min_colinfo["offset"]))
-            )
+            min_colinfo["offset"] = sorted(set(col["offset"] + min_colinfo["offset"]))

         # add back the 0 width cols to offset list
         offset_w0 = list(set(offset_w0))

@@ -1527,7 +1523,7 @@ class XmlTable:

         return ncols_max

-    def _parse_table(self, table: Tag) -> TableData:
+    def _parse_table(self, table: Tag) -> TableData:  # noqa: C901
         """Parse the content of a table tag.

         Args:

@@ -1722,7 +1718,7 @@ class HtmlEntity:
         "0": "⁰",
         "+": "⁺",
         "-": "⁻",
-        "−": "⁻",
+        "−": "⁻",  # noqa: RUF001
         "=": "⁼",
         "(": "⁽",
         ")": "⁾",

@@ -1746,7 +1742,7 @@ class HtmlEntity:
         "0": "₀",
         "+": "₊",
         "-": "₋",
-        "−": "₋",
+        "−": "₋",  # noqa: RUF001
         "=": "₌",
         "(": "₍",
         ")": "₎",
@@ -6,14 +6,16 @@ import sys
 import tempfile
 import time
 import warnings
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Annotated, Dict, Iterable, List, Optional, Type
+from typing import Annotated, Dict, List, Optional, Type

 import rich.table
 import typer
 from docling_core.types.doc import ImageRefMode
 from docling_core.utils.file import resolve_source_to_path
 from pydantic import TypeAdapter
+from rich.console import Console

 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend

@@ -53,7 +55,6 @@ warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|
 warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")

 _log = logging.getLogger(__name__)
-from rich.console import Console

 console = Console()
 err_console = Console(stderr=True)

@@ -160,7 +161,6 @@ def export_documents(
     export_doctags: bool,
     image_export_mode: ImageRefMode,
 ):
-
     success_count = 0
     failure_count = 0

@@ -233,7 +233,7 @@ def _split_list(raw: Optional[str]) -> Optional[List[str]]:


 @app.command(no_args_is_help=True)
-def convert(
+def convert(  # noqa: C901
     input_sources: Annotated[
         List[str],
         typer.Argument(

@@ -289,7 +289,7 @@ def convert(
             ...,
             help=(
                 f"The OCR engine to use. When --allow-external-plugins is *not* set, the available values are: "
-                f"{', '.join((o.value for o in ocr_engines_enum_internal))}. "
+                f"{', '.join(o.value for o in ocr_engines_enum_internal)}. "
                 f"Use the option --show-external-plugins to see the options allowed with external plugins."
             ),
         ),

@@ -430,7 +430,7 @@ def convert(
     settings.debug.visualize_ocr = debug_visualize_ocr

     if from_formats is None:
-        from_formats = [e for e in InputFormat]
+        from_formats = list(InputFormat)

     parsed_headers: Optional[Dict[str, str]] = None
     if headers is not None:
@@ -62,7 +62,7 @@ def download(
     models: Annotated[
         Optional[list[_AvailableModels]],
         typer.Argument(
-            help=f"Models to download (default behavior: a predefined set of models will be downloaded).",
+            help="Models to download (default behavior: a predefined set of models will be downloaded).",
         ),
     ] = None,
     all: Annotated[

@@ -89,14 +89,13 @@ def download(
             "Cannot simultaneously set 'all' parameter and specify models to download."
         )
     if not quiet:
-        FORMAT = "%(message)s"
         logging.basicConfig(
             level=logging.INFO,
             format="[blue]%(message)s[/blue]",
             datefmt="[%X]",
             handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
         )
-    to_download = models or ([m for m in _AvailableModels] if all else _default_models)
+    to_download = models or (list(_AvailableModels) if all else _default_models)
     output_dir = download_models(
         output_dir=output_dir,
         force=force,
@@ -10,7 +10,9 @@ from docling_core.types.doc import (
     TableCell,
 )
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell
-from docling_core.types.io import (  # DO ΝΟΤ REMOVE; explicitly exposed from this location
+
+# DO NOT REMOVE; explicitly exposed from this location
+from docling_core.types.io import (
     DocumentStream,
 )
 from PIL.Image import Image

@@ -233,9 +235,9 @@ class Page(BaseModel):
         None  # Internal PDF backend. By default it is cleared during assembling.
     )
     _default_image_scale: float = 1.0  # Default image scale for external usage.
-    _image_cache: Dict[float, Image] = (
-        {}
-    )  # Cache of images in different scales. By default it is cleared during assembling.
+    _image_cache: Dict[
+        float, Image
+    ] = {}  # Cache of images in different scales. By default it is cleared during assembling.

     def get_image(
         self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None

@@ -243,7 +245,7 @@ class Page(BaseModel):
         if self._backend is None:
             return self._image_cache.get(scale, None)

-        if not scale in self._image_cache:
+        if scale not in self._image_cache:
             if cropbox is None:
                 self._image_cache[scale] = self._backend.get_page_image(scale=scale)
             else:
@@ -1,13 +1,13 @@
 import csv
 import logging
 import re
+from collections.abc import Iterable
 from enum import Enum
 from io import BytesIO
 from pathlib import Path, PurePath
 from typing import (
     TYPE_CHECKING,
     Dict,
-    Iterable,
     List,
     Literal,
     Optional,
@@ -17,6 +17,8 @@ from typing import (
 )
 
 import filetype
+
+# DO NOT REMOVE; explicitly exposed from this location
 from docling_core.types.doc import (
     DocItem,
     DocItemLabel,
@@ -35,14 +37,14 @@ from docling_core.types.legacy_doc.base import (
     PageReference,
     Prov,
     Ref,
+    Table as DsSchemaTable,
+    TableCell,
 )
-from docling_core.types.legacy_doc.base import Table as DsSchemaTable
-from docling_core.types.legacy_doc.base import TableCell
 from docling_core.types.legacy_doc.document import (
     CCSDocumentDescription as DsDocumentDescription,
+    CCSFileInfoObject as DsFileInfoObject,
+    ExportedCCSDocument as DsDocument,
 )
-from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
-from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
 from docling_core.utils.file import resolve_source_to_stream
 from docling_core.utils.legacy import docling_document_to_legacy
 from pydantic import BaseModel
@@ -65,7 +67,7 @@ from docling.datamodel.base_models import (
 )
 from docling.datamodel.settings import DocumentLimits
 from docling.utils.profiling import ProfilingItem
-from docling.utils.utils import create_file_hash, create_hash
+from docling.utils.utils import create_file_hash
 
 if TYPE_CHECKING:
     from docling.document_converter import FormatOption
@@ -134,9 +136,9 @@ class InputDocument(BaseModel):
                 self._init_doc(backend, path_or_stream)
 
             elif isinstance(path_or_stream, BytesIO):
-                assert (
-                    filename is not None
-                ), "Can't construct InputDocument from stream without providing filename arg."
+                assert filename is not None, (
+                    "Can't construct InputDocument from stream without providing filename arg."
+                )
                 self.file = PurePath(filename)
                 self.filesize = path_or_stream.getbuffer().nbytes
 
@@ -228,7 +230,6 @@ class _DummyBackend(AbstractDocumentBackend):
 
 
 class _DocumentConversionInput(BaseModel):
-
     path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
     headers: Optional[Dict[str, str]] = None
     limits: Optional[DocumentLimits] = DocumentLimits()
@@ -380,7 +380,6 @@ class PaginatedPipelineOptions(PipelineOptions):
 
 
 class VlmPipelineOptions(PaginatedPipelineOptions):
-
    generate_page_images: bool = True
    force_backend_text: bool = (
        False  # (To be used with vlms, or other generative models)
@@ -1,11 +1,11 @@
 import hashlib
 import logging
-import math
 import sys
 import time
+from collections.abc import Iterable, Iterator
 from functools import partial
 from pathlib import Path
-from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union
+from typing import Dict, List, Optional, Tuple, Type, Union
 
 from pydantic import BaseModel, ConfigDict, model_validator, validate_call
 
@@ -172,7 +172,7 @@ class DocumentConverter:
         format_options: Optional[Dict[InputFormat, FormatOption]] = None,
     ):
         self.allowed_formats = (
-            allowed_formats if allowed_formats is not None else [e for e in InputFormat]
+            allowed_formats if allowed_formats is not None else list(InputFormat)
         )
         self.format_to_options = {
             format: (
@@ -254,7 +254,7 @@ class DocumentConverter:
 
         if not had_result and raises_on_error:
             raise ConversionError(
-                f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
+                "Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
             )
 
     def _convert(
@@ -266,7 +266,7 @@ class DocumentConverter:
             conv_input.docs(self.format_to_options),
             settings.perf.doc_batch_size,  # pass format_options
         ):
-            _log.info(f"Going to convert document batch...")
+            _log.info("Going to convert document batch...")
 
             # parallel processing only within input_batch
             # with ThreadPoolExecutor(
@@ -1,4 +1,4 @@
-from typing import Iterable
+from collections.abc import Iterable
 
 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
@@ -10,7 +10,6 @@ from docling.utils.profiling import TimeRecorder
 
 
 class ApiVlmModel(BasePageModel):
-
     def __init__(
         self,
         enabled: bool,
@@ -1,5 +1,6 @@
 from abc import ABC, abstractmethod
-from typing import Any, Generic, Iterable, Optional, Protocol, Type
+from collections.abc import Iterable
+from typing import Generic, Optional, Protocol, Type
 
 from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
 from typing_extensions import TypeVar
@@ -29,7 +30,6 @@ EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)
 
 
 class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
-
     elements_batch_size: int = settings.perf.elements_batch_size
 
     @abstractmethod
@@ -50,7 +50,6 @@ class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
 
 
 class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]):
-
     def prepare_element(
         self, conv_res: ConversionResult, element: NodeItem
     ) -> Optional[NodeItem]:
@@ -62,7 +61,6 @@ class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]):
 class BaseItemAndImageEnrichmentModel(
     GenericEnrichmentModel[ItemAndImageEnrichmentElement]
 ):
-
     images_scale: float
     expansion_factor: float = 0.0
 
@@ -1,12 +1,12 @@
 import copy
 import logging
 from abc import abstractmethod
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, List, Optional, Type
+from typing import List, Optional, Type
 
 import numpy as np
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_core.types.doc.page import BoundingRectangle, PdfTextCell, TextCell
 from PIL import Image, ImageDraw
 from rtree import index
 from scipy.ndimage import binary_dilation, find_objects, label
@@ -1,7 +1,8 @@
 import re
 from collections import Counter
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, List, Literal, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
 
 import numpy as np
 from docling_core.types.doc import (
@@ -1,5 +1,6 @@
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, List, Literal, Optional, Tuple, Union
+from typing import List, Literal, Optional, Union
 
 import numpy as np
 from docling_core.types.doc import (
@@ -1,8 +1,9 @@
 import logging
 import warnings
 import zipfile
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, List, Optional, Type
+from typing import List, Optional, Type
 
 import numpy
 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -58,12 +59,10 @@ class EasyOcrModel(BaseOcrModel):
                 device = decide_device(accelerator_options.device)
                 # Enable easyocr GPU if running on CUDA, MPS
                 use_gpu = any(
-                    [
-                        device.startswith(x)
-                        for x in [
-                            AcceleratorDevice.CUDA.value,
-                            AcceleratorDevice.MPS.value,
-                        ]
+                    device.startswith(x)
+                    for x in [
+                        AcceleratorDevice.CUDA.value,
+                        AcceleratorDevice.MPS.value,
                     ]
                 )
             else:
@@ -98,8 +97,10 @@ class EasyOcrModel(BaseOcrModel):
         progress: bool = False,
     ) -> Path:
         # Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
-        from easyocr.config import detection_models as det_models_dict
-        from easyocr.config import recognition_models as rec_models_dict
+        from easyocr.config import (
+            detection_models as det_models_dict,
+            recognition_models as rec_models_dict,
+        )
 
         if local_dir is None:
             local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder
@@ -126,13 +127,11 @@ class EasyOcrModel(BaseOcrModel):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
-
         if not self.enabled:
             yield from page_batch
             return
 
         for page in page_batch:
-
             assert page._backend is not None
             if not page._backend.is_valid():
                 yield page
@@ -9,7 +9,7 @@ from docling.models.factories.picture_description_factory import (
 logger = logging.getLogger(__name__)
 
 
-@lru_cache()
+@lru_cache
 def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
     factory = OcrFactory()
     factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
@@ -17,7 +17,7 @@ def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
     return factory
 
 
-@lru_cache()
+@lru_cache
 def get_picture_description_factory(
     allow_external_plugins: bool = False,
 ) -> PictureDescriptionFactory:
@@ -33,7 +33,7 @@ class BaseFactory(Generic[A], metaclass=ABCMeta):
 
     @property
     def registered_kind(self) -> list[str]:
-        return list(opt.kind for opt in self._classes.keys())
+        return [opt.kind for opt in self._classes.keys()]
 
     def get_enum(self) -> enum.Enum:
         return enum.Enum(
@@ -1,25 +1,22 @@
 import logging
 import time
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, List, Optional
+from typing import Optional
 
 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
-    AcceleratorDevice,
     AcceleratorOptions,
     HuggingFaceVlmOptions,
 )
-from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
-from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
 
 _log = logging.getLogger(__name__)
 
 
 class HuggingFaceMlxModel(BasePageModel):
-
     def __init__(
         self,
         enabled: bool,
@@ -32,7 +29,6 @@ class HuggingFaceMlxModel(BasePageModel):
         self.vlm_options = vlm_options
 
         if self.enabled:
-
             try:
                 from mlx_vlm import generate, load  # type: ignore
                 from mlx_vlm.prompt_utils import apply_chat_template  # type: ignore
@@ -125,6 +121,8 @@ class HuggingFaceMlxModel(BasePageModel):
                     generation_time = time.time() - start_time
                     page_tags = output
 
+                    _log.debug(f"Generation time {generation_time:.2f} seconds.")
+
                     # inference_time = time.time() - start_time
                     # tokens_per_second = num_tokens / generation_time
                     # print("")
@@ -1,16 +1,15 @@
 import logging
 import time
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, List, Optional
+from typing import Optional
 
 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
-    AcceleratorDevice,
     AcceleratorOptions,
     HuggingFaceVlmOptions,
 )
-from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
@@ -19,7 +18,6 @@ _log = logging.getLogger(__name__)
 
 
 class HuggingFaceVlmModel(BasePageModel):
-
     def __init__(
         self,
         enabled: bool,
@@ -42,7 +40,7 @@ class HuggingFaceVlmModel(BasePageModel):
         device = decide_device(accelerator_options.device)
         self.device = device
 
-        _log.debug("Available device for HuggingFace VLM: {}".format(device))
+        _log.debug(f"Available device for HuggingFace VLM: {device}")
 
         repo_cache_folder = vlm_options.repo_id.replace("/", "--")
 
@@ -168,6 +166,10 @@ class HuggingFaceVlmModel(BasePageModel):
                     num_tokens = len(generated_ids[0])
                     page_tags = generated_texts
 
+                    _log.debug(
+                        f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
+                    )
+
                    # inference_time = time.time() - start_time
                    # tokens_per_second = num_tokens / generation_time
                    # print("")
@@ -1,8 +1,9 @@
 import copy
 import logging
 import warnings
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Union
+from typing import Optional
 
 from docling_core.types.doc import DocItemLabel
 from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
@@ -142,7 +143,6 @@ class LayoutModel(BasePageModel):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
-
         for page in page_batch:
             assert page._backend is not None
             if not page._backend.is_valid():
@@ -1,8 +1,9 @@
 import logging
 import sys
 import tempfile
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Tuple, Type
+from typing import Optional, Type
 
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import BoundingRectangle, TextCell
@@ -41,7 +42,7 @@ class OcrMacModel(BaseOcrModel):
 
         if self.enabled:
             if "darwin" != sys.platform:
-                raise RuntimeError(f"OcrMac is only supported on Mac.")
+                raise RuntimeError("OcrMac is only supported on Mac.")
             install_errmsg = (
                 "ocrmac is not correctly installed. "
                 "Please install it via `pip install ocrmac` to use this OCR engine. "
@@ -58,7 +59,6 @@ class OcrMacModel(BaseOcrModel):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
-
        if not self.enabled:
            yield from page_batch
            return
@@ -69,7 +69,6 @@ class OcrMacModel(BaseOcrModel):
                 yield page
             else:
                 with TimeRecorder(conv_res, "ocr"):
-
                     ocr_rects = self.get_ocr_rects(page)
 
                     all_ocr_cells = []
@@ -1,6 +1,7 @@
 import logging
 import re
-from typing import Iterable, List
+from collections.abc import Iterable
+from typing import List
 
 from pydantic import BaseModel
 
@@ -53,9 +54,9 @@ class PageAssembleModel(BasePageModel):
         sanitized_text = "".join(lines)
 
         # Text normalization
-        sanitized_text = sanitized_text.replace("⁄", "/")
-        sanitized_text = sanitized_text.replace("’", "'")
-        sanitized_text = sanitized_text.replace("‘", "'")
+        sanitized_text = sanitized_text.replace("⁄", "/")  # noqa: RUF001
+        sanitized_text = sanitized_text.replace("’", "'")  # noqa: RUF001
+        sanitized_text = sanitized_text.replace("‘", "'")  # noqa: RUF001
         sanitized_text = sanitized_text.replace("“", '"')
         sanitized_text = sanitized_text.replace("”", '"')
         sanitized_text = sanitized_text.replace("•", "·")
@@ -71,7 +72,6 @@ class PageAssembleModel(BasePageModel):
                 yield page
             else:
                 with TimeRecorder(conv_res, "page_assemble"):
-
                     assert page.predictions.layout is not None
 
                     # assembles some JSON output page by page.
@@ -83,7 +83,6 @@ class PageAssembleModel(BasePageModel):
                     for cluster in page.predictions.layout.clusters:
                         # _log.info("Cluster label seen:", cluster.label)
                         if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
-
                             textlines = [
                                 cell.text.replace("\x02", "-").strip()
                                 for cell in cluster.cells
@@ -109,9 +108,7 @@ class PageAssembleModel(BasePageModel):
                             tbl = page.predictions.tablestructure.table_map.get(
                                 cluster.id, None
                             )
-                            if (
-                                not tbl
-                            ):  # fallback: add table without structure, if it isn't present
+                            if not tbl:  # fallback: add table without structure, if it isn't present
                                 tbl = Table(
                                     label=cluster.label,
                                     id=cluster.id,
@@ -130,9 +127,7 @@ class PageAssembleModel(BasePageModel):
                             fig = page.predictions.figures_classification.figure_map.get(
                                 cluster.id, None
                             )
-                            if (
-                                not fig
-                            ):  # fallback: add figure without classification, if it isn't present
+                            if not fig:  # fallback: add figure without classification, if it isn't present
                                 fig = FigureElement(
                                     label=cluster.label,
                                     id=cluster.id,
@@ -1,5 +1,6 @@
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional
+from typing import Optional
 
 from PIL import ImageDraw
 from pydantic import BaseModel
@@ -1,5 +1,6 @@
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Type, Union
+from typing import Optional, Type, Union
 
 from PIL import Image
 
@@ -1,12 +1,11 @@
-import logging
 from abc import abstractmethod
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Any, Iterable, List, Optional, Type, Union
+from typing import List, Optional, Type, Union
 
 from docling_core.types.doc import (
     DoclingDocument,
     NodeItem,
-    PictureClassificationClass,
     PictureItem,
 )
 from docling_core.types.doc.document import (  # TODO: move import to docling_core.types.doc
@@ -1,5 +1,6 @@
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Type, Union
+from typing import Optional, Type, Union
 
 from PIL import Image
 
@@ -13,7 +14,6 @@ from docling.utils.accelerator_utils import decide_device
 
 
 class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
-
     @classmethod
     def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
         return PictureDescriptionVlmOptions
@@ -36,7 +36,6 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
         self.options: PictureDescriptionVlmOptions
 
         if self.enabled:
-
             if artifacts_path is None:
                 artifacts_path = self.download_models(repo_id=self.options.repo_id)
             else:
@@ -1,6 +1,7 @@
 import logging
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Type
+from typing import Optional, Type
 
 import numpy
 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -74,13 +75,11 @@ class RapidOcrModel(BaseOcrModel):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
-
        if not self.enabled:
            yield from page_batch
            return
 
        for page in page_batch:
-
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
@@ -1,12 +1,7 @@
-import copy
-import random
 from pathlib import Path
 from typing import Dict, List
 
 from docling_core.types.doc import (
-    BoundingBox,
-    CoordOrigin,
-    DocItem,
     DocItemLabel,
     DoclingDocument,
     DocumentOrigin,
@@ -17,13 +12,10 @@ from docling_core.types.doc import (
     TableData,
 )
 from docling_core.types.doc.document import ContentLayer
-from docling_core.types.legacy_doc.base import Ref
-from docling_core.types.legacy_doc.document import BaseText
 from docling_ibm_models.reading_order.reading_order_rb import (
     PageElement as ReadingOrderPageElement,
+    ReadingOrderPredictor,
 )
-from docling_ibm_models.reading_order.reading_order_rb import ReadingOrderPredictor
-from PIL import ImageDraw
 from pydantic import BaseModel, ConfigDict
 
 from docling.datamodel.base_models import (
@@ -35,7 +27,6 @@ from docling.datamodel.base_models import (
     TextElement,
 )
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.settings import settings
 from docling.utils.profiling import ProfilingScope, TimeRecorder
 
 
@@ -53,12 +44,10 @@ class ReadingOrderModel:
     def _assembled_to_readingorder_elements(
         self, conv_res: ConversionResult
     ) -> List[ReadingOrderPageElement]:
-
         elements: List[ReadingOrderPageElement] = []
         page_no_to_pages = {p.page_no: p for p in conv_res.pages}
 
         for element in conv_res.assembled.elements:
-
             page_height = page_no_to_pages[element.page_no].size.height  # type: ignore
             bbox = element.cluster.bbox.to_bottom_left_origin(page_height)
             text = element.text or ""
@@ -84,7 +73,6 @@ class ReadingOrderModel:
     def _add_child_elements(
         self, element: BasePageElement, doc_item: NodeItem, doc: DoclingDocument
     ):
-
         child: Cluster
         for child in element.cluster.children:
             c_label = child.label
@@ -110,7 +98,7 @@ class ReadingOrderModel:
             else:
                 doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)
 
-    def _readingorder_elements_to_docling_doc(
+    def _readingorder_elements_to_docling_doc(  # noqa: C901
         self,
         conv_res: ConversionResult,
         ro_elements: List[ReadingOrderPageElement],
@@ -118,7 +106,6 @@ class ReadingOrderModel:
         el_to_footnotes_mapping: Dict[int, List[int]],
         el_merges_mapping: Dict[int, List[int]],
     ) -> DoclingDocument:
-
         id_to_elem = {
             RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem
             for elem in conv_res.assembled.elements
@@ -192,7 +179,6 @@ class ReadingOrderModel:
 
                         code_item.footnotes.append(new_footnote_item.get_ref())
                 else:
-
                     new_item, current_list = self._handle_text_element(
                         element, out_doc, current_list, page_height
                     )
@@ -206,7 +192,6 @@ class ReadingOrderModel:
                     )
 
             elif isinstance(element, Table):
-
                 tbl_data = TableData(
                     num_rows=element.num_rows,
                     num_cols=element.num_cols,
@@ -342,12 +327,12 @@ class ReadingOrderModel:
         return new_item, current_list
 
     def _merge_elements(self, element, merged_elem, new_item, page_height):
-        assert isinstance(
-            merged_elem, type(element)
-        ), "Merged element must be of same type as element."
-        assert (
-            merged_elem.label == new_item.label
-        ), "Labels of merged elements must match."
+        assert isinstance(merged_elem, type(element)), (
+            "Merged element must be of same type as element."
+        )
+        assert merged_elem.label == new_item.label, (
+            "Labels of merged elements must match."
+        )
         prov = ProvenanceItem(
             page_no=element.page_no + 1,
             charspan=(
@@ -1,13 +1,13 @@
 import copy
 import warnings
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Union
+from typing import Optional
 
 import numpy
 from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
 from docling_core.types.doc.page import (
     BoundingRectangle,
-    SegmentedPdfPage,
     TextCellUnit,
 )
 from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
@@ -44,7 +44,6 @@ class TableStructureModel(BasePageModel):
 
         self.enabled = enabled
         if self.enabled:
-
             if artifacts_path is None:
                 artifacts_path = self.download_models() / self._model_path
             else:
@@ -175,7 +174,6 @@ class TableStructureModel(BasePageModel):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
-
        if not self.enabled:
            yield from page_batch
            return
@@ -186,7 +184,6 @@ class TableStructureModel(BasePageModel):
                 yield page
             else:
                 with TimeRecorder(conv_res, "table_structure"):
-
                     assert page.predictions.layout is not None
                     assert page.size is not None
 
@@ -260,7 +257,6 @@ class TableStructureModel(BasePageModel):
                     table_out = tf_output[0]
                     table_cells = []
                     for element in table_out["tf_responses"]:
-
                         if not self.do_cell_matching:
                             the_bbox = BoundingBox.model_validate(
                                 element["bbox"]
@@ -3,9 +3,10 @@ import io
 import logging
 import os
 import tempfile
+from collections.abc import Iterable
 from pathlib import Path
 from subprocess import DEVNULL, PIPE, Popen
-from typing import Iterable, List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type
 
 import pandas as pd
 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -63,8 +64,7 @@ class TesseractOcrCliModel(BaseOcrModel):
         )
 
     def _get_name_and_version(self) -> Tuple[str, str]:
-
-        if self._name != None and self._version != None:
+        if self._name is not None and self._version is not None:
             return self._name, self._version  # type: ignore
 
         cmd = [self.options.tesseract_cmd, "--version"]
@@ -125,14 +125,16 @@ class TesseractOcrCliModel(BaseOcrModel):
         # _log.info(decoded_data)
 
         # Read the TSV file generated by Tesseract
-        df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t")
+        df_result = pd.read_csv(
+            io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t"
+        )
 
         # Display the dataframe (optional)
         # _log.info("df: ", df.head())
 
         # Filter rows that contain actual text (ignore header or empty rows)
-        df_filtered = df[
-            df["text"].notnull() & (df["text"].apply(str).str.strip() != "")
+        df_filtered = df_result[
+            df_result["text"].notna() & (df_result["text"].apply(str).str.strip() != "")
         ]
 
         return df_filtered
@@ -149,10 +151,10 @@ class TesseractOcrCliModel(BaseOcrModel):
         proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
         output, _ = proc.communicate()
         decoded_data = output.decode("utf-8")
-        df = pd.read_csv(
+        df_detected = pd.read_csv(
             io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
         )
-        scripts = df.loc[df["key"] == "Script"].value.tolist()
+        scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
         if len(scripts) == 0:
             _log.warning("Tesseract cannot detect the script of the page")
             return None
@@ -183,11 +185,11 @@ class TesseractOcrCliModel(BaseOcrModel):
         proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
         output, _ = proc.communicate()
         decoded_data = output.decode("utf-8")
-        df = pd.read_csv(io.StringIO(decoded_data), header=None)
-        self._tesseract_languages = df[0].tolist()[1:]
+        df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
+        self._tesseract_languages = df_list[0].tolist()[1:]
 
         # Decide the script prefix
-        if any([l.startswith("script/") for l in self._tesseract_languages]):
+        if any(lang.startswith("script/") for lang in self._tesseract_languages):
             script_prefix = "script/"
         else:
             script_prefix = ""
@@ -197,7 +199,6 @@ class TesseractOcrCliModel(BaseOcrModel):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
-
        if not self.enabled:
            yield from page_batch
            return
@@ -225,19 +226,19 @@ class TesseractOcrCliModel(BaseOcrModel):
                             fname = image_file.name
                             high_res_image.save(image_file)
 
-                        df = self._run_tesseract(fname)
+                        df_result = self._run_tesseract(fname)
                     finally:
                         if os.path.exists(fname):
                             os.remove(fname)
 
-                    # _log.info(df)
+                    # _log.info(df_result)
 
                     # Print relevant columns (bounding box and text)
-                    for ix, row in df.iterrows():
+                    for ix, row in df_result.iterrows():
                         text = row["text"]
                         conf = row["conf"]
 
-                        l = float(row["left"])
+                        l = float(row["left"])  # noqa: E741
                         b = float(row["top"])
                         w = float(row["width"])
                         h = float(row["height"])
@@ -1,6 +1,7 @@
 import logging
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Type
+from typing import Optional, Type
 
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import BoundingRectangle, TextCell
@@ -37,9 +38,6 @@ class TesseractOcrModel(BaseOcrModel):
         self.options: TesseractOcrOptions
 
         self.scale = 3  # multiplier for 72 dpi == 216 dpi.
-        self.reader = None
-        self.osd_reader = None
-        self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
 
         if self.enabled:
             install_errmsg = (
@@ -64,7 +62,7 @@ class TesseractOcrModel(BaseOcrModel):
                 raise ImportError(install_errmsg)
             try:
                 tesseract_version = tesserocr.tesseract_version()
-            except:
+            except Exception:
                 raise ImportError(install_errmsg)
 
             _, self._tesserocr_languages = tesserocr.get_languages()
@@ -75,7 +73,7 @@ class TesseractOcrModel(BaseOcrModel):
             _log.debug("Initializing TesserOCR: %s", tesseract_version)
             lang = "+".join(self.options.lang)
 
-            if any([l.startswith("script/") for l in self._tesserocr_languages]):
+            if any(lang.startswith("script/") for lang in self._tesserocr_languages):
                 self.script_prefix = "script/"
             else:
                 self.script_prefix = ""
@@ -86,6 +84,10 @@ class TesseractOcrModel(BaseOcrModel):
                 "oem": tesserocr.OEM.DEFAULT,
             }
 
+            self.reader = None
+            self.osd_reader = None
+            self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
+
             if self.options.path is not None:
                 tesserocr_kwargs["path"] = self.options.path
 
@@ -3,9 +3,10 @@ import logging
 import time
 import traceback
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Iterable, List
+from collections.abc import Iterable
+from typing import Any, Callable, List
 
-from docling_core.types.doc import DoclingDocument, NodeItem
+from docling_core.types.doc import NodeItem
 
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
@@ -64,7 +65,6 @@ class BasePipeline(ABC):
         return conv_res
 
     def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
-
         def _prepare_elements(
             conv_res: ConversionResult, model: GenericEnrichmentModel[Any]
         ) -> Iterable[NodeItem]:
@@ -113,7 +113,6 @@ class BasePipeline(ABC):
 
 
 class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
-
     def __init__(self, pipeline_options: PipelineOptions):
         super().__init__(pipeline_options)
         self.keep_backend = False
@@ -127,7 +126,6 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
         yield from page_batch
 
     def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
-
         if not isinstance(conv_res.input._backend, PdfDocumentBackend):
             raise RuntimeError(
                 f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. "
@@ -139,8 +137,7 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
 
         total_elapsed_time = 0.0
         with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
-
-            for i in range(0, conv_res.input.page_count):
+            for i in range(conv_res.input.page_count):
                 start_page, end_page = conv_res.input.limits.page_range
                 if (start_page - 1) <= i <= (end_page - 1):
                     conv_res.pages.append(Page(page_no=i))
@@ -161,7 +158,6 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
                 pipeline_pages = self._apply_on_pages(conv_res, init_pages)
 
                 for p in pipeline_pages:  # Must exhaust!
-
                     # Cleanup cached images
                     if not self.keep_images:
                         p._image_cache = {}
@@ -24,7 +24,6 @@ class SimplePipeline(BasePipeline):
         super().__init__(pipeline_options)
 
     def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
-
         if not isinstance(conv_res.input._backend, DeclarativeDocumentBackend):
             raise RuntimeError(
                 f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a declarative backend. "
@@ -1,5 +1,4 @@
 import logging
-import sys
 import warnings
 from pathlib import Path
 from typing import Optional, cast
@@ -1,5 +1,4 @@
 import logging
-import warnings
 from io import BytesIO
 from pathlib import Path
 from typing import List, Optional, Union, cast
@@ -32,7 +31,6 @@ _log = logging.getLogger(__name__)
 
 
 class VlmPipeline(PaginatedPipeline):
-
     def __init__(self, pipeline_options: VlmPipelineOptions):
         super().__init__(pipeline_options)
         self.keep_backend = True
@@ -114,7 +112,6 @@ class VlmPipeline(PaginatedPipeline):
 
     def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
         with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
-
             if (
                 self.pipeline_options.vlm_options.response_format
                 == ResponseFormat.DOCTAGS
@@ -1,8 +1,8 @@
 import logging
-from typing import Any, Dict, Iterable, List, Tuple, Union
+from collections.abc import Iterable
+from typing import Any, Dict, List, Tuple, Union
 
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_core.types.doc.page import TextCell
 from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
 
 from docling.datamodel.document import ConversionResult, Page
@@ -13,7 +13,6 @@ _log = logging.getLogger(__name__)
 def generate_multimodal_pages(
     doc_result: ConversionResult,
 ) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
-
     label_to_doclaynet = {
         "title": "title",
         "table-of-contents": "document_index",
@@ -122,7 +121,6 @@ def generate_multimodal_pages(
     if doc.main_text is None:
         return
     for ix, orig_item in enumerate(doc.main_text):
-
         item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item
         if item is None or item.prov is None or len(item.prov) == 0:
             _log.debug(f"Skipping item {orig_item}")
@@ -29,7 +29,7 @@ def resolve_item(paths, obj):
 
     try:
         key = int(paths[0])
-    except:
+    except Exception:
        key = paths[0]
 
    if len(paths) == 1:
@@ -67,7 +67,7 @@ def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:
     return unique_objects
 
 
-def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
+def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:  # noqa: C901
     origin = DocumentOrigin(
         mimetype="application/pdf",
         filename=doc_glm["file-info"]["filename"],
@@ -18,7 +18,7 @@ class UnionFind:
 
     def __init__(self, elements):
         self.parent = {elem: elem for elem in elements}
-        self.rank = {elem: 0 for elem in elements}
+        self.rank = dict.fromkeys(elements, 0)
 
     def find(self, x):
         if self.parent[x] != x:
@@ -484,7 +484,9 @@ class LayoutPostprocessor:
         spatial_index = (
             self.regular_index
             if cluster_type == "regular"
-            else self.picture_index if cluster_type == "picture" else self.wrapper_index
+            else self.picture_index
+            if cluster_type == "picture"
+            else self.wrapper_index
        )
 
        # Map of currently valid clusters
@@ -37,7 +37,7 @@ def download_models(
     output_dir.mkdir(exist_ok=True, parents=True)
 
     if with_layout:
-        _log.info(f"Downloading layout model...")
+        _log.info("Downloading layout model...")
         LayoutModel.download_models(
             local_dir=output_dir / LayoutModel._model_repo_folder,
             force=force,
@@ -45,7 +45,7 @@ def download_models(
         )
 
     if with_tableformer:
-        _log.info(f"Downloading tableformer model...")
+        _log.info("Downloading tableformer model...")
         TableStructureModel.download_models(
             local_dir=output_dir / TableStructureModel._model_repo_folder,
             force=force,
@@ -53,7 +53,7 @@ def download_models(
         )
 
     if with_picture_classifier:
-        _log.info(f"Downloading picture classifier model...")
+        _log.info("Downloading picture classifier model...")
         DocumentPictureClassifier.download_models(
             local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
             force=force,
@@ -61,7 +61,7 @@ def download_models(
|
|||||||
)
|
)
|
||||||
|
|
||||||
if with_code_formula:
|
if with_code_formula:
|
||||||
_log.info(f"Downloading code formula model...")
|
_log.info("Downloading code formula model...")
|
||||||
CodeFormulaModel.download_models(
|
CodeFormulaModel.download_models(
|
||||||
local_dir=output_dir / CodeFormulaModel._model_repo_folder,
|
local_dir=output_dir / CodeFormulaModel._model_repo_folder,
|
||||||
force=force,
|
force=force,
|
||||||
@ -69,7 +69,7 @@ def download_models(
|
|||||||
)
|
)
|
||||||
|
|
||||||
if with_smolvlm:
|
if with_smolvlm:
|
||||||
_log.info(f"Downloading SmolVlm model...")
|
_log.info("Downloading SmolVlm model...")
|
||||||
PictureDescriptionVlmModel.download_models(
|
PictureDescriptionVlmModel.download_models(
|
||||||
repo_id=smolvlm_picture_description.repo_id,
|
repo_id=smolvlm_picture_description.repo_id,
|
||||||
local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
|
local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
|
||||||
@ -78,7 +78,7 @@ def download_models(
|
|||||||
)
|
)
|
||||||
|
|
||||||
if with_granite_vision:
|
if with_granite_vision:
|
||||||
_log.info(f"Downloading Granite Vision model...")
|
_log.info("Downloading Granite Vision model...")
|
||||||
PictureDescriptionVlmModel.download_models(
|
PictureDescriptionVlmModel.download_models(
|
||||||
repo_id=granite_picture_description.repo_id,
|
repo_id=granite_picture_description.repo_id,
|
||||||
local_dir=output_dir / granite_picture_description.repo_cache_folder,
|
local_dir=output_dir / granite_picture_description.repo_cache_folder,
|
||||||
@ -87,7 +87,7 @@ def download_models(
|
|||||||
)
|
)
|
||||||
|
|
||||||
if with_easyocr:
|
if with_easyocr:
|
||||||
_log.info(f"Downloading easyocr models...")
|
_log.info("Downloading easyocr models...")
|
||||||
EasyOcrModel.download_models(
|
EasyOcrModel.download_models(
|
||||||
local_dir=output_dir / EasyOcrModel._model_repo_folder,
|
local_dir=output_dir / EasyOcrModel._model_repo_folder,
|
||||||
force=force,
|
force=force,
|
||||||
|
@@ -13,7 +13,7 @@ def chunkify(iterator, chunk_size):
     if isinstance(iterator, List):
         iterator = iter(iterator)
     for first in iterator:  # Take the first element from the iterator
-        yield [first] + list(islice(iterator, chunk_size - 1))
+        yield [first, *list(islice(iterator, chunk_size - 1))]


 def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:

@@ -383,7 +383,7 @@
 "\n",
 "print(f\"Downloading {url}...\")\n",
 "buf = BytesIO(requests.get(url).content)\n",
-"print(f\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n",
+"print(\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n",
 "with zipfile.ZipFile(buf) as zf:\n",
 "    res = zf.testzip()\n",
 "    if res:\n",

@@ -544,7 +544,7 @@
 "source": [
 "doc = backend.convert()\n",
 "\n",
-"claims_sec = [item for item in doc.texts if item.text == \"CLAIMS\"][0]\n",
+"claims_sec = next(item for item in doc.texts if item.text == \"CLAIMS\")\n",
 "print(f'Patent \"{doc.texts[0].text}\" has {len(claims_sec.children)} claims')"
 ]
 },

@@ -1,8 +1,8 @@
 import json
 import logging
 import time
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable

 import yaml
 from docling_core.types.doc import ImageRefMode

@@ -11,7 +11,6 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
 from docling.datamodel.base_models import ConversionStatus, InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import PdfPipelineOptions
-from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, PdfFormatOption

 _log = logging.getLogger(__name__)

@@ -3,7 +3,6 @@ import logging
 import time
 from pathlib import Path

-from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
     AcceleratorDevice,

@@ -11,9 +10,6 @@ from docling.datamodel.pipeline_options import (
     PdfPipelineOptions,
 )
 from docling.document_converter import DocumentConverter, PdfFormatOption
-from docling.models.ocr_mac_model import OcrMacOptions
-from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
-from docling.models.tesseract_ocr_model import TesseractOcrOptions

 _log = logging.getLogger(__name__)

@@ -3,8 +3,8 @@
 # It does not run the actual formula understanding model.

 import logging
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable

 from docling_core.types.doc import DocItemLabel, DoclingDocument, NodeItem, TextItem


@@ -49,7 +49,6 @@ class ExampleFormulaUnderstandingEnrichmentModel(BaseItemAndImageEnrichmentModel

 # How the pipeline can be extended.
 class ExampleFormulaUnderstandingPipeline(StandardPdfPipeline):
-
     def __init__(self, pipeline_options: ExampleFormulaUnderstandingPipelineOptions):
         super().__init__(pipeline_options)
         self.pipeline_options: ExampleFormulaUnderstandingPipelineOptions

@@ -85,7 +84,7 @@ def main():
             )
         }
     )
-    result = doc_converter.convert(input_doc_path)
+    doc_converter.convert(input_doc_path)


 if __name__ == "__main__":
@@ -3,8 +3,9 @@
 # It does not run the actual picture classifier model.

 import logging
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Any, Iterable
+from typing import Any

 from docling_core.types.doc import (
     DoclingDocument,

@@ -4,7 +4,7 @@ from pathlib import Path

 from docling_core.types.doc import ImageRefMode, PictureItem, TableItem

-from docling.datamodel.base_models import FigureElement, InputFormat, Table
+from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption


@@ -51,7 +51,6 @@ def main():
         page_segments,
         page,
     ) in generate_multimodal_pages(conv_res):
-
         dpi = page._default_image_scale * 72

         rows.append(

@@ -81,10 +80,10 @@ def main():
         )

     # Generate one parquet from all documents
-    df = pd.json_normalize(rows)
+    df_result = pd.json_normalize(rows)
     now = datetime.datetime.now()
     output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
-    df.to_parquet(output_filename)
+    df_result.to_parquet(output_filename)

     end_time = time.time() - start_time


@@ -32,12 +32,12 @@ def main():
         print(table_df.to_markdown())

         # Save the table as csv
-        element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
+        element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.csv"
         _log.info(f"Saving CSV table to {element_csv_filename}")
         table_df.to_csv(element_csv_filename)

         # Save the table as html
-        element_html_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.html"
+        element_html_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.html"
         _log.info(f"Saving HTML table to {element_html_filename}")
         with element_html_filename.open("w") as fp:
             fp.write(table.export_to_html(doc=conv_res.document))

@@ -1,14 +1,9 @@
 from pathlib import Path

-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
-    EasyOcrOptions,
-    OcrMacOptions,
     PdfPipelineOptions,
-    RapidOcrOptions,
     TesseractCliOcrOptions,
-    TesseractOcrOptions,
 )
 from docling.document_converter import DocumentConverter, PdfFormatOption


@@ -153,10 +153,10 @@
 "source": [
 "for i, chunk in enumerate(chunk_iter):\n",
 "    print(f\"=== {i} ===\")\n",
-"    print(f\"chunk.text:\\n{repr(f'{chunk.text[:300]}…')}\")\n",
+"    print(f\"chunk.text:\\n{f'{chunk.text[:300]}…'!r}\")\n",
 "\n",
 "    enriched_text = chunker.serialize(chunk=chunk)\n",
-"    print(f\"chunker.serialize(chunk):\\n{repr(f'{enriched_text[:300]}…')}\")\n",
+"    print(f\"chunker.serialize(chunk):\\n{f'{enriched_text[:300]}…'!r}\")\n",
 "\n",
 "    print()"
 ]

@@ -353,11 +353,11 @@
 "for i, chunk in enumerate(chunks):\n",
 "    print(f\"=== {i} ===\")\n",
 "    txt_tokens = len(tokenizer.tokenize(chunk.text))\n",
-"    print(f\"chunk.text ({txt_tokens} tokens):\\n{repr(chunk.text)}\")\n",
+"    print(f\"chunk.text ({txt_tokens} tokens):\\n{chunk.text!r}\")\n",
 "\n",
 "    ser_txt = chunker.serialize(chunk=chunk)\n",
 "    ser_tokens = len(tokenizer.tokenize(ser_txt))\n",
-"    print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{repr(ser_txt)}\")\n",
+"    print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{ser_txt!r}\")\n",
 "\n",
 "    print()"
 ]

@@ -2,17 +2,14 @@ import json
 import time
 from pathlib import Path

-import yaml
+from docling_core.types.doc import DocItemLabel, ImageRefMode
+from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS

 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
-    AcceleratorDevice,
     VlmPipelineOptions,
-    granite_vision_vlm_conversion_options,
-    smoldocling_vlm_conversion_options,
     smoldocling_vlm_mlx_conversion_options,
 )
-from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline

@@ -39,9 +36,6 @@ pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
 ## Alternative VLM models:
 # pipeline_options.vlm_options = granite_vision_vlm_conversion_options

-from docling_core.types.doc import DocItemLabel, ImageRefMode
-from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
-
 ## Set up pipeline for PDF or image inputs
 converter = DocumentConverter(
     format_options={

@@ -62,7 +56,7 @@ out_path.mkdir(parents=True, exist_ok=True)
 for source in sources:
     start_time = time.time()
     print("================================================")
-    print("Processing... {}".format(source))
+    print(f"Processing... {source}")
     print("================================================")
     print("")


@@ -77,7 +71,7 @@ for source in sources:
         print(page.predictions.vlm_response.text)

     res.document.save_as_html(
-        filename=Path("{}/{}.html".format(out_path, res.input.file.stem)),
+        filename=Path(f"{out_path}/{res.input.file.stem}.html"),
         image_mode=ImageRefMode.REFERENCED,
         labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
     )

@@ -144,7 +144,7 @@
 "for pic in doc.pictures[:5]:\n",
 "    html_item = (\n",
 "        f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n",
-"        f'<img src=\"{str(pic.image.uri)}\" /><br />'\n",
+"        f'<img src=\"{pic.image.uri!s}\" /><br />'\n",
 "        f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n",
 "    )\n",
 "    for annotation in pic.annotations:\n",

@@ -252,7 +252,7 @@
 "for pic in doc.pictures[:5]:\n",
 "    html_item = (\n",
 "        f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n",
-"        f'<img src=\"{str(pic.image.uri)}\" /><br />'\n",
+"        f'<img src=\"{pic.image.uri!s}\" /><br />'\n",
 "        f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n",
 "    )\n",
 "    for annotation in pic.annotations:\n",

@@ -283,7 +283,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 23,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {

@@ -369,7 +369,7 @@
 "    new_index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)\n",
 "    try:\n",
 "        index_client.delete_index(index_name)\n",
-"    except:\n",
+"    except Exception:\n",
 "        pass\n",
 "\n",
 "    index_client.create_or_update_index(new_index)\n",

@@ -487,7 +487,7 @@
 "\n",
 "    all_succeeded = all(r.succeeded for r in resp)\n",
 "    console.print(\n",
-"        f\"Uploaded batch {i} -> {i+len(subset)}; all_succeeded: {all_succeeded}, \"\n",
+"        f\"Uploaded batch {i} -> {i + len(subset)}; all_succeeded: {all_succeeded}, \"\n",
 "        f\"first_doc_status_code: {resp[0].status_code}\"\n",
 "    )\n",
 "\n",

@@ -807,10 +807,12 @@
 }
 ],
 "source": [
+"from typing import Optional\n",
+"\n",
 "from azure.search.documents.models import VectorizableTextQuery\n",
 "\n",
 "\n",
-"def generate_chat_response(prompt: str, system_message: str = None):\n",
+"def generate_chat_response(prompt: str, system_message: Optional[str] = None):\n",
 "    \"\"\"\n",
 "    Generates a single-turn chat response using Azure OpenAI Chat.\n",
 "    If you need multi-turn conversation or follow-up queries, you'll have to\n",

@@ -351,7 +351,7 @@
 "for source in sources:\n",
 "    if EXPORT_TYPE == ExportType.DOC_CHUNKS:\n",
 "        doc_chunk = DocChunk.model_validate(source.meta[\"dl_meta\"])\n",
-"        print(f\"- text: {repr(doc_chunk.text)}\")\n",
+"        print(f\"- text: {doc_chunk.text!r}\")\n",
 "        if doc_chunk.meta.origin:\n",
 "            print(f\"  file: {doc_chunk.meta.origin.filename}\")\n",
 "        if doc_chunk.meta.headings:\n",

@@ -341,7 +341,7 @@
 "print(f\"Question:\\n{resp_dict['input']}\\n\\nAnswer:\\n{clipped_answer}\")\n",
 "for i, doc in enumerate(resp_dict[\"context\"]):\n",
 "    print()\n",
-"    print(f\"Source {i+1}:\")\n",
+"    print(f\"Source {i + 1}:\")\n",
 "    print(f\"  text: {json.dumps(clip_text(doc.page_content, threshold=350))}\")\n",
 "    for key in doc.metadata:\n",
 "        if key != \"pk\":\n",

@@ -59,7 +59,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 1,
+"execution_count": null,
 "metadata": {
 "collapsed": true,
 "id": "u076oUSF_YUG"

@@ -72,12 +72,11 @@
 "%pip install rich\n",
 "%pip install torch\n",
 "\n",
+"import logging\n",
 "import warnings\n",
 "\n",
 "warnings.filterwarnings(\"ignore\")\n",
 "\n",
-"import logging\n",
-"\n",
 "# Suppress Weaviate client logs\n",
 "logging.getLogger(\"weaviate\").setLevel(logging.ERROR)"
 ]

@@ -119,7 +118,7 @@
 "    device = torch.device(\"mps\")\n",
 "    print(\"MPS GPU is enabled.\")\n",
 "else:\n",
-"    raise EnvironmentError(\n",
+"    raise OSError(\n",
 "        \"No GPU or MPS device found. Please check your environment and ensure GPU or MPS support is configured.\"\n",
 "    )"
 ]

@@ -226,7 +225,6 @@
 }
 ],
 "source": [
-"from docling.datamodel.document import ConversionResult\n",
 "from docling.document_converter import DocumentConverter\n",
 "\n",
 "# Instantiate the doc converter\n",

@@ -345,7 +343,7 @@
 "\n",
 "    openai_api_key = os.getenv(openai_api_key_var)\n",
 "    if not openai_api_key:\n",
-"        raise EnvironmentError(\n",
+"        raise OSError(\n",
 "            f\"Environment variable '{openai_api_key_var}' is not set. \"\n",
 "            \"Please define it before running this script.\"\n",
 "        )"

@@ -387,7 +385,6 @@
 "outputs": [],
 "source": [
 "import weaviate.classes.config as wc\n",
-"from weaviate.classes.config import DataType, Property\n",
 "\n",
 "# Define the collection name\n",
 "collection_name = \"docling\"\n",

@@ -25,9 +25,7 @@ def main():
         document = mdb.convert()

         out_path = Path("scratch")
-        print(
-            f"Document {path} converted." f"\nSaved markdown output to: {str(out_path)}"
-        )
+        print(f"Document {path} converted.\nSaved markdown output to: {out_path!s}")

         # Export Docling document format to markdowndoc:
         fn = os.path.basename(path)

@@ -1,13 +1,10 @@
 from pathlib import Path

-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
     AcceleratorDevice,
     AcceleratorOptions,
     PdfPipelineOptions,
-    TesseractCliOcrOptions,
-    TesseractOcrOptions,
 )
 from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, PdfFormatOption

@@ -63,7 +63,7 @@ def main():
     out_path = Path("scratch")
     print(
         f"Document {res.input.file.name} converted."
-        f"\nSaved markdown output to: {str(out_path)}"
+        f"\nSaved markdown output to: {out_path!s}"
     )
     _log.debug(res.document._export_to_indented_text(max_text_len=16))
     # Export Docling document format to markdowndoc:

@@ -4,7 +4,6 @@ from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
     PdfPipelineOptions,
     TesseractCliOcrOptions,
-    TesseractOcrOptions,
 )
 from docling.document_converter import DocumentConverter, PdfFormatOption


@@ -2,9 +2,9 @@ import logging
 import time
 from pathlib import Path

-from docling_core.types.doc import ImageRefMode, PictureItem, TableItem, TextItem
+from docling_core.types.doc import ImageRefMode, TableItem, TextItem

-from docling.datamodel.base_models import FigureElement, InputFormat, Table
+from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption


@@ -15,7 +15,6 @@ IMAGE_RESOLUTION_SCALE = 2.0

 # FIXME: put in your favorite translation code ....
 def translate(text: str, src: str = "en", dest: str = "de"):
-
     _log.warning("!!! IMPLEMENT HERE YOUR FAVORITE TRANSLATION CODE!!!")
     # from googletrans import Translator


@@ -52,10 +51,9 @@ def main():
         }
     )

-    start_time = time.time()
-
     conv_res = doc_converter.convert(input_doc_path)
     conv_doc = conv_res.document
+    doc_filename = conv_res.input.file

     # Save markdown with embedded pictures in original text
     md_filename = output_dir / f"{doc_filename}-with-images-orig.md"
@@ -432,7 +432,7 @@
 "\n",
 "for i, doc in enumerate(resp_dict[\"context\"][:]):\n",
 "    image_by_page = {}\n",
-"    print(f\"Source {i+1}:\")\n",
+"    print(f\"Source {i + 1}:\")\n",
 "    print(f\"  text: {json.dumps(clip_text(doc.page_content, threshold=350))}\")\n",
 "    meta = DocMeta.model_validate(doc.metadata[\"dl_meta\"])\n",
 "\n",

@@ -10,7 +10,6 @@ from docling.datamodel.pipeline_options import (
     ApiVlmOptions,
     ResponseFormat,
     VlmPipelineOptions,
-    granite_vision_vlm_ollama_conversion_options,
 )
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline

98
poetry.lock
generated

@@ -692,6 +692,84 @@ traitlets = ">=4"
 [package.extras]
 test = ["pytest"]

+[[package]]
+name = "coverage"
+version = "7.8.0"
+description = "Code coverage measurement for Python"
+optional = false
+python-versions = ">=3.9"
+files = [
+    {file = "coverage-7.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2931f66991175369859b5fd58529cd4b73582461877ecfd859b6549869287ffe"},
+    {file = "coverage-7.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:52a523153c568d2c0ef8826f6cc23031dc86cffb8c6aeab92c4ff776e7951b28"},
+    {file = "coverage-7.8.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c8a5c139aae4c35cbd7cadca1df02ea8cf28a911534fc1b0456acb0b14234f3"},
+    {file = "coverage-7.8.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5a26c0c795c3e0b63ec7da6efded5f0bc856d7c0b24b2ac84b4d1d7bc578d676"},
+    {file = "coverage-7.8.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:821f7bcbaa84318287115d54becb1915eece6918136c6f91045bb84e2f88739d"},
+    {file = "coverage-7.8.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a321c61477ff8ee705b8a5fed370b5710c56b3a52d17b983d9215861e37b642a"},
+    {file = "coverage-7.8.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:ed2144b8a78f9d94d9515963ed273d620e07846acd5d4b0a642d4849e8d91a0c"},
+    {file = "coverage-7.8.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:042e7841a26498fff7a37d6fda770d17519982f5b7d8bf5278d140b67b61095f"},
+    {file = "coverage-7.8.0-cp310-cp310-win32.whl", hash = "sha256:f9983d01d7705b2d1f7a95e10bbe4091fabc03a46881a256c2787637b087003f"},
+    {file = "coverage-7.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:5a570cd9bd20b85d1a0d7b009aaf6c110b52b5755c17be6962f8ccd65d1dbd23"},
+    {file = "coverage-7.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e7ac22a0bb2c7c49f441f7a6d46c9c80d96e56f5a8bc6972529ed43c8b694e27"},
+    {file = "coverage-7.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bf13d564d310c156d1c8e53877baf2993fb3073b2fc9f69790ca6a732eb4bfea"},
+    {file = "coverage-7.8.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5761c70c017c1b0d21b0815a920ffb94a670c8d5d409d9b38857874c21f70d7"},
+    {file = "coverage-7.8.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e5ff52d790c7e1628241ffbcaeb33e07d14b007b6eb00a19320c7b8a7024c040"},
+    {file = "coverage-7.8.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d39fc4817fd67b3915256af5dda75fd4ee10621a3d484524487e33416c6f3543"},
+    {file = "coverage-7.8.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b44674870709017e4b4036e3d0d6c17f06a0e6d4436422e0ad29b882c40697d2"},
+    {file = "coverage-7.8.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8f99eb72bf27cbb167b636eb1726f590c00e1ad375002230607a844d9e9a2318"},
+    {file = "coverage-7.8.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b571bf5341ba8c6bc02e0baeaf3b061ab993bf372d982ae509807e7f112554e9"},
+    {file = "coverage-7.8.0-cp311-cp311-win32.whl", hash = "sha256:e75a2ad7b647fd8046d58c3132d7eaf31b12d8a53c0e4b21fa9c4d23d6ee6d3c"},
+    {file = "coverage-7.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:3043ba1c88b2139126fc72cb48574b90e2e0546d4c78b5299317f61b7f718b78"},
+    {file = "coverage-7.8.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:bbb5cc845a0292e0c520656d19d7ce40e18d0e19b22cb3e0409135a575bf79fc"},
+    {file = "coverage-7.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4dfd9a93db9e78666d178d4f08a5408aa3f2474ad4d0e0378ed5f2ef71640cb6"},
+    {file = "coverage-7.8.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f017a61399f13aa6d1039f75cd467be388d157cd81f1a119b9d9a68ba6f2830d"},
+    {file = "coverage-7.8.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0915742f4c82208ebf47a2b154a5334155ed9ef9fe6190674b8a46c2fb89cb05"},
+    {file = "coverage-7.8.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a40fcf208e021eb14b0fac6bdb045c0e0cab53105f93ba0d03fd934c956143a"},
+    {file = "coverage-7.8.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a1f406a8e0995d654b2ad87c62caf6befa767885301f3b8f6f73e6f3c31ec3a6"},
+    {file = "coverage-7.8.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:77af0f6447a582fdc7de5e06fa3757a3ef87769fbb0fdbdeba78c23049140a47"},
+    {file = "coverage-7.8.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:f2d32f95922927186c6dbc8bc60df0d186b6edb828d299ab10898ef3f40052fe"},
+    {file = "coverage-7.8.0-cp312-cp312-win32.whl", hash = "sha256:769773614e676f9d8e8a0980dd7740f09a6ea386d0f383db6821df07d0f08545"},
+    {file = "coverage-7.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:e5d2b9be5b0693cf21eb4ce0ec8d211efb43966f6657807f6859aab3814f946b"},
+    {file = "coverage-7.8.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5ac46d0c2dd5820ce93943a501ac5f6548ea81594777ca585bf002aa8854cacd"},
+    {file = "coverage-7.8.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:771eb7587a0563ca5bb6f622b9ed7f9d07bd08900f7589b4febff05f469bea00"},
+    {file = "coverage-7.8.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42421e04069fb2cbcbca5a696c4050b84a43b05392679d4068acbe65449b5c64"},
+    {file = "coverage-7.8.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:554fec1199d93ab30adaa751db68acec2b41c5602ac944bb19187cb9a41a8067"},
+    {file = "coverage-7.8.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5aaeb00761f985007b38cf463b1d160a14a22c34eb3f6a39d9ad6fc27cb73008"},
+    {file = "coverage-7.8.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:581a40c7b94921fffd6457ffe532259813fc68eb2bdda60fa8cc343414ce3733"},
+    {file = "coverage-7.8.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:f319bae0321bc838e205bf9e5bc28f0a3165f30c203b610f17ab5552cff90323"},
+    {file = "coverage-7.8.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:04bfec25a8ef1c5f41f5e7e5c842f6b615599ca8ba8391ec33a9290d9d2db3a3"},
+    {file = "coverage-7.8.0-cp313-cp313-win32.whl", hash = "sha256:dd19608788b50eed889e13a5d71d832edc34fc9dfce606f66e8f9f917eef910d"},
+    {file = "coverage-7.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:a9abbccd778d98e9c7e85038e35e91e67f5b520776781d9a1e2ee9d400869487"},
+    {file = "coverage-7.8.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:18c5ae6d061ad5b3e7eef4363fb27a0576012a7447af48be6c75b88494c6cf25"},
+    {file = "coverage-7.8.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:95aa6ae391a22bbbce1b77ddac846c98c5473de0372ba5c463480043a07bff42"},
+    {file = "coverage-7.8.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e013b07ba1c748dacc2a80e69a46286ff145935f260eb8c72df7185bf048f502"},
+    {file = "coverage-7.8.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d766a4f0e5aa1ba056ec3496243150698dc0481902e2b8559314368717be82b1"},
+    {file = "coverage-7.8.0-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad80e6b4a0c3cb6f10f29ae4c60e991f424e6b14219d46f1e7d442b938ee68a4"},
+    {file = "coverage-7.8.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:b87eb6fc9e1bb8f98892a2458781348fa37e6925f35bb6ceb9d4afd54ba36c73"},
+    {file = "coverage-7.8.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:d1ba00ae33be84066cfbe7361d4e04dec78445b2b88bdb734d0d1cbab916025a"},
+    {file = "coverage-7.8.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f3c38e4e5ccbdc9198aecc766cedbb134b2d89bf64533973678dfcf07effd883"},
+    {file = "coverage-7.8.0-cp313-cp313t-win32.whl", hash = "sha256:379fe315e206b14e21db5240f89dc0774bdd3e25c3c58c2c733c99eca96f1ada"},
+    {file = "coverage-7.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:2e4b6b87bb0c846a9315e3ab4be2d52fac905100565f4b92f02c445c8799e257"},
+    {file = "coverage-7.8.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fa260de59dfb143af06dcf30c2be0b200bed2a73737a8a59248fcb9fa601ef0f"},
+    {file = "coverage-7.8.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:96121edfa4c2dfdda409877ea8608dd01de816a4dc4a0523356067b305e4e17a"},
+    {file = "coverage-7.8.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b8af63b9afa1031c0ef05b217faa598f3069148eeee6bb24b79da9012423b82"},
+    {file = "coverage-7.8.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:89b1f4af0d4afe495cd4787a68e00f30f1d15939f550e869de90a86efa7e0814"},
+    {file = "coverage-7.8.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94ec0be97723ae72d63d3aa41961a0b9a6f5a53ff599813c324548d18e3b9e8c"},
+    {file = "coverage-7.8.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:8a1d96e780bdb2d0cbb297325711701f7c0b6f89199a57f2049e90064c29f6bd"},
+    {file = "coverage-7.8.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:f1d8a2a57b47142b10374902777e798784abf400a004b14f1b0b9eaf1e528ba4"},
+    {file = "coverage-7.8.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:cf60dd2696b457b710dd40bf17ad269d5f5457b96442f7f85722bdb16fa6c899"},
+    {file = "coverage-7.8.0-cp39-cp39-win32.whl", hash = "sha256:be945402e03de47ba1872cd5236395e0f4ad635526185a930735f66710e1bd3f"},
+    {file = "coverage-7.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:90e7fbc6216ecaffa5a880cdc9c77b7418c1dcb166166b78dbc630d07f278cc3"},
+    {file = "coverage-7.8.0-pp39.pp310.pp311-none-any.whl", hash = "sha256:b8194fb8e50d556d5849753de991d390c5a1edeeba50f68e3a9253fbd8bf8ccd"},
+    {file = "coverage-7.8.0-py3-none-any.whl", hash = "sha256:dbf364b4c5e7bae9250528167dfe40219b62e2d573c854d74be213e1e52069f7"},
+    {file = "coverage-7.8.0.tar.gz", hash = "sha256:7a3d62b3b03b4b6fd41a085f3574874cf946cb4604d2b4d3e8dca8cd570ca501"},
+]
+
+[package.dependencies]
+tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.11.0a6\" and extra == \"toml\""}
+
+[package.extras]
+toml = ["tomli"]
+
 [[package]]
 name = "cryptography"
 version = "43.0.3"

@@ -5073,6 +5151,24 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
 [package.extras]
 testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]

+[[package]]
+name = "pytest-cov"
+version = "6.1.1"
+description = "Pytest plugin for measuring coverage."
+optional = false
+python-versions = ">=3.9"
+files = [
+    {file = "pytest_cov-6.1.1-py3-none-any.whl", hash = "sha256:bddf29ed2d0ab6f4df17b4c55b0a657287db8684af9c42ea546b21b1041b3dde"},
+    {file = "pytest_cov-6.1.1.tar.gz", hash = "sha256:46935f7aaefba760e716c2ebfbe1c216240b9592966e7da99ea8292d4d3e2a0a"},
+]
+
+[package.dependencies]
+coverage = {version = ">=7.5", extras = ["toml"]}
+pytest = ">=4.6"
+
+[package.extras]
+testing = ["fields", "hunter", "process-tests", "pytest-xdist", "virtualenv"]
+
 [[package]]
 name = "pytest-xdist"
 version = "3.6.1"

@@ -7882,4 +7978,4 @@ vlm = ["accelerate", "transformers", "transformers"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "d2a8f7997b9ffb249ad26ba492b766d580bdb0072d50e76b0afd92496e983e96"
+content-hash = "b36037ec17dc4b6d5197a2f63a1367e05bf888b4fa97e2e2e8c29c217741d69c"

@@ -110,6 +110,8 @@ ipywidgets = "^8.1.5"
 nbqa = "^1.9.0"
 types-openpyxl = "^3.1.5.20241114"
 types-tqdm = "^4.67.0.20241221"
+coverage = "^7.6.2"
+pytest-cov = "^6.0.0"

 [tool.poetry.group.docs.dependencies]
 mkdocs-material = "^9.5.40"
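Note (not part of the commit): with coverage and pytest-cov available as dev dependencies, a coverage run can write a Cobertura-style coverage.xml report. The helper below is a hedged sketch of reading the overall line rate from such a report; the file name and the line-rate attribute are assumptions based on the tools' default XML output.

# Illustrative sketch only -- not from the repository.
import xml.etree.ElementTree as ET
from pathlib import Path


def overall_line_rate(report: Path = Path("coverage.xml")) -> float:
    # The Cobertura root element carries the aggregate line rate as an attribute.
    root = ET.parse(report).getroot()
    return float(root.attrib["line-rate"])


if __name__ == "__main__":
    print(f"line coverage: {overall_line_rate():.1%}")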
@@ -164,15 +166,82 @@ docling-tools = "docling.cli.tools:app"
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"

-[tool.black]
-line-length = 88
-target-version = ["py39"]
-include = '\.pyi?$'
-
-[tool.isort]
-profile = "black"
-line_length = 88
-py_version = 39
+[tool.ruff]
+target-version = "py39"
+line-length = 88
+respect-gitignore = true
+
+# extend-exclude = [
+#     "tests",
+# ]
+
+[tool.ruff.format]
+skip-magic-trailing-comma = false
+
+[tool.ruff.lint]
+select = [
+    # "B",  # flake8-bugbear
+    "C",  # flake8-comprehensions
+    "C9",  # mccabe
+    # "D",  # flake8-docstrings
+    "E",  # pycodestyle errors (default)
+    "F",  # pyflakes (default)
+    "I",  # isort
+    "PD",  # pandas-vet
+    "PIE",  # pie
+    # "PTH",  # pathlib
+    "Q",  # flake8-quotes
+    # "RET",  # return
+    "RUF",  # Enable all ruff-specific checks
+    # "SIM",  # simplify
+    "S307",  # eval
+    # "T20",  # (disallow print statements) keep debugging statements out of the codebase
+    "W",  # pycodestyle warnings
+    "ASYNC",  # async
+    "UP",  # pyupgrade
+]
+
+ignore = [
+    "C408",  # Unnecessary `dict()` call (rewrite as a literal)
+    "E501",  # Line too long, handled by ruff formatter
+    "D107",  # "Missing docstring in __init__",
+    "F401",  # imported but unused; consider using `importlib.util.find_spec` to test for "
+    "F811",  # "redefinition of the same function"
+    "PL",  # Pylint
+    "RUF012",  # Mutable Class Attributes
+    "UP006",  # List vs list, etc
+    "UP007",  # Option and Union
+    "UP035",  # `typing.Set` is deprecated, use `set` instead"
+]
+
+#extend-select = []
+
+[tool.ruff.lint.pep8-naming]
+classmethod-decorators = [
+    # Allow Pydantic's `@validator` decorator to trigger class method treatment.
+    "pydantic.validator",
+]
+
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["E402", "F401"]
+"tests/*.py" = ["ASYNC"]  # Disable ASYNC check for tests
+
+[tool.ruff.lint.mccabe]
+max-complexity = 20
+
+# [tool.ruff.lint.isort.sections]
+# "docling" = ["docling_core", "docling_ibm_models", "docling_parse"]
+
+[tool.ruff.lint.isort]
+combine-as-imports = true
+# section-order = [
+#     "future",
+#     "standard-library",
+#     "third-party",
+#     "docling",
+#     "first-party",
+#     "local-folder",
+# ]

 [tool.mypy]
 pretty = true

@@ -200,10 +269,6 @@ module = [
 ]
 ignore_missing_imports = true

-[tool.flake8]
-max-line-length = 88
-extend-ignore = ["E203", "E501"]
-
 [tool.semantic_release]
 # for default values check:
 # https://github.com/python-semantic-release/python-semantic-release/blob/v7.32.2/semantic_release/defaults.cfg
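Note (not part of the commit): a minimal, hedged sketch of the style that the rule families selected above (pycodestyle E/W, pyflakes F, isort I, pyupgrade UP, flake8-comprehensions C, RUF) steer code toward, mirroring the rewrites applied elsewhere in this diff; the module and function names below are illustrative only.

# Illustrative sketch only -- not from the repository.
import logging
from pathlib import Path

_log = logging.getLogger(__name__)


def merge_chunks(head, tail):
    # RUF005: prefer unpacking over list concatenation such as `head + tail`.
    return [*head, *tail]


def read_version(path: Path) -> str:
    try:
        return path.read_text().strip()
    except Exception:  # E722: a bare `except:` is rejected by the linter
        # F541: a plain string, since there are no placeholders to format
        _log.info("Falling back to the default version string")
        return "0.0.0"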
@@ -19,7 +19,6 @@ def _get_backend(fname):


 def test_asciidocs_examples():
-
     fnames = sorted(glob.glob("./tests/data/asciidoc/*.asciidoc"))

     for fname in fnames:

@@ -38,8 +37,8 @@ def test_asciidocs_examples():
         print("\n\n", pred_mddoc)

         if os.path.exists(gname):
-            with open(gname, "r") as fr:
-                true_mddoc = fr.read()
+            with open(gname) as fr:
+                fr.read()

             # assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
         else:

@@ -1,5 +1,3 @@
-import json
-import os
 from pathlib import Path

 from pytest import warns

@@ -15,22 +13,19 @@ GENERATE = GEN_TEST_DATA


 def get_csv_paths():
-
     # Define the directory you want to search
-    directory = Path(f"./tests/data/csv/")
+    directory = Path("./tests/data/csv/")

     # List all CSV files in the directory and its subdirectories
     return sorted(directory.rglob("*.csv"))


 def get_csv_path(name: str):
-
     # Return the matching CSV file path
     return Path(f"./tests/data/csv/{name}.csv")


 def get_converter():
-
     converter = DocumentConverter(allowed_formats=[InputFormat.CSV])

     return converter

@@ -55,9 +50,9 @@ def test_e2e_valid_csv_conversions():
         pred_itxt: str = doc._export_to_indented_text(
             max_text_len=70, explicit_tables=False
         )
-        assert verify_export(
-            pred_itxt, str(gt_path) + ".itxt"
-        ), "export to indented-text"
+        assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
+            "export to indented-text"
+        )

         assert verify_document(
             pred_doc=doc,

@@ -32,7 +32,7 @@ def test_text_cell_counts():

     doc_backend = _get_backend(pdf_doc)

-    for page_index in range(0, doc_backend.page_count()):
+    for page_index in range(doc_backend.page_count()):
         last_cell_count = None
         for i in range(10):
             page_backend: DoclingParsePageBackend = doc_backend.load_page(0)

@@ -42,9 +42,9 @@ def test_text_cell_counts():
                 last_cell_count = len(cells)

             if len(cells) != last_cell_count:
-                assert (
-                    False
-                ), "Loading page multiple times yielded non-identical text cell counts"
+                assert False, (
+                    "Loading page multiple times yielded non-identical text cell counts"
+                )
             last_cell_count = len(cells)


@@ -66,7 +66,7 @@ def test_crop_page_image(test_doc_path):
     page_backend: DoclingParsePageBackend = doc_backend.load_page(0)

     # Crop out "Figure 1" from the DocLayNet paper
-    im = page_backend.get_page_image(
+    page_backend.get_page_image(
         scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
     )
     # im.show()

@@ -31,7 +31,7 @@ def test_text_cell_counts():

     doc_backend = _get_backend(pdf_doc)

-    for page_index in range(0, doc_backend.page_count()):
+    for page_index in range(doc_backend.page_count()):
         last_cell_count = None
         for i in range(10):
             page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0)

@@ -41,9 +41,9 @@ def test_text_cell_counts():
                 last_cell_count = len(cells)

             if len(cells) != last_cell_count:
-                assert (
-                    False
-                ), "Loading page multiple times yielded non-identical text cell counts"
+                assert False, (
+                    "Loading page multiple times yielded non-identical text cell counts"
+                )
             last_cell_count = len(cells)

||||||
|
|
||||||
@ -65,7 +65,7 @@ def test_crop_page_image(test_doc_path):
|
|||||||
page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0)
|
page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0)
|
||||||
|
|
||||||
# Crop out "Figure 1" from the DocLayNet paper
|
# Crop out "Figure 1" from the DocLayNet paper
|
||||||
im = page_backend.get_page_image(
|
page_backend.get_page_image(
|
||||||
scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
|
scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
|
||||||
)
|
)
|
||||||
# im.show()
|
# im.show()
|
||||||

@@ -31,7 +31,7 @@ def test_text_cell_counts():
 
     doc_backend = _get_backend(pdf_doc)
 
-    for page_index in range(0, doc_backend.page_count()):
+    for page_index in range(doc_backend.page_count()):
         last_cell_count = None
         for i in range(10):
             page_backend: DoclingParseV4PageBackend = doc_backend.load_page(0)
@@ -41,9 +41,9 @@ def test_text_cell_counts():
                 last_cell_count = len(cells)
 
             if len(cells) != last_cell_count:
-                assert (
-                    False
-                ), "Loading page multiple times yielded non-identical text cell counts"
+                assert False, (
+                    "Loading page multiple times yielded non-identical text cell counts"
+                )
             last_cell_count = len(cells)
 
 
@@ -65,7 +65,7 @@ def test_crop_page_image(test_doc_path):
     page_backend: DoclingParseV4PageBackend = doc_backend.load_page(0)
 
     # Crop out "Figure 1" from the DocLayNet paper
-    im = page_backend.get_page_image(
+    page_backend.get_page_image(
         scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
     )
     # im.show()

@@ -105,7 +105,6 @@ def test_ordered_lists():
 
 
 def get_html_paths():
-
     # Define the directory you want to search
     directory = Path("./tests/data/html/")
 
@@ -115,14 +114,12 @@ def get_html_paths():
 
 
 def get_converter():
-
     converter = DocumentConverter(allowed_formats=[InputFormat.HTML])
 
     return converter
 
 
 def test_e2e_html_conversions():
-
     html_paths = get_html_paths()
     converter = get_converter()
 
@@ -138,15 +135,15 @@ def test_e2e_html_conversions():
         doc: DoclingDocument = conv_result.document
 
         pred_md: str = doc.export_to_markdown()
-        assert verify_export(
-            pred_md, str(gt_path) + ".md", generate=GENERATE
-        ), "export to md"
+        assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
+            "export to md"
+        )
 
         pred_itxt: str = doc._export_to_indented_text(
             max_text_len=70, explicit_tables=False
         )
-        assert verify_export(
-            pred_itxt, str(gt_path) + ".itxt", generate=GENERATE
-        ), "export to indented-text"
+        assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
+            "export to indented-text"
+        )
 
         assert verify_document(doc, str(gt_path) + ".json", GENERATE)
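
Note: the single-line deletions right after the `def ...():` lines are consistent with the formatter stripping blank lines that sit directly below a function signature. A hypothetical minimal case of the same shape:

    # before
    def get_converter():

        converter = DocumentConverter(allowed_formats=[InputFormat.HTML])

    # after formatting: no blank line between the signature and the body
    def get_converter():
        converter = DocumentConverter(allowed_formats=[InputFormat.HTML])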

@@ -15,7 +15,7 @@ GENERATE = GEN_TEST_DATA
 
 
 def get_pubmed_paths():
-    directory = Path(os.path.dirname(__file__) + f"/data/pubmed/")
+    directory = Path(os.path.dirname(__file__) + "/data/pubmed/")
     xml_files = sorted(directory.rglob("*.xml"))
     return xml_files
 
@@ -47,9 +47,9 @@ def test_e2e_pubmed_conversions(use_stream=False):
         pred_itxt: str = doc._export_to_indented_text(
             max_text_len=70, explicit_tables=False
         )
-        assert verify_export(
-            pred_itxt, str(gt_path) + ".itxt"
-        ), "export to indented-text"
+        assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
+            "export to indented-text"
+        )
 
         assert verify_document(doc, str(gt_path) + ".json", GENERATE), "export to json"
 
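
Note: dropping the `f` prefix on `f"/data/pubmed/"` matches the rule for f-strings without any placeholders (F541); the literal has no interpolation, so the prefix was inert:

    # before: f-string with no placeholders
    directory = Path(os.path.dirname(__file__) + f"/data/pubmed/")
    # after: plain string literal
    directory = Path(os.path.dirname(__file__) + "/data/pubmed/")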

@@ -17,7 +17,6 @@ GENERATE = GEN_TEST_DATA
 
 
 def get_xlsx_paths():
-
     # Define the directory you want to search
     directory = Path("./tests/data/xlsx/")
 
@@ -27,7 +26,6 @@ def get_xlsx_paths():
 
 
 def get_converter():
-
     converter = DocumentConverter(allowed_formats=[InputFormat.XLSX])
 
     return converter
@@ -65,13 +63,13 @@ def test_e2e_xlsx_conversions(documents) -> None:
         pred_itxt: str = doc._export_to_indented_text(
             max_text_len=70, explicit_tables=False
         )
-        assert verify_export(
-            pred_itxt, str(gt_path) + ".itxt"
-        ), "export to indented-text"
+        assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
+            "export to indented-text"
+        )
 
-        assert verify_document(
-            doc, str(gt_path) + ".json", GENERATE
-        ), "document document"
+        assert verify_document(doc, str(gt_path) + ".json", GENERATE), (
+            "document document"
+        )
 
 
 def test_pages(documents) -> None:
@@ -81,7 +79,7 @@ def test_pages(documents) -> None:
         documents: The paths and converted documents.
    """
    # number of pages from the backend method
-    path = [item for item in get_xlsx_paths() if item.stem == "test-01"][0]
+    path = next(item for item in get_xlsx_paths() if item.stem == "test-01")
     in_doc = InputDocument(
         path_or_stream=path,
         format=InputFormat.XLSX,
@@ -92,7 +90,7 @@ def test_pages(documents) -> None:
     assert backend.page_count() == 3
 
     # number of pages from the converted document
-    doc = [item for path, item in documents if path.stem == "test-01"][0]
+    doc = next(item for path, item in documents if path.stem == "test-01")
     assert len(doc.pages) == 3
 
     # page sizes as number of cells
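
Note: the `[... for ... if ...][0]` → `next(...)` rewrites look like ruff's fix for taking only the first element of a freshly built list (likely RUF015). `next()` over a generator stops at the first match instead of materializing the whole list, and still fails loudly when nothing matches (StopIteration instead of IndexError):

    # before: builds the full filtered list, then indexes it
    path = [item for item in get_xlsx_paths() if item.stem == "test-01"][0]
    # after: lazily take the first matching item
    path = next(item for item in get_xlsx_paths() if item.stem == "test-01")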

@@ -1,4 +1,3 @@
-import os
 from pathlib import Path
 
 from docling.backend.msword_backend import MsWordDocumentBackend
@@ -43,7 +42,6 @@ def test_heading_levels():
 
 
 def get_docx_paths():
-
     # Define the directory you want to search
     directory = Path("./tests/data/docx/")
 
@@ -53,14 +51,12 @@ def get_docx_paths():
 
 
 def get_converter():
-
     converter = DocumentConverter(allowed_formats=[InputFormat.DOCX])
 
     return converter
 
 
 def test_e2e_docx_conversions():
-
     docx_paths = get_docx_paths()
     converter = get_converter()
 
@@ -76,20 +72,20 @@ def test_e2e_docx_conversions():
         doc: DoclingDocument = conv_result.document
 
         pred_md: str = doc.export_to_markdown()
-        assert verify_export(
-            pred_md, str(gt_path) + ".md", generate=GENERATE
-        ), "export to md"
+        assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
+            "export to md"
+        )
 
         pred_itxt: str = doc._export_to_indented_text(
             max_text_len=70, explicit_tables=False
        )
-        assert verify_export(
-            pred_itxt, str(gt_path) + ".itxt", generate=GENERATE
-        ), "export to indented-text"
+        assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
+            "export to indented-text"
+        )
 
-        assert verify_document(
-            doc, str(gt_path) + ".json", generate=GENERATE
-        ), "document document"
+        assert verify_document(doc, str(gt_path) + ".json", generate=GENERATE), (
+            "document document"
+        )
 
         if docx_path.name == "word_tables.docx":
             pred_html: str = doc.export_to_html()

@@ -109,27 +109,27 @@ def test_patent_groundtruth(patents, groundtruth):
         md_name = path.stem + ".md"
         if md_name in gt_names:
             pred_md = doc.export_to_markdown()
-            assert (
-                pred_md == gt_names[md_name]
-            ), f"Markdown file mismatch against groundtruth {md_name}"
+            assert pred_md == gt_names[md_name], (
+                f"Markdown file mismatch against groundtruth {md_name}"
+            )
         json_path = path.with_suffix(".json")
         if json_path.stem in gt_names:
-            assert verify_document(
-                doc, str(json_path), GENERATE
-            ), f"JSON file mismatch against groundtruth {json_path}"
+            assert verify_document(doc, str(json_path), GENERATE), (
+                f"JSON file mismatch against groundtruth {json_path}"
+            )
         itxt_name = path.stem + ".itxt"
         if itxt_name in gt_names:
             pred_itxt = doc._export_to_indented_text()
-            assert (
-                pred_itxt == gt_names[itxt_name]
-            ), f"Indented text file mismatch against groundtruth {itxt_name}"
+            assert pred_itxt == gt_names[itxt_name], (
+                f"Indented text file mismatch against groundtruth {itxt_name}"
+            )
 
 
 def test_tables(tables):
     """Test the table parser."""
     # CHECK table in file tables_20180000016.xml
     file_name = "tables_ipa20180000016.xml"
-    file_table = [item[1] for item in tables if item[0].name == file_name][0]
+    file_table = next(item[1] for item in tables if item[0].name == file_name)
     assert file_table.num_rows == 13
     assert file_table.num_cols == 10
     assert len(file_table.table_cells) == 130
@@ -140,7 +140,7 @@ def test_patent_uspto_ice(patents):
 
     # CHECK application doc number 20200022300
     file_name = "ipa20200022300.xml"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
     if GENERATE:
         _generate_groundtruth(doc, Path(file_name).stem)
 
@@ -278,7 +278,7 @@ def test_patent_uspto_ice(patents):
 
     # CHECK application doc number 20180000016 for HTML entities, level 2 headings, tables
     file_name = "ipa20180000016.xml"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
     if GENERATE:
         _generate_groundtruth(doc, Path(file_name).stem)
 
@@ -348,7 +348,7 @@ def test_patent_uspto_ice(patents):
 
     # CHECK application doc number 20110039701 for complex long tables
     file_name = "ipa20110039701.xml"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
     assert doc.name == file_name
     assert len(doc.tables) == 17
 
@@ -358,7 +358,7 @@ def test_patent_uspto_grant_v2(patents):
 
     # CHECK application doc number 06442728
     file_name = "pg06442728.xml"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
     if GENERATE:
         _generate_groundtruth(doc, Path(file_name).stem)
 
@@ -376,12 +376,12 @@ def test_patent_uspto_grant_v2(patents):
     assert isinstance(texts[2], TextItem)
     assert texts[2].text == (
         "An interleaver receives incoming data frames of size N. The interleaver "
-        "indexes the elements of the frame with an N₁×N₂ index array. The interleaver "
+        "indexes the elements of the frame with an N₁×N₂ index array. The interleaver "  # noqa: RUF001
         "then effectively rearranges (permutes) the data by permuting the rows of the "
-        "index array. The interleaver employs the equation I(j,k)=I(j,αjk+βj)modP) to "
+        "index array. The interleaver employs the equation I(j,k)=I(j,αjk+βj)modP) to "  # noqa: RUF001
         "permute the columns (indexed by k) of each row (indexed by j). P is at least "
         "equal to N₂, βj is a constant which may be different for each row, and each "
-        "αj is a relative prime number relative to P. After permuting, the "
+        "αj is a relative prime number relative to P. After permuting, the "  # noqa: RUF001
         "interleaver outputs the data in a different order than received (e.g., "
         "receives sequentially row by row, outputs sequentially each column by column)."
     )
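
Note: RUF001 flags ambiguous Unicode characters inside string literals (here the subscripts, the multiplication sign, and the Greek letters in the patent abstract). The text is groundtruth and must stay byte-for-byte, so the warning is suppressed inline rather than "fixed" (`expected` below is a stand-in name):

    # the literal intentionally contains N₁×N₂ and Greek letters, so silence RUF001 for this line
    expected = "indexes the elements of the frame with an N₁×N₂ index array."  # noqa: RUF001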

@@ -402,7 +402,7 @@ def test_patent_uspto_app_v1(patents):
 
     # CHECK application doc number 20010031492
     file_name = "pa20010031492.xml"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
     if GENERATE:
         _generate_groundtruth(doc, Path(file_name).stem)
 
@@ -432,7 +432,7 @@ def test_patent_uspto_grant_aps(patents):
 
     # CHECK application doc number 057006474
     file_name = "pftaps057006474.txt"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
     if GENERATE:
         _generate_groundtruth(doc, Path(file_name).stem)
 

@@ -32,7 +32,7 @@ def test_text_cell_counts():
 
     doc_backend = _get_backend(pdf_doc)
 
-    for page_index in range(0, doc_backend.page_count()):
+    for page_index in range(doc_backend.page_count()):
         last_cell_count = None
         for i in range(10):
             page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
@@ -42,9 +42,9 @@ def test_text_cell_counts():
                 last_cell_count = len(cells)
 
             if len(cells) != last_cell_count:
-                assert (
-                    False
-                ), "Loading page multiple times yielded non-identical text cell counts"
+                assert False, (
+                    "Loading page multiple times yielded non-identical text cell counts"
+                )
             last_cell_count = len(cells)
 
 
@@ -66,7 +66,7 @@ def test_crop_page_image(test_doc_path):
     page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
 
     # Crop out "Figure 1" from the DocLayNet paper
-    im = page_backend.get_page_image(
+    page_backend.get_page_image(
         scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
     )
     # im.show()

@@ -1,4 +1,3 @@
-import os
 from pathlib import Path
 
 from docling.datamodel.base_models import InputFormat
@@ -12,7 +11,6 @@ GENERATE = GEN_TEST_DATA
 
 
 def get_pptx_paths():
-
     # Define the directory you want to search
     directory = Path("./tests/data/pptx/")
 
@@ -22,14 +20,12 @@ def get_pptx_paths():
 
 
 def get_converter():
-
     converter = DocumentConverter(allowed_formats=[InputFormat.PPTX])
 
     return converter
 
 
 def test_e2e_pptx_conversions():
-
     pptx_paths = get_pptx_paths()
     converter = get_converter()
 
@@ -50,10 +46,10 @@ def test_e2e_pptx_conversions():
         pred_itxt: str = doc._export_to_indented_text(
             max_text_len=70, explicit_tables=False
         )
-        assert verify_export(
-            pred_itxt, str(gt_path) + ".itxt"
-        ), "export to indented-text"
+        assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
+            "export to indented-text"
+        )
 
-        assert verify_document(
-            doc, str(gt_path) + ".json", GENERATE
-        ), "document document"
+        assert verify_document(doc, str(gt_path) + ".json", GENERATE), (
+            "document document"
+        )

@@ -3,7 +3,6 @@ from pathlib import Path
 from docling_core.types.doc import CodeItem, TextItem
 from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel
 
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import PdfPipelineOptions
@@ -12,7 +11,6 @@ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
 
 
 def get_converter():
-
     pipeline_options = PdfPipelineOptions()
     pipeline_options.generate_page_images = True
 
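
Note: the deleted backend import lines in these modules appear to be unused imports; removing them is the standard fix for pyflakes' unused-import rule (F401):

    # before: imported but never referenced in the module (F401)
    from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
    # after: the import line is simply deleted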

@@ -2,7 +2,6 @@ from pathlib import Path
 
 from docling_core.types.doc import PictureClassificationData
 
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import PdfPipelineOptions
@@ -11,7 +10,6 @@ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
 
 
 def get_converter():
-
     pipeline_options = PdfPipelineOptions()
     pipeline_options.generate_page_images = True
 
@@ -49,32 +47,32 @@ def test_picture_classifier():
 
     res = results[0]
     assert len(res.annotations) == 1
-    assert type(res.annotations[0]) == PictureClassificationData
+    assert isinstance(res.annotations[0], PictureClassificationData)
     classification_data = res.annotations[0]
     assert classification_data.provenance == "DocumentPictureClassifier"
-    assert (
-        len(classification_data.predicted_classes) == 16
-    ), "Number of predicted classes is not equal to 16"
+    assert len(classification_data.predicted_classes) == 16, (
+        "Number of predicted classes is not equal to 16"
+    )
     confidences = [pred.confidence for pred in classification_data.predicted_classes]
-    assert confidences == sorted(
-        confidences, reverse=True
-    ), "Predictions are not sorted in descending order of confidence"
-    assert (
-        classification_data.predicted_classes[0].class_name == "bar_chart"
-    ), "The prediction is wrong for the bar chart image."
+    assert confidences == sorted(confidences, reverse=True), (
+        "Predictions are not sorted in descending order of confidence"
+    )
+    assert classification_data.predicted_classes[0].class_name == "bar_chart", (
+        "The prediction is wrong for the bar chart image."
+    )
 
     res = results[1]
     assert len(res.annotations) == 1
-    assert type(res.annotations[0]) == PictureClassificationData
+    assert isinstance(res.annotations[0], PictureClassificationData)
     classification_data = res.annotations[0]
     assert classification_data.provenance == "DocumentPictureClassifier"
-    assert (
-        len(classification_data.predicted_classes) == 16
-    ), "Number of predicted classes is not equal to 16"
+    assert len(classification_data.predicted_classes) == 16, (
+        "Number of predicted classes is not equal to 16"
+    )
     confidences = [pred.confidence for pred in classification_data.predicted_classes]
-    assert confidences == sorted(
-        confidences, reverse=True
-    ), "Predictions are not sorted in descending order of confidence"
-    assert (
-        classification_data.predicted_classes[0].class_name == "map"
-    ), "The prediction is wrong for the bar chart image."
+    assert confidences == sorted(confidences, reverse=True), (
+        "Predictions are not sorted in descending order of confidence"
+    )
+    assert classification_data.predicted_classes[0].class_name == "map", (
+        "The prediction is wrong for the bar chart image."
+    )
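
Note: `type(x) == SomeClass` → `isinstance(x, SomeClass)` is the usual fix for the type-comparison rule (E721); isinstance also accepts subclasses, which is what these checks intend:

    # before: direct type comparison, flagged by the linter
    assert type(res.annotations[0]) == PictureClassificationData
    # after: idiomatic instance check
    assert isinstance(res.annotations[0], PictureClassificationData)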

@@ -1,7 +1,6 @@
 from pathlib import Path
 
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
-from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import AcceleratorDevice, PdfPipelineOptions
@@ -15,7 +14,6 @@ GENERATE_V2 = GEN_TEST_DATA
 
 
 def get_pdf_paths():
-
     # Define the directory you want to search
     directory = Path("./tests/data/pdf/")
 
@@ -25,7 +23,6 @@ def get_pdf_paths():
 
 
 def get_converter():
-
     pipeline_options = PdfPipelineOptions()
     pipeline_options.do_ocr = False
     pipeline_options.do_table_structure = True
@@ -45,7 +42,6 @@ def get_converter():
 
 
 def test_e2e_pdfs_conversions():
-
     pdf_paths = get_pdf_paths()
     converter = get_converter()
 

@@ -3,7 +3,6 @@ from pathlib import Path
 from typing import List
 
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
-from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
@@ -12,10 +12,9 @@ from docling.document_converter import PdfFormatOption
 
 
 def test_in_doc_from_valid_path():
-
     test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
     doc = _make_input_doc(test_doc_path)
-    assert doc.valid == True
+    assert doc.valid is True
 
 
 def test_in_doc_from_invalid_path():
@@ -23,29 +22,26 @@ def test_in_doc_from_invalid_path():
 
     doc = _make_input_doc(test_doc_path)
 
-    assert doc.valid == False
+    assert doc.valid is False
 
 
 def test_in_doc_from_valid_buf():
-
     buf = BytesIO(Path("./tests/data/pdf/2206.01062.pdf").open("rb").read())
     stream = DocumentStream(name="my_doc.pdf", stream=buf)
 
     doc = _make_input_doc_from_stream(stream)
-    assert doc.valid == True
+    assert doc.valid is True
 
 
 def test_in_doc_from_invalid_buf():
-
     buf = BytesIO(b"")
     stream = DocumentStream(name="my_doc.pdf", stream=buf)
 
     doc = _make_input_doc_from_stream(stream)
-    assert doc.valid == False
+    assert doc.valid is False
 
 
 def test_image_in_pdf_backend():
-
     in_doc = InputDocument(
         path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
         format=InputFormat.IMAGE,
@@ -76,7 +72,6 @@ def test_image_in_pdf_backend():
 
 
 def test_in_doc_with_page_range():
-
     test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
     limits = DocumentLimits()
     limits.page_range = (1, 10)
@@ -87,7 +82,7 @@ def test_in_doc_with_page_range():
         backend=PyPdfiumDocumentBackend,
         limits=limits,
     )
-    assert doc.valid == True
+    assert doc.valid is True
 
     limits.page_range = (9, 9)
 
@@ -97,7 +92,7 @@ def test_in_doc_with_page_range():
         backend=PyPdfiumDocumentBackend,
         limits=limits,
     )
-    assert doc.valid == True
+    assert doc.valid is True
 
     limits.page_range = (11, 12)
 
@@ -107,7 +102,7 @@ def test_in_doc_with_page_range():
         backend=PyPdfiumDocumentBackend,
         limits=limits,
     )
-    assert doc.valid == False
+    assert doc.valid is False
 
 
 def test_guess_format(tmp_path):
@@ -192,17 +187,17 @@ def test_guess_format(tmp_path):
     )
     doc_path = temp_dir / "docling_test.xml"
     doc_path.write_text(xml_content, encoding="utf-8")
-    assert dci._guess_format(doc_path) == None
+    assert dci._guess_format(doc_path) is None
     buf = BytesIO(Path(doc_path).open("rb").read())
     stream = DocumentStream(name="docling_test.xml", stream=buf)
-    assert dci._guess_format(stream) == None
+    assert dci._guess_format(stream) is None
 
     # Invalid USPTO patent (as plain text)
     stream = DocumentStream(name="pftaps057006474.txt", stream=BytesIO(b"xyz"))
-    assert dci._guess_format(stream) == None
+    assert dci._guess_format(stream) is None
     doc_path = temp_dir / "pftaps_wrong.txt"
     doc_path.write_text("xyz", encoding="utf-8")
-    assert dci._guess_format(doc_path) == None
+    assert dci._guess_format(doc_path) is None
 
     # Valid Docling JSON
     test_str = '{"name": ""}'
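
Note: the `== True` / `== False` / `== None` comparisons are rewritten to identity checks, matching pycodestyle's E712/E711 rules as enforced by ruff:

    # before: equality comparison against a singleton
    assert doc.valid == True
    assert dci._guess_format(doc_path) == None
    # after: identity is the idiomatic check for True/False/None
    assert doc.valid is True
    assert dci._guess_format(doc_path) is None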
Some files were not shown because too many files have changed in this diff.