ci: add coverage and ruff (#1383)

* add coverage calculation and push

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* new codecov version and usage of token

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* enable ruff formatter instead of black and isort

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* apply ruff lint fixes

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* apply ruff unsafe fixes

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add removed imports

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* runs 1 on linter issues

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* finalize linter fixes

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* Update pyproject.toml

  Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
  Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
2025-04-14 18:01:26 +02:00
parent 293c28ca7c
commit 5458a88464
104 changed files with 665 additions and 633 deletions
--- a/docling/backend/asciidoc_backend.py
+++ b/docling/backend/asciidoc_backend.py
@@ -34,7 +34,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                text_stream = self.path_or_stream.getvalue().decode("utf-8")
                self.lines = text_stream.split("\n")
            if isinstance(self.path_or_stream, Path):
-                with open(self.path_or_stream, "r", encoding="utf-8") as f:
+                with open(self.path_or_stream, encoding="utf-8") as f:
                    self.lines = f.readlines()
            self.valid = True

@@ -75,14 +75,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):

        return doc

-    def _parse(self, doc: DoclingDocument):
+    def _parse(self, doc: DoclingDocument):  # noqa: C901
        """
        Main function that orchestrates the parsing by yielding components:
        title, section headers, text, lists, and tables.
        """

-        content = ""
-
        in_list = False
        in_table = False

@@ -95,7 +93,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
        # indents: dict[int, Union[DocItem, GroupItem, None]] = {}
        indents: dict[int, Union[GroupItem, None]] = {}

-        for i in range(0, 10):
+        for i in range(10):
            parents[i] = None
            indents[i] = None

@@ -125,7 +123,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):

            # Lists
            elif self._is_list_item(line):
-
                _log.debug(f"line: {line}")
                item = self._parse_list_item(line)
                _log.debug(f"parsed list-item: {item}")
@@ -147,7 +144,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                    indents[level + 1] = item["indent"]

                elif in_list and item["indent"] < indents[level]:
-
                    # print(item["indent"], " => ", indents[level])
                    while item["indent"] < indents[level]:
                        # print(item["indent"], " => ", indents[level])
@@ -176,7 +172,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
            elif in_table and (
                (not self._is_table_line(line)) or line.strip() == "|==="
            ):  # end of table
-
                caption = None
                if len(caption_data) > 0:
                    caption = doc.add_text(
@@ -195,7 +190,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):

            # Picture
            elif self._is_picture(line):
-
                caption = None
                if len(caption_data) > 0:
                    caption = doc.add_text(
@@ -250,7 +244,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                text_data = []

            elif len(line.strip()) > 0:  # allow multiline texts
-
                item = self._parse_text(line)
                text_data.append(item["text"])

@@ -273,14 +266,14 @@ class AsciiDocBackend(DeclarativeDocumentBackend):

    def _get_current_level(self, parents):
        for k, v in parents.items():
-            if v == None and k > 0:
+            if v is None and k > 0:
                return k - 1

        return 0

    def _get_current_parent(self, parents):
        for k, v in parents.items():
-            if v == None and k > 0:
+            if v is None and k > 0:
                return parents[k - 1]

        return None
@@ -328,7 +321,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                    "marker": marker,
                    "text": text.strip(),
                    "numbered": False,
-                    "indent": 0 if indent == None else len(indent),
+                    "indent": 0 if indent is None else len(indent),
                }
            else:
                return {
@@ -336,7 +329,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                    "marker": marker,
                    "text": text.strip(),
                    "numbered": True,
-                    "indent": 0 if indent == None else len(indent),
+                    "indent": 0 if indent is None else len(indent),
                }
        else:
            # Fallback if no match
@@ -357,7 +350,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
        return [cell.strip() for cell in line.split("|") if cell.strip()]

    def _populate_table_as_grid(self, table_data):
-
        num_rows = len(table_data)

        # Adjust the table data into a grid format
--- a/docling/backend/csv_backend.py
+++ b/docling/backend/csv_backend.py
@@ -58,7 +58,7 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
        head = self.content.readline()
        dialect = csv.Sniffer().sniff(head, ",;\t|:")
        _log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"')
-        if not dialect.delimiter in {",", ";", "\t", "|", ":"}:
+        if dialect.delimiter not in {",", ";", "\t", "|", ":"}:
            raise RuntimeError(
                f"Cannot convert csv with unknown delimiter {dialect.delimiter}."
            )
--- a/docling/backend/docling_parse_backend.py
+++ b/docling/backend/docling_parse_backend.py
@@ -1,8 +1,9 @@
 import logging
 import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import Iterable, List, Optional, Union
+from typing import List, Optional, Union

 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin, Size
@@ -156,7 +157,6 @@ class DoclingParsePageBackend(PdfPageBackend):
    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
-
        page_size = self.get_size()

        if not cropbox:
--- a/docling/backend/docling_parse_v2_backend.py
+++ b/docling/backend/docling_parse_v2_backend.py
@@ -1,8 +1,9 @@
 import logging
 import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+from typing import TYPE_CHECKING, List, Optional, Union

 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -172,7 +173,6 @@ class DoclingParseV2PageBackend(PdfPageBackend):
    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
-
        page_size = self.get_size()

        if not cropbox:
--- a/docling/backend/docling_parse_v4_backend.py
+++ b/docling/backend/docling_parse_v4_backend.py
@@ -1,14 +1,14 @@
 import logging
-import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+from typing import TYPE_CHECKING, Optional, Union

 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell
 from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
-from PIL import Image, ImageDraw
+from PIL import Image
 from pypdfium2 import PdfPage

 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
@@ -93,7 +93,6 @@ class DoclingParseV4PageBackend(PdfPageBackend):
    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
-
        page_size = self.get_size()

        if not cropbox:
--- a/docling/backend/docx/latex/latex_dict.py
+++ b/docling/backend/docx/latex/latex_dict.py
@@ -1,12 +1,8 @@
-# -*- coding: utf-8 -*-
-
 """
 Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
 On 23/01/2025
 """

-from __future__ import unicode_literals
-
 CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")

 BLANK = ""
@@ -79,7 +75,6 @@ CHR_BO = {
 }

 T = {
-    "\u2192": "\\rightarrow ",
    # Greek letters
    "\U0001d6fc": "\\alpha ",
    "\U0001d6fd": "\\beta ",
--- a/docling/backend/docx/latex/omml.py
+++ b/docling/backend/docx/latex/omml.py
@@ -76,8 +76,7 @@ def get_val(key, default=None, store=CHR):
        return default


-class Tag2Method(object):
-
+class Tag2Method:
    def call_method(self, elm, stag=None):
        getmethod = self.tag2meth.get
        if stag is None:
@@ -130,7 +129,6 @@ class Tag2Method(object):


 class Pr(Tag2Method):
-
    text = ""

    __val_tags = ("chr", "pos", "begChr", "endChr", "type")
@@ -159,7 +157,7 @@ class Pr(Tag2Method):
    def do_common(self, elm):
        stag = elm.tag.replace(OMML_NS, "")
        if stag in self.__val_tags:
-            t = elm.get("{0}val".format(OMML_NS))
+            t = elm.get(f"{OMML_NS}val")
            self.__innerdict[stag] = t
        return None

@@ -248,7 +246,6 @@ class oMath2Latex(Tag2Method):
        """
        the Pre-Sub-Superscript object -- Not support yet
        """
-        pass

    def do_sub(self, elm):
        text = self.process_children(elm)
@@ -331,7 +328,7 @@ class oMath2Latex(Tag2Method):
        t_dict = self.process_children_dict(elm, include=("e", "lim"))
        latex_s = LIM_FUNC.get(t_dict["e"])
        if not latex_s:
-            raise NotSupport("Not support lim %s" % t_dict["e"])
+            raise RuntimeError("Not support lim {}".format(t_dict["e"]))
        else:
            return latex_s.format(lim=t_dict.get("lim"))

@@ -413,7 +410,7 @@ class oMath2Latex(Tag2Method):
        """
        _str = []
        _base_str = []
-        found_text = elm.findtext("./{0}t".format(OMML_NS))
+        found_text = elm.findtext(f"./{OMML_NS}t")
        if found_text:
            for s in found_text:
                out_latex_str = self.process_unicode(s)
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -55,7 +55,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        self.max_levels = 10
        self.level = 0
        self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
-        for i in range(0, self.max_levels):
+        for i in range(self.max_levels):
            self.parents[i] = None

        try:
@@ -126,7 +126,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
        return doc

    def walk(self, tag: Tag, doc: DoclingDocument) -> None:
-
        # Iterate over elements in the body of the document
        text: str = ""
        for element in tag.children:
@@ -135,7 +134,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                    self.analyze_tag(cast(Tag, element), doc)
                except Exception as exc_child:
                    _log.error(
-                        f"Error processing child from tag {tag.name}: {repr(exc_child)}"
+                        f"Error processing child from tag {tag.name}: {exc_child!r}"
                    )
                    raise exc_child
            elif isinstance(element, NavigableString) and not isinstance(
@@ -147,7 +146,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                    item for item in element.next_siblings if isinstance(item, Tag)
                ]
                if element.next_sibling is None or any(
-                    [item.name in TAGS_FOR_NODE_ITEMS for item in siblings]
+                    item.name in TAGS_FOR_NODE_ITEMS for item in siblings
                ):
                    text = text.strip()
                    if text and tag.name in ["div"]:
@@ -222,7 +221,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
            )
        else:
            if hlevel > self.level:
-
                # add invisible group
                for i in range(self.level + 1, hlevel):
                    self.parents[i] = doc.add_group(
@@ -234,7 +232,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                self.level = hlevel

            elif hlevel < self.level:
-
                # remove the tail
                for key in self.parents.keys():
                    if key > hlevel:
@@ -360,7 +357,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
            marker = ""
            enumerated = False
            if parent_label == GroupLabel.ORDERED_LIST:
-                marker = f"{str(index_in_list)}."
+                marker = f"{index_in_list!s}."
                enumerated = True
            doc.add_list_item(
                text=text,
--- a/docling/backend/md_backend.py
+++ b/docling/backend/md_backend.py
@@ -83,7 +83,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                # otherwise they represent emphasis (bold or italic)
                self.markdown = self._shorten_underscore_sequences(text_stream)
            if isinstance(self.path_or_stream, Path):
-                with open(self.path_or_stream, "r", encoding="utf-8") as f:
+                with open(self.path_or_stream, encoding="utf-8") as f:
                    md_content = f.read()
                    # remove invalid sequences
                    # very long sequences of underscores will lead to unnecessary long processing times.
@@ -168,7 +168,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
            )
        self.inline_texts = []

-    def _iterate_elements(
+    def _iterate_elements(  # noqa: C901
        self,
        element: marko.element.Element,
        depth: int,
@@ -176,7 +176,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
        visited: Set[marko.element.Element],
        parent_item: Optional[NodeItem] = None,
    ):
-
        if element in visited:
            return

@@ -236,7 +235,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
            if has_non_empty_list_items:
                label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
                parent_item = doc.add_group(
-                    label=label, name=f"list", parent=parent_item
+                    label=label, name="list", parent=parent_item
                )

        elif (
@@ -320,7 +319,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
            self._html_blocks += 1
            self._process_inline_text(parent_item, doc)
            self._close_table(doc)
-            _log.debug("HTML Block: {}".format(element))
+            _log.debug(f"HTML Block: {element}")
            if (
                len(element.body) > 0
            ):  # If Marko doesn't return any content for HTML block, skip it
@@ -332,7 +331,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
        else:
            if not isinstance(element, str):
                self._close_table(doc)
-                _log.debug("Some other element: {}".format(element))
+                _log.debug(f"Some other element: {element}")

        processed_block_types = (
            marko.block.Heading,
@@ -398,7 +397,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):

            # if HTML blocks were detected, export to HTML and delegate to HTML backend
            if self._html_blocks > 0:
-
                # export to HTML
                html_backend_cls = HTMLDocumentBackend
                html_str = doc.export_to_html()
--- a/docling/backend/msexcel_backend.py
+++ b/docling/backend/msexcel_backend.py
@@ -184,7 +184,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
        """

        if self.workbook is not None:
-
            # Iterate over all sheets
            for sheet_name in self.workbook.sheetnames:
                _log.info(f"Processing sheet: {sheet_name}")
@@ -253,7 +252,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
                )

                for excel_cell in excel_table.data:
-
                    cell = TableCell(
                        text=excel_cell.text,
                        row_span=excel_cell.row_span,
@@ -303,7 +301,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
        # Iterate over all cells in the sheet
        for ri, row in enumerate(sheet.iter_rows(values_only=False)):
            for rj, cell in enumerate(row):
-
                # Skip empty or already visited cells
                if cell.value is None or (ri, rj) in visited:
                    continue
@@ -342,7 +339,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
        visited_cells: set[tuple[int, int]] = set()
        for ri in range(start_row, max_row + 1):
            for rj in range(start_col, max_col + 1):
-
                cell = sheet.cell(row=ri + 1, column=rj + 1)  # 1-based indexing

                # Check if the cell belongs to a merged range
@@ -350,14 +346,12 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
                col_span = 1

                for merged_range in sheet.merged_cells.ranges:
-
                    if (
                        merged_range.min_row <= ri + 1
                        and ri + 1 <= merged_range.max_row
                        and merged_range.min_col <= rj + 1
                        and rj + 1 <= merged_range.max_col
                    ):
-
                        row_span = merged_range.max_row - merged_range.min_row + 1
                        col_span = merged_range.max_col - merged_range.min_col + 1
                        break
@@ -499,7 +493,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
                            ),
                        ),
                    )
-                except:
+                except Exception:
                    _log.error("could not extract the image from excel sheets")

        return doc
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@@ -120,13 +120,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB

        return prov

-    def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
+    def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):  # noqa: C901
        is_a_list = False
        is_list_group_created = False
        enum_list_item_value = 0
        new_list = None
        bullet_type = "None"
-        list_text = ""
        list_label = GroupLabel.LIST
        doc_label = DocItemLabel.LIST_ITEM
        prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
@@ -243,7 +242,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
                    enum_marker = str(enum_list_item_value) + "."
                if not is_list_group_created:
                    new_list = doc.add_group(
-                        label=list_label, name=f"list", parent=parent_slide
+                        label=list_label, name="list", parent=parent_slide
                    )
                    is_list_group_created = True
                doc.add_list_item(
@@ -368,11 +367,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
        slide_width = pptx_obj.slide_width
        slide_height = pptx_obj.slide_height

-        text_content = []  # type: ignore
-
        max_levels = 10
        parents = {}  # type: ignore
-        for i in range(0, max_levels):
+        for i in range(max_levels):
            parents[i] = None

        # Loop through each slide
@@ -383,7 +380,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
            )

            slide_size = Size(width=slide_width, height=slide_height)
-            parent_page = doc.add_page(page_no=slide_ind + 1, size=slide_size)
+            doc.add_page(page_no=slide_ind + 1, size=slide_size)

            def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):
                handle_groups(shape, parent_slide, slide_ind, doc, slide_size)
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -158,7 +158,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
    def _get_level(self) -> int:
        """Return the first None index."""
        for k, v in self.parents.items():
-            if k >= 0 and v == None:
+            if k >= 0 and v is None:
                return k
        return 0

@@ -418,7 +418,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            else prev_parent
        )

-    def _handle_text_elements(
+    def _handle_text_elements(  # noqa: C901
        self,
        element: BaseOxmlElement,
        docx_obj: DocxDocument,
@@ -812,7 +812,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                    f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
                )
                if cell is None or cell._tc in cell_set:
-                    _log.debug(f"  skipped since repeated content")
+                    _log.debug("  skipped since repeated content")
                    col_idx += cell.grid_span
                    continue
                else:
@@ -879,7 +879,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                    image=ImageRef.from_pil(image=pil_image, dpi=72),
                    caption=None,
                )
-            except (UnidentifiedImageError, OSError) as e:
+            except (UnidentifiedImageError, OSError):
                _log.warning("Warning: image cannot be loaded by Pillow")
                doc.add_picture(
                    parent=self.parents[level - 1],
--- a/docling/backend/pdf_backend.py
+++ b/docling/backend/pdf_backend.py
@@ -1,7 +1,8 @@
 from abc import ABC, abstractmethod
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import Iterable, Optional, Set, Union
+from typing import Optional, Set, Union

 from docling_core.types.doc import BoundingBox, Size
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell
--- a/docling/backend/pypdfium2_backend.py
+++ b/docling/backend/pypdfium2_backend.py
@@ -1,8 +1,9 @@
 import logging
 import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+from typing import TYPE_CHECKING, List, Optional, Union

 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
@@ -29,7 +30,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
        self.valid = True  # No better way to tell from pypdfium.
        try:
            self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
-        except PdfiumError as e:
+        except PdfiumError:
            _log.info(
                f"An exception occurred when loading page {page_no} of document {document_hash}.",
                exc_info=True,
@@ -225,7 +226,6 @@ class PyPdfiumPageBackend(PdfPageBackend):
    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
    ) -> Image.Image:
-
        page_size = self.get_size()

        if not cropbox:
--- a/docling/backend/xml/jats_backend.py
+++ b/docling/backend/xml/jats_backend.py
@@ -102,13 +102,13 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):

            doc_info: etree.DocInfo = self.tree.docinfo
            if doc_info.system_url and any(
-                [kwd in doc_info.system_url for kwd in JATS_DTD_URL]
+                kwd in doc_info.system_url for kwd in JATS_DTD_URL
            ):
                self.valid = True
                return
            for ent in doc_info.internalDTD.iterentities():
                if ent.system_url and any(
-                    [kwd in ent.system_url for kwd in JATS_DTD_URL]
+                    kwd in ent.system_url for kwd in JATS_DTD_URL
                ):
                    self.valid = True
                    return
@@ -232,10 +232,9 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
                # TODO: once superscript is supported, add label with formatting
                aff = aff.removeprefix(f"{label[0].text}, ")
            affiliation_names.append(aff)
-        affiliation_ids_names = {
-            id: name
-            for id, name in zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
-        }
+        affiliation_ids_names = dict(
+            zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
+        )

        # Get author names and affiliation names
        for author_node in meta.xpath(
@@ -300,7 +299,6 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
    def _add_abstract(
        self, doc: DoclingDocument, xml_components: XMLComponents
    ) -> None:
-
        for abstract in xml_components["abstract"]:
            text: str = abstract["content"]
            title: str = abstract["label"] or DEFAULT_HEADER_ABSTRACT
@@ -349,7 +347,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):

        return

-    def _parse_element_citation(self, node: etree._Element) -> str:
+    def _parse_element_citation(self, node: etree._Element) -> str:  # noqa: C901
        citation: Citation = {
            "author_names": "",
            "title": "",
@@ -440,7 +438,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
            citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip()
            if len(node.xpath("lpage")) > 0:
                citation["page"] += (
-                    "–" + node.xpath("lpage")[0].text.replace("\n", " ").strip()
+                    "–" + node.xpath("lpage")[0].text.replace("\n", " ").strip()  # noqa: RUF001
                )

        # Flatten the citation to string
@@ -595,9 +593,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):

        try:
            self._add_table(doc, parent, table)
-        except Exception as e:
-            _log.warning(f"Skipping unsupported table in {str(self.file)}")
-            pass
+        except Exception:
+            _log.warning(f"Skipping unsupported table in {self.file!s}")

        return

@@ -609,7 +606,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
        )
        return

-    def _walk_linear(
+    def _walk_linear(  # noqa: C901
        self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
    ) -> str:
        skip_tags = ["term"]
--- a/docling/backend/xml/uspto_backend.py
+++ b/docling/backend/xml/uspto_backend.py
@@ -122,7 +122,6 @@ class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):

    @override
    def convert(self) -> DoclingDocument:
-
        if self.parser is not None:
            doc = self.parser.parse(self.patent_content)
            if doc is None:
@@ -163,7 +162,6 @@ class PatentUspto(ABC):
        Returns:
            The patent parsed as a docling document.
        """
-        pass


 class PatentUsptoIce(PatentUspto):
@@ -265,7 +263,7 @@ class PatentUsptoIce(PatentUspto):
            self.style_html = HtmlEntity()

        @override
-        def startElement(self, tag, attributes):  # noqa: N802
+        def startElement(self, tag, attributes):
            """Signal the start of an element.

            Args:
@@ -281,7 +279,7 @@ class PatentUsptoIce(PatentUspto):
            self._start_registered_elements(tag, attributes)

        @override
-        def skippedEntity(self, name):  # noqa: N802
+        def skippedEntity(self, name):
            """Receive notification of a skipped entity.

            HTML entities will be skipped by the parser. This method will unescape them
@@ -315,7 +313,7 @@ class PatentUsptoIce(PatentUspto):
                        self.text += unescaped

        @override
-        def endElement(self, tag):  # noqa: N802
+        def endElement(self, tag):
            """Signal the end of an element.

            Args:
@@ -603,7 +601,7 @@ class PatentUsptoGrantV2(PatentUspto):
            self.style_html = HtmlEntity()

        @override
-        def startElement(self, tag, attributes):  # noqa: N802
+        def startElement(self, tag, attributes):
            """Signal the start of an element.

            Args:
@@ -616,7 +614,7 @@ class PatentUsptoGrantV2(PatentUspto):
            self._start_registered_elements(tag, attributes)

        @override
-        def skippedEntity(self, name):  # noqa: N802
+        def skippedEntity(self, name):
            """Receive notification of a skipped entity.

            HTML entities will be skipped by the parser. This method will unescape them
@@ -650,7 +648,7 @@ class PatentUsptoGrantV2(PatentUspto):
                        self.text += unescaped

        @override
-        def endElement(self, tag):  # noqa: N802
+        def endElement(self, tag):
            """Signal the end of an element.

            Args:
@@ -691,7 +689,7 @@ class PatentUsptoGrantV2(PatentUspto):
            if tag in [member.value for member in self.Element]:
                if (
                    tag == self.Element.HEADING.value
-                    and not self.Element.SDOCL.value in self.property
+                    and self.Element.SDOCL.value not in self.property
                ):
                    level_attr: str = attributes.get("LVL", "")
                    new_level: int = int(level_attr) if level_attr.isnumeric() else 1
@@ -743,7 +741,7 @@ class PatentUsptoGrantV2(PatentUspto):
                # headers except claims statement
                elif (
                    self.Element.HEADING.value in self.property
-                    and not self.Element.SDOCL.value in self.property
+                    and self.Element.SDOCL.value not in self.property
                    and text.strip()
                ):
                    self.parents[self.level + 1] = self.doc.add_heading(
@@ -1164,7 +1162,7 @@ class PatentUsptoAppV1(PatentUspto):
            self.style_html = HtmlEntity()

        @override
-        def startElement(self, tag, attributes):  # noqa: N802
+        def startElement(self, tag, attributes):
            """Signal the start of an element.

            Args:
@@ -1177,7 +1175,7 @@ class PatentUsptoAppV1(PatentUspto):
            self._start_registered_elements(tag, attributes)

        @override
-        def skippedEntity(self, name):  # noqa: N802
+        def skippedEntity(self, name):
            """Receive notification of a skipped entity.

            HTML entities will be skipped by the parser. This method will unescape them
@@ -1211,7 +1209,7 @@ class PatentUsptoAppV1(PatentUspto):
                        self.text += unescaped

        @override
-        def endElement(self, tag):  # noqa: N802
+        def endElement(self, tag):
            """Signal the end of an element.

            Args:
@@ -1474,9 +1472,7 @@ class XmlTable:
                if cw == 0:
                    offset_w0.append(col["offset"][ic])

-            min_colinfo["offset"] = sorted(
-                list(set(col["offset"] + min_colinfo["offset"]))
-            )
+            min_colinfo["offset"] = sorted(set(col["offset"] + min_colinfo["offset"]))

        # add back the 0 width cols to offset list
        offset_w0 = list(set(offset_w0))
@@ -1527,7 +1523,7 @@ class XmlTable:

        return ncols_max

-    def _parse_table(self, table: Tag) -> TableData:
+    def _parse_table(self, table: Tag) -> TableData:  # noqa: C901
        """Parse the content of a table tag.

        Args:
@@ -1722,7 +1718,7 @@ class HtmlEntity:
                "0": "&#8304;",
                "+": "&#8314;",
                "-": "&#8315;",
-                "−": "&#8315;",
+                "−": "&#8315;",  # noqa: RUF001
                "=": "&#8316;",
                "(": "&#8317;",
                ")": "&#8318;",
@@ -1746,7 +1742,7 @@ class HtmlEntity:
                "0": "&#8320;",
                "+": "&#8330;",
                "-": "&#8331;",
-                "−": "&#8331;",
+                "−": "&#8331;",  # noqa: RUF001
                "=": "&#8332;",
                "(": "&#8333;",
                ")": "&#8334;",