ci: add coverage and ruff (#1383)

* add coverage calculation and push

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* new codecov version and usage of token

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* enable ruff formatter instead of black and isort

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* apply ruff lint fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* apply ruff unsafe fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add removed imports

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* first pass on linter issues

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* finalize linter fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* Update pyproject.toml

Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Michele Dolfi
2025-04-14 18:01:26 +02:00
committed by GitHub
parent 293c28ca7c
commit 5458a88464
104 changed files with 665 additions and 633 deletions

View File

@@ -34,7 +34,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
text_stream = self.path_or_stream.getvalue().decode("utf-8")
self.lines = text_stream.split("\n")
if isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, "r", encoding="utf-8") as f:
with open(self.path_or_stream, encoding="utf-8") as f:
self.lines = f.readlines()
self.valid = True
@@ -75,14 +75,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
return doc
def _parse(self, doc: DoclingDocument):
def _parse(self, doc: DoclingDocument): # noqa: C901
"""
Main function that orchestrates the parsing by yielding components:
title, section headers, text, lists, and tables.
"""
content = ""
in_list = False
in_table = False
@@ -95,7 +93,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
# indents: dict[int, Union[DocItem, GroupItem, None]] = {}
indents: dict[int, Union[GroupItem, None]] = {}
for i in range(0, 10):
for i in range(10):
parents[i] = None
indents[i] = None
@@ -125,7 +123,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
# Lists
elif self._is_list_item(line):
_log.debug(f"line: {line}")
item = self._parse_list_item(line)
_log.debug(f"parsed list-item: {item}")
@@ -147,7 +144,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
indents[level + 1] = item["indent"]
elif in_list and item["indent"] < indents[level]:
# print(item["indent"], " => ", indents[level])
while item["indent"] < indents[level]:
# print(item["indent"], " => ", indents[level])
@@ -176,7 +172,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
elif in_table and (
(not self._is_table_line(line)) or line.strip() == "|==="
): # end of table
caption = None
if len(caption_data) > 0:
caption = doc.add_text(
@@ -195,7 +190,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
# Picture
elif self._is_picture(line):
caption = None
if len(caption_data) > 0:
caption = doc.add_text(
@@ -250,7 +244,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
text_data = []
elif len(line.strip()) > 0: # allow multiline texts
item = self._parse_text(line)
text_data.append(item["text"])
@@ -273,14 +266,14 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
def _get_current_level(self, parents):
for k, v in parents.items():
if v == None and k > 0:
if v is None and k > 0:
return k - 1
return 0
def _get_current_parent(self, parents):
for k, v in parents.items():
if v == None and k > 0:
if v is None and k > 0:
return parents[k - 1]
return None
@@ -328,7 +321,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
"marker": marker,
"text": text.strip(),
"numbered": False,
"indent": 0 if indent == None else len(indent),
"indent": 0 if indent is None else len(indent),
}
else:
return {
@@ -336,7 +329,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
"marker": marker,
"text": text.strip(),
"numbered": True,
"indent": 0 if indent == None else len(indent),
"indent": 0 if indent is None else len(indent),
}
else:
# Fallback if no match
@@ -357,7 +350,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
return [cell.strip() for cell in line.split("|") if cell.strip()]
def _populate_table_as_grid(self, table_data):
num_rows = len(table_data)
# Adjust the table data into a grid format

View File

@@ -58,7 +58,7 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
head = self.content.readline()
dialect = csv.Sniffer().sniff(head, ",;\t|:")
_log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"')
if not dialect.delimiter in {",", ";", "\t", "|", ":"}:
if dialect.delimiter not in {",", ";", "\t", "|", ":"}:
raise RuntimeError(
f"Cannot convert csv with unknown delimiter {dialect.delimiter}."
)

View File

@@ -1,8 +1,9 @@
import logging
import random
from collections.abc import Iterable
from io import BytesIO
from pathlib import Path
from typing import Iterable, List, Optional, Union
from typing import List, Optional, Union
import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
@@ -156,7 +157,6 @@ class DoclingParsePageBackend(PdfPageBackend):
def get_page_image(
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
) -> Image.Image:
page_size = self.get_size()
if not cropbox:

View File

@@ -1,8 +1,9 @@
import logging
import random
from collections.abc import Iterable
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
from typing import TYPE_CHECKING, List, Optional, Union
import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -172,7 +173,6 @@ class DoclingParseV2PageBackend(PdfPageBackend):
def get_page_image(
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
) -> Image.Image:
page_size = self.get_size()
if not cropbox:

View File

@@ -1,14 +1,14 @@
import logging
import random
from collections.abc import Iterable
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
from typing import TYPE_CHECKING, Optional, Union
import pypdfium2 as pdfium
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
from PIL import Image, ImageDraw
from PIL import Image
from pypdfium2 import PdfPage
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
@@ -93,7 +93,6 @@ class DoclingParseV4PageBackend(PdfPageBackend):
def get_page_image(
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
) -> Image.Image:
page_size = self.get_size()
if not cropbox:

View File

@@ -1,12 +1,8 @@
# -*- coding: utf-8 -*-
"""
Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
On 23/01/2025
"""
from __future__ import unicode_literals
CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")
BLANK = ""
@@ -79,7 +75,6 @@ CHR_BO = {
}
T = {
"\u2192": "\\rightarrow ",
# Greek letters
"\U0001d6fc": "\\alpha ",
"\U0001d6fd": "\\beta ",

View File

@@ -76,8 +76,7 @@ def get_val(key, default=None, store=CHR):
return default
class Tag2Method(object):
class Tag2Method:
def call_method(self, elm, stag=None):
getmethod = self.tag2meth.get
if stag is None:
@@ -130,7 +129,6 @@ class Tag2Method(object):
class Pr(Tag2Method):
text = ""
__val_tags = ("chr", "pos", "begChr", "endChr", "type")
@@ -159,7 +157,7 @@ class Pr(Tag2Method):
def do_common(self, elm):
stag = elm.tag.replace(OMML_NS, "")
if stag in self.__val_tags:
t = elm.get("{0}val".format(OMML_NS))
t = elm.get(f"{OMML_NS}val")
self.__innerdict[stag] = t
return None
@@ -248,7 +246,6 @@ class oMath2Latex(Tag2Method):
"""
the Pre-Sub-Superscript object -- Not support yet
"""
pass
def do_sub(self, elm):
text = self.process_children(elm)
@@ -331,7 +328,7 @@ class oMath2Latex(Tag2Method):
t_dict = self.process_children_dict(elm, include=("e", "lim"))
latex_s = LIM_FUNC.get(t_dict["e"])
if not latex_s:
raise NotSupport("Not support lim %s" % t_dict["e"])
raise RuntimeError("Not support lim {}".format(t_dict["e"]))
else:
return latex_s.format(lim=t_dict.get("lim"))
@@ -413,7 +410,7 @@ class oMath2Latex(Tag2Method):
"""
_str = []
_base_str = []
found_text = elm.findtext("./{0}t".format(OMML_NS))
found_text = elm.findtext(f"./{OMML_NS}t")
if found_text:
for s in found_text:
out_latex_str = self.process_unicode(s)

View File

@@ -55,7 +55,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.max_levels = 10
self.level = 0
self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
for i in range(0, self.max_levels):
for i in range(self.max_levels):
self.parents[i] = None
try:
@@ -126,7 +126,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
return doc
def walk(self, tag: Tag, doc: DoclingDocument) -> None:
# Iterate over elements in the body of the document
text: str = ""
for element in tag.children:
@@ -135,7 +134,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.analyze_tag(cast(Tag, element), doc)
except Exception as exc_child:
_log.error(
f"Error processing child from tag {tag.name}: {repr(exc_child)}"
f"Error processing child from tag {tag.name}: {exc_child!r}"
)
raise exc_child
elif isinstance(element, NavigableString) and not isinstance(
@@ -147,7 +146,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
item for item in element.next_siblings if isinstance(item, Tag)
]
if element.next_sibling is None or any(
[item.name in TAGS_FOR_NODE_ITEMS for item in siblings]
item.name in TAGS_FOR_NODE_ITEMS for item in siblings
):
text = text.strip()
if text and tag.name in ["div"]:
@@ -222,7 +221,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
)
else:
if hlevel > self.level:
# add invisible group
for i in range(self.level + 1, hlevel):
self.parents[i] = doc.add_group(
@@ -234,7 +232,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.level = hlevel
elif hlevel < self.level:
# remove the tail
for key in self.parents.keys():
if key > hlevel:
@@ -360,7 +357,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
marker = ""
enumerated = False
if parent_label == GroupLabel.ORDERED_LIST:
marker = f"{str(index_in_list)}."
marker = f"{index_in_list!s}."
enumerated = True
doc.add_list_item(
text=text,

View File

@@ -83,7 +83,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# otherwise they represent emphasis (bold or italic)
self.markdown = self._shorten_underscore_sequences(text_stream)
if isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, "r", encoding="utf-8") as f:
with open(self.path_or_stream, encoding="utf-8") as f:
md_content = f.read()
# remove invalid sequences
# very long sequences of underscores will lead to unnecessary long processing times.
@@ -168,7 +168,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
)
self.inline_texts = []
def _iterate_elements(
def _iterate_elements( # noqa: C901
self,
element: marko.element.Element,
depth: int,
@@ -176,7 +176,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
visited: Set[marko.element.Element],
parent_item: Optional[NodeItem] = None,
):
if element in visited:
return
@@ -236,7 +235,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
if has_non_empty_list_items:
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
parent_item = doc.add_group(
label=label, name=f"list", parent=parent_item
label=label, name="list", parent=parent_item
)
elif (
@@ -320,7 +319,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self._html_blocks += 1
self._process_inline_text(parent_item, doc)
self._close_table(doc)
_log.debug("HTML Block: {}".format(element))
_log.debug(f"HTML Block: {element}")
if (
len(element.body) > 0
): # If Marko doesn't return any content for HTML block, skip it
@@ -332,7 +331,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
else:
if not isinstance(element, str):
self._close_table(doc)
_log.debug("Some other element: {}".format(element))
_log.debug(f"Some other element: {element}")
processed_block_types = (
marko.block.Heading,
@@ -398,7 +397,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# if HTML blocks were detected, export to HTML and delegate to HTML backend
if self._html_blocks > 0:
# export to HTML
html_backend_cls = HTMLDocumentBackend
html_str = doc.export_to_html()

View File

@@ -184,7 +184,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
"""
if self.workbook is not None:
# Iterate over all sheets
for sheet_name in self.workbook.sheetnames:
_log.info(f"Processing sheet: {sheet_name}")
@@ -253,7 +252,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
)
for excel_cell in excel_table.data:
cell = TableCell(
text=excel_cell.text,
row_span=excel_cell.row_span,
@@ -303,7 +301,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
# Iterate over all cells in the sheet
for ri, row in enumerate(sheet.iter_rows(values_only=False)):
for rj, cell in enumerate(row):
# Skip empty or already visited cells
if cell.value is None or (ri, rj) in visited:
continue
@@ -342,7 +339,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
visited_cells: set[tuple[int, int]] = set()
for ri in range(start_row, max_row + 1):
for rj in range(start_col, max_col + 1):
cell = sheet.cell(row=ri + 1, column=rj + 1) # 1-based indexing
# Check if the cell belongs to a merged range
@@ -350,14 +346,12 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
col_span = 1
for merged_range in sheet.merged_cells.ranges:
if (
merged_range.min_row <= ri + 1
and ri + 1 <= merged_range.max_row
and merged_range.min_col <= rj + 1
and rj + 1 <= merged_range.max_col
):
row_span = merged_range.max_row - merged_range.min_row + 1
col_span = merged_range.max_col - merged_range.min_col + 1
break
@@ -499,7 +493,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
),
),
)
except:
except Exception:
_log.error("could not extract the image from excel sheets")
return doc

View File

@@ -120,13 +120,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
return prov
def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size): # noqa: C901
is_a_list = False
is_list_group_created = False
enum_list_item_value = 0
new_list = None
bullet_type = "None"
list_text = ""
list_label = GroupLabel.LIST
doc_label = DocItemLabel.LIST_ITEM
prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
@@ -243,7 +242,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
enum_marker = str(enum_list_item_value) + "."
if not is_list_group_created:
new_list = doc.add_group(
label=list_label, name=f"list", parent=parent_slide
label=list_label, name="list", parent=parent_slide
)
is_list_group_created = True
doc.add_list_item(
@@ -368,11 +367,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
slide_width = pptx_obj.slide_width
slide_height = pptx_obj.slide_height
text_content = [] # type: ignore
max_levels = 10
parents = {} # type: ignore
for i in range(0, max_levels):
for i in range(max_levels):
parents[i] = None
# Loop through each slide
@@ -383,7 +380,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
)
slide_size = Size(width=slide_width, height=slide_height)
parent_page = doc.add_page(page_no=slide_ind + 1, size=slide_size)
doc.add_page(page_no=slide_ind + 1, size=slide_size)
def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):
handle_groups(shape, parent_slide, slide_ind, doc, slide_size)

View File

@@ -158,7 +158,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
def _get_level(self) -> int:
"""Return the first None index."""
for k, v in self.parents.items():
if k >= 0 and v == None:
if k >= 0 and v is None:
return k
return 0
@@ -418,7 +418,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
else prev_parent
)
def _handle_text_elements(
def _handle_text_elements( # noqa: C901
self,
element: BaseOxmlElement,
docx_obj: DocxDocument,
@@ -812,7 +812,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
)
if cell is None or cell._tc in cell_set:
_log.debug(f" skipped since repeated content")
_log.debug(" skipped since repeated content")
col_idx += cell.grid_span
continue
else:
@@ -879,7 +879,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
image=ImageRef.from_pil(image=pil_image, dpi=72),
caption=None,
)
except (UnidentifiedImageError, OSError) as e:
except (UnidentifiedImageError, OSError):
_log.warning("Warning: image cannot be loaded by Pillow")
doc.add_picture(
parent=self.parents[level - 1],

View File

@@ -1,7 +1,8 @@
from abc import ABC, abstractmethod
from collections.abc import Iterable
from io import BytesIO
from pathlib import Path
from typing import Iterable, Optional, Set, Union
from typing import Optional, Set, Union
from docling_core.types.doc import BoundingBox, Size
from docling_core.types.doc.page import SegmentedPdfPage, TextCell

View File

@@ -1,8 +1,9 @@
import logging
import random
from collections.abc import Iterable
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
from typing import TYPE_CHECKING, List, Optional, Union
import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c
@@ -29,7 +30,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
self.valid = True # No better way to tell from pypdfium.
try:
self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
except PdfiumError as e:
except PdfiumError:
_log.info(
f"An exception occurred when loading page {page_no} of document {document_hash}.",
exc_info=True,
@@ -225,7 +226,6 @@ class PyPdfiumPageBackend(PdfPageBackend):
def get_page_image(
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
) -> Image.Image:
page_size = self.get_size()
if not cropbox:

View File

@@ -102,13 +102,13 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
doc_info: etree.DocInfo = self.tree.docinfo
if doc_info.system_url and any(
[kwd in doc_info.system_url for kwd in JATS_DTD_URL]
kwd in doc_info.system_url for kwd in JATS_DTD_URL
):
self.valid = True
return
for ent in doc_info.internalDTD.iterentities():
if ent.system_url and any(
[kwd in ent.system_url for kwd in JATS_DTD_URL]
kwd in ent.system_url for kwd in JATS_DTD_URL
):
self.valid = True
return
@@ -232,10 +232,9 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
# TODO: once superscript is supported, add label with formatting
aff = aff.removeprefix(f"{label[0].text}, ")
affiliation_names.append(aff)
affiliation_ids_names = {
id: name
for id, name in zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
}
affiliation_ids_names = dict(
zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
)
# Get author names and affiliation names
for author_node in meta.xpath(
@@ -300,7 +299,6 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
def _add_abstract(
self, doc: DoclingDocument, xml_components: XMLComponents
) -> None:
for abstract in xml_components["abstract"]:
text: str = abstract["content"]
title: str = abstract["label"] or DEFAULT_HEADER_ABSTRACT
@@ -349,7 +347,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
return
def _parse_element_citation(self, node: etree._Element) -> str:
def _parse_element_citation(self, node: etree._Element) -> str: # noqa: C901
citation: Citation = {
"author_names": "",
"title": "",
@@ -440,7 +438,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip()
if len(node.xpath("lpage")) > 0:
citation["page"] += (
"–" + node.xpath("lpage")[0].text.replace("\n", " ").strip()
"–" + node.xpath("lpage")[0].text.replace("\n", " ").strip() # noqa: RUF001
)
# Flatten the citation to string
@@ -595,9 +593,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
try:
self._add_table(doc, parent, table)
except Exception as e:
_log.warning(f"Skipping unsupported table in {str(self.file)}")
pass
except Exception:
_log.warning(f"Skipping unsupported table in {self.file!s}")
return
@@ -609,7 +606,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
)
return
def _walk_linear(
def _walk_linear( # noqa: C901
self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
) -> str:
skip_tags = ["term"]

View File

@@ -122,7 +122,6 @@ class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):
@override
def convert(self) -> DoclingDocument:
if self.parser is not None:
doc = self.parser.parse(self.patent_content)
if doc is None:
@@ -163,7 +162,6 @@ class PatentUspto(ABC):
Returns:
The patent parsed as a docling document.
"""
pass
class PatentUsptoIce(PatentUspto):
@@ -265,7 +263,7 @@ class PatentUsptoIce(PatentUspto):
self.style_html = HtmlEntity()
@override
def startElement(self, tag, attributes): # noqa: N802
def startElement(self, tag, attributes):
"""Signal the start of an element.
Args:
@@ -281,7 +279,7 @@ class PatentUsptoIce(PatentUspto):
self._start_registered_elements(tag, attributes)
@override
def skippedEntity(self, name): # noqa: N802
def skippedEntity(self, name):
"""Receive notification of a skipped entity.
HTML entities will be skipped by the parser. This method will unescape them
@@ -315,7 +313,7 @@ class PatentUsptoIce(PatentUspto):
self.text += unescaped
@override
def endElement(self, tag): # noqa: N802
def endElement(self, tag):
"""Signal the end of an element.
Args:
@@ -603,7 +601,7 @@ class PatentUsptoGrantV2(PatentUspto):
self.style_html = HtmlEntity()
@override
def startElement(self, tag, attributes): # noqa: N802
def startElement(self, tag, attributes):
"""Signal the start of an element.
Args:
@@ -616,7 +614,7 @@ class PatentUsptoGrantV2(PatentUspto):
self._start_registered_elements(tag, attributes)
@override
def skippedEntity(self, name): # noqa: N802
def skippedEntity(self, name):
"""Receive notification of a skipped entity.
HTML entities will be skipped by the parser. This method will unescape them
@@ -650,7 +648,7 @@ class PatentUsptoGrantV2(PatentUspto):
self.text += unescaped
@override
def endElement(self, tag): # noqa: N802
def endElement(self, tag):
"""Signal the end of an element.
Args:
@@ -691,7 +689,7 @@ class PatentUsptoGrantV2(PatentUspto):
if tag in [member.value for member in self.Element]:
if (
tag == self.Element.HEADING.value
and not self.Element.SDOCL.value in self.property
and self.Element.SDOCL.value not in self.property
):
level_attr: str = attributes.get("LVL", "")
new_level: int = int(level_attr) if level_attr.isnumeric() else 1
@@ -743,7 +741,7 @@ class PatentUsptoGrantV2(PatentUspto):
# headers except claims statement
elif (
self.Element.HEADING.value in self.property
and not self.Element.SDOCL.value in self.property
and self.Element.SDOCL.value not in self.property
and text.strip()
):
self.parents[self.level + 1] = self.doc.add_heading(
@@ -1164,7 +1162,7 @@ class PatentUsptoAppV1(PatentUspto):
self.style_html = HtmlEntity()
@override
def startElement(self, tag, attributes): # noqa: N802
def startElement(self, tag, attributes):
"""Signal the start of an element.
Args:
@@ -1177,7 +1175,7 @@ class PatentUsptoAppV1(PatentUspto):
self._start_registered_elements(tag, attributes)
@override
def skippedEntity(self, name): # noqa: N802
def skippedEntity(self, name):
"""Receive notification of a skipped entity.
HTML entities will be skipped by the parser. This method will unescape them
@@ -1211,7 +1209,7 @@ class PatentUsptoAppV1(PatentUspto):
self.text += unescaped
@override
def endElement(self, tag): # noqa: N802
def endElement(self, tag):
"""Signal the end of an element.
Args:
@@ -1474,9 +1472,7 @@ class XmlTable:
if cw == 0:
offset_w0.append(col["offset"][ic])
min_colinfo["offset"] = sorted(
list(set(col["offset"] + min_colinfo["offset"]))
)
min_colinfo["offset"] = sorted(set(col["offset"] + min_colinfo["offset"]))
# add back the 0 width cols to offset list
offset_w0 = list(set(offset_w0))
@@ -1527,7 +1523,7 @@ class XmlTable:
return ncols_max
def _parse_table(self, table: Tag) -> TableData:
def _parse_table(self, table: Tag) -> TableData: # noqa: C901
"""Parse the content of a table tag.
Args:
@@ -1722,7 +1718,7 @@ class HtmlEntity:
"0": "&#8304;",
"+": "&#8314;",
"-": "&#8315;",
"−": "&#8315;",
"−": "&#8315;", # noqa: RUF001
"=": "&#8316;",
"(": "&#8317;",
")": "&#8318;",
@@ -1746,7 +1742,7 @@ class HtmlEntity:
"0": "&#8320;",
"+": "&#8330;",
"-": "&#8331;",
"−": "&#8331;",
"−": "&#8331;", # noqa: RUF001
"=": "&#8332;",
"(": "&#8333;",
")": "&#8334;",