ci: add coverage and ruff (#1383)
* add coverage calculation and push

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* new codecov version and usage of token

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* enable ruff formatter instead of black and isort

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* apply ruff lint fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* apply ruff unsafe fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add removed imports

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* runs 1 on linter issues

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* finalize linter fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* Update pyproject.toml

Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
@@ -34,7 +34,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
text_stream = self.path_or_stream.getvalue().decode("utf-8")
|
||||
self.lines = text_stream.split("\n")
|
||||
if isinstance(self.path_or_stream, Path):
|
||||
with open(self.path_or_stream, "r", encoding="utf-8") as f:
|
||||
with open(self.path_or_stream, encoding="utf-8") as f:
|
||||
self.lines = f.readlines()
|
||||
self.valid = True
|
||||
|
||||
@@ -75,14 +75,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
|
||||
return doc
|
||||
|
||||
def _parse(self, doc: DoclingDocument):
|
||||
def _parse(self, doc: DoclingDocument): # noqa: C901
|
||||
"""
|
||||
Main function that orchestrates the parsing by yielding components:
|
||||
title, section headers, text, lists, and tables.
|
||||
"""
|
||||
|
||||
content = ""
|
||||
|
||||
in_list = False
|
||||
in_table = False
|
||||
|
||||
@@ -95,7 +93,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
# indents: dict[int, Union[DocItem, GroupItem, None]] = {}
|
||||
indents: dict[int, Union[GroupItem, None]] = {}
|
||||
|
||||
for i in range(0, 10):
|
||||
for i in range(10):
|
||||
parents[i] = None
|
||||
indents[i] = None
|
||||
|
||||
@@ -125,7 +123,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
|
||||
# Lists
|
||||
elif self._is_list_item(line):
|
||||
|
||||
_log.debug(f"line: {line}")
|
||||
item = self._parse_list_item(line)
|
||||
_log.debug(f"parsed list-item: {item}")
|
||||
@@ -147,7 +144,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
indents[level + 1] = item["indent"]
|
||||
|
||||
elif in_list and item["indent"] < indents[level]:
|
||||
|
||||
# print(item["indent"], " => ", indents[level])
|
||||
while item["indent"] < indents[level]:
|
||||
# print(item["indent"], " => ", indents[level])
|
||||
@@ -176,7 +172,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
elif in_table and (
|
||||
(not self._is_table_line(line)) or line.strip() == "|==="
|
||||
): # end of table
|
||||
|
||||
caption = None
|
||||
if len(caption_data) > 0:
|
||||
caption = doc.add_text(
|
||||
@@ -195,7 +190,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
|
||||
# Picture
|
||||
elif self._is_picture(line):
|
||||
|
||||
caption = None
|
||||
if len(caption_data) > 0:
|
||||
caption = doc.add_text(
|
||||
@@ -250,7 +244,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
text_data = []
|
||||
|
||||
elif len(line.strip()) > 0: # allow multiline texts
|
||||
|
||||
item = self._parse_text(line)
|
||||
text_data.append(item["text"])
|
||||
|
||||
@@ -273,14 +266,14 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
|
||||
def _get_current_level(self, parents):
|
||||
for k, v in parents.items():
|
||||
if v == None and k > 0:
|
||||
if v is None and k > 0:
|
||||
return k - 1
|
||||
|
||||
return 0
|
||||
|
||||
def _get_current_parent(self, parents):
|
||||
for k, v in parents.items():
|
||||
if v == None and k > 0:
|
||||
if v is None and k > 0:
|
||||
return parents[k - 1]
|
||||
|
||||
return None
|
||||
@@ -328,7 +321,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
"marker": marker,
|
||||
"text": text.strip(),
|
||||
"numbered": False,
|
||||
"indent": 0 if indent == None else len(indent),
|
||||
"indent": 0 if indent is None else len(indent),
|
||||
}
|
||||
else:
|
||||
return {
|
||||
@@ -336,7 +329,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
"marker": marker,
|
||||
"text": text.strip(),
|
||||
"numbered": True,
|
||||
"indent": 0 if indent == None else len(indent),
|
||||
"indent": 0 if indent is None else len(indent),
|
||||
}
|
||||
else:
|
||||
# Fallback if no match
|
||||
@@ -357,7 +350,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
return [cell.strip() for cell in line.split("|") if cell.strip()]
|
||||
|
||||
def _populate_table_as_grid(self, table_data):
|
||||
|
||||
num_rows = len(table_data)
|
||||
|
||||
# Adjust the table data into a grid format
|
||||
|
||||
@@ -58,7 +58,7 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
|
||||
head = self.content.readline()
|
||||
dialect = csv.Sniffer().sniff(head, ",;\t|:")
|
||||
_log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"')
|
||||
if not dialect.delimiter in {",", ";", "\t", "|", ":"}:
|
||||
if dialect.delimiter not in {",", ";", "\t", "|", ":"}:
|
||||
raise RuntimeError(
|
||||
f"Cannot convert csv with unknown delimiter {dialect.delimiter}."
|
||||
)
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
import logging
|
||||
import random
|
||||
from collections.abc import Iterable
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List, Optional, Union
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import pypdfium2 as pdfium
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
||||
@@ -156,7 +157,6 @@ class DoclingParsePageBackend(PdfPageBackend):
|
||||
def get_page_image(
|
||||
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
||||
) -> Image.Image:
|
||||
|
||||
page_size = self.get_size()
|
||||
|
||||
if not cropbox:
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
import logging
|
||||
import random
|
||||
from collections.abc import Iterable
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
||||
from typing import TYPE_CHECKING, List, Optional, Union
|
||||
|
||||
import pypdfium2 as pdfium
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
@@ -172,7 +173,6 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
||||
def get_page_image(
|
||||
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
||||
) -> Image.Image:
|
||||
|
||||
page_size = self.get_size()
|
||||
|
||||
if not cropbox:
|
||||
|
||||
@@ -1,14 +1,14 @@
|
||||
import logging
|
||||
import random
|
||||
from collections.abc import Iterable
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
||||
from typing import TYPE_CHECKING, Optional, Union
|
||||
|
||||
import pypdfium2 as pdfium
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
||||
from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
|
||||
from PIL import Image, ImageDraw
|
||||
from PIL import Image
|
||||
from pypdfium2 import PdfPage
|
||||
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||
@@ -93,7 +93,6 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
||||
def get_page_image(
|
||||
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
||||
) -> Image.Image:
|
||||
|
||||
page_size = self.get_size()
|
||||
|
||||
if not cropbox:
|
||||
|
||||
@@ -1,12 +1,8 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
|
||||
On 23/01/2025
|
||||
"""
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")
|
||||
|
||||
BLANK = ""
|
||||
@@ -79,7 +75,6 @@ CHR_BO = {
|
||||
}
|
||||
|
||||
T = {
|
||||
"\u2192": "\\rightarrow ",
|
||||
# Greek letters
|
||||
"\U0001d6fc": "\\alpha ",
|
||||
"\U0001d6fd": "\\beta ",
|
||||
|
||||
@@ -76,8 +76,7 @@ def get_val(key, default=None, store=CHR):
|
||||
return default
|
||||
|
||||
|
||||
class Tag2Method(object):
|
||||
|
||||
class Tag2Method:
|
||||
def call_method(self, elm, stag=None):
|
||||
getmethod = self.tag2meth.get
|
||||
if stag is None:
|
||||
@@ -130,7 +129,6 @@ class Tag2Method(object):
|
||||
|
||||
|
||||
class Pr(Tag2Method):
|
||||
|
||||
text = ""
|
||||
|
||||
__val_tags = ("chr", "pos", "begChr", "endChr", "type")
|
||||
@@ -159,7 +157,7 @@ class Pr(Tag2Method):
|
||||
def do_common(self, elm):
|
||||
stag = elm.tag.replace(OMML_NS, "")
|
||||
if stag in self.__val_tags:
|
||||
t = elm.get("{0}val".format(OMML_NS))
|
||||
t = elm.get(f"{OMML_NS}val")
|
||||
self.__innerdict[stag] = t
|
||||
return None
|
||||
|
||||
@@ -248,7 +246,6 @@ class oMath2Latex(Tag2Method):
|
||||
"""
|
||||
the Pre-Sub-Superscript object -- Not support yet
|
||||
"""
|
||||
pass
|
||||
|
||||
def do_sub(self, elm):
|
||||
text = self.process_children(elm)
|
||||
@@ -331,7 +328,7 @@ class oMath2Latex(Tag2Method):
|
||||
t_dict = self.process_children_dict(elm, include=("e", "lim"))
|
||||
latex_s = LIM_FUNC.get(t_dict["e"])
|
||||
if not latex_s:
|
||||
raise NotSupport("Not support lim %s" % t_dict["e"])
|
||||
raise RuntimeError("Not support lim {}".format(t_dict["e"]))
|
||||
else:
|
||||
return latex_s.format(lim=t_dict.get("lim"))
|
||||
|
||||
@@ -413,7 +410,7 @@ class oMath2Latex(Tag2Method):
|
||||
"""
|
||||
_str = []
|
||||
_base_str = []
|
||||
found_text = elm.findtext("./{0}t".format(OMML_NS))
|
||||
found_text = elm.findtext(f"./{OMML_NS}t")
|
||||
if found_text:
|
||||
for s in found_text:
|
||||
out_latex_str = self.process_unicode(s)
|
||||
|
||||
@@ -55,7 +55,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.max_levels = 10
|
||||
self.level = 0
|
||||
self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
|
||||
for i in range(0, self.max_levels):
|
||||
for i in range(self.max_levels):
|
||||
self.parents[i] = None
|
||||
|
||||
try:
|
||||
@@ -126,7 +126,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
return doc
|
||||
|
||||
def walk(self, tag: Tag, doc: DoclingDocument) -> None:
|
||||
|
||||
# Iterate over elements in the body of the document
|
||||
text: str = ""
|
||||
for element in tag.children:
|
||||
@@ -135,7 +134,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.analyze_tag(cast(Tag, element), doc)
|
||||
except Exception as exc_child:
|
||||
_log.error(
|
||||
f"Error processing child from tag {tag.name}: {repr(exc_child)}"
|
||||
f"Error processing child from tag {tag.name}: {exc_child!r}"
|
||||
)
|
||||
raise exc_child
|
||||
elif isinstance(element, NavigableString) and not isinstance(
|
||||
@@ -147,7 +146,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
item for item in element.next_siblings if isinstance(item, Tag)
|
||||
]
|
||||
if element.next_sibling is None or any(
|
||||
[item.name in TAGS_FOR_NODE_ITEMS for item in siblings]
|
||||
item.name in TAGS_FOR_NODE_ITEMS for item in siblings
|
||||
):
|
||||
text = text.strip()
|
||||
if text and tag.name in ["div"]:
|
||||
@@ -222,7 +221,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
else:
|
||||
if hlevel > self.level:
|
||||
|
||||
# add invisible group
|
||||
for i in range(self.level + 1, hlevel):
|
||||
self.parents[i] = doc.add_group(
|
||||
@@ -234,7 +232,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.level = hlevel
|
||||
|
||||
elif hlevel < self.level:
|
||||
|
||||
# remove the tail
|
||||
for key in self.parents.keys():
|
||||
if key > hlevel:
|
||||
@@ -360,7 +357,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
marker = ""
|
||||
enumerated = False
|
||||
if parent_label == GroupLabel.ORDERED_LIST:
|
||||
marker = f"{str(index_in_list)}."
|
||||
marker = f"{index_in_list!s}."
|
||||
enumerated = True
|
||||
doc.add_list_item(
|
||||
text=text,
|
||||
|
||||
@@ -83,7 +83,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
# otherwise they represent emphasis (bold or italic)
|
||||
self.markdown = self._shorten_underscore_sequences(text_stream)
|
||||
if isinstance(self.path_or_stream, Path):
|
||||
with open(self.path_or_stream, "r", encoding="utf-8") as f:
|
||||
with open(self.path_or_stream, encoding="utf-8") as f:
|
||||
md_content = f.read()
|
||||
# remove invalid sequences
|
||||
# very long sequences of underscores will lead to unnecessary long processing times.
|
||||
@@ -168,7 +168,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
self.inline_texts = []
|
||||
|
||||
def _iterate_elements(
|
||||
def _iterate_elements( # noqa: C901
|
||||
self,
|
||||
element: marko.element.Element,
|
||||
depth: int,
|
||||
@@ -176,7 +176,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
visited: Set[marko.element.Element],
|
||||
parent_item: Optional[NodeItem] = None,
|
||||
):
|
||||
|
||||
if element in visited:
|
||||
return
|
||||
|
||||
@@ -236,7 +235,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
if has_non_empty_list_items:
|
||||
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
|
||||
parent_item = doc.add_group(
|
||||
label=label, name=f"list", parent=parent_item
|
||||
label=label, name="list", parent=parent_item
|
||||
)
|
||||
|
||||
elif (
|
||||
@@ -320,7 +319,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
self._html_blocks += 1
|
||||
self._process_inline_text(parent_item, doc)
|
||||
self._close_table(doc)
|
||||
_log.debug("HTML Block: {}".format(element))
|
||||
_log.debug(f"HTML Block: {element}")
|
||||
if (
|
||||
len(element.body) > 0
|
||||
): # If Marko doesn't return any content for HTML block, skip it
|
||||
@@ -332,7 +331,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
else:
|
||||
if not isinstance(element, str):
|
||||
self._close_table(doc)
|
||||
_log.debug("Some other element: {}".format(element))
|
||||
_log.debug(f"Some other element: {element}")
|
||||
|
||||
processed_block_types = (
|
||||
marko.block.Heading,
|
||||
@@ -398,7 +397,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
# if HTML blocks were detected, export to HTML and delegate to HTML backend
|
||||
if self._html_blocks > 0:
|
||||
|
||||
# export to HTML
|
||||
html_backend_cls = HTMLDocumentBackend
|
||||
html_str = doc.export_to_html()
|
||||
|
||||
@@ -184,7 +184,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
||||
"""
|
||||
|
||||
if self.workbook is not None:
|
||||
|
||||
# Iterate over all sheets
|
||||
for sheet_name in self.workbook.sheetnames:
|
||||
_log.info(f"Processing sheet: {sheet_name}")
|
||||
@@ -253,7 +252,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
||||
)
|
||||
|
||||
for excel_cell in excel_table.data:
|
||||
|
||||
cell = TableCell(
|
||||
text=excel_cell.text,
|
||||
row_span=excel_cell.row_span,
|
||||
@@ -303,7 +301,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
||||
# Iterate over all cells in the sheet
|
||||
for ri, row in enumerate(sheet.iter_rows(values_only=False)):
|
||||
for rj, cell in enumerate(row):
|
||||
|
||||
# Skip empty or already visited cells
|
||||
if cell.value is None or (ri, rj) in visited:
|
||||
continue
|
||||
@@ -342,7 +339,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
||||
visited_cells: set[tuple[int, int]] = set()
|
||||
for ri in range(start_row, max_row + 1):
|
||||
for rj in range(start_col, max_col + 1):
|
||||
|
||||
cell = sheet.cell(row=ri + 1, column=rj + 1) # 1-based indexing
|
||||
|
||||
# Check if the cell belongs to a merged range
|
||||
@@ -350,14 +346,12 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
||||
col_span = 1
|
||||
|
||||
for merged_range in sheet.merged_cells.ranges:
|
||||
|
||||
if (
|
||||
merged_range.min_row <= ri + 1
|
||||
and ri + 1 <= merged_range.max_row
|
||||
and merged_range.min_col <= rj + 1
|
||||
and rj + 1 <= merged_range.max_col
|
||||
):
|
||||
|
||||
row_span = merged_range.max_row - merged_range.min_row + 1
|
||||
col_span = merged_range.max_col - merged_range.min_col + 1
|
||||
break
|
||||
@@ -499,7 +493,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
||||
),
|
||||
),
|
||||
)
|
||||
except:
|
||||
except Exception:
|
||||
_log.error("could not extract the image from excel sheets")
|
||||
|
||||
return doc
|
||||
|
||||
@@ -120,13 +120,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
|
||||
return prov
|
||||
|
||||
def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
|
||||
def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size): # noqa: C901
|
||||
is_a_list = False
|
||||
is_list_group_created = False
|
||||
enum_list_item_value = 0
|
||||
new_list = None
|
||||
bullet_type = "None"
|
||||
list_text = ""
|
||||
list_label = GroupLabel.LIST
|
||||
doc_label = DocItemLabel.LIST_ITEM
|
||||
prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
|
||||
@@ -243,7 +242,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
enum_marker = str(enum_list_item_value) + "."
|
||||
if not is_list_group_created:
|
||||
new_list = doc.add_group(
|
||||
label=list_label, name=f"list", parent=parent_slide
|
||||
label=list_label, name="list", parent=parent_slide
|
||||
)
|
||||
is_list_group_created = True
|
||||
doc.add_list_item(
|
||||
@@ -368,11 +367,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
slide_width = pptx_obj.slide_width
|
||||
slide_height = pptx_obj.slide_height
|
||||
|
||||
text_content = [] # type: ignore
|
||||
|
||||
max_levels = 10
|
||||
parents = {} # type: ignore
|
||||
for i in range(0, max_levels):
|
||||
for i in range(max_levels):
|
||||
parents[i] = None
|
||||
|
||||
# Loop through each slide
|
||||
@@ -383,7 +380,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
)
|
||||
|
||||
slide_size = Size(width=slide_width, height=slide_height)
|
||||
parent_page = doc.add_page(page_no=slide_ind + 1, size=slide_size)
|
||||
doc.add_page(page_no=slide_ind + 1, size=slide_size)
|
||||
|
||||
def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):
|
||||
handle_groups(shape, parent_slide, slide_ind, doc, slide_size)
|
||||
|
||||
@@ -158,7 +158,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
def _get_level(self) -> int:
|
||||
"""Return the first None index."""
|
||||
for k, v in self.parents.items():
|
||||
if k >= 0 and v == None:
|
||||
if k >= 0 and v is None:
|
||||
return k
|
||||
return 0
|
||||
|
||||
@@ -418,7 +418,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
else prev_parent
|
||||
)
|
||||
|
||||
def _handle_text_elements(
|
||||
def _handle_text_elements( # noqa: C901
|
||||
self,
|
||||
element: BaseOxmlElement,
|
||||
docx_obj: DocxDocument,
|
||||
@@ -812,7 +812,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
|
||||
)
|
||||
if cell is None or cell._tc in cell_set:
|
||||
_log.debug(f" skipped since repeated content")
|
||||
_log.debug(" skipped since repeated content")
|
||||
col_idx += cell.grid_span
|
||||
continue
|
||||
else:
|
||||
@@ -879,7 +879,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
||||
caption=None,
|
||||
)
|
||||
except (UnidentifiedImageError, OSError) as e:
|
||||
except (UnidentifiedImageError, OSError):
|
||||
_log.warning("Warning: image cannot be loaded by Pillow")
|
||||
doc.add_picture(
|
||||
parent=self.parents[level - 1],
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from collections.abc import Iterable
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Optional, Set, Union
|
||||
from typing import Optional, Set, Union
|
||||
|
||||
from docling_core.types.doc import BoundingBox, Size
|
||||
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
import logging
|
||||
import random
|
||||
from collections.abc import Iterable
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
||||
from typing import TYPE_CHECKING, List, Optional, Union
|
||||
|
||||
import pypdfium2 as pdfium
|
||||
import pypdfium2.raw as pdfium_c
|
||||
@@ -29,7 +30,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
||||
self.valid = True # No better way to tell from pypdfium.
|
||||
try:
|
||||
self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
|
||||
except PdfiumError as e:
|
||||
except PdfiumError:
|
||||
_log.info(
|
||||
f"An exception occurred when loading page {page_no} of document {document_hash}.",
|
||||
exc_info=True,
|
||||
@@ -225,7 +226,6 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
||||
def get_page_image(
|
||||
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
||||
) -> Image.Image:
|
||||
|
||||
page_size = self.get_size()
|
||||
|
||||
if not cropbox:
|
||||
|
||||
@@ -102,13 +102,13 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
doc_info: etree.DocInfo = self.tree.docinfo
|
||||
if doc_info.system_url and any(
|
||||
[kwd in doc_info.system_url for kwd in JATS_DTD_URL]
|
||||
kwd in doc_info.system_url for kwd in JATS_DTD_URL
|
||||
):
|
||||
self.valid = True
|
||||
return
|
||||
for ent in doc_info.internalDTD.iterentities():
|
||||
if ent.system_url and any(
|
||||
[kwd in ent.system_url for kwd in JATS_DTD_URL]
|
||||
kwd in ent.system_url for kwd in JATS_DTD_URL
|
||||
):
|
||||
self.valid = True
|
||||
return
|
||||
@@ -232,10 +232,9 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
||||
# TODO: once superscript is supported, add label with formatting
|
||||
aff = aff.removeprefix(f"{label[0].text}, ")
|
||||
affiliation_names.append(aff)
|
||||
affiliation_ids_names = {
|
||||
id: name
|
||||
for id, name in zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
|
||||
}
|
||||
affiliation_ids_names = dict(
|
||||
zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
|
||||
)
|
||||
|
||||
# Get author names and affiliation names
|
||||
for author_node in meta.xpath(
|
||||
@@ -300,7 +299,6 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
||||
def _add_abstract(
|
||||
self, doc: DoclingDocument, xml_components: XMLComponents
|
||||
) -> None:
|
||||
|
||||
for abstract in xml_components["abstract"]:
|
||||
text: str = abstract["content"]
|
||||
title: str = abstract["label"] or DEFAULT_HEADER_ABSTRACT
|
||||
@@ -349,7 +347,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
return
|
||||
|
||||
def _parse_element_citation(self, node: etree._Element) -> str:
|
||||
def _parse_element_citation(self, node: etree._Element) -> str: # noqa: C901
|
||||
citation: Citation = {
|
||||
"author_names": "",
|
||||
"title": "",
|
||||
@@ -440,7 +438,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
||||
citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip()
|
||||
if len(node.xpath("lpage")) > 0:
|
||||
citation["page"] += (
|
||||
"–" + node.xpath("lpage")[0].text.replace("\n", " ").strip()
|
||||
"–" + node.xpath("lpage")[0].text.replace("\n", " ").strip() # noqa: RUF001
|
||||
)
|
||||
|
||||
# Flatten the citation to string
|
||||
@@ -595,9 +593,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
try:
|
||||
self._add_table(doc, parent, table)
|
||||
except Exception as e:
|
||||
_log.warning(f"Skipping unsupported table in {str(self.file)}")
|
||||
pass
|
||||
except Exception:
|
||||
_log.warning(f"Skipping unsupported table in {self.file!s}")
|
||||
|
||||
return
|
||||
|
||||
@@ -609,7 +606,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
return
|
||||
|
||||
def _walk_linear(
|
||||
def _walk_linear( # noqa: C901
|
||||
self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
|
||||
) -> str:
|
||||
skip_tags = ["term"]
|
||||
|
||||
@@ -122,7 +122,6 @@ class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
@override
|
||||
def convert(self) -> DoclingDocument:
|
||||
|
||||
if self.parser is not None:
|
||||
doc = self.parser.parse(self.patent_content)
|
||||
if doc is None:
|
||||
@@ -163,7 +162,6 @@ class PatentUspto(ABC):
|
||||
Returns:
|
||||
The patent parsed as a docling document.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class PatentUsptoIce(PatentUspto):
|
||||
@@ -265,7 +263,7 @@ class PatentUsptoIce(PatentUspto):
|
||||
self.style_html = HtmlEntity()
|
||||
|
||||
@override
|
||||
def startElement(self, tag, attributes): # noqa: N802
|
||||
def startElement(self, tag, attributes):
|
||||
"""Signal the start of an element.
|
||||
|
||||
Args:
|
||||
@@ -281,7 +279,7 @@ class PatentUsptoIce(PatentUspto):
|
||||
self._start_registered_elements(tag, attributes)
|
||||
|
||||
@override
|
||||
def skippedEntity(self, name): # noqa: N802
|
||||
def skippedEntity(self, name):
|
||||
"""Receive notification of a skipped entity.
|
||||
|
||||
HTML entities will be skipped by the parser. This method will unescape them
|
||||
@@ -315,7 +313,7 @@ class PatentUsptoIce(PatentUspto):
|
||||
self.text += unescaped
|
||||
|
||||
@override
|
||||
def endElement(self, tag): # noqa: N802
|
||||
def endElement(self, tag):
|
||||
"""Signal the end of an element.
|
||||
|
||||
Args:
|
||||
@@ -603,7 +601,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
||||
self.style_html = HtmlEntity()
|
||||
|
||||
@override
|
||||
def startElement(self, tag, attributes): # noqa: N802
|
||||
def startElement(self, tag, attributes):
|
||||
"""Signal the start of an element.
|
||||
|
||||
Args:
|
||||
@@ -616,7 +614,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
||||
self._start_registered_elements(tag, attributes)
|
||||
|
||||
@override
|
||||
def skippedEntity(self, name): # noqa: N802
|
||||
def skippedEntity(self, name):
|
||||
"""Receive notification of a skipped entity.
|
||||
|
||||
HTML entities will be skipped by the parser. This method will unescape them
|
||||
@@ -650,7 +648,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
||||
self.text += unescaped
|
||||
|
||||
@override
|
||||
def endElement(self, tag): # noqa: N802
|
||||
def endElement(self, tag):
|
||||
"""Signal the end of an element.
|
||||
|
||||
Args:
|
||||
@@ -691,7 +689,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
||||
if tag in [member.value for member in self.Element]:
|
||||
if (
|
||||
tag == self.Element.HEADING.value
|
||||
and not self.Element.SDOCL.value in self.property
|
||||
and self.Element.SDOCL.value not in self.property
|
||||
):
|
||||
level_attr: str = attributes.get("LVL", "")
|
||||
new_level: int = int(level_attr) if level_attr.isnumeric() else 1
|
||||
@@ -743,7 +741,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
||||
# headers except claims statement
|
||||
elif (
|
||||
self.Element.HEADING.value in self.property
|
||||
and not self.Element.SDOCL.value in self.property
|
||||
and self.Element.SDOCL.value not in self.property
|
||||
and text.strip()
|
||||
):
|
||||
self.parents[self.level + 1] = self.doc.add_heading(
|
||||
@@ -1164,7 +1162,7 @@ class PatentUsptoAppV1(PatentUspto):
|
||||
self.style_html = HtmlEntity()
|
||||
|
||||
@override
|
||||
def startElement(self, tag, attributes): # noqa: N802
|
||||
def startElement(self, tag, attributes):
|
||||
"""Signal the start of an element.
|
||||
|
||||
Args:
|
||||
@@ -1177,7 +1175,7 @@ class PatentUsptoAppV1(PatentUspto):
|
||||
self._start_registered_elements(tag, attributes)
|
||||
|
||||
@override
|
||||
def skippedEntity(self, name): # noqa: N802
|
||||
def skippedEntity(self, name):
|
||||
"""Receive notification of a skipped entity.
|
||||
|
||||
HTML entities will be skipped by the parser. This method will unescape them
|
||||
@@ -1211,7 +1209,7 @@ class PatentUsptoAppV1(PatentUspto):
|
||||
self.text += unescaped
|
||||
|
||||
@override
|
||||
def endElement(self, tag): # noqa: N802
|
||||
def endElement(self, tag):
|
||||
"""Signal the end of an element.
|
||||
|
||||
Args:
|
||||
@@ -1474,9 +1472,7 @@ class XmlTable:
|
||||
if cw == 0:
|
||||
offset_w0.append(col["offset"][ic])
|
||||
|
||||
min_colinfo["offset"] = sorted(
|
||||
list(set(col["offset"] + min_colinfo["offset"]))
|
||||
)
|
||||
min_colinfo["offset"] = sorted(set(col["offset"] + min_colinfo["offset"]))
|
||||
|
||||
# add back the 0 width cols to offset list
|
||||
offset_w0 = list(set(offset_w0))
|
||||
@@ -1527,7 +1523,7 @@ class XmlTable:
|
||||
|
||||
return ncols_max
|
||||
|
||||
def _parse_table(self, table: Tag) -> TableData:
|
||||
def _parse_table(self, table: Tag) -> TableData: # noqa: C901
|
||||
"""Parse the content of a table tag.
|
||||
|
||||
Args:
|
||||
@@ -1722,7 +1718,7 @@ class HtmlEntity:
|
||||
"0": "⁰",
|
||||
"+": "⁺",
|
||||
"-": "⁻",
|
||||
"−": "⁻",
|
||||
"−": "⁻", # noqa: RUF001
|
||||
"=": "⁼",
|
||||
"(": "⁽",
|
||||
")": "⁾",
|
||||
@@ -1746,7 +1742,7 @@ class HtmlEntity:
|
||||
"0": "₀",
|
||||
"+": "₊",
|
||||
"-": "₋",
|
||||
"−": "₋",
|
||||
"−": "₋", # noqa: RUF001
|
||||
"=": "₌",
|
||||
"(": "₍",
|
||||
")": "₎",
|
||||
|
||||
Reference in New Issue
Block a user