ci: add coverage and ruff (#1383)
* add coverage calculation and push
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* new codecov version and usage of token
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* enable ruff formatter instead of black and isort
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* apply ruff lint fixes
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* apply ruff unsafe fixes
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* add removed imports
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* runs 1 on linter issues
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* finalize linter fixes
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* Update pyproject.toml
  Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
  Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Parent: 293c28ca7c
Commit: 5458a88464

.github/codecov.yml (new file, 17 added lines)
@@ -0,0 +1,17 @@
+codecov:
+  # https://docs.codecov.io/docs/comparing-commits
+  allow_coverage_offsets: true
+coverage:
+  status:
+    project:
+      default:
+        informational: true
+        target: auto  # auto compares coverage to the previous base commit
+        flags:
+          - docling
+comment:
+  layout: "reach, diff, flags, files"
+  behavior: default
+  require_changes: false  # if true: only post the comment if coverage changes
+  branches:  # branch names that can post comment
+    - "main"
.github/workflows/cd.yml (2 changed lines)
@@ -10,6 +10,8 @@ env:
 jobs:
   code-checks:
     uses: ./.github/workflows/checks.yml
+    with:
+      push_coverage: false
   pre-release-check:
     runs-on: ubuntu-latest
     outputs:
.github/workflows/checks.yml (16 changed lines)
@@ -1,5 +1,13 @@
 on:
   workflow_call:
+    inputs:
+      push_coverage:
+        type: boolean
+        description: "If true, the coverage results are pushed to codecov.io."
+        default: true
+    secrets:
+      CODECOV_TOKEN:
+        required: false

 env:
   HF_HUB_DOWNLOAD_TIMEOUT: "60"
@@ -32,7 +40,13 @@ jobs:
         run: poetry install --all-extras
       - name: Testing
         run: |
-          poetry run pytest -v tests
+          poetry run pytest -v --cov=docling --cov-report=xml tests
+      - name: Upload coverage to Codecov
+        if: inputs.push_coverage
+        uses: codecov/codecov-action@v5
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          file: ./coverage.xml
       - name: Run examples
         run: |
           for file in docs/examples/*.py; do
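The Testing step above now writes a Cobertura-style coverage.xml via pytest-cov, and the new upload step pushes it to codecov.io only when push_coverage is true. A minimal local sanity check of that report, assuming coverage.py's default XML layout (the root element carries a "line-rate" attribute) and the same coverage.xml path used in the workflow:

# Run first: poetry run pytest -v --cov=docling --cov-report=xml tests
import xml.etree.ElementTree as ET

root = ET.parse("coverage.xml").getroot()
line_rate = float(root.attrib["line-rate"])  # fraction of covered lines
print(f"Overall line coverage: {line_rate:.1%}")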
.github/workflows/ci.yml (2 changed lines)
@@ -17,3 +17,5 @@ jobs:
   code-checks:
     if: ${{ github.event_name == 'push' || (github.event.pull_request.head.repo.full_name != 'docling-project/docling' && github.event.pull_request.head.repo.full_name != 'docling-project/docling') }}
     uses: ./.github/workflows/checks.yml
+    secrets:
+      CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
@@ -1,43 +1,26 @@
 fail_fast: true
 repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.11.5
+    hooks:
+      # Run the Ruff formatter.
+      - id: ruff-format
+        name: "Ruff formatter"
+        args: [--config=pyproject.toml]
+        files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
+      # Run the Ruff linter.
+      - id: ruff
+        name: "Ruff linter"
+        args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
+        files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
   - repo: local
     hooks:
-      - id: black
-        name: Black
-        entry: poetry run black docling docs/examples tests
-        pass_filenames: false
-        language: system
-        files: '\.py$'
-      - id: isort
-        name: isort
-        entry: poetry run isort docling docs/examples tests
-        pass_filenames: false
-        language: system
-        files: '\.py$'
-      # - id: flake8
-      #   name: flake8
-      #   entry: poetry run flake8 docling
-      #   pass_filenames: false
-      #   language: system
-      #   files: '\.py$'
       - id: mypy
         name: MyPy
         entry: poetry run mypy docling
         pass_filenames: false
         language: system
         files: '\.py$'
-      - id: nbqa_black
-        name: nbQA Black
-        entry: poetry run nbqa black docs/examples
-        pass_filenames: false
-        language: system
-        files: '\.ipynb$'
-      - id: nbqa_isort
-        name: nbQA isort
-        entry: poetry run nbqa isort docs/examples
-        pass_filenames: false
-        language: system
-        files: '\.ipynb$'
       - id: poetry
         name: Poetry check
         entry: poetry check --lock
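The remainder of the commit applies the Ruff formatter and the lint fixes mentioned in the commit message; the hunks below repeat a handful of mechanical patterns (identity comparisons with None, "not in" membership tests, range() without a redundant start argument, f-strings instead of str.format, imports of Iterable and friends from collections.abc). A short illustrative sketch of those patterns, using made-up code rather than code taken from the repository:

from collections.abc import Iterable  # was: from typing import Iterable


def join_parts(parts: Iterable[str]) -> str:
    return ", ".join(parts)


def first_unset(parents: dict) -> int:
    for k, v in parents.items():
        if v is None and k > 0:  # was: if v == None and k > 0
            return k - 1
    return 0


allowed = {",", ";", "\t", "|", ":"}
if ";" not in allowed:  # was: if not ";" in allowed
    raise RuntimeError("unknown delimiter")

for _ in range(10):  # was: for i in range(0, 10)
    pass

name = "list"
label = f"group: {name}"  # was: "group: {}".format(name)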
@@ -34,7 +34,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                 text_stream = self.path_or_stream.getvalue().decode("utf-8")
                 self.lines = text_stream.split("\n")
             if isinstance(self.path_or_stream, Path):
-                with open(self.path_or_stream, "r", encoding="utf-8") as f:
+                with open(self.path_or_stream, encoding="utf-8") as f:
                     self.lines = f.readlines()
             self.valid = True

@ -75,14 +75,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
|
||||
return doc
|
||||
|
||||
def _parse(self, doc: DoclingDocument):
|
||||
def _parse(self, doc: DoclingDocument): # noqa: C901
|
||||
"""
|
||||
Main function that orchestrates the parsing by yielding components:
|
||||
title, section headers, text, lists, and tables.
|
||||
"""
|
||||
|
||||
content = ""
|
||||
|
||||
in_list = False
|
||||
in_table = False
|
||||
|
||||
@ -95,7 +93,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
# indents: dict[int, Union[DocItem, GroupItem, None]] = {}
|
||||
indents: dict[int, Union[GroupItem, None]] = {}
|
||||
|
||||
for i in range(0, 10):
|
||||
for i in range(10):
|
||||
parents[i] = None
|
||||
indents[i] = None
|
||||
|
||||
@ -125,7 +123,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
|
||||
# Lists
|
||||
elif self._is_list_item(line):
|
||||
|
||||
_log.debug(f"line: {line}")
|
||||
item = self._parse_list_item(line)
|
||||
_log.debug(f"parsed list-item: {item}")
|
||||
@ -147,7 +144,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
indents[level + 1] = item["indent"]
|
||||
|
||||
elif in_list and item["indent"] < indents[level]:
|
||||
|
||||
# print(item["indent"], " => ", indents[level])
|
||||
while item["indent"] < indents[level]:
|
||||
# print(item["indent"], " => ", indents[level])
|
||||
@ -176,7 +172,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
elif in_table and (
|
||||
(not self._is_table_line(line)) or line.strip() == "|==="
|
||||
): # end of table
|
||||
|
||||
caption = None
|
||||
if len(caption_data) > 0:
|
||||
caption = doc.add_text(
|
||||
@ -195,7 +190,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
|
||||
# Picture
|
||||
elif self._is_picture(line):
|
||||
|
||||
caption = None
|
||||
if len(caption_data) > 0:
|
||||
caption = doc.add_text(
|
||||
@ -250,7 +244,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
text_data = []
|
||||
|
||||
elif len(line.strip()) > 0: # allow multiline texts
|
||||
|
||||
item = self._parse_text(line)
|
||||
text_data.append(item["text"])
|
||||
|
||||
@ -273,14 +266,14 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
|
||||
def _get_current_level(self, parents):
|
||||
for k, v in parents.items():
|
||||
if v == None and k > 0:
|
||||
if v is None and k > 0:
|
||||
return k - 1
|
||||
|
||||
return 0
|
||||
|
||||
def _get_current_parent(self, parents):
|
||||
for k, v in parents.items():
|
||||
if v == None and k > 0:
|
||||
if v is None and k > 0:
|
||||
return parents[k - 1]
|
||||
|
||||
return None
|
||||
@ -328,7 +321,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
"marker": marker,
|
||||
"text": text.strip(),
|
||||
"numbered": False,
|
||||
"indent": 0 if indent == None else len(indent),
|
||||
"indent": 0 if indent is None else len(indent),
|
||||
}
|
||||
else:
|
||||
return {
|
||||
@ -336,7 +329,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
"marker": marker,
|
||||
"text": text.strip(),
|
||||
"numbered": True,
|
||||
"indent": 0 if indent == None else len(indent),
|
||||
"indent": 0 if indent is None else len(indent),
|
||||
}
|
||||
else:
|
||||
# Fallback if no match
|
||||
@ -357,7 +350,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
return [cell.strip() for cell in line.split("|") if cell.strip()]
|
||||
|
||||
def _populate_table_as_grid(self, table_data):
|
||||
|
||||
num_rows = len(table_data)
|
||||
|
||||
# Adjust the table data into a grid format
|
||||
|
@@ -58,7 +58,7 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
         head = self.content.readline()
         dialect = csv.Sniffer().sniff(head, ",;\t|:")
         _log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"')
-        if not dialect.delimiter in {",", ";", "\t", "|", ":"}:
+        if dialect.delimiter not in {",", ";", "\t", "|", ":"}:
             raise RuntimeError(
                 f"Cannot convert csv with unknown delimiter {dialect.delimiter}."
             )
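For context, csv.Sniffer in the hunk above guesses the delimiter from the first line of the file, restricted to the same whitelist the backend then re-checks with the corrected "not in" test. A small standalone usage sketch with made-up sample data:

import csv

head = "a;b;c\n1;2;3\n"  # first lines of a hypothetical CSV file
dialect = csv.Sniffer().sniff(head, ",;\t|:")
if dialect.delimiter not in {",", ";", "\t", "|", ":"}:
    raise RuntimeError(f"Cannot convert csv with unknown delimiter {dialect.delimiter}.")
print(dialect.delimiter)  # expected: ";"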
@ -1,8 +1,9 @@
|
||||
import logging
|
||||
import random
|
||||
from collections.abc import Iterable
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List, Optional, Union
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import pypdfium2 as pdfium
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin, Size
|
||||
@ -156,7 +157,6 @@ class DoclingParsePageBackend(PdfPageBackend):
|
||||
def get_page_image(
|
||||
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
||||
) -> Image.Image:
|
||||
|
||||
page_size = self.get_size()
|
||||
|
||||
if not cropbox:
|
||||
|
@ -1,8 +1,9 @@
|
||||
import logging
|
||||
import random
|
||||
from collections.abc import Iterable
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
||||
from typing import TYPE_CHECKING, List, Optional, Union
|
||||
|
||||
import pypdfium2 as pdfium
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
@ -172,7 +173,6 @@ class DoclingParseV2PageBackend(PdfPageBackend):
|
||||
def get_page_image(
|
||||
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
||||
) -> Image.Image:
|
||||
|
||||
page_size = self.get_size()
|
||||
|
||||
if not cropbox:
|
||||
|
@ -1,14 +1,14 @@
|
||||
import logging
|
||||
import random
|
||||
from collections.abc import Iterable
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
||||
from typing import TYPE_CHECKING, Optional, Union
|
||||
|
||||
import pypdfium2 as pdfium
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
||||
from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
|
||||
from PIL import Image, ImageDraw
|
||||
from PIL import Image
|
||||
from pypdfium2 import PdfPage
|
||||
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||
@ -93,7 +93,6 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
||||
def get_page_image(
|
||||
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
||||
) -> Image.Image:
|
||||
|
||||
page_size = self.get_size()
|
||||
|
||||
if not cropbox:
|
||||
|
@ -1,12 +1,8 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
|
||||
On 23/01/2025
|
||||
"""
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")
|
||||
|
||||
BLANK = ""
|
||||
@ -79,7 +75,6 @@ CHR_BO = {
|
||||
}
|
||||
|
||||
T = {
|
||||
"\u2192": "\\rightarrow ",
|
||||
# Greek letters
|
||||
"\U0001d6fc": "\\alpha ",
|
||||
"\U0001d6fd": "\\beta ",
|
||||
|
@ -76,8 +76,7 @@ def get_val(key, default=None, store=CHR):
|
||||
return default
|
||||
|
||||
|
||||
class Tag2Method(object):
|
||||
|
||||
class Tag2Method:
|
||||
def call_method(self, elm, stag=None):
|
||||
getmethod = self.tag2meth.get
|
||||
if stag is None:
|
||||
@ -130,7 +129,6 @@ class Tag2Method(object):
|
||||
|
||||
|
||||
class Pr(Tag2Method):
|
||||
|
||||
text = ""
|
||||
|
||||
__val_tags = ("chr", "pos", "begChr", "endChr", "type")
|
||||
@ -159,7 +157,7 @@ class Pr(Tag2Method):
|
||||
def do_common(self, elm):
|
||||
stag = elm.tag.replace(OMML_NS, "")
|
||||
if stag in self.__val_tags:
|
||||
t = elm.get("{0}val".format(OMML_NS))
|
||||
t = elm.get(f"{OMML_NS}val")
|
||||
self.__innerdict[stag] = t
|
||||
return None
|
||||
|
||||
@ -248,7 +246,6 @@ class oMath2Latex(Tag2Method):
|
||||
"""
|
||||
the Pre-Sub-Superscript object -- Not support yet
|
||||
"""
|
||||
pass
|
||||
|
||||
def do_sub(self, elm):
|
||||
text = self.process_children(elm)
|
||||
@ -331,7 +328,7 @@ class oMath2Latex(Tag2Method):
|
||||
t_dict = self.process_children_dict(elm, include=("e", "lim"))
|
||||
latex_s = LIM_FUNC.get(t_dict["e"])
|
||||
if not latex_s:
|
||||
raise NotSupport("Not support lim %s" % t_dict["e"])
|
||||
raise RuntimeError("Not support lim {}".format(t_dict["e"]))
|
||||
else:
|
||||
return latex_s.format(lim=t_dict.get("lim"))
|
||||
|
||||
@ -413,7 +410,7 @@ class oMath2Latex(Tag2Method):
|
||||
"""
|
||||
_str = []
|
||||
_base_str = []
|
||||
found_text = elm.findtext("./{0}t".format(OMML_NS))
|
||||
found_text = elm.findtext(f"./{OMML_NS}t")
|
||||
if found_text:
|
||||
for s in found_text:
|
||||
out_latex_str = self.process_unicode(s)
|
||||
|
@ -55,7 +55,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.max_levels = 10
|
||||
self.level = 0
|
||||
self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
|
||||
for i in range(0, self.max_levels):
|
||||
for i in range(self.max_levels):
|
||||
self.parents[i] = None
|
||||
|
||||
try:
|
||||
@ -126,7 +126,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
return doc
|
||||
|
||||
def walk(self, tag: Tag, doc: DoclingDocument) -> None:
|
||||
|
||||
# Iterate over elements in the body of the document
|
||||
text: str = ""
|
||||
for element in tag.children:
|
||||
@ -135,7 +134,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.analyze_tag(cast(Tag, element), doc)
|
||||
except Exception as exc_child:
|
||||
_log.error(
|
||||
f"Error processing child from tag {tag.name}: {repr(exc_child)}"
|
||||
f"Error processing child from tag {tag.name}: {exc_child!r}"
|
||||
)
|
||||
raise exc_child
|
||||
elif isinstance(element, NavigableString) and not isinstance(
|
||||
@ -147,7 +146,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
item for item in element.next_siblings if isinstance(item, Tag)
|
||||
]
|
||||
if element.next_sibling is None or any(
|
||||
[item.name in TAGS_FOR_NODE_ITEMS for item in siblings]
|
||||
item.name in TAGS_FOR_NODE_ITEMS for item in siblings
|
||||
):
|
||||
text = text.strip()
|
||||
if text and tag.name in ["div"]:
|
||||
@ -222,7 +221,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
else:
|
||||
if hlevel > self.level:
|
||||
|
||||
# add invisible group
|
||||
for i in range(self.level + 1, hlevel):
|
||||
self.parents[i] = doc.add_group(
|
||||
@ -234,7 +232,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.level = hlevel
|
||||
|
||||
elif hlevel < self.level:
|
||||
|
||||
# remove the tail
|
||||
for key in self.parents.keys():
|
||||
if key > hlevel:
|
||||
@ -360,7 +357,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
marker = ""
|
||||
enumerated = False
|
||||
if parent_label == GroupLabel.ORDERED_LIST:
|
||||
marker = f"{str(index_in_list)}."
|
||||
marker = f"{index_in_list!s}."
|
||||
enumerated = True
|
||||
doc.add_list_item(
|
||||
text=text,
|
||||
|
@ -83,7 +83,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
# otherwise they represent emphasis (bold or italic)
|
||||
self.markdown = self._shorten_underscore_sequences(text_stream)
|
||||
if isinstance(self.path_or_stream, Path):
|
||||
with open(self.path_or_stream, "r", encoding="utf-8") as f:
|
||||
with open(self.path_or_stream, encoding="utf-8") as f:
|
||||
md_content = f.read()
|
||||
# remove invalid sequences
|
||||
# very long sequences of underscores will lead to unnecessary long processing times.
|
||||
@ -168,7 +168,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
self.inline_texts = []
|
||||
|
||||
def _iterate_elements(
|
||||
def _iterate_elements( # noqa: C901
|
||||
self,
|
||||
element: marko.element.Element,
|
||||
depth: int,
|
||||
@ -176,7 +176,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
visited: Set[marko.element.Element],
|
||||
parent_item: Optional[NodeItem] = None,
|
||||
):
|
||||
|
||||
if element in visited:
|
||||
return
|
||||
|
||||
@ -236,7 +235,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
if has_non_empty_list_items:
|
||||
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
|
||||
parent_item = doc.add_group(
|
||||
label=label, name=f"list", parent=parent_item
|
||||
label=label, name="list", parent=parent_item
|
||||
)
|
||||
|
||||
elif (
|
||||
@ -320,7 +319,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
self._html_blocks += 1
|
||||
self._process_inline_text(parent_item, doc)
|
||||
self._close_table(doc)
|
||||
_log.debug("HTML Block: {}".format(element))
|
||||
_log.debug(f"HTML Block: {element}")
|
||||
if (
|
||||
len(element.body) > 0
|
||||
): # If Marko doesn't return any content for HTML block, skip it
|
||||
@ -332,7 +331,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
else:
|
||||
if not isinstance(element, str):
|
||||
self._close_table(doc)
|
||||
_log.debug("Some other element: {}".format(element))
|
||||
_log.debug(f"Some other element: {element}")
|
||||
|
||||
processed_block_types = (
|
||||
marko.block.Heading,
|
||||
@ -398,7 +397,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
# if HTML blocks were detected, export to HTML and delegate to HTML backend
|
||||
if self._html_blocks > 0:
|
||||
|
||||
# export to HTML
|
||||
html_backend_cls = HTMLDocumentBackend
|
||||
html_str = doc.export_to_html()
|
||||
|
@ -184,7 +184,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
||||
"""
|
||||
|
||||
if self.workbook is not None:
|
||||
|
||||
# Iterate over all sheets
|
||||
for sheet_name in self.workbook.sheetnames:
|
||||
_log.info(f"Processing sheet: {sheet_name}")
|
||||
@ -253,7 +252,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
||||
)
|
||||
|
||||
for excel_cell in excel_table.data:
|
||||
|
||||
cell = TableCell(
|
||||
text=excel_cell.text,
|
||||
row_span=excel_cell.row_span,
|
||||
@ -303,7 +301,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
||||
# Iterate over all cells in the sheet
|
||||
for ri, row in enumerate(sheet.iter_rows(values_only=False)):
|
||||
for rj, cell in enumerate(row):
|
||||
|
||||
# Skip empty or already visited cells
|
||||
if cell.value is None or (ri, rj) in visited:
|
||||
continue
|
||||
@ -342,7 +339,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
||||
visited_cells: set[tuple[int, int]] = set()
|
||||
for ri in range(start_row, max_row + 1):
|
||||
for rj in range(start_col, max_col + 1):
|
||||
|
||||
cell = sheet.cell(row=ri + 1, column=rj + 1) # 1-based indexing
|
||||
|
||||
# Check if the cell belongs to a merged range
|
||||
@ -350,14 +346,12 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
||||
col_span = 1
|
||||
|
||||
for merged_range in sheet.merged_cells.ranges:
|
||||
|
||||
if (
|
||||
merged_range.min_row <= ri + 1
|
||||
and ri + 1 <= merged_range.max_row
|
||||
and merged_range.min_col <= rj + 1
|
||||
and rj + 1 <= merged_range.max_col
|
||||
):
|
||||
|
||||
row_span = merged_range.max_row - merged_range.min_row + 1
|
||||
col_span = merged_range.max_col - merged_range.min_col + 1
|
||||
break
|
||||
@ -499,7 +493,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
|
||||
),
|
||||
),
|
||||
)
|
||||
except:
|
||||
except Exception:
|
||||
_log.error("could not extract the image from excel sheets")
|
||||
|
||||
return doc
|
||||
|
@ -120,13 +120,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
|
||||
return prov
|
||||
|
||||
def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
|
||||
def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size): # noqa: C901
|
||||
is_a_list = False
|
||||
is_list_group_created = False
|
||||
enum_list_item_value = 0
|
||||
new_list = None
|
||||
bullet_type = "None"
|
||||
list_text = ""
|
||||
list_label = GroupLabel.LIST
|
||||
doc_label = DocItemLabel.LIST_ITEM
|
||||
prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)
|
||||
@ -243,7 +242,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
enum_marker = str(enum_list_item_value) + "."
|
||||
if not is_list_group_created:
|
||||
new_list = doc.add_group(
|
||||
label=list_label, name=f"list", parent=parent_slide
|
||||
label=list_label, name="list", parent=parent_slide
|
||||
)
|
||||
is_list_group_created = True
|
||||
doc.add_list_item(
|
||||
@ -368,11 +367,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
slide_width = pptx_obj.slide_width
|
||||
slide_height = pptx_obj.slide_height
|
||||
|
||||
text_content = [] # type: ignore
|
||||
|
||||
max_levels = 10
|
||||
parents = {} # type: ignore
|
||||
for i in range(0, max_levels):
|
||||
for i in range(max_levels):
|
||||
parents[i] = None
|
||||
|
||||
# Loop through each slide
|
||||
@ -383,7 +380,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
)
|
||||
|
||||
slide_size = Size(width=slide_width, height=slide_height)
|
||||
parent_page = doc.add_page(page_no=slide_ind + 1, size=slide_size)
|
||||
doc.add_page(page_no=slide_ind + 1, size=slide_size)
|
||||
|
||||
def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):
|
||||
handle_groups(shape, parent_slide, slide_ind, doc, slide_size)
|
||||
|
@ -158,7 +158,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
def _get_level(self) -> int:
|
||||
"""Return the first None index."""
|
||||
for k, v in self.parents.items():
|
||||
if k >= 0 and v == None:
|
||||
if k >= 0 and v is None:
|
||||
return k
|
||||
return 0
|
||||
|
||||
@ -418,7 +418,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
else prev_parent
|
||||
)
|
||||
|
||||
def _handle_text_elements(
|
||||
def _handle_text_elements( # noqa: C901
|
||||
self,
|
||||
element: BaseOxmlElement,
|
||||
docx_obj: DocxDocument,
|
||||
@ -812,7 +812,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
|
||||
)
|
||||
if cell is None or cell._tc in cell_set:
|
||||
_log.debug(f" skipped since repeated content")
|
||||
_log.debug(" skipped since repeated content")
|
||||
col_idx += cell.grid_span
|
||||
continue
|
||||
else:
|
||||
@ -879,7 +879,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
image=ImageRef.from_pil(image=pil_image, dpi=72),
|
||||
caption=None,
|
||||
)
|
||||
except (UnidentifiedImageError, OSError) as e:
|
||||
except (UnidentifiedImageError, OSError):
|
||||
_log.warning("Warning: image cannot be loaded by Pillow")
|
||||
doc.add_picture(
|
||||
parent=self.parents[level - 1],
|
||||
|
@ -1,7 +1,8 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from collections.abc import Iterable
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Optional, Set, Union
|
||||
from typing import Optional, Set, Union
|
||||
|
||||
from docling_core.types.doc import BoundingBox, Size
|
||||
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
||||
|
@ -1,8 +1,9 @@
|
||||
import logging
|
||||
import random
|
||||
from collections.abc import Iterable
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Iterable, List, Optional, Union
|
||||
from typing import TYPE_CHECKING, List, Optional, Union
|
||||
|
||||
import pypdfium2 as pdfium
|
||||
import pypdfium2.raw as pdfium_c
|
||||
@ -29,7 +30,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
||||
self.valid = True # No better way to tell from pypdfium.
|
||||
try:
|
||||
self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
|
||||
except PdfiumError as e:
|
||||
except PdfiumError:
|
||||
_log.info(
|
||||
f"An exception occurred when loading page {page_no} of document {document_hash}.",
|
||||
exc_info=True,
|
||||
@ -225,7 +226,6 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
||||
def get_page_image(
|
||||
self, scale: float = 1, cropbox: Optional[BoundingBox] = None
|
||||
) -> Image.Image:
|
||||
|
||||
page_size = self.get_size()
|
||||
|
||||
if not cropbox:
|
||||
|
@ -102,13 +102,13 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
doc_info: etree.DocInfo = self.tree.docinfo
|
||||
if doc_info.system_url and any(
|
||||
[kwd in doc_info.system_url for kwd in JATS_DTD_URL]
|
||||
kwd in doc_info.system_url for kwd in JATS_DTD_URL
|
||||
):
|
||||
self.valid = True
|
||||
return
|
||||
for ent in doc_info.internalDTD.iterentities():
|
||||
if ent.system_url and any(
|
||||
[kwd in ent.system_url for kwd in JATS_DTD_URL]
|
||||
kwd in ent.system_url for kwd in JATS_DTD_URL
|
||||
):
|
||||
self.valid = True
|
||||
return
|
||||
@ -232,10 +232,9 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
||||
# TODO: once superscript is supported, add label with formatting
|
||||
aff = aff.removeprefix(f"{label[0].text}, ")
|
||||
affiliation_names.append(aff)
|
||||
affiliation_ids_names = {
|
||||
id: name
|
||||
for id, name in zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
|
||||
}
|
||||
affiliation_ids_names = dict(
|
||||
zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
|
||||
)
|
||||
|
||||
# Get author names and affiliation names
|
||||
for author_node in meta.xpath(
|
||||
@ -300,7 +299,6 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
||||
def _add_abstract(
|
||||
self, doc: DoclingDocument, xml_components: XMLComponents
|
||||
) -> None:
|
||||
|
||||
for abstract in xml_components["abstract"]:
|
||||
text: str = abstract["content"]
|
||||
title: str = abstract["label"] or DEFAULT_HEADER_ABSTRACT
|
||||
@ -349,7 +347,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
return
|
||||
|
||||
def _parse_element_citation(self, node: etree._Element) -> str:
|
||||
def _parse_element_citation(self, node: etree._Element) -> str: # noqa: C901
|
||||
citation: Citation = {
|
||||
"author_names": "",
|
||||
"title": "",
|
||||
@ -440,7 +438,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
||||
citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip()
|
||||
if len(node.xpath("lpage")) > 0:
|
||||
citation["page"] += (
|
||||
"–" + node.xpath("lpage")[0].text.replace("\n", " ").strip()
|
||||
"–" + node.xpath("lpage")[0].text.replace("\n", " ").strip() # noqa: RUF001
|
||||
)
|
||||
|
||||
# Flatten the citation to string
|
||||
@ -595,9 +593,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
try:
|
||||
self._add_table(doc, parent, table)
|
||||
except Exception as e:
|
||||
_log.warning(f"Skipping unsupported table in {str(self.file)}")
|
||||
pass
|
||||
except Exception:
|
||||
_log.warning(f"Skipping unsupported table in {self.file!s}")
|
||||
|
||||
return
|
||||
|
||||
@ -609,7 +606,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
return
|
||||
|
||||
def _walk_linear(
|
||||
def _walk_linear( # noqa: C901
|
||||
self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
|
||||
) -> str:
|
||||
skip_tags = ["term"]
|
||||
|
@ -122,7 +122,6 @@ class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
@override
|
||||
def convert(self) -> DoclingDocument:
|
||||
|
||||
if self.parser is not None:
|
||||
doc = self.parser.parse(self.patent_content)
|
||||
if doc is None:
|
||||
@ -163,7 +162,6 @@ class PatentUspto(ABC):
|
||||
Returns:
|
||||
The patent parsed as a docling document.
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class PatentUsptoIce(PatentUspto):
|
||||
@ -265,7 +263,7 @@ class PatentUsptoIce(PatentUspto):
|
||||
self.style_html = HtmlEntity()
|
||||
|
||||
@override
|
||||
def startElement(self, tag, attributes): # noqa: N802
|
||||
def startElement(self, tag, attributes):
|
||||
"""Signal the start of an element.
|
||||
|
||||
Args:
|
||||
@ -281,7 +279,7 @@ class PatentUsptoIce(PatentUspto):
|
||||
self._start_registered_elements(tag, attributes)
|
||||
|
||||
@override
|
||||
def skippedEntity(self, name): # noqa: N802
|
||||
def skippedEntity(self, name):
|
||||
"""Receive notification of a skipped entity.
|
||||
|
||||
HTML entities will be skipped by the parser. This method will unescape them
|
||||
@ -315,7 +313,7 @@ class PatentUsptoIce(PatentUspto):
|
||||
self.text += unescaped
|
||||
|
||||
@override
|
||||
def endElement(self, tag): # noqa: N802
|
||||
def endElement(self, tag):
|
||||
"""Signal the end of an element.
|
||||
|
||||
Args:
|
||||
@ -603,7 +601,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
||||
self.style_html = HtmlEntity()
|
||||
|
||||
@override
|
||||
def startElement(self, tag, attributes): # noqa: N802
|
||||
def startElement(self, tag, attributes):
|
||||
"""Signal the start of an element.
|
||||
|
||||
Args:
|
||||
@ -616,7 +614,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
||||
self._start_registered_elements(tag, attributes)
|
||||
|
||||
@override
|
||||
def skippedEntity(self, name): # noqa: N802
|
||||
def skippedEntity(self, name):
|
||||
"""Receive notification of a skipped entity.
|
||||
|
||||
HTML entities will be skipped by the parser. This method will unescape them
|
||||
@ -650,7 +648,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
||||
self.text += unescaped
|
||||
|
||||
@override
|
||||
def endElement(self, tag): # noqa: N802
|
||||
def endElement(self, tag):
|
||||
"""Signal the end of an element.
|
||||
|
||||
Args:
|
||||
@ -691,7 +689,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
||||
if tag in [member.value for member in self.Element]:
|
||||
if (
|
||||
tag == self.Element.HEADING.value
|
||||
and not self.Element.SDOCL.value in self.property
|
||||
and self.Element.SDOCL.value not in self.property
|
||||
):
|
||||
level_attr: str = attributes.get("LVL", "")
|
||||
new_level: int = int(level_attr) if level_attr.isnumeric() else 1
|
||||
@ -743,7 +741,7 @@ class PatentUsptoGrantV2(PatentUspto):
|
||||
# headers except claims statement
|
||||
elif (
|
||||
self.Element.HEADING.value in self.property
|
||||
and not self.Element.SDOCL.value in self.property
|
||||
and self.Element.SDOCL.value not in self.property
|
||||
and text.strip()
|
||||
):
|
||||
self.parents[self.level + 1] = self.doc.add_heading(
|
||||
@ -1164,7 +1162,7 @@ class PatentUsptoAppV1(PatentUspto):
|
||||
self.style_html = HtmlEntity()
|
||||
|
||||
@override
|
||||
def startElement(self, tag, attributes): # noqa: N802
|
||||
def startElement(self, tag, attributes):
|
||||
"""Signal the start of an element.
|
||||
|
||||
Args:
|
||||
@ -1177,7 +1175,7 @@ class PatentUsptoAppV1(PatentUspto):
|
||||
self._start_registered_elements(tag, attributes)
|
||||
|
||||
@override
|
||||
def skippedEntity(self, name): # noqa: N802
|
||||
def skippedEntity(self, name):
|
||||
"""Receive notification of a skipped entity.
|
||||
|
||||
HTML entities will be skipped by the parser. This method will unescape them
|
||||
@ -1211,7 +1209,7 @@ class PatentUsptoAppV1(PatentUspto):
|
||||
self.text += unescaped
|
||||
|
||||
@override
|
||||
def endElement(self, tag): # noqa: N802
|
||||
def endElement(self, tag):
|
||||
"""Signal the end of an element.
|
||||
|
||||
Args:
|
||||
@ -1474,9 +1472,7 @@ class XmlTable:
|
||||
if cw == 0:
|
||||
offset_w0.append(col["offset"][ic])
|
||||
|
||||
min_colinfo["offset"] = sorted(
|
||||
list(set(col["offset"] + min_colinfo["offset"]))
|
||||
)
|
||||
min_colinfo["offset"] = sorted(set(col["offset"] + min_colinfo["offset"]))
|
||||
|
||||
# add back the 0 width cols to offset list
|
||||
offset_w0 = list(set(offset_w0))
|
||||
@ -1527,7 +1523,7 @@ class XmlTable:
|
||||
|
||||
return ncols_max
|
||||
|
||||
def _parse_table(self, table: Tag) -> TableData:
|
||||
def _parse_table(self, table: Tag) -> TableData: # noqa: C901
|
||||
"""Parse the content of a table tag.
|
||||
|
||||
Args:
|
||||
@ -1722,7 +1718,7 @@ class HtmlEntity:
|
||||
"0": "⁰",
|
||||
"+": "⁺",
|
||||
"-": "⁻",
|
||||
"−": "⁻",
|
||||
"−": "⁻", # noqa: RUF001
|
||||
"=": "⁼",
|
||||
"(": "⁽",
|
||||
")": "⁾",
|
||||
@ -1746,7 +1742,7 @@ class HtmlEntity:
|
||||
"0": "₀",
|
||||
"+": "₊",
|
||||
"-": "₋",
|
||||
"−": "₋",
|
||||
"−": "₋", # noqa: RUF001
|
||||
"=": "₌",
|
||||
"(": "₍",
|
||||
")": "₎",
|
||||
|
@ -6,14 +6,16 @@ import sys
|
||||
import tempfile
|
||||
import time
|
||||
import warnings
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
||||
from typing import Annotated, Dict, List, Optional, Type
|
||||
|
||||
import rich.table
|
||||
import typer
|
||||
from docling_core.types.doc import ImageRefMode
|
||||
from docling_core.utils.file import resolve_source_to_path
|
||||
from pydantic import TypeAdapter
|
||||
from rich.console import Console
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||
@ -53,7 +55,6 @@ warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|
|
||||
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
from rich.console import Console
|
||||
|
||||
console = Console()
|
||||
err_console = Console(stderr=True)
|
||||
@ -160,7 +161,6 @@ def export_documents(
|
||||
export_doctags: bool,
|
||||
image_export_mode: ImageRefMode,
|
||||
):
|
||||
|
||||
success_count = 0
|
||||
failure_count = 0
|
||||
|
||||
@ -233,7 +233,7 @@ def _split_list(raw: Optional[str]) -> Optional[List[str]]:
|
||||
|
||||
|
||||
@app.command(no_args_is_help=True)
|
||||
def convert(
|
||||
def convert( # noqa: C901
|
||||
input_sources: Annotated[
|
||||
List[str],
|
||||
typer.Argument(
|
||||
@ -289,7 +289,7 @@ def convert(
|
||||
...,
|
||||
help=(
|
||||
f"The OCR engine to use. When --allow-external-plugins is *not* set, the available values are: "
|
||||
f"{', '.join((o.value for o in ocr_engines_enum_internal))}. "
|
||||
f"{', '.join(o.value for o in ocr_engines_enum_internal)}. "
|
||||
f"Use the option --show-external-plugins to see the options allowed with external plugins."
|
||||
),
|
||||
),
|
||||
@ -430,7 +430,7 @@ def convert(
|
||||
settings.debug.visualize_ocr = debug_visualize_ocr
|
||||
|
||||
if from_formats is None:
|
||||
from_formats = [e for e in InputFormat]
|
||||
from_formats = list(InputFormat)
|
||||
|
||||
parsed_headers: Optional[Dict[str, str]] = None
|
||||
if headers is not None:
|
||||
|
@ -62,7 +62,7 @@ def download(
|
||||
models: Annotated[
|
||||
Optional[list[_AvailableModels]],
|
||||
typer.Argument(
|
||||
help=f"Models to download (default behavior: a predefined set of models will be downloaded).",
|
||||
help="Models to download (default behavior: a predefined set of models will be downloaded).",
|
||||
),
|
||||
] = None,
|
||||
all: Annotated[
|
||||
@ -89,14 +89,13 @@ def download(
|
||||
"Cannot simultaneously set 'all' parameter and specify models to download."
|
||||
)
|
||||
if not quiet:
|
||||
FORMAT = "%(message)s"
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="[blue]%(message)s[/blue]",
|
||||
datefmt="[%X]",
|
||||
handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
|
||||
)
|
||||
to_download = models or ([m for m in _AvailableModels] if all else _default_models)
|
||||
to_download = models or (list(_AvailableModels) if all else _default_models)
|
||||
output_dir = download_models(
|
||||
output_dir=output_dir,
|
||||
force=force,
|
||||
|
@ -10,7 +10,9 @@ from docling_core.types.doc import (
|
||||
TableCell,
|
||||
)
|
||||
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
||||
from docling_core.types.io import ( # DO ΝΟΤ REMOVE; explicitly exposed from this location
|
||||
|
||||
# DO NOT REMOVE; explicitly exposed from this location
|
||||
from docling_core.types.io import (
|
||||
DocumentStream,
|
||||
)
|
||||
from PIL.Image import Image
|
||||
@ -233,9 +235,9 @@ class Page(BaseModel):
|
||||
None # Internal PDF backend. By default it is cleared during assembling.
|
||||
)
|
||||
_default_image_scale: float = 1.0 # Default image scale for external usage.
|
||||
_image_cache: Dict[float, Image] = (
|
||||
{}
|
||||
) # Cache of images in different scales. By default it is cleared during assembling.
|
||||
_image_cache: Dict[
|
||||
float, Image
|
||||
] = {} # Cache of images in different scales. By default it is cleared during assembling.
|
||||
|
||||
def get_image(
|
||||
self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
|
||||
@ -243,7 +245,7 @@ class Page(BaseModel):
|
||||
if self._backend is None:
|
||||
return self._image_cache.get(scale, None)
|
||||
|
||||
if not scale in self._image_cache:
|
||||
if scale not in self._image_cache:
|
||||
if cropbox is None:
|
||||
self._image_cache[scale] = self._backend.get_page_image(scale=scale)
|
||||
else:
|
||||
|
@ -1,13 +1,13 @@
|
||||
import csv
|
||||
import logging
|
||||
import re
|
||||
from collections.abc import Iterable
|
||||
from enum import Enum
|
||||
from io import BytesIO
|
||||
from pathlib import Path, PurePath
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Dict,
|
||||
Iterable,
|
||||
List,
|
||||
Literal,
|
||||
Optional,
|
||||
@ -17,6 +17,8 @@ from typing import (
|
||||
)
|
||||
|
||||
import filetype
|
||||
|
||||
# DO NOT REMOVE; explicitly exposed from this location
|
||||
from docling_core.types.doc import (
|
||||
DocItem,
|
||||
DocItemLabel,
|
||||
@ -35,14 +37,14 @@ from docling_core.types.legacy_doc.base import (
|
||||
PageReference,
|
||||
Prov,
|
||||
Ref,
|
||||
Table as DsSchemaTable,
|
||||
TableCell,
|
||||
)
|
||||
from docling_core.types.legacy_doc.base import Table as DsSchemaTable
|
||||
from docling_core.types.legacy_doc.base import TableCell
|
||||
from docling_core.types.legacy_doc.document import (
|
||||
CCSDocumentDescription as DsDocumentDescription,
|
||||
CCSFileInfoObject as DsFileInfoObject,
|
||||
ExportedCCSDocument as DsDocument,
|
||||
)
|
||||
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
||||
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
||||
from docling_core.utils.file import resolve_source_to_stream
|
||||
from docling_core.utils.legacy import docling_document_to_legacy
|
||||
from pydantic import BaseModel
|
||||
@ -65,7 +67,7 @@ from docling.datamodel.base_models import (
|
||||
)
|
||||
from docling.datamodel.settings import DocumentLimits
|
||||
from docling.utils.profiling import ProfilingItem
|
||||
from docling.utils.utils import create_file_hash, create_hash
|
||||
from docling.utils.utils import create_file_hash
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from docling.document_converter import FormatOption
|
||||
@ -134,9 +136,9 @@ class InputDocument(BaseModel):
|
||||
self._init_doc(backend, path_or_stream)
|
||||
|
||||
elif isinstance(path_or_stream, BytesIO):
|
||||
assert (
|
||||
filename is not None
|
||||
), "Can't construct InputDocument from stream without providing filename arg."
|
||||
assert filename is not None, (
|
||||
"Can't construct InputDocument from stream without providing filename arg."
|
||||
)
|
||||
self.file = PurePath(filename)
|
||||
self.filesize = path_or_stream.getbuffer().nbytes
|
||||
|
||||
@ -228,7 +230,6 @@ class _DummyBackend(AbstractDocumentBackend):
|
||||
|
||||
|
||||
class _DocumentConversionInput(BaseModel):
|
||||
|
||||
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
|
||||
headers: Optional[Dict[str, str]] = None
|
||||
limits: Optional[DocumentLimits] = DocumentLimits()
|
||||
|
@ -380,7 +380,6 @@ class PaginatedPipelineOptions(PipelineOptions):
|
||||
|
||||
|
||||
class VlmPipelineOptions(PaginatedPipelineOptions):
|
||||
|
||||
generate_page_images: bool = True
|
||||
force_backend_text: bool = (
|
||||
False # (To be used with vlms, or other generative models)
|
||||
|
@ -1,11 +1,11 @@
|
||||
import hashlib
|
||||
import logging
|
||||
import math
|
||||
import sys
|
||||
import time
|
||||
from collections.abc import Iterable, Iterator
|
||||
from functools import partial
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union
|
||||
from typing import Dict, List, Optional, Tuple, Type, Union
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
||||
|
||||
@ -172,7 +172,7 @@ class DocumentConverter:
|
||||
format_options: Optional[Dict[InputFormat, FormatOption]] = None,
|
||||
):
|
||||
self.allowed_formats = (
|
||||
allowed_formats if allowed_formats is not None else [e for e in InputFormat]
|
||||
allowed_formats if allowed_formats is not None else list(InputFormat)
|
||||
)
|
||||
self.format_to_options = {
|
||||
format: (
|
||||
@ -254,7 +254,7 @@ class DocumentConverter:
|
||||
|
||||
if not had_result and raises_on_error:
|
||||
raise ConversionError(
|
||||
f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
|
||||
"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
|
||||
)
|
||||
|
||||
def _convert(
|
||||
@ -266,7 +266,7 @@ class DocumentConverter:
|
||||
conv_input.docs(self.format_to_options),
|
||||
settings.perf.doc_batch_size, # pass format_options
|
||||
):
|
||||
_log.info(f"Going to convert document batch...")
|
||||
_log.info("Going to convert document batch...")
|
||||
|
||||
# parallel processing only within input_batch
|
||||
# with ThreadPoolExecutor(
|
||||
|
@ -1,4 +1,4 @@
|
||||
from typing import Iterable
|
||||
from collections.abc import Iterable
|
||||
|
||||
from docling.datamodel.base_models import Page, VlmPrediction
|
||||
from docling.datamodel.document import ConversionResult
|
||||
@ -10,7 +10,6 @@ from docling.utils.profiling import TimeRecorder
|
||||
|
||||
|
||||
class ApiVlmModel(BasePageModel):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
enabled: bool,
|
||||
|
@ -1,5 +1,6 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Generic, Iterable, Optional, Protocol, Type
|
||||
from collections.abc import Iterable
|
||||
from typing import Generic, Optional, Protocol, Type
|
||||
|
||||
from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
|
||||
from typing_extensions import TypeVar
|
||||
@ -29,7 +30,6 @@ EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)
|
||||
|
||||
|
||||
class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
|
||||
|
||||
elements_batch_size: int = settings.perf.elements_batch_size
|
||||
|
||||
@abstractmethod
|
||||
@ -50,7 +50,6 @@ class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
|
||||
|
||||
|
||||
class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]):
|
||||
|
||||
def prepare_element(
|
||||
self, conv_res: ConversionResult, element: NodeItem
|
||||
) -> Optional[NodeItem]:
|
||||
@ -62,7 +61,6 @@ class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]):
|
||||
class BaseItemAndImageEnrichmentModel(
|
||||
GenericEnrichmentModel[ItemAndImageEnrichmentElement]
|
||||
):
|
||||
|
||||
images_scale: float
|
||||
expansion_factor: float = 0.0
|
||||
|
||||
|
@ -1,12 +1,12 @@
|
||||
import copy
|
||||
import logging
|
||||
from abc import abstractmethod
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List, Optional, Type
|
||||
from typing import List, Optional, Type
|
||||
|
||||
import numpy as np
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import BoundingRectangle, PdfTextCell, TextCell
|
||||
from PIL import Image, ImageDraw
|
||||
from rtree import index
|
||||
from scipy.ndimage import binary_dilation, find_objects, label
|
||||
|
@ -1,7 +1,8 @@
|
||||
import re
|
||||
from collections import Counter
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List, Literal, Optional, Tuple, Union
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
from docling_core.types.doc import (
|
||||
|
@ -1,5 +1,6 @@
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List, Literal, Optional, Tuple, Union
|
||||
from typing import List, Literal, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
from docling_core.types.doc import (
|
||||
|
@ -1,8 +1,9 @@
|
||||
import logging
|
||||
import warnings
|
||||
import zipfile
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List, Optional, Type
|
||||
from typing import List, Optional, Type
|
||||
|
||||
import numpy
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
@ -58,12 +59,10 @@ class EasyOcrModel(BaseOcrModel):
|
||||
device = decide_device(accelerator_options.device)
|
||||
# Enable easyocr GPU if running on CUDA, MPS
|
||||
use_gpu = any(
|
||||
[
|
||||
device.startswith(x)
|
||||
for x in [
|
||||
AcceleratorDevice.CUDA.value,
|
||||
AcceleratorDevice.MPS.value,
|
||||
]
|
||||
device.startswith(x)
|
||||
for x in [
|
||||
AcceleratorDevice.CUDA.value,
|
||||
AcceleratorDevice.MPS.value,
|
||||
]
|
||||
)
|
||||
else:
|
||||
@ -98,8 +97,10 @@ class EasyOcrModel(BaseOcrModel):
|
||||
progress: bool = False,
|
||||
) -> Path:
|
||||
# Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
|
||||
from easyocr.config import detection_models as det_models_dict
|
||||
from easyocr.config import recognition_models as rec_models_dict
|
||||
from easyocr.config import (
|
||||
detection_models as det_models_dict,
|
||||
recognition_models as rec_models_dict,
|
||||
)
|
||||
|
||||
if local_dir is None:
|
||||
local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder
|
||||
@ -126,13 +127,11 @@ class EasyOcrModel(BaseOcrModel):
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
|
||||
if not self.enabled:
|
||||
yield from page_batch
|
||||
return
|
||||
|
||||
for page in page_batch:
|
||||
|
||||
assert page._backend is not None
|
||||
if not page._backend.is_valid():
|
||||
yield page
|
||||
|
@ -9,7 +9,7 @@ from docling.models.factories.picture_description_factory import (
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@lru_cache()
|
||||
@lru_cache
|
||||
def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
|
||||
factory = OcrFactory()
|
||||
factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
|
||||
@ -17,7 +17,7 @@ def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
|
||||
return factory
|
||||
|
||||
|
||||
@lru_cache()
|
||||
@lru_cache
|
||||
def get_picture_description_factory(
|
||||
allow_external_plugins: bool = False,
|
||||
) -> PictureDescriptionFactory:
|
||||
|
@ -33,7 +33,7 @@ class BaseFactory(Generic[A], metaclass=ABCMeta):
|
||||
|
||||
@property
|
||||
def registered_kind(self) -> list[str]:
|
||||
return list(opt.kind for opt in self._classes.keys())
|
||||
return [opt.kind for opt in self._classes.keys()]
|
||||
|
||||
def get_enum(self) -> enum.Enum:
|
||||
return enum.Enum(
|
||||
|
@ -1,25 +1,22 @@
|
||||
import logging
|
||||
import time
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List, Optional
|
||||
from typing import Optional
|
||||
|
||||
from docling.datamodel.base_models import Page, VlmPrediction
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
AcceleratorOptions,
|
||||
HuggingFaceVlmOptions,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.utils.accelerator_utils import decide_device
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HuggingFaceMlxModel(BasePageModel):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
enabled: bool,
|
||||
@ -32,7 +29,6 @@ class HuggingFaceMlxModel(BasePageModel):
|
||||
self.vlm_options = vlm_options
|
||||
|
||||
if self.enabled:
|
||||
|
||||
try:
|
||||
from mlx_vlm import generate, load # type: ignore
|
||||
from mlx_vlm.prompt_utils import apply_chat_template # type: ignore
|
||||
@ -125,6 +121,8 @@ class HuggingFaceMlxModel(BasePageModel):
|
||||
generation_time = time.time() - start_time
|
||||
page_tags = output
|
||||
|
||||
_log.debug(f"Generation time {generation_time:.2f} seconds.")
|
||||
|
||||
# inference_time = time.time() - start_time
|
||||
# tokens_per_second = num_tokens / generation_time
|
||||
# print("")
|
||||
|
@ -1,16 +1,15 @@
|
||||
import logging
|
||||
import time
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List, Optional
|
||||
from typing import Optional
|
||||
|
||||
from docling.datamodel.base_models import Page, VlmPrediction
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
AcceleratorOptions,
|
||||
HuggingFaceVlmOptions,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import BasePageModel
|
||||
from docling.utils.accelerator_utils import decide_device
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
@ -19,7 +18,6 @@ _log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class HuggingFaceVlmModel(BasePageModel):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
enabled: bool,
|
||||
@ -42,7 +40,7 @@ class HuggingFaceVlmModel(BasePageModel):
|
||||
device = decide_device(accelerator_options.device)
|
||||
self.device = device
|
||||
|
||||
_log.debug("Available device for HuggingFace VLM: {}".format(device))
|
||||
_log.debug(f"Available device for HuggingFace VLM: {device}")
|
||||
|
||||
repo_cache_folder = vlm_options.repo_id.replace("/", "--")
|
||||
|
||||
@ -168,6 +166,10 @@ class HuggingFaceVlmModel(BasePageModel):
|
||||
num_tokens = len(generated_ids[0])
|
||||
page_tags = generated_texts
|
||||
|
||||
_log.debug(
|
||||
f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
|
||||
)
|
||||
|
||||
# inference_time = time.time() - start_time
|
||||
# tokens_per_second = num_tokens / generation_time
|
||||
# print("")
|
||||
|
@ -1,8 +1,9 @@
|
||||
import copy
|
||||
import logging
|
||||
import warnings
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Optional, Union
|
||||
from typing import Optional
|
||||
|
||||
from docling_core.types.doc import DocItemLabel
|
||||
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
||||
@ -142,7 +143,6 @@ class LayoutModel(BasePageModel):
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
|
||||
for page in page_batch:
|
||||
assert page._backend is not None
|
||||
if not page._backend.is_valid():
|
||||
|
@ -1,8 +1,9 @@
|
||||
import logging
|
||||
import sys
|
||||
import tempfile
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Optional, Tuple, Type
|
||||
from typing import Optional, Type
|
||||
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||
@ -41,7 +42,7 @@ class OcrMacModel(BaseOcrModel):
|
||||
|
||||
if self.enabled:
|
||||
if "darwin" != sys.platform:
|
||||
raise RuntimeError(f"OcrMac is only supported on Mac.")
|
||||
raise RuntimeError("OcrMac is only supported on Mac.")
|
||||
install_errmsg = (
|
||||
"ocrmac is not correctly installed. "
|
||||
"Please install it via `pip install ocrmac` to use this OCR engine. "
|
||||
@ -58,7 +59,6 @@ class OcrMacModel(BaseOcrModel):
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
|
||||
if not self.enabled:
|
||||
yield from page_batch
|
||||
return
|
||||
@ -69,7 +69,6 @@ class OcrMacModel(BaseOcrModel):
|
||||
yield page
|
||||
else:
|
||||
with TimeRecorder(conv_res, "ocr"):
|
||||
|
||||
ocr_rects = self.get_ocr_rects(page)
|
||||
|
||||
all_ocr_cells = []
|
||||
|
@ -1,6 +1,7 @@
|
||||
import logging
|
||||
import re
|
||||
from typing import Iterable, List
|
||||
from collections.abc import Iterable
|
||||
from typing import List
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
@ -53,9 +54,9 @@ class PageAssembleModel(BasePageModel):
|
||||
sanitized_text = "".join(lines)
|
||||
|
||||
# Text normalization
|
||||
sanitized_text = sanitized_text.replace("⁄", "/")
|
||||
sanitized_text = sanitized_text.replace("’", "'")
|
||||
sanitized_text = sanitized_text.replace("‘", "'")
|
||||
sanitized_text = sanitized_text.replace("⁄", "/") # noqa: RUF001
|
||||
sanitized_text = sanitized_text.replace("’", "'") # noqa: RUF001
|
||||
sanitized_text = sanitized_text.replace("‘", "'") # noqa: RUF001
|
||||
sanitized_text = sanitized_text.replace("“", '"')
|
||||
sanitized_text = sanitized_text.replace("”", '"')
|
||||
sanitized_text = sanitized_text.replace("•", "·")
|
||||
@ -71,7 +72,6 @@ class PageAssembleModel(BasePageModel):
|
||||
yield page
|
||||
else:
|
||||
with TimeRecorder(conv_res, "page_assemble"):
|
||||
|
||||
assert page.predictions.layout is not None
|
||||
|
||||
# assembles some JSON output page by page.
|
||||
@ -83,7 +83,6 @@ class PageAssembleModel(BasePageModel):
|
||||
for cluster in page.predictions.layout.clusters:
|
||||
# _log.info("Cluster label seen:", cluster.label)
|
||||
if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
|
||||
|
||||
textlines = [
|
||||
cell.text.replace("\x02", "-").strip()
|
||||
for cell in cluster.cells
|
||||
@ -109,9 +108,7 @@ class PageAssembleModel(BasePageModel):
|
||||
tbl = page.predictions.tablestructure.table_map.get(
|
||||
cluster.id, None
|
||||
)
|
||||
if (
|
||||
not tbl
|
||||
): # fallback: add table without structure, if it isn't present
|
||||
if not tbl: # fallback: add table without structure, if it isn't present
|
||||
tbl = Table(
|
||||
label=cluster.label,
|
||||
id=cluster.id,
|
||||
@ -130,9 +127,7 @@ class PageAssembleModel(BasePageModel):
|
||||
fig = page.predictions.figures_classification.figure_map.get(
|
||||
cluster.id, None
|
||||
)
|
||||
if (
|
||||
not fig
|
||||
): # fallback: add figure without classification, if it isn't present
|
||||
if not fig: # fallback: add figure without classification, if it isn't present
|
||||
fig = FigureElement(
|
||||
label=cluster.label,
|
||||
id=cluster.id,
|
||||
|
@ -1,5 +1,6 @@
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Optional
|
||||
from typing import Optional
|
||||
|
||||
from PIL import ImageDraw
|
||||
from pydantic import BaseModel
|
||||
|
@ -1,5 +1,6 @@
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Optional, Type, Union
|
||||
from typing import Optional, Type, Union
|
||||
|
||||
from PIL import Image
|
||||
|
||||
|
@ -1,12 +1,11 @@
|
||||
import logging
|
||||
from abc import abstractmethod
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterable, List, Optional, Type, Union
|
||||
from typing import List, Optional, Type, Union
|
||||
|
||||
from docling_core.types.doc import (
|
||||
DoclingDocument,
|
||||
NodeItem,
|
||||
PictureClassificationClass,
|
||||
PictureItem,
|
||||
)
|
||||
from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc
|
||||
|
@ -1,5 +1,6 @@
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Optional, Type, Union
|
||||
from typing import Optional, Type, Union
|
||||
|
||||
from PIL import Image
|
||||
|
||||
@ -13,7 +14,6 @@ from docling.utils.accelerator_utils import decide_device
|
||||
|
||||
|
||||
class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
|
||||
|
||||
@classmethod
|
||||
def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
|
||||
return PictureDescriptionVlmOptions
|
||||
@ -36,7 +36,6 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
|
||||
self.options: PictureDescriptionVlmOptions
|
||||
|
||||
if self.enabled:
|
||||
|
||||
if artifacts_path is None:
|
||||
artifacts_path = self.download_models(repo_id=self.options.repo_id)
|
||||
else:
|
||||
|
@ -1,6 +1,7 @@
|
||||
import logging
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Optional, Type
|
||||
from typing import Optional, Type
|
||||
|
||||
import numpy
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
@ -74,13 +75,11 @@ class RapidOcrModel(BaseOcrModel):
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
|
||||
if not self.enabled:
|
||||
yield from page_batch
|
||||
return
|
||||
|
||||
for page in page_batch:
|
||||
|
||||
assert page._backend is not None
|
||||
if not page._backend.is_valid():
|
||||
yield page
|
||||
|
@ -1,12 +1,7 @@
|
||||
import copy
|
||||
import random
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
|
||||
from docling_core.types.doc import (
|
||||
BoundingBox,
|
||||
CoordOrigin,
|
||||
DocItem,
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
DocumentOrigin,
|
||||
@ -17,13 +12,10 @@ from docling_core.types.doc import (
|
||||
TableData,
|
||||
)
|
||||
from docling_core.types.doc.document import ContentLayer
|
||||
from docling_core.types.legacy_doc.base import Ref
|
||||
from docling_core.types.legacy_doc.document import BaseText
|
||||
from docling_ibm_models.reading_order.reading_order_rb import (
|
||||
PageElement as ReadingOrderPageElement,
|
||||
ReadingOrderPredictor,
|
||||
)
|
||||
from docling_ibm_models.reading_order.reading_order_rb import ReadingOrderPredictor
|
||||
from PIL import ImageDraw
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
|
||||
from docling.datamodel.base_models import (
|
||||
@ -35,7 +27,6 @@ from docling.datamodel.base_models import (
|
||||
TextElement,
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||
|
||||
|
||||
@ -53,12 +44,10 @@ class ReadingOrderModel:
|
||||
def _assembled_to_readingorder_elements(
|
||||
self, conv_res: ConversionResult
|
||||
) -> List[ReadingOrderPageElement]:
|
||||
|
||||
elements: List[ReadingOrderPageElement] = []
|
||||
page_no_to_pages = {p.page_no: p for p in conv_res.pages}
|
||||
|
||||
for element in conv_res.assembled.elements:
|
||||
|
||||
page_height = page_no_to_pages[element.page_no].size.height # type: ignore
|
||||
bbox = element.cluster.bbox.to_bottom_left_origin(page_height)
|
||||
text = element.text or ""
|
||||
@ -84,7 +73,6 @@ class ReadingOrderModel:
|
||||
def _add_child_elements(
|
||||
self, element: BasePageElement, doc_item: NodeItem, doc: DoclingDocument
|
||||
):
|
||||
|
||||
child: Cluster
|
||||
for child in element.cluster.children:
|
||||
c_label = child.label
|
||||
@ -110,7 +98,7 @@ class ReadingOrderModel:
|
||||
else:
|
||||
doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)
|
||||
|
||||
def _readingorder_elements_to_docling_doc(
|
||||
def _readingorder_elements_to_docling_doc( # noqa: C901
|
||||
self,
|
||||
conv_res: ConversionResult,
|
||||
ro_elements: List[ReadingOrderPageElement],
|
||||
@ -118,7 +106,6 @@ class ReadingOrderModel:
|
||||
el_to_footnotes_mapping: Dict[int, List[int]],
|
||||
el_merges_mapping: Dict[int, List[int]],
|
||||
) -> DoclingDocument:
|
||||
|
||||
id_to_elem = {
|
||||
RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem
|
||||
for elem in conv_res.assembled.elements
|
||||
@ -192,7 +179,6 @@ class ReadingOrderModel:
|
||||
|
||||
code_item.footnotes.append(new_footnote_item.get_ref())
|
||||
else:
|
||||
|
||||
new_item, current_list = self._handle_text_element(
|
||||
element, out_doc, current_list, page_height
|
||||
)
|
||||
@ -206,7 +192,6 @@ class ReadingOrderModel:
|
||||
)
|
||||
|
||||
elif isinstance(element, Table):
|
||||
|
||||
tbl_data = TableData(
|
||||
num_rows=element.num_rows,
|
||||
num_cols=element.num_cols,
|
||||
@ -342,12 +327,12 @@ class ReadingOrderModel:
|
||||
return new_item, current_list
|
||||
|
||||
def _merge_elements(self, element, merged_elem, new_item, page_height):
|
||||
assert isinstance(
|
||||
merged_elem, type(element)
|
||||
), "Merged element must be of same type as element."
|
||||
assert (
|
||||
merged_elem.label == new_item.label
|
||||
), "Labels of merged elements must match."
|
||||
assert isinstance(merged_elem, type(element)), (
|
||||
"Merged element must be of same type as element."
|
||||
)
|
||||
assert merged_elem.label == new_item.label, (
|
||||
"Labels of merged elements must match."
|
||||
)
|
||||
prov = ProvenanceItem(
|
||||
page_no=element.page_no + 1,
|
||||
charspan=(
|
||||
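The assertion rewrites above are purely cosmetic: the ruff formatter prefers wrapping a long assert message in parentheses after the comma, so the condition itself stays on one readable line. A hedged before/after illustration with made-up names:

    expected = 42
    value = 42

    # Black-era wrapping parenthesized the condition to fit the line:
    # assert (
    #     value == expected
    # ), "value must match expected"

    # The ruff formatter keeps the condition intact and wraps only the message:
    assert value == expected, (
        "value must match expected"
    )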
|
@ -1,13 +1,13 @@
|
||||
import copy
|
||||
import warnings
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Optional, Union
|
||||
from typing import Optional
|
||||
|
||||
import numpy
|
||||
from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
|
||||
from docling_core.types.doc.page import (
|
||||
BoundingRectangle,
|
||||
SegmentedPdfPage,
|
||||
TextCellUnit,
|
||||
)
|
||||
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
|
||||
@ -44,7 +44,6 @@ class TableStructureModel(BasePageModel):
|
||||
|
||||
self.enabled = enabled
|
||||
if self.enabled:
|
||||
|
||||
if artifacts_path is None:
|
||||
artifacts_path = self.download_models() / self._model_path
|
||||
else:
|
||||
@ -175,7 +174,6 @@ class TableStructureModel(BasePageModel):
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
|
||||
if not self.enabled:
|
||||
yield from page_batch
|
||||
return
|
||||
@ -186,7 +184,6 @@ class TableStructureModel(BasePageModel):
|
||||
yield page
|
||||
else:
|
||||
with TimeRecorder(conv_res, "table_structure"):
|
||||
|
||||
assert page.predictions.layout is not None
|
||||
assert page.size is not None
|
||||
|
||||
@ -260,7 +257,6 @@ class TableStructureModel(BasePageModel):
|
||||
table_out = tf_output[0]
|
||||
table_cells = []
|
||||
for element in table_out["tf_responses"]:
|
||||
|
||||
if not self.do_cell_matching:
|
||||
the_bbox = BoundingBox.model_validate(
|
||||
element["bbox"]
|
||||
|
@ -3,9 +3,10 @@ import io
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from subprocess import DEVNULL, PIPE, Popen
|
||||
from typing import Iterable, List, Optional, Tuple, Type
|
||||
from typing import List, Optional, Tuple, Type
|
||||
|
||||
import pandas as pd
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
@ -63,8 +64,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
)
|
||||
|
||||
def _get_name_and_version(self) -> Tuple[str, str]:
|
||||
|
||||
if self._name != None and self._version != None:
|
||||
if self._name is not None and self._version is not None:
|
||||
return self._name, self._version # type: ignore
|
||||
|
||||
cmd = [self.options.tesseract_cmd, "--version"]
|
||||
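Replacing `!= None` with `is not None` addresses pycodestyle's E711: None is a singleton, so an identity check is both idiomatic and safer, since a class can override `__eq__` and compare equal to None. A short sketch with a deliberately pathological class:

    class AlwaysEqual:
        # A pathological __eq__ that claims equality with everything, None included.
        def __eq__(self, other):
            return True

    obj = AlwaysEqual()
    print(obj != None)      # False -- wrongly suggests obj "is" None (why E711 flags it)
    print(obj is not None)  # True  -- identity check, unaffected by __eq__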
@ -125,14 +125,16 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
# _log.info(decoded_data)
|
||||
|
||||
# Read the TSV file generated by Tesseract
|
||||
df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t")
|
||||
df_result = pd.read_csv(
|
||||
io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t"
|
||||
)
|
||||
|
||||
# Display the dataframe (optional)
|
||||
# _log.info("df: ", df.head())
|
||||
|
||||
# Filter rows that contain actual text (ignore header or empty rows)
|
||||
df_filtered = df[
|
||||
df["text"].notnull() & (df["text"].apply(str).str.strip() != "")
|
||||
df_filtered = df_result[
|
||||
df_result["text"].notna() & (df_result["text"].apply(str).str.strip() != "")
|
||||
]
|
||||
|
||||
return df_filtered
|
||||
@ -149,10 +151,10 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
|
||||
output, _ = proc.communicate()
|
||||
decoded_data = output.decode("utf-8")
|
||||
df = pd.read_csv(
|
||||
df_detected = pd.read_csv(
|
||||
io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
|
||||
)
|
||||
scripts = df.loc[df["key"] == "Script"].value.tolist()
|
||||
scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
|
||||
if len(scripts) == 0:
|
||||
_log.warning("Tesseract cannot detect the script of the page")
|
||||
return None
|
||||
@ -183,11 +185,11 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
|
||||
output, _ = proc.communicate()
|
||||
decoded_data = output.decode("utf-8")
|
||||
df = pd.read_csv(io.StringIO(decoded_data), header=None)
|
||||
self._tesseract_languages = df[0].tolist()[1:]
|
||||
df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
|
||||
self._tesseract_languages = df_list[0].tolist()[1:]
|
||||
|
||||
# Decide the script prefix
|
||||
if any([l.startswith("script/") for l in self._tesseract_languages]):
|
||||
if any(lang.startswith("script/") for lang in self._tesseract_languages):
|
||||
script_prefix = "script/"
|
||||
else:
|
||||
script_prefix = ""
|
||||
@ -197,7 +199,6 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
|
||||
if not self.enabled:
|
||||
yield from page_batch
|
||||
return
|
||||
@ -225,19 +226,19 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
fname = image_file.name
|
||||
high_res_image.save(image_file)
|
||||
|
||||
df = self._run_tesseract(fname)
|
||||
df_result = self._run_tesseract(fname)
|
||||
finally:
|
||||
if os.path.exists(fname):
|
||||
os.remove(fname)
|
||||
|
||||
# _log.info(df)
|
||||
# _log.info(df_result)
|
||||
|
||||
# Print relevant columns (bounding box and text)
|
||||
for ix, row in df.iterrows():
|
||||
for ix, row in df_result.iterrows():
|
||||
text = row["text"]
|
||||
conf = row["conf"]
|
||||
|
||||
l = float(row["left"])
|
||||
l = float(row["left"]) # noqa: E741
|
||||
b = float(row["top"])
|
||||
w = float(row["width"])
|
||||
h = float(row["height"])
|
||||
|
@ -1,6 +1,7 @@
|
||||
import logging
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Optional, Type
|
||||
from typing import Optional, Type
|
||||
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||
@ -37,9 +38,6 @@ class TesseractOcrModel(BaseOcrModel):
|
||||
self.options: TesseractOcrOptions
|
||||
|
||||
self.scale = 3 # multiplier for 72 dpi == 216 dpi.
|
||||
self.reader = None
|
||||
self.osd_reader = None
|
||||
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
|
||||
|
||||
if self.enabled:
|
||||
install_errmsg = (
|
||||
@ -64,7 +62,7 @@ class TesseractOcrModel(BaseOcrModel):
|
||||
raise ImportError(install_errmsg)
|
||||
try:
|
||||
tesseract_version = tesserocr.tesseract_version()
|
||||
except:
|
||||
except Exception:
|
||||
raise ImportError(install_errmsg)
|
||||
|
||||
_, self._tesserocr_languages = tesserocr.get_languages()
|
||||
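Swapping the bare `except:` for `except Exception:` satisfies ruff's E722: a bare clause also swallows KeyboardInterrupt and SystemExit, which should normally propagate. A minimal sketch of the difference, using an invented parsing helper:

    def parse_version(raw: str) -> tuple:
        try:
            return tuple(int(part) for part in raw.split("."))
        except Exception:
            # Catches ValueError and friends, but still lets KeyboardInterrupt and
            # SystemExit (BaseException subclasses, not Exception) propagate.
            return (0,)

    print(parse_version("5.3.0"))      # (5, 3, 0)
    print(parse_version("not-a-ver"))  # (0,)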
@ -75,7 +73,7 @@ class TesseractOcrModel(BaseOcrModel):
|
||||
_log.debug("Initializing TesserOCR: %s", tesseract_version)
|
||||
lang = "+".join(self.options.lang)
|
||||
|
||||
if any([l.startswith("script/") for l in self._tesserocr_languages]):
|
||||
if any(lang.startswith("script/") for lang in self._tesserocr_languages):
|
||||
self.script_prefix = "script/"
|
||||
else:
|
||||
self.script_prefix = ""
|
||||
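Two lint fixes meet in the hunk above: the list comprehension inside `any(...)` becomes a lazy generator expression, and the single-letter `l`, which E741 treats as ambiguous, becomes `lang`. A hedged sketch of the same check on sample data (the language list is invented):

    installed = ["eng", "deu", "script/Latin", "script/Cyrillic"]

    # Generator expression: any() can stop at the first match instead of first
    # building a full list of booleans.
    has_script_packs = any(lang.startswith("script/") for lang in installed)
    script_prefix = "script/" if has_script_packs else ""

    print(has_script_packs)  # True
    print(script_prefix)     # script/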
@ -86,6 +84,10 @@ class TesseractOcrModel(BaseOcrModel):
|
||||
"oem": tesserocr.OEM.DEFAULT,
|
||||
}
|
||||
|
||||
self.reader = None
|
||||
self.osd_reader = None
|
||||
self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
|
||||
|
||||
if self.options.path is not None:
|
||||
tesserocr_kwargs["path"] = self.options.path
|
||||
|
||||
|
@ -3,9 +3,10 @@ import logging
|
||||
import time
|
||||
import traceback
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any, Callable, Iterable, List
|
||||
from collections.abc import Iterable
|
||||
from typing import Any, Callable, List
|
||||
|
||||
from docling_core.types.doc import DoclingDocument, NodeItem
|
||||
from docling_core.types.doc import NodeItem
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||
@ -64,7 +65,6 @@ class BasePipeline(ABC):
|
||||
return conv_res
|
||||
|
||||
def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
|
||||
|
||||
def _prepare_elements(
|
||||
conv_res: ConversionResult, model: GenericEnrichmentModel[Any]
|
||||
) -> Iterable[NodeItem]:
|
||||
@ -113,7 +113,6 @@ class BasePipeline(ABC):
|
||||
|
||||
|
||||
class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
||||
|
||||
def __init__(self, pipeline_options: PipelineOptions):
|
||||
super().__init__(pipeline_options)
|
||||
self.keep_backend = False
|
||||
@ -127,7 +126,6 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
||||
yield from page_batch
|
||||
|
||||
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
|
||||
|
||||
if not isinstance(conv_res.input._backend, PdfDocumentBackend):
|
||||
raise RuntimeError(
|
||||
f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. "
|
||||
@ -139,8 +137,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
||||
|
||||
total_elapsed_time = 0.0
|
||||
with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
|
||||
|
||||
for i in range(0, conv_res.input.page_count):
|
||||
for i in range(conv_res.input.page_count):
|
||||
start_page, end_page = conv_res.input.limits.page_range
|
||||
if (start_page - 1) <= i <= (end_page - 1):
|
||||
conv_res.pages.append(Page(page_no=i))
|
||||
@ -161,7 +158,6 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
|
||||
pipeline_pages = self._apply_on_pages(conv_res, init_pages)
|
||||
|
||||
for p in pipeline_pages: # Must exhaust!
|
||||
|
||||
# Cleanup cached images
|
||||
if not self.keep_images:
|
||||
p._image_cache = {}
|
||||
|
@ -24,7 +24,6 @@ class SimplePipeline(BasePipeline):
|
||||
super().__init__(pipeline_options)
|
||||
|
||||
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
|
||||
|
||||
if not isinstance(conv_res.input._backend, DeclarativeDocumentBackend):
|
||||
raise RuntimeError(
|
||||
f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a declarative backend. "
|
||||
|
@ -1,5 +1,4 @@
|
||||
import logging
|
||||
import sys
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from typing import Optional, cast
|
||||
|
@ -1,5 +1,4 @@
|
||||
import logging
|
||||
import warnings
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Union, cast
|
||||
@ -32,7 +31,6 @@ _log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class VlmPipeline(PaginatedPipeline):
|
||||
|
||||
def __init__(self, pipeline_options: VlmPipelineOptions):
|
||||
super().__init__(pipeline_options)
|
||||
self.keep_backend = True
|
||||
@ -114,7 +112,6 @@ class VlmPipeline(PaginatedPipeline):
|
||||
|
||||
def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
|
||||
with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
|
||||
|
||||
if (
|
||||
self.pipeline_options.vlm_options.response_format
|
||||
== ResponseFormat.DOCTAGS
|
||||
|
@ -1,8 +1,8 @@
|
||||
import logging
|
||||
from typing import Any, Dict, Iterable, List, Tuple, Union
|
||||
from collections.abc import Iterable
|
||||
from typing import Any, Dict, List, Tuple, Union
|
||||
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
from docling_core.types.doc.page import TextCell
|
||||
from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
|
||||
|
||||
from docling.datamodel.document import ConversionResult, Page
|
||||
@ -13,7 +13,6 @@ _log = logging.getLogger(__name__)
|
||||
def generate_multimodal_pages(
|
||||
doc_result: ConversionResult,
|
||||
) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
|
||||
|
||||
label_to_doclaynet = {
|
||||
"title": "title",
|
||||
"table-of-contents": "document_index",
|
||||
@ -122,7 +121,6 @@ def generate_multimodal_pages(
|
||||
if doc.main_text is None:
|
||||
return
|
||||
for ix, orig_item in enumerate(doc.main_text):
|
||||
|
||||
item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item
|
||||
if item is None or item.prov is None or len(item.prov) == 0:
|
||||
_log.debug(f"Skipping item {orig_item}")
|
||||
|
@ -29,7 +29,7 @@ def resolve_item(paths, obj):
|
||||
|
||||
try:
|
||||
key = int(paths[0])
|
||||
except:
|
||||
except Exception:
|
||||
key = paths[0]
|
||||
|
||||
if len(paths) == 1:
|
||||
@ -67,7 +67,7 @@ def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:
|
||||
return unique_objects
|
||||
|
||||
|
||||
def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
|
||||
def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument: # noqa: C901
|
||||
origin = DocumentOrigin(
|
||||
mimetype="application/pdf",
|
||||
filename=doc_glm["file-info"]["filename"],
|
||||
|
@ -18,7 +18,7 @@ class UnionFind:
|
||||
|
||||
def __init__(self, elements):
|
||||
self.parent = {elem: elem for elem in elements}
|
||||
self.rank = {elem: 0 for elem in elements}
|
||||
self.rank = dict.fromkeys(elements, 0)
|
||||
|
||||
def find(self, x):
|
||||
if self.parent[x] != x:
|
||||
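The rank initialization now uses `dict.fromkeys`, which ruff suggests whenever every key maps to the same constant; the result is identical to the dict comprehension. A sketch with illustrative data:

    elements = ["a", "b", "c"]

    # Same result as {elem: 0 for elem in elements}; this is only safe because 0 is
    # immutable -- a mutable default passed to fromkeys would be shared by every key.
    rank = dict.fromkeys(elements, 0)
    parent = {elem: elem for elem in elements}  # values differ per key, so the comprehension stays

    print(rank)    # {'a': 0, 'b': 0, 'c': 0}
    print(parent)  # {'a': 'a', 'b': 'b', 'c': 'c'}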
@ -484,7 +484,9 @@ class LayoutPostprocessor:
|
||||
spatial_index = (
|
||||
self.regular_index
|
||||
if cluster_type == "regular"
|
||||
else self.picture_index if cluster_type == "picture" else self.wrapper_index
|
||||
else self.picture_index
|
||||
if cluster_type == "picture"
|
||||
else self.wrapper_index
|
||||
)
|
||||
|
||||
# Map of currently valid clusters
|
||||
|
@ -37,7 +37,7 @@ def download_models(
|
||||
output_dir.mkdir(exist_ok=True, parents=True)
|
||||
|
||||
if with_layout:
|
||||
_log.info(f"Downloading layout model...")
|
||||
_log.info("Downloading layout model...")
|
||||
LayoutModel.download_models(
|
||||
local_dir=output_dir / LayoutModel._model_repo_folder,
|
||||
force=force,
|
||||
@ -45,7 +45,7 @@ def download_models(
|
||||
)
|
||||
|
||||
if with_tableformer:
|
||||
_log.info(f"Downloading tableformer model...")
|
||||
_log.info("Downloading tableformer model...")
|
||||
TableStructureModel.download_models(
|
||||
local_dir=output_dir / TableStructureModel._model_repo_folder,
|
||||
force=force,
|
||||
@ -53,7 +53,7 @@ def download_models(
|
||||
)
|
||||
|
||||
if with_picture_classifier:
|
||||
_log.info(f"Downloading picture classifier model...")
|
||||
_log.info("Downloading picture classifier model...")
|
||||
DocumentPictureClassifier.download_models(
|
||||
local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
|
||||
force=force,
|
||||
@ -61,7 +61,7 @@ def download_models(
|
||||
)
|
||||
|
||||
if with_code_formula:
|
||||
_log.info(f"Downloading code formula model...")
|
||||
_log.info("Downloading code formula model...")
|
||||
CodeFormulaModel.download_models(
|
||||
local_dir=output_dir / CodeFormulaModel._model_repo_folder,
|
||||
force=force,
|
||||
@ -69,7 +69,7 @@ def download_models(
|
||||
)
|
||||
|
||||
if with_smolvlm:
|
||||
_log.info(f"Downloading SmolVlm model...")
|
||||
_log.info("Downloading SmolVlm model...")
|
||||
PictureDescriptionVlmModel.download_models(
|
||||
repo_id=smolvlm_picture_description.repo_id,
|
||||
local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
|
||||
@ -78,7 +78,7 @@ def download_models(
|
||||
)
|
||||
|
||||
if with_granite_vision:
|
||||
_log.info(f"Downloading Granite Vision model...")
|
||||
_log.info("Downloading Granite Vision model...")
|
||||
PictureDescriptionVlmModel.download_models(
|
||||
repo_id=granite_picture_description.repo_id,
|
||||
local_dir=output_dir / granite_picture_description.repo_cache_folder,
|
||||
@ -87,7 +87,7 @@ def download_models(
|
||||
)
|
||||
|
||||
if with_easyocr:
|
||||
_log.info(f"Downloading easyocr models...")
|
||||
_log.info("Downloading easyocr models...")
|
||||
EasyOcrModel.download_models(
|
||||
local_dir=output_dir / EasyOcrModel._model_repo_folder,
|
||||
force=force,
|
||||
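The logging calls above drop the `f` prefix because the strings contain no placeholders; ruff's F541 flags such f-strings as noise. When a value does get interpolated, lazy %-style arguments are the usual logging idiom. A small standalone sketch (logger name and message are invented):

    import logging

    logging.basicConfig(level=logging.INFO)
    _log = logging.getLogger("models")

    _log.info("Downloading layout model...")  # plain string: no placeholders needed
    _log.info("Downloaded %d files", 3)       # lazy interpolation, only done if the record is emitted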
|
@ -13,7 +13,7 @@ def chunkify(iterator, chunk_size):
|
||||
if isinstance(iterator, List):
|
||||
iterator = iter(iterator)
|
||||
for first in iterator: # Take the first element from the iterator
|
||||
yield [first] + list(islice(iterator, chunk_size - 1))
|
||||
yield [first, *list(islice(iterator, chunk_size - 1))]
|
||||
|
||||
|
||||
def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:
|
||||
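The chunking helper now builds its list with star-unpacking instead of list concatenation, as ruff's RUF005 recommends; the behaviour is unchanged. A slightly condensed sketch of the same chunking pattern on plain data:

    from itertools import islice

    def chunkify(iterable, chunk_size):
        iterator = iter(iterable)
        for first in iterator:
            # [first, *rest] instead of [first] + rest (RUF005)
            yield [first, *islice(iterator, chunk_size - 1)]

    print(list(chunkify(range(7), 3)))  # [[0, 1, 2], [3, 4, 5], [6]]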
|
@ -383,7 +383,7 @@
|
||||
"\n",
|
||||
"print(f\"Downloading {url}...\")\n",
|
||||
"buf = BytesIO(requests.get(url).content)\n",
|
||||
"print(f\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n",
|
||||
"print(\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n",
|
||||
"with zipfile.ZipFile(buf) as zf:\n",
|
||||
" res = zf.testzip()\n",
|
||||
" if res:\n",
|
||||
@ -544,7 +544,7 @@
|
||||
"source": [
|
||||
"doc = backend.convert()\n",
|
||||
"\n",
|
||||
"claims_sec = [item for item in doc.texts if item.text == \"CLAIMS\"][0]\n",
|
||||
"claims_sec = next(item for item in doc.texts if item.text == \"CLAIMS\")\n",
|
||||
"print(f'Patent \"{doc.texts[0].text}\" has {len(claims_sec.children)} claims')"
|
||||
]
|
||||
},
|
||||
|
@ -1,8 +1,8 @@
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
import yaml
|
||||
from docling_core.types.doc import ImageRefMode
|
||||
@ -11,7 +11,6 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
|
||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
@ -3,7 +3,6 @@ import logging
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
@ -11,9 +10,6 @@ from docling.datamodel.pipeline_options import (
|
||||
PdfPipelineOptions,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.models.ocr_mac_model import OcrMacOptions
|
||||
from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
|
||||
from docling.models.tesseract_ocr_model import TesseractOcrOptions
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
@ -3,8 +3,8 @@
|
||||
# It does not run the actual formula understanding model.
|
||||
|
||||
import logging
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
from docling_core.types.doc import DocItemLabel, DoclingDocument, NodeItem, TextItem
|
||||
|
||||
@ -49,7 +49,6 @@ class ExampleFormulaUnderstandingEnrichmentModel(BaseItemAndImageEnrichmentModel
|
||||
|
||||
# How the pipeline can be extended.
|
||||
class ExampleFormulaUnderstandingPipeline(StandardPdfPipeline):
|
||||
|
||||
def __init__(self, pipeline_options: ExampleFormulaUnderstandingPipelineOptions):
|
||||
super().__init__(pipeline_options)
|
||||
self.pipeline_options: ExampleFormulaUnderstandingPipelineOptions
|
||||
@ -85,7 +84,7 @@ def main():
|
||||
)
|
||||
}
|
||||
)
|
||||
result = doc_converter.convert(input_doc_path)
|
||||
doc_converter.convert(input_doc_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -3,8 +3,9 @@
|
||||
# It does not run the actual picture classifier model.
|
||||
|
||||
import logging
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterable
|
||||
from typing import Any
|
||||
|
||||
from docling_core.types.doc import (
|
||||
DoclingDocument,
|
||||
|
@ -4,7 +4,7 @@ from pathlib import Path
|
||||
|
||||
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem
|
||||
|
||||
from docling.datamodel.base_models import FigureElement, InputFormat, Table
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
|
@ -51,7 +51,6 @@ def main():
|
||||
page_segments,
|
||||
page,
|
||||
) in generate_multimodal_pages(conv_res):
|
||||
|
||||
dpi = page._default_image_scale * 72
|
||||
|
||||
rows.append(
|
||||
@ -81,10 +80,10 @@ def main():
|
||||
)
|
||||
|
||||
# Generate one parquet from all documents
|
||||
df = pd.json_normalize(rows)
|
||||
df_result = pd.json_normalize(rows)
|
||||
now = datetime.datetime.now()
|
||||
output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
|
||||
df.to_parquet(output_filename)
|
||||
df_result.to_parquet(output_filename)
|
||||
|
||||
end_time = time.time() - start_time
|
||||
|
||||
|
@ -32,12 +32,12 @@ def main():
|
||||
print(table_df.to_markdown())
|
||||
|
||||
# Save the table as csv
|
||||
element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
|
||||
element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.csv"
|
||||
_log.info(f"Saving CSV table to {element_csv_filename}")
|
||||
table_df.to_csv(element_csv_filename)
|
||||
|
||||
# Save the table as html
|
||||
element_html_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.html"
|
||||
element_html_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.html"
|
||||
_log.info(f"Saving HTML table to {element_html_filename}")
|
||||
with element_html_filename.open("w") as fp:
|
||||
fp.write(table.export_to_html(doc=conv_res.document))
|
||||
|
@ -1,14 +1,9 @@
|
||||
from pathlib import Path
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
OcrMacOptions,
|
||||
PdfPipelineOptions,
|
||||
RapidOcrOptions,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
|
@ -153,10 +153,10 @@
|
||||
"source": [
|
||||
"for i, chunk in enumerate(chunk_iter):\n",
|
||||
" print(f\"=== {i} ===\")\n",
|
||||
" print(f\"chunk.text:\\n{repr(f'{chunk.text[:300]}…')}\")\n",
|
||||
" print(f\"chunk.text:\\n{f'{chunk.text[:300]}…'!r}\")\n",
|
||||
"\n",
|
||||
" enriched_text = chunker.serialize(chunk=chunk)\n",
|
||||
" print(f\"chunker.serialize(chunk):\\n{repr(f'{enriched_text[:300]}…')}\")\n",
|
||||
" print(f\"chunker.serialize(chunk):\\n{f'{enriched_text[:300]}…'!r}\")\n",
|
||||
"\n",
|
||||
" print()"
|
||||
]
|
||||
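The notebook cells switch from wrapping values in `repr(...)` to the `!r` conversion flag inside the f-string, per ruff's explicit-conversion rule; both forms print the same thing. A quick illustration with an invented string:

    chunk_text = "Iterable moved to collections.abc\nin Python 3.9+"

    # Equivalent outputs; ruff prefers the conversion flag over calling repr() in the braces.
    print(f"chunk.text:\n{repr(chunk_text[:30])}")
    print(f"chunk.text:\n{chunk_text[:30]!r}")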
@ -353,11 +353,11 @@
|
||||
"for i, chunk in enumerate(chunks):\n",
|
||||
" print(f\"=== {i} ===\")\n",
|
||||
" txt_tokens = len(tokenizer.tokenize(chunk.text))\n",
|
||||
" print(f\"chunk.text ({txt_tokens} tokens):\\n{repr(chunk.text)}\")\n",
|
||||
" print(f\"chunk.text ({txt_tokens} tokens):\\n{chunk.text!r}\")\n",
|
||||
"\n",
|
||||
" ser_txt = chunker.serialize(chunk=chunk)\n",
|
||||
" ser_tokens = len(tokenizer.tokenize(ser_txt))\n",
|
||||
" print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{repr(ser_txt)}\")\n",
|
||||
" print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{ser_txt!r}\")\n",
|
||||
"\n",
|
||||
" print()"
|
||||
]
|
||||
|
@ -2,17 +2,14 @@ import json
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
from docling_core.types.doc import DocItemLabel, ImageRefMode
|
||||
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
|
||||
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
VlmPipelineOptions,
|
||||
granite_vision_vlm_conversion_options,
|
||||
smoldocling_vlm_conversion_options,
|
||||
smoldocling_vlm_mlx_conversion_options,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.pipeline.vlm_pipeline import VlmPipeline
|
||||
|
||||
@ -39,9 +36,6 @@ pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
|
||||
## Alternative VLM models:
|
||||
# pipeline_options.vlm_options = granite_vision_vlm_conversion_options
|
||||
|
||||
from docling_core.types.doc import DocItemLabel, ImageRefMode
|
||||
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
|
||||
|
||||
## Set up pipeline for PDF or image inputs
|
||||
converter = DocumentConverter(
|
||||
format_options={
|
||||
@ -62,7 +56,7 @@ out_path.mkdir(parents=True, exist_ok=True)
|
||||
for source in sources:
|
||||
start_time = time.time()
|
||||
print("================================================")
|
||||
print("Processing... {}".format(source))
|
||||
print(f"Processing... {source}")
|
||||
print("================================================")
|
||||
print("")
|
||||
|
||||
@ -77,7 +71,7 @@ for source in sources:
|
||||
print(page.predictions.vlm_response.text)
|
||||
|
||||
res.document.save_as_html(
|
||||
filename=Path("{}/{}.html".format(out_path, res.input.file.stem)),
|
||||
filename=Path(f"{out_path}/{res.input.file.stem}.html"),
|
||||
image_mode=ImageRefMode.REFERENCED,
|
||||
labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
|
||||
)
|
||||
|
@ -144,7 +144,7 @@
|
||||
"for pic in doc.pictures[:5]:\n",
|
||||
" html_item = (\n",
|
||||
" f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n",
|
||||
" f'<img src=\"{str(pic.image.uri)}\" /><br />'\n",
|
||||
" f'<img src=\"{pic.image.uri!s}\" /><br />'\n",
|
||||
" f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n",
|
||||
" )\n",
|
||||
" for annotation in pic.annotations:\n",
|
||||
@ -252,7 +252,7 @@
|
||||
"for pic in doc.pictures[:5]:\n",
|
||||
" html_item = (\n",
|
||||
" f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n",
|
||||
" f'<img src=\"{str(pic.image.uri)}\" /><br />'\n",
|
||||
" f'<img src=\"{pic.image.uri!s}\" /><br />'\n",
|
||||
" f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n",
|
||||
" )\n",
|
||||
" for annotation in pic.annotations:\n",
|
||||
|
@ -283,7 +283,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
@ -369,7 +369,7 @@
|
||||
" new_index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)\n",
|
||||
" try:\n",
|
||||
" index_client.delete_index(index_name)\n",
|
||||
" except:\n",
|
||||
" except Exception:\n",
|
||||
" pass\n",
|
||||
"\n",
|
||||
" index_client.create_or_update_index(new_index)\n",
|
||||
@ -487,7 +487,7 @@
|
||||
"\n",
|
||||
" all_succeeded = all(r.succeeded for r in resp)\n",
|
||||
" console.print(\n",
|
||||
" f\"Uploaded batch {i} -> {i+len(subset)}; all_succeeded: {all_succeeded}, \"\n",
|
||||
" f\"Uploaded batch {i} -> {i + len(subset)}; all_succeeded: {all_succeeded}, \"\n",
|
||||
" f\"first_doc_status_code: {resp[0].status_code}\"\n",
|
||||
" )\n",
|
||||
"\n",
|
||||
@ -807,10 +807,12 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from typing import Optional\n",
|
||||
"\n",
|
||||
"from azure.search.documents.models import VectorizableTextQuery\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def generate_chat_response(prompt: str, system_message: str = None):\n",
|
||||
"def generate_chat_response(prompt: str, system_message: Optional[str] = None):\n",
|
||||
" \"\"\"\n",
|
||||
" Generates a single-turn chat response using Azure OpenAI Chat.\n",
|
||||
" If you need multi-turn conversation or follow-up queries, you'll have to\n",
|
||||
|
@ -351,7 +351,7 @@
|
||||
"for source in sources:\n",
|
||||
" if EXPORT_TYPE == ExportType.DOC_CHUNKS:\n",
|
||||
" doc_chunk = DocChunk.model_validate(source.meta[\"dl_meta\"])\n",
|
||||
" print(f\"- text: {repr(doc_chunk.text)}\")\n",
|
||||
" print(f\"- text: {doc_chunk.text!r}\")\n",
|
||||
" if doc_chunk.meta.origin:\n",
|
||||
" print(f\" file: {doc_chunk.meta.origin.filename}\")\n",
|
||||
" if doc_chunk.meta.headings:\n",
|
||||
|
@ -341,7 +341,7 @@
|
||||
"print(f\"Question:\\n{resp_dict['input']}\\n\\nAnswer:\\n{clipped_answer}\")\n",
|
||||
"for i, doc in enumerate(resp_dict[\"context\"]):\n",
|
||||
" print()\n",
|
||||
" print(f\"Source {i+1}:\")\n",
|
||||
" print(f\"Source {i + 1}:\")\n",
|
||||
" print(f\" text: {json.dumps(clip_text(doc.page_content, threshold=350))}\")\n",
|
||||
" for key in doc.metadata:\n",
|
||||
" if key != \"pk\":\n",
|
||||
|
@ -59,7 +59,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true,
|
||||
"id": "u076oUSF_YUG"
|
||||
@ -72,12 +72,11 @@
|
||||
"%pip install rich\n",
|
||||
"%pip install torch\n",
|
||||
"\n",
|
||||
"import logging\n",
|
||||
"import warnings\n",
|
||||
"\n",
|
||||
"warnings.filterwarnings(\"ignore\")\n",
|
||||
"\n",
|
||||
"import logging\n",
|
||||
"\n",
|
||||
"# Suppress Weaviate client logs\n",
|
||||
"logging.getLogger(\"weaviate\").setLevel(logging.ERROR)"
|
||||
]
|
||||
@ -119,7 +118,7 @@
|
||||
" device = torch.device(\"mps\")\n",
|
||||
" print(\"MPS GPU is enabled.\")\n",
|
||||
"else:\n",
|
||||
" raise EnvironmentError(\n",
|
||||
" raise OSError(\n",
|
||||
" \"No GPU or MPS device found. Please check your environment and ensure GPU or MPS support is configured.\"\n",
|
||||
" )"
|
||||
]
|
||||
@ -226,7 +225,6 @@
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from docling.datamodel.document import ConversionResult\n",
|
||||
"from docling.document_converter import DocumentConverter\n",
|
||||
"\n",
|
||||
"# Instantiate the doc converter\n",
|
||||
@ -345,7 +343,7 @@
|
||||
"\n",
|
||||
" openai_api_key = os.getenv(openai_api_key_var)\n",
|
||||
" if not openai_api_key:\n",
|
||||
" raise EnvironmentError(\n",
|
||||
" raise OSError(\n",
|
||||
" f\"Environment variable '{openai_api_key_var}' is not set. \"\n",
|
||||
" \"Please define it before running this script.\"\n",
|
||||
" )"
|
||||
@ -387,7 +385,6 @@
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import weaviate.classes.config as wc\n",
|
||||
"from weaviate.classes.config import DataType, Property\n",
|
||||
"\n",
|
||||
"# Define the collection name\n",
|
||||
"collection_name = \"docling\"\n",
|
||||
|
@ -25,9 +25,7 @@ def main():
|
||||
document = mdb.convert()
|
||||
|
||||
out_path = Path("scratch")
|
||||
print(
|
||||
f"Document {path} converted." f"\nSaved markdown output to: {str(out_path)}"
|
||||
)
|
||||
print(f"Document {path} converted.\nSaved markdown output to: {out_path!s}")
|
||||
|
||||
# Export Docling document format to markdowndoc:
|
||||
fn = os.path.basename(path)
|
||||
|
@ -1,13 +1,10 @@
|
||||
from pathlib import Path
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
AcceleratorOptions,
|
||||
PdfPipelineOptions,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
@ -63,7 +63,7 @@ def main():
|
||||
out_path = Path("scratch")
|
||||
print(
|
||||
f"Document {res.input.file.name} converted."
|
||||
f"\nSaved markdown output to: {str(out_path)}"
|
||||
f"\nSaved markdown output to: {out_path!s}"
|
||||
)
|
||||
_log.debug(res.document._export_to_indented_text(max_text_len=16))
|
||||
# Export Docling document format to markdowndoc:
|
||||
|
@ -4,7 +4,6 @@ from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import (
|
||||
PdfPipelineOptions,
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
|
@ -2,9 +2,9 @@ import logging
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from docling_core.types.doc import ImageRefMode, PictureItem, TableItem, TextItem
|
||||
from docling_core.types.doc import ImageRefMode, TableItem, TextItem
|
||||
|
||||
from docling.datamodel.base_models import FigureElement, InputFormat, Table
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
@ -15,7 +15,6 @@ IMAGE_RESOLUTION_SCALE = 2.0
|
||||
|
||||
# FIXME: put in your favorite translation code ....
|
||||
def translate(text: str, src: str = "en", dest: str = "de"):
|
||||
|
||||
_log.warning("!!! IMPLEMENT HERE YOUR FAVORITE TRANSLATION CODE!!!")
|
||||
# from googletrans import Translator
|
||||
|
||||
@ -52,10 +51,9 @@ def main():
|
||||
}
|
||||
)
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
conv_res = doc_converter.convert(input_doc_path)
|
||||
conv_doc = conv_res.document
|
||||
doc_filename = conv_res.input.file
|
||||
|
||||
# Save markdown with embedded pictures in original text
|
||||
md_filename = output_dir / f"{doc_filename}-with-images-orig.md"
|
||||
|
@ -432,7 +432,7 @@
|
||||
"\n",
|
||||
"for i, doc in enumerate(resp_dict[\"context\"][:]):\n",
|
||||
" image_by_page = {}\n",
|
||||
" print(f\"Source {i+1}:\")\n",
|
||||
" print(f\"Source {i + 1}:\")\n",
|
||||
" print(f\" text: {json.dumps(clip_text(doc.page_content, threshold=350))}\")\n",
|
||||
" meta = DocMeta.model_validate(doc.metadata[\"dl_meta\"])\n",
|
||||
"\n",
|
||||
|
@ -10,7 +10,6 @@ from docling.datamodel.pipeline_options import (
|
||||
ApiVlmOptions,
|
||||
ResponseFormat,
|
||||
VlmPipelineOptions,
|
||||
granite_vision_vlm_ollama_conversion_options,
|
||||
)
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.pipeline.vlm_pipeline import VlmPipeline
|
||||
|
98
poetry.lock
generated
@ -692,6 +692,84 @@ traitlets = ">=4"
|
||||
[package.extras]
|
||||
test = ["pytest"]
|
||||
|
||||
[[package]]
|
||||
name = "coverage"
|
||||
version = "7.8.0"
|
||||
description = "Code coverage measurement for Python"
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
files = [
|
||||
{file = "coverage-7.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2931f66991175369859b5fd58529cd4b73582461877ecfd859b6549869287ffe"},
|
||||
{file = "coverage-7.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:52a523153c568d2c0ef8826f6cc23031dc86cffb8c6aeab92c4ff776e7951b28"},
|
||||
{file = "coverage-7.8.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c8a5c139aae4c35cbd7cadca1df02ea8cf28a911534fc1b0456acb0b14234f3"},
|
||||
{file = "coverage-7.8.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5a26c0c795c3e0b63ec7da6efded5f0bc856d7c0b24b2ac84b4d1d7bc578d676"},
|
||||
{file = "coverage-7.8.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:821f7bcbaa84318287115d54becb1915eece6918136c6f91045bb84e2f88739d"},
|
||||
{file = "coverage-7.8.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a321c61477ff8ee705b8a5fed370b5710c56b3a52d17b983d9215861e37b642a"},
|
||||
{file = "coverage-7.8.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:ed2144b8a78f9d94d9515963ed273d620e07846acd5d4b0a642d4849e8d91a0c"},
|
||||
{file = "coverage-7.8.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:042e7841a26498fff7a37d6fda770d17519982f5b7d8bf5278d140b67b61095f"},
|
||||
{file = "coverage-7.8.0-cp310-cp310-win32.whl", hash = "sha256:f9983d01d7705b2d1f7a95e10bbe4091fabc03a46881a256c2787637b087003f"},
|
||||
{file = "coverage-7.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:5a570cd9bd20b85d1a0d7b009aaf6c110b52b5755c17be6962f8ccd65d1dbd23"},
|
||||
{file = "coverage-7.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e7ac22a0bb2c7c49f441f7a6d46c9c80d96e56f5a8bc6972529ed43c8b694e27"},
|
||||
{file = "coverage-7.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bf13d564d310c156d1c8e53877baf2993fb3073b2fc9f69790ca6a732eb4bfea"},
|
||||
{file = "coverage-7.8.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5761c70c017c1b0d21b0815a920ffb94a670c8d5d409d9b38857874c21f70d7"},
|
||||
{file = "coverage-7.8.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e5ff52d790c7e1628241ffbcaeb33e07d14b007b6eb00a19320c7b8a7024c040"},
|
||||
{file = "coverage-7.8.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d39fc4817fd67b3915256af5dda75fd4ee10621a3d484524487e33416c6f3543"},
|
||||
{file = "coverage-7.8.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b44674870709017e4b4036e3d0d6c17f06a0e6d4436422e0ad29b882c40697d2"},
|
||||
{file = "coverage-7.8.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8f99eb72bf27cbb167b636eb1726f590c00e1ad375002230607a844d9e9a2318"},
|
||||
{file = "coverage-7.8.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b571bf5341ba8c6bc02e0baeaf3b061ab993bf372d982ae509807e7f112554e9"},
|
||||
{file = "coverage-7.8.0-cp311-cp311-win32.whl", hash = "sha256:e75a2ad7b647fd8046d58c3132d7eaf31b12d8a53c0e4b21fa9c4d23d6ee6d3c"},
|
||||
{file = "coverage-7.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:3043ba1c88b2139126fc72cb48574b90e2e0546d4c78b5299317f61b7f718b78"},
|
||||
{file = "coverage-7.8.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:bbb5cc845a0292e0c520656d19d7ce40e18d0e19b22cb3e0409135a575bf79fc"},
|
||||
{file = "coverage-7.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4dfd9a93db9e78666d178d4f08a5408aa3f2474ad4d0e0378ed5f2ef71640cb6"},
|
||||
{file = "coverage-7.8.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f017a61399f13aa6d1039f75cd467be388d157cd81f1a119b9d9a68ba6f2830d"},
|
||||
{file = "coverage-7.8.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0915742f4c82208ebf47a2b154a5334155ed9ef9fe6190674b8a46c2fb89cb05"},
|
||||
{file = "coverage-7.8.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a40fcf208e021eb14b0fac6bdb045c0e0cab53105f93ba0d03fd934c956143a"},
|
||||
{file = "coverage-7.8.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a1f406a8e0995d654b2ad87c62caf6befa767885301f3b8f6f73e6f3c31ec3a6"},
|
||||
{file = "coverage-7.8.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:77af0f6447a582fdc7de5e06fa3757a3ef87769fbb0fdbdeba78c23049140a47"},
|
||||
{file = "coverage-7.8.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:f2d32f95922927186c6dbc8bc60df0d186b6edb828d299ab10898ef3f40052fe"},
|
||||
{file = "coverage-7.8.0-cp312-cp312-win32.whl", hash = "sha256:769773614e676f9d8e8a0980dd7740f09a6ea386d0f383db6821df07d0f08545"},
|
||||
{file = "coverage-7.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:e5d2b9be5b0693cf21eb4ce0ec8d211efb43966f6657807f6859aab3814f946b"},
|
||||
{file = "coverage-7.8.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5ac46d0c2dd5820ce93943a501ac5f6548ea81594777ca585bf002aa8854cacd"},
|
||||
{file = "coverage-7.8.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:771eb7587a0563ca5bb6f622b9ed7f9d07bd08900f7589b4febff05f469bea00"},
|
||||
{file = "coverage-7.8.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42421e04069fb2cbcbca5a696c4050b84a43b05392679d4068acbe65449b5c64"},
|
||||
{file = "coverage-7.8.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:554fec1199d93ab30adaa751db68acec2b41c5602ac944bb19187cb9a41a8067"},
|
||||
{file = "coverage-7.8.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5aaeb00761f985007b38cf463b1d160a14a22c34eb3f6a39d9ad6fc27cb73008"},
|
||||
{file = "coverage-7.8.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:581a40c7b94921fffd6457ffe532259813fc68eb2bdda60fa8cc343414ce3733"},
|
||||
{file = "coverage-7.8.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:f319bae0321bc838e205bf9e5bc28f0a3165f30c203b610f17ab5552cff90323"},
|
||||
{file = "coverage-7.8.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:04bfec25a8ef1c5f41f5e7e5c842f6b615599ca8ba8391ec33a9290d9d2db3a3"},
|
||||
{file = "coverage-7.8.0-cp313-cp313-win32.whl", hash = "sha256:dd19608788b50eed889e13a5d71d832edc34fc9dfce606f66e8f9f917eef910d"},
|
||||
{file = "coverage-7.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:a9abbccd778d98e9c7e85038e35e91e67f5b520776781d9a1e2ee9d400869487"},
|
||||
{file = "coverage-7.8.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:18c5ae6d061ad5b3e7eef4363fb27a0576012a7447af48be6c75b88494c6cf25"},
|
||||
{file = "coverage-7.8.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:95aa6ae391a22bbbce1b77ddac846c98c5473de0372ba5c463480043a07bff42"},
|
||||
{file = "coverage-7.8.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e013b07ba1c748dacc2a80e69a46286ff145935f260eb8c72df7185bf048f502"},
|
||||
{file = "coverage-7.8.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d766a4f0e5aa1ba056ec3496243150698dc0481902e2b8559314368717be82b1"},
|
||||
{file = "coverage-7.8.0-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad80e6b4a0c3cb6f10f29ae4c60e991f424e6b14219d46f1e7d442b938ee68a4"},
|
||||
{file = "coverage-7.8.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:b87eb6fc9e1bb8f98892a2458781348fa37e6925f35bb6ceb9d4afd54ba36c73"},
|
||||
{file = "coverage-7.8.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:d1ba00ae33be84066cfbe7361d4e04dec78445b2b88bdb734d0d1cbab916025a"},
|
||||
{file = "coverage-7.8.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f3c38e4e5ccbdc9198aecc766cedbb134b2d89bf64533973678dfcf07effd883"},
|
||||
{file = "coverage-7.8.0-cp313-cp313t-win32.whl", hash = "sha256:379fe315e206b14e21db5240f89dc0774bdd3e25c3c58c2c733c99eca96f1ada"},
|
||||
{file = "coverage-7.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:2e4b6b87bb0c846a9315e3ab4be2d52fac905100565f4b92f02c445c8799e257"},
|
||||
{file = "coverage-7.8.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fa260de59dfb143af06dcf30c2be0b200bed2a73737a8a59248fcb9fa601ef0f"},
|
||||
{file = "coverage-7.8.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:96121edfa4c2dfdda409877ea8608dd01de816a4dc4a0523356067b305e4e17a"},
|
||||
{file = "coverage-7.8.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b8af63b9afa1031c0ef05b217faa598f3069148eeee6bb24b79da9012423b82"},
|
||||
{file = "coverage-7.8.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:89b1f4af0d4afe495cd4787a68e00f30f1d15939f550e869de90a86efa7e0814"},
|
||||
{file = "coverage-7.8.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94ec0be97723ae72d63d3aa41961a0b9a6f5a53ff599813c324548d18e3b9e8c"},
|
||||
{file = "coverage-7.8.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:8a1d96e780bdb2d0cbb297325711701f7c0b6f89199a57f2049e90064c29f6bd"},
|
||||
{file = "coverage-7.8.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:f1d8a2a57b47142b10374902777e798784abf400a004b14f1b0b9eaf1e528ba4"},
|
||||
{file = "coverage-7.8.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:cf60dd2696b457b710dd40bf17ad269d5f5457b96442f7f85722bdb16fa6c899"},
|
||||
{file = "coverage-7.8.0-cp39-cp39-win32.whl", hash = "sha256:be945402e03de47ba1872cd5236395e0f4ad635526185a930735f66710e1bd3f"},
|
||||
{file = "coverage-7.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:90e7fbc6216ecaffa5a880cdc9c77b7418c1dcb166166b78dbc630d07f278cc3"},
|
||||
{file = "coverage-7.8.0-pp39.pp310.pp311-none-any.whl", hash = "sha256:b8194fb8e50d556d5849753de991d390c5a1edeeba50f68e3a9253fbd8bf8ccd"},
|
||||
{file = "coverage-7.8.0-py3-none-any.whl", hash = "sha256:dbf364b4c5e7bae9250528167dfe40219b62e2d573c854d74be213e1e52069f7"},
|
||||
{file = "coverage-7.8.0.tar.gz", hash = "sha256:7a3d62b3b03b4b6fd41a085f3574874cf946cb4604d2b4d3e8dca8cd570ca501"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.11.0a6\" and extra == \"toml\""}
|
||||
|
||||
[package.extras]
|
||||
toml = ["tomli"]
|
||||
|
||||
[[package]]
|
||||
name = "cryptography"
|
||||
version = "43.0.3"
|
||||
@ -5073,6 +5151,24 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
|
||||
[package.extras]
|
||||
testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]
|
||||
|
||||
[[package]]
|
||||
name = "pytest-cov"
|
||||
version = "6.1.1"
|
||||
description = "Pytest plugin for measuring coverage."
|
||||
optional = false
|
||||
python-versions = ">=3.9"
|
||||
files = [
|
||||
{file = "pytest_cov-6.1.1-py3-none-any.whl", hash = "sha256:bddf29ed2d0ab6f4df17b4c55b0a657287db8684af9c42ea546b21b1041b3dde"},
|
||||
{file = "pytest_cov-6.1.1.tar.gz", hash = "sha256:46935f7aaefba760e716c2ebfbe1c216240b9592966e7da99ea8292d4d3e2a0a"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
coverage = {version = ">=7.5", extras = ["toml"]}
|
||||
pytest = ">=4.6"
|
||||
|
||||
[package.extras]
|
||||
testing = ["fields", "hunter", "process-tests", "pytest-xdist", "virtualenv"]
|
||||
|
||||
[[package]]
|
||||
name = "pytest-xdist"
|
||||
version = "3.6.1"
|
||||
@ -7882,4 +7978,4 @@ vlm = ["accelerate", "transformers", "transformers"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "d2a8f7997b9ffb249ad26ba492b766d580bdb0072d50e76b0afd92496e983e96"
|
||||
content-hash = "b36037ec17dc4b6d5197a2f63a1367e05bf888b4fa97e2e2e8c29c217741d69c"
|
||||
|
@ -110,6 +110,8 @@ ipywidgets = "^8.1.5"
|
||||
nbqa = "^1.9.0"
|
||||
types-openpyxl = "^3.1.5.20241114"
|
||||
types-tqdm = "^4.67.0.20241221"
|
||||
coverage = "^7.6.2"
|
||||
pytest-cov = "^6.0.0"
|
||||
|
||||
[tool.poetry.group.docs.dependencies]
|
||||
mkdocs-material = "^9.5.40"
|
||||
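The new `coverage` and `pytest-cov` dev dependencies are what back the coverage measurement added to the test suite. A local run can be reproduced through pytest's Python entry point; the package and path names below are illustrative rather than prescriptive:

    import pytest

    # Roughly equivalent to running: pytest -v --cov=docling tests
    exit_code = pytest.main(["-v", "--cov=docling", "tests"])
    print(f"pytest exited with {exit_code}")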
@ -164,15 +166,82 @@ docling-tools = "docling.cli.tools:app"
|
||||
requires = ["poetry-core"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
|
||||
[tool.black]
|
||||
[tool.ruff]
|
||||
target-version = "py39"
|
||||
line-length = 88
|
||||
target-version = ["py39"]
|
||||
include = '\.pyi?$'
|
||||
respect-gitignore = true
|
||||
|
||||
[tool.isort]
|
||||
profile = "black"
|
||||
line_length = 88
|
||||
py_version = 39
|
||||
# extend-exclude = [
|
||||
# "tests",
|
||||
# ]
|
||||
|
||||
[tool.ruff.format]
|
||||
skip-magic-trailing-comma = false
|
||||
|
||||
[tool.ruff.lint]
|
||||
select = [
|
||||
# "B", # flake8-bugbear
|
||||
"C", # flake8-comprehensions
|
||||
"C9", # mccabe
|
||||
# "D", # flake8-docstrings
|
||||
"E", # pycodestyle errors (default)
|
||||
"F", # pyflakes (default)
|
||||
"I", # isort
|
||||
"PD", # pandas-vet
|
||||
"PIE", # pie
|
||||
# "PTH", # pathlib
|
||||
"Q", # flake8-quotes
|
||||
# "RET", # return
|
||||
"RUF", # Enable all ruff-specific checks
|
||||
# "SIM", # simplify
|
||||
"S307", # eval
|
||||
# "T20", # (disallow print statements) keep debugging statements out of the codebase
|
||||
"W", # pycodestyle warnings
|
||||
"ASYNC", # async
|
||||
"UP", # pyupgrade
|
||||
]
|
||||
|
||||
ignore = [
|
||||
"C408", # Unnecessary `dict()` call (rewrite as a literal)
|
||||
"E501", # Line too long, handled by ruff formatter
|
||||
"D107", # "Missing docstring in __init__",
|
||||
"F401", # imported but unused; consider using `importlib.util.find_spec` to test for "
|
||||
"F811", # "redefinition of the same function"
|
||||
"PL", # Pylint
|
||||
"RUF012", # Mutable Class Attributes
|
||||
"UP006", # List vs list, etc
|
||||
"UP007", # Option and Union
|
||||
"UP035", # `typing.Set` is deprecated, use `set` instead"
|
||||
]
|
||||
|
||||
#extend-select = []
|
||||
|
||||
[tool.ruff.lint.pep8-naming]
|
||||
classmethod-decorators = [
|
||||
# Allow Pydantic's `@validator` decorator to trigger class method treatment.
|
||||
"pydantic.validator",
|
||||
]
|
||||
|
||||
[tool.ruff.lint.per-file-ignores]
|
||||
"__init__.py" = ["E402", "F401"]
|
||||
"tests/*.py" = ["ASYNC"] # Disable ASYNC check for tests
|
||||
|
||||
[tool.ruff.lint.mccabe]
|
||||
max-complexity = 20
|
||||
|
||||
# [tool.ruff.lint.isort.sections]
|
||||
# "docling" = ["docling_core", "docling_ibm_models", "docling_parse"]
|
||||
|
||||
[tool.ruff.lint.isort]
|
||||
combine-as-imports = true
|
||||
# section-order = [
|
||||
# "future",
|
||||
# "standard-library",
|
||||
# "third-party",
|
||||
# "docling",
|
||||
# "first-party",
|
||||
# "local-folder",
|
||||
# ]
|
||||
|
||||
[tool.mypy]
|
||||
pretty = true
|
||||
@ -200,10 +269,6 @@ module = [
|
||||
]
|
||||
ignore_missing_imports = true
|
||||
|
||||
[tool.flake8]
|
||||
max-line-length = 88
|
||||
extend-ignore = ["E203", "E501"]
|
||||
|
||||
[tool.semantic_release]
|
||||
# for default values check:
|
||||
# https://github.com/python-semantic-release/python-semantic-release/blob/v7.32.2/semantic_release/defaults.cfg
|
||||
|
@ -19,7 +19,6 @@ def _get_backend(fname):


def test_asciidocs_examples():

    fnames = sorted(glob.glob("./tests/data/asciidoc/*.asciidoc"))

    for fname in fnames:
@ -38,8 +37,8 @@ def test_asciidocs_examples():
        print("\n\n", pred_mddoc)

        if os.path.exists(gname):
            with open(gname, "r") as fr:
                true_mddoc = fr.read()
            with open(gname) as fr:
                fr.read()

            # assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
        else:
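The `open(gname, "r")` to `open(gname)` change above is the kind of rewrite the pyupgrade rules (UP015) apply automatically, since "r" is already the default mode. A minimal sketch, with a made-up file name rather than the real test data:

```python
import tempfile
from pathlib import Path

ground_truth = Path(tempfile.mkdtemp()) / "example_groundtruth.md"  # hypothetical file
ground_truth.write_text("# heading\n")

# Before: the explicit mode string is redundant (pyupgrade, UP015).
with open(ground_truth, "r") as fr:
    before = fr.read()

# After: identical behaviour, "r" is already the default mode.
with open(ground_truth) as fr:
    after = fr.read()

assert before == after
```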
@ -1,5 +1,3 @@
import json
import os
from pathlib import Path

from pytest import warns
@ -15,22 +13,19 @@ GENERATE = GEN_TEST_DATA


def get_csv_paths():

    # Define the directory you want to search
    directory = Path(f"./tests/data/csv/")
    directory = Path("./tests/data/csv/")

    # List all CSV files in the directory and its subdirectories
    return sorted(directory.rglob("*.csv"))


def get_csv_path(name: str):

    # Return the matching CSV file path
    return Path(f"./tests/data/csv/{name}.csv")


def get_converter():

    converter = DocumentConverter(allowed_formats=[InputFormat.CSV])

    return converter
@ -55,9 +50,9 @@ def test_e2e_valid_csv_conversions():
        pred_itxt: str = doc._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
        assert verify_export(
            pred_itxt, str(gt_path) + ".itxt"
        ), "export to indented-text"
        assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
            "export to indented-text"
        )

        assert verify_document(
            pred_doc=doc,
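The repeated assert rewrites in this and the following test files come from switching the formatter: instead of wrapping the condition in parentheses to fit the line length, the failure message is parenthesized on its own. A rough before/after sketch with a dummy condition (`verify_export` here is a stand-in for the real helper used by these tests):

```python
def verify_export(predicted: str, gt_path: str) -> bool:
    # stand-in for the real test helper; always succeeds in this sketch
    return isinstance(predicted, str) and bool(gt_path)


pred_itxt = "indented text export"
gt_path = "./tests/data/csv/example.csv"  # illustrative path

# Old layout (black): the whole condition is wrapped to satisfy the line length.
assert (
    verify_export(pred_itxt, str(gt_path) + ".itxt")
), "export to indented-text"

# New layout (ruff format): the condition stays on one line and the failure
# message is parenthesized instead.
assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
    "export to indented-text"
)
```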
@ -32,7 +32,7 @@ def test_text_cell_counts():

    doc_backend = _get_backend(pdf_doc)

    for page_index in range(0, doc_backend.page_count()):
    for page_index in range(doc_backend.page_count()):
        last_cell_count = None
        for i in range(10):
            page_backend: DoclingParsePageBackend = doc_backend.load_page(0)
@ -42,9 +42,9 @@ def test_text_cell_counts():
                last_cell_count = len(cells)

            if len(cells) != last_cell_count:
                assert (
                    False
                ), "Loading page multiple times yielded non-identical text cell counts"
                assert False, (
                    "Loading page multiple times yielded non-identical text cell counts"
                )
                last_cell_count = len(cells)


@ -66,7 +66,7 @@ def test_crop_page_image(test_doc_path):
    page_backend: DoclingParsePageBackend = doc_backend.load_page(0)

    # Crop out "Figure 1" from the DocLayNet paper
    im = page_backend.get_page_image(
    page_backend.get_page_image(
        scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
    )
    # im.show()
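The `range(0, doc_backend.page_count())` to `range(doc_backend.page_count())` rewrite, which recurs in the other backend tests below, drops a redundant start argument; this is the kind of cleanup the enabled flake8-pie group reports (likely PIE808, though the exact code is not stated in the commit). Illustrative only:

```python
page_count = 3  # stand-in for doc_backend.page_count()

# Before: the explicit start of 0 is redundant.
pages_old = [page_index for page_index in range(0, page_count)]

# After: identical iteration without the redundant argument.
pages_new = [page_index for page_index in range(page_count)]

assert pages_old == pages_new == [0, 1, 2]
```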
@ -31,7 +31,7 @@ def test_text_cell_counts():

    doc_backend = _get_backend(pdf_doc)

    for page_index in range(0, doc_backend.page_count()):
    for page_index in range(doc_backend.page_count()):
        last_cell_count = None
        for i in range(10):
            page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0)
@ -41,9 +41,9 @@ def test_text_cell_counts():
                last_cell_count = len(cells)

            if len(cells) != last_cell_count:
                assert (
                    False
                ), "Loading page multiple times yielded non-identical text cell counts"
                assert False, (
                    "Loading page multiple times yielded non-identical text cell counts"
                )
                last_cell_count = len(cells)


@ -65,7 +65,7 @@ def test_crop_page_image(test_doc_path):
    page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0)

    # Crop out "Figure 1" from the DocLayNet paper
    im = page_backend.get_page_image(
    page_backend.get_page_image(
        scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
    )
    # im.show()
@ -31,7 +31,7 @@ def test_text_cell_counts():

    doc_backend = _get_backend(pdf_doc)

    for page_index in range(0, doc_backend.page_count()):
    for page_index in range(doc_backend.page_count()):
        last_cell_count = None
        for i in range(10):
            page_backend: DoclingParseV4PageBackend = doc_backend.load_page(0)
@ -41,9 +41,9 @@ def test_text_cell_counts():
                last_cell_count = len(cells)

            if len(cells) != last_cell_count:
                assert (
                    False
                ), "Loading page multiple times yielded non-identical text cell counts"
                assert False, (
                    "Loading page multiple times yielded non-identical text cell counts"
                )
                last_cell_count = len(cells)


@ -65,7 +65,7 @@ def test_crop_page_image(test_doc_path):
    page_backend: DoclingParseV4PageBackend = doc_backend.load_page(0)

    # Crop out "Figure 1" from the DocLayNet paper
    im = page_backend.get_page_image(
    page_backend.get_page_image(
        scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
    )
    # im.show()
@ -105,7 +105,6 @@ def test_ordered_lists():


def get_html_paths():

    # Define the directory you want to search
    directory = Path("./tests/data/html/")

@ -115,14 +114,12 @@ def get_html_paths():


def get_converter():

    converter = DocumentConverter(allowed_formats=[InputFormat.HTML])

    return converter


def test_e2e_html_conversions():

    html_paths = get_html_paths()
    converter = get_converter()

@ -138,15 +135,15 @@ def test_e2e_html_conversions():
        doc: DoclingDocument = conv_result.document

        pred_md: str = doc.export_to_markdown()
        assert verify_export(
            pred_md, str(gt_path) + ".md", generate=GENERATE
        ), "export to md"
        assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
            "export to md"
        )

        pred_itxt: str = doc._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
        assert verify_export(
            pred_itxt, str(gt_path) + ".itxt", generate=GENERATE
        ), "export to indented-text"
        assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
            "export to indented-text"
        )

        assert verify_document(doc, str(gt_path) + ".json", GENERATE)
@ -15,7 +15,7 @@ GENERATE = GEN_TEST_DATA


def get_pubmed_paths():
    directory = Path(os.path.dirname(__file__) + f"/data/pubmed/")
    directory = Path(os.path.dirname(__file__) + "/data/pubmed/")
    xml_files = sorted(directory.rglob("*.xml"))
    return xml_files

@ -47,9 +47,9 @@ def test_e2e_pubmed_conversions(use_stream=False):
    pred_itxt: str = doc._export_to_indented_text(
        max_text_len=70, explicit_tables=False
    )
    assert verify_export(
        pred_itxt, str(gt_path) + ".itxt"
    ), "export to indented-text"
    assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
        "export to indented-text"
    )

    assert verify_document(doc, str(gt_path) + ".json", GENERATE), "export to json"
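Both the CSV and PubMed path helpers drop an `f` prefix from strings that contain no placeholders, which is what pyflakes reports as F541. A short sketch (the directory layout is illustrative, no files are read):

```python
import os
from pathlib import Path

# Before: an f-string with nothing to interpolate (F541, f-string without placeholders).
directory_old = Path(os.path.dirname(__file__) + f"/data/pubmed/")

# After: a plain string literal; the resulting path is unchanged.
directory_new = Path(os.path.dirname(__file__) + "/data/pubmed/")

assert directory_old == directory_new
```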
@ -17,7 +17,6 @@ GENERATE = GEN_TEST_DATA


def get_xlsx_paths():

    # Define the directory you want to search
    directory = Path("./tests/data/xlsx/")

@ -27,7 +26,6 @@ def get_xlsx_paths():


def get_converter():

    converter = DocumentConverter(allowed_formats=[InputFormat.XLSX])

    return converter
@ -65,13 +63,13 @@ def test_e2e_xlsx_conversions(documents) -> None:
        pred_itxt: str = doc._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
        assert verify_export(
            pred_itxt, str(gt_path) + ".itxt"
        ), "export to indented-text"
        assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
            "export to indented-text"
        )

        assert verify_document(
            doc, str(gt_path) + ".json", GENERATE
        ), "document document"
        assert verify_document(doc, str(gt_path) + ".json", GENERATE), (
            "document document"
        )


def test_pages(documents) -> None:
@ -81,7 +79,7 @@ def test_pages(documents) -> None:
        documents: The paths and converted documents.
    """
    # number of pages from the backend method
    path = [item for item in get_xlsx_paths() if item.stem == "test-01"][0]
    path = next(item for item in get_xlsx_paths() if item.stem == "test-01")
    in_doc = InputDocument(
        path_or_stream=path,
        format=InputFormat.XLSX,
@ -92,7 +90,7 @@ def test_pages(documents) -> None:
    assert backend.page_count() == 3

    # number of pages from the converted document
    doc = [item for path, item in documents if path.stem == "test-01"][0]
    doc = next(item for path, item in documents if path.stem == "test-01")
    assert len(doc.pages) == 3

    # page sizes as number of cells
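Picking the first match with `[item for item in ... if ...][0]` builds the full list before indexing it; the `next(...)` form the commit switches to stops at the first hit (and raises `StopIteration` rather than `IndexError` when nothing matches). Ruff reports the old pattern as RUF015. Stand-alone sketch with dummy paths:

```python
from pathlib import Path

xlsx_paths = [Path("tests/data/xlsx/test-01.xlsx"), Path("tests/data/xlsx/test-02.xlsx")]

# Before: materialises every match, then takes the first element.
path_old = [item for item in xlsx_paths if item.stem == "test-01"][0]

# After: lazily returns the first match (RUF015-style rewrite).
path_new = next(item for item in xlsx_paths if item.stem == "test-01")

assert path_old == path_new
```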
@ -1,4 +1,3 @@
import os
from pathlib import Path

from docling.backend.msword_backend import MsWordDocumentBackend
@ -43,7 +42,6 @@ def test_heading_levels():


def get_docx_paths():

    # Define the directory you want to search
    directory = Path("./tests/data/docx/")

@ -53,14 +51,12 @@ def get_docx_paths():


def get_converter():

    converter = DocumentConverter(allowed_formats=[InputFormat.DOCX])

    return converter


def test_e2e_docx_conversions():

    docx_paths = get_docx_paths()
    converter = get_converter()

@ -76,20 +72,20 @@ def test_e2e_docx_conversions():
        doc: DoclingDocument = conv_result.document

        pred_md: str = doc.export_to_markdown()
        assert verify_export(
            pred_md, str(gt_path) + ".md", generate=GENERATE
        ), "export to md"
        assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
            "export to md"
        )

        pred_itxt: str = doc._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
        assert verify_export(
            pred_itxt, str(gt_path) + ".itxt", generate=GENERATE
        ), "export to indented-text"
        assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
            "export to indented-text"
        )

        assert verify_document(
            doc, str(gt_path) + ".json", generate=GENERATE
        ), "document document"
        assert verify_document(doc, str(gt_path) + ".json", generate=GENERATE), (
            "document document"
        )

        if docx_path.name == "word_tables.docx":
            pred_html: str = doc.export_to_html()
@ -109,27 +109,27 @@ def test_patent_groundtruth(patents, groundtruth):
        md_name = path.stem + ".md"
        if md_name in gt_names:
            pred_md = doc.export_to_markdown()
            assert (
                pred_md == gt_names[md_name]
            ), f"Markdown file mismatch against groundtruth {md_name}"
            assert pred_md == gt_names[md_name], (
                f"Markdown file mismatch against groundtruth {md_name}"
            )
        json_path = path.with_suffix(".json")
        if json_path.stem in gt_names:
            assert verify_document(
                doc, str(json_path), GENERATE
            ), f"JSON file mismatch against groundtruth {json_path}"
            assert verify_document(doc, str(json_path), GENERATE), (
                f"JSON file mismatch against groundtruth {json_path}"
            )
        itxt_name = path.stem + ".itxt"
        if itxt_name in gt_names:
            pred_itxt = doc._export_to_indented_text()
            assert (
                pred_itxt == gt_names[itxt_name]
            ), f"Indented text file mismatch against groundtruth {itxt_name}"
            assert pred_itxt == gt_names[itxt_name], (
                f"Indented text file mismatch against groundtruth {itxt_name}"
            )


def test_tables(tables):
    """Test the table parser."""
    # CHECK table in file tables_20180000016.xml
    file_name = "tables_ipa20180000016.xml"
    file_table = [item[1] for item in tables if item[0].name == file_name][0]
    file_table = next(item[1] for item in tables if item[0].name == file_name)
    assert file_table.num_rows == 13
    assert file_table.num_cols == 10
    assert len(file_table.table_cells) == 130
@ -140,7 +140,7 @@ def test_patent_uspto_ice(patents):

    # CHECK application doc number 20200022300
    file_name = "ipa20200022300.xml"
    doc = [item[1] for item in patents if item[0].name == file_name][0]
    doc = next(item[1] for item in patents if item[0].name == file_name)
    if GENERATE:
        _generate_groundtruth(doc, Path(file_name).stem)

@ -278,7 +278,7 @@ def test_patent_uspto_ice(patents):

    # CHECK application doc number 20180000016 for HTML entities, level 2 headings, tables
    file_name = "ipa20180000016.xml"
    doc = [item[1] for item in patents if item[0].name == file_name][0]
    doc = next(item[1] for item in patents if item[0].name == file_name)
    if GENERATE:
        _generate_groundtruth(doc, Path(file_name).stem)

@ -348,7 +348,7 @@ def test_patent_uspto_ice(patents):

    # CHECK application doc number 20110039701 for complex long tables
    file_name = "ipa20110039701.xml"
    doc = [item[1] for item in patents if item[0].name == file_name][0]
    doc = next(item[1] for item in patents if item[0].name == file_name)
    assert doc.name == file_name
    assert len(doc.tables) == 17

@ -358,7 +358,7 @@ def test_patent_uspto_grant_v2(patents):

    # CHECK application doc number 06442728
    file_name = "pg06442728.xml"
    doc = [item[1] for item in patents if item[0].name == file_name][0]
    doc = next(item[1] for item in patents if item[0].name == file_name)
    if GENERATE:
        _generate_groundtruth(doc, Path(file_name).stem)

@ -376,12 +376,12 @@ def test_patent_uspto_grant_v2(patents):
    assert isinstance(texts[2], TextItem)
    assert texts[2].text == (
        "An interleaver receives incoming data frames of size N. The interleaver "
        "indexes the elements of the frame with an N₁×N₂ index array. The interleaver "
        "indexes the elements of the frame with an N₁×N₂ index array. The interleaver "  # noqa: RUF001
        "then effectively rearranges (permutes) the data by permuting the rows of the "
        "index array. The interleaver employs the equation I(j,k)=I(j,αjk+βj)modP) to "
        "index array. The interleaver employs the equation I(j,k)=I(j,αjk+βj)modP) to "  # noqa: RUF001
        "permute the columns (indexed by k) of each row (indexed by j). P is at least "
        "equal to N₂, βj is a constant which may be different for each row, and each "
        "αj is a relative prime number relative to P. After permuting, the "
        "αj is a relative prime number relative to P. After permuting, the "  # noqa: RUF001
        "interleaver outputs the data in a different order than received (e.g., "
        "receives sequentially row by row, outputs sequentially each column by column)."
    )
@ -402,7 +402,7 @@ def test_patent_uspto_app_v1(patents):

    # CHECK application doc number 20010031492
    file_name = "pa20010031492.xml"
    doc = [item[1] for item in patents if item[0].name == file_name][0]
    doc = next(item[1] for item in patents if item[0].name == file_name)
    if GENERATE:
        _generate_groundtruth(doc, Path(file_name).stem)

@ -432,7 +432,7 @@ def test_patent_uspto_grant_aps(patents):

    # CHECK application doc number 057006474
    file_name = "pftaps057006474.txt"
    doc = [item[1] for item in patents if item[0].name == file_name][0]
    doc = next(item[1] for item in patents if item[0].name == file_name)
    if GENERATE:
        _generate_groundtruth(doc, Path(file_name).stem)
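The `# noqa: RUF001` markers added above silence ruff's ambiguous-unicode-character check for strings that intentionally contain characters such as ₁, ×, α and β, which are part of the patent abstract being asserted. A minimal sketch of the same idea:

```python
# RUF001 warns about characters that look like ASCII but are not (for example the
# multiplication sign or Greek letters); the trailing noqa keeps them when intentional.
expected = "indexes the elements of the frame with an N₁×N₂ index array"  # noqa: RUF001

assert "₁" in expected and "×" in expected
```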
@ -32,7 +32,7 @@ def test_text_cell_counts():

    doc_backend = _get_backend(pdf_doc)

    for page_index in range(0, doc_backend.page_count()):
    for page_index in range(doc_backend.page_count()):
        last_cell_count = None
        for i in range(10):
            page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
@ -42,9 +42,9 @@ def test_text_cell_counts():
                last_cell_count = len(cells)

            if len(cells) != last_cell_count:
                assert (
                    False
                ), "Loading page multiple times yielded non-identical text cell counts"
                assert False, (
                    "Loading page multiple times yielded non-identical text cell counts"
                )
                last_cell_count = len(cells)


@ -66,7 +66,7 @@ def test_crop_page_image(test_doc_path):
    page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)

    # Crop out "Figure 1" from the DocLayNet paper
    im = page_backend.get_page_image(
    page_backend.get_page_image(
        scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
    )
    # im.show()
@ -1,4 +1,3 @@
import os
from pathlib import Path

from docling.datamodel.base_models import InputFormat
@ -12,7 +11,6 @@ GENERATE = GEN_TEST_DATA


def get_pptx_paths():

    # Define the directory you want to search
    directory = Path("./tests/data/pptx/")

@ -22,14 +20,12 @@ def get_pptx_paths():


def get_converter():

    converter = DocumentConverter(allowed_formats=[InputFormat.PPTX])

    return converter


def test_e2e_pptx_conversions():

    pptx_paths = get_pptx_paths()
    converter = get_converter()

@ -50,10 +46,10 @@ def test_e2e_pptx_conversions():
        pred_itxt: str = doc._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
        assert verify_export(
            pred_itxt, str(gt_path) + ".itxt"
        ), "export to indented-text"
        assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
            "export to indented-text"
        )

        assert verify_document(
            doc, str(gt_path) + ".json", GENERATE
        ), "document document"
        assert verify_document(doc, str(gt_path) + ".json", GENERATE), (
            "document document"
        )
@ -3,7 +3,6 @@ from pathlib import Path
from docling_core.types.doc import CodeItem, TextItem
from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions
@ -12,7 +11,6 @@ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline


def get_converter():

    pipeline_options = PdfPipelineOptions()
    pipeline_options.generate_page_images = True
@ -2,7 +2,6 @@ from pathlib import Path

from docling_core.types.doc import PictureClassificationData

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions
@ -11,7 +10,6 @@ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline


def get_converter():

    pipeline_options = PdfPipelineOptions()
    pipeline_options.generate_page_images = True

@ -49,32 +47,32 @@ def test_picture_classifier():

    res = results[0]
    assert len(res.annotations) == 1
    assert type(res.annotations[0]) == PictureClassificationData
    assert isinstance(res.annotations[0], PictureClassificationData)
    classification_data = res.annotations[0]
    assert classification_data.provenance == "DocumentPictureClassifier"
    assert (
        len(classification_data.predicted_classes) == 16
    ), "Number of predicted classes is not equal to 16"
    assert len(classification_data.predicted_classes) == 16, (
        "Number of predicted classes is not equal to 16"
    )
    confidences = [pred.confidence for pred in classification_data.predicted_classes]
    assert confidences == sorted(
        confidences, reverse=True
    ), "Predictions are not sorted in descending order of confidence"
    assert (
        classification_data.predicted_classes[0].class_name == "bar_chart"
    ), "The prediction is wrong for the bar chart image."
    assert confidences == sorted(confidences, reverse=True), (
        "Predictions are not sorted in descending order of confidence"
    )
    assert classification_data.predicted_classes[0].class_name == "bar_chart", (
        "The prediction is wrong for the bar chart image."
    )

    res = results[1]
    assert len(res.annotations) == 1
    assert type(res.annotations[0]) == PictureClassificationData
    assert isinstance(res.annotations[0], PictureClassificationData)
    classification_data = res.annotations[0]
    assert classification_data.provenance == "DocumentPictureClassifier"
    assert (
        len(classification_data.predicted_classes) == 16
    ), "Number of predicted classes is not equal to 16"
    assert len(classification_data.predicted_classes) == 16, (
        "Number of predicted classes is not equal to 16"
    )
    confidences = [pred.confidence for pred in classification_data.predicted_classes]
    assert confidences == sorted(
        confidences, reverse=True
    ), "Predictions are not sorted in descending order of confidence"
    assert (
        classification_data.predicted_classes[0].class_name == "map"
    ), "The prediction is wrong for the bar chart image."
    assert confidences == sorted(confidences, reverse=True), (
        "Predictions are not sorted in descending order of confidence"
    )
    assert classification_data.predicted_classes[0].class_name == "map", (
        "The prediction is wrong for the bar chart image."
    )
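Replacing `type(x) == SomeClass` with `isinstance(x, SomeClass)`, as the picture-classifier test does above, is the usual fix for pycodestyle's E721 (type comparison); `isinstance` also accepts subclasses. Self-contained sketch with a stand-in class:

```python
class PictureClassificationData:  # stand-in for the docling_core type used in the test
    provenance = "DocumentPictureClassifier"


annotation = PictureClassificationData()

# Before: direct type comparison, reported as E721 and blind to subclasses.
assert type(annotation) == PictureClassificationData  # noqa: E721

# After: idiomatic check that also covers subclasses.
assert isinstance(annotation, PictureClassificationData)
```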
@ -1,7 +1,6 @@
from pathlib import Path

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import AcceleratorDevice, PdfPipelineOptions
@ -15,7 +14,6 @@ GENERATE_V2 = GEN_TEST_DATA


def get_pdf_paths():

    # Define the directory you want to search
    directory = Path("./tests/data/pdf/")

@ -25,7 +23,6 @@ def get_pdf_paths():


def get_converter():

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = True
@ -45,7 +42,6 @@ def get_converter():


def test_e2e_pdfs_conversions():

    pdf_paths = get_pdf_paths()
    converter = get_converter()
@ -3,7 +3,6 @@ from pathlib import Path
from typing import List

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
@ -12,10 +12,9 @@ from docling.document_converter import PdfFormatOption


def test_in_doc_from_valid_path():

    test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
    doc = _make_input_doc(test_doc_path)
    assert doc.valid == True
    assert doc.valid is True


def test_in_doc_from_invalid_path():
@ -23,29 +22,26 @@ def test_in_doc_from_invalid_path():

    doc = _make_input_doc(test_doc_path)

    assert doc.valid == False
    assert doc.valid is False


def test_in_doc_from_valid_buf():

    buf = BytesIO(Path("./tests/data/pdf/2206.01062.pdf").open("rb").read())
    stream = DocumentStream(name="my_doc.pdf", stream=buf)

    doc = _make_input_doc_from_stream(stream)
    assert doc.valid == True
    assert doc.valid is True


def test_in_doc_from_invalid_buf():

    buf = BytesIO(b"")
    stream = DocumentStream(name="my_doc.pdf", stream=buf)

    doc = _make_input_doc_from_stream(stream)
    assert doc.valid == False
    assert doc.valid is False


def test_image_in_pdf_backend():

    in_doc = InputDocument(
        path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
        format=InputFormat.IMAGE,
@ -76,7 +72,6 @@ def test_image_in_pdf_backend():


def test_in_doc_with_page_range():

    test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
    limits = DocumentLimits()
    limits.page_range = (1, 10)
@ -87,7 +82,7 @@ def test_in_doc_with_page_range():
        backend=PyPdfiumDocumentBackend,
        limits=limits,
    )
    assert doc.valid == True
    assert doc.valid is True

    limits.page_range = (9, 9)

@ -97,7 +92,7 @@ def test_in_doc_with_page_range():
        backend=PyPdfiumDocumentBackend,
        limits=limits,
    )
    assert doc.valid == True
    assert doc.valid is True

    limits.page_range = (11, 12)

@ -107,7 +102,7 @@ def test_in_doc_with_page_range():
        backend=PyPdfiumDocumentBackend,
        limits=limits,
    )
    assert doc.valid == False
    assert doc.valid is False


def test_guess_format(tmp_path):
@ -192,17 +187,17 @@ def test_guess_format(tmp_path):
    )
    doc_path = temp_dir / "docling_test.xml"
    doc_path.write_text(xml_content, encoding="utf-8")
    assert dci._guess_format(doc_path) == None
    assert dci._guess_format(doc_path) is None
    buf = BytesIO(Path(doc_path).open("rb").read())
    stream = DocumentStream(name="docling_test.xml", stream=buf)
    assert dci._guess_format(stream) == None
    assert dci._guess_format(stream) is None

    # Invalid USPTO patent (as plain text)
    stream = DocumentStream(name="pftaps057006474.txt", stream=BytesIO(b"xyz"))
    assert dci._guess_format(stream) == None
    assert dci._guess_format(stream) is None
    doc_path = temp_dir / "pftaps_wrong.txt"
    doc_path.write_text("xyz", encoding="utf-8")
    assert dci._guess_format(doc_path) == None
    assert dci._guess_format(doc_path) is None

    # Valid Docling JSON
    test_str = '{"name": ""}'
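The `== True`, `== False` and `== None` comparisons rewritten throughout this file are what pycodestyle reports as E712 and E711; identity checks (`is True`, `is None`) are the idiomatic form for these singletons. Sketch with a dummy document object standing in for InputDocument:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class DummyDoc:  # stand-in for InputDocument in the real test
    valid: bool = True
    fmt: Optional[str] = None


doc = DummyDoc()

# Before: equality comparisons against singletons (E712 for True/False, E711 for None).
assert doc.valid == True  # noqa: E712
assert doc.fmt == None  # noqa: E711

# After: identity checks, as in the updated test.
assert doc.valid is True
assert doc.fmt is None
```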
Some files were not shown because too many files have changed in this diff.