ci: add coverage and ruff (#1383)
* add coverage calculation and push
* new codecov version and usage of token
* enable ruff formatter instead of black and isort
* apply ruff lint fixes
* apply ruff unsafe fixes
* add removed imports
* runs 1 on linter issues
* finalize linter fixes
* Update pyproject.toml

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
parent 293c28ca7c
commit 5458a88464
.github/codecov.yml (new file, 17 lines)

@@ -0,0 +1,17 @@
+codecov:
+  # https://docs.codecov.io/docs/comparing-commits
+  allow_coverage_offsets: true
+coverage:
+  status:
+    project:
+      default:
+        informational: true
+        target: auto # auto compares coverage to the previous base commit
+        flags:
+          - docling
+comment:
+  layout: "reach, diff, flags, files"
+  behavior: default
+  require_changes: false # if true: only post the comment if coverage changes
+  branches: # branch names that can post comment
+    - "main"
.github/workflows/cd.yml (2 changes)

@@ -10,6 +10,8 @@ env:
 jobs:
   code-checks:
     uses: ./.github/workflows/checks.yml
+    with:
+      push_coverage: false
   pre-release-check:
     runs-on: ubuntu-latest
     outputs:
.github/workflows/checks.yml (16 changes)

@@ -1,5 +1,13 @@
 on:
   workflow_call:
+    inputs:
+      push_coverage:
+        type: boolean
+        description: "If true, the coverage results are pushed to codecov.io."
+        default: true
+    secrets:
+      CODECOV_TOKEN:
+        required: false

 env:
   HF_HUB_DOWNLOAD_TIMEOUT: "60"

@@ -32,7 +40,13 @@ jobs:
         run: poetry install --all-extras
       - name: Testing
         run: |
-          poetry run pytest -v tests
+          poetry run pytest -v --cov=docling --cov-report=xml tests
+      - name: Upload coverage to Codecov
+        if: inputs.push_coverage
+        uses: codecov/codecov-action@v5
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          file: ./coverage.xml
       - name: Run examples
        run: |
          for file in docs/examples/*.py; do
.github/workflows/ci.yml (2 changes)

@@ -17,3 +17,5 @@ jobs:
   code-checks:
     if: ${{ github.event_name == 'push' || (github.event.pull_request.head.repo.full_name != 'docling-project/docling' && github.event.pull_request.head.repo.full_name != 'docling-project/docling') }}
     uses: ./.github/workflows/checks.yml
+    secrets:
+      CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
@@ -1,43 +1,26 @@
 fail_fast: true
 repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.11.5
+    hooks:
+      # Run the Ruff formatter.
+      - id: ruff-format
+        name: "Ruff formatter"
+        args: [--config=pyproject.toml]
+        files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
+      # Run the Ruff linter.
+      - id: ruff
+        name: "Ruff linter"
+        args: [--exit-non-zero-on-fix, --fix, --config=pyproject.toml]
+        files: '^(docling|tests|docs/examples).*\.(py|ipynb)$'
   - repo: local
     hooks:
-      - id: black
-        name: Black
-        entry: poetry run black docling docs/examples tests
-        pass_filenames: false
-        language: system
-        files: '\.py$'
-      - id: isort
-        name: isort
-        entry: poetry run isort docling docs/examples tests
-        pass_filenames: false
-        language: system
-        files: '\.py$'
-      # - id: flake8
-      #   name: flake8
-      #   entry: poetry run flake8 docling
-      #   pass_filenames: false
-      #   language: system
-      #   files: '\.py$'
       - id: mypy
         name: MyPy
         entry: poetry run mypy docling
         pass_filenames: false
         language: system
         files: '\.py$'
-      - id: nbqa_black
-        name: nbQA Black
-        entry: poetry run nbqa black docs/examples
-        pass_filenames: false
-        language: system
-        files: '\.ipynb$'
-      - id: nbqa_isort
-        name: nbQA isort
-        entry: poetry run nbqa isort docs/examples
-        pass_filenames: false
-        language: system
-        files: '\.ipynb$'
       - id: poetry
         name: Poetry check
         entry: poetry check --lock
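Note: the Ruff hooks above are pointed at --config=pyproject.toml, but the pyproject.toml changes themselves are not shown in this excerpt (the commit message only mentions "Update pyproject.toml"). As a rough, hypothetical sketch of what such a [tool.ruff] section can look like — the rule selection, line length, and complexity threshold below are assumptions for illustration, not the project's actual settings:

# Hypothetical example only; the real configuration lives in the repository's pyproject.toml.
[tool.ruff]
line-length = 88           # assumption: Black-compatible line length
target-version = "py39"    # assumption: minimum supported Python version

[tool.ruff.lint]
# "I" covers import sorting (replacing isort); "C90" provides the C901 complexity check
# referenced by the "# noqa: C901" markers added throughout this commit.
select = ["E", "F", "I", "C90", "RUF"]

[tool.ruff.lint.mccabe]
max-complexity = 15        # assumption: threshold for the C901 rule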
@@ -34,7 +34,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                 text_stream = self.path_or_stream.getvalue().decode("utf-8")
                 self.lines = text_stream.split("\n")
             if isinstance(self.path_or_stream, Path):
-                with open(self.path_or_stream, "r", encoding="utf-8") as f:
+                with open(self.path_or_stream, encoding="utf-8") as f:
                     self.lines = f.readlines()
             self.valid = True

@@ -75,14 +75,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):

         return doc

-    def _parse(self, doc: DoclingDocument):
+    def _parse(self, doc: DoclingDocument):  # noqa: C901
         """
         Main function that orchestrates the parsing by yielding components:
         title, section headers, text, lists, and tables.
         """

-        content = ""
-
         in_list = False
         in_table = False

@@ -95,7 +93,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         # indents: dict[int, Union[DocItem, GroupItem, None]] = {}
         indents: dict[int, Union[GroupItem, None]] = {}

-        for i in range(0, 10):
+        for i in range(10):
             parents[i] = None
             indents[i] = None

@@ -125,7 +123,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):

             # Lists
             elif self._is_list_item(line):
-
                 _log.debug(f"line: {line}")
                 item = self._parse_list_item(line)
                 _log.debug(f"parsed list-item: {item}")

@@ -147,7 +144,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                     indents[level + 1] = item["indent"]

                 elif in_list and item["indent"] < indents[level]:
-
                     # print(item["indent"], " => ", indents[level])
                     while item["indent"] < indents[level]:
                         # print(item["indent"], " => ", indents[level])

@@ -176,7 +172,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
             elif in_table and (
                 (not self._is_table_line(line)) or line.strip() == "|==="
             ):  # end of table
-
                 caption = None
                 if len(caption_data) > 0:
                     caption = doc.add_text(

@@ -195,7 +190,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):

             # Picture
             elif self._is_picture(line):
-
                 caption = None
                 if len(caption_data) > 0:
                     caption = doc.add_text(

@@ -250,7 +244,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                 text_data = []

             elif len(line.strip()) > 0:  # allow multiline texts
-
                 item = self._parse_text(line)
                 text_data.append(item["text"])

@@ -273,14 +266,14 @@ class AsciiDocBackend(DeclarativeDocumentBackend):

     def _get_current_level(self, parents):
         for k, v in parents.items():
-            if v == None and k > 0:
+            if v is None and k > 0:
                 return k - 1

         return 0

     def _get_current_parent(self, parents):
         for k, v in parents.items():
-            if v == None and k > 0:
+            if v is None and k > 0:
                 return parents[k - 1]

         return None

@@ -328,7 +321,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                     "marker": marker,
                     "text": text.strip(),
                     "numbered": False,
-                    "indent": 0 if indent == None else len(indent),
+                    "indent": 0 if indent is None else len(indent),
                 }
             else:
                 return {

@@ -336,7 +329,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                     "marker": marker,
                     "text": text.strip(),
                     "numbered": True,
-                    "indent": 0 if indent == None else len(indent),
+                    "indent": 0 if indent is None else len(indent),
                 }
         else:
             # Fallback if no match

@@ -357,7 +350,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         return [cell.strip() for cell in line.split("|") if cell.strip()]

     def _populate_table_as_grid(self, table_data):
-
         num_rows = len(table_data)

         # Adjust the table data into a grid format
@@ -58,7 +58,7 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
         head = self.content.readline()
         dialect = csv.Sniffer().sniff(head, ",;\t|:")
         _log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"')
-        if not dialect.delimiter in {",", ";", "\t", "|", ":"}:
+        if dialect.delimiter not in {",", ";", "\t", "|", ":"}:
             raise RuntimeError(
                 f"Cannot convert csv with unknown delimiter {dialect.delimiter}."
             )
@@ -1,8 +1,9 @@
 import logging
 import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import Iterable, List, Optional, Union
+from typing import List, Optional, Union

 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin, Size

@@ -156,7 +157,6 @@ class DoclingParsePageBackend(PdfPageBackend):
     def get_page_image(
         self, scale: float = 1, cropbox: Optional[BoundingBox] = None
     ) -> Image.Image:
-
         page_size = self.get_size()

         if not cropbox:
@@ -1,8 +1,9 @@
 import logging
 import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+from typing import TYPE_CHECKING, List, Optional, Union

 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin

@@ -172,7 +173,6 @@ class DoclingParseV2PageBackend(PdfPageBackend):
     def get_page_image(
         self, scale: float = 1, cropbox: Optional[BoundingBox] = None
     ) -> Image.Image:
-
         page_size = self.get_size()

         if not cropbox:
@@ -1,14 +1,14 @@
 import logging
-import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+from typing import TYPE_CHECKING, Optional, Union

 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell
 from docling_parse.pdf_parser import DoclingPdfParser, PdfDocument
-from PIL import Image, ImageDraw
+from PIL import Image
 from pypdfium2 import PdfPage

 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend

@@ -93,7 +93,6 @@ class DoclingParseV4PageBackend(PdfPageBackend):
     def get_page_image(
         self, scale: float = 1, cropbox: Optional[BoundingBox] = None
     ) -> Image.Image:
-
         page_size = self.get_size()

         if not cropbox:
@@ -1,12 +1,8 @@
-# -*- coding: utf-8 -*-
-
 """
 Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
 On 23/01/2025
 """

-from __future__ import unicode_literals
-
 CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")

 BLANK = ""

@@ -79,7 +75,6 @@ CHR_BO = {
 }

 T = {
-    "\u2192": "\\rightarrow ",
     # Greek letters
     "\U0001d6fc": "\\alpha ",
     "\U0001d6fd": "\\beta ",
@@ -76,8 +76,7 @@ def get_val(key, default=None, store=CHR):
     return default


-class Tag2Method(object):
-
+class Tag2Method:
     def call_method(self, elm, stag=None):
         getmethod = self.tag2meth.get
         if stag is None:

@@ -130,7 +129,6 @@ class Tag2Method(object):


 class Pr(Tag2Method):
-
     text = ""

     __val_tags = ("chr", "pos", "begChr", "endChr", "type")

@@ -159,7 +157,7 @@ class Pr(Tag2Method):
     def do_common(self, elm):
         stag = elm.tag.replace(OMML_NS, "")
         if stag in self.__val_tags:
-            t = elm.get("{0}val".format(OMML_NS))
+            t = elm.get(f"{OMML_NS}val")
             self.__innerdict[stag] = t
         return None

@@ -248,7 +246,6 @@ class oMath2Latex(Tag2Method):
         """
         the Pre-Sub-Superscript object -- Not support yet
         """
-        pass

     def do_sub(self, elm):
         text = self.process_children(elm)

@@ -331,7 +328,7 @@ class oMath2Latex(Tag2Method):
         t_dict = self.process_children_dict(elm, include=("e", "lim"))
         latex_s = LIM_FUNC.get(t_dict["e"])
         if not latex_s:
-            raise NotSupport("Not support lim %s" % t_dict["e"])
+            raise RuntimeError("Not support lim {}".format(t_dict["e"]))
         else:
             return latex_s.format(lim=t_dict.get("lim"))

@@ -413,7 +410,7 @@ class oMath2Latex(Tag2Method):
         """
         _str = []
         _base_str = []
-        found_text = elm.findtext("./{0}t".format(OMML_NS))
+        found_text = elm.findtext(f"./{OMML_NS}t")
         if found_text:
             for s in found_text:
                 out_latex_str = self.process_unicode(s)
@@ -55,7 +55,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         self.max_levels = 10
         self.level = 0
         self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
-        for i in range(0, self.max_levels):
+        for i in range(self.max_levels):
             self.parents[i] = None

         try:

@@ -126,7 +126,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         return doc

     def walk(self, tag: Tag, doc: DoclingDocument) -> None:
-
         # Iterate over elements in the body of the document
         text: str = ""
         for element in tag.children:

@@ -135,7 +134,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                     self.analyze_tag(cast(Tag, element), doc)
                 except Exception as exc_child:
                     _log.error(
-                        f"Error processing child from tag {tag.name}: {repr(exc_child)}"
+                        f"Error processing child from tag {tag.name}: {exc_child!r}"
                     )
                     raise exc_child
             elif isinstance(element, NavigableString) and not isinstance(

@@ -147,7 +146,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                     item for item in element.next_siblings if isinstance(item, Tag)
                 ]
                 if element.next_sibling is None or any(
-                    [item.name in TAGS_FOR_NODE_ITEMS for item in siblings]
+                    item.name in TAGS_FOR_NODE_ITEMS for item in siblings
                 ):
                     text = text.strip()
                     if text and tag.name in ["div"]:

@@ -222,7 +221,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 )
             else:
                 if hlevel > self.level:
-
                     # add invisible group
                     for i in range(self.level + 1, hlevel):
                         self.parents[i] = doc.add_group(

@@ -234,7 +232,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                     self.level = hlevel

                 elif hlevel < self.level:
-
                     # remove the tail
                     for key in self.parents.keys():
                         if key > hlevel:

@@ -360,7 +357,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             marker = ""
             enumerated = False
             if parent_label == GroupLabel.ORDERED_LIST:
-                marker = f"{str(index_in_list)}."
+                marker = f"{index_in_list!s}."
                 enumerated = True
             doc.add_list_item(
                 text=text,
@@ -83,7 +83,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 # otherwise they represent emphasis (bold or italic)
                 self.markdown = self._shorten_underscore_sequences(text_stream)
             if isinstance(self.path_or_stream, Path):
-                with open(self.path_or_stream, "r", encoding="utf-8") as f:
+                with open(self.path_or_stream, encoding="utf-8") as f:
                     md_content = f.read()
                     # remove invalid sequences
                     # very long sequences of underscores will lead to unnecessary long processing times.

@@ -168,7 +168,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             )
             self.inline_texts = []

-    def _iterate_elements(
+    def _iterate_elements(  # noqa: C901
         self,
         element: marko.element.Element,
         depth: int,

@@ -176,7 +176,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         visited: Set[marko.element.Element],
         parent_item: Optional[NodeItem] = None,
     ):
-
         if element in visited:
             return

@@ -236,7 +235,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             if has_non_empty_list_items:
                 label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
                 parent_item = doc.add_group(
-                    label=label, name=f"list", parent=parent_item
+                    label=label, name="list", parent=parent_item
                 )

         elif (

@@ -320,7 +319,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             self._html_blocks += 1
             self._process_inline_text(parent_item, doc)
             self._close_table(doc)
-            _log.debug("HTML Block: {}".format(element))
+            _log.debug(f"HTML Block: {element}")
             if (
                 len(element.body) > 0
             ):  # If Marko doesn't return any content for HTML block, skip it

@@ -332,7 +331,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         else:
             if not isinstance(element, str):
                 self._close_table(doc)
-                _log.debug("Some other element: {}".format(element))
+                _log.debug(f"Some other element: {element}")

         processed_block_types = (
             marko.block.Heading,

@@ -398,7 +397,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):

         # if HTML blocks were detected, export to HTML and delegate to HTML backend
         if self._html_blocks > 0:
-
             # export to HTML
             html_backend_cls = HTMLDocumentBackend
             html_str = doc.export_to_html()
@@ -184,7 +184,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
         """

         if self.workbook is not None:
-
             # Iterate over all sheets
             for sheet_name in self.workbook.sheetnames:
                 _log.info(f"Processing sheet: {sheet_name}")

@@ -253,7 +252,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
         )

         for excel_cell in excel_table.data:
-
             cell = TableCell(
                 text=excel_cell.text,
                 row_span=excel_cell.row_span,

@@ -303,7 +301,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
         # Iterate over all cells in the sheet
         for ri, row in enumerate(sheet.iter_rows(values_only=False)):
             for rj, cell in enumerate(row):
-
                 # Skip empty or already visited cells
                 if cell.value is None or (ri, rj) in visited:
                     continue

@@ -342,7 +339,6 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
         visited_cells: set[tuple[int, int]] = set()
         for ri in range(start_row, max_row + 1):
             for rj in range(start_col, max_col + 1):
-
                 cell = sheet.cell(row=ri + 1, column=rj + 1)  # 1-based indexing

                 # Check if the cell belongs to a merged range

@@ -350,14 +346,12 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
                 col_span = 1

                 for merged_range in sheet.merged_cells.ranges:
-
                     if (
                         merged_range.min_row <= ri + 1
                         and ri + 1 <= merged_range.max_row
                         and merged_range.min_col <= rj + 1
                         and rj + 1 <= merged_range.max_col
                     ):
-
                         row_span = merged_range.max_row - merged_range.min_row + 1
                         col_span = merged_range.max_col - merged_range.min_col + 1
                         break

@@ -499,7 +493,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
                         ),
                     ),
                 )
-            except:
+            except Exception:
                 _log.error("could not extract the image from excel sheets")

         return doc
@@ -120,13 +120,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):

         return prov

-    def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
+    def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):  # noqa: C901
         is_a_list = False
         is_list_group_created = False
         enum_list_item_value = 0
         new_list = None
         bullet_type = "None"
-        list_text = ""
         list_label = GroupLabel.LIST
         doc_label = DocItemLabel.LIST_ITEM
         prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)

@@ -243,7 +242,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
                     enum_marker = str(enum_list_item_value) + "."
                 if not is_list_group_created:
                     new_list = doc.add_group(
-                        label=list_label, name=f"list", parent=parent_slide
+                        label=list_label, name="list", parent=parent_slide
                     )
                     is_list_group_created = True
                 doc.add_list_item(

@@ -368,11 +367,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
         slide_width = pptx_obj.slide_width
         slide_height = pptx_obj.slide_height

-        text_content = []  # type: ignore
-
         max_levels = 10
         parents = {}  # type: ignore
-        for i in range(0, max_levels):
+        for i in range(max_levels):
             parents[i] = None

         # Loop through each slide

@@ -383,7 +380,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBackend):
             )

             slide_size = Size(width=slide_width, height=slide_height)
-            parent_page = doc.add_page(page_no=slide_ind + 1, size=slide_size)
+            doc.add_page(page_no=slide_ind + 1, size=slide_size)

             def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size):
                 handle_groups(shape, parent_slide, slide_ind, doc, slide_size)
@@ -158,7 +158,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
     def _get_level(self) -> int:
         """Return the first None index."""
         for k, v in self.parents.items():
-            if k >= 0 and v == None:
+            if k >= 0 and v is None:
                 return k
         return 0

@@ -418,7 +418,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             else prev_parent
         )

-    def _handle_text_elements(
+    def _handle_text_elements(  # noqa: C901
         self,
         element: BaseOxmlElement,
         docx_obj: DocxDocument,

@@ -812,7 +812,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                 f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}"
             )
             if cell is None or cell._tc in cell_set:
-                _log.debug(f" skipped since repeated content")
+                _log.debug(" skipped since repeated content")
                 col_idx += cell.grid_span
                 continue
             else:

@@ -879,7 +879,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                     image=ImageRef.from_pil(image=pil_image, dpi=72),
                     caption=None,
                 )
-            except (UnidentifiedImageError, OSError) as e:
+            except (UnidentifiedImageError, OSError):
                 _log.warning("Warning: image cannot be loaded by Pillow")
                 doc.add_picture(
                     parent=self.parents[level - 1],
@@ -1,7 +1,8 @@
 from abc import ABC, abstractmethod
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import Iterable, Optional, Set, Union
+from typing import Optional, Set, Union

 from docling_core.types.doc import BoundingBox, Size
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell
@@ -1,8 +1,9 @@
 import logging
 import random
+from collections.abc import Iterable
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Iterable, List, Optional, Union
+from typing import TYPE_CHECKING, List, Optional, Union

 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c

@@ -29,7 +30,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
         self.valid = True  # No better way to tell from pypdfium.
         try:
             self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
-        except PdfiumError as e:
+        except PdfiumError:
             _log.info(
                 f"An exception occurred when loading page {page_no} of document {document_hash}.",
                 exc_info=True,

@@ -225,7 +226,6 @@ class PyPdfiumPageBackend(PdfPageBackend):
     def get_page_image(
         self, scale: float = 1, cropbox: Optional[BoundingBox] = None
     ) -> Image.Image:
-
         page_size = self.get_size()

         if not cropbox:
@@ -102,13 +102,13 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):

         doc_info: etree.DocInfo = self.tree.docinfo
         if doc_info.system_url and any(
-            [kwd in doc_info.system_url for kwd in JATS_DTD_URL]
+            kwd in doc_info.system_url for kwd in JATS_DTD_URL
         ):
             self.valid = True
             return
         for ent in doc_info.internalDTD.iterentities():
             if ent.system_url and any(
-                [kwd in ent.system_url for kwd in JATS_DTD_URL]
+                kwd in ent.system_url for kwd in JATS_DTD_URL
             ):
                 self.valid = True
                 return

@@ -232,10 +232,9 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
                 # TODO: once superscript is supported, add label with formatting
                 aff = aff.removeprefix(f"{label[0].text}, ")
             affiliation_names.append(aff)
-        affiliation_ids_names = {
-            id: name
-            for id, name in zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
-        }
+        affiliation_ids_names = dict(
+            zip(meta.xpath(".//aff[@id]/@id"), affiliation_names)
+        )

         # Get author names and affiliation names
         for author_node in meta.xpath(

@@ -300,7 +299,6 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
     def _add_abstract(
         self, doc: DoclingDocument, xml_components: XMLComponents
     ) -> None:
-
         for abstract in xml_components["abstract"]:
             text: str = abstract["content"]
             title: str = abstract["label"] or DEFAULT_HEADER_ABSTRACT

@@ -349,7 +347,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):

         return

-    def _parse_element_citation(self, node: etree._Element) -> str:
+    def _parse_element_citation(self, node: etree._Element) -> str:  # noqa: C901
         citation: Citation = {
             "author_names": "",
             "title": "",

@@ -440,7 +438,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
             citation["page"] = node.xpath("fpage")[0].text.replace("\n", " ").strip()
             if len(node.xpath("lpage")) > 0:
                 citation["page"] += (
-                    "–" + node.xpath("lpage")[0].text.replace("\n", " ").strip()
+                    "–" + node.xpath("lpage")[0].text.replace("\n", " ").strip()  # noqa: RUF001
                 )

         # Flatten the citation to string

@@ -595,9 +593,8 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):

             try:
                 self._add_table(doc, parent, table)
-            except Exception as e:
-                _log.warning(f"Skipping unsupported table in {str(self.file)}")
-                pass
+            except Exception:
+                _log.warning(f"Skipping unsupported table in {self.file!s}")

         return

@@ -609,7 +606,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
         )
         return

-    def _walk_linear(
+    def _walk_linear(  # noqa: C901
         self, doc: DoclingDocument, parent: NodeItem, node: etree._Element
     ) -> str:
         skip_tags = ["term"]
@@ -122,7 +122,6 @@ class PatentUsptoDocumentBackend(DeclarativeDocumentBackend):

     @override
     def convert(self) -> DoclingDocument:
-
         if self.parser is not None:
             doc = self.parser.parse(self.patent_content)
             if doc is None:

@@ -163,7 +162,6 @@ class PatentUspto(ABC):
         Returns:
             The patent parsed as a docling document.
         """
-        pass


 class PatentUsptoIce(PatentUspto):

@@ -265,7 +263,7 @@ class PatentUsptoIce(PatentUspto):
             self.style_html = HtmlEntity()

         @override
-        def startElement(self, tag, attributes):  # noqa: N802
+        def startElement(self, tag, attributes):
            """Signal the start of an element.

             Args:

@@ -281,7 +279,7 @@ class PatentUsptoIce(PatentUspto):
             self._start_registered_elements(tag, attributes)

         @override
-        def skippedEntity(self, name):  # noqa: N802
+        def skippedEntity(self, name):
             """Receive notification of a skipped entity.

             HTML entities will be skipped by the parser. This method will unescape them

@@ -315,7 +313,7 @@ class PatentUsptoIce(PatentUspto):
             self.text += unescaped

         @override
-        def endElement(self, tag):  # noqa: N802
+        def endElement(self, tag):
             """Signal the end of an element.

             Args:

@@ -603,7 +601,7 @@ class PatentUsptoGrantV2(PatentUspto):
             self.style_html = HtmlEntity()

         @override
-        def startElement(self, tag, attributes):  # noqa: N802
+        def startElement(self, tag, attributes):
             """Signal the start of an element.

             Args:

@@ -616,7 +614,7 @@ class PatentUsptoGrantV2(PatentUspto):
             self._start_registered_elements(tag, attributes)

         @override
-        def skippedEntity(self, name):  # noqa: N802
+        def skippedEntity(self, name):
             """Receive notification of a skipped entity.

             HTML entities will be skipped by the parser. This method will unescape them

@@ -650,7 +648,7 @@ class PatentUsptoGrantV2(PatentUspto):
             self.text += unescaped

         @override
-        def endElement(self, tag):  # noqa: N802
+        def endElement(self, tag):
             """Signal the end of an element.

             Args:

@@ -691,7 +689,7 @@ class PatentUsptoGrantV2(PatentUspto):
             if tag in [member.value for member in self.Element]:
                 if (
                     tag == self.Element.HEADING.value
-                    and not self.Element.SDOCL.value in self.property
+                    and self.Element.SDOCL.value not in self.property
                 ):
                     level_attr: str = attributes.get("LVL", "")
                     new_level: int = int(level_attr) if level_attr.isnumeric() else 1

@@ -743,7 +741,7 @@ class PatentUsptoGrantV2(PatentUspto):
                 # headers except claims statement
                 elif (
                     self.Element.HEADING.value in self.property
-                    and not self.Element.SDOCL.value in self.property
+                    and self.Element.SDOCL.value not in self.property
                     and text.strip()
                 ):
                     self.parents[self.level + 1] = self.doc.add_heading(

@@ -1164,7 +1162,7 @@ class PatentUsptoAppV1(PatentUspto):
             self.style_html = HtmlEntity()

         @override
-        def startElement(self, tag, attributes):  # noqa: N802
+        def startElement(self, tag, attributes):
             """Signal the start of an element.

             Args:

@@ -1177,7 +1175,7 @@ class PatentUsptoAppV1(PatentUspto):
             self._start_registered_elements(tag, attributes)

         @override
-        def skippedEntity(self, name):  # noqa: N802
+        def skippedEntity(self, name):
             """Receive notification of a skipped entity.

             HTML entities will be skipped by the parser. This method will unescape them

@@ -1211,7 +1209,7 @@ class PatentUsptoAppV1(PatentUspto):
             self.text += unescaped

         @override
-        def endElement(self, tag):  # noqa: N802
+        def endElement(self, tag):
             """Signal the end of an element.

             Args:

@@ -1474,9 +1472,7 @@ class XmlTable:
                 if cw == 0:
                     offset_w0.append(col["offset"][ic])

-            min_colinfo["offset"] = sorted(
-                list(set(col["offset"] + min_colinfo["offset"]))
-            )
+            min_colinfo["offset"] = sorted(set(col["offset"] + min_colinfo["offset"]))

         # add back the 0 width cols to offset list
         offset_w0 = list(set(offset_w0))

@@ -1527,7 +1523,7 @@ class XmlTable:

         return ncols_max

-    def _parse_table(self, table: Tag) -> TableData:
+    def _parse_table(self, table: Tag) -> TableData:  # noqa: C901
         """Parse the content of a table tag.

         Args:

@@ -1722,7 +1718,7 @@ class HtmlEntity:
         "0": "⁰",
         "+": "⁺",
         "-": "⁻",
-        "−": "⁻",
+        "−": "⁻",  # noqa: RUF001
         "=": "⁼",
         "(": "⁽",
         ")": "⁾",

@@ -1746,7 +1742,7 @@ class HtmlEntity:
         "0": "₀",
         "+": "₊",
         "-": "₋",
-        "−": "₋",
+        "−": "₋",  # noqa: RUF001
         "=": "₌",
         "(": "₍",
         ")": "₎",
@@ -6,14 +6,16 @@ import sys
 import tempfile
 import time
 import warnings
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Annotated, Dict, Iterable, List, Optional, Type
+from typing import Annotated, Dict, List, Optional, Type

 import rich.table
 import typer
 from docling_core.types.doc import ImageRefMode
 from docling_core.utils.file import resolve_source_to_path
 from pydantic import TypeAdapter
+from rich.console import Console

 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend

@@ -53,7 +55,6 @@ warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|
 warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")

 _log = logging.getLogger(__name__)
-from rich.console import Console

 console = Console()
 err_console = Console(stderr=True)

@@ -160,7 +161,6 @@ def export_documents(
     export_doctags: bool,
     image_export_mode: ImageRefMode,
 ):
-
     success_count = 0
     failure_count = 0

@@ -233,7 +233,7 @@ def _split_list(raw: Optional[str]) -> Optional[List[str]]:


 @app.command(no_args_is_help=True)
-def convert(
+def convert(  # noqa: C901
     input_sources: Annotated[
         List[str],
         typer.Argument(

@@ -289,7 +289,7 @@ def convert(
             ...,
             help=(
                 f"The OCR engine to use. When --allow-external-plugins is *not* set, the available values are: "
-                f"{', '.join((o.value for o in ocr_engines_enum_internal))}. "
+                f"{', '.join(o.value for o in ocr_engines_enum_internal)}. "
                 f"Use the option --show-external-plugins to see the options allowed with external plugins."
             ),
         ),

@@ -430,7 +430,7 @@ def convert(
     settings.debug.visualize_ocr = debug_visualize_ocr

     if from_formats is None:
-        from_formats = [e for e in InputFormat]
+        from_formats = list(InputFormat)

     parsed_headers: Optional[Dict[str, str]] = None
     if headers is not None:
@@ -62,7 +62,7 @@ def download(
     models: Annotated[
         Optional[list[_AvailableModels]],
         typer.Argument(
-            help=f"Models to download (default behavior: a predefined set of models will be downloaded).",
+            help="Models to download (default behavior: a predefined set of models will be downloaded).",
         ),
     ] = None,
     all: Annotated[

@@ -89,14 +89,13 @@ def download(
             "Cannot simultaneously set 'all' parameter and specify models to download."
         )
     if not quiet:
-        FORMAT = "%(message)s"
         logging.basicConfig(
             level=logging.INFO,
             format="[blue]%(message)s[/blue]",
             datefmt="[%X]",
             handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
         )
-    to_download = models or ([m for m in _AvailableModels] if all else _default_models)
+    to_download = models or (list(_AvailableModels) if all else _default_models)
     output_dir = download_models(
         output_dir=output_dir,
         force=force,
@@ -10,7 +10,9 @@ from docling_core.types.doc import (
     TableCell,
 )
 from docling_core.types.doc.page import SegmentedPdfPage, TextCell
-from docling_core.types.io import (  # DO ΝΟΤ REMOVE; explicitly exposed from this location
+
+# DO NOT REMOVE; explicitly exposed from this location
+from docling_core.types.io import (
     DocumentStream,
 )
 from PIL.Image import Image

@@ -233,9 +235,9 @@ class Page(BaseModel):
         None  # Internal PDF backend. By default it is cleared during assembling.
     )
     _default_image_scale: float = 1.0  # Default image scale for external usage.
-    _image_cache: Dict[float, Image] = (
-        {}
-    )  # Cache of images in different scales. By default it is cleared during assembling.
+    _image_cache: Dict[
+        float, Image
+    ] = {}  # Cache of images in different scales. By default it is cleared during assembling.

     def get_image(
         self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None

@@ -243,7 +245,7 @@ class Page(BaseModel):
         if self._backend is None:
             return self._image_cache.get(scale, None)

-        if not scale in self._image_cache:
+        if scale not in self._image_cache:
             if cropbox is None:
                 self._image_cache[scale] = self._backend.get_page_image(scale=scale)
             else:
@@ -1,13 +1,13 @@
 import csv
 import logging
 import re
+from collections.abc import Iterable
 from enum import Enum
 from io import BytesIO
 from pathlib import Path, PurePath
 from typing import (
     TYPE_CHECKING,
     Dict,
-    Iterable,
     List,
     Literal,
     Optional,
@@ -17,6 +17,8 @@ from typing import (
 )
 
 import filetype
+
+# DO NOT REMOVE; explicitly exposed from this location
 from docling_core.types.doc import (
     DocItem,
     DocItemLabel,
@@ -35,14 +37,14 @@ from docling_core.types.legacy_doc.base import (
     PageReference,
     Prov,
     Ref,
+    Table as DsSchemaTable,
+    TableCell,
 )
-from docling_core.types.legacy_doc.base import Table as DsSchemaTable
-from docling_core.types.legacy_doc.base import TableCell
 from docling_core.types.legacy_doc.document import (
     CCSDocumentDescription as DsDocumentDescription,
+    CCSFileInfoObject as DsFileInfoObject,
+    ExportedCCSDocument as DsDocument,
 )
-from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
-from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
 from docling_core.utils.file import resolve_source_to_stream
 from docling_core.utils.legacy import docling_document_to_legacy
 from pydantic import BaseModel
@@ -65,7 +67,7 @@ from docling.datamodel.base_models import (
 )
 from docling.datamodel.settings import DocumentLimits
 from docling.utils.profiling import ProfilingItem
-from docling.utils.utils import create_file_hash, create_hash
+from docling.utils.utils import create_file_hash
 
 if TYPE_CHECKING:
     from docling.document_converter import FormatOption
@@ -134,9 +136,9 @@ class InputDocument(BaseModel):
                 self._init_doc(backend, path_or_stream)
 
             elif isinstance(path_or_stream, BytesIO):
-                assert (
-                    filename is not None
-                ), "Can't construct InputDocument from stream without providing filename arg."
+                assert filename is not None, (
+                    "Can't construct InputDocument from stream without providing filename arg."
+                )
                 self.file = PurePath(filename)
                 self.filesize = path_or_stream.getbuffer().nbytes
 
@@ -228,7 +230,6 @@ class _DummyBackend(AbstractDocumentBackend):
 
 
 class _DocumentConversionInput(BaseModel):
-
     path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
     headers: Optional[Dict[str, str]] = None
     limits: Optional[DocumentLimits] = DocumentLimits()
@@ -380,7 +380,6 @@ class PaginatedPipelineOptions(PipelineOptions):
 
 
 class VlmPipelineOptions(PaginatedPipelineOptions):
-
    generate_page_images: bool = True
    force_backend_text: bool = (
        False  # (To be used with vlms, or other generative models)
@@ -1,11 +1,11 @@
 import hashlib
 import logging
-import math
 import sys
 import time
+from collections.abc import Iterable, Iterator
 from functools import partial
 from pathlib import Path
-from typing import Dict, Iterable, Iterator, List, Optional, Tuple, Type, Union
+from typing import Dict, List, Optional, Tuple, Type, Union
 
 from pydantic import BaseModel, ConfigDict, model_validator, validate_call
 
@@ -172,7 +172,7 @@ class DocumentConverter:
         format_options: Optional[Dict[InputFormat, FormatOption]] = None,
     ):
         self.allowed_formats = (
-            allowed_formats if allowed_formats is not None else [e for e in InputFormat]
+            allowed_formats if allowed_formats is not None else list(InputFormat)
         )
         self.format_to_options = {
             format: (
@@ -254,7 +254,7 @@ class DocumentConverter:
 
         if not had_result and raises_on_error:
             raise ConversionError(
-                f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
+                "Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
             )
 
     def _convert(
@@ -266,7 +266,7 @@ class DocumentConverter:
             conv_input.docs(self.format_to_options),
             settings.perf.doc_batch_size,  # pass format_options
         ):
-            _log.info(f"Going to convert document batch...")
+            _log.info("Going to convert document batch...")
 
             # parallel processing only within input_batch
             # with ThreadPoolExecutor(
@@ -1,4 +1,4 @@
-from typing import Iterable
+from collections.abc import Iterable
 
 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
@@ -10,7 +10,6 @@ from docling.utils.profiling import TimeRecorder
 
 
 class ApiVlmModel(BasePageModel):
-
     def __init__(
         self,
         enabled: bool,
@@ -1,5 +1,6 @@
 from abc import ABC, abstractmethod
-from typing import Any, Generic, Iterable, Optional, Protocol, Type
+from collections.abc import Iterable
+from typing import Generic, Optional, Protocol, Type
 
 from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
 from typing_extensions import TypeVar
@@ -29,7 +30,6 @@ EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)
 
 
 class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
-
     elements_batch_size: int = settings.perf.elements_batch_size
 
     @abstractmethod
@@ -50,7 +50,6 @@ class GenericEnrichmentModel(ABC, Generic[EnrichElementT]):
 
 
 class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]):
-
     def prepare_element(
         self, conv_res: ConversionResult, element: NodeItem
     ) -> Optional[NodeItem]:
@@ -62,7 +61,6 @@ class BaseEnrichmentModel(GenericEnrichmentModel[NodeItem]):
 class BaseItemAndImageEnrichmentModel(
     GenericEnrichmentModel[ItemAndImageEnrichmentElement]
 ):
-
     images_scale: float
     expansion_factor: float = 0.0
 
@@ -1,12 +1,12 @@
 import copy
 import logging
 from abc import abstractmethod
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, List, Optional, Type
+from typing import List, Optional, Type
 
 import numpy as np
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_core.types.doc.page import BoundingRectangle, PdfTextCell, TextCell
 from PIL import Image, ImageDraw
 from rtree import index
 from scipy.ndimage import binary_dilation, find_objects, label
@@ -1,7 +1,8 @@
 import re
 from collections import Counter
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, List, Literal, Optional, Tuple, Union
+from typing import List, Literal, Optional, Tuple, Union
 
 import numpy as np
 from docling_core.types.doc import (
@@ -1,5 +1,6 @@
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, List, Literal, Optional, Tuple, Union
+from typing import List, Literal, Optional, Union
 
 import numpy as np
 from docling_core.types.doc import (
@@ -1,8 +1,9 @@
 import logging
 import warnings
 import zipfile
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, List, Optional, Type
+from typing import List, Optional, Type
 
 import numpy
 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -58,12 +59,10 @@ class EasyOcrModel(BaseOcrModel):
                 device = decide_device(accelerator_options.device)
                 # Enable easyocr GPU if running on CUDA, MPS
                 use_gpu = any(
-                    [
-                        device.startswith(x)
-                        for x in [
-                            AcceleratorDevice.CUDA.value,
-                            AcceleratorDevice.MPS.value,
-                        ]
+                    device.startswith(x)
+                    for x in [
+                        AcceleratorDevice.CUDA.value,
+                        AcceleratorDevice.MPS.value,
                     ]
                 )
             else:
@@ -98,8 +97,10 @@ class EasyOcrModel(BaseOcrModel):
         progress: bool = False,
     ) -> Path:
         # Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
-        from easyocr.config import detection_models as det_models_dict
-        from easyocr.config import recognition_models as rec_models_dict
+        from easyocr.config import (
+            detection_models as det_models_dict,
+            recognition_models as rec_models_dict,
+        )
 
         if local_dir is None:
             local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder
@@ -126,13 +127,11 @@ class EasyOcrModel(BaseOcrModel):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
-
         if not self.enabled:
             yield from page_batch
             return
 
         for page in page_batch:
-
             assert page._backend is not None
             if not page._backend.is_valid():
                 yield page
@@ -9,7 +9,7 @@ from docling.models.factories.picture_description_factory import (
 logger = logging.getLogger(__name__)
 
 
-@lru_cache()
+@lru_cache
 def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
     factory = OcrFactory()
     factory.load_from_plugins(allow_external_plugins=allow_external_plugins)
@@ -17,7 +17,7 @@ def get_ocr_factory(allow_external_plugins: bool = False) -> OcrFactory:
     return factory
 
 
-@lru_cache()
+@lru_cache
 def get_picture_description_factory(
     allow_external_plugins: bool = False,
 ) -> PictureDescriptionFactory:
@@ -33,7 +33,7 @@ class BaseFactory(Generic[A], metaclass=ABCMeta):
 
     @property
     def registered_kind(self) -> list[str]:
-        return list(opt.kind for opt in self._classes.keys())
+        return [opt.kind for opt in self._classes.keys()]
 
     def get_enum(self) -> enum.Enum:
         return enum.Enum(
@@ -1,25 +1,22 @@
 import logging
 import time
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, List, Optional
+from typing import Optional
 
 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
-    AcceleratorDevice,
     AcceleratorOptions,
     HuggingFaceVlmOptions,
 )
-from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
-from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
 
 _log = logging.getLogger(__name__)
 
 
 class HuggingFaceMlxModel(BasePageModel):
-
     def __init__(
         self,
         enabled: bool,
@@ -32,7 +29,6 @@ class HuggingFaceMlxModel(BasePageModel):
         self.vlm_options = vlm_options
 
         if self.enabled:
-
             try:
                 from mlx_vlm import generate, load  # type: ignore
                 from mlx_vlm.prompt_utils import apply_chat_template  # type: ignore
@@ -125,6 +121,8 @@ class HuggingFaceMlxModel(BasePageModel):
                     generation_time = time.time() - start_time
                     page_tags = output
 
+                    _log.debug(f"Generation time {generation_time:.2f} seconds.")
+
                     # inference_time = time.time() - start_time
                     # tokens_per_second = num_tokens / generation_time
                     # print("")
@@ -1,16 +1,15 @@
 import logging
 import time
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, List, Optional
+from typing import Optional
 
 from docling.datamodel.base_models import Page, VlmPrediction
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
-    AcceleratorDevice,
     AcceleratorOptions,
     HuggingFaceVlmOptions,
 )
-from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import TimeRecorder
@@ -19,7 +18,6 @@ _log = logging.getLogger(__name__)
 
 
 class HuggingFaceVlmModel(BasePageModel):
-
     def __init__(
         self,
         enabled: bool,
@@ -42,7 +40,7 @@ class HuggingFaceVlmModel(BasePageModel):
         device = decide_device(accelerator_options.device)
         self.device = device
 
-        _log.debug("Available device for HuggingFace VLM: {}".format(device))
+        _log.debug(f"Available device for HuggingFace VLM: {device}")
 
         repo_cache_folder = vlm_options.repo_id.replace("/", "--")
 
@@ -168,6 +166,10 @@ class HuggingFaceVlmModel(BasePageModel):
                     num_tokens = len(generated_ids[0])
                     page_tags = generated_texts
 
+                    _log.debug(
+                        f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
+                    )
+
                    # inference_time = time.time() - start_time
                    # tokens_per_second = num_tokens / generation_time
                    # print("")
@@ -1,8 +1,9 @@
 import copy
 import logging
 import warnings
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Union
+from typing import Optional
 
 from docling_core.types.doc import DocItemLabel
 from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
@@ -142,7 +143,6 @@ class LayoutModel(BasePageModel):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
-
         for page in page_batch:
             assert page._backend is not None
             if not page._backend.is_valid():
@@ -1,8 +1,9 @@
 import logging
 import sys
 import tempfile
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Tuple, Type
+from typing import Optional, Type
 
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import BoundingRectangle, TextCell
@@ -41,7 +42,7 @@ class OcrMacModel(BaseOcrModel):
 
         if self.enabled:
             if "darwin" != sys.platform:
-                raise RuntimeError(f"OcrMac is only supported on Mac.")
+                raise RuntimeError("OcrMac is only supported on Mac.")
             install_errmsg = (
                 "ocrmac is not correctly installed. "
                 "Please install it via `pip install ocrmac` to use this OCR engine. "
@@ -58,7 +59,6 @@ class OcrMacModel(BaseOcrModel):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
-
        if not self.enabled:
            yield from page_batch
            return
@@ -69,7 +69,6 @@ class OcrMacModel(BaseOcrModel):
                 yield page
             else:
                 with TimeRecorder(conv_res, "ocr"):
-
                     ocr_rects = self.get_ocr_rects(page)
 
                     all_ocr_cells = []
@@ -1,6 +1,7 @@
 import logging
 import re
-from typing import Iterable, List
+from collections.abc import Iterable
+from typing import List
 
 from pydantic import BaseModel
 
@@ -53,9 +54,9 @@ class PageAssembleModel(BasePageModel):
         sanitized_text = "".join(lines)
 
         # Text normalization
-        sanitized_text = sanitized_text.replace("⁄", "/")
-        sanitized_text = sanitized_text.replace("’", "'")
-        sanitized_text = sanitized_text.replace("‘", "'")
+        sanitized_text = sanitized_text.replace("⁄", "/")  # noqa: RUF001
+        sanitized_text = sanitized_text.replace("’", "'")  # noqa: RUF001
+        sanitized_text = sanitized_text.replace("‘", "'")  # noqa: RUF001
         sanitized_text = sanitized_text.replace("“", '"')
         sanitized_text = sanitized_text.replace("”", '"')
         sanitized_text = sanitized_text.replace("•", "·")
@@ -71,7 +72,6 @@ class PageAssembleModel(BasePageModel):
                 yield page
             else:
                 with TimeRecorder(conv_res, "page_assemble"):
-
                     assert page.predictions.layout is not None
 
                     # assembles some JSON output page by page.
@@ -83,7 +83,6 @@ class PageAssembleModel(BasePageModel):
                     for cluster in page.predictions.layout.clusters:
                         # _log.info("Cluster label seen:", cluster.label)
                         if cluster.label in LayoutModel.TEXT_ELEM_LABELS:
-
                             textlines = [
                                 cell.text.replace("\x02", "-").strip()
                                 for cell in cluster.cells
@@ -109,9 +108,7 @@ class PageAssembleModel(BasePageModel):
                             tbl = page.predictions.tablestructure.table_map.get(
                                 cluster.id, None
                             )
-                            if (
-                                not tbl
-                            ):  # fallback: add table without structure, if it isn't present
+                            if not tbl:  # fallback: add table without structure, if it isn't present
                                 tbl = Table(
                                     label=cluster.label,
                                     id=cluster.id,
@@ -130,9 +127,7 @@ class PageAssembleModel(BasePageModel):
                             fig = page.predictions.figures_classification.figure_map.get(
                                 cluster.id, None
                             )
-                            if (
-                                not fig
-                            ):  # fallback: add figure without classification, if it isn't present
+                            if not fig:  # fallback: add figure without classification, if it isn't present
                                 fig = FigureElement(
                                     label=cluster.label,
                                     id=cluster.id,
@@ -1,5 +1,6 @@
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional
+from typing import Optional
 
 from PIL import ImageDraw
 from pydantic import BaseModel
@@ -1,5 +1,6 @@
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Type, Union
+from typing import Optional, Type, Union
 
 from PIL import Image
 
@@ -1,12 +1,11 @@
-import logging
 from abc import abstractmethod
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Any, Iterable, List, Optional, Type, Union
+from typing import List, Optional, Type, Union
 
 from docling_core.types.doc import (
     DoclingDocument,
     NodeItem,
-    PictureClassificationClass,
     PictureItem,
 )
 from docling_core.types.doc.document import (  # TODO: move import to docling_core.types.doc
@@ -1,5 +1,6 @@
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Type, Union
+from typing import Optional, Type, Union
 
 from PIL import Image
 
@@ -13,7 +14,6 @@ from docling.utils.accelerator_utils import decide_device
 
 
 class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
-
     @classmethod
     def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
         return PictureDescriptionVlmOptions
@@ -36,7 +36,6 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
         self.options: PictureDescriptionVlmOptions
 
         if self.enabled:
-
             if artifacts_path is None:
                 artifacts_path = self.download_models(repo_id=self.options.repo_id)
             else:
@@ -1,6 +1,7 @@
 import logging
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Type
+from typing import Optional, Type
 
 import numpy
 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -74,13 +75,11 @@ class RapidOcrModel(BaseOcrModel):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
-
        if not self.enabled:
            yield from page_batch
            return
 
        for page in page_batch:
-
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
@@ -1,12 +1,7 @@
-import copy
-import random
 from pathlib import Path
 from typing import Dict, List
 
 from docling_core.types.doc import (
-    BoundingBox,
-    CoordOrigin,
-    DocItem,
     DocItemLabel,
     DoclingDocument,
     DocumentOrigin,
@@ -17,13 +12,10 @@ from docling_core.types.doc import (
     TableData,
 )
 from docling_core.types.doc.document import ContentLayer
-from docling_core.types.legacy_doc.base import Ref
-from docling_core.types.legacy_doc.document import BaseText
 from docling_ibm_models.reading_order.reading_order_rb import (
     PageElement as ReadingOrderPageElement,
+    ReadingOrderPredictor,
 )
-from docling_ibm_models.reading_order.reading_order_rb import ReadingOrderPredictor
-from PIL import ImageDraw
 from pydantic import BaseModel, ConfigDict
 
 from docling.datamodel.base_models import (
@@ -35,7 +27,6 @@ from docling.datamodel.base_models import (
     TextElement,
 )
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.settings import settings
 from docling.utils.profiling import ProfilingScope, TimeRecorder
 
 
@@ -53,12 +44,10 @@ class ReadingOrderModel:
     def _assembled_to_readingorder_elements(
         self, conv_res: ConversionResult
     ) -> List[ReadingOrderPageElement]:
-
         elements: List[ReadingOrderPageElement] = []
         page_no_to_pages = {p.page_no: p for p in conv_res.pages}
 
         for element in conv_res.assembled.elements:
-
             page_height = page_no_to_pages[element.page_no].size.height  # type: ignore
             bbox = element.cluster.bbox.to_bottom_left_origin(page_height)
             text = element.text or ""
@@ -84,7 +73,6 @@ class ReadingOrderModel:
     def _add_child_elements(
         self, element: BasePageElement, doc_item: NodeItem, doc: DoclingDocument
     ):
-
         child: Cluster
         for child in element.cluster.children:
             c_label = child.label
@@ -110,7 +98,7 @@ class ReadingOrderModel:
             else:
                 doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)
 
-    def _readingorder_elements_to_docling_doc(
+    def _readingorder_elements_to_docling_doc(  # noqa: C901
         self,
         conv_res: ConversionResult,
         ro_elements: List[ReadingOrderPageElement],
@@ -118,7 +106,6 @@ class ReadingOrderModel:
         el_to_footnotes_mapping: Dict[int, List[int]],
         el_merges_mapping: Dict[int, List[int]],
     ) -> DoclingDocument:
-
         id_to_elem = {
             RefItem(cref=f"#/{elem.page_no}/{elem.cluster.id}").cref: elem
             for elem in conv_res.assembled.elements
@@ -192,7 +179,6 @@ class ReadingOrderModel:
 
                         code_item.footnotes.append(new_footnote_item.get_ref())
                 else:
-
                     new_item, current_list = self._handle_text_element(
                         element, out_doc, current_list, page_height
                     )
@@ -206,7 +192,6 @@ class ReadingOrderModel:
                     )
 
             elif isinstance(element, Table):
-
                 tbl_data = TableData(
                     num_rows=element.num_rows,
                     num_cols=element.num_cols,
@@ -342,12 +327,12 @@ class ReadingOrderModel:
         return new_item, current_list
 
     def _merge_elements(self, element, merged_elem, new_item, page_height):
-        assert isinstance(
-            merged_elem, type(element)
-        ), "Merged element must be of same type as element."
-        assert (
-            merged_elem.label == new_item.label
-        ), "Labels of merged elements must match."
+        assert isinstance(merged_elem, type(element)), (
+            "Merged element must be of same type as element."
+        )
+        assert merged_elem.label == new_item.label, (
+            "Labels of merged elements must match."
+        )
         prov = ProvenanceItem(
             page_no=element.page_no + 1,
             charspan=(
@@ -1,13 +1,13 @@
 import copy
 import warnings
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Union
+from typing import Optional
 
 import numpy
 from docling_core.types.doc import BoundingBox, DocItemLabel, TableCell
 from docling_core.types.doc.page import (
     BoundingRectangle,
-    SegmentedPdfPage,
     TextCellUnit,
 )
 from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
@@ -44,7 +44,6 @@ class TableStructureModel(BasePageModel):
 
         self.enabled = enabled
         if self.enabled:
-
             if artifacts_path is None:
                 artifacts_path = self.download_models() / self._model_path
             else:
@@ -175,7 +174,6 @@ class TableStructureModel(BasePageModel):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
-
        if not self.enabled:
            yield from page_batch
            return
@@ -186,7 +184,6 @@ class TableStructureModel(BasePageModel):
                 yield page
             else:
                 with TimeRecorder(conv_res, "table_structure"):
-
                     assert page.predictions.layout is not None
                     assert page.size is not None
 
@@ -260,7 +257,6 @@ class TableStructureModel(BasePageModel):
                     table_out = tf_output[0]
                     table_cells = []
                     for element in table_out["tf_responses"]:
-
                         if not self.do_cell_matching:
                             the_bbox = BoundingBox.model_validate(
                                 element["bbox"]
@@ -3,9 +3,10 @@ import io
 import logging
 import os
 import tempfile
+from collections.abc import Iterable
 from pathlib import Path
 from subprocess import DEVNULL, PIPE, Popen
-from typing import Iterable, List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type
 
 import pandas as pd
 from docling_core.types.doc import BoundingBox, CoordOrigin
@@ -63,8 +64,7 @@ class TesseractOcrCliModel(BaseOcrModel):
         )
 
     def _get_name_and_version(self) -> Tuple[str, str]:
-
-        if self._name != None and self._version != None:
+        if self._name is not None and self._version is not None:
             return self._name, self._version  # type: ignore
 
         cmd = [self.options.tesseract_cmd, "--version"]
@@ -125,14 +125,16 @@ class TesseractOcrCliModel(BaseOcrModel):
         # _log.info(decoded_data)
 
         # Read the TSV file generated by Tesseract
-        df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t")
+        df_result = pd.read_csv(
+            io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t"
+        )
 
         # Display the dataframe (optional)
         # _log.info("df: ", df.head())
 
         # Filter rows that contain actual text (ignore header or empty rows)
-        df_filtered = df[
-            df["text"].notnull() & (df["text"].apply(str).str.strip() != "")
+        df_filtered = df_result[
+            df_result["text"].notna() & (df_result["text"].apply(str).str.strip() != "")
         ]
 
         return df_filtered
@@ -149,10 +151,10 @@ class TesseractOcrCliModel(BaseOcrModel):
         proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
         output, _ = proc.communicate()
         decoded_data = output.decode("utf-8")
-        df = pd.read_csv(
+        df_detected = pd.read_csv(
             io.StringIO(decoded_data), sep=":", header=None, names=["key", "value"]
         )
-        scripts = df.loc[df["key"] == "Script"].value.tolist()
+        scripts = df_detected.loc[df_detected["key"] == "Script"].value.tolist()
         if len(scripts) == 0:
             _log.warning("Tesseract cannot detect the script of the page")
             return None
@@ -183,11 +185,11 @@ class TesseractOcrCliModel(BaseOcrModel):
         proc = Popen(cmd, stdout=PIPE, stderr=DEVNULL)
         output, _ = proc.communicate()
         decoded_data = output.decode("utf-8")
-        df = pd.read_csv(io.StringIO(decoded_data), header=None)
-        self._tesseract_languages = df[0].tolist()[1:]
+        df_list = pd.read_csv(io.StringIO(decoded_data), header=None)
+        self._tesseract_languages = df_list[0].tolist()[1:]
 
         # Decide the script prefix
-        if any([l.startswith("script/") for l in self._tesseract_languages]):
+        if any(lang.startswith("script/") for lang in self._tesseract_languages):
             script_prefix = "script/"
         else:
             script_prefix = ""
@@ -197,7 +199,6 @@ class TesseractOcrCliModel(BaseOcrModel):
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
-
        if not self.enabled:
            yield from page_batch
            return
@@ -225,19 +226,19 @@ class TesseractOcrCliModel(BaseOcrModel):
                             fname = image_file.name
                             high_res_image.save(image_file)
 
-                        df = self._run_tesseract(fname)
+                        df_result = self._run_tesseract(fname)
                     finally:
                         if os.path.exists(fname):
                             os.remove(fname)
 
-                    # _log.info(df)
+                    # _log.info(df_result)
 
                     # Print relevant columns (bounding box and text)
-                    for ix, row in df.iterrows():
+                    for ix, row in df_result.iterrows():
                         text = row["text"]
                         conf = row["conf"]
 
-                        l = float(row["left"])
+                        l = float(row["left"])  # noqa: E741
                         b = float(row["top"])
                         w = float(row["width"])
                         h = float(row["height"])
@@ -1,6 +1,7 @@
 import logging
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable, Optional, Type
+from typing import Optional, Type
 
 from docling_core.types.doc import BoundingBox, CoordOrigin
 from docling_core.types.doc.page import BoundingRectangle, TextCell
@@ -37,9 +38,6 @@ class TesseractOcrModel(BaseOcrModel):
         self.options: TesseractOcrOptions
 
         self.scale = 3  # multiplier for 72 dpi == 216 dpi.
-        self.reader = None
-        self.osd_reader = None
-        self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
 
         if self.enabled:
             install_errmsg = (
@@ -64,7 +62,7 @@ class TesseractOcrModel(BaseOcrModel):
                 raise ImportError(install_errmsg)
             try:
                 tesseract_version = tesserocr.tesseract_version()
-            except:
+            except Exception:
                 raise ImportError(install_errmsg)
 
             _, self._tesserocr_languages = tesserocr.get_languages()
@@ -75,7 +73,7 @@ class TesseractOcrModel(BaseOcrModel):
             _log.debug("Initializing TesserOCR: %s", tesseract_version)
             lang = "+".join(self.options.lang)
 
-            if any([l.startswith("script/") for l in self._tesserocr_languages]):
+            if any(lang.startswith("script/") for lang in self._tesserocr_languages):
                 self.script_prefix = "script/"
             else:
                 self.script_prefix = ""
@@ -86,6 +84,10 @@ class TesseractOcrModel(BaseOcrModel):
                 "oem": tesserocr.OEM.DEFAULT,
             }
 
+            self.reader = None
+            self.osd_reader = None
+            self.script_readers: dict[str, tesserocr.PyTessBaseAPI] = {}
+
             if self.options.path is not None:
                 tesserocr_kwargs["path"] = self.options.path
 
@@ -3,9 +3,10 @@ import logging
 import time
 import traceback
 from abc import ABC, abstractmethod
-from typing import Any, Callable, Iterable, List
+from collections.abc import Iterable
+from typing import Any, Callable, List
 
-from docling_core.types.doc import DoclingDocument, NodeItem
+from docling_core.types.doc import NodeItem
 
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
@@ -64,7 +65,6 @@ class BasePipeline(ABC):
         return conv_res
 
     def _enrich_document(self, conv_res: ConversionResult) -> ConversionResult:
-
         def _prepare_elements(
             conv_res: ConversionResult, model: GenericEnrichmentModel[Any]
         ) -> Iterable[NodeItem]:
@@ -113,7 +113,6 @@ class BasePipeline(ABC):
 
 
 class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
-
     def __init__(self, pipeline_options: PipelineOptions):
         super().__init__(pipeline_options)
         self.keep_backend = False
@@ -127,7 +126,6 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
         yield from page_batch
 
     def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
-
         if not isinstance(conv_res.input._backend, PdfDocumentBackend):
             raise RuntimeError(
                 f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a PDF backend. "
@@ -139,8 +137,7 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
 
         total_elapsed_time = 0.0
         with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
-
-            for i in range(0, conv_res.input.page_count):
+            for i in range(conv_res.input.page_count):
                 start_page, end_page = conv_res.input.limits.page_range
                 if (start_page - 1) <= i <= (end_page - 1):
                     conv_res.pages.append(Page(page_no=i))
@@ -161,7 +158,6 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
                 pipeline_pages = self._apply_on_pages(conv_res, init_pages)
 
                 for p in pipeline_pages:  # Must exhaust!
-
                     # Cleanup cached images
                     if not self.keep_images:
                         p._image_cache = {}
@@ -24,7 +24,6 @@ class SimplePipeline(BasePipeline):
         super().__init__(pipeline_options)
 
     def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
-
         if not isinstance(conv_res.input._backend, DeclarativeDocumentBackend):
             raise RuntimeError(
                 f"The selected backend {type(conv_res.input._backend).__name__} for {conv_res.input.file} is not a declarative backend. "
@@ -1,5 +1,4 @@
 import logging
-import sys
 import warnings
 from pathlib import Path
 from typing import Optional, cast
@@ -1,5 +1,4 @@
 import logging
-import warnings
 from io import BytesIO
 from pathlib import Path
 from typing import List, Optional, Union, cast
@@ -32,7 +31,6 @@ _log = logging.getLogger(__name__)
 
 
 class VlmPipeline(PaginatedPipeline):
-
     def __init__(self, pipeline_options: VlmPipelineOptions):
         super().__init__(pipeline_options)
         self.keep_backend = True
@@ -114,7 +112,6 @@ class VlmPipeline(PaginatedPipeline):
 
     def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
         with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
-
             if (
                 self.pipeline_options.vlm_options.response_format
                 == ResponseFormat.DOCTAGS
@@ -1,8 +1,8 @@
 import logging
-from typing import Any, Dict, Iterable, List, Tuple, Union
+from collections.abc import Iterable
+from typing import Any, Dict, List, Tuple, Union
 
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_core.types.doc.page import TextCell
 from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
 
 from docling.datamodel.document import ConversionResult, Page
@@ -13,7 +13,6 @@ _log = logging.getLogger(__name__)
 def generate_multimodal_pages(
     doc_result: ConversionResult,
 ) -> Iterable[Tuple[str, str, List[Dict[str, Any]], List[Dict[str, Any]], Page]]:
-
     label_to_doclaynet = {
         "title": "title",
         "table-of-contents": "document_index",
@@ -122,7 +121,6 @@ def generate_multimodal_pages(
     if doc.main_text is None:
         return
     for ix, orig_item in enumerate(doc.main_text):
-
         item = doc._resolve_ref(orig_item) if isinstance(orig_item, Ref) else orig_item
         if item is None or item.prov is None or len(item.prov) == 0:
             _log.debug(f"Skipping item {orig_item}")
@@ -29,7 +29,7 @@ def resolve_item(paths, obj):
 
     try:
         key = int(paths[0])
-    except:
+    except Exception:
        key = paths[0]
 
    if len(paths) == 1:
@@ -67,7 +67,7 @@ def _flatten_table_grid(grid: List[List[dict]]) -> List[dict]:
     return unique_objects
 
 
-def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
+def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:  # noqa: C901
     origin = DocumentOrigin(
         mimetype="application/pdf",
         filename=doc_glm["file-info"]["filename"],
@@ -18,7 +18,7 @@ class UnionFind:
 
     def __init__(self, elements):
         self.parent = {elem: elem for elem in elements}
-        self.rank = {elem: 0 for elem in elements}
+        self.rank = dict.fromkeys(elements, 0)
 
     def find(self, x):
         if self.parent[x] != x:
@@ -484,7 +484,9 @@ class LayoutPostprocessor:
         spatial_index = (
             self.regular_index
             if cluster_type == "regular"
-            else self.picture_index if cluster_type == "picture" else self.wrapper_index
+            else self.picture_index
+            if cluster_type == "picture"
+            else self.wrapper_index
        )
 
        # Map of currently valid clusters
@@ -37,7 +37,7 @@ def download_models(
     output_dir.mkdir(exist_ok=True, parents=True)
 
     if with_layout:
-        _log.info(f"Downloading layout model...")
+        _log.info("Downloading layout model...")
         LayoutModel.download_models(
             local_dir=output_dir / LayoutModel._model_repo_folder,
             force=force,
@@ -45,7 +45,7 @@ def download_models(
         )
 
     if with_tableformer:
-        _log.info(f"Downloading tableformer model...")
+        _log.info("Downloading tableformer model...")
         TableStructureModel.download_models(
             local_dir=output_dir / TableStructureModel._model_repo_folder,
             force=force,
@@ -53,7 +53,7 @@ def download_models(
         )
 
     if with_picture_classifier:
-        _log.info(f"Downloading picture classifier model...")
+        _log.info("Downloading picture classifier model...")
         DocumentPictureClassifier.download_models(
             local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
             force=force,
@@ -61,7 +61,7 @@ def download_models(
|
|||||||
)
|
)
|
||||||
|
|
||||||
if with_code_formula:
|
if with_code_formula:
|
||||||
_log.info(f"Downloading code formula model...")
|
_log.info("Downloading code formula model...")
|
||||||
CodeFormulaModel.download_models(
|
CodeFormulaModel.download_models(
|
||||||
local_dir=output_dir / CodeFormulaModel._model_repo_folder,
|
local_dir=output_dir / CodeFormulaModel._model_repo_folder,
|
||||||
force=force,
|
force=force,
|
||||||
@ -69,7 +69,7 @@ def download_models(
|
|||||||
)
|
)
|
||||||
|
|
||||||
if with_smolvlm:
|
if with_smolvlm:
|
||||||
_log.info(f"Downloading SmolVlm model...")
|
_log.info("Downloading SmolVlm model...")
|
||||||
PictureDescriptionVlmModel.download_models(
|
PictureDescriptionVlmModel.download_models(
|
||||||
repo_id=smolvlm_picture_description.repo_id,
|
repo_id=smolvlm_picture_description.repo_id,
|
||||||
local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
|
local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
|
||||||
@ -78,7 +78,7 @@ def download_models(
|
|||||||
)
|
)
|
||||||
|
|
||||||
if with_granite_vision:
|
if with_granite_vision:
|
||||||
_log.info(f"Downloading Granite Vision model...")
|
_log.info("Downloading Granite Vision model...")
|
||||||
PictureDescriptionVlmModel.download_models(
|
PictureDescriptionVlmModel.download_models(
|
||||||
repo_id=granite_picture_description.repo_id,
|
repo_id=granite_picture_description.repo_id,
|
||||||
local_dir=output_dir / granite_picture_description.repo_cache_folder,
|
local_dir=output_dir / granite_picture_description.repo_cache_folder,
|
||||||
@ -87,7 +87,7 @@ def download_models(
|
|||||||
)
|
)
|
||||||
|
|
||||||
if with_easyocr:
|
if with_easyocr:
|
||||||
_log.info(f"Downloading easyocr models...")
|
_log.info("Downloading easyocr models...")
|
||||||
EasyOcrModel.download_models(
|
EasyOcrModel.download_models(
|
||||||
local_dir=output_dir / EasyOcrModel._model_repo_folder,
|
local_dir=output_dir / EasyOcrModel._model_repo_folder,
|
||||||
force=force,
|
force=force,
|
||||||
|
@@ -13,7 +13,7 @@ def chunkify(iterator, chunk_size):
     if isinstance(iterator, List):
         iterator = iter(iterator)
     for first in iterator:  # Take the first element from the iterator
-        yield [first] + list(islice(iterator, chunk_size - 1))
+        yield [first, *list(islice(iterator, chunk_size - 1))]


 def create_file_hash(path_or_stream: Union[BytesIO, Path]) -> str:

@@ -383,7 +383,7 @@
 "\n",
 "print(f\"Downloading {url}...\")\n",
 "buf = BytesIO(requests.get(url).content)\n",
-"print(f\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n",
+"print(\"Parsing zip file, splitting into XML sections, and exporting to files...\")\n",
 "with zipfile.ZipFile(buf) as zf:\n",
 "    res = zf.testzip()\n",
 "    if res:\n",

@@ -544,7 +544,7 @@
 "source": [
 "doc = backend.convert()\n",
 "\n",
-"claims_sec = [item for item in doc.texts if item.text == \"CLAIMS\"][0]\n",
+"claims_sec = next(item for item in doc.texts if item.text == \"CLAIMS\")\n",
 "print(f'Patent \"{doc.texts[0].text}\" has {len(claims_sec.children)} claims')"
 ]
 },

@@ -1,8 +1,8 @@
 import json
 import logging
 import time
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable

 import yaml
 from docling_core.types.doc import ImageRefMode

@@ -11,7 +11,6 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
 from docling.datamodel.base_models import ConversionStatus, InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import PdfPipelineOptions
-from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, PdfFormatOption

 _log = logging.getLogger(__name__)

@@ -3,7 +3,6 @@ import logging
 import time
 from pathlib import Path

-from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
     AcceleratorDevice,

@@ -11,9 +10,6 @@ from docling.datamodel.pipeline_options import (
     PdfPipelineOptions,
 )
 from docling.document_converter import DocumentConverter, PdfFormatOption
-from docling.models.ocr_mac_model import OcrMacOptions
-from docling.models.tesseract_ocr_cli_model import TesseractCliOcrOptions
-from docling.models.tesseract_ocr_model import TesseractOcrOptions

 _log = logging.getLogger(__name__)

@@ -3,8 +3,8 @@
 # It does not run the actual formula understanding model.

 import logging
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Iterable

 from docling_core.types.doc import DocItemLabel, DoclingDocument, NodeItem, TextItem


@@ -49,7 +49,6 @@ class ExampleFormulaUnderstandingEnrichmentModel(BaseItemAndImageEnrichmentModel

 # How the pipeline can be extended.
 class ExampleFormulaUnderstandingPipeline(StandardPdfPipeline):
-
     def __init__(self, pipeline_options: ExampleFormulaUnderstandingPipelineOptions):
         super().__init__(pipeline_options)
         self.pipeline_options: ExampleFormulaUnderstandingPipelineOptions

@@ -85,7 +84,7 @@ def main():
             )
         }
     )
-    result = doc_converter.convert(input_doc_path)
+    doc_converter.convert(input_doc_path)


 if __name__ == "__main__":
@@ -3,8 +3,9 @@
 # It does not run the actual picture classifier model.

 import logging
+from collections.abc import Iterable
 from pathlib import Path
-from typing import Any, Iterable
+from typing import Any

 from docling_core.types.doc import (
     DoclingDocument,

@@ -4,7 +4,7 @@ from pathlib import Path

 from docling_core.types.doc import ImageRefMode, PictureItem, TableItem

-from docling.datamodel.base_models import FigureElement, InputFormat, Table
+from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption


@@ -51,7 +51,6 @@ def main():
         page_segments,
         page,
     ) in generate_multimodal_pages(conv_res):
-
         dpi = page._default_image_scale * 72

         rows.append(

@@ -81,10 +80,10 @@ def main():
         )

     # Generate one parquet from all documents
-    df = pd.json_normalize(rows)
+    df_result = pd.json_normalize(rows)
     now = datetime.datetime.now()
     output_filename = output_dir / f"multimodal_{now:%Y-%m-%d_%H%M%S}.parquet"
-    df.to_parquet(output_filename)
+    df_result.to_parquet(output_filename)

     end_time = time.time() - start_time


@@ -32,12 +32,12 @@ def main():
         print(table_df.to_markdown())

         # Save the table as csv
-        element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.csv"
+        element_csv_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.csv"
         _log.info(f"Saving CSV table to {element_csv_filename}")
         table_df.to_csv(element_csv_filename)

         # Save the table as html
-        element_html_filename = output_dir / f"{doc_filename}-table-{table_ix+1}.html"
+        element_html_filename = output_dir / f"{doc_filename}-table-{table_ix + 1}.html"
         _log.info(f"Saving HTML table to {element_html_filename}")
         with element_html_filename.open("w") as fp:
             fp.write(table.export_to_html(doc=conv_res.document))

@@ -1,14 +1,9 @@
 from pathlib import Path

-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
-    EasyOcrOptions,
-    OcrMacOptions,
     PdfPipelineOptions,
-    RapidOcrOptions,
     TesseractCliOcrOptions,
-    TesseractOcrOptions,
 )
 from docling.document_converter import DocumentConverter, PdfFormatOption


@@ -153,10 +153,10 @@
 "source": [
 "for i, chunk in enumerate(chunk_iter):\n",
 "    print(f\"=== {i} ===\")\n",
-"    print(f\"chunk.text:\\n{repr(f'{chunk.text[:300]}…')}\")\n",
+"    print(f\"chunk.text:\\n{f'{chunk.text[:300]}…'!r}\")\n",
 "\n",
 "    enriched_text = chunker.serialize(chunk=chunk)\n",
-"    print(f\"chunker.serialize(chunk):\\n{repr(f'{enriched_text[:300]}…')}\")\n",
+"    print(f\"chunker.serialize(chunk):\\n{f'{enriched_text[:300]}…'!r}\")\n",
 "\n",
 "    print()"
 ]

@@ -353,11 +353,11 @@
 "for i, chunk in enumerate(chunks):\n",
 "    print(f\"=== {i} ===\")\n",
 "    txt_tokens = len(tokenizer.tokenize(chunk.text))\n",
-"    print(f\"chunk.text ({txt_tokens} tokens):\\n{repr(chunk.text)}\")\n",
+"    print(f\"chunk.text ({txt_tokens} tokens):\\n{chunk.text!r}\")\n",
 "\n",
 "    ser_txt = chunker.serialize(chunk=chunk)\n",
 "    ser_tokens = len(tokenizer.tokenize(ser_txt))\n",
-"    print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{repr(ser_txt)}\")\n",
+"    print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{ser_txt!r}\")\n",
 "\n",
 "    print()"
 ]

@@ -2,17 +2,14 @@ import json
 import time
 from pathlib import Path

-import yaml
+from docling_core.types.doc import DocItemLabel, ImageRefMode
+from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS

 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
-    AcceleratorDevice,
     VlmPipelineOptions,
-    granite_vision_vlm_conversion_options,
-    smoldocling_vlm_conversion_options,
     smoldocling_vlm_mlx_conversion_options,
 )
-from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline

@@ -39,9 +36,6 @@ pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
 ## Alternative VLM models:
 # pipeline_options.vlm_options = granite_vision_vlm_conversion_options

-from docling_core.types.doc import DocItemLabel, ImageRefMode
-from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
-
 ## Set up pipeline for PDF or image inputs
 converter = DocumentConverter(
     format_options={

@@ -62,7 +56,7 @@ out_path.mkdir(parents=True, exist_ok=True)
 for source in sources:
     start_time = time.time()
     print("================================================")
-    print("Processing... {}".format(source))
+    print(f"Processing... {source}")
     print("================================================")
     print("")


@@ -77,7 +71,7 @@ for source in sources:
         print(page.predictions.vlm_response.text)

     res.document.save_as_html(
-        filename=Path("{}/{}.html".format(out_path, res.input.file.stem)),
+        filename=Path(f"{out_path}/{res.input.file.stem}.html"),
         image_mode=ImageRefMode.REFERENCED,
         labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
     )

@@ -144,7 +144,7 @@
 "for pic in doc.pictures[:5]:\n",
 "    html_item = (\n",
 "        f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n",
-"        f'<img src=\"{str(pic.image.uri)}\" /><br />'\n",
+"        f'<img src=\"{pic.image.uri!s}\" /><br />'\n",
 "        f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n",
 "    )\n",
 "    for annotation in pic.annotations:\n",

@@ -252,7 +252,7 @@
 "for pic in doc.pictures[:5]:\n",
 "    html_item = (\n",
 "        f\"<h3>Picture <code>{pic.self_ref}</code></h3>\"\n",
-"        f'<img src=\"{str(pic.image.uri)}\" /><br />'\n",
+"        f'<img src=\"{pic.image.uri!s}\" /><br />'\n",
 "        f\"<h4>Caption</h4>{pic.caption_text(doc=doc)}<br />\"\n",
 "    )\n",
 "    for annotation in pic.annotations:\n",

@@ -283,7 +283,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 23,
+"execution_count": null,
 "metadata": {},
 "outputs": [
 {

@@ -369,7 +369,7 @@
 "    new_index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)\n",
 "    try:\n",
 "        index_client.delete_index(index_name)\n",
-"    except:\n",
+"    except Exception:\n",
 "        pass\n",
 "\n",
 "    index_client.create_or_update_index(new_index)\n",

@@ -487,7 +487,7 @@
 "\n",
 "    all_succeeded = all(r.succeeded for r in resp)\n",
 "    console.print(\n",
-"        f\"Uploaded batch {i} -> {i+len(subset)}; all_succeeded: {all_succeeded}, \"\n",
+"        f\"Uploaded batch {i} -> {i + len(subset)}; all_succeeded: {all_succeeded}, \"\n",
 "        f\"first_doc_status_code: {resp[0].status_code}\"\n",
 "    )\n",
 "\n",

@@ -807,10 +807,12 @@
 }
 ],
 "source": [
+"from typing import Optional\n",
+"\n",
 "from azure.search.documents.models import VectorizableTextQuery\n",
 "\n",
 "\n",
-"def generate_chat_response(prompt: str, system_message: str = None):\n",
+"def generate_chat_response(prompt: str, system_message: Optional[str] = None):\n",
 "    \"\"\"\n",
 "    Generates a single-turn chat response using Azure OpenAI Chat.\n",
 "    If you need multi-turn conversation or follow-up queries, you'll have to\n",

@@ -351,7 +351,7 @@
 "for source in sources:\n",
 "    if EXPORT_TYPE == ExportType.DOC_CHUNKS:\n",
 "        doc_chunk = DocChunk.model_validate(source.meta[\"dl_meta\"])\n",
-"        print(f\"- text: {repr(doc_chunk.text)}\")\n",
+"        print(f\"- text: {doc_chunk.text!r}\")\n",
 "        if doc_chunk.meta.origin:\n",
 "            print(f\"  file: {doc_chunk.meta.origin.filename}\")\n",
 "        if doc_chunk.meta.headings:\n",

@@ -341,7 +341,7 @@
 "print(f\"Question:\\n{resp_dict['input']}\\n\\nAnswer:\\n{clipped_answer}\")\n",
 "for i, doc in enumerate(resp_dict[\"context\"]):\n",
 "    print()\n",
-"    print(f\"Source {i+1}:\")\n",
+"    print(f\"Source {i + 1}:\")\n",
 "    print(f\"  text: {json.dumps(clip_text(doc.page_content, threshold=350))}\")\n",
 "    for key in doc.metadata:\n",
 "        if key != \"pk\":\n",

@@ -59,7 +59,7 @@
 },
 {
 "cell_type": "code",
-"execution_count": 1,
+"execution_count": null,
 "metadata": {
 "collapsed": true,
 "id": "u076oUSF_YUG"

@@ -72,12 +72,11 @@
 "%pip install rich\n",
 "%pip install torch\n",
 "\n",
+"import logging\n",
 "import warnings\n",
 "\n",
 "warnings.filterwarnings(\"ignore\")\n",
 "\n",
-"import logging\n",
-"\n",
 "# Suppress Weaviate client logs\n",
 "logging.getLogger(\"weaviate\").setLevel(logging.ERROR)"
 ]

@@ -119,7 +118,7 @@
 "    device = torch.device(\"mps\")\n",
 "    print(\"MPS GPU is enabled.\")\n",
 "else:\n",
-"    raise EnvironmentError(\n",
+"    raise OSError(\n",
 "        \"No GPU or MPS device found. Please check your environment and ensure GPU or MPS support is configured.\"\n",
 "    )"
 ]

@@ -226,7 +225,6 @@
 }
 ],
 "source": [
-"from docling.datamodel.document import ConversionResult\n",
 "from docling.document_converter import DocumentConverter\n",
 "\n",
 "# Instantiate the doc converter\n",

@@ -345,7 +343,7 @@
 "\n",
 "    openai_api_key = os.getenv(openai_api_key_var)\n",
 "    if not openai_api_key:\n",
-"        raise EnvironmentError(\n",
+"        raise OSError(\n",
 "            f\"Environment variable '{openai_api_key_var}' is not set. \"\n",
 "            \"Please define it before running this script.\"\n",
 "        )"

@@ -387,7 +385,6 @@
 "outputs": [],
 "source": [
 "import weaviate.classes.config as wc\n",
-"from weaviate.classes.config import DataType, Property\n",
 "\n",
 "# Define the collection name\n",
 "collection_name = \"docling\"\n",

@@ -25,9 +25,7 @@ def main():
         document = mdb.convert()

         out_path = Path("scratch")
-        print(
-            f"Document {path} converted." f"\nSaved markdown output to: {str(out_path)}"
-        )
+        print(f"Document {path} converted.\nSaved markdown output to: {out_path!s}")

         # Export Docling document format to markdowndoc:
         fn = os.path.basename(path)

@@ -1,13 +1,10 @@
 from pathlib import Path

-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
     AcceleratorDevice,
     AcceleratorOptions,
     PdfPipelineOptions,
-    TesseractCliOcrOptions,
-    TesseractOcrOptions,
 )
 from docling.datamodel.settings import settings
 from docling.document_converter import DocumentConverter, PdfFormatOption

@@ -63,7 +63,7 @@ def main():
     out_path = Path("scratch")
     print(
         f"Document {res.input.file.name} converted."
-        f"\nSaved markdown output to: {str(out_path)}"
+        f"\nSaved markdown output to: {out_path!s}"
     )
     _log.debug(res.document._export_to_indented_text(max_text_len=16))
     # Export Docling document format to markdowndoc:

@@ -4,7 +4,6 @@ from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
     PdfPipelineOptions,
     TesseractCliOcrOptions,
-    TesseractOcrOptions,
 )
 from docling.document_converter import DocumentConverter, PdfFormatOption


@@ -2,9 +2,9 @@ import logging
 import time
 from pathlib import Path

-from docling_core.types.doc import ImageRefMode, PictureItem, TableItem, TextItem
+from docling_core.types.doc import ImageRefMode, TableItem, TextItem

-from docling.datamodel.base_models import FigureElement, InputFormat, Table
+from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption


@@ -15,7 +15,6 @@ IMAGE_RESOLUTION_SCALE = 2.0

 # FIXME: put in your favorite translation code ....
 def translate(text: str, src: str = "en", dest: str = "de"):
-
     _log.warning("!!! IMPLEMENT HERE YOUR FAVORITE TRANSLATION CODE!!!")
     # from googletrans import Translator


@@ -52,10 +51,9 @@ def main():
         }
     )

-    start_time = time.time()
-
     conv_res = doc_converter.convert(input_doc_path)
     conv_doc = conv_res.document
+    doc_filename = conv_res.input.file

     # Save markdown with embedded pictures in original text
     md_filename = output_dir / f"{doc_filename}-with-images-orig.md"
@@ -432,7 +432,7 @@
 "\n",
 "for i, doc in enumerate(resp_dict[\"context\"][:]):\n",
 "    image_by_page = {}\n",
-"    print(f\"Source {i+1}:\")\n",
+"    print(f\"Source {i + 1}:\")\n",
 "    print(f\"  text: {json.dumps(clip_text(doc.page_content, threshold=350))}\")\n",
 "    meta = DocMeta.model_validate(doc.metadata[\"dl_meta\"])\n",
 "\n",

@@ -10,7 +10,6 @@ from docling.datamodel.pipeline_options import (
     ApiVlmOptions,
     ResponseFormat,
     VlmPipelineOptions,
-    granite_vision_vlm_ollama_conversion_options,
 )
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline

98
poetry.lock
generated

@@ -692,6 +692,84 @@ traitlets = ">=4"
 [package.extras]
 test = ["pytest"]

+[[package]]
+name = "coverage"
+version = "7.8.0"
+description = "Code coverage measurement for Python"
+optional = false
+python-versions = ">=3.9"
+files = [
+    {file = "coverage-7.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2931f66991175369859b5fd58529cd4b73582461877ecfd859b6549869287ffe"},
+    {file = "coverage-7.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:52a523153c568d2c0ef8826f6cc23031dc86cffb8c6aeab92c4ff776e7951b28"},
+    {file = "coverage-7.8.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5c8a5c139aae4c35cbd7cadca1df02ea8cf28a911534fc1b0456acb0b14234f3"},
+    {file = "coverage-7.8.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5a26c0c795c3e0b63ec7da6efded5f0bc856d7c0b24b2ac84b4d1d7bc578d676"},
+    {file = "coverage-7.8.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:821f7bcbaa84318287115d54becb1915eece6918136c6f91045bb84e2f88739d"},
+    {file = "coverage-7.8.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a321c61477ff8ee705b8a5fed370b5710c56b3a52d17b983d9215861e37b642a"},
+    {file = "coverage-7.8.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:ed2144b8a78f9d94d9515963ed273d620e07846acd5d4b0a642d4849e8d91a0c"},
+    {file = "coverage-7.8.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:042e7841a26498fff7a37d6fda770d17519982f5b7d8bf5278d140b67b61095f"},
+    {file = "coverage-7.8.0-cp310-cp310-win32.whl", hash = "sha256:f9983d01d7705b2d1f7a95e10bbe4091fabc03a46881a256c2787637b087003f"},
+    {file = "coverage-7.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:5a570cd9bd20b85d1a0d7b009aaf6c110b52b5755c17be6962f8ccd65d1dbd23"},
+    {file = "coverage-7.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e7ac22a0bb2c7c49f441f7a6d46c9c80d96e56f5a8bc6972529ed43c8b694e27"},
+    {file = "coverage-7.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bf13d564d310c156d1c8e53877baf2993fb3073b2fc9f69790ca6a732eb4bfea"},
+    {file = "coverage-7.8.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a5761c70c017c1b0d21b0815a920ffb94a670c8d5d409d9b38857874c21f70d7"},
+    {file = "coverage-7.8.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e5ff52d790c7e1628241ffbcaeb33e07d14b007b6eb00a19320c7b8a7024c040"},
+    {file = "coverage-7.8.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d39fc4817fd67b3915256af5dda75fd4ee10621a3d484524487e33416c6f3543"},
+    {file = "coverage-7.8.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b44674870709017e4b4036e3d0d6c17f06a0e6d4436422e0ad29b882c40697d2"},
+    {file = "coverage-7.8.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8f99eb72bf27cbb167b636eb1726f590c00e1ad375002230607a844d9e9a2318"},
+    {file = "coverage-7.8.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:b571bf5341ba8c6bc02e0baeaf3b061ab993bf372d982ae509807e7f112554e9"},
+    {file = "coverage-7.8.0-cp311-cp311-win32.whl", hash = "sha256:e75a2ad7b647fd8046d58c3132d7eaf31b12d8a53c0e4b21fa9c4d23d6ee6d3c"},
+    {file = "coverage-7.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:3043ba1c88b2139126fc72cb48574b90e2e0546d4c78b5299317f61b7f718b78"},
+    {file = "coverage-7.8.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:bbb5cc845a0292e0c520656d19d7ce40e18d0e19b22cb3e0409135a575bf79fc"},
+    {file = "coverage-7.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4dfd9a93db9e78666d178d4f08a5408aa3f2474ad4d0e0378ed5f2ef71640cb6"},
+    {file = "coverage-7.8.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f017a61399f13aa6d1039f75cd467be388d157cd81f1a119b9d9a68ba6f2830d"},
+    {file = "coverage-7.8.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0915742f4c82208ebf47a2b154a5334155ed9ef9fe6190674b8a46c2fb89cb05"},
+    {file = "coverage-7.8.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a40fcf208e021eb14b0fac6bdb045c0e0cab53105f93ba0d03fd934c956143a"},
+    {file = "coverage-7.8.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a1f406a8e0995d654b2ad87c62caf6befa767885301f3b8f6f73e6f3c31ec3a6"},
+    {file = "coverage-7.8.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:77af0f6447a582fdc7de5e06fa3757a3ef87769fbb0fdbdeba78c23049140a47"},
+    {file = "coverage-7.8.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:f2d32f95922927186c6dbc8bc60df0d186b6edb828d299ab10898ef3f40052fe"},
+    {file = "coverage-7.8.0-cp312-cp312-win32.whl", hash = "sha256:769773614e676f9d8e8a0980dd7740f09a6ea386d0f383db6821df07d0f08545"},
+    {file = "coverage-7.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:e5d2b9be5b0693cf21eb4ce0ec8d211efb43966f6657807f6859aab3814f946b"},
+    {file = "coverage-7.8.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:5ac46d0c2dd5820ce93943a501ac5f6548ea81594777ca585bf002aa8854cacd"},
+    {file = "coverage-7.8.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:771eb7587a0563ca5bb6f622b9ed7f9d07bd08900f7589b4febff05f469bea00"},
+    {file = "coverage-7.8.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42421e04069fb2cbcbca5a696c4050b84a43b05392679d4068acbe65449b5c64"},
+    {file = "coverage-7.8.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:554fec1199d93ab30adaa751db68acec2b41c5602ac944bb19187cb9a41a8067"},
+    {file = "coverage-7.8.0-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5aaeb00761f985007b38cf463b1d160a14a22c34eb3f6a39d9ad6fc27cb73008"},
+    {file = "coverage-7.8.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:581a40c7b94921fffd6457ffe532259813fc68eb2bdda60fa8cc343414ce3733"},
+    {file = "coverage-7.8.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:f319bae0321bc838e205bf9e5bc28f0a3165f30c203b610f17ab5552cff90323"},
+    {file = "coverage-7.8.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:04bfec25a8ef1c5f41f5e7e5c842f6b615599ca8ba8391ec33a9290d9d2db3a3"},
+    {file = "coverage-7.8.0-cp313-cp313-win32.whl", hash = "sha256:dd19608788b50eed889e13a5d71d832edc34fc9dfce606f66e8f9f917eef910d"},
+    {file = "coverage-7.8.0-cp313-cp313-win_amd64.whl", hash = "sha256:a9abbccd778d98e9c7e85038e35e91e67f5b520776781d9a1e2ee9d400869487"},
+    {file = "coverage-7.8.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:18c5ae6d061ad5b3e7eef4363fb27a0576012a7447af48be6c75b88494c6cf25"},
+    {file = "coverage-7.8.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:95aa6ae391a22bbbce1b77ddac846c98c5473de0372ba5c463480043a07bff42"},
+    {file = "coverage-7.8.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e013b07ba1c748dacc2a80e69a46286ff145935f260eb8c72df7185bf048f502"},
+    {file = "coverage-7.8.0-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d766a4f0e5aa1ba056ec3496243150698dc0481902e2b8559314368717be82b1"},
+    {file = "coverage-7.8.0-cp313-cp313t-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad80e6b4a0c3cb6f10f29ae4c60e991f424e6b14219d46f1e7d442b938ee68a4"},
+    {file = "coverage-7.8.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:b87eb6fc9e1bb8f98892a2458781348fa37e6925f35bb6ceb9d4afd54ba36c73"},
+    {file = "coverage-7.8.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:d1ba00ae33be84066cfbe7361d4e04dec78445b2b88bdb734d0d1cbab916025a"},
+    {file = "coverage-7.8.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f3c38e4e5ccbdc9198aecc766cedbb134b2d89bf64533973678dfcf07effd883"},
+    {file = "coverage-7.8.0-cp313-cp313t-win32.whl", hash = "sha256:379fe315e206b14e21db5240f89dc0774bdd3e25c3c58c2c733c99eca96f1ada"},
+    {file = "coverage-7.8.0-cp313-cp313t-win_amd64.whl", hash = "sha256:2e4b6b87bb0c846a9315e3ab4be2d52fac905100565f4b92f02c445c8799e257"},
+    {file = "coverage-7.8.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:fa260de59dfb143af06dcf30c2be0b200bed2a73737a8a59248fcb9fa601ef0f"},
+    {file = "coverage-7.8.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:96121edfa4c2dfdda409877ea8608dd01de816a4dc4a0523356067b305e4e17a"},
+    {file = "coverage-7.8.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b8af63b9afa1031c0ef05b217faa598f3069148eeee6bb24b79da9012423b82"},
+    {file = "coverage-7.8.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:89b1f4af0d4afe495cd4787a68e00f30f1d15939f550e869de90a86efa7e0814"},
+    {file = "coverage-7.8.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:94ec0be97723ae72d63d3aa41961a0b9a6f5a53ff599813c324548d18e3b9e8c"},
+    {file = "coverage-7.8.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:8a1d96e780bdb2d0cbb297325711701f7c0b6f89199a57f2049e90064c29f6bd"},
+    {file = "coverage-7.8.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:f1d8a2a57b47142b10374902777e798784abf400a004b14f1b0b9eaf1e528ba4"},
+    {file = "coverage-7.8.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:cf60dd2696b457b710dd40bf17ad269d5f5457b96442f7f85722bdb16fa6c899"},
+    {file = "coverage-7.8.0-cp39-cp39-win32.whl", hash = "sha256:be945402e03de47ba1872cd5236395e0f4ad635526185a930735f66710e1bd3f"},
+    {file = "coverage-7.8.0-cp39-cp39-win_amd64.whl", hash = "sha256:90e7fbc6216ecaffa5a880cdc9c77b7418c1dcb166166b78dbc630d07f278cc3"},
+    {file = "coverage-7.8.0-pp39.pp310.pp311-none-any.whl", hash = "sha256:b8194fb8e50d556d5849753de991d390c5a1edeeba50f68e3a9253fbd8bf8ccd"},
+    {file = "coverage-7.8.0-py3-none-any.whl", hash = "sha256:dbf364b4c5e7bae9250528167dfe40219b62e2d573c854d74be213e1e52069f7"},
+    {file = "coverage-7.8.0.tar.gz", hash = "sha256:7a3d62b3b03b4b6fd41a085f3574874cf946cb4604d2b4d3e8dca8cd570ca501"},
+]
+
+[package.dependencies]
+tomli = {version = "*", optional = true, markers = "python_full_version <= \"3.11.0a6\" and extra == \"toml\""}
+
+[package.extras]
+toml = ["tomli"]
+
 [[package]]
 name = "cryptography"
 version = "43.0.3"

@@ -5073,6 +5151,24 @@ tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""}
 [package.extras]
 testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"]

+[[package]]
+name = "pytest-cov"
+version = "6.1.1"
+description = "Pytest plugin for measuring coverage."
+optional = false
+python-versions = ">=3.9"
+files = [
+    {file = "pytest_cov-6.1.1-py3-none-any.whl", hash = "sha256:bddf29ed2d0ab6f4df17b4c55b0a657287db8684af9c42ea546b21b1041b3dde"},
+    {file = "pytest_cov-6.1.1.tar.gz", hash = "sha256:46935f7aaefba760e716c2ebfbe1c216240b9592966e7da99ea8292d4d3e2a0a"},
+]
+
+[package.dependencies]
+coverage = {version = ">=7.5", extras = ["toml"]}
+pytest = ">=4.6"
+
+[package.extras]
+testing = ["fields", "hunter", "process-tests", "pytest-xdist", "virtualenv"]
+
 [[package]]
 name = "pytest-xdist"
 version = "3.6.1"

@@ -7882,4 +7978,4 @@ vlm = ["accelerate", "transformers", "transformers"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "d2a8f7997b9ffb249ad26ba492b766d580bdb0072d50e76b0afd92496e983e96"
+content-hash = "b36037ec17dc4b6d5197a2f63a1367e05bf888b4fa97e2e2e8c29c217741d69c"

@@ -110,6 +110,8 @@ ipywidgets = "^8.1.5"
 nbqa = "^1.9.0"
 types-openpyxl = "^3.1.5.20241114"
 types-tqdm = "^4.67.0.20241221"
+coverage = "^7.6.2"
+pytest-cov = "^6.0.0"

 [tool.poetry.group.docs.dependencies]
 mkdocs-material = "^9.5.40"
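Note (not part of the commit): with coverage and pytest-cov available as dev dependencies, a coverage run can write a Cobertura-style coverage.xml report. The helper below is a hedged sketch of reading the overall line rate from such a report; the file name and the line-rate attribute are assumptions based on the tools' default XML output.

# Illustrative sketch only -- not from the repository.
import xml.etree.ElementTree as ET
from pathlib import Path


def overall_line_rate(report: Path = Path("coverage.xml")) -> float:
    # The Cobertura root element carries the aggregate line rate as an attribute.
    root = ET.parse(report).getroot()
    return float(root.attrib["line-rate"])


if __name__ == "__main__":
    print(f"line coverage: {overall_line_rate():.1%}")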
@@ -164,15 +166,82 @@ docling-tools = "docling.cli.tools:app"
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"

-[tool.black]
-line-length = 88
-target-version = ["py39"]
-include = '\.pyi?$'
-
-[tool.isort]
-profile = "black"
-line_length = 88
-py_version = 39
+[tool.ruff]
+target-version = "py39"
+line-length = 88
+respect-gitignore = true
+
+# extend-exclude = [
+#     "tests",
+# ]
+
+[tool.ruff.format]
+skip-magic-trailing-comma = false
+
+[tool.ruff.lint]
+select = [
+    # "B",  # flake8-bugbear
+    "C",  # flake8-comprehensions
+    "C9",  # mccabe
+    # "D",  # flake8-docstrings
+    "E",  # pycodestyle errors (default)
+    "F",  # pyflakes (default)
+    "I",  # isort
+    "PD",  # pandas-vet
+    "PIE",  # pie
+    # "PTH",  # pathlib
+    "Q",  # flake8-quotes
+    # "RET",  # return
+    "RUF",  # Enable all ruff-specific checks
+    # "SIM",  # simplify
+    "S307",  # eval
+    # "T20",  # (disallow print statements) keep debugging statements out of the codebase
+    "W",  # pycodestyle warnings
+    "ASYNC",  # async
+    "UP",  # pyupgrade
+]
+
+ignore = [
+    "C408",  # Unnecessary `dict()` call (rewrite as a literal)
+    "E501",  # Line too long, handled by ruff formatter
+    "D107",  # "Missing docstring in __init__",
+    "F401",  # imported but unused; consider using `importlib.util.find_spec` to test for "
+    "F811",  # "redefinition of the same function"
+    "PL",  # Pylint
+    "RUF012",  # Mutable Class Attributes
+    "UP006",  # List vs list, etc
+    "UP007",  # Option and Union
+    "UP035",  # `typing.Set` is deprecated, use `set` instead"
+]
+
+#extend-select = []
+
+[tool.ruff.lint.pep8-naming]
+classmethod-decorators = [
+    # Allow Pydantic's `@validator` decorator to trigger class method treatment.
+    "pydantic.validator",
+]
+
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["E402", "F401"]
+"tests/*.py" = ["ASYNC"]  # Disable ASYNC check for tests
+
+[tool.ruff.lint.mccabe]
+max-complexity = 20
+
+# [tool.ruff.lint.isort.sections]
+# "docling" = ["docling_core", "docling_ibm_models", "docling_parse"]
+
+[tool.ruff.lint.isort]
+combine-as-imports = true
+# section-order = [
+#     "future",
+#     "standard-library",
+#     "third-party",
+#     "docling",
+#     "first-party",
+#     "local-folder",
+# ]

 [tool.mypy]
 pretty = true

@@ -200,10 +269,6 @@ module = [
 ]
 ignore_missing_imports = true

-[tool.flake8]
-max-line-length = 88
-extend-ignore = ["E203", "E501"]
-
 [tool.semantic_release]
 # for default values check:
 # https://github.com/python-semantic-release/python-semantic-release/blob/v7.32.2/semantic_release/defaults.cfg
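Note (not part of the commit): a minimal, hedged sketch of the style that the rule families selected above (pycodestyle E/W, pyflakes F, isort I, pyupgrade UP, flake8-comprehensions C, RUF) steer code toward, mirroring the rewrites applied elsewhere in this diff; the module and function names below are illustrative only.

# Illustrative sketch only -- not from the repository.
import logging
from pathlib import Path

_log = logging.getLogger(__name__)


def merge_chunks(head, tail):
    # RUF005: prefer unpacking over list concatenation such as `head + tail`.
    return [*head, *tail]


def read_version(path: Path) -> str:
    try:
        return path.read_text().strip()
    except Exception:  # E722: a bare `except:` is rejected by the linter
        # F541: a plain string, since there are no placeholders to format
        _log.info("Falling back to the default version string")
        return "0.0.0"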
@@ -19,7 +19,6 @@ def _get_backend(fname):


 def test_asciidocs_examples():
-
     fnames = sorted(glob.glob("./tests/data/asciidoc/*.asciidoc"))

     for fname in fnames:

@@ -38,8 +37,8 @@ def test_asciidocs_examples():
         print("\n\n", pred_mddoc)

         if os.path.exists(gname):
-            with open(gname, "r") as fr:
-                true_mddoc = fr.read()
+            with open(gname) as fr:
+                fr.read()

             # assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
         else:

@@ -1,5 +1,3 @@
-import json
-import os
 from pathlib import Path

 from pytest import warns

@@ -15,22 +13,19 @@ GENERATE = GEN_TEST_DATA


 def get_csv_paths():
-
     # Define the directory you want to search
-    directory = Path(f"./tests/data/csv/")
+    directory = Path("./tests/data/csv/")

     # List all CSV files in the directory and its subdirectories
     return sorted(directory.rglob("*.csv"))


 def get_csv_path(name: str):
-
     # Return the matching CSV file path
     return Path(f"./tests/data/csv/{name}.csv")


 def get_converter():
-
     converter = DocumentConverter(allowed_formats=[InputFormat.CSV])

     return converter

@@ -55,9 +50,9 @@ def test_e2e_valid_csv_conversions():
         pred_itxt: str = doc._export_to_indented_text(
             max_text_len=70, explicit_tables=False
         )
-        assert verify_export(
-            pred_itxt, str(gt_path) + ".itxt"
-        ), "export to indented-text"
+        assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
+            "export to indented-text"
+        )

         assert verify_document(
             pred_doc=doc,

@@ -32,7 +32,7 @@ def test_text_cell_counts():

     doc_backend = _get_backend(pdf_doc)

-    for page_index in range(0, doc_backend.page_count()):
+    for page_index in range(doc_backend.page_count()):
         last_cell_count = None
         for i in range(10):
             page_backend: DoclingParsePageBackend = doc_backend.load_page(0)

@@ -42,9 +42,9 @@ def test_text_cell_counts():
                 last_cell_count = len(cells)

             if len(cells) != last_cell_count:
-                assert (
-                    False
-                ), "Loading page multiple times yielded non-identical text cell counts"
+                assert False, (
+                    "Loading page multiple times yielded non-identical text cell counts"
+                )
             last_cell_count = len(cells)


@@ -66,7 +66,7 @@ def test_crop_page_image(test_doc_path):
     page_backend: DoclingParsePageBackend = doc_backend.load_page(0)

     # Crop out "Figure 1" from the DocLayNet paper
-    im = page_backend.get_page_image(
+    page_backend.get_page_image(
         scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
     )
     # im.show()

@@ -31,7 +31,7 @@ def test_text_cell_counts():

     doc_backend = _get_backend(pdf_doc)

-    for page_index in range(0, doc_backend.page_count()):
+    for page_index in range(doc_backend.page_count()):
         last_cell_count = None
         for i in range(10):
             page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0)

@@ -41,9 +41,9 @@ def test_text_cell_counts():
                 last_cell_count = len(cells)

             if len(cells) != last_cell_count:
-                assert (
-                    False
-                ), "Loading page multiple times yielded non-identical text cell counts"
+                assert False, (
+                    "Loading page multiple times yielded non-identical text cell counts"
+                )
             last_cell_count = len(cells)

||||||
|
|
||||||
@ -65,7 +65,7 @@ def test_crop_page_image(test_doc_path):
|
|||||||
page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0)
|
page_backend: DoclingParseV2PageBackend = doc_backend.load_page(0)
|
||||||
|
|
||||||
# Crop out "Figure 1" from the DocLayNet paper
|
# Crop out "Figure 1" from the DocLayNet paper
|
||||||
im = page_backend.get_page_image(
|
page_backend.get_page_image(
|
||||||
scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
|
scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
|
||||||
)
|
)
|
||||||
# im.show()
|
# im.show()
|
||||||

@@ -31,7 +31,7 @@ def test_text_cell_counts():
 
     doc_backend = _get_backend(pdf_doc)
 
-    for page_index in range(0, doc_backend.page_count()):
+    for page_index in range(doc_backend.page_count()):
         last_cell_count = None
         for i in range(10):
             page_backend: DoclingParseV4PageBackend = doc_backend.load_page(0)
@@ -41,9 +41,9 @@ def test_text_cell_counts():
                 last_cell_count = len(cells)
 
             if len(cells) != last_cell_count:
-                assert (
-                    False
-                ), "Loading page multiple times yielded non-identical text cell counts"
+                assert False, (
+                    "Loading page multiple times yielded non-identical text cell counts"
+                )
             last_cell_count = len(cells)
 
 
@@ -65,7 +65,7 @@ def test_crop_page_image(test_doc_path):
     page_backend: DoclingParseV4PageBackend = doc_backend.load_page(0)
 
     # Crop out "Figure 1" from the DocLayNet paper
-    im = page_backend.get_page_image(
+    page_backend.get_page_image(
         scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
     )
     # im.show()

@@ -105,7 +105,6 @@ def test_ordered_lists():
 
 
 def get_html_paths():
-
     # Define the directory you want to search
     directory = Path("./tests/data/html/")
 
@@ -115,14 +114,12 @@ def get_html_paths():
 
 
 def get_converter():
-
     converter = DocumentConverter(allowed_formats=[InputFormat.HTML])
 
     return converter
 
 
 def test_e2e_html_conversions():
-
     html_paths = get_html_paths()
     converter = get_converter()
 
@@ -138,15 +135,15 @@ def test_e2e_html_conversions():
         doc: DoclingDocument = conv_result.document
 
         pred_md: str = doc.export_to_markdown()
-        assert verify_export(
-            pred_md, str(gt_path) + ".md", generate=GENERATE
-        ), "export to md"
+        assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
+            "export to md"
+        )
 
         pred_itxt: str = doc._export_to_indented_text(
             max_text_len=70, explicit_tables=False
         )
-        assert verify_export(
-            pred_itxt, str(gt_path) + ".itxt", generate=GENERATE
-        ), "export to indented-text"
+        assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
+            "export to indented-text"
+        )
 
         assert verify_document(doc, str(gt_path) + ".json", GENERATE)
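
Note: the single-line deletions right after the `def ...():` lines are consistent with the formatter stripping blank lines that sit directly below a function signature. A hypothetical minimal case of the same shape:

    # before
    def get_converter():

        converter = DocumentConverter(allowed_formats=[InputFormat.HTML])

    # after formatting: no blank line between the signature and the body
    def get_converter():
        converter = DocumentConverter(allowed_formats=[InputFormat.HTML])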

@@ -15,7 +15,7 @@ GENERATE = GEN_TEST_DATA
 
 
 def get_pubmed_paths():
-    directory = Path(os.path.dirname(__file__) + f"/data/pubmed/")
+    directory = Path(os.path.dirname(__file__) + "/data/pubmed/")
     xml_files = sorted(directory.rglob("*.xml"))
     return xml_files
 
@@ -47,9 +47,9 @@ def test_e2e_pubmed_conversions(use_stream=False):
         pred_itxt: str = doc._export_to_indented_text(
             max_text_len=70, explicit_tables=False
         )
-        assert verify_export(
-            pred_itxt, str(gt_path) + ".itxt"
-        ), "export to indented-text"
+        assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
+            "export to indented-text"
+        )
 
         assert verify_document(doc, str(gt_path) + ".json", GENERATE), "export to json"
 
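
Note: dropping the `f` prefix on `f"/data/pubmed/"` matches the rule for f-strings without any placeholders (F541); the literal has no interpolation, so the prefix was inert:

    # before: f-string with no placeholders
    directory = Path(os.path.dirname(__file__) + f"/data/pubmed/")
    # after: plain string literal
    directory = Path(os.path.dirname(__file__) + "/data/pubmed/")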

@@ -17,7 +17,6 @@ GENERATE = GEN_TEST_DATA
 
 
 def get_xlsx_paths():
-
     # Define the directory you want to search
     directory = Path("./tests/data/xlsx/")
 
@@ -27,7 +26,6 @@ def get_xlsx_paths():
 
 
 def get_converter():
-
     converter = DocumentConverter(allowed_formats=[InputFormat.XLSX])
 
     return converter
@@ -65,13 +63,13 @@ def test_e2e_xlsx_conversions(documents) -> None:
         pred_itxt: str = doc._export_to_indented_text(
             max_text_len=70, explicit_tables=False
         )
-        assert verify_export(
-            pred_itxt, str(gt_path) + ".itxt"
-        ), "export to indented-text"
+        assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
+            "export to indented-text"
+        )
 
-        assert verify_document(
-            doc, str(gt_path) + ".json", GENERATE
-        ), "document document"
+        assert verify_document(doc, str(gt_path) + ".json", GENERATE), (
+            "document document"
+        )
 
 
 def test_pages(documents) -> None:
@@ -81,7 +79,7 @@ def test_pages(documents) -> None:
         documents: The paths and converted documents.
    """
    # number of pages from the backend method
-    path = [item for item in get_xlsx_paths() if item.stem == "test-01"][0]
+    path = next(item for item in get_xlsx_paths() if item.stem == "test-01")
     in_doc = InputDocument(
         path_or_stream=path,
         format=InputFormat.XLSX,
@@ -92,7 +90,7 @@ def test_pages(documents) -> None:
     assert backend.page_count() == 3
 
     # number of pages from the converted document
-    doc = [item for path, item in documents if path.stem == "test-01"][0]
+    doc = next(item for path, item in documents if path.stem == "test-01")
     assert len(doc.pages) == 3
 
     # page sizes as number of cells
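
Note: the `[... for ... if ...][0]` → `next(...)` rewrites look like ruff's fix for taking only the first element of a freshly built list (likely RUF015). `next()` over a generator stops at the first match instead of materializing the whole list, and still fails loudly when nothing matches (StopIteration instead of IndexError):

    # before: builds the full filtered list, then indexes it
    path = [item for item in get_xlsx_paths() if item.stem == "test-01"][0]
    # after: lazily take the first matching item
    path = next(item for item in get_xlsx_paths() if item.stem == "test-01")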

@@ -1,4 +1,3 @@
-import os
 from pathlib import Path
 
 from docling.backend.msword_backend import MsWordDocumentBackend
@@ -43,7 +42,6 @@ def test_heading_levels():
 
 
 def get_docx_paths():
-
     # Define the directory you want to search
     directory = Path("./tests/data/docx/")
 
@@ -53,14 +51,12 @@ def get_docx_paths():
 
 
 def get_converter():
-
     converter = DocumentConverter(allowed_formats=[InputFormat.DOCX])
 
     return converter
 
 
 def test_e2e_docx_conversions():
-
     docx_paths = get_docx_paths()
     converter = get_converter()
 
@@ -76,20 +72,20 @@ def test_e2e_docx_conversions():
         doc: DoclingDocument = conv_result.document
 
         pred_md: str = doc.export_to_markdown()
-        assert verify_export(
-            pred_md, str(gt_path) + ".md", generate=GENERATE
-        ), "export to md"
+        assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
+            "export to md"
+        )
 
         pred_itxt: str = doc._export_to_indented_text(
             max_text_len=70, explicit_tables=False
        )
-        assert verify_export(
-            pred_itxt, str(gt_path) + ".itxt", generate=GENERATE
-        ), "export to indented-text"
+        assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
+            "export to indented-text"
+        )
 
-        assert verify_document(
-            doc, str(gt_path) + ".json", generate=GENERATE
-        ), "document document"
+        assert verify_document(doc, str(gt_path) + ".json", generate=GENERATE), (
+            "document document"
+        )
 
         if docx_path.name == "word_tables.docx":
             pred_html: str = doc.export_to_html()

@@ -109,27 +109,27 @@ def test_patent_groundtruth(patents, groundtruth):
         md_name = path.stem + ".md"
         if md_name in gt_names:
             pred_md = doc.export_to_markdown()
-            assert (
-                pred_md == gt_names[md_name]
-            ), f"Markdown file mismatch against groundtruth {md_name}"
+            assert pred_md == gt_names[md_name], (
+                f"Markdown file mismatch against groundtruth {md_name}"
+            )
         json_path = path.with_suffix(".json")
         if json_path.stem in gt_names:
-            assert verify_document(
-                doc, str(json_path), GENERATE
-            ), f"JSON file mismatch against groundtruth {json_path}"
+            assert verify_document(doc, str(json_path), GENERATE), (
+                f"JSON file mismatch against groundtruth {json_path}"
+            )
         itxt_name = path.stem + ".itxt"
         if itxt_name in gt_names:
             pred_itxt = doc._export_to_indented_text()
-            assert (
-                pred_itxt == gt_names[itxt_name]
-            ), f"Indented text file mismatch against groundtruth {itxt_name}"
+            assert pred_itxt == gt_names[itxt_name], (
+                f"Indented text file mismatch against groundtruth {itxt_name}"
+            )
 
 
 def test_tables(tables):
     """Test the table parser."""
     # CHECK table in file tables_20180000016.xml
     file_name = "tables_ipa20180000016.xml"
-    file_table = [item[1] for item in tables if item[0].name == file_name][0]
+    file_table = next(item[1] for item in tables if item[0].name == file_name)
     assert file_table.num_rows == 13
     assert file_table.num_cols == 10
     assert len(file_table.table_cells) == 130
@@ -140,7 +140,7 @@ def test_patent_uspto_ice(patents):
 
     # CHECK application doc number 20200022300
     file_name = "ipa20200022300.xml"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
     if GENERATE:
         _generate_groundtruth(doc, Path(file_name).stem)
 
@@ -278,7 +278,7 @@ def test_patent_uspto_ice(patents):
 
     # CHECK application doc number 20180000016 for HTML entities, level 2 headings, tables
     file_name = "ipa20180000016.xml"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
     if GENERATE:
         _generate_groundtruth(doc, Path(file_name).stem)
 
@@ -348,7 +348,7 @@ def test_patent_uspto_ice(patents):
 
     # CHECK application doc number 20110039701 for complex long tables
     file_name = "ipa20110039701.xml"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
     assert doc.name == file_name
     assert len(doc.tables) == 17
 
@@ -358,7 +358,7 @@ def test_patent_uspto_grant_v2(patents):
 
     # CHECK application doc number 06442728
     file_name = "pg06442728.xml"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
     if GENERATE:
         _generate_groundtruth(doc, Path(file_name).stem)
 
@@ -376,12 +376,12 @@ def test_patent_uspto_grant_v2(patents):
     assert isinstance(texts[2], TextItem)
     assert texts[2].text == (
         "An interleaver receives incoming data frames of size N. The interleaver "
-        "indexes the elements of the frame with an N₁×N₂ index array. The interleaver "
+        "indexes the elements of the frame with an N₁×N₂ index array. The interleaver "  # noqa: RUF001
         "then effectively rearranges (permutes) the data by permuting the rows of the "
-        "index array. The interleaver employs the equation I(j,k)=I(j,αjk+βj)modP) to "
+        "index array. The interleaver employs the equation I(j,k)=I(j,αjk+βj)modP) to "  # noqa: RUF001
         "permute the columns (indexed by k) of each row (indexed by j). P is at least "
         "equal to N₂, βj is a constant which may be different for each row, and each "
-        "αj is a relative prime number relative to P. After permuting, the "
+        "αj is a relative prime number relative to P. After permuting, the "  # noqa: RUF001
         "interleaver outputs the data in a different order than received (e.g., "
         "receives sequentially row by row, outputs sequentially each column by column)."
     )
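
Note: RUF001 flags ambiguous Unicode characters inside string literals (here the subscripts, the multiplication sign, and the Greek letters in the patent abstract). The text is groundtruth and must stay byte-for-byte, so the warning is suppressed inline rather than "fixed" (`expected` below is a stand-in name):

    # the literal intentionally contains N₁×N₂ and Greek letters, so silence RUF001 for this line
    expected = "indexes the elements of the frame with an N₁×N₂ index array."  # noqa: RUF001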

@@ -402,7 +402,7 @@ def test_patent_uspto_app_v1(patents):
 
     # CHECK application doc number 20010031492
     file_name = "pa20010031492.xml"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
     if GENERATE:
         _generate_groundtruth(doc, Path(file_name).stem)
 
@@ -432,7 +432,7 @@ def test_patent_uspto_grant_aps(patents):
 
     # CHECK application doc number 057006474
     file_name = "pftaps057006474.txt"
-    doc = [item[1] for item in patents if item[0].name == file_name][0]
+    doc = next(item[1] for item in patents if item[0].name == file_name)
     if GENERATE:
         _generate_groundtruth(doc, Path(file_name).stem)
 

@@ -32,7 +32,7 @@ def test_text_cell_counts():
 
     doc_backend = _get_backend(pdf_doc)
 
-    for page_index in range(0, doc_backend.page_count()):
+    for page_index in range(doc_backend.page_count()):
         last_cell_count = None
         for i in range(10):
             page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
@@ -42,9 +42,9 @@ def test_text_cell_counts():
                 last_cell_count = len(cells)
 
             if len(cells) != last_cell_count:
-                assert (
-                    False
-                ), "Loading page multiple times yielded non-identical text cell counts"
+                assert False, (
+                    "Loading page multiple times yielded non-identical text cell counts"
+                )
             last_cell_count = len(cells)
 
 
@@ -66,7 +66,7 @@ def test_crop_page_image(test_doc_path):
     page_backend: PyPdfiumPageBackend = doc_backend.load_page(0)
 
     # Crop out "Figure 1" from the DocLayNet paper
-    im = page_backend.get_page_image(
+    page_backend.get_page_image(
         scale=2, cropbox=BoundingBox(l=317, t=246, r=574, b=527)
     )
     # im.show()

@@ -1,4 +1,3 @@
-import os
 from pathlib import Path
 
 from docling.datamodel.base_models import InputFormat
@@ -12,7 +11,6 @@ GENERATE = GEN_TEST_DATA
 
 
 def get_pptx_paths():
-
     # Define the directory you want to search
     directory = Path("./tests/data/pptx/")
 
@@ -22,14 +20,12 @@ def get_pptx_paths():
 
 
 def get_converter():
-
     converter = DocumentConverter(allowed_formats=[InputFormat.PPTX])
 
     return converter
 
 
 def test_e2e_pptx_conversions():
-
     pptx_paths = get_pptx_paths()
     converter = get_converter()
 
@@ -50,10 +46,10 @@ def test_e2e_pptx_conversions():
         pred_itxt: str = doc._export_to_indented_text(
             max_text_len=70, explicit_tables=False
         )
-        assert verify_export(
-            pred_itxt, str(gt_path) + ".itxt"
-        ), "export to indented-text"
+        assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
+            "export to indented-text"
+        )
 
-        assert verify_document(
-            doc, str(gt_path) + ".json", GENERATE
-        ), "document document"
+        assert verify_document(doc, str(gt_path) + ".json", GENERATE), (
+            "document document"
+        )

@@ -3,7 +3,6 @@ from pathlib import Path
 from docling_core.types.doc import CodeItem, TextItem
 from docling_core.types.doc.labels import CodeLanguageLabel, DocItemLabel
 
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import PdfPipelineOptions
@@ -12,7 +11,6 @@ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
 
 
 def get_converter():
-
     pipeline_options = PdfPipelineOptions()
     pipeline_options.generate_page_images = True
 
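
Note: the deleted backend import lines in these modules appear to be unused imports; removing them is the standard fix for pyflakes' unused-import rule (F401):

    # before: imported but never referenced in the module (F401)
    from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
    # after: the import line is simply deleted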

@@ -2,7 +2,6 @@ from pathlib import Path
 
 from docling_core.types.doc import PictureClassificationData
 
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import PdfPipelineOptions
@@ -11,7 +10,6 @@ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
 
 
 def get_converter():
-
     pipeline_options = PdfPipelineOptions()
     pipeline_options.generate_page_images = True
 
@@ -49,32 +47,32 @@ def test_picture_classifier():
 
     res = results[0]
     assert len(res.annotations) == 1
-    assert type(res.annotations[0]) == PictureClassificationData
+    assert isinstance(res.annotations[0], PictureClassificationData)
     classification_data = res.annotations[0]
     assert classification_data.provenance == "DocumentPictureClassifier"
-    assert (
-        len(classification_data.predicted_classes) == 16
-    ), "Number of predicted classes is not equal to 16"
+    assert len(classification_data.predicted_classes) == 16, (
+        "Number of predicted classes is not equal to 16"
+    )
     confidences = [pred.confidence for pred in classification_data.predicted_classes]
-    assert confidences == sorted(
-        confidences, reverse=True
-    ), "Predictions are not sorted in descending order of confidence"
-    assert (
-        classification_data.predicted_classes[0].class_name == "bar_chart"
-    ), "The prediction is wrong for the bar chart image."
+    assert confidences == sorted(confidences, reverse=True), (
+        "Predictions are not sorted in descending order of confidence"
+    )
+    assert classification_data.predicted_classes[0].class_name == "bar_chart", (
+        "The prediction is wrong for the bar chart image."
+    )
 
     res = results[1]
     assert len(res.annotations) == 1
-    assert type(res.annotations[0]) == PictureClassificationData
+    assert isinstance(res.annotations[0], PictureClassificationData)
     classification_data = res.annotations[0]
     assert classification_data.provenance == "DocumentPictureClassifier"
-    assert (
-        len(classification_data.predicted_classes) == 16
-    ), "Number of predicted classes is not equal to 16"
+    assert len(classification_data.predicted_classes) == 16, (
+        "Number of predicted classes is not equal to 16"
+    )
     confidences = [pred.confidence for pred in classification_data.predicted_classes]
-    assert confidences == sorted(
-        confidences, reverse=True
-    ), "Predictions are not sorted in descending order of confidence"
-    assert (
-        classification_data.predicted_classes[0].class_name == "map"
-    ), "The prediction is wrong for the bar chart image."
+    assert confidences == sorted(confidences, reverse=True), (
+        "Predictions are not sorted in descending order of confidence"
+    )
+    assert classification_data.predicted_classes[0].class_name == "map", (
+        "The prediction is wrong for the bar chart image."
+    )
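
Note: `type(x) == SomeClass` → `isinstance(x, SomeClass)` is the usual fix for the type-comparison rule (E721); isinstance also accepts subclasses, which is what these checks intend:

    # before: direct type comparison, flagged by the linter
    assert type(res.annotations[0]) == PictureClassificationData
    # after: idiomatic instance check
    assert isinstance(res.annotations[0], PictureClassificationData)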

@@ -1,7 +1,6 @@
 from pathlib import Path
 
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
-from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import AcceleratorDevice, PdfPipelineOptions
@@ -15,7 +14,6 @@ GENERATE_V2 = GEN_TEST_DATA
 
 
 def get_pdf_paths():
-
     # Define the directory you want to search
     directory = Path("./tests/data/pdf/")
 
@@ -25,7 +23,6 @@ def get_pdf_paths():
 
 
 def get_converter():
-
     pipeline_options = PdfPipelineOptions()
     pipeline_options.do_ocr = False
     pipeline_options.do_table_structure = True
@@ -45,7 +42,6 @@ def get_converter():
 
 
 def test_e2e_pdfs_conversions():
-
     pdf_paths = get_pdf_paths()
     converter = get_converter()
 

@@ -3,7 +3,6 @@ from pathlib import Path
 from typing import List
 
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
-from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
@@ -12,10 +12,9 @@ from docling.document_converter import PdfFormatOption
 
 
 def test_in_doc_from_valid_path():
-
     test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
     doc = _make_input_doc(test_doc_path)
-    assert doc.valid == True
+    assert doc.valid is True
 
 
 def test_in_doc_from_invalid_path():
@@ -23,29 +22,26 @@ def test_in_doc_from_invalid_path():
 
     doc = _make_input_doc(test_doc_path)
 
-    assert doc.valid == False
+    assert doc.valid is False
 
 
 def test_in_doc_from_valid_buf():
-
     buf = BytesIO(Path("./tests/data/pdf/2206.01062.pdf").open("rb").read())
     stream = DocumentStream(name="my_doc.pdf", stream=buf)
 
     doc = _make_input_doc_from_stream(stream)
-    assert doc.valid == True
+    assert doc.valid is True
 
 
 def test_in_doc_from_invalid_buf():
-
     buf = BytesIO(b"")
     stream = DocumentStream(name="my_doc.pdf", stream=buf)
 
     doc = _make_input_doc_from_stream(stream)
-    assert doc.valid == False
+    assert doc.valid is False
 
 
 def test_image_in_pdf_backend():
-
     in_doc = InputDocument(
         path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
         format=InputFormat.IMAGE,
@@ -76,7 +72,6 @@ def test_image_in_pdf_backend():
 
 
 def test_in_doc_with_page_range():
-
     test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
     limits = DocumentLimits()
     limits.page_range = (1, 10)
@@ -87,7 +82,7 @@ def test_in_doc_with_page_range():
         backend=PyPdfiumDocumentBackend,
         limits=limits,
     )
-    assert doc.valid == True
+    assert doc.valid is True
 
     limits.page_range = (9, 9)
 
@@ -97,7 +92,7 @@ def test_in_doc_with_page_range():
         backend=PyPdfiumDocumentBackend,
         limits=limits,
     )
-    assert doc.valid == True
+    assert doc.valid is True
 
     limits.page_range = (11, 12)
 
@@ -107,7 +102,7 @@ def test_in_doc_with_page_range():
         backend=PyPdfiumDocumentBackend,
         limits=limits,
     )
-    assert doc.valid == False
+    assert doc.valid is False
 
 
 def test_guess_format(tmp_path):
@@ -192,17 +187,17 @@ def test_guess_format(tmp_path):
     )
     doc_path = temp_dir / "docling_test.xml"
     doc_path.write_text(xml_content, encoding="utf-8")
-    assert dci._guess_format(doc_path) == None
+    assert dci._guess_format(doc_path) is None
     buf = BytesIO(Path(doc_path).open("rb").read())
     stream = DocumentStream(name="docling_test.xml", stream=buf)
-    assert dci._guess_format(stream) == None
+    assert dci._guess_format(stream) is None
 
     # Invalid USPTO patent (as plain text)
     stream = DocumentStream(name="pftaps057006474.txt", stream=BytesIO(b"xyz"))
-    assert dci._guess_format(stream) == None
+    assert dci._guess_format(stream) is None
     doc_path = temp_dir / "pftaps_wrong.txt"
     doc_path.write_text("xyz", encoding="utf-8")
-    assert dci._guess_format(doc_path) == None
+    assert dci._guess_format(doc_path) is None
 
     # Valid Docling JSON
     test_str = '{"name": ""}'
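
Note: the `== True` / `== False` / `== None` comparisons are rewritten to identity checks, matching pycodestyle's E712/E711 rules as enforced by ruff:

    # before: equality comparison against a singleton
    assert doc.valid == True
    assert dci._guess_format(doc_path) == None
    # after: identity is the idiomatic check for True/False/None
    assert doc.valid is True
    assert dci._guess_format(doc_path) is None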
Some files were not shown because too many files have changed in this diff.