feat: Support AsciiDoc and Markdown input format (#168)
* updated the base-model and added the asciidoc_backend
* updated the asciidoc backend
* Ensure all models work only on valid pages (#158)
* ci: run ci also on forks (#160)
* fix: fix legacy doc ref (#162)
* docs: typo fix (#155) - Corrected spelling of invidual to automatic
* add synchronize event for forks
* feat: add coverage_threshold to skip OCR for small images (#161)
* filter individual boxes
* rename option
* chore: bump version to 2.1.0 [skip ci]
* adding tests for asciidocs
* first working asciidoc parser
* reformatted the code
* fixed the mypy
* adding test_02.asciidoc
* Drafting Markdown backend via Marko library
* work in progress on MD backend
* md_backend produces docling document with headers, paragraphs, lists
* Improvements in md parsing
* Detecting and assembling tables in markdown in temporary buffers
* Added initial docling table support to md_backend
* Cleaned code, improved logging for MD
* Fixes MyPy requirements, and rest of pre-commit
* Fixed example run_md, added origin info to md_backend
* working on asciidocs, struggling with ImageRef
* able to parse the captions and image URIs
* fixed the mypy
* Update all backends with proper filename in DocumentOrigin
* Update to docling-core v2.1.0
* Fixes for MD backend, to avoid duplicated text inserts into the docling doc
* Fix styling
* Added support for code blocks and fenced code in MD
* cleaned prints
* Added proper processing of inline textual elements for MD backend
* Fixed issues with duplicated paragraphs and incorrect lists in pptx
* Fixed issue with group ordering in pptx backend, added debug log into run_with_formats

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Signed-off-by: ABHISHEK FADAKE <31249309+fadkeabhi@users.noreply.github.com>
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
Co-authored-by: Peter Staar <taa@zurich.ibm.com>
Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Co-authored-by: ABHISHEK FADAKE <31249309+fadkeabhi@users.noreply.github.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent 3496b4838f
commit 3023f18ba0
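In practical terms, this change lets `.md` and `.adoc`/`.asciidoc` files flow through the same converter as PDF, DOCX, PPTX and HTML. A minimal usage sketch follows; the input paths are placeholders and the calls mirror the docs/examples/run_with_formats.py script touched in this commit, so treat it as illustrative rather than the canonical API:

```python
from pathlib import Path

from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter

# Placeholder inputs; any .md or .adoc/.asciidoc path is routed the same way.
sources = [Path("README.md"), Path("tests/data/test_01.asciidoc")]

# Whitelist the two new formats; non-matching files are ignored.
converter = DocumentConverter(allowed_formats=[InputFormat.MD, InputFormat.ASCIIDOC])

for result in converter.convert_all(sources):
    print(result.document.export_to_markdown())
```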
@ -94,5 +94,5 @@ If you use Docling in your projects, please consider citing the following:
## License

The Docling codebase is under MIT license.

For individual model usage, please refer to the model licenses found in the original packages.
@ -13,6 +13,7 @@ if TYPE_CHECKING:
class AbstractDocumentBackend(ABC):
    @abstractmethod
    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        self.file = in_doc.file
        self.path_or_stream = path_or_stream
        self.document_hash = in_doc.document_hash
        self.input_format = in_doc.format
435 docling/backend/asciidoc_backend.py Normal file
@ -0,0 +1,435 @@
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Set, Union
|
||||
|
||||
from docling_core.types.doc import (
|
||||
DocItem,
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
DocumentOrigin,
|
||||
GroupItem,
|
||||
GroupLabel,
|
||||
ImageRef,
|
||||
NodeItem,
|
||||
Size,
|
||||
TableCell,
|
||||
TableData,
|
||||
)
|
||||
from pydantic import AnyUrl
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AsciiDocBackend(DeclarativeDocumentBackend):
|
||||
|
||||
def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
|
||||
self.path_or_stream = path_or_stream
|
||||
|
||||
try:
|
||||
if isinstance(self.path_or_stream, BytesIO):
|
||||
text_stream = self.path_or_stream.getvalue().decode("utf-8")
|
||||
self.lines = text_stream.split("\n")
|
||||
if isinstance(self.path_or_stream, Path):
|
||||
with open(self.path_or_stream, "r", encoding="utf-8") as f:
|
||||
self.lines = f.readlines()
|
||||
self.valid = True
|
||||
|
||||
except Exception as e:
|
||||
raise RuntimeError(
|
||||
f"Could not initialize AsciiDoc backend for file with hash {self.document_hash}."
|
||||
) from e
|
||||
return
|
||||
|
||||
def is_valid(self) -> bool:
|
||||
return self.valid
|
||||
|
||||
@classmethod
|
||||
def supports_pagination(cls) -> bool:
|
||||
return False
|
||||
|
||||
def unload(self):
|
||||
return
|
||||
|
||||
@classmethod
|
||||
def supported_formats(cls) -> Set[InputFormat]:
|
||||
return {InputFormat.ASCIIDOC}
|
||||
|
||||
def convert(self) -> DoclingDocument:
|
||||
"""
|
||||
Parses the AsciiDoc content into a structured document model.
|
||||
"""
|
||||
|
||||
origin = DocumentOrigin(
|
||||
filename=self.file.name or "file",
|
||||
mimetype="text/asciidoc",
|
||||
binary_hash=self.document_hash,
|
||||
)
|
||||
|
||||
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
||||
|
||||
doc = self._parse(doc)
|
||||
|
||||
return doc
|
||||
|
||||
def _parse(self, doc: DoclingDocument):
|
||||
"""
|
||||
Main function that orchestrates the parsing by yielding components:
|
||||
title, section headers, text, lists, and tables.
|
||||
"""
|
||||
|
||||
content = ""
|
||||
|
||||
in_list = False
|
||||
in_table = False
|
||||
|
||||
text_data: list[str] = []
|
||||
table_data: list[str] = []
|
||||
caption_data: list[str] = []
|
||||
|
||||
# parents: dict[int, Union[DocItem, GroupItem, None]] = {}
|
||||
parents: dict[int, Union[GroupItem, None]] = {}
|
||||
# indents: dict[int, Union[DocItem, GroupItem, None]] = {}
|
||||
indents: dict[int, Union[GroupItem, None]] = {}
|
||||
|
||||
for i in range(0, 10):
|
||||
parents[i] = None
|
||||
indents[i] = None
|
||||
|
||||
for line in self.lines:
|
||||
# line = line.strip()
|
||||
|
||||
# Title
|
||||
if self._is_title(line):
|
||||
item = self._parse_title(line)
|
||||
level = item["level"]
|
||||
|
||||
parents[level] = doc.add_text(
|
||||
text=item["text"], label=DocItemLabel.TITLE
|
||||
)
|
||||
|
||||
# Section headers
|
||||
elif self._is_section_header(line):
|
||||
item = self._parse_section_header(line)
|
||||
level = item["level"]
|
||||
|
||||
parents[level] = doc.add_heading(
|
||||
text=item["text"], level=item["level"], parent=parents[level - 1]
|
||||
)
|
||||
for k, v in parents.items():
|
||||
if k > level:
|
||||
parents[k] = None
|
||||
|
||||
# Lists
|
||||
elif self._is_list_item(line):
|
||||
|
||||
_log.debug(f"line: {line}")
|
||||
item = self._parse_list_item(line)
|
||||
_log.debug(f"parsed list-item: {item}")
|
||||
|
||||
level = self._get_current_level(parents)
|
||||
|
||||
if not in_list:
|
||||
in_list = True
|
||||
|
||||
parents[level + 1] = doc.add_group(
|
||||
parent=parents[level], name="list", label=GroupLabel.LIST
|
||||
)
|
||||
indents[level + 1] = item["indent"]
|
||||
|
||||
elif in_list and item["indent"] > indents[level]:
|
||||
parents[level + 1] = doc.add_group(
|
||||
parent=parents[level], name="list", label=GroupLabel.LIST
|
||||
)
|
||||
indents[level + 1] = item["indent"]
|
||||
|
||||
elif in_list and item["indent"] < indents[level]:
|
||||
|
||||
# print(item["indent"], " => ", indents[level])
|
||||
while item["indent"] < indents[level]:
|
||||
# print(item["indent"], " => ", indents[level])
|
||||
parents[level] = None
|
||||
indents[level] = None
|
||||
level -= 1
|
||||
|
||||
doc.add_list_item(
|
||||
item["text"], parent=self._get_current_parent(parents)
|
||||
)
|
||||
|
||||
elif in_list and not self._is_list_item(line):
|
||||
in_list = False
|
||||
|
||||
level = self._get_current_level(parents)
|
||||
parents[level] = None
|
||||
|
||||
# Tables
|
||||
elif line.strip() == "|===" and not in_table: # start of table
|
||||
in_table = True
|
||||
|
||||
elif self._is_table_line(line): # within a table
|
||||
in_table = True
|
||||
table_data.append(self._parse_table_line(line))
|
||||
|
||||
elif in_table and (
|
||||
(not self._is_table_line(line)) or line.strip() == "|==="
|
||||
): # end of table
|
||||
|
||||
caption = None
|
||||
if len(caption_data) > 0:
|
||||
caption = doc.add_text(
|
||||
text=" ".join(caption_data), label=DocItemLabel.CAPTION
|
||||
)
|
||||
|
||||
caption_data = []
|
||||
|
||||
data = self._populate_table_as_grid(table_data)
|
||||
doc.add_table(
|
||||
data=data, parent=self._get_current_parent(parents), caption=caption
|
||||
)
|
||||
|
||||
in_table = False
|
||||
table_data = []
|
||||
|
||||
# Picture
|
||||
elif self._is_picture(line):
|
||||
|
||||
caption = None
|
||||
if len(caption_data) > 0:
|
||||
caption = doc.add_text(
|
||||
text=" ".join(caption_data), label=DocItemLabel.CAPTION
|
||||
)
|
||||
|
||||
caption_data = []
|
||||
|
||||
item = self._parse_picture(line)
|
||||
|
||||
size = None
|
||||
if "width" in item and "height" in item:
|
||||
size = Size(width=int(item["width"]), height=int(item["height"]))
|
||||
|
||||
uri = None
|
||||
if (
|
||||
"uri" in item
|
||||
and not item["uri"].startswith("http")
|
||||
and item["uri"].startswith("//")
|
||||
):
|
||||
uri = "file:" + item["uri"]
|
||||
elif (
|
||||
"uri" in item
|
||||
and not item["uri"].startswith("http")
|
||||
and item["uri"].startswith("/")
|
||||
):
|
||||
uri = "file:/" + item["uri"]
|
||||
elif "uri" in item and not item["uri"].startswith("http"):
|
||||
uri = "file://" + item["uri"]
|
||||
|
||||
image = ImageRef(mimetype="image/png", size=size, dpi=70, uri=uri)
|
||||
doc.add_picture(image=image, caption=caption)
|
||||
|
||||
# Caption
|
||||
elif self._is_caption(line) and len(caption_data) == 0:
|
||||
item = self._parse_caption(line)
|
||||
caption_data.append(item["text"])
|
||||
|
||||
elif (
|
||||
len(line.strip()) > 0 and len(caption_data) > 0
|
||||
): # allow multiline captions
|
||||
item = self._parse_text(line)
|
||||
caption_data.append(item["text"])
|
||||
|
||||
# Plain text
|
||||
elif len(line.strip()) == 0 and len(text_data) > 0:
|
||||
doc.add_text(
|
||||
text=" ".join(text_data),
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
parent=self._get_current_parent(parents),
|
||||
)
|
||||
text_data = []
|
||||
|
||||
elif len(line.strip()) > 0: # allow multiline texts
|
||||
|
||||
item = self._parse_text(line)
|
||||
text_data.append(item["text"])
|
||||
|
||||
if len(text_data) > 0:
|
||||
doc.add_text(
|
||||
text=" ".join(text_data),
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
parent=self._get_current_parent(parents),
|
||||
)
|
||||
text_data = []
|
||||
|
||||
if in_table and len(table_data) > 0:
|
||||
data = self._populate_table_as_grid(table_data)
|
||||
doc.add_table(data=data, parent=self._get_current_parent(parents))
|
||||
|
||||
in_table = False
|
||||
table_data = []
|
||||
|
||||
return doc
|
||||
|
||||
def _get_current_level(self, parents):
|
||||
for k, v in parents.items():
|
||||
if v is None and k > 0:
|
||||
return k - 1
|
||||
|
||||
return 0
|
||||
|
||||
def _get_current_parent(self, parents):
|
||||
for k, v in parents.items():
|
||||
if v is None and k > 0:
|
||||
return parents[k - 1]
|
||||
|
||||
return None
|
||||
|
||||
# ========= Title
|
||||
def _is_title(self, line):
|
||||
return re.match(r"^= ", line)
|
||||
|
||||
def _parse_title(self, line):
|
||||
return {"type": "title", "text": line[2:].strip(), "level": 0}
|
||||
|
||||
# ========= Section headers
|
||||
def _is_section_header(self, line):
|
||||
return re.match(r"^==+", line)
|
||||
|
||||
def _parse_section_header(self, line):
|
||||
match = re.match(r"^(=+)\s+(.*)", line)
|
||||
|
||||
marker = match.group(1)  # The header marker (e.g., "==", "===")
text = match.group(2)  # The actual text of the section header
|
||||
|
||||
header_level = marker.count("=") # number of '=' represents level
|
||||
return {
|
||||
"type": "header",
|
||||
"level": header_level - 1,
|
||||
"text": text.strip(),
|
||||
}
|
||||
|
||||
# ========= Lists
|
||||
def _is_list_item(self, line):
|
||||
return re.match(r"^(\s)*(\*|-|\d+\.|\w+\.) ", line)
|
||||
|
||||
def _parse_list_item(self, line):
|
||||
"""Extract the item marker (number or bullet symbol) and the text of the item."""
|
||||
|
||||
match = re.match(r"^(\s*)(\*|-|\d+\.)\s+(.*)", line)
|
||||
if match:
|
||||
indent = match.group(1)
|
||||
marker = match.group(2) # The list marker (e.g., "*", "-", "1.")
|
||||
text = match.group(3) # The actual text of the list item
|
||||
|
||||
if marker == "*" or marker == "-":
|
||||
return {
|
||||
"type": "list_item",
|
||||
"marker": marker,
|
||||
"text": text.strip(),
|
||||
"numbered": False,
|
||||
"indent": 0 if indent == None else len(indent),
|
||||
}
|
||||
else:
|
||||
return {
|
||||
"type": "list_item",
|
||||
"marker": marker,
|
||||
"text": text.strip(),
|
||||
"numbered": True,
|
||||
"indent": 0 if indent == None else len(indent),
|
||||
}
|
||||
else:
|
||||
# Fallback if no match
|
||||
return {
|
||||
"type": "list_item",
|
||||
"marker": "-",
|
||||
"text": line,
|
||||
"numbered": False,
|
||||
"indent": 0,
|
||||
}
|
||||
|
||||
# ========= Tables
|
||||
def _is_table_line(self, line):
|
||||
return re.match(r"^\|.*\|", line)
|
||||
|
||||
def _parse_table_line(self, line):
|
||||
# Split table cells and trim extra spaces
|
||||
return [cell.strip() for cell in line.split("|") if cell.strip()]
|
||||
|
||||
def _populate_table_as_grid(self, table_data):
|
||||
|
||||
num_rows = len(table_data)
|
||||
|
||||
# Adjust the table data into a grid format
|
||||
num_cols = max(len(row) for row in table_data)
|
||||
|
||||
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
|
||||
for row_idx, row in enumerate(table_data):
|
||||
# Pad rows with empty strings to match column count
|
||||
# grid.append(row + [''] * (max_cols - len(row)))
|
||||
|
||||
for col_idx, text in enumerate(row):
|
||||
row_span = 1
|
||||
col_span = 1
|
||||
|
||||
cell = TableCell(
|
||||
text=text,
|
||||
row_span=row_span,
|
||||
col_span=col_span,
|
||||
start_row_offset_idx=row_idx,
|
||||
end_row_offset_idx=row_idx + row_span,
|
||||
start_col_offset_idx=col_idx,
|
||||
end_col_offset_idx=col_idx + col_span,
|
||||
col_header=False,
|
||||
row_header=False,
|
||||
)
|
||||
data.table_cells.append(cell)
|
||||
|
||||
return data
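For reference, the table handling above expects the usual AsciiDoc form: a block delimited by `|===` lines, with one `|`-separated row per line. A standalone sketch of the same cell-splitting rule used in _parse_table_line (the sample rows are made up for illustration, and this is not the backend API itself):

```python
# Standalone illustration of the cell-splitting rule used above.
sample = [
    "|===",
    "| Name | Role |",
    "| Alice | Engineer |",
    "|===",
]

rows = [
    [cell.strip() for cell in line.split("|") if cell.strip()]
    for line in sample
    if line.strip() != "|==="
]
print(rows)  # [['Name', 'Role'], ['Alice', 'Engineer']]
```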
|
||||
|
||||
# ========= Pictures
|
||||
def _is_picture(self, line):
|
||||
return re.match(r"^image::", line)
|
||||
|
||||
def _parse_picture(self, line):
|
||||
"""
|
||||
Parse an image macro, extracting its path and attributes.
|
||||
Syntax: image::path/to/image.png[Alt Text, width=200, height=150, align=center]
|
||||
"""
|
||||
mtch = re.match(r"^image::(.+)\[(.*)\]$", line)
|
||||
if mtch:
|
||||
picture_path = mtch.group(1).strip()
|
||||
attributes = mtch.group(2).split(",")
|
||||
picture_info = {"type": "picture", "uri": picture_path}
|
||||
|
||||
# Extract optional attributes (alt text, width, height, alignment)
|
||||
if attributes:
|
||||
picture_info["alt"] = attributes[0].strip() if attributes[0] else ""
|
||||
for attr in attributes[1:]:
|
||||
key, value = attr.split("=")
|
||||
picture_info[key.strip()] = value.strip()
|
||||
|
||||
return picture_info
|
||||
|
||||
return {"type": "picture", "uri": line}
|
||||
|
||||
# ========= Captions
|
||||
def _is_caption(self, line):
|
||||
return re.match(r"^\.(.+)", line)
|
||||
|
||||
def _parse_caption(self, line):
|
||||
mtch = re.match(r"^\.(.+)", line)
|
||||
if mtch:
|
||||
text = mtch.group(1)
|
||||
return {"type": "caption", "text": text}
|
||||
|
||||
return {"type": "caption", "text": ""}
|
||||
|
||||
# ========= Plain text
|
||||
def _parse_text(self, line):
|
||||
return {"type": "text", "text": line.strip()}
|
@ -7,6 +7,7 @@ from bs4 import BeautifulSoup
|
||||
from docling_core.types.doc import (
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
DocumentOrigin,
|
||||
GroupLabel,
|
||||
TableCell,
|
||||
TableData,
|
||||
@ -66,7 +67,13 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
|
||||
def convert(self) -> DoclingDocument:
|
||||
# access self.path_or_stream to load stuff
|
||||
doc = DoclingDocument(name="dummy")
|
||||
origin = DocumentOrigin(
|
||||
filename=self.file.name or "file",
|
||||
mimetype="text/html",
|
||||
binary_hash=self.document_hash,
|
||||
)
|
||||
|
||||
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
||||
_log.debug("Trying to convert HTML...")
|
||||
|
||||
if self.is_valid():
|
||||
|
293 docling/backend/md_backend.py Normal file
@ -0,0 +1,293 @@
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Set, Union
|
||||
|
||||
import marko
|
||||
import marko.ext
|
||||
import marko.ext.gfm
|
||||
import marko.inline
|
||||
from docling_core.types.doc import (
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
DocumentOrigin,
|
||||
GroupLabel,
|
||||
TableCell,
|
||||
TableData,
|
||||
)
|
||||
from marko import Markdown
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
|
||||
_log.debug("MD INIT!!!")
|
||||
|
||||
# Markdown file:
|
||||
self.path_or_stream = path_or_stream
|
||||
self.valid = True
|
||||
self.markdown = "" # To store original Markdown string
|
||||
|
||||
self.in_table = False
|
||||
self.md_table_buffer: list[str] = []
|
||||
self.inline_text_buffer = ""
|
||||
|
||||
try:
|
||||
if isinstance(self.path_or_stream, BytesIO):
|
||||
text_stream = self.path_or_stream.getvalue().decode("utf-8")
|
||||
self.markdown = text_stream
|
||||
if isinstance(self.path_or_stream, Path):
|
||||
with open(self.path_or_stream, "r", encoding="utf-8") as f:
|
||||
md_content = f.read()
|
||||
self.markdown = md_content
|
||||
self.valid = True
|
||||
|
||||
_log.debug(self.markdown)
|
||||
except Exception as e:
|
||||
raise RuntimeError(
|
||||
f"Could not initialize MD backend for file with hash {self.document_hash}."
|
||||
) from e
|
||||
return
|
||||
|
||||
def close_table(self, doc=None):
|
||||
if self.in_table:
|
||||
_log.debug("=== TABLE START ===")
|
||||
for md_table_row in self.md_table_buffer:
|
||||
_log.debug(md_table_row)
|
||||
_log.debug("=== TABLE END ===")
|
||||
tcells = []
|
||||
result_table = []
|
||||
for n, md_table_row in enumerate(self.md_table_buffer):
|
||||
data = []
|
||||
if n == 0:
|
||||
header = [t.strip() for t in md_table_row.split("|")[1:-1]]
|
||||
for value in header:
|
||||
data.append(value)
|
||||
result_table.append(data)
|
||||
if n > 1:
|
||||
values = [t.strip() for t in md_table_row.split("|")[1:-1]]
|
||||
for value in values:
|
||||
data.append(value)
|
||||
result_table.append(data)
|
||||
|
||||
for trow_ind, trow in enumerate(result_table):
|
||||
for tcol_ind, cellval in enumerate(trow):
|
||||
row_span = (
|
||||
1 # currently supporting just simple tables (without spans)
|
||||
)
|
||||
col_span = (
|
||||
1 # currently supporting just simple tables (without spans)
|
||||
)
|
||||
icell = TableCell(
|
||||
text=cellval.strip(),
|
||||
row_span=row_span,
|
||||
col_span=col_span,
|
||||
start_row_offset_idx=trow_ind,
|
||||
end_row_offset_idx=trow_ind + row_span,
|
||||
start_col_offset_idx=tcol_ind,
|
||||
end_col_offset_idx=tcol_ind + col_span,
|
||||
col_header=False,
|
||||
row_header=False,
|
||||
)
|
||||
tcells.append(icell)
|
||||
|
||||
num_rows = len(result_table)
|
||||
num_cols = len(result_table[0])
|
||||
self.in_table = False
|
||||
self.md_table_buffer = [] # clean table markdown buffer
|
||||
# Initialize Docling TableData directly from the collected cells.
# (Appending tcells again after construction would duplicate every cell.)
data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=tcells)
|
||||
if len(tcells) > 0:
|
||||
doc.add_table(data=data)
|
||||
return
|
||||
|
||||
def process_inline_text(self, parent_element, doc=None):
|
||||
# self.inline_text_buffer += str(text_in)
|
||||
txt = self.inline_text_buffer.strip()
|
||||
if len(txt) > 0:
|
||||
doc.add_text(
|
||||
label=DocItemLabel.PARAGRAPH,
|
||||
parent=parent_element,
|
||||
text=txt,
|
||||
)
|
||||
self.inline_text_buffer = ""
|
||||
|
||||
def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
|
||||
# Iterates over all elements in the AST
|
||||
# Check for different element types and process relevant details
|
||||
if isinstance(element, marko.block.Heading):
|
||||
self.close_table(doc)
|
||||
self.process_inline_text(parent_element, doc)
|
||||
_log.debug(
|
||||
f" - Heading level {element.level}, content: {element.children[0].children}"
|
||||
)
|
||||
if element.level == 1:
|
||||
doc_label = DocItemLabel.TITLE
|
||||
else:
|
||||
doc_label = DocItemLabel.SECTION_HEADER
|
||||
snippet_text = element.children[0].children.strip()
|
||||
|
||||
parent_element = doc.add_text(
|
||||
label=doc_label, parent=parent_element, text=snippet_text
|
||||
)
|
||||
|
||||
elif isinstance(element, marko.block.List):
|
||||
self.close_table(doc)
|
||||
self.process_inline_text(parent_element, doc)
|
||||
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
|
||||
list_label = GroupLabel.LIST
|
||||
if element.ordered:
|
||||
list_label = GroupLabel.ORDERED_LIST
|
||||
parent_element = doc.add_group(
|
||||
label=list_label, name="list", parent=parent_element
|
||||
)
|
||||
|
||||
elif isinstance(element, marko.block.ListItem):
|
||||
self.close_table(doc)
|
||||
self.process_inline_text(parent_element, doc)
|
||||
_log.debug(" - List item")
|
||||
|
||||
snippet_text = str(element.children[0].children[0].children)
|
||||
is_numbered = False
|
||||
if parent_element.label == GroupLabel.ORDERED_LIST:
|
||||
is_numbered = True
|
||||
doc.add_list_item(
|
||||
enumerated=is_numbered, parent=parent_element, text=snippet_text
|
||||
)
|
||||
|
||||
elif isinstance(element, marko.inline.Image):
|
||||
self.close_table(doc)
|
||||
self.process_inline_text(parent_element, doc)
|
||||
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
|
||||
doc.add_picture(parent=parent_element, caption=element.title)
|
||||
|
||||
elif isinstance(element, marko.block.Paragraph):
|
||||
self.process_inline_text(parent_element, doc)
|
||||
|
||||
elif isinstance(element, marko.inline.RawText):
|
||||
_log.debug(f" - Paragraph (raw text): {element.children}")
|
||||
snippet_text = str(element.children).strip()
|
||||
# Detect start of the table:
|
||||
if "|" in snippet_text:
|
||||
# most likely part of the markdown table
|
||||
self.in_table = True
|
||||
if len(self.md_table_buffer) > 0:
|
||||
self.md_table_buffer[-1] += snippet_text
|
||||
else:
|
||||
self.md_table_buffer.append(snippet_text)
|
||||
else:
|
||||
self.close_table(doc)
|
||||
self.in_table = False
|
||||
# most likely just inline text
|
||||
self.inline_text_buffer += str(
|
||||
element.children
|
||||
) # do not strip an inline text, as it may contain important spaces
|
||||
|
||||
elif isinstance(element, marko.inline.CodeSpan):
|
||||
self.close_table(doc)
|
||||
self.process_inline_text(parent_element, doc)
|
||||
_log.debug(f" - Code Span: {element.children}")
|
||||
snippet_text = str(element.children).strip()
|
||||
doc.add_text(
|
||||
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
|
||||
)
|
||||
|
||||
elif isinstance(element, marko.block.CodeBlock):
|
||||
self.close_table(doc)
|
||||
self.process_inline_text(parent_element, doc)
|
||||
_log.debug(f" - Code Block: {element.children}")
|
||||
snippet_text = str(element.children[0].children).strip()
|
||||
doc.add_text(
|
||||
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
|
||||
)
|
||||
|
||||
elif isinstance(element, marko.block.FencedCode):
|
||||
self.close_table(doc)
|
||||
self.process_inline_text(parent_element, doc)
|
||||
_log.debug(f" - Code Block: {element.children}")
|
||||
snippet_text = str(element.children[0].children).strip()
|
||||
doc.add_text(
|
||||
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
|
||||
)
|
||||
|
||||
elif isinstance(element, marko.inline.LineBreak):
|
||||
self.process_inline_text(parent_element, doc)
|
||||
if self.in_table:
|
||||
_log.debug("Line break in a table")
|
||||
self.md_table_buffer.append("")
|
||||
|
||||
elif isinstance(element, marko.block.HTMLBlock):
|
||||
self.process_inline_text(parent_element, doc)
|
||||
self.close_table(doc)
|
||||
_log.debug("HTML Block: {}".format(element))
|
||||
if (
|
||||
len(element.children) > 0
|
||||
): # If Marko doesn't return any content for HTML block, skip it
|
||||
snippet_text = str(element.children).strip()
|
||||
doc.add_text(
|
||||
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
|
||||
)
|
||||
else:
|
||||
if not isinstance(element, str):
|
||||
self.close_table(doc)
|
||||
_log.debug("Some other element: {}".format(element))
|
||||
|
||||
# Iterate through the element's children (if any)
|
||||
if not isinstance(element, marko.block.ListItem):
|
||||
if not isinstance(element, marko.block.Heading):
|
||||
if not isinstance(element, marko.block.FencedCode):
|
||||
# if not isinstance(element, marko.block.Paragraph):
|
||||
if hasattr(element, "children"):
|
||||
for child in element.children:
|
||||
self.iterate_elements(child, depth + 1, doc, parent_element)
|
||||
|
||||
def is_valid(self) -> bool:
|
||||
return self.valid
|
||||
|
||||
def unload(self):
|
||||
if isinstance(self.path_or_stream, BytesIO):
|
||||
self.path_or_stream.close()
|
||||
self.path_or_stream = None
|
||||
|
||||
@classmethod
|
||||
def supports_pagination(cls) -> bool:
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def supported_formats(cls) -> Set[InputFormat]:
|
||||
return {InputFormat.MD}
|
||||
|
||||
def convert(self) -> DoclingDocument:
|
||||
_log.debug("converting Markdown...")
|
||||
|
||||
origin = DocumentOrigin(
|
||||
filename=self.file.name or "file",
|
||||
mimetype="text/markdown",
|
||||
binary_hash=self.document_hash,
|
||||
)
|
||||
|
||||
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
||||
|
||||
if self.is_valid():
|
||||
# Parse the markdown into an abstract syntax tree (AST)
|
||||
marko_parser = Markdown()
|
||||
parsed_ast = marko_parser.parse(self.markdown)
|
||||
# Start iterating from the root of the AST
|
||||
self.iterate_elements(parsed_ast, 0, doc, None)
|
||||
else:
|
||||
raise RuntimeError(
|
||||
f"Cannot convert md with {self.document_hash} because the backend failed to init."
|
||||
)
|
||||
return doc
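The dispatch in iterate_elements is easiest to follow on a tiny input. A sketch of what Marko hands the backend, using the same Markdown().parse call as convert above; this only illustrates the AST shape and is not part of the backend:

```python
import marko
from marko import Markdown

ast = Markdown().parse("# Title\n\nSome *inline* text.\n\n- item one\n- item two\n")

# Top-level children are block elements: Heading, Paragraph, List, ...
for element in ast.children:
    if isinstance(element, marko.block.Heading):
        print("heading", element.level, element.children[0].children)
    elif isinstance(element, marko.block.List):
        print("list, ordered =", element.ordered)
    elif isinstance(element, marko.block.Paragraph):
        # Paragraph children are inline elements (RawText, Emphasis, ...).
        print("paragraph with", len(element.children), "inline children")
```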
|
@ -83,21 +83,14 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
# Parses the PPTX into a structured document model.
|
||||
# origin = DocumentOrigin(filename=self.path_or_stream.name, mimetype=next(iter(FormatToMimeType.get(InputFormat.PPTX))), binary_hash=self.document_hash)
|
||||
|
||||
fname = ""
|
||||
if isinstance(self.path_or_stream, Path):
|
||||
fname = self.path_or_stream.name
|
||||
|
||||
origin = DocumentOrigin(
|
||||
filename=fname,
|
||||
filename=self.file.name or "file",
|
||||
mimetype="application/vnd.ms-powerpoint",
|
||||
binary_hash=self.document_hash,
|
||||
)
|
||||
if len(fname) > 0:
|
||||
docname = Path(fname).stem
|
||||
else:
|
||||
docname = "stream"
|
||||
|
||||
doc = DoclingDocument(
|
||||
name=docname, origin=origin
|
||||
name=self.file.stem or "file", origin=origin
|
||||
) # must add origin information
|
||||
doc = self.walk_linear(self.pptx_obj, doc)
|
||||
|
||||
@ -119,10 +112,16 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
|
||||
def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
|
||||
is_a_list = False
|
||||
is_list_group_created = False
|
||||
enum_list_item_value = 0
|
||||
new_list = None
|
||||
bullet_type = "None"
|
||||
list_text = ""
|
||||
list_label = GroupLabel.LIST
|
||||
prov = self.generate_prov(shape, slide_ind, shape.text.strip())
|
||||
|
||||
# Identify if shape contains lists
|
||||
for paragraph in shape.text_frame.paragraphs:
|
||||
enum_list_item_value += 1
|
||||
bullet_type = "None"
|
||||
# Check if paragraph is a bullet point using the `element` XML
|
||||
p = paragraph._element
|
||||
if (
|
||||
@ -143,29 +142,32 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
if paragraph.level > 0:
|
||||
# Most likely a sub-list
|
||||
is_a_list = True
|
||||
list_text = paragraph.text.strip()
|
||||
|
||||
prov = self.generate_prov(shape, slide_ind, shape.text.strip())
|
||||
|
||||
if is_a_list:
|
||||
# Determine if this is an unordered list or an ordered list.
|
||||
# Set GroupLabel.ORDERED_LIST when it fits.
|
||||
list_label = GroupLabel.LIST
|
||||
if bullet_type == "Numbered":
|
||||
list_label = GroupLabel.ORDERED_LIST
|
||||
|
||||
new_list = doc.add_group(
|
||||
label=list_label, name="list", parent=parent_slide
|
||||
)
|
||||
else:
|
||||
new_list = None
|
||||
|
||||
if is_a_list:
|
||||
_log.debug("LIST DETECTED!")
|
||||
else:
|
||||
_log.debug("No List")
|
||||
|
||||
# for e in p.iter():
|
||||
# If there is a list inside of the shape, create a new docling list to assign list items to
|
||||
# if is_a_list:
|
||||
# new_list = doc.add_group(
|
||||
# label=list_label, name=f"list", parent=parent_slide
|
||||
# )
|
||||
|
||||
# Iterate through paragraphs to build up text
|
||||
for paragraph in shape.text_frame.paragraphs:
|
||||
# p_text = paragraph.text.strip()
|
||||
p = paragraph._element
|
||||
enum_list_item_value += 1
|
||||
inline_paragraph_text = ""
|
||||
inline_list_item_text = ""
|
||||
|
||||
for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}):
|
||||
if len(e.text.strip()) > 0:
|
||||
e_is_a_list_item = False
|
||||
@ -187,15 +189,17 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
e_is_a_list_item = False
|
||||
|
||||
if e_is_a_list_item:
|
||||
if len(inline_paragraph_text) > 0:
|
||||
# output accumulated inline text:
|
||||
doc.add_text(
|
||||
label=doc_label,
|
||||
parent=parent_slide,
|
||||
text=inline_paragraph_text,
|
||||
prov=prov,
|
||||
)
|
||||
# Set marker and enumerated arguments if this is an enumeration element.
|
||||
enum_marker = str(enum_list_item_value) + "."
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=is_numbered,
|
||||
parent=new_list,
|
||||
text=list_text,
|
||||
prov=prov,
|
||||
)
|
||||
inline_list_item_text += e.text
|
||||
# print(e.text)
|
||||
else:
|
||||
# Assign proper label to the text, depending if it's a Title or Section Header
|
||||
# For other types of text, assign - PARAGRAPH
|
||||
@ -210,15 +214,34 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
doc_label = DocItemLabel.TITLE
|
||||
elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
|
||||
doc_label = DocItemLabel.SECTION_HEADER
|
||||
|
||||
enum_list_item_value = 0
|
||||
inline_paragraph_text += e.text
|
||||
|
||||
doc.add_text(
|
||||
label=doc_label,
|
||||
parent=parent_slide,
|
||||
text=list_text,
|
||||
prov=prov,
|
||||
)
|
||||
if len(inline_paragraph_text) > 0:
|
||||
# output accumulated inline text:
|
||||
doc.add_text(
|
||||
label=doc_label,
|
||||
parent=parent_slide,
|
||||
text=inline_paragraph_text,
|
||||
prov=prov,
|
||||
)
|
||||
|
||||
if len(inline_list_item_text) > 0:
|
||||
enum_marker = ""
|
||||
if is_numbered:
|
||||
enum_marker = str(enum_list_item_value) + "."
|
||||
if not is_list_group_created:
|
||||
new_list = doc.add_group(
|
||||
label=list_label, name="list", parent=parent_slide
|
||||
)
|
||||
is_list_group_created = True
|
||||
doc.add_list_item(
|
||||
marker=enum_marker,
|
||||
enumerated=is_numbered,
|
||||
parent=new_list,
|
||||
text=inline_list_item_text,
|
||||
prov=prov,
|
||||
)
|
||||
return
|
||||
|
||||
def handle_title(self, shape, parent_slide, slide_ind, doc):
|
||||
@ -311,7 +334,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
if len(tcells) > 0:
|
||||
# If table is not fully empty...
|
||||
# Create Docling table
|
||||
doc.add_table(data=data, prov=prov)
|
||||
doc.add_table(parent=parent_slide, data=data, prov=prov)
|
||||
return
|
||||
|
||||
def walk_linear(self, pptx_obj, doc) -> DoclingDocument:
|
||||
|
@ -85,20 +85,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
def convert(self) -> DoclingDocument:
|
||||
# Parses the DOCX into a structured document model.
|
||||
|
||||
fname = ""
|
||||
if isinstance(self.path_or_stream, Path):
|
||||
fname = self.path_or_stream.name
|
||||
|
||||
origin = DocumentOrigin(
|
||||
filename=fname,
|
||||
filename=self.file.name or "file",
|
||||
mimetype="application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
binary_hash=self.document_hash,
|
||||
)
|
||||
if len(fname) > 0:
|
||||
docname = Path(fname).stem
|
||||
else:
|
||||
docname = "stream"
|
||||
doc = DoclingDocument(name=docname, origin=origin)
|
||||
|
||||
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
||||
if self.is_valid():
|
||||
assert self.docx_obj is not None
|
||||
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
|
||||
|
@ -30,6 +30,8 @@ class InputFormat(str, Enum):
|
||||
HTML = "html"
|
||||
IMAGE = "image"
|
||||
PDF = "pdf"
|
||||
ASCIIDOC = "asciidoc"
|
||||
MD = "md"
|
||||
|
||||
|
||||
class OutputFormat(str, Enum):
|
||||
@ -43,29 +45,33 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
||||
InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
|
||||
InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
|
||||
InputFormat.PDF: ["pdf"],
|
||||
InputFormat.MD: ["md"],
|
||||
InputFormat.HTML: ["html", "htm", "xhtml"],
|
||||
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
||||
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
||||
}
|
||||
|
||||
FormatToMimeType: Dict[InputFormat, Set[str]] = {
|
||||
InputFormat.DOCX: {
|
||||
FormatToMimeType: Dict[InputFormat, List[str]] = {
|
||||
InputFormat.DOCX: [
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.template",
|
||||
},
|
||||
InputFormat.PPTX: {
|
||||
],
|
||||
InputFormat.PPTX: [
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.template",
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.slideshow",
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
|
||||
},
|
||||
InputFormat.HTML: {"text/html", "application/xhtml+xml"},
|
||||
InputFormat.IMAGE: {
|
||||
],
|
||||
InputFormat.HTML: ["text/html", "application/xhtml+xml"],
|
||||
InputFormat.IMAGE: [
|
||||
"image/png",
|
||||
"image/jpeg",
|
||||
"image/tiff",
|
||||
"image/gif",
|
||||
"image/bmp",
|
||||
},
|
||||
InputFormat.PDF: {"application/pdf"},
|
||||
],
|
||||
InputFormat.PDF: ["application/pdf"],
|
||||
InputFormat.ASCIIDOC: ["text/asciidoc"],
|
||||
InputFormat.MD: ["text/markdown", "text/x-markdown"],
|
||||
}
|
||||
MimeTypeToFormat = {
|
||||
mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
|
||||
|
@ -45,6 +45,8 @@ from docling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
DocumentStream,
|
||||
ErrorItem,
|
||||
FormatToExtensions,
|
||||
FormatToMimeType,
|
||||
InputFormat,
|
||||
MimeTypeToFormat,
|
||||
Page,
|
||||
@ -484,26 +486,48 @@ class _DocumentConversionInput(BaseModel):
|
||||
else:
|
||||
raise RuntimeError(f"Unexpected obj type in iterator: {type(obj)}")
|
||||
|
||||
def _guess_format(self, obj):
|
||||
content = None
|
||||
def _guess_format(self, obj: Union[Path, DocumentStream]):
|
||||
content = b"" # empty binary blob
|
||||
format = None
|
||||
|
||||
if isinstance(obj, Path):
|
||||
mime = filetype.guess_mime(str(obj))
|
||||
if mime is None:
|
||||
ext = obj.suffix[1:]
|
||||
mime = self._mime_from_extension(ext)
|
||||
if mime is None: # must guess from the file content
|
||||
with obj.open("rb") as f:
|
||||
content = f.read(1024) # Read first 1KB
|
||||
|
||||
elif isinstance(obj, DocumentStream):
|
||||
obj.stream.seek(0)
|
||||
content = obj.stream.read(8192)
|
||||
obj.stream.seek(0)
|
||||
mime = filetype.guess_mime(content)
|
||||
if mime is None:
|
||||
ext = (
|
||||
obj.name.rsplit(".", 1)[-1]
|
||||
if ("." in obj.name and not obj.name.startswith("."))
|
||||
else ""
|
||||
)
|
||||
mime = self._mime_from_extension(ext)
|
||||
|
||||
if mime is None:
|
||||
mime = self._detect_html_xhtml(content)
|
||||
mime = mime or self._detect_html_xhtml(content)
|
||||
mime = mime or "text/plain"
|
||||
|
||||
format = MimeTypeToFormat.get(mime)
|
||||
return format
|
||||
|
||||
def _mime_from_extension(self, ext):
|
||||
mime = None
|
||||
if ext in FormatToExtensions[InputFormat.ASCIIDOC]:
|
||||
mime = FormatToMimeType[InputFormat.ASCIIDOC][0]
|
||||
elif ext in FormatToExtensions[InputFormat.HTML]:
|
||||
mime = FormatToMimeType[InputFormat.HTML][0]
|
||||
elif ext in FormatToExtensions[InputFormat.MD]:
|
||||
mime = FormatToMimeType[InputFormat.MD][0]
|
||||
|
||||
return mime
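The effect of this fallback is that plain-text formats which content sniffing cannot identify (AsciiDoc, Markdown) are still routed to the right backend via their extension. A small check against the tables defined in base_models above:

```python
from docling.datamodel.base_models import FormatToMimeType, InputFormat, MimeTypeToFormat

# Extension-based fallback for plain-text formats that content sniffing misses.
assert FormatToMimeType[InputFormat.ASCIIDOC][0] == "text/asciidoc"
assert MimeTypeToFormat["text/asciidoc"] == InputFormat.ASCIIDOC
assert MimeTypeToFormat["text/markdown"] == InputFormat.MD
```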
|
||||
|
||||
def _detect_html_xhtml(self, content):
|
||||
content_str = content.decode("ascii", errors="ignore").lower()
|
||||
# Remove XML comments
|
||||
|
@ -8,8 +8,10 @@ from typing import Dict, Iterable, Iterator, List, Optional, Type
|
||||
from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.asciidoc_backend import AsciiDocBackend
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
from docling.backend.md_backend import MarkdownDocumentBackend
|
||||
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
|
||||
@ -52,6 +54,16 @@ class PowerpointFormatOption(FormatOption):
|
||||
backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
|
||||
|
||||
|
||||
class MarkdownFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
|
||||
|
||||
|
||||
class AsciiDocFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[AbstractDocumentBackend] = AsciiDocBackend
|
||||
|
||||
|
||||
class HTMLFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
|
||||
@ -74,6 +86,12 @@ _format_to_default_options = {
|
||||
InputFormat.PPTX: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
|
||||
),
|
||||
InputFormat.MD: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
|
||||
),
|
||||
InputFormat.ASCIIDOC: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=AsciiDocBackend
|
||||
),
|
||||
InputFormat.HTML: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
|
||||
),
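Besides the defaults registered above, the two new FormatOption classes can be passed explicitly. A sketch that mirrors the existing PdfFormatOption usage in the examples; the wiring is illustrative, with the test fixture path taken from this commit:

```python
from pathlib import Path

from docling.datamodel.base_models import InputFormat
from docling.document_converter import (
    AsciiDocFormatOption,
    DocumentConverter,
    MarkdownFormatOption,
)

# Pin the new backends explicitly instead of relying on the defaults above.
converter = DocumentConverter(
    allowed_formats=[InputFormat.MD, InputFormat.ASCIIDOC],
    format_options={
        InputFormat.MD: MarkdownFormatOption(),
        InputFormat.ASCIIDOC: AsciiDocFormatOption(),
    },
)
result = converter.convert(Path("tests/data/test_01.asciidoc"))
print(result.document.export_to_markdown())
```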
|
||||
|
46 docs/examples/run_md.py Normal file
@ -0,0 +1,46 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
from docling.backend.md_backend import MarkdownDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def main():
|
||||
input_paths = [Path("README.md")]
|
||||
|
||||
for path in input_paths:
|
||||
in_doc = InputDocument(
|
||||
path_or_stream=path,
|
||||
format=InputFormat.MD,
|
||||
backend=MarkdownDocumentBackend,
|
||||
)
|
||||
mdb = MarkdownDocumentBackend(in_doc=in_doc, path_or_stream=path)
|
||||
document = mdb.convert()
|
||||
|
||||
out_path = Path("scratch")
|
||||
print(
|
||||
f"Document {path} converted." f"\nSaved markdown output to: {str(out_path)}"
|
||||
)
|
||||
|
||||
# Export Docling document format to markdowndoc:
|
||||
fn = os.path.basename(path)
|
||||
|
||||
with (out_path / f"{fn}.md").open("w") as fp:
|
||||
fp.write(document.export_to_markdown())
|
||||
|
||||
with (out_path / f"{fn}.json").open("w") as fp:
|
||||
fp.write(json.dumps(document.export_to_dict()))
|
||||
|
||||
with (out_path / f"{fn}.yaml").open("w") as fp:
|
||||
fp.write(yaml.safe_dump(document.export_to_dict()))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
@ -19,12 +19,15 @@ _log = logging.getLogger(__name__)
|
||||
|
||||
def main():
|
||||
input_paths = [
|
||||
Path("README.md"),
|
||||
Path("tests/data/wiki_duck.html"),
|
||||
Path("tests/data/word_sample.docx"),
|
||||
Path("tests/data/lorem_ipsum.docx"),
|
||||
Path("tests/data/powerpoint_sample.pptx"),
|
||||
Path("tests/data/2305.03393v1-pg9-img.png"),
|
||||
Path("tests/data/2206.01062.pdf"),
|
||||
Path("tests/data/test_01.asciidoc"),
|
||||
Path("tests/data/test_01.asciidoc"),
|
||||
]
|
||||
|
||||
## for defaults use:
|
||||
@ -40,6 +43,8 @@ def main():
|
||||
InputFormat.DOCX,
|
||||
InputFormat.HTML,
|
||||
InputFormat.PPTX,
|
||||
InputFormat.ASCIIDOC,
|
||||
InputFormat.MD,
|
||||
], # whitelist formats, non-matching files are ignored.
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
@ -60,15 +65,15 @@ def main():
|
||||
f"Document {res.input.file.name} converted."
|
||||
f"\nSaved markdown output to: {str(out_path)}"
|
||||
)
|
||||
# print(res.docdocument.export_to_markdown())
|
||||
_log.debug(res.document._export_to_indented_text(max_text_len=16))
|
||||
# Export Docling document format to markdowndoc:
|
||||
with (out_path / f"{res.input.file.name}.md").open("w") as fp:
|
||||
with (out_path / f"{res.input.file.stem}.md").open("w") as fp:
|
||||
fp.write(res.document.export_to_markdown())
|
||||
|
||||
with (out_path / f"{res.input.file.name}.json").open("w") as fp:
|
||||
with (out_path / f"{res.input.file.stem}.json").open("w") as fp:
|
||||
fp.write(json.dumps(res.document.export_to_dict()))
|
||||
|
||||
with (out_path / f"{res.input.file.name}.yaml").open("w") as fp:
|
||||
with (out_path / f"{res.input.file.stem}.yaml").open("w") as fp:
|
||||
fp.write(yaml.safe_dump(res.document.export_to_dict()))
|
||||
|
||||
|
||||
|
1738 poetry.lock generated
File diff suppressed because it is too large
@ -37,9 +37,9 @@ torchvision = [
|
||||
######################
|
||||
python = "^3.10"
|
||||
pydantic = "^2.0.0"
|
||||
docling-core = "^2.0.0"
|
||||
docling-core = "^2.1.0"
|
||||
docling-ibm-models = "^2.0.1"
|
||||
deepsearch-glm = "^0.25.0"
|
||||
deepsearch-glm = "^0.26.1"
|
||||
filetype = "^1.2.0"
|
||||
pypdfium2 = "^4.30.0"
|
||||
pydantic-settings = "^2.3.0"
|
||||
@ -57,6 +57,7 @@ python-docx = "^1.1.2"
|
||||
python-pptx = "^1.0.2"
|
||||
beautifulsoup4 = "^4.12.3"
|
||||
pandas = "^2.1.4"
|
||||
marko = "^2.1.2"
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
black = {extras = ["jupyter"], version = "^24.4.2"}
|
||||
|
@ -34,10 +34,10 @@
|
||||
<paragraph><location><page_2><loc_8><loc_71><loc_47><loc_88></location>The second problem is called table-structure decomposition. The latter is a long standing problem in the community of document understanding [6, 4, 14]. Contrary to the table-location problem, there are no commonly used approaches that can easily be re-purposed to solve this problem. Lately, a set of new model-architectures has been proposed by the community to address table-structure decomposition [37, 36, 18, 20]. All these models have some weaknesses (see Sec. 2). The common denominator here is the reliance on textual features and/or the inability to provide the bounding box of each table-cell in the original image.</paragraph>
|
||||
<paragraph><location><page_2><loc_8><loc_53><loc_47><loc_71></location>In this paper, we want to address these weaknesses and present a robust table-structure decomposition algorithm. The design criteria for our model are the following. First, we want our algorithm to be language agnostic. In this way, we can obtain the structure of any table, irregardless of the language. Second, we want our algorithm to leverage as much data as possible from the original PDF document. For programmatic PDF documents, the text-cells can often be extracted much faster and with higher accuracy compared to OCR methods. Last but not least, we want to have a direct link between the table-cell and its bounding box in the image.</paragraph>
|
||||
<paragraph><location><page_2><loc_8><loc_45><loc_47><loc_53></location>To meet the design criteria listed above, we developed a new model called TableFormer and a synthetically generated table structure dataset called SynthTabNet $^{1}$. In particular, our contributions in this work can be summarised as follows:</paragraph>
|
||||
<paragraph><location><page_2><loc_10><loc_38><loc_47><loc_44></location>· We propose TableFormer , a transformer based model that predicts tables structure and bounding boxes for the table content simultaneously in an end-to-end approach.</paragraph>
|
||||
<paragraph><location><page_2><loc_10><loc_31><loc_47><loc_37></location>· Across all benchmark datasets TableFormer significantly outperforms existing state-of-the-art metrics, while being much more efficient in training and inference to existing works.</paragraph>
|
||||
<paragraph><location><page_2><loc_10><loc_25><loc_47><loc_29></location>· We present SynthTabNet a synthetically generated dataset, with various appearance styles and complexity.</paragraph>
|
||||
<paragraph><location><page_2><loc_10><loc_19><loc_47><loc_24></location>· An augmented dataset based on PubTabNet [37], FinTabNet [36], and TableBank [17] with generated ground-truth for reproducibility.</paragraph>
|
||||
<paragraph><location><page_2><loc_10><loc_38><loc_47><loc_44></location>- · We propose TableFormer , a transformer based model that predicts tables structure and bounding boxes for the table content simultaneously in an end-to-end approach.</paragraph>
|
||||
<paragraph><location><page_2><loc_10><loc_31><loc_47><loc_37></location>- · Across all benchmark datasets TableFormer significantly outperforms existing state-of-the-art metrics, while being much more efficient in training and inference to existing works.</paragraph>
|
||||
<paragraph><location><page_2><loc_10><loc_25><loc_47><loc_29></location>- · We present SynthTabNet a synthetically generated dataset, with various appearance styles and complexity.</paragraph>
|
||||
<paragraph><location><page_2><loc_10><loc_19><loc_47><loc_24></location>- · An augmented dataset based on PubTabNet [37], FinTabNet [36], and TableBank [17] with generated ground-truth for reproducibility.</paragraph>
|
||||
<paragraph><location><page_2><loc_8><loc_12><loc_47><loc_18></location>The paper is structured as follows. In Sec. 2, we give a brief overview of the current state-of-the-art. In Sec. 3, we describe the datasets on which we train. In Sec. 4, we introduce the TableFormer model-architecture and describe</paragraph>
|
||||
<paragraph><location><page_2><loc_50><loc_86><loc_89><loc_91></location>its results & performance in Sec. 5. As a conclusion, we describe how this new model-architecture can be re-purposed for other tasks in the computer-vision community.</paragraph>
|
||||
<subtitle-level-1><location><page_2><loc_50><loc_83><loc_81><loc_85></location>2. Previous work and State of the Art</subtitle-level-1>
|
||||
@ -210,50 +210,50 @@
|
||||
<subtitle-level-1><location><page_8><loc_50><loc_37><loc_75><loc_38></location>6. Future Work & Conclusion</subtitle-level-1>
|
||||
<paragraph><location><page_8><loc_50><loc_18><loc_89><loc_35></location>In this paper, we presented TableFormer an end-to-end transformer based approach to predict table structures and bounding boxes of cells from an image. This approach enables us to recreate the table structure, and extract the cell content from PDF or OCR by using bounding boxes. Additionally, it provides the versatility required in real-world scenarios when dealing with various types of PDF documents, and languages. Furthermore, our method outperforms all state-of-the-arts with a wide margin. Finally, we introduce "SynthTabNet" a challenging synthetically generated dataset that reinforces missing characteristics from other datasets.</paragraph>
|
||||
<subtitle-level-1><location><page_8><loc_50><loc_14><loc_60><loc_15></location>References</subtitle-level-1>
|
||||
<paragraph><location><page_8><loc_51><loc_10><loc_89><loc_13></location>[1] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-</paragraph>
|
||||
<paragraph><location><page_8><loc_51><loc_10><loc_89><loc_13></location>- [1] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-</paragraph>
|
||||
<figure>
|
||||
<location><page_8><loc_50><loc_77><loc_91><loc_88></location>
|
||||
</figure>
|
||||
<paragraph><location><page_9><loc_11><loc_85><loc_47><loc_91></location>end object detection with transformers. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision - ECCV 2020 , pages 213-229, Cham, 2020. Springer International Publishing. 5</paragraph>
|
||||
<paragraph><location><page_9><loc_9><loc_81><loc_47><loc_85></location>[2] Zewen Chi, Heyan Huang, Heng-Da Xu, Houjin Yu, Wanxuan Yin, and Xian-Ling Mao. Complicated table structure recognition. arXiv preprint arXiv:1908.04729 , 2019. 3</paragraph>
|
||||
<paragraph><location><page_9><loc_9><loc_77><loc_47><loc_81></location>[3] Bertrand Couasnon and Aurelie Lemaitre. Recognition of Tables and Forms , pages 647-677. Springer London, London, 2014. 2</paragraph>
|
||||
<paragraph><location><page_9><loc_9><loc_71><loc_47><loc_77></location>[4] Herv'e D'ejean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), Apr. 2019. http://sac.founderit.com/. 2</paragraph>
|
||||
<paragraph><location><page_9><loc_9><loc_66><loc_47><loc_71></location>[5] Basilios Gatos, Dimitrios Danatsas, Ioannis Pratikakis, and Stavros J Perantonis. Automatic table detection in document images. In International Conference on Pattern Recognition and Image Analysis , pages 609-618. Springer, 2005. 2</paragraph>
|
||||
<paragraph><location><page_9><loc_9><loc_60><loc_47><loc_65></location>[6] Max Gobel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. 2</paragraph>
|
||||
<paragraph><location><page_9><loc_9><loc_56><loc_47><loc_60></location>[7] EA Green and M Krishnamoorthy. Recognition of tables using table grammars. procs. In Symposium on Document Analysis and Recognition (SDAIR'95) , pages 261-277. 2</paragraph>
|
||||
<paragraph><location><page_9><loc_9><loc_49><loc_47><loc_56></location>[8] Khurram Azeem Hashmi, Alain Pagani, Marcus Liwicki, Didier Stricker, and Muhammad Zeshan Afzal. Castabdetectors: Cascade network for table detection in document images with recursive feature pyramid and switchable atrous convolution. Journal of Imaging , 7(10), 2021. 1</paragraph>
|
||||
<paragraph><location><page_9><loc_9><loc_45><loc_47><loc_49></location>[9] Kaiming He, Georgia Gkioxari, Piotr Dollar, and Ross Girshick. Mask r-cnn. In Proceedings of the IEEE International Conference on Computer Vision (ICCV) , Oct 2017. 1</paragraph>
|
||||
<paragraph><location><page_9><loc_8><loc_39><loc_47><loc_44></location>[10] Yelin He, X. Qi, Jiaquan Ye, Peng Gao, Yihao Chen, Bingcong Li, Xin Tang, and Rong Xiao. Pingan-vcgroup's solution for icdar 2021 competition on scientific table image recognition to latex. ArXiv , abs/2105.01846, 2021. 2</paragraph>
|
||||
<paragraph><location><page_9><loc_8><loc_32><loc_47><loc_39></location>[11] Jianying Hu, Ramanujan S Kashi, Daniel P Lopresti, and Gordon Wilfong. Medium-independent table detection. In Document Recognition and Retrieval VII , volume 3967, pages 291-302. International Society for Optics and Photonics, 1999. 2</paragraph>
|
||||
<paragraph><location><page_9><loc_8><loc_25><loc_47><loc_32></location>[12] Matthew Hurst. A constraint-based approach to table structure derivation. In Proceedings of the Seventh International Conference on Document Analysis and Recognition - Volume 2 , ICDAR '03, page 911, USA, 2003. IEEE Computer Society. 2</paragraph>
|
||||
<paragraph><location><page_9><loc_8><loc_18><loc_47><loc_25></location>[13] Thotreingam Kasar, Philippine Barlas, Sebastien Adam, Cl'ement Chatelain, and Thierry Paquet. Learning to detect tables in scanned document images using line information. In 2013 12th International Conference on Document Analysis and Recognition , pages 1185-1189. IEEE, 2013. 2</paragraph>
|
||||
<paragraph><location><page_9><loc_8><loc_14><loc_47><loc_18></location>[14] Pratik Kayal, Mrinal Anand, Harsh Desai, and Mayank Singh. Icdar 2021 competition on scientific table image recognition to latex, 2021. 2</paragraph>
|
||||
<paragraph><location><page_9><loc_8><loc_10><loc_47><loc_14></location>[15] Harold W Kuhn. The hungarian method for the assignment problem. Naval research logistics quarterly , 2(1-2):83-97, 1955. 6</paragraph>
|
||||
<paragraph><location><page_9><loc_50><loc_82><loc_89><loc_91></location>[16] Girish Kulkarni, Visruth Premraj, Vicente Ordonez, Sagnik Dhar, Siming Li, Yejin Choi, Alexander C. Berg, and Tamara L. Berg. Babytalk: Understanding and generating simple image descriptions. IEEE Transactions on Pattern Analysis and Machine Intelligence , 35(12):2891-2903, 2013. 4</paragraph>
|
||||
<paragraph><location><page_9><loc_50><loc_78><loc_89><loc_82></location>[17] Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou, and Zhoujun Li. Tablebank: A benchmark dataset for table detection and recognition, 2019. 2, 3</paragraph>
|
||||
<paragraph><location><page_9><loc_50><loc_67><loc_89><loc_78></location>[18] Yiren Li, Zheng Huang, Junchi Yan, Yi Zhou, Fan Ye, and Xianhui Liu. Gfte: Graph-based financial table extraction. In Alberto Del Bimbo, Rita Cucchiara, Stan Sclaroff, Giovanni Maria Farinella, Tao Mei, Marco Bertini, Hugo Jair Escalante, and Roberto Vezzani, editors, Pattern Recognition. ICPR International Workshops and Challenges , pages 644-658, Cham, 2021. Springer International Publishing. 2, 3</paragraph>
|
||||
<paragraph><location><page_9><loc_50><loc_59><loc_89><loc_67></location>[19] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter Staar. Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence , 35(17):15137-15145, May 2021. 1</paragraph>
|
||||
<paragraph><location><page_9><loc_50><loc_53><loc_89><loc_58></location>[20] Rujiao Long, Wen Wang, Nan Xue, Feiyu Gao, Zhibo Yang, Yongpan Wang, and Gui-Song Xia. Parsing table structures in the wild. In Proceedings of the IEEE/CVF International Conference on Computer Vision , pages 944-952, 2021. 2</paragraph>
|
||||
<paragraph><location><page_9><loc_50><loc_45><loc_89><loc_53></location>[21] Shubham Singh Paliwal, D Vishwanath, Rohit Rahul, Monika Sharma, and Lovekesh Vig. Tablenet: Deep learning model for end-to-end table detection and tabular data extraction from scanned document images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 128-133. IEEE, 2019. 1</paragraph>
|
||||
<paragraph><location><page_9><loc_50><loc_30><loc_89><loc_44></location>[22] Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. Pytorch: An imperative style, high-performance deep learning library. In H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch'e-Buc, E. Fox, and R. Garnett, editors, Advances in Neural Information Processing Systems 32 , pages 8024-8035. Curran Associates, Inc., 2019. 6</paragraph>
|
||||
<paragraph><location><page_9><loc_50><loc_21><loc_89><loc_29></location>[23] Devashish Prasad, Ayan Gadpal, Kshitij Kapadni, Manish Visave, and Kavita Sultanpure. Cascadetabnet: An approach for end to end table detection and structure recognition from image-based documents. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops , pages 572-573, 2020. 1</paragraph>
|
||||
<paragraph><location><page_9><loc_50><loc_16><loc_89><loc_21></location>[24] Shah Rukh Qasim, Hassan Mahmood, and Faisal Shafait. Rethinking table recognition using graph neural networks. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 142-147. IEEE, 2019. 3</paragraph>
|
||||
<paragraph><location><page_9><loc_50><loc_10><loc_89><loc_15></location>[25] Hamid Rezatofighi, Nathan Tsoi, JunYoung Gwak, Amir Sadeghian, Ian Reid, and Silvio Savarese. Generalized intersection over union: A metric and a loss for bounding box regression. In Proceedings of the IEEE/CVF Conference on</paragraph>
|
||||
<paragraph><location><page_9><loc_11><loc_85><loc_47><loc_91></location>- end object detection with transformers. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision - ECCV 2020 , pages 213-229, Cham, 2020. Springer International Publishing. 5</paragraph>
|
||||
<paragraph><location><page_9><loc_9><loc_81><loc_47><loc_85></location>- [2] Zewen Chi, Heyan Huang, Heng-Da Xu, Houjin Yu, Wanxuan Yin, and Xian-Ling Mao. Complicated table structure recognition. arXiv preprint arXiv:1908.04729 , 2019. 3</paragraph>
|
||||
<paragraph><location><page_9><loc_9><loc_77><loc_47><loc_81></location>- [3] Bertrand Couasnon and Aurelie Lemaitre. Recognition of Tables and Forms , pages 647-677. Springer London, London, 2014. 2</paragraph>
|
||||
<paragraph><location><page_9><loc_9><loc_71><loc_47><loc_77></location>- [4] Herv'e D'ejean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), Apr. 2019. http://sac.founderit.com/. 2</paragraph>
|
||||
<paragraph><location><page_9><loc_9><loc_66><loc_47><loc_71></location>- [5] Basilios Gatos, Dimitrios Danatsas, Ioannis Pratikakis, and Stavros J Perantonis. Automatic table detection in document images. In International Conference on Pattern Recognition and Image Analysis , pages 609-618. Springer, 2005. 2</paragraph>
|
||||
<paragraph><location><page_9><loc_9><loc_60><loc_47><loc_65></location>- [6] Max Gobel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. 2</paragraph>
|
||||
<paragraph><location><page_9><loc_9><loc_56><loc_47><loc_60></location>- [7] EA Green and M Krishnamoorthy. Recognition of tables using table grammars. procs. In Symposium on Document Analysis and Recognition (SDAIR'95) , pages 261-277. 2</paragraph>
|
||||
<paragraph><location><page_9><loc_9><loc_49><loc_47><loc_56></location>- [8] Khurram Azeem Hashmi, Alain Pagani, Marcus Liwicki, Didier Stricker, and Muhammad Zeshan Afzal. Castabdetectors: Cascade network for table detection in document images with recursive feature pyramid and switchable atrous convolution. Journal of Imaging , 7(10), 2021. 1</paragraph>
|
||||
<paragraph><location><page_9><loc_9><loc_45><loc_47><loc_49></location>- [9] Kaiming He, Georgia Gkioxari, Piotr Dollar, and Ross Girshick. Mask r-cnn. In Proceedings of the IEEE International Conference on Computer Vision (ICCV) , Oct 2017. 1</paragraph>
|
||||
<paragraph><location><page_9><loc_8><loc_39><loc_47><loc_44></location>- [10] Yelin He, X. Qi, Jiaquan Ye, Peng Gao, Yihao Chen, Bingcong Li, Xin Tang, and Rong Xiao. Pingan-vcgroup's solution for icdar 2021 competition on scientific table image recognition to latex. ArXiv , abs/2105.01846, 2021. 2</paragraph>
|
||||
<paragraph><location><page_9><loc_8><loc_32><loc_47><loc_39></location>- [11] Jianying Hu, Ramanujan S Kashi, Daniel P Lopresti, and Gordon Wilfong. Medium-independent table detection. In Document Recognition and Retrieval VII , volume 3967, pages 291-302. International Society for Optics and Photonics, 1999. 2</paragraph>
|
||||
<paragraph><location><page_9><loc_8><loc_25><loc_47><loc_32></location>- [12] Matthew Hurst. A constraint-based approach to table structure derivation. In Proceedings of the Seventh International Conference on Document Analysis and Recognition - Volume 2 , ICDAR '03, page 911, USA, 2003. IEEE Computer Society. 2</paragraph>
|
||||
<paragraph><location><page_9><loc_8><loc_18><loc_47><loc_25></location>- [13] Thotreingam Kasar, Philippine Barlas, Sebastien Adam, Cl'ement Chatelain, and Thierry Paquet. Learning to detect tables in scanned document images using line information. In 2013 12th International Conference on Document Analysis and Recognition , pages 1185-1189. IEEE, 2013. 2</paragraph>
|
||||
<paragraph><location><page_9><loc_8><loc_14><loc_47><loc_18></location>- [14] Pratik Kayal, Mrinal Anand, Harsh Desai, and Mayank Singh. Icdar 2021 competition on scientific table image recognition to latex, 2021. 2</paragraph>
|
||||
<paragraph><location><page_9><loc_8><loc_10><loc_47><loc_14></location>- [15] Harold W Kuhn. The hungarian method for the assignment problem. Naval research logistics quarterly , 2(1-2):83-97, 1955. 6</paragraph>
|
||||
<paragraph><location><page_9><loc_50><loc_82><loc_89><loc_91></location>- [16] Girish Kulkarni, Visruth Premraj, Vicente Ordonez, Sagnik Dhar, Siming Li, Yejin Choi, Alexander C. Berg, and Tamara L. Berg. Babytalk: Understanding and generating simple image descriptions. IEEE Transactions on Pattern Analysis and Machine Intelligence , 35(12):2891-2903, 2013. 4</paragraph>
|
||||
<paragraph><location><page_9><loc_50><loc_78><loc_89><loc_82></location>- [17] Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou, and Zhoujun Li. Tablebank: A benchmark dataset for table detection and recognition, 2019. 2, 3</paragraph>
|
||||
<paragraph><location><page_9><loc_50><loc_67><loc_89><loc_78></location>- [18] Yiren Li, Zheng Huang, Junchi Yan, Yi Zhou, Fan Ye, and Xianhui Liu. Gfte: Graph-based financial table extraction. In Alberto Del Bimbo, Rita Cucchiara, Stan Sclaroff, Giovanni Maria Farinella, Tao Mei, Marco Bertini, Hugo Jair Escalante, and Roberto Vezzani, editors, Pattern Recognition. ICPR International Workshops and Challenges , pages 644-658, Cham, 2021. Springer International Publishing. 2, 3</paragraph>
|
||||
<paragraph><location><page_9><loc_50><loc_59><loc_89><loc_67></location>- [19] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter Staar. Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence , 35(17):15137-15145, May 2021. 1</paragraph>
|
||||
<paragraph><location><page_9><loc_50><loc_53><loc_89><loc_58></location>- [20] Rujiao Long, Wen Wang, Nan Xue, Feiyu Gao, Zhibo Yang, Yongpan Wang, and Gui-Song Xia. Parsing table structures in the wild. In Proceedings of the IEEE/CVF International Conference on Computer Vision , pages 944-952, 2021. 2</paragraph>
|
||||
<paragraph><location><page_9><loc_50><loc_45><loc_89><loc_53></location>- [21] Shubham Singh Paliwal, D Vishwanath, Rohit Rahul, Monika Sharma, and Lovekesh Vig. Tablenet: Deep learning model for end-to-end table detection and tabular data extraction from scanned document images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 128-133. IEEE, 2019. 1</paragraph>
|
||||
<paragraph><location><page_9><loc_50><loc_30><loc_89><loc_44></location>- [22] Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. Pytorch: An imperative style, high-performance deep learning library. In H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch'e-Buc, E. Fox, and R. Garnett, editors, Advances in Neural Information Processing Systems 32 , pages 8024-8035. Curran Associates, Inc., 2019. 6</paragraph>
|
||||
<paragraph><location><page_9><loc_50><loc_21><loc_89><loc_29></location>- [23] Devashish Prasad, Ayan Gadpal, Kshitij Kapadni, Manish Visave, and Kavita Sultanpure. Cascadetabnet: An approach for end to end table detection and structure recognition from image-based documents. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops , pages 572-573, 2020. 1</paragraph>
|
||||
<paragraph><location><page_9><loc_50><loc_16><loc_89><loc_21></location>- [24] Shah Rukh Qasim, Hassan Mahmood, and Faisal Shafait. Rethinking table recognition using graph neural networks. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 142-147. IEEE, 2019. 3</paragraph>
|
||||
<paragraph><location><page_9><loc_50><loc_10><loc_89><loc_15></location>- [25] Hamid Rezatofighi, Nathan Tsoi, JunYoung Gwak, Amir Sadeghian, Ian Reid, and Silvio Savarese. Generalized intersection over union: A metric and a loss for bounding box regression. In Proceedings of the IEEE/CVF Conference on</paragraph>
|
||||
<paragraph><location><page_10><loc_11><loc_88><loc_47><loc_91></location>Computer Vision and Pattern Recognition , pages 658-666, 2019. 6</paragraph>
|
||||
<paragraph><location><page_10><loc_8><loc_80><loc_47><loc_88></location>[26] Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 1162-1167, 2017. 1</paragraph>
|
||||
<paragraph><location><page_10><loc_8><loc_71><loc_47><loc_79></location>[27] Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR international conference on document analysis and recognition (ICDAR) , volume 1, pages 1162-1167. IEEE, 2017. 3</paragraph>
|
||||
<paragraph><location><page_10><loc_8><loc_66><loc_47><loc_71></location>[28] Faisal Shafait and Ray Smith. Table detection in heterogeneous documents. In Proceedings of the 9th IAPR International Workshop on Document Analysis Systems , pages 65-72, 2010. 2</paragraph>
|
||||
<paragraph><location><page_10><loc_8><loc_59><loc_47><loc_65></location>[29] Shoaib Ahmed Siddiqui, Imran Ali Fateh, Syed Tahseen Raza Rizvi, Andreas Dengel, and Sheraz Ahmed. Deeptabstr: Deep learning based table structure recognition. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1403-1409. IEEE, 2019. 3</paragraph>
|
||||
<paragraph><location><page_10><loc_8><loc_52><loc_47><loc_58></location>[30] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD , KDD '18, pages 774-782, New York, NY, USA, 2018. ACM. 1</paragraph>
|
||||
<paragraph><location><page_10><loc_8><loc_42><loc_47><loc_51></location>[31] Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Ł ukasz Kaiser, and Illia Polosukhin. Attention is all you need. In I. Guyon, U. V. Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett, editors, Advances in Neural Information Processing Systems 30 , pages 5998-6008. Curran Associates, Inc., 2017. 5</paragraph>
|
||||
<paragraph><location><page_10><loc_8><loc_37><loc_47><loc_42></location>[32] Oriol Vinyals, Alexander Toshev, Samy Bengio, and Dumitru Erhan. Show and tell: A neural image caption generator. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) , June 2015. 2</paragraph>
|
||||
<paragraph><location><page_10><loc_8><loc_31><loc_47><loc_36></location>[33] Wenyuan Xue, Qingyong Li, and Dacheng Tao. Res2tim: reconstruct syntactic structures from table images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 749-755. IEEE, 2019. 3</paragraph>
|
||||
<paragraph><location><page_10><loc_8><loc_25><loc_47><loc_31></location>[34] Wenyuan Xue, Baosheng Yu, Wen Wang, Dacheng Tao, and Qingyong Li. Tgrnet: A table graph reconstruction network for table structure recognition. arXiv preprint arXiv:2106.10598 , 2021. 3</paragraph>
|
||||
<paragraph><location><page_10><loc_8><loc_20><loc_47><loc_25></location>[35] Quanzeng You, Hailin Jin, Zhaowen Wang, Chen Fang, and Jiebo Luo. Image captioning with semantic attention. In Proceedings of the IEEE conference on computer vision and pattern recognition , pages 4651-4659, 2016. 4</paragraph>
|
||||
<paragraph><location><page_10><loc_8><loc_13><loc_47><loc_19></location>[36] Xinyi Zheng, Doug Burdick, Lucian Popa, Peter Zhong, and Nancy Xin Ru Wang. Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. Winter Conference for Applications in Computer Vision (WACV) , 2021. 2, 3</paragraph>
|
||||
<paragraph><location><page_10><loc_8><loc_10><loc_47><loc_12></location>[37] Xu Zhong, Elaheh ShafieiBavani, and Antonio Jimeno Yepes. Image-based table recognition: Data, model,</paragraph>
|
||||
<paragraph><location><page_10><loc_54><loc_85><loc_89><loc_91></location>and evaluation. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision ECCV 2020 , pages 564-580, Cham, 2020. Springer International Publishing. 2, 3, 7</paragraph>
|
||||
<paragraph><location><page_10><loc_50><loc_80><loc_89><loc_85></location>[38] Xu Zhong, Jianbin Tang, and Antonio Jimeno Yepes. Publaynet: Largest dataset ever for document layout analysis. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1015-1022, 2019. 1</paragraph>
|
||||
<paragraph><location><page_10><loc_8><loc_80><loc_47><loc_88></location>- [26] Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 1162-1167, 2017. 1</paragraph>
|
||||
<paragraph><location><page_10><loc_8><loc_71><loc_47><loc_79></location>- [27] Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR international conference on document analysis and recognition (ICDAR) , volume 1, pages 1162-1167. IEEE, 2017. 3</paragraph>
|
||||
<paragraph><location><page_10><loc_8><loc_66><loc_47><loc_71></location>- [28] Faisal Shafait and Ray Smith. Table detection in heterogeneous documents. In Proceedings of the 9th IAPR International Workshop on Document Analysis Systems , pages 65-72, 2010. 2</paragraph>
|
||||
<paragraph><location><page_10><loc_8><loc_59><loc_47><loc_65></location>- [29] Shoaib Ahmed Siddiqui, Imran Ali Fateh, Syed Tahseen Raza Rizvi, Andreas Dengel, and Sheraz Ahmed. Deeptabstr: Deep learning based table structure recognition. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1403-1409. IEEE, 2019. 3</paragraph>
|
||||
<paragraph><location><page_10><loc_8><loc_52><loc_47><loc_58></location>- [30] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD , KDD '18, pages 774-782, New York, NY, USA, 2018. ACM. 1</paragraph>
|
||||
<paragraph><location><page_10><loc_8><loc_42><loc_47><loc_51></location>- [31] Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Ł ukasz Kaiser, and Illia Polosukhin. Attention is all you need. In I. Guyon, U. V. Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett, editors, Advances in Neural Information Processing Systems 30 , pages 5998-6008. Curran Associates, Inc., 2017. 5</paragraph>
|
||||
<paragraph><location><page_10><loc_8><loc_37><loc_47><loc_42></location>- [32] Oriol Vinyals, Alexander Toshev, Samy Bengio, and Dumitru Erhan. Show and tell: A neural image caption generator. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) , June 2015. 2</paragraph>
|
||||
<paragraph><location><page_10><loc_8><loc_31><loc_47><loc_36></location>- [33] Wenyuan Xue, Qingyong Li, and Dacheng Tao. Res2tim: reconstruct syntactic structures from table images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 749-755. IEEE, 2019. 3</paragraph>
|
||||
<paragraph><location><page_10><loc_8><loc_25><loc_47><loc_31></location>- [34] Wenyuan Xue, Baosheng Yu, Wen Wang, Dacheng Tao, and Qingyong Li. Tgrnet: A table graph reconstruction network for table structure recognition. arXiv preprint arXiv:2106.10598 , 2021. 3</paragraph>
|
||||
<paragraph><location><page_10><loc_8><loc_20><loc_47><loc_25></location>- [35] Quanzeng You, Hailin Jin, Zhaowen Wang, Chen Fang, and Jiebo Luo. Image captioning with semantic attention. In Proceedings of the IEEE conference on computer vision and pattern recognition , pages 4651-4659, 2016. 4</paragraph>
|
||||
<paragraph><location><page_10><loc_8><loc_13><loc_47><loc_19></location>- [36] Xinyi Zheng, Doug Burdick, Lucian Popa, Peter Zhong, and Nancy Xin Ru Wang. Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. Winter Conference for Applications in Computer Vision (WACV) , 2021. 2, 3</paragraph>
|
||||
<paragraph><location><page_10><loc_8><loc_10><loc_47><loc_12></location>- [37] Xu Zhong, Elaheh ShafieiBavani, and Antonio Jimeno Yepes. Image-based table recognition: Data, model,</paragraph>
|
||||
<paragraph><location><page_10><loc_54><loc_85><loc_89><loc_91></location>- and evaluation. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision ECCV 2020 , pages 564-580, Cham, 2020. Springer International Publishing. 2, 3, 7</paragraph>
|
||||
<paragraph><location><page_10><loc_50><loc_80><loc_89><loc_85></location>- [38] Xu Zhong, Jianbin Tang, and Antonio Jimeno Yepes. Publaynet: Largest dataset ever for document layout analysis. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1015-1022, 2019. 1</paragraph>
|
||||
<subtitle-level-1><location><page_11><loc_22><loc_83><loc_76><loc_86></location>TableFormer: Table Structure Understanding with Transformers Supplementary Material</subtitle-level-1>
|
||||
<subtitle-level-1><location><page_11><loc_8><loc_78><loc_29><loc_80></location>1. Details on the datasets</subtitle-level-1>
|
||||
<subtitle-level-1><location><page_11><loc_8><loc_76><loc_25><loc_77></location>1.1. Data preparation</subtitle-level-1>
|
||||
@@ -264,11 +264,11 @@
|
||||
<paragraph><location><page_11><loc_8><loc_10><loc_47><loc_14></location>Aiming to train and evaluate our models in a broader spectrum of table data we have synthesized four types of datasets. Each one contains tables with different appear-</paragraph>
|
||||
<paragraph><location><page_11><loc_50><loc_74><loc_89><loc_80></location>ances in regard to their size, structure, style and content. Every synthetic dataset contains 150k examples, summing up to 600k synthetic examples. All datasets are divided into Train, Test and Val splits (80%, 10%, 10%).</paragraph>
|
||||
<paragraph><location><page_11><loc_50><loc_71><loc_89><loc_73></location>The process of generating a synthetic dataset can be decomposed into the following steps:</paragraph>
|
||||
<paragraph><location><page_11><loc_50><loc_60><loc_89><loc_70></location>1. Prepare styling and content templates: The styling templates have been manually designed and organized into groups of scope-specific appearances (e.g. financial data, marketing data, etc.). Additionally, we have prepared curated collections of content templates by extracting the most frequently used terms out of non-synthetic datasets (e.g. PubTabNet, FinTabNet, etc.).</paragraph>
|
||||
<paragraph><location><page_11><loc_50><loc_43><loc_89><loc_60></location>2. Generate table structures: The structure of each synthetic dataset assumes a horizontal table header which potentially spans over multiple rows and a table body that may contain a combination of row spans and column spans. However, spans are not allowed to cross the header - body boundary. The table structure is described by the parameters: Total number of table rows and columns, number of header rows, type of spans (header only spans, row only spans, column only spans, both row and column spans), maximum span size and the ratio of the table area covered by spans.</paragraph>
|
||||
<paragraph><location><page_11><loc_50><loc_37><loc_89><loc_43></location>3. Generate content: Based on the dataset theme , a set of suitable content templates is chosen first. Then, this content can be combined with purely random text to produce the synthetic content.</paragraph>
|
||||
<paragraph><location><page_11><loc_50><loc_31><loc_89><loc_37></location>4. Apply styling templates: Depending on the domain of the synthetic dataset, a set of styling templates is first manually selected. Then, a style is randomly selected to format the appearance of the synthesized table.</paragraph>
|
||||
<paragraph><location><page_11><loc_50><loc_23><loc_89><loc_31></location>5. Render the complete tables: The synthetic table is finally rendered by a web browser engine to generate the bounding boxes for each table cell. A batching technique is utilized to optimize the runtime overhead of the rendering process.</paragraph>
|
||||
<paragraph><location><page_11><loc_50><loc_60><loc_89><loc_70></location>- 1. Prepare styling and content templates: The styling templates have been manually designed and organized into groups of scope-specific appearances (e.g. financial data, marketing data, etc.). Additionally, we have prepared curated collections of content templates by extracting the most frequently used terms out of non-synthetic datasets (e.g. PubTabNet, FinTabNet, etc.).</paragraph>
|
||||
<paragraph><location><page_11><loc_50><loc_43><loc_89><loc_60></location>- 2. Generate table structures: The structure of each synthetic dataset assumes a horizontal table header which potentially spans over multiple rows and a table body that may contain a combination of row spans and column spans. However, spans are not allowed to cross the header - body boundary. The table structure is described by the parameters: Total number of table rows and columns, number of header rows, type of spans (header only spans, row only spans, column only spans, both row and column spans), maximum span size and the ratio of the table area covered by spans.</paragraph>
|
||||
<paragraph><location><page_11><loc_50><loc_37><loc_89><loc_43></location>- 3. Generate content: Based on the dataset theme , a set of suitable content templates is chosen first. Then, this content can be combined with purely random text to produce the synthetic content.</paragraph>
|
||||
<paragraph><location><page_11><loc_50><loc_31><loc_89><loc_37></location>- 4. Apply styling templates: Depending on the domain of the synthetic dataset, a set of styling templates is first manually selected. Then, a style is randomly selected to format the appearance of the synthesized table.</paragraph>
|
||||
<paragraph><location><page_11><loc_50><loc_23><loc_89><loc_31></location>- 5. Render the complete tables: The synthetic table is finally rendered by a web browser engine to generate the bounding boxes for each table cell. A batching technique is utilized to optimize the runtime overhead of the rendering process.</paragraph>
|
||||
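To make the structure-generation step above more concrete, the following is a minimal, illustrative sketch of how a synthetic table structure could be sampled from the parameters listed in step 2 (number of rows and columns, header rows, span types, maximum span size, and span coverage). All names and the sampling strategy are assumptions made for illustration; they are not taken from the paper or from an existing code base, and overlap checks between spans are omitted for brevity.

```python
import random
from dataclasses import dataclass, field
from typing import List, Tuple

# Illustrative sketch only: sample a table-structure specification from the
# parameters described in step 2. Names are hypothetical; span overlap checks
# are omitted for brevity.

@dataclass
class TableStructureSpec:
    n_rows: int
    n_cols: int
    n_header_rows: int
    # Each span is (row, col, row_span, col_span).
    spans: List[Tuple[int, int, int, int]] = field(default_factory=list)

def sample_structure(max_rows: int = 20, max_cols: int = 10,
                     max_span: int = 4, span_ratio: float = 0.2) -> TableStructureSpec:
    n_rows = random.randint(2, max_rows)
    n_cols = random.randint(2, max_cols)
    n_header_rows = random.randint(1, min(3, n_rows - 1))
    spec = TableStructureSpec(n_rows, n_cols, n_header_rows)

    target = int(span_ratio * n_rows * n_cols)  # table area to cover with spans
    covered = 0
    while covered < target:
        # Spans must not cross the header/body boundary, so choose the region first.
        in_header = random.random() < 0.5
        r0, r1 = (0, n_header_rows) if in_header else (n_header_rows, n_rows)
        row = random.randint(r0, r1 - 1)
        col = random.randint(0, n_cols - 1)
        row_span = random.randint(1, min(max_span, r1 - row))
        col_span = random.randint(1, min(max_span, n_cols - col))
        if row_span == 1 and col_span == 1:
            continue  # not a real span; resample
        spec.spans.append((row, col, row_span, col_span))
        covered += row_span * col_span
    return spec
```

The remaining steps described above (content templates, styling templates, and browser-based rendering with batching) would then fill and render such a specification to obtain the final table image and the cell bounding boxes.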
<subtitle-level-1><location><page_11><loc_50><loc_18><loc_89><loc_22></location>2. Prediction post-processing for PDF documents</subtitle-level-1>
|
||||
<paragraph><location><page_11><loc_50><loc_10><loc_89><loc_17></location>Although TableFormer can predict the table structure and the bounding boxes for tables recognized inside PDF documents, this is not enough when a full reconstruction of the original table is required. This happens mainly due to the following reasons:</paragraph>
|
||||
<caption><location><page_12><loc_8><loc_76><loc_89><loc_79></location>Figure 7: Distribution of the tables across different dimensions per dataset. Simple vs complex tables per dataset and split, strict vs non strict html structures per dataset and table complexity, missing bboxes per dataset and table complexity.</caption>
|
||||
@@ -276,27 +276,27 @@
|
||||
<location><page_12><loc_9><loc_81><loc_89><loc_91></location>
|
||||
<caption>Figure 7: Distribution of the tables across different dimensions per dataset. Simple vs complex tables per dataset and split, strict vs non strict html structures per dataset and table complexity, missing bboxes per dataset and table complexity.</caption>
|
||||
</figure>
|
||||
<paragraph><location><page_12><loc_10><loc_71><loc_47><loc_73></location>· TableFormer output does not include the table cell content.</paragraph>
|
||||
<paragraph><location><page_12><loc_10><loc_67><loc_47><loc_69></location>· There are occasional inaccuracies in the predictions of the bounding boxes.</paragraph>
|
||||
<paragraph><location><page_12><loc_10><loc_71><loc_47><loc_73></location>- · TableFormer output does not include the table cell content.</paragraph>
|
||||
<paragraph><location><page_12><loc_10><loc_67><loc_47><loc_69></location>- · There are occasional inaccuracies in the predictions of the bounding boxes.</paragraph>
|
||||
<paragraph><location><page_12><loc_8><loc_50><loc_47><loc_65></location>However, it is possible to mitigate those limitations by combining the TableFormer predictions with the information already present inside a programmatic PDF document. More specifically, PDF documents can be seen as a sequence of PDF cells where each cell is described by its content and bounding box. If we are able to associate the PDF cells with the predicted table cells, we can directly link the PDF cell content to the table cell structure and use the PDF bounding boxes to correct misalignments in the predicted table cell bounding boxes.</paragraph>
|
||||
<paragraph><location><page_12><loc_8><loc_47><loc_47><loc_50></location>Here is a step-by-step description of the prediction post-processing:</paragraph>
|
||||
<paragraph><location><page_12><loc_8><loc_42><loc_47><loc_47></location>1. Get the minimal grid dimensions - number of rows and columns for the predicted table structure. This represents the most granular grid for the underlying table structure.</paragraph>
|
||||
<paragraph><location><page_12><loc_8><loc_36><loc_47><loc_42></location>2. Generate pair-wise matches between the bounding boxes of the PDF cells and the predicted cells. The Intersection Over Union (IOU) metric is used to evaluate the quality of the matches.</paragraph>
|
||||
<paragraph><location><page_12><loc_8><loc_33><loc_47><loc_36></location>3. Use a carefully selected IOU threshold to designate the matches as "good" ones and "bad" ones.</paragraph>
|
||||
<paragraph><location><page_12><loc_8><loc_29><loc_47><loc_33></location>3.a. If all IOU scores in a column are below the threshold, discard all predictions (structure and bounding boxes) for that column.</paragraph>
|
||||
<paragraph><location><page_12><loc_8><loc_24><loc_47><loc_28></location>4. Find the best-fitting content alignment for the predicted cells with good IOU per each column. The alignment of the column can be identified by the following formula:</paragraph>
|
||||
<paragraph><location><page_12><loc_8><loc_42><loc_47><loc_47></location>- 1. Get the minimal grid dimensions - number of rows and columns for the predicted table structure. This represents the most granular grid for the underlying table structure.</paragraph>
|
||||
<paragraph><location><page_12><loc_8><loc_36><loc_47><loc_42></location>- 2. Generate pair-wise matches between the bounding boxes of the PDF cells and the predicted cells. The Intersection Over Union (IOU) metric is used to evaluate the quality of the matches.</paragraph>
|
||||
<paragraph><location><page_12><loc_8><loc_33><loc_47><loc_36></location>- 3. Use a carefully selected IOU threshold to designate the matches as "good" ones and "bad" ones.</paragraph>
|
||||
<paragraph><location><page_12><loc_8><loc_29><loc_47><loc_33></location>- 3.a. If all IOU scores in a column are below the threshold, discard all predictions (structure and bounding boxes) for that column.</paragraph>
|
||||
<paragraph><location><page_12><loc_8><loc_24><loc_47><loc_28></location>- 4. Find the best-fitting content alignment for the predicted cells with good IOU per each column. The alignment of the column can be identified by the following formula:</paragraph>
|
||||
<paragraph><location><page_12><loc_8><loc_13><loc_47><loc_16></location>where c is one of { left, centroid, right } and x$_{c}$ is the x-coordinate for the corresponding point.</paragraph>
|
||||
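The alignment formula itself did not survive the conversion, so the following is only a rough, hedged sketch of steps 2-4: pair-wise IOU matching between PDF cells and predicted cells, splitting the matches into "good" and "bad" by a threshold, and choosing the column alignment as the candidate coordinate (left, centroid or right) with the smallest spread around its median. The bounding-box format, the threshold value and the spread criterion are assumptions made for illustration and stand in for the paper's exact formula rather than reproducing it.

```python
from statistics import median

# Hedged sketch of post-processing steps 2-4. Box format (x0, y0, x1, y1),
# the IOU threshold and the "smallest spread around the median" criterion
# are illustrative assumptions, not the paper's exact definitions.

def iou(a, b):
    """Intersection over union of two (x0, y0, x1, y1) boxes."""
    ix = max(0.0, min(a[2], b[2]) - max(a[0], b[0]))
    iy = max(0.0, min(a[3], b[3]) - max(a[1], b[1]))
    inter = ix * iy
    union = (a[2] - a[0]) * (a[3] - a[1]) + (b[2] - b[0]) * (b[3] - b[1]) - inter
    return inter / union if union > 0 else 0.0

def match_cells(pdf_cells, pred_cells, threshold=0.5):
    """Steps 2-3: pair each PDF cell with its best predicted cell by IOU,
    then split the pairs into good and bad matches by the threshold."""
    good, bad = [], []
    for pdf_box in pdf_cells:
        best = max(pred_cells, key=lambda p: iou(pdf_box, p))
        score = iou(pdf_box, best)
        (good if score >= threshold else bad).append((pdf_box, best, score))
    return good, bad

def column_alignment(column_boxes):
    """Step 4: pick the alignment (left, centroid, right) whose x-coordinates
    are the most consistent within the column."""
    candidates = {
        "left": [b[0] for b in column_boxes],
        "centroid": [(b[0] + b[2]) / 2.0 for b in column_boxes],
        "right": [b[2] for b in column_boxes],
    }
    def spread(xs):
        m = median(xs)
        return sum(abs(x - m) for x in xs)
    return min(candidates, key=lambda c: spread(candidates[c]))
```

Per step 3.a, a column whose matches all fall below the threshold would simply have its predicted structure and bounding boxes discarded.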
<paragraph><location><page_12><loc_8><loc_10><loc_47><loc_13></location>5. Use the alignment computed in step 4, to compute the median x -coordinate for all table columns and the me-</paragraph>
|
||||
<paragraph><location><page_12><loc_8><loc_10><loc_47><loc_13></location>- 5. Use the alignment computed in step 4, to compute the median x -coordinate for all table columns and the me-</paragraph>
|
||||
<paragraph><location><page_12><loc_50><loc_68><loc_89><loc_73></location>dian cell size for all table cells. Using the median in these computations helps to eliminate outliers caused by occasional column spans, which are usually wider than normal.</paragraph>
|
||||
<paragraph><location><page_12><loc_50><loc_65><loc_89><loc_67></location>6. Snap all cells with bad IOU to their corresponding median x -coordinates and cell sizes.</paragraph>
|
||||
<paragraph><location><page_12><loc_50><loc_51><loc_89><loc_64></location>7. Generate a new set of pair-wise matches between the corrected bounding boxes and PDF cells. This time use a modified version of the IOU metric, where the area of the intersection between the predicted and PDF cells is divided by the PDF cell area. In case there are multiple matches for the same PDF cell, the prediction with the higher score is preferred. This covers the cases where the PDF cells are smaller than the area of predicted or corrected prediction cells.</paragraph>
|
||||
<paragraph><location><page_12><loc_50><loc_42><loc_89><loc_51></location>8. On some rare occasions, we have noticed that TableFormer can mistake a single column for two. When the post-processing steps are applied, this results in two predicted columns pointing to the same PDF column. In such a case we must de-duplicate the columns according to the highest total column intersection score.</paragraph>
|
||||
<paragraph><location><page_12><loc_50><loc_28><loc_89><loc_42></location>9. Pick up the remaining orphan cells. There could be cases, when after applying all the previous post-processing steps, some PDF cells could still remain without any match to predicted cells. However, it is still possible to deduce the correct matching for an orphan PDF cell by mapping its bounding box on the geometry of the grid. This mapping decides if the content of the orphan cell will be appended to an already matched table cell, or a new table cell should be created to match with the orphan.</paragraph>
|
||||
<paragraph><location><page_12><loc_50><loc_65><loc_89><loc_67></location>- 6. Snap all cells with bad IOU to their corresponding median x -coordinates and cell sizes.</paragraph>
|
||||
<paragraph><location><page_12><loc_50><loc_51><loc_89><loc_64></location>- 7. Generate a new set of pair-wise matches between the corrected bounding boxes and PDF cells. This time use a modified version of the IOU metric, where the area of the intersection between the predicted and PDF cells is divided by the PDF cell area. In case there are multiple matches for the same PDF cell, the prediction with the higher score is preferred. This covers the cases where the PDF cells are smaller than the area of predicted or corrected prediction cells.</paragraph>
|
||||
<paragraph><location><page_12><loc_50><loc_42><loc_89><loc_51></location>- 8. On some rare occasions, we have noticed that TableFormer can mistake a single column for two. When the post-processing steps are applied, this results in two predicted columns pointing to the same PDF column. In such a case we must de-duplicate the columns according to the highest total column intersection score.</paragraph>
|
||||
<paragraph><location><page_12><loc_50><loc_28><loc_89><loc_42></location>- 9. Pick up the remaining orphan cells. There could be cases, when after applying all the previous post-processing steps, some PDF cells could still remain without any match to predicted cells. However, it is still possible to deduce the correct matching for an orphan PDF cell by mapping its bounding box on the geometry of the grid. This mapping decides if the content of the orphan cell will be appended to an already matched table cell, or a new table cell should be created to match with the orphan.</paragraph>
|
||||
<paragraph><location><page_12><loc_50><loc_24><loc_89><loc_28></location>9a. Compute the top and bottom boundary of the horizontal band for each grid row (min/max y coordinates per row).</paragraph>
|
||||
<paragraph><location><page_12><loc_50><loc_21><loc_89><loc_23></location>9b. Intersect the orphan's bounding box with the row bands, and map the cell to the closest grid row.</paragraph>
|
||||
<paragraph><location><page_12><loc_50><loc_16><loc_89><loc_20></location>9c. Compute the left and right boundary of the vertical band for each grid column (min/max x coordinates per column).</paragraph>
|
||||
<paragraph><location><page_12><loc_50><loc_13><loc_89><loc_16></location>9d. Intersect the orphan's bounding box with the column bands, and map the cell to the closest grid column.</paragraph>
|
||||
<paragraph><location><page_12><loc_50><loc_10><loc_89><loc_13></location>9e. If the table cell under the identified row and column is not empty, extend its content with the content of the or-</paragraph>
|
||||
<paragraph><location><page_12><loc_50><loc_21><loc_89><loc_23></location>- 9b. Intersect the orphan's bounding box with the row bands, and map the cell to the closest grid row.</paragraph>
|
||||
<paragraph><location><page_12><loc_50><loc_16><loc_89><loc_20></location>- 9c. Compute the left and right boundary of the vertical band for each grid column (min/max x coordinates per column).</paragraph>
|
||||
<paragraph><location><page_12><loc_50><loc_13><loc_89><loc_16></location>- 9d. Intersect the orphan's bounding box with the column bands, and map the cell to the closest grid column.</paragraph>
|
||||
<paragraph><location><page_12><loc_50><loc_10><loc_89><loc_13></location>- 9e. If the table cell under the identified row and column is not empty, extend its content with the content of the or-</paragraph>
|
||||
<paragraph><location><page_13><loc_8><loc_89><loc_15><loc_91></location>phan cell.</paragraph>
|
||||
<paragraph><location><page_13><loc_8><loc_86><loc_47><loc_89></location>9f. Otherwise, create a new structural cell and match it with the orphan cell.</paragraph>
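As a rough, non-authoritative sketch of how steps 9a-9f could be realized (together with the containment-style score described in step 7), the snippet below maps an orphan PDF cell to the closest grid row and column by intersecting its bounding box with the precomputed row and column bands. The band representation and all function names are assumptions made for illustration, not the paper's implementation.

```python
# Hedged sketch of the orphan-cell mapping (9a-9f) and of the modified metric
# from step 7. Bands are (lo, hi) intervals; all names are illustrative.

def overlap_1d(a0, a1, b0, b1):
    """Length of the overlap between the 1-D intervals [a0, a1] and [b0, b1]."""
    return max(0.0, min(a1, b1) - max(a0, b0))

def closest_band(lo, hi, bands):
    """Index of the band overlapping [lo, hi] the most (used in 9b and 9d)."""
    return max(range(len(bands)), key=lambda i: overlap_1d(lo, hi, *bands[i]))

def map_orphan_to_grid(orphan_bbox, row_bands, col_bands):
    """9a-9d: map an orphan cell's bbox to a grid (row, column) position.
    The caller then either extends the content of the existing cell at that
    position (9e) or creates a new structural cell for the orphan (9f)."""
    x0, y0, x1, y1 = orphan_bbox
    row = closest_band(y0, y1, row_bands)  # horizontal band per grid row (9a/9b)
    col = closest_band(x0, x1, col_bands)  # vertical band per grid column (9c/9d)
    return row, col

def containment(pdf_bbox, pred_bbox):
    """Step 7's modified metric: intersection area divided by the PDF cell area,
    so a small PDF cell fully inside a larger predicted cell still scores 1.0."""
    ix = overlap_1d(pdf_bbox[0], pdf_bbox[2], pred_bbox[0], pred_bbox[2])
    iy = overlap_1d(pdf_bbox[1], pdf_bbox[3], pred_bbox[1], pred_bbox[3])
    pdf_area = (pdf_bbox[2] - pdf_bbox[0]) * (pdf_bbox[3] - pdf_bbox[1])
    return (ix * iy) / pdf_area if pdf_area > 0 else 0.0
```

When a PDF cell obtains several such matches, the highest-scoring prediction is kept, and predicted columns that end up pointing to the same PDF column are de-duplicated as described in step 8.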
|
||||
<paragraph><location><page_13><loc_8><loc_83><loc_47><loc_86></location>Additional images with examples of TableFormer predictions and post-processing can be found below.</paragraph>
|
||||
|
File diff suppressed because one or more lines are too long
@@ -46,13 +46,13 @@ In this paper, we want to address these weaknesses and present a robust table-st
|
||||
|
||||
To meet the design criteria listed above, we developed a new model called TableFormer and a synthetically generated table structure dataset called SynthTabNet $^{1}$. In particular, our contributions in this work can be summarised as follows:
|
||||
|
||||
· We propose TableFormer , a transformer-based model that predicts table structure and bounding boxes for the table content simultaneously in an end-to-end approach.
|
||||
- · We propose TableFormer , a transformer-based model that predicts table structure and bounding boxes for the table content simultaneously in an end-to-end approach.
|
||||
|
||||
· Across all benchmark datasets, TableFormer significantly outperforms existing state-of-the-art metrics, while being much more efficient in training and inference than existing works.
|
||||
- · Across all benchmark datasets, TableFormer significantly outperforms existing state-of-the-art metrics, while being much more efficient in training and inference than existing works.
|
||||
|
||||
· We present SynthTabNet, a synthetically generated dataset with various appearance styles and complexity.
|
||||
- · We present SynthTabNet, a synthetically generated dataset with various appearance styles and complexity.
|
||||
|
||||
· An augmented dataset based on PubTabNet [37], FinTabNet [36], and TableBank [17] with generated ground-truth for reproducibility.
|
||||
- · An augmented dataset based on PubTabNet [37], FinTabNet [36], and TableBank [17] with generated ground-truth for reproducibility.
|
||||
|
||||
The paper is structured as follows. In Sec. 2, we give a brief overview of the current state-of-the-art. In Sec. 3, we describe the datasets on which we train. In Sec. 4, we introduce the TableFormer model-architecture and describe
|
||||
|
||||
@@ -279,90 +279,90 @@ In this paper, we presented TableFormer an end-to-end transformer based approach
|
||||
|
||||
## References
|
||||
|
||||
[1] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-
|
||||
- [1] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
end object detection with transformers. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision - ECCV 2020 , pages 213-229, Cham, 2020. Springer International Publishing. 5
|
||||
- end object detection with transformers. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision - ECCV 2020 , pages 213-229, Cham, 2020. Springer International Publishing. 5
|
||||
|
||||
[2] Zewen Chi, Heyan Huang, Heng-Da Xu, Houjin Yu, Wanxuan Yin, and Xian-Ling Mao. Complicated table structure recognition. arXiv preprint arXiv:1908.04729 , 2019. 3
|
||||
- [2] Zewen Chi, Heyan Huang, Heng-Da Xu, Houjin Yu, Wanxuan Yin, and Xian-Ling Mao. Complicated table structure recognition. arXiv preprint arXiv:1908.04729 , 2019. 3
|
||||
|
||||
[3] Bertrand Couasnon and Aurelie Lemaitre. Recognition of Tables and Forms , pages 647-677. Springer London, London, 2014. 2
|
||||
- [3] Bertrand Couasnon and Aurelie Lemaitre. Recognition of Tables and Forms , pages 647-677. Springer London, London, 2014. 2
|
||||
|
||||
[4] Herv'e D'ejean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), Apr. 2019. http://sac.founderit.com/. 2
|
||||
- [4] Herv'e D'ejean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), Apr. 2019. http://sac.founderit.com/. 2
|
||||
|
||||
[5] Basilios Gatos, Dimitrios Danatsas, Ioannis Pratikakis, and Stavros J Perantonis. Automatic table detection in document images. In International Conference on Pattern Recognition and Image Analysis , pages 609-618. Springer, 2005. 2
|
||||
- [5] Basilios Gatos, Dimitrios Danatsas, Ioannis Pratikakis, and Stavros J Perantonis. Automatic table detection in document images. In International Conference on Pattern Recognition and Image Analysis , pages 609-618. Springer, 2005. 2
|
||||
|
||||
[6] Max Gobel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. 2
|
||||
- [6] Max Gobel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. 2
|
||||
|
||||
[7] EA Green and M Krishnamoorthy. Recognition of tables using table grammars. procs. In Symposium on Document Analysis and Recognition (SDAIR'95) , pages 261-277. 2
|
||||
- [7] EA Green and M Krishnamoorthy. Recognition of tables using table grammars. procs. In Symposium on Document Analysis and Recognition (SDAIR'95) , pages 261-277. 2
|
||||
|
||||
[8] Khurram Azeem Hashmi, Alain Pagani, Marcus Liwicki, Didier Stricker, and Muhammad Zeshan Afzal. Castabdetectors: Cascade network for table detection in document images with recursive feature pyramid and switchable atrous convolution. Journal of Imaging , 7(10), 2021. 1
|
||||
- [8] Khurram Azeem Hashmi, Alain Pagani, Marcus Liwicki, Didier Stricker, and Muhammad Zeshan Afzal. Castabdetectors: Cascade network for table detection in document images with recursive feature pyramid and switchable atrous convolution. Journal of Imaging , 7(10), 2021. 1
|
||||
|
||||
[9] Kaiming He, Georgia Gkioxari, Piotr Dollar, and Ross Girshick. Mask r-cnn. In Proceedings of the IEEE International Conference on Computer Vision (ICCV) , Oct 2017. 1
|
||||
- [9] Kaiming He, Georgia Gkioxari, Piotr Dollar, and Ross Girshick. Mask r-cnn. In Proceedings of the IEEE International Conference on Computer Vision (ICCV) , Oct 2017. 1
|
||||
|
||||
[10] Yelin He, X. Qi, Jiaquan Ye, Peng Gao, Yihao Chen, Bingcong Li, Xin Tang, and Rong Xiao. Pingan-vcgroup's solution for icdar 2021 competition on scientific table image recognition to latex. ArXiv , abs/2105.01846, 2021. 2
|
||||
- [10] Yelin He, X. Qi, Jiaquan Ye, Peng Gao, Yihao Chen, Bingcong Li, Xin Tang, and Rong Xiao. Pingan-vcgroup's solution for icdar 2021 competition on scientific table image recognition to latex. ArXiv , abs/2105.01846, 2021. 2
|
||||
|
||||
[11] Jianying Hu, Ramanujan S Kashi, Daniel P Lopresti, and Gordon Wilfong. Medium-independent table detection. In Document Recognition and Retrieval VII , volume 3967, pages 291-302. International Society for Optics and Photonics, 1999. 2
|
||||
- [11] Jianying Hu, Ramanujan S Kashi, Daniel P Lopresti, and Gordon Wilfong. Medium-independent table detection. In Document Recognition and Retrieval VII , volume 3967, pages 291-302. International Society for Optics and Photonics, 1999. 2
|
||||
|
||||
[12] Matthew Hurst. A constraint-based approach to table structure derivation. In Proceedings of the Seventh International Conference on Document Analysis and Recognition - Volume 2 , ICDAR '03, page 911, USA, 2003. IEEE Computer Society. 2
|
||||
- [12] Matthew Hurst. A constraint-based approach to table structure derivation. In Proceedings of the Seventh International Conference on Document Analysis and Recognition - Volume 2 , ICDAR '03, page 911, USA, 2003. IEEE Computer Society. 2
|
||||
|
||||
[13] Thotreingam Kasar, Philippine Barlas, Sebastien Adam, Cl'ement Chatelain, and Thierry Paquet. Learning to detect tables in scanned document images using line information. In 2013 12th International Conference on Document Analysis and Recognition , pages 1185-1189. IEEE, 2013. 2
|
||||
- [13] Thotreingam Kasar, Philippine Barlas, Sebastien Adam, Cl'ement Chatelain, and Thierry Paquet. Learning to detect tables in scanned document images using line information. In 2013 12th International Conference on Document Analysis and Recognition , pages 1185-1189. IEEE, 2013. 2
|
||||
|
||||
[14] Pratik Kayal, Mrinal Anand, Harsh Desai, and Mayank Singh. Icdar 2021 competition on scientific table image recognition to latex, 2021. 2
|
||||
- [14] Pratik Kayal, Mrinal Anand, Harsh Desai, and Mayank Singh. Icdar 2021 competition on scientific table image recognition to latex, 2021. 2
|
||||
|
||||
[15] Harold W Kuhn. The hungarian method for the assignment problem. Naval research logistics quarterly , 2(1-2):83-97, 1955. 6
|
||||
- [15] Harold W Kuhn. The hungarian method for the assignment problem. Naval research logistics quarterly , 2(1-2):83-97, 1955. 6
|
||||
|
||||
[16] Girish Kulkarni, Visruth Premraj, Vicente Ordonez, Sagnik Dhar, Siming Li, Yejin Choi, Alexander C. Berg, and Tamara L. Berg. Babytalk: Understanding and generating simple image descriptions. IEEE Transactions on Pattern Analysis and Machine Intelligence , 35(12):2891-2903, 2013. 4
|
||||
- [16] Girish Kulkarni, Visruth Premraj, Vicente Ordonez, Sagnik Dhar, Siming Li, Yejin Choi, Alexander C. Berg, and Tamara L. Berg. Babytalk: Understanding and generating simple image descriptions. IEEE Transactions on Pattern Analysis and Machine Intelligence , 35(12):2891-2903, 2013. 4
|
||||
|
||||
[17] Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou, and Zhoujun Li. Tablebank: A benchmark dataset for table detection and recognition, 2019. 2, 3
|
||||
- [17] Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou, and Zhoujun Li. Tablebank: A benchmark dataset for table detection and recognition, 2019. 2, 3
|
||||
|
||||
[18] Yiren Li, Zheng Huang, Junchi Yan, Yi Zhou, Fan Ye, and Xianhui Liu. Gfte: Graph-based financial table extraction. In Alberto Del Bimbo, Rita Cucchiara, Stan Sclaroff, Giovanni Maria Farinella, Tao Mei, Marco Bertini, Hugo Jair Escalante, and Roberto Vezzani, editors, Pattern Recognition. ICPR International Workshops and Challenges , pages 644-658, Cham, 2021. Springer International Publishing. 2, 3
|
||||
- [18] Yiren Li, Zheng Huang, Junchi Yan, Yi Zhou, Fan Ye, and Xianhui Liu. Gfte: Graph-based financial table extraction. In Alberto Del Bimbo, Rita Cucchiara, Stan Sclaroff, Giovanni Maria Farinella, Tao Mei, Marco Bertini, Hugo Jair Escalante, and Roberto Vezzani, editors, Pattern Recognition. ICPR International Workshops and Challenges , pages 644-658, Cham, 2021. Springer International Publishing. 2, 3
|
||||
|
||||
[19] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter Staar. Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence , 35(17):15137-15145, May 2021. 1
|
||||
- [19] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter Staar. Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence , 35(17):15137-15145, May 2021. 1
|
||||
|
||||
[20] Rujiao Long, Wen Wang, Nan Xue, Feiyu Gao, Zhibo Yang, Yongpan Wang, and Gui-Song Xia. Parsing table structures in the wild. In Proceedings of the IEEE/CVF International Conference on Computer Vision , pages 944-952, 2021. 2
|
||||
- [20] Rujiao Long, Wen Wang, Nan Xue, Feiyu Gao, Zhibo Yang, Yongpan Wang, and Gui-Song Xia. Parsing table structures in the wild. In Proceedings of the IEEE/CVF International Conference on Computer Vision , pages 944-952, 2021. 2
|
||||
|
||||
[21] Shubham Singh Paliwal, D Vishwanath, Rohit Rahul, Monika Sharma, and Lovekesh Vig. Tablenet: Deep learning model for end-to-end table detection and tabular data extraction from scanned document images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 128-133. IEEE, 2019. 1
|
||||
- [21] Shubham Singh Paliwal, D Vishwanath, Rohit Rahul, Monika Sharma, and Lovekesh Vig. Tablenet: Deep learning model for end-to-end table detection and tabular data extraction from scanned document images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 128-133. IEEE, 2019. 1
|
||||
|
||||
[22] Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. Pytorch: An imperative style, high-performance deep learning library. In H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch'e-Buc, E. Fox, and R. Garnett, editors, Advances in Neural Information Processing Systems 32 , pages 8024-8035. Curran Associates, Inc., 2019. 6
|
||||
- [22] Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. Pytorch: An imperative style, high-performance deep learning library. In H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch'e-Buc, E. Fox, and R. Garnett, editors, Advances in Neural Information Processing Systems 32 , pages 8024-8035. Curran Associates, Inc., 2019. 6
|
||||
|
||||
[23] Devashish Prasad, Ayan Gadpal, Kshitij Kapadni, Manish Visave, and Kavita Sultanpure. Cascadetabnet: An approach for end to end table detection and structure recognition from image-based documents. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops , pages 572-573, 2020. 1
|
||||
- [23] Devashish Prasad, Ayan Gadpal, Kshitij Kapadni, Manish Visave, and Kavita Sultanpure. Cascadetabnet: An approach for end to end table detection and structure recognition from image-based documents. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops , pages 572-573, 2020. 1
|
||||
|
||||
[24] Shah Rukh Qasim, Hassan Mahmood, and Faisal Shafait. Rethinking table recognition using graph neural networks. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 142-147. IEEE, 2019. 3
|
||||
- [24] Shah Rukh Qasim, Hassan Mahmood, and Faisal Shafait. Rethinking table recognition using graph neural networks. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 142-147. IEEE, 2019. 3
|
||||
|
||||
[25] Hamid Rezatofighi, Nathan Tsoi, JunYoung Gwak, Amir Sadeghian, Ian Reid, and Silvio Savarese. Generalized intersection over union: A metric and a loss for bounding box regression. In Proceedings of the IEEE/CVF Conference on
|
||||
- [25] Hamid Rezatofighi, Nathan Tsoi, JunYoung Gwak, Amir Sadeghian, Ian Reid, and Silvio Savarese. Generalized intersection over union: A metric and a loss for bounding box regression. In Proceedings of the IEEE/CVF Conference on
|
||||
|
||||
Computer Vision and Pattern Recognition , pages 658-666, 2019. 6
|
||||
|
||||
[26] Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 1162-1167, 2017. 1
|
||||
- [26] Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 1162-1167, 2017. 1
|
||||
|
||||
[27] Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR international conference on document analysis and recognition (ICDAR) , volume 1, pages 1162-1167. IEEE, 2017. 3
|
||||
- [27] Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR international conference on document analysis and recognition (ICDAR) , volume 1, pages 1162-1167. IEEE, 2017. 3
|
||||
|
||||
[28] Faisal Shafait and Ray Smith. Table detection in heterogeneous documents. In Proceedings of the 9th IAPR International Workshop on Document Analysis Systems , pages 65-72, 2010. 2
|
||||
- [28] Faisal Shafait and Ray Smith. Table detection in heterogeneous documents. In Proceedings of the 9th IAPR International Workshop on Document Analysis Systems , pages 65-72, 2010. 2
|
||||
|
||||
[29] Shoaib Ahmed Siddiqui, Imran Ali Fateh, Syed Tahseen Raza Rizvi, Andreas Dengel, and Sheraz Ahmed. Deeptabstr: Deep learning based table structure recognition. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1403-1409. IEEE, 2019. 3
|
||||
- [29] Shoaib Ahmed Siddiqui, Imran Ali Fateh, Syed Tahseen Raza Rizvi, Andreas Dengel, and Sheraz Ahmed. Deeptabstr: Deep learning based table structure recognition. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1403-1409. IEEE, 2019. 3
|
||||
|
||||
[30] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD , KDD '18, pages 774-782, New York, NY, USA, 2018. ACM. 1
|
||||
- [30] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD , KDD '18, pages 774-782, New York, NY, USA, 2018. ACM. 1
[31] Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Ł ukasz Kaiser, and Illia Polosukhin. Attention is all you need. In I. Guyon, U. V. Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett, editors, Advances in Neural Information Processing Systems 30 , pages 5998-6008. Curran Associates, Inc., 2017. 5
- [31] Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Ł ukasz Kaiser, and Illia Polosukhin. Attention is all you need. In I. Guyon, U. V. Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett, editors, Advances in Neural Information Processing Systems 30 , pages 5998-6008. Curran Associates, Inc., 2017. 5
[32] Oriol Vinyals, Alexander Toshev, Samy Bengio, and Dumitru Erhan. Show and tell: A neural image caption generator. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) , June 2015. 2
- [32] Oriol Vinyals, Alexander Toshev, Samy Bengio, and Dumitru Erhan. Show and tell: A neural image caption generator. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) , June 2015. 2
[33] Wenyuan Xue, Qingyong Li, and Dacheng Tao. Res2tim: reconstruct syntactic structures from table images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 749-755. IEEE, 2019. 3
- [33] Wenyuan Xue, Qingyong Li, and Dacheng Tao. Res2tim: reconstruct syntactic structures from table images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 749-755. IEEE, 2019. 3
[34] Wenyuan Xue, Baosheng Yu, Wen Wang, Dacheng Tao, and Qingyong Li. Tgrnet: A table graph reconstruction network for table structure recognition. arXiv preprint arXiv:2106.10598 , 2021. 3
- [34] Wenyuan Xue, Baosheng Yu, Wen Wang, Dacheng Tao, and Qingyong Li. Tgrnet: A table graph reconstruction network for table structure recognition. arXiv preprint arXiv:2106.10598 , 2021. 3
[35] Quanzeng You, Hailin Jin, Zhaowen Wang, Chen Fang, and Jiebo Luo. Image captioning with semantic attention. In Proceedings of the IEEE conference on computer vision and pattern recognition , pages 4651-4659, 2016. 4
- [35] Quanzeng You, Hailin Jin, Zhaowen Wang, Chen Fang, and Jiebo Luo. Image captioning with semantic attention. In Proceedings of the IEEE conference on computer vision and pattern recognition , pages 4651-4659, 2016. 4
[36] Xinyi Zheng, Doug Burdick, Lucian Popa, Peter Zhong, and Nancy Xin Ru Wang. Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. Winter Conference for Applications in Computer Vision (WACV) , 2021. 2, 3
- [36] Xinyi Zheng, Doug Burdick, Lucian Popa, Peter Zhong, and Nancy Xin Ru Wang. Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. Winter Conference for Applications in Computer Vision (WACV) , 2021. 2, 3
[37] Xu Zhong, Elaheh ShafieiBavani, and Antonio Jimeno Yepes. Image-based table recognition: Data, model,
- [37] Xu Zhong, Elaheh ShafieiBavani, and Antonio Jimeno Yepes. Image-based table recognition: Data, model,
and evaluation. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision ECCV 2020 , pages 564-580, Cham, 2020. Springer International Publishing. 2, 3, 7
- and evaluation. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision ECCV 2020 , pages 564-580, Cham, 2020. Springer International Publishing. 2, 3, 7
[38] Xu Zhong, Jianbin Tang, and Antonio Jimeno Yepes. Publaynet: Largest dataset ever for document layout analysis. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1015-1022, 2019. 1
- [38] Xu Zhong, Jianbin Tang, and Antonio Jimeno Yepes. Publaynet: Largest dataset ever for document layout analysis. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1015-1022, 2019. 1
## TableFormer: Table Structure Understanding with Transformers Supplementary Material
@ -384,15 +384,15 @@ ances in regard to their size, structure, style and content. Every synthetic dat
The process of generating a synthetic dataset can be decomposed into the following steps:
1. Prepare styling and content templates: The styling templates have been manually designed and organized into groups of scope specific appearances (e.g. financial data, marketing data, etc.) Additionally, we have prepared curated collections of content templates by extracting the most frequently used terms out of non-synthetic datasets (e.g. PubTabNet, FinTabNet, etc.).
- 1. Prepare styling and content templates: The styling templates have been manually designed and organized into groups of scope specific appearances (e.g. financial data, marketing data, etc.) Additionally, we have prepared curated collections of content templates by extracting the most frequently used terms out of non-synthetic datasets (e.g. PubTabNet, FinTabNet, etc.).
2. Generate table structures: The structure of each synthetic dataset assumes a horizontal table header which potentially spans over multiple rows and a table body that may contain a combination of row spans and column spans. However, spans are not allowed to cross the header - body boundary. The table structure is described by the parameters: Total number of table rows and columns, number of header rows, type of spans (header only spans, row only spans, column only spans, both row and column spans), maximum span size and the ratio of the table area covered by spans.
- 2. Generate table structures: The structure of each synthetic dataset assumes a horizontal table header which potentially spans over multiple rows and a table body that may contain a combination of row spans and column spans. However, spans are not allowed to cross the header - body boundary. The table structure is described by the parameters: Total number of table rows and columns, number of header rows, type of spans (header only spans, row only spans, column only spans, both row and column spans), maximum span size and the ratio of the table area covered by spans.
3. Generate content: Based on the dataset theme , a set of suitable content templates is chosen first. Then, this content can be combined with purely random text to produce the synthetic content.
- 3. Generate content: Based on the dataset theme , a set of suitable content templates is chosen first. Then, this content can be combined with purely random text to produce the synthetic content.
4. Apply styling templates: Depending on the domain of the synthetic dataset, a set of styling templates is first manually selected. Then, a style is randomly selected to format the appearance of the synthesized table.
- 4. Apply styling templates: Depending on the domain of the synthetic dataset, a set of styling templates is first manually selected. Then, a style is randomly selected to format the appearance of the synthesized table.
5. Render the complete tables: The synthetic table is finally rendered by a web browser engine to generate the bounding boxes for each table cell. A batching technique is utilized to optimize the runtime overhead of the rendering process.
- 5. Render the complete tables: The synthetic table is finally rendered by a web browser engine to generate the bounding boxes for each table cell. A batching technique is utilized to optimize the runtime overhead of the rendering process.
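
To make the parameter space of the structure-generation step (2) above concrete, the sketch below shows one way those parameters could be sampled. It is only an illustration under assumed names and ranges (`TableStructureParams`, the sampling bounds), not the generator actually used for the synthetic datasets:

```python
import random
from dataclasses import dataclass


@dataclass
class TableStructureParams:
    """Structure parameters named in step 2; the concrete ranges below are illustrative assumptions."""

    n_rows: int
    n_cols: int
    n_header_rows: int    # horizontal header, possibly spanning multiple rows
    span_type: str        # "header", "row", "col" or "row+col"
    max_span_size: int
    span_coverage: float  # ratio of the table area covered by spans


def sample_structure(rng: random.Random) -> TableStructureParams:
    """Draw one synthetic table structure description."""
    n_rows = rng.randint(2, 20)
    n_cols = rng.randint(2, 10)
    return TableStructureParams(
        n_rows=n_rows,
        n_cols=n_cols,
        n_header_rows=rng.randint(1, min(3, n_rows - 1)),
        span_type=rng.choice(["header", "row", "col", "row+col"]),
        max_span_size=rng.randint(2, max(2, n_cols // 2)),
        span_coverage=rng.uniform(0.0, 0.3),
    )


print(sample_structure(random.Random(0)))
```

Spans drawn from such parameters would then be placed so that none crosses the header-body boundary, as required above.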
## 2. Prediction post-processing for PDF documents
@ -401,47 +401,47 @@ Although TableFormer can predict the table structure and the bounding boxes for
Figure 7: Distribution of the tables across different dimensions per dataset. Simple vs complex tables per dataset and split, strict vs non strict html structures per dataset and table complexity, missing bboxes per dataset and table complexity.
<!-- image -->
· TableFormer output does not include the table cell content.
- · TableFormer output does not include the table cell content.
· There are occasional inaccuracies in the predictions of the bounding boxes.
- · There are occasional inaccuracies in the predictions of the bounding boxes.
However, it is possible to mitigate those limitations by combining the TableFormer predictions with the information already present inside a programmatic PDF document. More specifically, PDF documents can be seen as a sequence of PDF cells where each cell is described by its content and bounding box. If we are able to associate the PDF cells with the predicted table cells, we can directly link the PDF cell content to the table cell structure and use the PDF bounding boxes to correct misalignments in the predicted table cell bounding boxes.
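
As a rough sketch of that matching idea (function names, the box convention and the 0.5 threshold are assumptions for illustration, not the exact implementation):

```python
from typing import Dict, List, Tuple

Box = Tuple[float, float, float, float]  # (x0, y0, x1, y1) in page coordinates


def iou(a: Box, b: Box) -> float:
    """Intersection over union of two axis-aligned bounding boxes."""
    ix0, iy0 = max(a[0], b[0]), max(a[1], b[1])
    ix1, iy1 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, ix1 - ix0) * max(0.0, iy1 - iy0)
    union = (a[2] - a[0]) * (a[3] - a[1]) + (b[2] - b[0]) * (b[3] - b[1]) - inter
    return inter / union if union > 0 else 0.0


def match_pdf_cells(pdf_cells: List[Box], pred_cells: List[Box],
                    threshold: float = 0.5) -> Dict[int, int]:
    """Pair each PDF cell with the best-overlapping predicted cell, if the overlap is good enough."""
    matches: Dict[int, int] = {}
    for i, pdf_box in enumerate(pdf_cells):
        best_score, best_j = max(
            ((iou(pdf_box, pred_box), j) for j, pred_box in enumerate(pred_cells)),
            default=(0.0, -1),
        )
        if best_score >= threshold:
            matches[i] = best_j
    return matches
```

The step-by-step description that follows refines exactly this kind of matching with thresholds, median-based corrections and a second matching pass.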
Here is a step-by-step description of the prediction postprocessing:
1. Get the minimal grid dimensions - number of rows and columns for the predicted table structure. This represents the most granular grid for the underlying table structure.
- 1. Get the minimal grid dimensions - number of rows and columns for the predicted table structure. This represents the most granular grid for the underlying table structure.
2. Generate pair-wise matches between the bounding boxes of the PDF cells and the predicted cells. The Intersection Over Union (IOU) metric is used to evaluate the quality of the matches.
- 2. Generate pair-wise matches between the bounding boxes of the PDF cells and the predicted cells. The Intersection Over Union (IOU) metric is used to evaluate the quality of the matches.
3. Use a carefully selected IOU threshold to designate the matches as "good" ones and "bad" ones.
- 3. Use a carefully selected IOU threshold to designate the matches as "good" ones and "bad" ones.
3.a. If all IOU scores in a column are below the threshold, discard all predictions (structure and bounding boxes) for that column.
- 3.a. If all IOU scores in a column are below the threshold, discard all predictions (structure and bounding boxes) for that column.
4. Find the best-fitting content alignment for the predicted cells with good IOU per each column. The alignment of the column can be identified by the following formula:
- 4. Find the best-fitting content alignment for the predicted cells with good IOU per each column. The alignment of the column can be identified by the following formula:
where c is one of { left, centroid, right } and x$_{c}$ is the x-coordinate for the corresponding point.
5. Use the alignment computed in step 4, to compute the median x -coordinate for all table columns and the me-
- 5. Use the alignment computed in step 4, to compute the median x -coordinate for all table columns and the me-
dian cell size for all table cells. The usage of median during the computations, helps to eliminate outliers caused by occasional column spans which are usually wider than the normal.
6. Snap all cells with bad IOU to their corresponding median x -coordinates and cell sizes.
- 6. Snap all cells with bad IOU to their corresponding median x -coordinates and cell sizes.
7. Generate a new set of pair-wise matches between the corrected bounding boxes and PDF cells. This time use a modified version of the IOU metric, where the area of the intersection between the predicted and PDF cells is divided by the PDF cell area. In case there are multiple matches for the same PDF cell, the prediction with the higher score is preferred. This covers the cases where the PDF cells are smaller than the area of predicted or corrected prediction cells.
- 7. Generate a new set of pair-wise matches between the corrected bounding boxes and PDF cells. This time use a modified version of the IOU metric, where the area of the intersection between the predicted and PDF cells is divided by the PDF cell area. In case there are multiple matches for the same PDF cell, the prediction with the higher score is preferred. This covers the cases where the PDF cells are smaller than the area of predicted or corrected prediction cells.
8. In some rare occasions, we have noticed that TableFormer can confuse a single column as two. When the postprocessing steps are applied, this results with two predicted columns pointing to the same PDF column. In such case we must de-duplicate the columns according to highest total column intersection score.
- 8. In some rare occasions, we have noticed that TableFormer can confuse a single column as two. When the postprocessing steps are applied, this results with two predicted columns pointing to the same PDF column. In such case we must de-duplicate the columns according to highest total column intersection score.
9. Pick up the remaining orphan cells. There could be cases, when after applying all the previous post-processing steps, some PDF cells could still remain without any match to predicted cells. However, it is still possible to deduce the correct matching for an orphan PDF cell by mapping its bounding box on the geometry of the grid. This mapping decides if the content of the orphan cell will be appended to an already matched table cell, or a new table cell should be created to match with the orphan.
- 9. Pick up the remaining orphan cells. There could be cases, when after applying all the previous post-processing steps, some PDF cells could still remain without any match to predicted cells. However, it is still possible to deduce the correct matching for an orphan PDF cell by mapping its bounding box on the geometry of the grid. This mapping decides if the content of the orphan cell will be appended to an already matched table cell, or a new table cell should be created to match with the orphan.
9a. Compute the top and bottom boundary of the horizontal band for each grid row (min/max y coordinates per row).
9b. Intersect the orphan's bounding box with the row bands, and map the cell to the closest grid row.
- 9b. Intersect the orphan's bounding box with the row bands, and map the cell to the closest grid row.
9c. Compute the left and right boundary of the vertical band for each grid column (min/max x coordinates per column).
- 9c. Compute the left and right boundary of the vertical band for each grid column (min/max x coordinates per column).
9d. Intersect the orphan's bounding box with the column bands, and map the cell to the closest grid column.
- 9d. Intersect the orphan's bounding box with the column bands, and map the cell to the closest grid column.
9e. If the table cell under the identified row and column is not empty, extend its content with the content of the or-
- 9e. If the table cell under the identified row and column is not empty, extend its content with the content of the or-
phan cell.
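
The column-alignment, snapping and orphan-mapping logic of steps 4-6 and 9a-9d can be sketched as follows. The alignment criterion shown (smallest median absolute deviation of the chosen x-coordinate) is one plausible reading of the formula referenced in step 4, and all names are illustrative assumptions rather than the actual implementation:

```python
from statistics import median
from typing import List, Tuple

Box = Tuple[float, float, float, float]  # (x0, y0, x1, y1) in page coordinates


def x_coord(box: Box, alignment: str) -> float:
    """x-coordinate of a cell bounding box under a given alignment convention."""
    if alignment == "left":
        return box[0]
    if alignment == "right":
        return box[2]
    return 0.5 * (box[0] + box[2])  # centroid


def pick_alignment(column_boxes: List[Box]) -> str:
    """Step 4 (sketch): choose the alignment whose x-coordinates are most tightly clustered."""
    def spread(alignment: str) -> float:
        xs = [x_coord(b, alignment) for b in column_boxes]
        m = median(xs)
        return median(abs(x - m) for x in xs)
    return min(("left", "centroid", "right"), key=spread)


def snap_column(column_boxes: List[Box], bad_rows: List[int]) -> List[Box]:
    """Steps 5-6 (sketch): snap badly matched cells to the column's median x-position and width."""
    alignment = pick_alignment(column_boxes)
    med_x = median(x_coord(b, alignment) for b in column_boxes)
    med_w = median(b[2] - b[0] for b in column_boxes)
    snapped = list(column_boxes)
    for r in bad_rows:
        _, y0, _, y1 = snapped[r]
        if alignment == "left":
            snapped[r] = (med_x, y0, med_x + med_w, y1)
        elif alignment == "right":
            snapped[r] = (med_x - med_w, y0, med_x, y1)
        else:  # centroid
            snapped[r] = (med_x - med_w / 2, y0, med_x + med_w / 2, y1)
    return snapped


def closest_band(value: float, bands: List[Tuple[float, float]]) -> int:
    """Steps 9a-9d (sketch): map an orphan cell's centre coordinate to the closest row/column band."""
    def distance(band: Tuple[float, float]) -> float:
        lo, hi = band
        return 0.0 if lo <= value <= hi else min(abs(value - lo), abs(value - hi))
    return min(range(len(bands)), key=lambda i: distance(bands[i]))
```

The second matching pass of step 7 then scores intersection divided by the PDF cell area instead of plain IOU, and step 8 de-duplicates columns by total intersection score; both reuse the same geometric primitives.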
File diff suppressed because one or more lines are too long
@ -36,12 +36,12 @@
|
||||
<paragraph><location><page_2><loc_9><loc_71><loc_50><loc_86></location>Despite the substantial improvements achieved with machine-learning (ML) approaches and deep neural networks in recent years, document conversion remains a challenging problem, as demonstrated by the numerous public competitions held on this topic [1-4]. The challenge originates from the huge variability in PDF documents regarding layout, language and formats (scanned, programmatic or a combination of both). Engineering a single ML model that can be applied on all types of documents and provides high-quality layout segmentation remains to this day extremely challenging [5]. To highlight the variability in document layouts, we show a few example documents from the DocLayNet dataset in Figure 1.</paragraph>
|
||||
<paragraph><location><page_2><loc_9><loc_37><loc_48><loc_71></location>A key problem in the process of document conversion is to understand the structure of a single document page, i.e. which segments of text should be grouped together in a unit. To train models for this task, there are currently two large datasets available to the community, PubLayNet [6] and DocBank [7]. They were introduced in 2019 and 2020 respectively and significantly accelerated the implementation of layout detection and segmentation models due to their sizes of 300K and 500K ground-truth pages. These sizes were achieved by leveraging an automation approach. The benefit of automated ground-truth generation is obvious: one can generate large ground-truth datasets at virtually no cost. However, the automation introduces a constraint on the variability in the dataset, because corresponding structured source data must be available. PubLayNet and DocBank were both generated from scientific document repositories (PubMed and arXiv), which provide XML or L A T E X sources. Those scientific documents present a limited variability in their layouts, because they are typeset in uniform templates provided by the publishers. Obviously, documents such as technical manuals, annual company reports, legal text, government tenders, etc. have very different and partially unique layouts. As a consequence, the layout predictions obtained from models trained on PubLayNet or DocBank is very reasonable when applied on scientific documents. However, for more artistic or free-style layouts, we see sub-par prediction quality from these models, which we demonstrate in Section 5.</paragraph>
|
||||
<paragraph><location><page_2><loc_9><loc_27><loc_48><loc_36></location>In this paper, we present the DocLayNet dataset. It provides pageby-page layout annotation ground-truth using bounding-boxes for 11 distinct class labels on 80863 unique document pages, of which a fraction carry double- or triple-annotations. DocLayNet is similar in spirit to PubLayNet and DocBank and will likewise be made available to the public 1 in order to stimulate the document-layout analysis community. It distinguishes itself in the following aspects:</paragraph>
|
||||
<paragraph><location><page_2><loc_10><loc_22><loc_48><loc_26></location>(1) Human Annotation : In contrast to PubLayNet and DocBank, we relied on human annotation instead of automation approaches to generate the data set.</paragraph>
|
||||
<paragraph><location><page_2><loc_10><loc_20><loc_48><loc_22></location>(2) Large Layout Variability : We include diverse and complex layouts from a large variety of public sources.</paragraph>
|
||||
<paragraph><location><page_2><loc_10><loc_15><loc_48><loc_19></location>(3) Detailed Label Set : We define 11 class labels to distinguish layout features in high detail. PubLayNet provides 5 labels; DocBank provides 13, although not a superset of ours.</paragraph>
|
||||
<paragraph><location><page_2><loc_11><loc_13><loc_48><loc_15></location>(4) Redundant Annotations : A fraction of the pages in the DocLayNet data set carry more than one human annotation.</paragraph>
|
||||
<paragraph><location><page_2><loc_10><loc_22><loc_48><loc_26></location>- (1) Human Annotation : In contrast to PubLayNet and DocBank, we relied on human annotation instead of automation approaches to generate the data set.</paragraph>
|
||||
<paragraph><location><page_2><loc_10><loc_20><loc_48><loc_22></location>- (2) Large Layout Variability : We include diverse and complex layouts from a large variety of public sources.</paragraph>
|
||||
<paragraph><location><page_2><loc_10><loc_15><loc_48><loc_19></location>- (3) Detailed Label Set : We define 11 class labels to distinguish layout features in high detail. PubLayNet provides 5 labels; DocBank provides 13, although not a superset of ours.</paragraph>
|
||||
<paragraph><location><page_2><loc_11><loc_13><loc_48><loc_15></location>- (4) Redundant Annotations : A fraction of the pages in the DocLayNet data set carry more than one human annotation.</paragraph>
|
||||
<paragraph><location><page_2><loc_56><loc_87><loc_91><loc_89></location>This enables experimentation with annotation uncertainty and quality control analysis.</paragraph>
|
||||
<paragraph><location><page_2><loc_54><loc_80><loc_91><loc_86></location>(5) Pre-defined Train-, Test- & Validation-set : Like DocBank, we provide fixed train-, test- & validation-sets to ensure proportional representation of the class-labels. Further, we prevent leakage of unique layouts across sets, which has a large effect on model accuracy scores.</paragraph>
|
||||
<paragraph><location><page_2><loc_54><loc_80><loc_91><loc_86></location>- (5) Pre-defined Train-, Test- & Validation-set : Like DocBank, we provide fixed train-, test- & validation-sets to ensure proportional representation of the class-labels. Further, we prevent leakage of unique layouts across sets, which has a large effect on model accuracy scores.</paragraph>
|
||||
<paragraph><location><page_2><loc_52><loc_72><loc_91><loc_79></location>All aspects outlined above are detailed in Section 3. In Section 4, we will elaborate on how we designed and executed this large-scale human annotation campaign. We will also share key insights and lessons learned that might prove helpful for other parties planning to set up annotation campaigns.</paragraph>
|
||||
<paragraph><location><page_2><loc_52><loc_61><loc_91><loc_72></location>In Section 5, we will present baseline accuracy numbers for a variety of object detection methods (Faster R-CNN, Mask R-CNN and YOLOv5) trained on DocLayNet. We further show how the model performance is impacted by varying the DocLayNet dataset size, reducing the label set and modifying the train/test-split. Last but not least, we compare the performance of models trained on PubLayNet, DocBank and DocLayNet and demonstrate that a model trained on DocLayNet provides overall more robust layout recovery.</paragraph>
|
||||
<subtitle-level-1><location><page_2><loc_52><loc_58><loc_69><loc_59></location>2 RELATED WORK</subtitle-level-1>
|
||||
@ -96,12 +96,12 @@
|
||||
<paragraph><location><page_5><loc_9><loc_86><loc_48><loc_89></location>the textual content of an element, which goes beyond visual layout recognition, in particular outside the Scientific Articles category.</paragraph>
|
||||
<paragraph><location><page_5><loc_9><loc_68><loc_48><loc_86></location>At first sight, the task of visual document-layout interpretation appears intuitive enough to obtain plausible annotations in most cases. However, during early trial-runs in the core team, we observed many cases in which annotators use different annotation styles, especially for documents with challenging layouts. For example, if a figure is presented with subfigures, one annotator might draw a single figure bounding-box, while another might annotate each subfigure separately. The same applies for lists, where one might annotate all list items in one block or each list item separately. In essence, we observed that challenging layouts would be annotated in different but plausible ways. To illustrate this, we show in Figure 4 multiple examples of plausible but inconsistent annotations on the same pages.</paragraph>
|
||||
<paragraph><location><page_5><loc_9><loc_57><loc_48><loc_68></location>Obviously, this inconsistency in annotations is not desirable for datasets which are intended to be used for model training. To minimise these inconsistencies, we created a detailed annotation guideline. While perfect consistency across 40 annotation staff members is clearly not possible to achieve, we saw a huge improvement in annotation consistency after the introduction of our annotation guideline. A few selected, non-trivial highlights of the guideline are:</paragraph>
|
||||
<paragraph><location><page_5><loc_11><loc_51><loc_48><loc_56></location>(1) Every list-item is an individual object instance with class label List-item . This definition is different from PubLayNet and DocBank, where all list-items are grouped together into one List object.</paragraph>
|
||||
<paragraph><location><page_5><loc_11><loc_45><loc_48><loc_51></location>(2) A List-item is a paragraph with hanging indentation. Singleline elements can qualify as List-item if the neighbour elements expose hanging indentation. Bullet or enumeration symbols are not a requirement.</paragraph>
|
||||
<paragraph><location><page_5><loc_10><loc_42><loc_48><loc_45></location>(3) For every Caption , there must be exactly one corresponding Picture or Table .</paragraph>
|
||||
<paragraph><location><page_5><loc_10><loc_40><loc_48><loc_42></location>(4) Connected sub-pictures are grouped together in one Picture object.</paragraph>
|
||||
<paragraph><location><page_5><loc_10><loc_38><loc_43><loc_39></location>(5) Formula numbers are included in a Formula object.</paragraph>
|
||||
<paragraph><location><page_5><loc_11><loc_34><loc_48><loc_38></location>(6) Emphasised text (e.g. in italic or bold) at the beginning of a paragraph is not considered a Section-header , unless it appears exclusively on its own line.</paragraph>
|
||||
<paragraph><location><page_5><loc_11><loc_51><loc_48><loc_56></location>- (1) Every list-item is an individual object instance with class label List-item . This definition is different from PubLayNet and DocBank, where all list-items are grouped together into one List object.</paragraph>
|
||||
<paragraph><location><page_5><loc_11><loc_45><loc_48><loc_51></location>- (2) A List-item is a paragraph with hanging indentation. Singleline elements can qualify as List-item if the neighbour elements expose hanging indentation. Bullet or enumeration symbols are not a requirement.</paragraph>
|
||||
<paragraph><location><page_5><loc_10><loc_42><loc_48><loc_45></location>- (3) For every Caption , there must be exactly one corresponding Picture or Table .</paragraph>
|
||||
<paragraph><location><page_5><loc_10><loc_40><loc_48><loc_42></location>- (4) Connected sub-pictures are grouped together in one Picture object.</paragraph>
|
||||
<paragraph><location><page_5><loc_10><loc_38><loc_43><loc_39></location>- (5) Formula numbers are included in a Formula object.</paragraph>
|
||||
<paragraph><location><page_5><loc_11><loc_34><loc_48><loc_38></location>- (6) Emphasised text (e.g. in italic or bold) at the beginning of a paragraph is not considered a Section-header , unless it appears exclusively on its own line.</paragraph>
|
||||
<paragraph><location><page_5><loc_9><loc_27><loc_48><loc_33></location>The complete annotation guideline is over 100 pages long and a detailed description is obviously out of scope for this paper. Nevertheless, it will be made publicly available alongside with DocLayNet for future reference.</paragraph>
|
||||
<paragraph><location><page_5><loc_9><loc_11><loc_48><loc_27></location>Phase 3: Training. After a first trial with a small group of people, we realised that providing the annotation guideline and a set of random practice pages did not yield the desired quality level for layout annotation. Therefore we prepared a subset of pages with two different complexity levels, each with a practice and an exam part. 974 pages were reference-annotated by one proficient core team member. Annotation staff were then given the task to annotate the same subsets (blinded from the reference). By comparing the annotations of each staff member with the reference annotations, we could quantify how closely their annotations matched the reference. Only after passing two exam levels with high annotation quality, staff were admitted into the production phase. Practice iterations</paragraph>
|
||||
<caption><location><page_5><loc_52><loc_36><loc_91><loc_40></location>Figure 4: Examples of plausible annotation alternatives for the same page. Criteria in our annotation guideline can resolve cases A to C, while the case D remains ambiguous.</caption>
|
||||
@ -216,19 +216,19 @@
|
||||
<paragraph><location><page_8><loc_52><loc_64><loc_91><loc_76></location>From the dataset, we have derived on the one hand reference metrics for human performance on document-layout annotation (through double and triple annotations) and on the other hand evaluated the baseline performance of commonly used object detection methods. We also illustrated the impact of various dataset-related aspects on model performance through data-ablation experiments, both from a size and class-label perspective. Last but not least, we compared the accuracy of models trained on other public datasets and showed that DocLayNet trained models are more robust.</paragraph>
|
||||
<paragraph><location><page_8><loc_52><loc_60><loc_91><loc_64></location>To date, there is still a significant gap between human and ML accuracy on the layout interpretation task, and we hope that this work will inspire the research community to close that gap.</paragraph>
|
||||
<subtitle-level-1><location><page_8><loc_52><loc_56><loc_63><loc_58></location>REFERENCES</subtitle-level-1>
|
||||
<paragraph><location><page_8><loc_52><loc_53><loc_91><loc_56></location>[1] Max Göbel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013.</paragraph>
|
||||
<paragraph><location><page_8><loc_52><loc_49><loc_91><loc_53></location>[2] Christian Clausner, Apostolos Antonacopoulos, and Stefan Pletschacher. Icdar2017 competition on recognition of documents with complex layouts rdcl2017. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 1404-1410, 2017.</paragraph>
|
||||
<paragraph><location><page_8><loc_52><loc_46><loc_91><loc_49></location>[3] Hervé Déjean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), April 2019. http://sac.founderit.com/.</paragraph>
|
||||
<paragraph><location><page_8><loc_52><loc_42><loc_91><loc_46></location>[4] Antonio Jimeno Yepes, Peter Zhong, and Douglas Burdick. Competition on scientific literature parsing. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 605-617. LNCS 12824, SpringerVerlag, sep 2021.</paragraph>
|
||||
<paragraph><location><page_8><loc_52><loc_38><loc_91><loc_42></location>[5] Logan Markewich, Hao Zhang, Yubin Xing, Navid Lambert-Shirzad, Jiang Zhexin, Roy Lee, Zhi Li, and Seok-Bum Ko. Segmentation for document layout analysis: not dead yet. International Journal on Document Analysis and Recognition (IJDAR) , pages 1-11, 01 2022.</paragraph>
|
||||
<paragraph><location><page_8><loc_52><loc_35><loc_91><loc_38></location>[6] Xu Zhong, Jianbin Tang, and Antonio Jimeno-Yepes. Publaynet: Largest dataset ever for document layout analysis. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 1015-1022, sep 2019.</paragraph>
|
||||
<paragraph><location><page_8><loc_52><loc_30><loc_91><loc_35></location>[7] Minghao Li, Yiheng Xu, Lei Cui, Shaohan Huang, Furu Wei, Zhoujun Li, and Ming Zhou. Docbank: A benchmark dataset for document layout analysis. In Proceedings of the 28th International Conference on Computational Linguistics , COLING, pages 949-960. International Committee on Computational Linguistics, dec 2020.</paragraph>
|
||||
<paragraph><location><page_8><loc_52><loc_27><loc_91><loc_30></location>[8] Riaz Ahmad, Muhammad Tanvir Afzal, and M. Qadir. Information extraction from pdf sources based on rule-based system using integrated formats. In SemWebEval@ESWC , 2016.</paragraph>
|
||||
<paragraph><location><page_8><loc_52><loc_23><loc_91><loc_27></location>[9] Ross B. Girshick, Jeff Donahue, Trevor Darrell, and Jitendra Malik. Rich feature hierarchies for accurate object detection and semantic segmentation. In IEEE Conference on Computer Vision and Pattern Recognition , CVPR, pages 580-587. IEEE Computer Society, jun 2014.</paragraph>
|
||||
<paragraph><location><page_8><loc_52><loc_21><loc_91><loc_23></location>[10] Ross B. Girshick. Fast R-CNN. In 2015 IEEE International Conference on Computer Vision , ICCV, pages 1440-1448. IEEE Computer Society, dec 2015.</paragraph>
|
||||
<paragraph><location><page_8><loc_52><loc_18><loc_91><loc_21></location>[11] Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. Faster r-cnn: Towards real-time object detection with region proposal networks. IEEE Transactions on Pattern Analysis and Machine Intelligence , 39(6):1137-1149, 2017.</paragraph>
|
||||
<paragraph><location><page_8><loc_52><loc_15><loc_91><loc_18></location>[12] Kaiming He, Georgia Gkioxari, Piotr Dollár, and Ross B. Girshick. Mask R-CNN. In IEEE International Conference on Computer Vision , ICCV, pages 2980-2988. IEEE Computer Society, Oct 2017.</paragraph>
|
||||
<paragraph><location><page_8><loc_52><loc_11><loc_91><loc_15></location>[13] Glenn Jocher, Alex Stoken, Ayush Chaurasia, Jirka Borovec, NanoCode012, TaoXie, Yonghye Kwon, Kalen Michael, Liu Changyu, Jiacong Fang, Abhiram V, Laughing, tkianai, yxNONG, Piotr Skalski, Adam Hogan, Jebastin Nadar, imyhxy, Lorenzo Mammana, Alex Wang, Cristi Fati, Diego Montes, Jan Hajek, Laurentiu</paragraph>
|
||||
<paragraph><location><page_8><loc_52><loc_53><loc_91><loc_56></location>- [1] Max Göbel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013.</paragraph>
|
||||
<paragraph><location><page_8><loc_52><loc_49><loc_91><loc_53></location>- [2] Christian Clausner, Apostolos Antonacopoulos, and Stefan Pletschacher. Icdar2017 competition on recognition of documents with complex layouts rdcl2017. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 1404-1410, 2017.</paragraph>
|
||||
<paragraph><location><page_8><loc_52><loc_46><loc_91><loc_49></location>- [3] Hervé Déjean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), April 2019. http://sac.founderit.com/.</paragraph>
|
||||
<paragraph><location><page_8><loc_52><loc_42><loc_91><loc_46></location>- [4] Antonio Jimeno Yepes, Peter Zhong, and Douglas Burdick. Competition on scientific literature parsing. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 605-617. LNCS 12824, SpringerVerlag, sep 2021.</paragraph>
|
||||
<paragraph><location><page_8><loc_52><loc_38><loc_91><loc_42></location>- [5] Logan Markewich, Hao Zhang, Yubin Xing, Navid Lambert-Shirzad, Jiang Zhexin, Roy Lee, Zhi Li, and Seok-Bum Ko. Segmentation for document layout analysis: not dead yet. International Journal on Document Analysis and Recognition (IJDAR) , pages 1-11, 01 2022.</paragraph>
|
||||
<paragraph><location><page_8><loc_52><loc_35><loc_91><loc_38></location>- [6] Xu Zhong, Jianbin Tang, and Antonio Jimeno-Yepes. Publaynet: Largest dataset ever for document layout analysis. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 1015-1022, sep 2019.</paragraph>
|
||||
<paragraph><location><page_8><loc_52><loc_30><loc_91><loc_35></location>- [7] Minghao Li, Yiheng Xu, Lei Cui, Shaohan Huang, Furu Wei, Zhoujun Li, and Ming Zhou. Docbank: A benchmark dataset for document layout analysis. In Proceedings of the 28th International Conference on Computational Linguistics , COLING, pages 949-960. International Committee on Computational Linguistics, dec 2020.</paragraph>
|
||||
<paragraph><location><page_8><loc_52><loc_27><loc_91><loc_30></location>- [8] Riaz Ahmad, Muhammad Tanvir Afzal, and M. Qadir. Information extraction from pdf sources based on rule-based system using integrated formats. In SemWebEval@ESWC , 2016.</paragraph>
|
||||
<paragraph><location><page_8><loc_52><loc_23><loc_91><loc_27></location>- [9] Ross B. Girshick, Jeff Donahue, Trevor Darrell, and Jitendra Malik. Rich feature hierarchies for accurate object detection and semantic segmentation. In IEEE Conference on Computer Vision and Pattern Recognition , CVPR, pages 580-587. IEEE Computer Society, jun 2014.</paragraph>
|
||||
<paragraph><location><page_8><loc_52><loc_21><loc_91><loc_23></location>- [10] Ross B. Girshick. Fast R-CNN. In 2015 IEEE International Conference on Computer Vision , ICCV, pages 1440-1448. IEEE Computer Society, dec 2015.</paragraph>
|
||||
<paragraph><location><page_8><loc_52><loc_18><loc_91><loc_21></location>- [11] Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. Faster r-cnn: Towards real-time object detection with region proposal networks. IEEE Transactions on Pattern Analysis and Machine Intelligence , 39(6):1137-1149, 2017.</paragraph>
|
||||
<paragraph><location><page_8><loc_52><loc_15><loc_91><loc_18></location>- [12] Kaiming He, Georgia Gkioxari, Piotr Dollár, and Ross B. Girshick. Mask R-CNN. In IEEE International Conference on Computer Vision , ICCV, pages 2980-2988. IEEE Computer Society, Oct 2017.</paragraph>
|
||||
<paragraph><location><page_8><loc_52><loc_11><loc_91><loc_15></location>- [13] Glenn Jocher, Alex Stoken, Ayush Chaurasia, Jirka Borovec, NanoCode012, TaoXie, Yonghye Kwon, Kalen Michael, Liu Changyu, Jiacong Fang, Abhiram V, Laughing, tkianai, yxNONG, Piotr Skalski, Adam Hogan, Jebastin Nadar, imyhxy, Lorenzo Mammana, Alex Wang, Cristi Fati, Diego Montes, Jan Hajek, Laurentiu</paragraph>
|
||||
<caption><location><page_9><loc_9><loc_43><loc_52><loc_44></location>Text Caption List-Item Formula Table Section-Header Picture Page-Header Page-Footer Title</caption>
<figure>

<location><page_9><loc_9><loc_44><loc_91><loc_89></location>

@ -236,14 +236,14 @@

</figure>
<paragraph><location><page_9><loc_9><loc_36><loc_91><loc_41></location>Figure 6: Example layout predictions on selected pages from the DocLayNet test-set. (A, D) exhibit favourable results on coloured backgrounds. (B, C) show accurate list-item and paragraph differentiation despite densely-spaced lines. (E) demonstrates good table and figure distinction. (F) shows predictions on a Chinese patent with multiple overlaps, label confusion and missing boxes.</paragraph>
|
||||
<paragraph><location><page_9><loc_11><loc_31><loc_48><loc_34></location>Diaconu, Mai Thanh Minh, Marc, albinxavi, fatih, oleg, and wanghao yang. ultralytics/yolov5: v6.0 - yolov5n nano models, roboflow integration, tensorflow export, opencv dnn support, October 2021.</paragraph>
|
||||
<paragraph><location><page_9><loc_9><loc_28><loc_48><loc_30></location>[14] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-end object detection with transformers. CoRR , abs/2005.12872, 2020.</paragraph>
|
||||
<paragraph><location><page_9><loc_9><loc_26><loc_48><loc_27></location>[15] Mingxing Tan, Ruoming Pang, and Quoc V. Le. Efficientdet: Scalable and efficient object detection. CoRR , abs/1911.09070, 2019.</paragraph>
|
||||
<paragraph><location><page_9><loc_9><loc_23><loc_48><loc_25></location>[16] Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev, Ross B. Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Dollár, and C. Lawrence Zitnick. Microsoft COCO: common objects in context, 2014.</paragraph>
|
||||
<paragraph><location><page_9><loc_9><loc_21><loc_48><loc_23></location>[17] Yuxin Wu, Alexander Kirillov, Francisco Massa, Wan-Yen Lo, and Ross Girshick. Detectron2, 2019.</paragraph>
|
||||
<paragraph><location><page_9><loc_9><loc_16><loc_48><loc_20></location>[18] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter W. J. Staar. Robust pdf document conversion using recurrent neural networks. In Proceedings of the 35th Conference on Artificial Intelligence , AAAI, pages 1513715145, feb 2021.</paragraph>
|
||||
<paragraph><location><page_9><loc_9><loc_10><loc_48><loc_15></location>[19] Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and Ming Zhou. Layoutlm: Pre-training of text and layout for document image understanding. In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 1192-1200, New York, USA, 2020. Association for Computing Machinery.</paragraph>
|
||||
<paragraph><location><page_9><loc_52><loc_32><loc_91><loc_34></location>[20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021.</paragraph>
|
||||
<paragraph><location><page_9><loc_52><loc_29><loc_91><loc_31></location>[21] Peng Zhang, Can Li, Liang Qiao, Zhanzhan Cheng, Shiliang Pu, Yi Niu, and Fei Wu. Vsr: A unified framework for document layout analysis combining vision, semantics and relations, 2021.</paragraph>
|
||||
<paragraph><location><page_9><loc_52><loc_25><loc_91><loc_28></location>[22] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 774-782. ACM, 2018.</paragraph>
|
||||
<paragraph><location><page_9><loc_52><loc_23><loc_91><loc_24></location>[23] Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation for deep learning. Journal of Big Data , 6(1):60, 2019.</paragraph>
|
||||
<paragraph><location><page_9><loc_9><loc_28><loc_48><loc_30></location>- [14] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-end object detection with transformers. CoRR , abs/2005.12872, 2020.</paragraph>
|
||||
<paragraph><location><page_9><loc_9><loc_26><loc_48><loc_27></location>- [15] Mingxing Tan, Ruoming Pang, and Quoc V. Le. Efficientdet: Scalable and efficient object detection. CoRR , abs/1911.09070, 2019.</paragraph>
|
||||
<paragraph><location><page_9><loc_9><loc_23><loc_48><loc_25></location>- [16] Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev, Ross B. Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Dollár, and C. Lawrence Zitnick. Microsoft COCO: common objects in context, 2014.</paragraph>
|
||||
<paragraph><location><page_9><loc_9><loc_21><loc_48><loc_23></location>- [17] Yuxin Wu, Alexander Kirillov, Francisco Massa, Wan-Yen Lo, and Ross Girshick. Detectron2, 2019.</paragraph>
|
||||
<paragraph><location><page_9><loc_9><loc_16><loc_48><loc_20></location>- [18] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter W. J. Staar. Robust pdf document conversion using recurrent neural networks. In Proceedings of the 35th Conference on Artificial Intelligence , AAAI, pages 1513715145, feb 2021.</paragraph>
|
||||
<paragraph><location><page_9><loc_9><loc_10><loc_48><loc_15></location>- [19] Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and Ming Zhou. Layoutlm: Pre-training of text and layout for document image understanding. In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 1192-1200, New York, USA, 2020. Association for Computing Machinery.</paragraph>
|
||||
<paragraph><location><page_9><loc_52><loc_32><loc_91><loc_34></location>- [20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021.</paragraph>
|
||||
<paragraph><location><page_9><loc_52><loc_29><loc_91><loc_31></location>- [21] Peng Zhang, Can Li, Liang Qiao, Zhanzhan Cheng, Shiliang Pu, Yi Niu, and Fei Wu. Vsr: A unified framework for document layout analysis combining vision, semantics and relations, 2021.</paragraph>
|
||||
<paragraph><location><page_9><loc_52><loc_25><loc_91><loc_28></location>- [22] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 774-782. ACM, 2018.</paragraph>
|
||||
<paragraph><location><page_9><loc_52><loc_23><loc_91><loc_24></location>- [23] Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation for deep learning. Journal of Big Data , 6(1):60, 2019.</paragraph>
</document>
File diff suppressed because one or more lines are too long
@ -59,17 +59,17 @@ A key problem in the process of document conversion is to understand the structu
|
||||
|
||||
In this paper, we present the DocLayNet dataset. It provides pageby-page layout annotation ground-truth using bounding-boxes for 11 distinct class labels on 80863 unique document pages, of which a fraction carry double- or triple-annotations. DocLayNet is similar in spirit to PubLayNet and DocBank and will likewise be made available to the public 1 in order to stimulate the document-layout analysis community. It distinguishes itself in the following aspects:
|
||||
|
||||
(1) Human Annotation : In contrast to PubLayNet and DocBank, we relied on human annotation instead of automation approaches to generate the data set.
|
||||
- (1) Human Annotation : In contrast to PubLayNet and DocBank, we relied on human annotation instead of automation approaches to generate the data set.
|
||||
|
||||
(2) Large Layout Variability : We include diverse and complex layouts from a large variety of public sources.
|
||||
- (2) Large Layout Variability : We include diverse and complex layouts from a large variety of public sources.
|
||||
|
||||
(3) Detailed Label Set : We define 11 class labels to distinguish layout features in high detail. PubLayNet provides 5 labels; DocBank provides 13, although not a superset of ours.
|
||||
- (3) Detailed Label Set : We define 11 class labels to distinguish layout features in high detail. PubLayNet provides 5 labels; DocBank provides 13, although not a superset of ours.
|
||||
|
||||
(4) Redundant Annotations : A fraction of the pages in the DocLayNet data set carry more than one human annotation.
|
||||
- (4) Redundant Annotations : A fraction of the pages in the DocLayNet data set carry more than one human annotation.
|
||||
|
||||
This enables experimentation with annotation uncertainty and quality control analysis.
|
||||
|
||||
(5) Pre-defined Train-, Test- & Validation-set : Like DocBank, we provide fixed train-, test- & validation-sets to ensure proportional representation of the class-labels. Further, we prevent leakage of unique layouts across sets, which has a large effect on model accuracy scores.
|
||||
- (5) Pre-defined Train-, Test- & Validation-set : Like DocBank, we provide fixed train-, test- & validation-sets to ensure proportional representation of the class-labels. Further, we prevent leakage of unique layouts across sets, which has a large effect on model accuracy scores.
|
||||
|
||||
All aspects outlined above are detailed in Section 3. In Section 4, we will elaborate on how we designed and executed this large-scale human annotation campaign. We will also share key insights and lessons learned that might prove helpful for other parties planning to set up annotation campaigns.
|
||||
|
||||
@ -145,17 +145,17 @@ At first sight, the task of visual document-layout interpretation appears intuit
|
||||
|
||||
Obviously, this inconsistency in annotations is not desirable for datasets which are intended to be used for model training. To minimise these inconsistencies, we created a detailed annotation guideline. While perfect consistency across 40 annotation staff members is clearly not possible to achieve, we saw a huge improvement in annotation consistency after the introduction of our annotation guideline. A few selected, non-trivial highlights of the guideline are:
|
||||
|
||||
(1) Every list-item is an individual object instance with class label List-item . This definition is different from PubLayNet and DocBank, where all list-items are grouped together into one List object.
|
||||
- (1) Every list-item is an individual object instance with class label List-item . This definition is different from PubLayNet and DocBank, where all list-items are grouped together into one List object.
|
||||
|
||||
(2) A List-item is a paragraph with hanging indentation. Singleline elements can qualify as List-item if the neighbour elements expose hanging indentation. Bullet or enumeration symbols are not a requirement.
|
||||
- (2) A List-item is a paragraph with hanging indentation. Singleline elements can qualify as List-item if the neighbour elements expose hanging indentation. Bullet or enumeration symbols are not a requirement.
|
||||
|
||||
(3) For every Caption , there must be exactly one corresponding Picture or Table .
|
||||
- (3) For every Caption , there must be exactly one corresponding Picture or Table .
|
||||
|
||||
(4) Connected sub-pictures are grouped together in one Picture object.
|
||||
- (4) Connected sub-pictures are grouped together in one Picture object.
|
||||
|
||||
(5) Formula numbers are included in a Formula object.
|
||||
- (5) Formula numbers are included in a Formula object.
|
||||
|
||||
(6) Emphasised text (e.g. in italic or bold) at the beginning of a paragraph is not considered a Section-header , unless it appears exclusively on its own line.
|
||||
- (6) Emphasised text (e.g. in italic or bold) at the beginning of a paragraph is not considered a Section-header , unless it appears exclusively on its own line.
|
||||
|
||||
The complete annotation guideline is over 100 pages long and a detailed description is obviously out of scope for this paper. Nevertheless, it will be made publicly available alongside with DocLayNet for future reference.
|
||||
|
||||
@ -295,31 +295,31 @@ To date, there is still a significant gap between human and ML accuracy on the l
## REFERENCES
[1] Max Göbel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013.
|
||||
- [1] Max Göbel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013.
|
||||
|
||||
[2] Christian Clausner, Apostolos Antonacopoulos, and Stefan Pletschacher. Icdar2017 competition on recognition of documents with complex layouts rdcl2017. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 1404-1410, 2017.
|
||||
- [2] Christian Clausner, Apostolos Antonacopoulos, and Stefan Pletschacher. Icdar2017 competition on recognition of documents with complex layouts rdcl2017. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 1404-1410, 2017.
|
||||
|
||||
[3] Hervé Déjean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), April 2019. http://sac.founderit.com/.
|
||||
- [3] Hervé Déjean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), April 2019. http://sac.founderit.com/.
|
||||
|
||||
[4] Antonio Jimeno Yepes, Peter Zhong, and Douglas Burdick. Competition on scientific literature parsing. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 605-617. LNCS 12824, SpringerVerlag, sep 2021.
|
||||
- [4] Antonio Jimeno Yepes, Peter Zhong, and Douglas Burdick. Competition on scientific literature parsing. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 605-617. LNCS 12824, SpringerVerlag, sep 2021.
|
||||
|
||||
[5] Logan Markewich, Hao Zhang, Yubin Xing, Navid Lambert-Shirzad, Jiang Zhexin, Roy Lee, Zhi Li, and Seok-Bum Ko. Segmentation for document layout analysis: not dead yet. International Journal on Document Analysis and Recognition (IJDAR) , pages 1-11, 01 2022.
|
||||
- [5] Logan Markewich, Hao Zhang, Yubin Xing, Navid Lambert-Shirzad, Jiang Zhexin, Roy Lee, Zhi Li, and Seok-Bum Ko. Segmentation for document layout analysis: not dead yet. International Journal on Document Analysis and Recognition (IJDAR) , pages 1-11, 01 2022.
|
||||
|
||||
[6] Xu Zhong, Jianbin Tang, and Antonio Jimeno-Yepes. Publaynet: Largest dataset ever for document layout analysis. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 1015-1022, sep 2019.
|
||||
- [6] Xu Zhong, Jianbin Tang, and Antonio Jimeno-Yepes. Publaynet: Largest dataset ever for document layout analysis. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 1015-1022, sep 2019.
|
||||
|
||||
[7] Minghao Li, Yiheng Xu, Lei Cui, Shaohan Huang, Furu Wei, Zhoujun Li, and Ming Zhou. Docbank: A benchmark dataset for document layout analysis. In Proceedings of the 28th International Conference on Computational Linguistics , COLING, pages 949-960. International Committee on Computational Linguistics, dec 2020.
|
||||
- [7] Minghao Li, Yiheng Xu, Lei Cui, Shaohan Huang, Furu Wei, Zhoujun Li, and Ming Zhou. Docbank: A benchmark dataset for document layout analysis. In Proceedings of the 28th International Conference on Computational Linguistics , COLING, pages 949-960. International Committee on Computational Linguistics, dec 2020.
|
||||
|
||||
[8] Riaz Ahmad, Muhammad Tanvir Afzal, and M. Qadir. Information extraction from pdf sources based on rule-based system using integrated formats. In SemWebEval@ESWC , 2016.
|
||||
- [8] Riaz Ahmad, Muhammad Tanvir Afzal, and M. Qadir. Information extraction from pdf sources based on rule-based system using integrated formats. In SemWebEval@ESWC , 2016.
|
||||
|
||||
[9] Ross B. Girshick, Jeff Donahue, Trevor Darrell, and Jitendra Malik. Rich feature hierarchies for accurate object detection and semantic segmentation. In IEEE Conference on Computer Vision and Pattern Recognition , CVPR, pages 580-587. IEEE Computer Society, jun 2014.
|
||||
- [9] Ross B. Girshick, Jeff Donahue, Trevor Darrell, and Jitendra Malik. Rich feature hierarchies for accurate object detection and semantic segmentation. In IEEE Conference on Computer Vision and Pattern Recognition , CVPR, pages 580-587. IEEE Computer Society, jun 2014.
|
||||
|
||||
[10] Ross B. Girshick. Fast R-CNN. In 2015 IEEE International Conference on Computer Vision , ICCV, pages 1440-1448. IEEE Computer Society, dec 2015.
|
||||
- [10] Ross B. Girshick. Fast R-CNN. In 2015 IEEE International Conference on Computer Vision , ICCV, pages 1440-1448. IEEE Computer Society, dec 2015.
|
||||
|
||||
[11] Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. Faster r-cnn: Towards real-time object detection with region proposal networks. IEEE Transactions on Pattern Analysis and Machine Intelligence , 39(6):1137-1149, 2017.
|
||||
- [11] Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. Faster r-cnn: Towards real-time object detection with region proposal networks. IEEE Transactions on Pattern Analysis and Machine Intelligence , 39(6):1137-1149, 2017.
|
||||
|
||||
[12] Kaiming He, Georgia Gkioxari, Piotr Dollár, and Ross B. Girshick. Mask R-CNN. In IEEE International Conference on Computer Vision , ICCV, pages 2980-2988. IEEE Computer Society, Oct 2017.
|
||||
- [12] Kaiming He, Georgia Gkioxari, Piotr Dollár, and Ross B. Girshick. Mask R-CNN. In IEEE International Conference on Computer Vision , ICCV, pages 2980-2988. IEEE Computer Society, Oct 2017.
|
||||
|
||||
[13] Glenn Jocher, Alex Stoken, Ayush Chaurasia, Jirka Borovec, NanoCode012, TaoXie, Yonghye Kwon, Kalen Michael, Liu Changyu, Jiacong Fang, Abhiram V, Laughing, tkianai, yxNONG, Piotr Skalski, Adam Hogan, Jebastin Nadar, imyhxy, Lorenzo Mammana, Alex Wang, Cristi Fati, Diego Montes, Jan Hajek, Laurentiu
|
||||
- [13] Glenn Jocher, Alex Stoken, Ayush Chaurasia, Jirka Borovec, NanoCode012, TaoXie, Yonghye Kwon, Kalen Michael, Liu Changyu, Jiacong Fang, Abhiram V, Laughing, tkianai, yxNONG, Piotr Skalski, Adam Hogan, Jebastin Nadar, imyhxy, Lorenzo Mammana, Alex Wang, Cristi Fati, Diego Montes, Jan Hajek, Laurentiu
Text Caption List-Item Formula Table Section-Header Picture Page-Header Page-Footer Title
<!-- image -->
@@ -328,22 +328,22 @@ Figure 6: Example layout predictions on selected pages from the DocLayNet test-s
Diaconu, Mai Thanh Minh, Marc, albinxavi, fatih, oleg, and wanghao yang. ultralytics/yolov5: v6.0 - yolov5n nano models, roboflow integration, tensorflow export, opencv dnn support, October 2021.
[14] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-end object detection with transformers. CoRR , abs/2005.12872, 2020.
- [14] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-end object detection with transformers. CoRR , abs/2005.12872, 2020.
[15] Mingxing Tan, Ruoming Pang, and Quoc V. Le. Efficientdet: Scalable and efficient object detection. CoRR , abs/1911.09070, 2019.
- [15] Mingxing Tan, Ruoming Pang, and Quoc V. Le. Efficientdet: Scalable and efficient object detection. CoRR , abs/1911.09070, 2019.
[16] Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev, Ross B. Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Dollár, and C. Lawrence Zitnick. Microsoft COCO: common objects in context, 2014.
- [16] Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev, Ross B. Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Dollár, and C. Lawrence Zitnick. Microsoft COCO: common objects in context, 2014.
[17] Yuxin Wu, Alexander Kirillov, Francisco Massa, Wan-Yen Lo, and Ross Girshick. Detectron2, 2019.
- [17] Yuxin Wu, Alexander Kirillov, Francisco Massa, Wan-Yen Lo, and Ross Girshick. Detectron2, 2019.
[18] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter W. J. Staar. Robust pdf document conversion using recurrent neural networks. In Proceedings of the 35th Conference on Artificial Intelligence , AAAI, pages 1513715145, feb 2021.
- [18] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter W. J. Staar. Robust pdf document conversion using recurrent neural networks. In Proceedings of the 35th Conference on Artificial Intelligence , AAAI, pages 1513715145, feb 2021.
[19] Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and Ming Zhou. Layoutlm: Pre-training of text and layout for document image understanding. In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 1192-1200, New York, USA, 2020. Association for Computing Machinery.
- [19] Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and Ming Zhou. Layoutlm: Pre-training of text and layout for document image understanding. In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 1192-1200, New York, USA, 2020. Association for Computing Machinery.
[20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021.
- [20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021.
[21] Peng Zhang, Can Li, Liang Qiao, Zhanzhan Cheng, Shiliang Pu, Yi Niu, and Fei Wu. Vsr: A unified framework for document layout analysis combining vision, semantics and relations, 2021.
- [21] Peng Zhang, Can Li, Liang Qiao, Zhanzhan Cheng, Shiliang Pu, Yi Niu, and Fei Wu. Vsr: A unified framework for document layout analysis combining vision, semantics and relations, 2021.
[22] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 774-782. ACM, 2018.
- [22] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 774-782. ACM, 2018.
[23] Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation for deep learning. Journal of Big Data , 6(1):60, 2019.
- [23] Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation for deep learning. Journal of Big Data , 6(1):60, 2019.
File diff suppressed because one or more lines are too long
@@ -40,11 +40,11 @@
<subtitle-level-1><location><page_6><loc_22><loc_40><loc_43><loc_41></location>4.1 Language Definition</subtitle-level-1>
<paragraph><location><page_6><loc_22><loc_34><loc_79><loc_38></location>In Figure 3, we illustrate how the OTSL is defined. In essence, the OTSL defines only 5 tokens that directly describe a tabular structure based on an atomic 2D grid.</paragraph>
<paragraph><location><page_6><loc_24><loc_32><loc_67><loc_34></location>The OTSL vocabulary is comprised of the following tokens:</paragraph>
<paragraph><location><page_6><loc_23><loc_30><loc_75><loc_31></location>-"C" cell a new table cell that either has or does not have cell content</paragraph>
<paragraph><location><page_6><loc_23><loc_27><loc_79><loc_29></location>-"L" cell left-looking cell , merging with the left neighbor cell to create a span</paragraph>
<paragraph><location><page_6><loc_23><loc_24><loc_79><loc_26></location>-"U" cell up-looking cell , merging with the upper neighbor cell to create a span</paragraph>
<paragraph><location><page_6><loc_23><loc_22><loc_74><loc_23></location>-"X" cell cross cell , to merge with both left and upper neighbor cells</paragraph>
<paragraph><location><page_6><loc_23><loc_20><loc_54><loc_22></location>-"NL" new-line , switch to the next row.</paragraph>
<paragraph><location><page_6><loc_23><loc_30><loc_75><loc_31></location>- -"C" cell a new table cell that either has or does not have cell content</paragraph>
<paragraph><location><page_6><loc_23><loc_27><loc_79><loc_29></location>- -"L" cell left-looking cell , merging with the left neighbor cell to create a span</paragraph>
<paragraph><location><page_6><loc_23><loc_24><loc_79><loc_26></location>- -"U" cell up-looking cell , merging with the upper neighbor cell to create a span</paragraph>
<paragraph><location><page_6><loc_23><loc_22><loc_74><loc_23></location>- -"X" cell cross cell , to merge with both left and upper neighbor cells</paragraph>
<paragraph><location><page_6><loc_23><loc_20><loc_54><loc_22></location>- -"NL" new-line , switch to the next row.</paragraph>
<paragraph><location><page_6><loc_22><loc_16><loc_79><loc_19></location>A notable attribute of OTSL is that it has the capability of achieving lossless conversion to HTML.</paragraph>
<caption><location><page_7><loc_22><loc_80><loc_79><loc_84></location>Fig. 3. OTSL description of table structure: A - table example; B - graphical representation of table structure; C - mapping structure on a grid; D - OTSL structure encoding; E - explanation on cell encoding</caption>
<figure>
@@ -53,13 +53,13 @@
</figure>
<subtitle-level-1><location><page_7><loc_22><loc_60><loc_40><loc_62></location>4.2 Language Syntax</subtitle-level-1>
<paragraph><location><page_7><loc_22><loc_58><loc_59><loc_59></location>The OTSL representation follows these syntax rules:</paragraph>
<paragraph><location><page_7><loc_23><loc_54><loc_79><loc_56></location>1. Left-looking cell rule : The left neighbour of an "L" cell must be either another "L" cell or a "C" cell.</paragraph>
<paragraph><location><page_7><loc_23><loc_51><loc_79><loc_53></location>2. Up-looking cell rule : The upper neighbour of a "U" cell must be either another "U" cell or a "C" cell.</paragraph>
<paragraph><location><page_7><loc_23><loc_54><loc_79><loc_56></location>- 1. Left-looking cell rule : The left neighbour of an "L" cell must be either another "L" cell or a "C" cell.</paragraph>
<paragraph><location><page_7><loc_23><loc_51><loc_79><loc_53></location>- 2. Up-looking cell rule : The upper neighbour of a "U" cell must be either another "U" cell or a "C" cell.</paragraph>
<subtitle-level-1><location><page_7><loc_23><loc_49><loc_37><loc_50></location>3. Cross cell rule :</subtitle-level-1>
<paragraph><location><page_7><loc_24><loc_44><loc_79><loc_49></location>The left neighbour of an "X" cell must be either another "X" cell or a "U" cell, and the upper neighbour of an "X" cell must be either another "X" cell or an "L" cell.</paragraph>
<paragraph><location><page_7><loc_23><loc_43><loc_78><loc_44></location>4. First row rule : Only "L" cells and "C" cells are allowed in the first row.</paragraph>
<paragraph><location><page_7><loc_23><loc_40><loc_79><loc_43></location>5. First column rule : Only "U" cells and "C" cells are allowed in the first column.</paragraph>
<paragraph><location><page_7><loc_23><loc_37><loc_79><loc_40></location>6. Rectangular rule : The table representation is always rectangular - all rows must have an equal number of tokens, terminated with "NL" token.</paragraph>
<paragraph><location><page_7><loc_24><loc_44><loc_79><loc_49></location>- The left neighbour of an "X" cell must be either another "X" cell or a "U" cell, and the upper neighbour of an "X" cell must be either another "X" cell or an "L" cell.</paragraph>
<paragraph><location><page_7><loc_23><loc_43><loc_78><loc_44></location>- 4. First row rule : Only "L" cells and "C" cells are allowed in the first row.</paragraph>
<paragraph><location><page_7><loc_23><loc_40><loc_79><loc_43></location>- 5. First column rule : Only "U" cells and "C" cells are allowed in the first column.</paragraph>
<paragraph><location><page_7><loc_23><loc_37><loc_79><loc_40></location>- 6. Rectangular rule : The table representation is always rectangular - all rows must have an equal number of tokens, terminated with "NL" token.</paragraph>
<paragraph><location><page_7><loc_22><loc_19><loc_79><loc_35></location>The application of these rules gives OTSL a set of unique properties. First of all, the OTSL enforces a strictly rectangular structure representation, where every new-line token starts a new row. As a consequence, all rows and all columns have exactly the same number of tokens, irrespective of cell spans. Secondly, the OTSL representation is unambiguous: Every table structure is represented in one way. In this representation every table cell corresponds to a "C"-cell token, which in case of spans is always located in the top-left corner of the table cell definition. Third, OTSL syntax rules are only backward-looking. As a consequence, every predicted token can be validated straight during sequence generation by looking at the previously predicted sequence. As such, OTSL can guarantee that every predicted sequence is syntactically valid.</paragraph>
<paragraph><location><page_7><loc_22><loc_16><loc_79><loc_19></location>These characteristics can be easily learned by sequence generator networks, as we demonstrate further below. We find strong indications that this pattern</paragraph>
<paragraph><location><page_8><loc_22><loc_82><loc_79><loc_85></location>reduces significantly the column drift seen in the HTML based models (see Figure 5).</paragraph>
@@ -123,27 +123,27 @@
<paragraph><location><page_12><loc_22><loc_59><loc_79><loc_74></location>First and foremost, given the same network configuration, inference time for a table-structure prediction is about 2 times faster compared to the conventional HTML approach. This is primarily owed to the shorter sequence length of the OTSL representation. Additional performance benefits can be obtained with HPO (hyper parameter optimization). As we demonstrate in our experiments, models trained on OTSL can be significantly smaller, e.g. by reducing the number of encoder and decoder layers, while preserving comparatively good prediction quality. This can further improve inference performance, yielding 5-6 times faster inference speed in OTSL with prediction quality comparable to models trained on HTML (see Table 1).</paragraph>
<paragraph><location><page_12><loc_22><loc_41><loc_79><loc_59></location>Secondly, OTSL has more inherent structure and a significantly restricted vocabulary size. This allows autoregressive models to perform better in the TED metric, but especially with regards to prediction accuracy of the table-cell bounding boxes (see Table 2). As shown in Figure 5, we observe that the OTSL drastically reduces the drift for table cell bounding boxes at high row count and in sparse tables. This leads to more accurate predictions and a significant reduction in post-processing complexity, which is an undesired necessity in HTML-based Im2Seq models. Significant novelty lies in OTSL syntactical rules, which are few, simple and always backwards looking. Each new token can be validated only by analyzing the sequence of previous tokens, without requiring the entire sequence to detect mistakes. This in return allows to perform structural error detection and correction on-the-fly during sequence generation.</paragraph>
<subtitle-level-1><location><page_12><loc_22><loc_36><loc_32><loc_38></location>References</subtitle-level-1>
<paragraph><location><page_12><loc_23><loc_29><loc_79><loc_34></location>1. Auer, C., Dolfi, M., Carvalho, A., Ramis, C.B., Staar, P.W.J.: Delivering document conversion as a cloud service with high throughput and responsiveness. CoRR abs/2206.00785 (2022). https://doi.org/10.48550/arXiv.2206.00785 , https://doi.org/10.48550/arXiv.2206.00785</paragraph>
<paragraph><location><page_12><loc_23><loc_23><loc_79><loc_29></location>2. Chen, B., Peng, D., Zhang, J., Ren, Y., Jin, L.: Complex table structure recognition in the wild using transformer and identity matrix-based augmentation. In: Porwal, U., Fornés, A., Shafait, F. (eds.) Frontiers in Handwriting Recognition. pp. 545561. Springer International Publishing, Cham (2022)</paragraph>
<paragraph><location><page_12><loc_23><loc_20><loc_79><loc_23></location>3. Chi, Z., Huang, H., Xu, H.D., Yu, H., Yin, W., Mao, X.L.: Complicated table structure recognition. arXiv preprint arXiv:1908.04729 (2019)</paragraph>
<paragraph><location><page_12><loc_23><loc_16><loc_79><loc_20></location>4. Deng, Y., Rosenberg, D., Mann, G.: Challenges in end-to-end neural scientific table recognition. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 894-901. IEEE (2019)</paragraph>
<paragraph><location><page_13><loc_23><loc_81><loc_79><loc_85></location>5. Kayal, P., Anand, M., Desai, H., Singh, M.: Tables to latex: structure and content extraction from scientific tables. International Journal on Document Analysis and Recognition (IJDAR) pp. 1-10 (2022)</paragraph>
<paragraph><location><page_13><loc_23><loc_76><loc_79><loc_81></location>6. Lee, E., Kwon, J., Yang, H., Park, J., Lee, S., Koo, H.I., Cho, N.I.: Table structure recognition based on grid shape graph. In: 2022 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC). pp. 18681873. IEEE (2022)</paragraph>
<paragraph><location><page_13><loc_23><loc_73><loc_79><loc_75></location>7. Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: A benchmark dataset for table detection and recognition (2019)</paragraph>
<paragraph><location><page_13><loc_23><loc_66><loc_79><loc_72></location>8. Livathinos, N., Berrospi, C., Lysak, M., Kuropiatnyk, V., Nassar, A., Carvalho, A., Dolfi, M., Auer, C., Dinkla, K., Staar, P.: Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence 35 (17), 15137-15145 (May 2021), https://ojs.aaai.org/index.php/ AAAI/article/view/17777</paragraph>
<paragraph><location><page_13><loc_23><loc_62><loc_79><loc_66></location>9. Nassar, A., Livathinos, N., Lysak, M., Staar, P.: Tableformer: Table structure understanding with transformers. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 4614-4623 (June 2022)</paragraph>
<paragraph><location><page_13><loc_22><loc_53><loc_79><loc_61></location>10. Pfitzmann, B., Auer, C., Dolfi, M., Nassar, A.S., Staar, P.W.J.: Doclaynet: A large human-annotated dataset for document-layout segmentation. In: Zhang, A., Rangwala, H. (eds.) KDD '22: The 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, Washington, DC, USA, August 14 - 18, 2022. pp. 3743-3751. ACM (2022). https://doi.org/10.1145/3534678.3539043 , https:// doi.org/10.1145/3534678.3539043</paragraph>
<paragraph><location><page_13><loc_22><loc_48><loc_79><loc_53></location>11. Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet: An approach for end to end table detection and structure recognition from imagebased documents. In: Proceedings of the IEEE/CVF conference on computer vision and pattern recognition workshops. pp. 572-573 (2020)</paragraph>
<paragraph><location><page_13><loc_22><loc_42><loc_79><loc_48></location>12. Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162-1167. IEEE (2017)</paragraph>
<paragraph><location><page_13><loc_22><loc_37><loc_79><loc_42></location>13. Siddiqui, S.A., Fateh, I.A., Rizvi, S.T.R., Dengel, A., Ahmed, S.: Deeptabstr: Deep learning based table structure recognition. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1403-1409 (2019). https:// doi.org/10.1109/ICDAR.2019.00226</paragraph>
<paragraph><location><page_13><loc_22><loc_31><loc_79><loc_37></location>14. Smock, B., Pesala, R., Abraham, R.: PubTables-1M: Towards comprehensive table extraction from unstructured documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 4634-4642 (June 2022)</paragraph>
<paragraph><location><page_13><loc_22><loc_23><loc_79><loc_31></location>15. Staar, P.W.J., Dolfi, M., Auer, C., Bekas, C.: Corpus conversion service: A machine learning platform to ingest documents at scale. In: Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. pp. 774-782. KDD '18, Association for Computing Machinery, New York, NY, USA (2018). https://doi.org/10.1145/3219819.3219834 , https://doi.org/10. 1145/3219819.3219834</paragraph>
<paragraph><location><page_13><loc_22><loc_20><loc_79><loc_23></location>16. Wang, X.: Tabular Abstraction, Editing, and Formatting. Ph.D. thesis, CAN (1996), aAINN09397</paragraph>
<paragraph><location><page_13><loc_22><loc_16><loc_79><loc_20></location>17. Xue, W., Li, Q., Tao, D.: Res2tim: Reconstruct syntactic structures from table images. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 749-755. IEEE (2019)</paragraph>
<paragraph><location><page_14><loc_22><loc_81><loc_79><loc_85></location>18. Xue, W., Yu, B., Wang, W., Tao, D., Li, Q.: Tgrnet: A table graph reconstruction network for table structure recognition. In: Proceedings of the IEEE/CVF International Conference on Computer Vision. pp. 1295-1304 (2021)</paragraph>
<paragraph><location><page_14><loc_22><loc_76><loc_79><loc_81></location>19. Ye, J., Qi, X., He, Y., Chen, Y., Gu, D., Gao, P., Xiao, R.: Pingan-vcgroup's solution for icdar 2021 competition on scientific literature parsing task b: Table recognition to html (2021). https://doi.org/10.48550/ARXIV.2105.01848 , https://arxiv.org/abs/2105.01848</paragraph>
<paragraph><location><page_14><loc_22><loc_73><loc_79><loc_75></location>20. Zhang, Z., Zhang, J., Du, J., Wang, F.: Split, embed and merge: An accurate table structure recognizer. Pattern Recognition 126 , 108565 (2022)</paragraph>
<paragraph><location><page_14><loc_22><loc_66><loc_79><loc_73></location>21. Zheng, X., Burdick, D., Popa, L., Zhong, X., Wang, N.X.R.: Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. In: 2021 IEEE Winter Conference on Applications of Computer Vision (WACV). pp. 697-706 (2021). https://doi.org/10.1109/WACV48630.2021. 00074</paragraph>
<paragraph><location><page_14><loc_22><loc_60><loc_79><loc_66></location>22. Zhong, X., ShafieiBavani, E., Jimeno Yepes, A.: Image-based table recognition: Data, model, and evaluation. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.M. (eds.) Computer Vision - ECCV 2020. pp. 564-580. Springer International Publishing, Cham (2020)</paragraph>
<paragraph><location><page_14><loc_22><loc_56><loc_79><loc_60></location>23. Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for document layout analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1015-1022. IEEE (2019)</paragraph>
<paragraph><location><page_12><loc_23><loc_29><loc_79><loc_34></location>- 1. Auer, C., Dolfi, M., Carvalho, A., Ramis, C.B., Staar, P.W.J.: Delivering document conversion as a cloud service with high throughput and responsiveness. CoRR abs/2206.00785 (2022). https://doi.org/10.48550/arXiv.2206.00785 , https://doi.org/10.48550/arXiv.2206.00785</paragraph>
<paragraph><location><page_12><loc_23><loc_23><loc_79><loc_29></location>- 2. Chen, B., Peng, D., Zhang, J., Ren, Y., Jin, L.: Complex table structure recognition in the wild using transformer and identity matrix-based augmentation. In: Porwal, U., Fornés, A., Shafait, F. (eds.) Frontiers in Handwriting Recognition. pp. 545561. Springer International Publishing, Cham (2022)</paragraph>
<paragraph><location><page_12><loc_23><loc_20><loc_79><loc_23></location>- 3. Chi, Z., Huang, H., Xu, H.D., Yu, H., Yin, W., Mao, X.L.: Complicated table structure recognition. arXiv preprint arXiv:1908.04729 (2019)</paragraph>
<paragraph><location><page_12><loc_23><loc_16><loc_79><loc_20></location>- 4. Deng, Y., Rosenberg, D., Mann, G.: Challenges in end-to-end neural scientific table recognition. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 894-901. IEEE (2019)</paragraph>
<paragraph><location><page_13><loc_23><loc_81><loc_79><loc_85></location>- 5. Kayal, P., Anand, M., Desai, H., Singh, M.: Tables to latex: structure and content extraction from scientific tables. International Journal on Document Analysis and Recognition (IJDAR) pp. 1-10 (2022)</paragraph>
<paragraph><location><page_13><loc_23><loc_76><loc_79><loc_81></location>- 6. Lee, E., Kwon, J., Yang, H., Park, J., Lee, S., Koo, H.I., Cho, N.I.: Table structure recognition based on grid shape graph. In: 2022 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC). pp. 18681873. IEEE (2022)</paragraph>
<paragraph><location><page_13><loc_23><loc_73><loc_79><loc_75></location>- 7. Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: A benchmark dataset for table detection and recognition (2019)</paragraph>
<paragraph><location><page_13><loc_23><loc_66><loc_79><loc_72></location>- 8. Livathinos, N., Berrospi, C., Lysak, M., Kuropiatnyk, V., Nassar, A., Carvalho, A., Dolfi, M., Auer, C., Dinkla, K., Staar, P.: Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence 35 (17), 15137-15145 (May 2021), https://ojs.aaai.org/index.php/ AAAI/article/view/17777</paragraph>
<paragraph><location><page_13><loc_23><loc_62><loc_79><loc_66></location>- 9. Nassar, A., Livathinos, N., Lysak, M., Staar, P.: Tableformer: Table structure understanding with transformers. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 4614-4623 (June 2022)</paragraph>
<paragraph><location><page_13><loc_22><loc_53><loc_79><loc_61></location>- 10. Pfitzmann, B., Auer, C., Dolfi, M., Nassar, A.S., Staar, P.W.J.: Doclaynet: A large human-annotated dataset for document-layout segmentation. In: Zhang, A., Rangwala, H. (eds.) KDD '22: The 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, Washington, DC, USA, August 14 - 18, 2022. pp. 3743-3751. ACM (2022). https://doi.org/10.1145/3534678.3539043 , https:// doi.org/10.1145/3534678.3539043</paragraph>
<paragraph><location><page_13><loc_22><loc_48><loc_79><loc_53></location>- 11. Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet: An approach for end to end table detection and structure recognition from imagebased documents. In: Proceedings of the IEEE/CVF conference on computer vision and pattern recognition workshops. pp. 572-573 (2020)</paragraph>
<paragraph><location><page_13><loc_22><loc_42><loc_79><loc_48></location>- 12. Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162-1167. IEEE (2017)</paragraph>
<paragraph><location><page_13><loc_22><loc_37><loc_79><loc_42></location>- 13. Siddiqui, S.A., Fateh, I.A., Rizvi, S.T.R., Dengel, A., Ahmed, S.: Deeptabstr: Deep learning based table structure recognition. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1403-1409 (2019). https:// doi.org/10.1109/ICDAR.2019.00226</paragraph>
<paragraph><location><page_13><loc_22><loc_31><loc_79><loc_37></location>- 14. Smock, B., Pesala, R., Abraham, R.: PubTables-1M: Towards comprehensive table extraction from unstructured documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 4634-4642 (June 2022)</paragraph>
<paragraph><location><page_13><loc_22><loc_23><loc_79><loc_31></location>- 15. Staar, P.W.J., Dolfi, M., Auer, C., Bekas, C.: Corpus conversion service: A machine learning platform to ingest documents at scale. In: Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. pp. 774-782. KDD '18, Association for Computing Machinery, New York, NY, USA (2018). https://doi.org/10.1145/3219819.3219834 , https://doi.org/10. 1145/3219819.3219834</paragraph>
<paragraph><location><page_13><loc_22><loc_20><loc_79><loc_23></location>- 16. Wang, X.: Tabular Abstraction, Editing, and Formatting. Ph.D. thesis, CAN (1996), aAINN09397</paragraph>
<paragraph><location><page_13><loc_22><loc_16><loc_79><loc_20></location>- 17. Xue, W., Li, Q., Tao, D.: Res2tim: Reconstruct syntactic structures from table images. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 749-755. IEEE (2019)</paragraph>
<paragraph><location><page_14><loc_22><loc_81><loc_79><loc_85></location>- 18. Xue, W., Yu, B., Wang, W., Tao, D., Li, Q.: Tgrnet: A table graph reconstruction network for table structure recognition. In: Proceedings of the IEEE/CVF International Conference on Computer Vision. pp. 1295-1304 (2021)</paragraph>
<paragraph><location><page_14><loc_22><loc_76><loc_79><loc_81></location>- 19. Ye, J., Qi, X., He, Y., Chen, Y., Gu, D., Gao, P., Xiao, R.: Pingan-vcgroup's solution for icdar 2021 competition on scientific literature parsing task b: Table recognition to html (2021). https://doi.org/10.48550/ARXIV.2105.01848 , https://arxiv.org/abs/2105.01848</paragraph>
<paragraph><location><page_14><loc_22><loc_73><loc_79><loc_75></location>- 20. Zhang, Z., Zhang, J., Du, J., Wang, F.: Split, embed and merge: An accurate table structure recognizer. Pattern Recognition 126 , 108565 (2022)</paragraph>
<paragraph><location><page_14><loc_22><loc_66><loc_79><loc_73></location>- 21. Zheng, X., Burdick, D., Popa, L., Zhong, X., Wang, N.X.R.: Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. In: 2021 IEEE Winter Conference on Applications of Computer Vision (WACV). pp. 697-706 (2021). https://doi.org/10.1109/WACV48630.2021. 00074</paragraph>
<paragraph><location><page_14><loc_22><loc_60><loc_79><loc_66></location>- 22. Zhong, X., ShafieiBavani, E., Jimeno Yepes, A.: Image-based table recognition: Data, model, and evaluation. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.M. (eds.) Computer Vision - ECCV 2020. pp. 564-580. Springer International Publishing, Cham (2020)</paragraph>
<paragraph><location><page_14><loc_22><loc_56><loc_79><loc_60></location>- 23. Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for document layout analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1015-1022. IEEE (2019)</paragraph>
</document>
File diff suppressed because one or more lines are too long
@@ -66,15 +66,15 @@ In Figure 3, we illustrate how the OTSL is defined. In essence, the OTSL defines
The OTSL vocabulary is comprised of the following tokens:
-"C" cell a new table cell that either has or does not have cell content
- -"C" cell a new table cell that either has or does not have cell content
-"L" cell left-looking cell , merging with the left neighbor cell to create a span
- -"L" cell left-looking cell , merging with the left neighbor cell to create a span
-"U" cell up-looking cell , merging with the upper neighbor cell to create a span
- -"U" cell up-looking cell , merging with the upper neighbor cell to create a span
-"X" cell cross cell , to merge with both left and upper neighbor cells
- -"X" cell cross cell , to merge with both left and upper neighbor cells
-"NL" new-line , switch to the next row.
- -"NL" new-line , switch to the next row.
A notable attribute of OTSL is that it has the capability of achieving lossless conversion to HTML.
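To make the cell tokens above concrete, here is a minimal illustrative sketch (not taken from the paper or from the test fixtures; the function name `otsl_to_grid` and the tuple-based grid are assumptions for this example). It expands an OTSL token sequence into a grid that records, for each position, the top-left "C" cell it belongs to, which is the information needed to rebuild HTML row and column spans losslessly.

```python
# Illustrative sketch: map an OTSL token sequence onto an atomic 2D grid.
# Each grid position records the (row, col) of the "C" cell it belongs to,
# so spans can be recovered by counting repeated origins.
def otsl_to_grid(tokens: list[str]) -> list[list[tuple[int, int]]]:
    grid: list[list[tuple[int, int]]] = [[]]
    for t in tokens:
        if t == "NL":                 # new-line: switch to the next row
            grid.append([])
            continue
        r, c = len(grid) - 1, len(grid[-1])
        if t == "C":                  # a new cell starts at this position
            origin = (r, c)
        elif t == "L":                # merge with the left neighbour cell
            origin = grid[r][c - 1]
        elif t == "U":                # merge with the upper neighbour cell
            origin = grid[r - 1][c]
        else:                         # "X": left and upper origins coincide
            origin = grid[r - 1][c]
        grid[-1].append(origin)
    return [row for row in grid if row]

# A 2x2 table whose first row is a single cell spanning both columns:
print(otsl_to_grid(["C", "L", "NL", "C", "C", "NL"]))
# [[(0, 0), (0, 0)], [(1, 0), (1, 1)]]
```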
@@ -85,19 +85,19 @@ Fig. 3. OTSL description of table structure: A - table example; B - graphical re
The OTSL representation follows these syntax rules:
1. Left-looking cell rule : The left neighbour of an "L" cell must be either another "L" cell or a "C" cell.
- 1. Left-looking cell rule : The left neighbour of an "L" cell must be either another "L" cell or a "C" cell.
2. Up-looking cell rule : The upper neighbour of a "U" cell must be either another "U" cell or a "C" cell.
- 2. Up-looking cell rule : The upper neighbour of a "U" cell must be either another "U" cell or a "C" cell.
## 3. Cross cell rule :
The left neighbour of an "X" cell must be either another "X" cell or a "U" cell, and the upper neighbour of an "X" cell must be either another "X" cell or an "L" cell.
- The left neighbour of an "X" cell must be either another "X" cell or a "U" cell, and the upper neighbour of an "X" cell must be either another "X" cell or an "L" cell.
4. First row rule : Only "L" cells and "C" cells are allowed in the first row.
- 4. First row rule : Only "L" cells and "C" cells are allowed in the first row.
5. First column rule : Only "U" cells and "C" cells are allowed in the first column.
- 5. First column rule : Only "U" cells and "C" cells are allowed in the first column.
6. Rectangular rule : The table representation is always rectangular - all rows must have an equal number of tokens, terminated with "NL" token.
- 6. Rectangular rule : The table representation is always rectangular - all rows must have an equal number of tokens, terminated with "NL" token.
The application of these rules gives OTSL a set of unique properties. First of all, the OTSL enforces a strictly rectangular structure representation, where every new-line token starts a new row. As a consequence, all rows and all columns have exactly the same number of tokens, irrespective of cell spans. Secondly, the OTSL representation is unambiguous: Every table structure is represented in one way. In this representation every table cell corresponds to a "C"-cell token, which in case of spans is always located in the top-left corner of the table cell definition. Third, OTSL syntax rules are only backward-looking. As a consequence, every predicted token can be validated straight during sequence generation by looking at the previously predicted sequence. As such, OTSL can guarantee that every predicted sequence is syntactically valid.
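Because the rules are only backward-looking, a decoder can check each candidate token on the fly against the tokens generated so far. The following is a hedged sketch of such a check (the function name `next_token_is_valid` and the row bookkeeping are assumptions for illustration, not code from the paper or the fixtures):

```python
# Illustrative sketch: validate a candidate OTSL token given the already
# accepted tokens. The first-row and first-column rules fall out of the left
# or upper neighbour being absent (None).
def next_token_is_valid(accepted: list[str], candidate: str) -> bool:
    rows: list[list[str]] = [[]]
    for t in accepted:
        if t == "NL":
            rows.append([])
        else:
            rows[-1].append(t)
    row, col = rows[-1], len(rows[-1])
    if candidate == "NL":             # rectangular rule: rows have equal length
        return len(rows) == 1 or len(row) == len(rows[0])
    if len(rows) > 1 and col >= len(rows[0]):
        return False                  # row would exceed the first row's length
    left = row[-1] if col > 0 else None
    up = rows[-2][col] if len(rows) > 1 and col < len(rows[-2]) else None
    if candidate == "C":
        return True
    if candidate == "L":              # left-looking cell rule
        return left in ("L", "C")
    if candidate == "U":              # up-looking cell rule
        return up in ("U", "C")
    if candidate == "X":              # cross cell rule
        return left in ("X", "U") and up in ("X", "L")
    return False
```

During autoregressive generation, masking out candidates for which this check fails is what allows structural errors to be detected and corrected on the fly.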
@@ -177,48 +177,48 @@ Secondly, OTSL has more inherent structure and a significantly restricted vocabu
## References
1. Auer, C., Dolfi, M., Carvalho, A., Ramis, C.B., Staar, P.W.J.: Delivering document conversion as a cloud service with high throughput and responsiveness. CoRR abs/2206.00785 (2022). https://doi.org/10.48550/arXiv.2206.00785 , https://doi.org/10.48550/arXiv.2206.00785
- 1. Auer, C., Dolfi, M., Carvalho, A., Ramis, C.B., Staar, P.W.J.: Delivering document conversion as a cloud service with high throughput and responsiveness. CoRR abs/2206.00785 (2022). https://doi.org/10.48550/arXiv.2206.00785 , https://doi.org/10.48550/arXiv.2206.00785
2. Chen, B., Peng, D., Zhang, J., Ren, Y., Jin, L.: Complex table structure recognition in the wild using transformer and identity matrix-based augmentation. In: Porwal, U., Fornés, A., Shafait, F. (eds.) Frontiers in Handwriting Recognition. pp. 545561. Springer International Publishing, Cham (2022)
- 2. Chen, B., Peng, D., Zhang, J., Ren, Y., Jin, L.: Complex table structure recognition in the wild using transformer and identity matrix-based augmentation. In: Porwal, U., Fornés, A., Shafait, F. (eds.) Frontiers in Handwriting Recognition. pp. 545561. Springer International Publishing, Cham (2022)
3. Chi, Z., Huang, H., Xu, H.D., Yu, H., Yin, W., Mao, X.L.: Complicated table structure recognition. arXiv preprint arXiv:1908.04729 (2019)
- 3. Chi, Z., Huang, H., Xu, H.D., Yu, H., Yin, W., Mao, X.L.: Complicated table structure recognition. arXiv preprint arXiv:1908.04729 (2019)
4. Deng, Y., Rosenberg, D., Mann, G.: Challenges in end-to-end neural scientific table recognition. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 894-901. IEEE (2019)
- 4. Deng, Y., Rosenberg, D., Mann, G.: Challenges in end-to-end neural scientific table recognition. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 894-901. IEEE (2019)
5. Kayal, P., Anand, M., Desai, H., Singh, M.: Tables to latex: structure and content extraction from scientific tables. International Journal on Document Analysis and Recognition (IJDAR) pp. 1-10 (2022)
- 5. Kayal, P., Anand, M., Desai, H., Singh, M.: Tables to latex: structure and content extraction from scientific tables. International Journal on Document Analysis and Recognition (IJDAR) pp. 1-10 (2022)
6. Lee, E., Kwon, J., Yang, H., Park, J., Lee, S., Koo, H.I., Cho, N.I.: Table structure recognition based on grid shape graph. In: 2022 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC). pp. 18681873. IEEE (2022)
- 6. Lee, E., Kwon, J., Yang, H., Park, J., Lee, S., Koo, H.I., Cho, N.I.: Table structure recognition based on grid shape graph. In: 2022 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC). pp. 18681873. IEEE (2022)
7. Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: A benchmark dataset for table detection and recognition (2019)
- 7. Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: A benchmark dataset for table detection and recognition (2019)
8. Livathinos, N., Berrospi, C., Lysak, M., Kuropiatnyk, V., Nassar, A., Carvalho, A., Dolfi, M., Auer, C., Dinkla, K., Staar, P.: Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence 35 (17), 15137-15145 (May 2021), https://ojs.aaai.org/index.php/ AAAI/article/view/17777
- 8. Livathinos, N., Berrospi, C., Lysak, M., Kuropiatnyk, V., Nassar, A., Carvalho, A., Dolfi, M., Auer, C., Dinkla, K., Staar, P.: Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence 35 (17), 15137-15145 (May 2021), https://ojs.aaai.org/index.php/ AAAI/article/view/17777
9. Nassar, A., Livathinos, N., Lysak, M., Staar, P.: Tableformer: Table structure understanding with transformers. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 4614-4623 (June 2022)
- 9. Nassar, A., Livathinos, N., Lysak, M., Staar, P.: Tableformer: Table structure understanding with transformers. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 4614-4623 (June 2022)
10. Pfitzmann, B., Auer, C., Dolfi, M., Nassar, A.S., Staar, P.W.J.: Doclaynet: A large human-annotated dataset for document-layout segmentation. In: Zhang, A., Rangwala, H. (eds.) KDD '22: The 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, Washington, DC, USA, August 14 - 18, 2022. pp. 3743-3751. ACM (2022). https://doi.org/10.1145/3534678.3539043 , https:// doi.org/10.1145/3534678.3539043
- 10. Pfitzmann, B., Auer, C., Dolfi, M., Nassar, A.S., Staar, P.W.J.: Doclaynet: A large human-annotated dataset for document-layout segmentation. In: Zhang, A., Rangwala, H. (eds.) KDD '22: The 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, Washington, DC, USA, August 14 - 18, 2022. pp. 3743-3751. ACM (2022). https://doi.org/10.1145/3534678.3539043 , https:// doi.org/10.1145/3534678.3539043
11. Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet: An approach for end to end table detection and structure recognition from imagebased documents. In: Proceedings of the IEEE/CVF conference on computer vision and pattern recognition workshops. pp. 572-573 (2020)
- 11. Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet: An approach for end to end table detection and structure recognition from imagebased documents. In: Proceedings of the IEEE/CVF conference on computer vision and pattern recognition workshops. pp. 572-573 (2020)
12. Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162-1167. IEEE (2017)
- 12. Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162-1167. IEEE (2017)
13. Siddiqui, S.A., Fateh, I.A., Rizvi, S.T.R., Dengel, A., Ahmed, S.: Deeptabstr: Deep learning based table structure recognition. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1403-1409 (2019). https:// doi.org/10.1109/ICDAR.2019.00226
- 13. Siddiqui, S.A., Fateh, I.A., Rizvi, S.T.R., Dengel, A., Ahmed, S.: Deeptabstr: Deep learning based table structure recognition. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1403-1409 (2019). https:// doi.org/10.1109/ICDAR.2019.00226
14. Smock, B., Pesala, R., Abraham, R.: PubTables-1M: Towards comprehensive table extraction from unstructured documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 4634-4642 (June 2022)
- 14. Smock, B., Pesala, R., Abraham, R.: PubTables-1M: Towards comprehensive table extraction from unstructured documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 4634-4642 (June 2022)
15. Staar, P.W.J., Dolfi, M., Auer, C., Bekas, C.: Corpus conversion service: A machine learning platform to ingest documents at scale. In: Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. pp. 774-782. KDD '18, Association for Computing Machinery, New York, NY, USA (2018). https://doi.org/10.1145/3219819.3219834 , https://doi.org/10. 1145/3219819.3219834
- 15. Staar, P.W.J., Dolfi, M., Auer, C., Bekas, C.: Corpus conversion service: A machine learning platform to ingest documents at scale. In: Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. pp. 774-782. KDD '18, Association for Computing Machinery, New York, NY, USA (2018). https://doi.org/10.1145/3219819.3219834 , https://doi.org/10. 1145/3219819.3219834
16. Wang, X.: Tabular Abstraction, Editing, and Formatting. Ph.D. thesis, CAN (1996), aAINN09397
- 16. Wang, X.: Tabular Abstraction, Editing, and Formatting. Ph.D. thesis, CAN (1996), aAINN09397
17. Xue, W., Li, Q., Tao, D.: Res2tim: Reconstruct syntactic structures from table images. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 749-755. IEEE (2019)
- 17. Xue, W., Li, Q., Tao, D.: Res2tim: Reconstruct syntactic structures from table images. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 749-755. IEEE (2019)
18. Xue, W., Yu, B., Wang, W., Tao, D., Li, Q.: Tgrnet: A table graph reconstruction network for table structure recognition. In: Proceedings of the IEEE/CVF International Conference on Computer Vision. pp. 1295-1304 (2021)
- 18. Xue, W., Yu, B., Wang, W., Tao, D., Li, Q.: Tgrnet: A table graph reconstruction network for table structure recognition. In: Proceedings of the IEEE/CVF International Conference on Computer Vision. pp. 1295-1304 (2021)
19. Ye, J., Qi, X., He, Y., Chen, Y., Gu, D., Gao, P., Xiao, R.: Pingan-vcgroup's solution for icdar 2021 competition on scientific literature parsing task b: Table recognition to html (2021). https://doi.org/10.48550/ARXIV.2105.01848 , https://arxiv.org/abs/2105.01848
- 19. Ye, J., Qi, X., He, Y., Chen, Y., Gu, D., Gao, P., Xiao, R.: Pingan-vcgroup's solution for icdar 2021 competition on scientific literature parsing task b: Table recognition to html (2021). https://doi.org/10.48550/ARXIV.2105.01848 , https://arxiv.org/abs/2105.01848
20. Zhang, Z., Zhang, J., Du, J., Wang, F.: Split, embed and merge: An accurate table structure recognizer. Pattern Recognition 126 , 108565 (2022)
- 20. Zhang, Z., Zhang, J., Du, J., Wang, F.: Split, embed and merge: An accurate table structure recognizer. Pattern Recognition 126 , 108565 (2022)
21. Zheng, X., Burdick, D., Popa, L., Zhong, X., Wang, N.X.R.: Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. In: 2021 IEEE Winter Conference on Applications of Computer Vision (WACV). pp. 697-706 (2021). https://doi.org/10.1109/WACV48630.2021. 00074
- 21. Zheng, X., Burdick, D., Popa, L., Zhong, X., Wang, N.X.R.: Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. In: 2021 IEEE Winter Conference on Applications of Computer Vision (WACV). pp. 697-706 (2021). https://doi.org/10.1109/WACV48630.2021. 00074
22. Zhong, X., ShafieiBavani, E., Jimeno Yepes, A.: Image-based table recognition: Data, model, and evaluation. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.M. (eds.) Computer Vision - ECCV 2020. pp. 564-580. Springer International Publishing, Cham (2020)
- 22. Zhong, X., ShafieiBavani, E., Jimeno Yepes, A.: Image-based table recognition: Data, model, and evaluation. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.M. (eds.) Computer Vision - ECCV 2020. pp. 564-580. Springer International Publishing, Cham (2020)
23. Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for document layout analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1015-1022. IEEE (2019)
- 23. Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for document layout analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1015-1022. IEEE (2019)
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
@@ -26,12 +26,12 @@
<subtitle-level-1><location><page_4><loc_11><loc_89><loc_35><loc_91></location>IBM Z: An overview</subtitle-level-1>
<paragraph><location><page_4><loc_22><loc_80><loc_88><loc_87></location>Ever wonder how many transactions a bank processes per day? What about the pace at which these transactions happen? According to an IBMfi report, 44 of 50 of the world's top banks use IBM Z mainframes for these daily transactions.$^{2}$ IBM Z is a platform that is designed for voluminous data, maximum security, real-time transaction analysis, and cost efficiency.</paragraph>
<paragraph><location><page_4><loc_22><loc_75><loc_84><loc_78></location>The most recent platform for IBM Z is IBM z16™. The IBM z16 supports the following features:</paragraph>
<paragraph><location><page_4><loc_22><loc_73><loc_42><loc_75></location>GLYPH<SM590000> On-chip AI acceleration</paragraph>
<paragraph><location><page_4><loc_22><loc_71><loc_47><loc_72></location>GLYPH<SM590000> Quantum-safe crypto discovery</paragraph>
<paragraph><location><page_4><loc_22><loc_69><loc_41><loc_70></location>GLYPH<SM590000> Simplified compliance</paragraph>
<paragraph><location><page_4><loc_22><loc_67><loc_37><loc_68></location>GLYPH<SM590000> Flexible capacity</paragraph>
<paragraph><location><page_4><loc_22><loc_65><loc_46><loc_66></location>GLYPH<SM590000> Modernization of applications</paragraph>
<paragraph><location><page_4><loc_22><loc_62><loc_34><loc_64></location>GLYPH<SM590000> Sustainability</paragraph>
<paragraph><location><page_4><loc_22><loc_73><loc_42><loc_75></location>- GLYPH<SM590000> On-chip AI acceleration</paragraph>
<paragraph><location><page_4><loc_22><loc_71><loc_47><loc_72></location>- GLYPH<SM590000> Quantum-safe crypto discovery</paragraph>
<paragraph><location><page_4><loc_22><loc_69><loc_41><loc_70></location>- GLYPH<SM590000> Simplified compliance</paragraph>
<paragraph><location><page_4><loc_22><loc_67><loc_37><loc_68></location>- GLYPH<SM590000> Flexible capacity</paragraph>
<paragraph><location><page_4><loc_22><loc_65><loc_46><loc_66></location>- GLYPH<SM590000> Modernization of applications</paragraph>
<paragraph><location><page_4><loc_22><loc_62><loc_34><loc_64></location>- GLYPH<SM590000> Sustainability</paragraph>
<paragraph><location><page_4><loc_22><loc_58><loc_85><loc_61></location>With these features, enterprises can upgrade applications while preserving secure and resilient data.</paragraph>
<paragraph><location><page_4><loc_22><loc_55><loc_71><loc_57></location>To learn more about these features, see the IBM z16 product page.</paragraph>
<paragraph><location><page_4><loc_22><loc_53><loc_68><loc_54></location>Figure 1 on page 3 shows a picture of the IBM z16 mainframe.</paragraph>
@@ -80,19 +80,19 @@
<caption>Figure 6 Solution overview of Cloud Pak for Data</caption>
</figure>
<paragraph><location><page_10><loc_22><loc_35><loc_85><loc_36></location>We highlight the four main pillars that make IBM Z the correct infrastructure for CP4D:</paragraph>
<paragraph><location><page_10><loc_22><loc_33><loc_42><loc_34></location>GLYPH<SM590000> Performance and Scale</paragraph>
<paragraph><location><page_10><loc_22><loc_31><loc_42><loc_32></location>GLYPH<SM590000> Embedded Accelerators</paragraph>
<paragraph><location><page_10><loc_22><loc_28><loc_43><loc_30></location>GLYPH<SM590000> Reliability and Availability</paragraph>
<paragraph><location><page_10><loc_22><loc_26><loc_44><loc_28></location>GLYPH<SM590000> Security and Governance.</paragraph>
<paragraph><location><page_10><loc_22><loc_33><loc_42><loc_34></location>- GLYPH<SM590000> Performance and Scale</paragraph>
<paragraph><location><page_10><loc_22><loc_31><loc_42><loc_32></location>- GLYPH<SM590000> Embedded Accelerators</paragraph>
<paragraph><location><page_10><loc_22><loc_28><loc_43><loc_30></location>- GLYPH<SM590000> Reliability and Availability</paragraph>
<paragraph><location><page_10><loc_22><loc_26><loc_44><loc_28></location>- GLYPH<SM590000> Security and Governance.</paragraph>
<paragraph><location><page_10><loc_22><loc_13><loc_89><loc_25></location>From a performance perspective, CP4D on IBM Z provides your data and AI with high transaction processing and a powerful infrastructure. From the embedded accelerators perspective, CP4D on IBM Z can investigate each transaction thanks to a cutting-edge DL inference technology even in the most demanding, sensitive, and latency-prone real-time workloads. From a reliability perspective, CP4D on IBM Z provides high availability and resiliency. Lastly from the security perspective, CP4D on IBM Z is suitable for protecting sensitive data and AI models for enterprises in highly regulated industries or those industries that are worried about security.</paragraph>
<subtitle-level-1><location><page_11><loc_11><loc_89><loc_85><loc_91></location>Cloud Pak for Data capabilities on IBM Z and IBM LinuxONE</subtitle-level-1>
<paragraph><location><page_11><loc_22><loc_81><loc_89><loc_87></location>With CP4D on IBM Z and IBM LinuxONE, users can develop, train, and deploy AI and ML models. Users can accomplish this task by using the CP4D IBM Watsonfi Studio and IBM Watson Machine Learning (WLM) services. By using these two fundamental services, users can accomplish the following tasks:</paragraph>
<paragraph><location><page_11><loc_22><loc_79><loc_56><loc_80></location>GLYPH<SM590000> Provision various containerized databases.</paragraph>
<paragraph><location><page_11><loc_22><loc_77><loc_69><loc_78></location>GLYPH<SM590000> Explore, clean, shape, and alter data by using Data Refinery.</paragraph>
<paragraph><location><page_11><loc_22><loc_75><loc_74><loc_76></location>GLYPH<SM590000> Use project-specific data that is uploaded, or connect to distant data.</paragraph>
<paragraph><location><page_11><loc_22><loc_73><loc_54><loc_74></location>GLYPH<SM590000> Create Spark run times and applications.</paragraph>
<paragraph><location><page_11><loc_22><loc_70><loc_89><loc_72></location>GLYPH<SM590000> Create, build, evaluate, and deploy analytics and ML models with trust and transparency.</paragraph>
<paragraph><location><page_11><loc_22><loc_68><loc_82><loc_70></location>GLYPH<SM590000> Leverage the AI Integrated Accelerator for TensorFlow 2.7.2 and Snap ML 1.9.</paragraph>
<paragraph><location><page_11><loc_22><loc_79><loc_56><loc_80></location>- GLYPH<SM590000> Provision various containerized databases.</paragraph>
<paragraph><location><page_11><loc_22><loc_77><loc_69><loc_78></location>- GLYPH<SM590000> Explore, clean, shape, and alter data by using Data Refinery.</paragraph>
<paragraph><location><page_11><loc_22><loc_75><loc_74><loc_76></location>- GLYPH<SM590000> Use project-specific data that is uploaded, or connect to distant data.</paragraph>
<paragraph><location><page_11><loc_22><loc_73><loc_54><loc_74></location>- GLYPH<SM590000> Create Spark run times and applications.</paragraph>
<paragraph><location><page_11><loc_22><loc_70><loc_89><loc_72></location>- GLYPH<SM590000> Create, build, evaluate, and deploy analytics and ML models with trust and transparency.</paragraph>
<paragraph><location><page_11><loc_22><loc_68><loc_82><loc_70></location>- GLYPH<SM590000> Leverage the AI Integrated Accelerator for TensorFlow 2.7.2 and Snap ML 1.9.</paragraph>
<paragraph><location><page_11><loc_22><loc_64><loc_88><loc_67></location>For more information about the specifics of these capabilities, see Capabilities on Linux on IBM Z and IBM LinuxONE.</paragraph>
<subtitle-level-1><location><page_11><loc_11><loc_59><loc_41><loc_61></location>Open-source ecosystem</subtitle-level-1>
<paragraph><location><page_11><loc_22><loc_48><loc_89><loc_56></location>These days, innovation and product development are not limited to closed doors within an organization. In any industry sector, the solutions include a mix of proprietary code addressing the core business solution that is supported or integrated into other software components from open source. In some cases, enterprises business solutions also are built from open-source community offerings. Thus, open-source software becomes an important ingredient in modern-day solution building.</paragraph>
@@ -114,12 +114,12 @@
<caption>Figure 7 Developing, training, and deploying an AI model on Cloud Pak for Data on IBM Z and IBM LinuxONE</caption>
</figure>
<paragraph><location><page_13><loc_22><loc_51><loc_81><loc_53></location>In summary, here are some of the reasons why you should choose AI on IBM Z:</paragraph>
<paragraph><location><page_13><loc_22><loc_49><loc_68><loc_50></location>GLYPH<SM590000> World-class AI inference platform for enterprise workloads:</paragraph>
<paragraph><location><page_13><loc_25><loc_46><loc_86><loc_48></location>-Embedded accelerators: A centralized on-chip AI accelerator that is shared by all cores.</paragraph>
<paragraph><location><page_13><loc_25><loc_42><loc_89><loc_45></location>-Industry standard AI ecosystem: Many industry open-source data science frameworks are available on the platform.</paragraph>
<paragraph><location><page_13><loc_25><loc_38><loc_89><loc_41></location>-Seamlessly integrate AI into existing enterprise workload stacks: Train anywhere, and then deploy on IBM Z.</paragraph>
<paragraph><location><page_13><loc_22><loc_36><loc_80><loc_37></location>GLYPH<SM590000> Security: Encrypted memory, and improved trusted execution environments.</paragraph>
<paragraph><location><page_13><loc_22><loc_32><loc_89><loc_35></location>GLYPH<SM590000> Sustainability: Reduce your energy consumption with real-time monitoring tools about the energy consumption of the system.</paragraph>
<paragraph><location><page_13><loc_22><loc_49><loc_68><loc_50></location>- GLYPH<SM590000> World-class AI inference platform for enterprise workloads:</paragraph>
<paragraph><location><page_13><loc_25><loc_46><loc_86><loc_48></location>- -Embedded accelerators: A centralized on-chip AI accelerator that is shared by all cores.</paragraph>
<paragraph><location><page_13><loc_25><loc_42><loc_89><loc_45></location>- -Industry standard AI ecosystem: Many industry open-source data science frameworks are available on the platform.</paragraph>
<paragraph><location><page_13><loc_25><loc_38><loc_89><loc_41></location>- -Seamlessly integrate AI into existing enterprise workload stacks: Train anywhere, and then deploy on IBM Z.</paragraph>
<paragraph><location><page_13><loc_22><loc_36><loc_80><loc_37></location>- GLYPH<SM590000> Security: Encrypted memory, and improved trusted execution environments.</paragraph>
<paragraph><location><page_13><loc_22><loc_32><loc_89><loc_35></location>- GLYPH<SM590000> Sustainability: Reduce your energy consumption with real-time monitoring tools about the energy consumption of the system.</paragraph>
|
||||
<subtitle-level-1><location><page_13><loc_11><loc_27><loc_26><loc_29></location>AI use cases</subtitle-level-1>
|
||||
<paragraph><location><page_13><loc_22><loc_21><loc_87><loc_25></location>With billions of transactions per day in many of today's industries, it is key to get real-time insights about what is happening in your data. AI on the IBM Z stack understands these situations, and it delivers in-transaction inference in real time and at scale.</paragraph>
|
||||
<paragraph><location><page_13><loc_22><loc_13><loc_89><loc_19></location>Core banking solutions running on IBM Z that are involved in processing inbound transactions need real-time fraud detection to prevent fraud. Other types of possible use cases might be credit risk analysis, anti-money laundering, loan approval, fraud detection in payments, and instant payments.</paragraph>
|
||||
@@ -127,35 +127,35 @@
|
||||
<paragraph><location><page_14><loc_22><loc_87><loc_86><loc_91></location>For the health care industry, medical image processing (such as MRIs and x-rays), skin cancer detection, and patient monitoring activities such as infant motion analysis, is important.</paragraph>
|
||||
<paragraph><location><page_14><loc_22><loc_81><loc_89><loc_85></location>For the airline industry, processes such as air traffic management, flight management systems, and flight maintenance predictions are use cases that are ideal candidates for using AI on IBM Z.</paragraph>
|
||||
<paragraph><location><page_14><loc_22><loc_78><loc_68><loc_79></location>In the following sections, we describe the following use cases:</paragraph>
|
||||
<paragraph><location><page_14><loc_22><loc_71><loc_89><loc_77></location>GLYPH<SM590000> "Use case 1: Responsible AI augmented with risk and regulatory compliance" on page 12 AI model lifecycle governance, risk management, and regulatory compliance are key to the success of the enterprises. It is imperative to adopt a typical AI model lifecycle to protect new end-to-end risks.</paragraph>
|
||||
<paragraph><location><page_14><loc_22><loc_69><loc_66><loc_70></location>GLYPH<SM590000> "Use case 2: Credit default risk assessment" on page 22</paragraph>
|
||||
<paragraph><location><page_14><loc_25><loc_62><loc_89><loc_68></location>Core banking solutions running on IBM Z that are involved in processing inbound transactions need real-time fraud detection to prevent fraud. Other types of possible use cases might be credit risk analysis, anti-money laundering, loan approval, fraud detection in payments, and instant payments.</paragraph>
|
||||
<paragraph><location><page_14><loc_22><loc_60><loc_61><loc_61></location>GLYPH<SM590000> "Use case 3: Clearing and settlement" on page 25</paragraph>
|
||||
<paragraph><location><page_14><loc_25><loc_56><loc_88><loc_59></location>The use of AI can help to predict which trades or transactions have high risk exposures, and propose solutions for a more efficient settlement process.</paragraph>
|
||||
<paragraph><location><page_14><loc_22><loc_54><loc_74><loc_55></location>GLYPH<SM590000> "Use case 4: Remaining Useful Life of an aircraft engine" on page 27</paragraph>
|
||||
<paragraph><location><page_14><loc_25><loc_50><loc_87><loc_53></location>We describe how AI can help to avoid unplanned aircraft downtime by determining the remaining time or cycles that an aircraft engine is likely to operate before failure.</paragraph>
|
||||
<paragraph><location><page_14><loc_22><loc_47><loc_88><loc_50></location>GLYPH<SM590000> "Use case 5: AI-powered video analytics on an infant's motions for health prediction" on page 30</paragraph>
|
||||
<paragraph><location><page_14><loc_25><loc_43><loc_89><loc_46></location>In this section, we describe how AI can predict an infant's health conditions by monitoring real-time body movements.</paragraph>
|
||||
<paragraph><location><page_14><loc_22><loc_71><loc_89><loc_77></location>- GLYPH<SM590000> "Use case 1: Responsible AI augmented with risk and regulatory compliance" on page 12 AI model lifecycle governance, risk management, and regulatory compliance are key to the success of the enterprises. It is imperative to adopt a typical AI model lifecycle to protect new end-to-end risks.</paragraph>
|
||||
<paragraph><location><page_14><loc_22><loc_69><loc_66><loc_70></location>- GLYPH<SM590000> "Use case 2: Credit default risk assessment" on page 22</paragraph>
|
||||
<paragraph><location><page_14><loc_25><loc_62><loc_89><loc_68></location>- Core banking solutions running on IBM Z that are involved in processing inbound transactions need real-time fraud detection to prevent fraud. Other types of possible use cases might be credit risk analysis, anti-money laundering, loan approval, fraud detection in payments, and instant payments.</paragraph>
|
||||
<paragraph><location><page_14><loc_22><loc_60><loc_61><loc_61></location>- GLYPH<SM590000> "Use case 3: Clearing and settlement" on page 25</paragraph>
|
||||
<paragraph><location><page_14><loc_25><loc_56><loc_88><loc_59></location>- The use of AI can help to predict which trades or transactions have high risk exposures, and propose solutions for a more efficient settlement process.</paragraph>
|
||||
<paragraph><location><page_14><loc_22><loc_54><loc_74><loc_55></location>- GLYPH<SM590000> "Use case 4: Remaining Useful Life of an aircraft engine" on page 27</paragraph>
|
||||
<paragraph><location><page_14><loc_25><loc_50><loc_87><loc_53></location>- We describe how AI can help to avoid unplanned aircraft downtime by determining the remaining time or cycles that an aircraft engine is likely to operate before failure.</paragraph>
|
||||
<paragraph><location><page_14><loc_22><loc_47><loc_88><loc_50></location>- GLYPH<SM590000> "Use case 5: AI-powered video analytics on an infant's motions for health prediction" on page 30</paragraph>
|
||||
<paragraph><location><page_14><loc_25><loc_43><loc_89><loc_46></location>- In this section, we describe how AI can predict an infant's health conditions by monitoring real-time body movements.</paragraph>
|
||||
<subtitle-level-1><location><page_14><loc_11><loc_35><loc_89><loc_40></location>Use case 1: Responsible AI augmented with risk and regulatory compliance</subtitle-level-1>
|
||||
<paragraph><location><page_14><loc_22><loc_27><loc_89><loc_33></location>Advancement in AI is changing the world, and organizations must adopt AI to embrace new challenges daily. Many enterprises see tremendous value in adopting AI and ML technologies while establishing organization trust in the models, underlying data, and the process to be followed. An AI model lifecycle can be a daunting task.</paragraph>
|
||||
<paragraph><location><page_14><loc_22><loc_23><loc_89><loc_26></location>How mature is your AI governance? In this section, we provide a use case demonstrating the trustworthiness of AI and its importance in daily monitoring.</paragraph>
|
||||
<subtitle-level-1><location><page_14><loc_11><loc_19><loc_31><loc_21></location>Industry challenges</subtitle-level-1>
|
||||
<paragraph><location><page_14><loc_22><loc_16><loc_83><loc_17></location>Here are the three main reasons why organizations struggle with the adoption of AI:</paragraph>
|
||||
<paragraph><location><page_14><loc_22><loc_14><loc_48><loc_15></location>GLYPH<SM590000> Scaling with growing regulations</paragraph>
|
||||
<paragraph><location><page_14><loc_22><loc_12><loc_71><loc_13></location>GLYPH<SM590000> Lack of confidence in operationalized AI (making responsible AI)</paragraph>
|
||||
<paragraph><location><page_14><loc_22><loc_9><loc_76><loc_11></location>GLYPH<SM590000> Challenges around managing the risk throughout the entire AI workflow</paragraph>
|
||||
<paragraph><location><page_14><loc_22><loc_14><loc_48><loc_15></location>- GLYPH<SM590000> Scaling with growing regulations</paragraph>
|
||||
<paragraph><location><page_14><loc_22><loc_12><loc_71><loc_13></location>- GLYPH<SM590000> Lack of confidence in operationalized AI (making responsible AI)</paragraph>
|
||||
<paragraph><location><page_14><loc_22><loc_9><loc_76><loc_11></location>- GLYPH<SM590000> Challenges around managing the risk throughout the entire AI workflow</paragraph>
|
||||
<subtitle-level-1><location><page_15><loc_22><loc_90><loc_53><loc_91></location>Scaling with growing regulations</subtitle-level-1>
|
||||
<paragraph><location><page_15><loc_22><loc_80><loc_88><loc_89></location>Laws and regulations in the data and AI space are accelerating, and many countries are proposing strict AI policies. Countries are monitoring adherence of these policies by the enterprises and imposing fines for any violations. Responding to these regulations are challenging global organizations where multiple regulations apply. For enterprises, it is important to adopt AI policies when there is change, and to validate explainable models to protect against discrimination.</paragraph>
|
||||
<subtitle-level-1><location><page_15><loc_22><loc_77><loc_37><loc_78></location>Responsible AI</subtitle-level-1>
|
||||
<paragraph><location><page_15><loc_22><loc_71><loc_89><loc_76></location>Responsible AI protects against loss of data privacy, and reduced customer loyalty and trust. A data scientist cannot maximize accuracy and model performance above all other concerns. Practicing responsible AI is a best practice, and you must establish protection and validation to ensure that any models that are placed into production are fair and explainable.</paragraph>
|
||||
<subtitle-level-1><location><page_15><loc_22><loc_67><loc_59><loc_69></location>Risks throughout the entire AI workflow</subtitle-level-1>
|
||||
<paragraph><location><page_15><loc_22><loc_65><loc_64><loc_67></location>Organizations need to mitigate risk of the following items:</paragraph>
|
||||
<paragraph><location><page_15><loc_22><loc_63><loc_63><loc_65></location>GLYPH<SM590000> Deciding not to use certain technologies or practices</paragraph>
|
||||
<paragraph><location><page_15><loc_22><loc_61><loc_74><loc_62></location>GLYPH<SM590000> Using personal information when needed and with a user's consent</paragraph>
|
||||
<paragraph><location><page_15><loc_22><loc_59><loc_60><loc_60></location>GLYPH<SM590000> Ensuring automated decisions are free from bias</paragraph>
|
||||
<paragraph><location><page_15><loc_22><loc_57><loc_76><loc_58></location>GLYPH<SM590000> Customer confidence by providing explanations for business decisions</paragraph>
|
||||
<paragraph><location><page_15><loc_22><loc_55><loc_63><loc_56></location>GLYPH<SM590000> Fraud to the organization and to customer's accounts</paragraph>
|
||||
<paragraph><location><page_15><loc_22><loc_52><loc_54><loc_54></location>GLYPH<SM590000> Delays in putting models into production</paragraph>
|
||||
<paragraph><location><page_15><loc_22><loc_63><loc_63><loc_65></location>- GLYPH<SM590000> Deciding not to use certain technologies or practices</paragraph>
|
||||
<paragraph><location><page_15><loc_22><loc_61><loc_74><loc_62></location>- GLYPH<SM590000> Using personal information when needed and with a user's consent</paragraph>
|
||||
<paragraph><location><page_15><loc_22><loc_59><loc_60><loc_60></location>- GLYPH<SM590000> Ensuring automated decisions are free from bias</paragraph>
|
||||
<paragraph><location><page_15><loc_22><loc_57><loc_76><loc_58></location>- GLYPH<SM590000> Customer confidence by providing explanations for business decisions</paragraph>
|
||||
<paragraph><location><page_15><loc_22><loc_55><loc_63><loc_56></location>- GLYPH<SM590000> Fraud to the organization and to customer's accounts</paragraph>
|
||||
<paragraph><location><page_15><loc_22><loc_52><loc_54><loc_54></location>- GLYPH<SM590000> Delays in putting models into production</paragraph>
|
||||
<paragraph><location><page_15><loc_22><loc_47><loc_89><loc_51></location>In fact, in a recent survey, these concerns were echoed by real AI adopters when asked what aspects of trust are most important to them. Although explaining how AI decides is the primary concern, all of these concerns are important.</paragraph>
|
||||
<paragraph><location><page_15><loc_22><loc_38><loc_89><loc_45></location>The key point here is that risk exists throughout the entire AI lifecycle starting with the underlying data and the business justification behind the "why" of the project and continuing into production. Without a formalized process, there is no way to mitigate these risks to unlock the scale that is required to make automated decisions profitable. With these decisions, the business can operate proactively instead of reactively.</paragraph>
|
||||
<paragraph><location><page_16><loc_22><loc_85><loc_89><loc_91></location>For example, a business can start testing a model before production for fairness metrics. For this task, enterprises need an end-to-end workflow with approvals to mitigate these risks and increase the scale of AI investments, as shown in Figure 8, which presents a typical AI model lifecycle in an enterprise.</paragraph>
|
||||
@@ -171,23 +171,23 @@
|
||||
<paragraph><location><page_16><loc_22><loc_16><loc_89><loc_21></location>In a world where trust, transparency and explainable AI matters, every organization wants compliance along with the comfort of understanding how analytic insights and decisions are made. The following sections describe some of the principles and organizational requirements for AI governance.</paragraph>
|
||||
<subtitle-level-1><location><page_17><loc_22><loc_90><loc_41><loc_91></location>Lifecycle governance</subtitle-level-1>
|
||||
<paragraph><location><page_17><loc_22><loc_85><loc_89><loc_89></location>Lifecycle governance helps you manage your business information throughout its lifecycle, that is, from creation to deletion. IBM AI governance addresses the problems that challenge records managements:</paragraph>
|
||||
<paragraph><location><page_17><loc_22><loc_83><loc_85><loc_84></location>GLYPH<SM590000> Monitor, catalog, and govern AI models from anywhere throughout the AI lifecycle.</paragraph>
|
||||
<paragraph><location><page_17><loc_22><loc_81><loc_70><loc_82></location>GLYPH<SM590000> Automate the capture of model metadata for report generation.</paragraph>
|
||||
<paragraph><location><page_17><loc_22><loc_78><loc_58><loc_80></location>GLYPH<SM590000> Drive transparent and explainable AI at scale.</paragraph>
|
||||
<paragraph><location><page_17><loc_22><loc_76><loc_87><loc_78></location>GLYPH<SM590000> Increase accuracy of predictions by identifying how AI is used and where it is lagging.</paragraph>
|
||||
<paragraph><location><page_17><loc_22><loc_83><loc_85><loc_84></location>- GLYPH<SM590000> Monitor, catalog, and govern AI models from anywhere throughout the AI lifecycle.</paragraph>
|
||||
<paragraph><location><page_17><loc_22><loc_81><loc_70><loc_82></location>- GLYPH<SM590000> Automate the capture of model metadata for report generation.</paragraph>
|
||||
<paragraph><location><page_17><loc_22><loc_78><loc_58><loc_80></location>- GLYPH<SM590000> Drive transparent and explainable AI at scale.</paragraph>
|
||||
<paragraph><location><page_17><loc_22><loc_76><loc_87><loc_78></location>- GLYPH<SM590000> Increase accuracy of predictions by identifying how AI is used and where it is lagging.</paragraph>
|
||||
<subtitle-level-1><location><page_17><loc_22><loc_73><loc_38><loc_75></location>Risk management</subtitle-level-1>
|
||||
<paragraph><location><page_17><loc_22><loc_70><loc_89><loc_73></location>Risk management is used in IBM AI governance to identify, manage, monitor, and report on risk and compliance initiatives at scale:</paragraph>
|
||||
<paragraph><location><page_17><loc_22><loc_68><loc_81><loc_69></location>GLYPH<SM590000> Automate facts and workflow management to comply with business standards.</paragraph>
|
||||
<paragraph><location><page_17><loc_22><loc_66><loc_74><loc_67></location>GLYPH<SM590000> Use dynamic dashboards for clear and concise customizable results.</paragraph>
|
||||
<paragraph><location><page_17><loc_22><loc_64><loc_72><loc_65></location>GLYPH<SM590000> Enhanced collaboration across multiple regions and geographies.</paragraph>
|
||||
<paragraph><location><page_17><loc_22><loc_68><loc_81><loc_69></location>- GLYPH<SM590000> Automate facts and workflow management to comply with business standards.</paragraph>
|
||||
<paragraph><location><page_17><loc_22><loc_66><loc_74><loc_67></location>- GLYPH<SM590000> Use dynamic dashboards for clear and concise customizable results.</paragraph>
|
||||
<paragraph><location><page_17><loc_22><loc_64><loc_72><loc_65></location>- GLYPH<SM590000> Enhanced collaboration across multiple regions and geographies.</paragraph>
|
||||
<subtitle-level-1><location><page_17><loc_22><loc_61><loc_42><loc_62></location>Regulatory compliance</subtitle-level-1>
|
||||
<paragraph><location><page_17><loc_22><loc_54><loc_89><loc_60></location>Regulatory compliance is a set of rules that organizations must follow to protect sensitive information and ensure human safety. Any business that works with digital assets, consumer data, health regulations, employee safety, and private communications is subject to regulatory compliance.$^{3}$ The IBM AI governance solution for IBM Z includes the following tasks:</paragraph>
|
||||
<paragraph><location><page_17><loc_22><loc_52><loc_71><loc_53></location>GLYPH<SM590000> Help adhere to external AI regulations for audit and compliance.</paragraph>
|
||||
<paragraph><location><page_17><loc_22><loc_50><loc_76><loc_51></location>GLYPH<SM590000> Convert external AI regulations into policies for automatic enforcement.</paragraph>
|
||||
<paragraph><location><page_17><loc_22><loc_48><loc_82><loc_49></location>GLYPH<SM590000> Use dynamic dashboards for compliance status across policies and regulations.</paragraph>
|
||||
<paragraph><location><page_17><loc_22><loc_52><loc_71><loc_53></location>- GLYPH<SM590000> Help adhere to external AI regulations for audit and compliance.</paragraph>
|
||||
<paragraph><location><page_17><loc_22><loc_50><loc_76><loc_51></location>- GLYPH<SM590000> Convert external AI regulations into policies for automatic enforcement.</paragraph>
|
||||
<paragraph><location><page_17><loc_22><loc_48><loc_82><loc_49></location>- GLYPH<SM590000> Use dynamic dashboards for compliance status across policies and regulations.</paragraph>
|
||||
<paragraph><location><page_17><loc_22><loc_40><loc_89><loc_46></location>Enterprises can develop AI models and deploy them by using IBM Watson Studio or WML on CP4D on Red Hat OpenShift on a virtual machine that is based on IBM z/VM or Red Hat Enterprise Linux KVM on IBM Z. AI governance on IBM LinuxONE is supported in the following two ways:</paragraph>
|
||||
<paragraph><location><page_17><loc_22><loc_37><loc_86><loc_40></location>GLYPH<SM590000> Monitor the AI models with Watson OpenScale on CP4D on Red Hat OpenShift on a virtual machine on IBM Z.</paragraph>
|
||||
<paragraph><location><page_17><loc_22><loc_28><loc_89><loc_36></location>GLYPH<SM590000> Enterprises can develop AI models by creating and training models by using Watson Studio and development tools such as Jupyter Notebook or JupyterLab, and then deploying the model onto WML on CP4D on Red Hat OpenShift on a virtual machine on IBM Z. Then, these enterprises can achieve end-end AI governance by running AI Factsheets, IBM Watson OpenScale, and IBM Watson OpenPagesfi on CP4D on x86.</paragraph>
|
||||
<paragraph><location><page_17><loc_22><loc_37><loc_86><loc_40></location>- GLYPH<SM590000> Monitor the AI models with Watson OpenScale on CP4D on Red Hat OpenShift on a virtual machine on IBM Z.</paragraph>
|
||||
<paragraph><location><page_17><loc_22><loc_28><loc_89><loc_36></location>- GLYPH<SM590000> Enterprises can develop AI models by creating and training models by using Watson Studio and development tools such as Jupyter Notebook or JupyterLab, and then deploying the model onto WML on CP4D on Red Hat OpenShift on a virtual machine on IBM Z. Then, these enterprises can achieve end-end AI governance by running AI Factsheets, IBM Watson OpenScale, and IBM Watson OpenPagesfi on CP4D on x86.</paragraph>
|
||||
<paragraph><location><page_17><loc_22><loc_26><loc_84><loc_27></location>Figure 9 on page 16 shows the end-to-end flow for a remote AI governance solution.</paragraph>
|
||||
<caption><location><page_18><loc_11><loc_62><loc_48><loc_63></location>Figure 9 Remote AI governance solution end-to-end flow</caption>
|
||||
<figure>
|
||||
@@ -195,25 +195,25 @@
|
||||
<caption>Figure 9 Remote AI governance solution end-to-end flow</caption>
|
||||
</figure>
|
||||
<paragraph><location><page_18><loc_22><loc_59><loc_72><loc_60></location>To achieve end-to-end AI governance, complete the following steps:</paragraph>
|
||||
<paragraph><location><page_18><loc_22><loc_55><loc_89><loc_58></location>1. Create a model entry in IBM OpenPages by using CP4D on a x86 platform, as shown in Figure 10.</paragraph>
|
||||
<paragraph><location><page_18><loc_22><loc_55><loc_89><loc_58></location>- 1. Create a model entry in IBM OpenPages by using CP4D on a x86 platform, as shown in Figure 10.</paragraph>
|
||||
<caption><location><page_18><loc_10><loc_14><loc_46><loc_16></location>Figure 10 Creating a model entry in IBM OpenPages</caption>
|
||||
<figure>
|
||||
<location><page_18><loc_10><loc_16><loc_89><loc_53></location>
|
||||
<caption>Figure 10 Creating a model entry in IBM OpenPages</caption>
|
||||
</figure>
|
||||
<paragraph><location><page_19><loc_22><loc_87><loc_89><loc_91></location>2. Train a model by using Watson Studio and by using development tools such as Jupyter Notebook or JupyterLab on CP4D on Red Hat OpenShift on a virtual machine on IBM Z, as shown in Figure 11.</paragraph>
|
||||
<paragraph><location><page_19><loc_22><loc_87><loc_89><loc_91></location>- 2. Train a model by using Watson Studio and by using development tools such as Jupyter Notebook or JupyterLab on CP4D on Red Hat OpenShift on a virtual machine on IBM Z, as shown in Figure 11.</paragraph>
|
||||
<caption><location><page_19><loc_11><loc_46><loc_47><loc_47></location>Figure 11 Training an AI model by using Watson Studio</caption>
|
||||
<figure>
|
||||
<location><page_19><loc_10><loc_48><loc_89><loc_85></location>
|
||||
<caption>Figure 11 Training an AI model by using Watson Studio</caption>
|
||||
</figure>
|
||||
<paragraph><location><page_19><loc_22><loc_42><loc_89><loc_45></location>3. Deploy the model by using WML on CP4D on Red Hat OpenShift on a virtual machine on IBM Z, as shown in Figure 12.</paragraph>
|
||||
<paragraph><location><page_19><loc_22><loc_42><loc_89><loc_45></location>- 3. Deploy the model by using WML on CP4D on Red Hat OpenShift on a virtual machine on IBM Z, as shown in Figure 12.</paragraph>
|
||||
<caption><location><page_19><loc_11><loc_7><loc_57><loc_8></location>Figure 12 Deploying an AI model by using WML on Cloud Pak for Data</caption>
|
||||
<figure>
|
||||
<location><page_19><loc_11><loc_9><loc_90><loc_40></location>
|
||||
<caption>Figure 12 Deploying an AI model by using WML on Cloud Pak for Data</caption>
|
||||
</figure>
|
||||
<paragraph><location><page_20><loc_22><loc_85><loc_89><loc_91></location>4. Track the external model lifecycle by browsing through the Catalogs/Platform assets catalog by using AI Factsheets and OpenPages while using CP4D on an x86 platform, as shown in Figure 13. The external model (deployed on CP4D on Red Hat OpenShift on a virtual machine on IBM Z) is saved as a platform asset catalog on the x86 platform.</paragraph>
|
||||
<paragraph><location><page_20><loc_22><loc_85><loc_89><loc_91></location>- 4. Track the external model lifecycle by browsing through the Catalogs/Platform assets catalog by using AI Factsheets and OpenPages while using CP4D on an x86 platform, as shown in Figure 13. The external model (deployed on CP4D on Red Hat OpenShift on a virtual machine on IBM Z) is saved as a platform asset catalog on the x86 platform.</paragraph>
|
||||
<caption><location><page_20><loc_22><loc_50><loc_40><loc_51></location>Figure 13 External model</caption>
|
||||
<figure>
|
||||
<location><page_20><loc_22><loc_51><loc_87><loc_83></location>
|
||||
@@ -231,7 +231,7 @@
|
||||
<location><page_21><loc_10><loc_48><loc_89><loc_86></location>
|
||||
<caption>Figure 15 Model facts that are tracked and synchronized to IBM OpenPages on an x86 platform</caption>
|
||||
</figure>
|
||||
<paragraph><location><page_22><loc_22><loc_88><loc_86><loc_91></location>5. Create an external model by using IBM OpenScale on the x86 platform, as shown in Figure 16.</paragraph>
|
||||
<paragraph><location><page_22><loc_22><loc_88><loc_86><loc_91></location>- 5. Create an external model by using IBM OpenScale on the x86 platform, as shown in Figure 16.</paragraph>
|
||||
<caption><location><page_22><loc_11><loc_50><loc_48><loc_52></location>Figure 16 Creating an external model on an x86 platform</caption>
|
||||
<figure>
|
||||
<location><page_22><loc_10><loc_52><loc_89><loc_86></location>
|
||||
@@ -275,11 +275,11 @@
|
||||
</figure>
|
||||
<paragraph><location><page_26><loc_22><loc_46><loc_87><loc_52></location>Data scientists can start creating and training a DL AI model by using a Jupyter Notebook instance and Watson Studio. Then, they can deploy the model by using WML on CP4D running on IBM Z, which provides an endpoint. Other applications, including the IBM WebSphere server, can produce credit risk results by using the model's endpoint.</paragraph>
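As an illustrative aside (not part of the ground-truth data in this diff), here is a minimal sketch of how a consuming application might call such a WML deployment endpoint over REST. The host, deployment ID, bearer token, feature names, and the exact `/ml/v4/.../predictions` path and version query parameter are assumptions that vary by CP4D release.

```python
# Hedged sketch: online scoring against a WML deployment endpoint on CP4D.
# Host, deployment ID, token, feature names, and the exact API path/version
# query parameter are placeholders, not values taken from this document.
import requests

CPD_HOST = "https://<cpd-cluster-host>"
DEPLOYMENT_ID = "<deployment-id>"
TOKEN = "<bearer-token-from-cp4d>"

scoring_url = (
    f"{CPD_HOST}/ml/v4/deployments/{DEPLOYMENT_ID}/predictions"
    "?version=2021-05-01"  # assumed WML v4 version parameter
)

payload = {
    "input_data": [{
        "fields": ["amount", "balance", "tenure"],  # hypothetical model features
        "values": [[1250.0, 8300.5, 36]],
    }]
}

resp = requests.post(
    scoring_url,
    json=payload,
    headers={"Authorization": f"Bearer {TOKEN}"},
    timeout=30,
)
resp.raise_for_status()
print(resp.json())  # model output, e.g. predicted class and probabilities
```

An application server such as WebSphere would issue the equivalent HTTPS call from its own runtime.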
<paragraph><location><page_26><loc_22><loc_42><loc_89><loc_44></location>In summary, here are some considerations for developing real-time AI models, such as credit risk assessment:</paragraph>
|
||||
<paragraph><location><page_26><loc_22><loc_39><loc_85><loc_41></location>GLYPH<SM590000> A preference for in-platform run times of the model, such as faster execution results.</paragraph>
|
||||
<paragraph><location><page_26><loc_22><loc_37><loc_73><loc_38></location>GLYPH<SM590000> Less overhead in the end-to-end flows might improve scoring time.</paragraph>
|
||||
<paragraph><location><page_26><loc_22><loc_34><loc_89><loc_36></location>GLYPH<SM590000> If you are using models that are not deployable, CP4D offers a custom Python run time to build your own stack if they are not available on the platform.</paragraph>
|
||||
<paragraph><location><page_26><loc_22><loc_30><loc_89><loc_33></location>GLYPH<SM590000> AI inferencing based on ML or DL models can increase the accuracy of better credit risk assessment.</paragraph>
|
||||
<paragraph><location><page_26><loc_22><loc_25><loc_87><loc_29></location>GLYPH<SM590000> Using IBM z16 and on-chip AI acceleration with the Telum chip that is embedded with regular Integrated Facility for Linux (IFLs) provides an execution speed for your transactions that cannot be achieved by other means.</paragraph>
|
||||
<paragraph><location><page_26><loc_22><loc_39><loc_85><loc_41></location>- GLYPH<SM590000> A preference for in-platform run times of the model, such as faster execution results.</paragraph>
|
||||
<paragraph><location><page_26><loc_22><loc_37><loc_73><loc_38></location>- GLYPH<SM590000> Less overhead in the end-to-end flows might improve scoring time.</paragraph>
|
||||
<paragraph><location><page_26><loc_22><loc_34><loc_89><loc_36></location>- GLYPH<SM590000> If you are using models that are not deployable, CP4D offers a custom Python run time to build your own stack if they are not available on the platform.</paragraph>
|
||||
<paragraph><location><page_26><loc_22><loc_30><loc_89><loc_33></location>- GLYPH<SM590000> AI inferencing based on ML or DL models can increase the accuracy of better credit risk assessment.</paragraph>
|
||||
<paragraph><location><page_26><loc_22><loc_25><loc_87><loc_29></location>- GLYPH<SM590000> Using IBM z16 and on-chip AI acceleration with the Telum chip that is embedded with regular Integrated Facility for Linux (IFLs) provides an execution speed for your transactions that cannot be achieved by other means.</paragraph>
|
||||
<subtitle-level-1><location><page_27><loc_11><loc_89><loc_55><loc_91></location>Use case 3: Clearing and settlement</subtitle-level-1>
|
||||
<paragraph><location><page_27><loc_22><loc_80><loc_88><loc_87></location>Clearing and settlements involve banks or financial institutions sending and receiving wire transfers by using secure interbank payments networks that can clear or settle numerous transactions. When an individual or business entity initiates a wire transfer, clearing begins the fund delivery process. Banks can begin the settlement phase either immediately after clearing takes place or later, mostly at the end of the business day.</paragraph>
|
||||
<subtitle-level-1><location><page_27><loc_11><loc_76><loc_29><loc_77></location>Industry challenge</subtitle-level-1>
|
||||
@@ -296,28 +296,28 @@
|
||||
<caption>Figure 21 Clearing and settlement use case for financial transactions by using Cloud Pak for Data</caption>
|
||||
</figure>
|
||||
<paragraph><location><page_28><loc_22><loc_56><loc_58><loc_57></location>Here are the steps of the high-level process flow:</paragraph>
<paragraph><location><page_28><loc_22><loc_53><loc_86><loc_55></location>1. Create a connection to a database (for example, an IBM Db2fi database) where the historical data will be used for ML model building.</paragraph>
<paragraph><location><page_28><loc_22><loc_49><loc_89><loc_52></location>2. Read the data from the database and prepare the data for AI by using the Data Refinery tool in CP4D.</paragraph>
<paragraph><location><page_28><loc_22><loc_44><loc_89><loc_48></location>3. A Jupyter Notebook or JupyterLab IDE that is provided by the Watson Studio component in CP4D helps us build and train the AI model. The trained model can be saved into a WML repository.</paragraph>
<paragraph><location><page_28><loc_22><loc_42><loc_77><loc_43></location>4. Deploy the saved model into a deployment space for batch deployment.</paragraph>
<paragraph><location><page_28><loc_22><loc_39><loc_68><loc_41></location>5. Create a batch deployment by using any of these interfaces:</paragraph>
<paragraph><location><page_28><loc_25><loc_37><loc_75><loc_39></location>a. Watson Studio user interface from an Analytics deployment space.</paragraph>
<paragraph><location><page_28><loc_25><loc_35><loc_41><loc_36></location>b. WML Python client.</paragraph>
<paragraph><location><page_28><loc_25><loc_33><loc_40><loc_34></location>c. WML REST APIs.</paragraph>
<paragraph><location><page_28><loc_22><loc_31><loc_68><loc_32></location>6. A hardware configuration can be chosen for the deployment.</paragraph>
<paragraph><location><page_28><loc_22><loc_27><loc_89><loc_30></location>7. A batch deployment processes input data from a file, data connection, or connected data in a storage bucket, and writes the output to a selected destination.</paragraph>
<paragraph><location><page_28><loc_22><loc_23><loc_83><loc_26></location>8. One way to run batch deployment to predict or score is to create and run a batch deployment job.</paragraph>
<paragraph><location><page_28><loc_22><loc_21><loc_44><loc_23></location>9. Provide an input data type:</paragraph>
<paragraph><location><page_28><loc_25><loc_19><loc_61><loc_20></location>a. Inline data for entering a JSON format payload.</paragraph>
<paragraph><location><page_28><loc_25><loc_17><loc_80><loc_18></location>b. Select Data asset , click Select data source , and then specify your asset.</paragraph>
<paragraph><location><page_28><loc_22><loc_15><loc_77><loc_16></location>10.The output data type can be a new output file or a connected data asset.</paragraph>
<paragraph><location><page_28><loc_22><loc_11><loc_89><loc_14></location>11.A Kubernetes admin can change the maximum number of concurrent batch jobs that can be run.</paragraph>
<paragraph><location><page_28><loc_22><loc_8><loc_87><loc_10></location>12.Get the deployment endpoint URL. For more information, see Getting the deployment endpoint URL.</paragraph>
<paragraph><location><page_28><loc_22><loc_53><loc_86><loc_55></location>- 1. Create a connection to a database (for example, an IBM Db2fi database) where the historical data will be used for ML model building.</paragraph>
<paragraph><location><page_28><loc_22><loc_49><loc_89><loc_52></location>- 2. Read the data from the database and prepare the data for AI by using the Data Refinery tool in CP4D.</paragraph>
<paragraph><location><page_28><loc_22><loc_44><loc_89><loc_48></location>- 3. A Jupyter Notebook or JupyterLab IDE that is provided by the Watson Studio component in CP4D helps us build and train the AI model. The trained model can be saved into a WML repository.</paragraph>
<paragraph><location><page_28><loc_22><loc_42><loc_77><loc_43></location>- 4. Deploy the saved model into a deployment space for batch deployment.</paragraph>
<paragraph><location><page_28><loc_22><loc_39><loc_68><loc_41></location>- 5. Create a batch deployment by using any of these interfaces:</paragraph>
<paragraph><location><page_28><loc_25><loc_37><loc_75><loc_39></location>- a. Watson Studio user interface from an Analytics deployment space.</paragraph>
<paragraph><location><page_28><loc_25><loc_35><loc_41><loc_36></location>- b. WML Python client.</paragraph>
<paragraph><location><page_28><loc_25><loc_33><loc_40><loc_34></location>- c. WML REST APIs.</paragraph>
<paragraph><location><page_28><loc_22><loc_31><loc_68><loc_32></location>- 6. A hardware configuration can be chosen for the deployment.</paragraph>
<paragraph><location><page_28><loc_22><loc_27><loc_89><loc_30></location>- 7. A batch deployment processes input data from a file, data connection, or connected data in a storage bucket, and writes the output to a selected destination.</paragraph>
<paragraph><location><page_28><loc_22><loc_23><loc_83><loc_26></location>- 8. One way to run batch deployment to predict or score is to create and run a batch deployment job.</paragraph>
<paragraph><location><page_28><loc_22><loc_21><loc_44><loc_23></location>- 9. Provide an input data type:</paragraph>
<paragraph><location><page_28><loc_25><loc_19><loc_61><loc_20></location>- a. Inline data for entering a JSON format payload.</paragraph>
<paragraph><location><page_28><loc_25><loc_17><loc_80><loc_18></location>- b. Select Data asset , click Select data source , and then specify your asset.</paragraph>
<paragraph><location><page_28><loc_22><loc_15><loc_77><loc_16></location>- 10.The output data type can be a new output file or a connected data asset.</paragraph>
<paragraph><location><page_28><loc_22><loc_11><loc_89><loc_14></location>- 11.A Kubernetes admin can change the maximum number of concurrent batch jobs that can be run.</paragraph>
<paragraph><location><page_28><loc_22><loc_8><loc_87><loc_10></location>- 12.Get the deployment endpoint URL. For more information, see Getting the deployment endpoint URL.</paragraph>
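As a rough illustration of the batch flow in the numbered steps above (again, not part of the ground-truth data in this diff), the sketch below uses the ibm-watson-machine-learning Python client, one of the interfaces listed in step 5. Credentials, space and deployment IDs, and field names are placeholders, and the exact metadata keys may differ between client versions.

```python
# Hedged sketch: submitting a batch scoring job with the WML Python client.
# All credential values, IDs, and feature names are placeholders.
from ibm_watson_machine_learning import APIClient

wml_credentials = {
    "url": "https://<cpd-cluster-host>",  # CP4D route (assumption)
    "username": "<user>",
    "apikey": "<api-key>",
    "instance_id": "openshift",
    "version": "4.5",
}

client = APIClient(wml_credentials)
client.set.default_space("<deployment-space-id>")  # step 4: deployment space

# Step 9a: inline JSON payload with the model's input fields.
job_meta = {
    client.deployments.ScoringMetaNames.INPUT_DATA: [{
        "fields": ["amount", "channel", "country"],  # hypothetical features
        "values": [[1250.0, "wire", "CH"]],
    }]
}

# Step 8: create and run a batch deployment job against an existing deployment.
job = client.deployments.create_job("<batch-deployment-id>", meta_props=job_meta)
print(job)  # job details, including its ID and state
```

The same operations are exposed through the WML REST APIs; step 12's deployment endpoint URL is what a REST-based integration would call instead.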
<subtitle-level-1><location><page_29><loc_11><loc_89><loc_20><loc_91></location>Summary</subtitle-level-1>
|
||||
<paragraph><location><page_29><loc_22><loc_83><loc_87><loc_88></location>With this use case, we attempted to demonstrate how to predict, in real time, whether the transaction that is being processed might be a fraudulent transaction or not. By using the method, you have the following advantages:</paragraph>
|
||||
<paragraph><location><page_29><loc_22><loc_81><loc_61><loc_83></location>GLYPH<SM590000> No Impact to SLAs and the batch process window.</paragraph>
|
||||
<paragraph><location><page_29><loc_22><loc_79><loc_83><loc_80></location>GLYPH<SM590000> Proactively stop losses, and lower operational, regulatory, and compliance costs.</paragraph>
|
||||
<paragraph><location><page_29><loc_22><loc_76><loc_87><loc_78></location>GLYPH<SM590000> The solution is using a DL framework like TensorFlow for high-performing, low latency scoring.</paragraph>
|
||||
<paragraph><location><page_29><loc_22><loc_81><loc_61><loc_83></location>- GLYPH<SM590000> No Impact to SLAs and the batch process window.</paragraph>
|
||||
<paragraph><location><page_29><loc_22><loc_79><loc_83><loc_80></location>- GLYPH<SM590000> Proactively stop losses, and lower operational, regulatory, and compliance costs.</paragraph>
|
||||
<paragraph><location><page_29><loc_22><loc_76><loc_87><loc_78></location>- GLYPH<SM590000> The solution is using a DL framework like TensorFlow for high-performing, low latency scoring.</paragraph>
|
||||
<subtitle-level-1><location><page_29><loc_11><loc_70><loc_79><loc_72></location>Use case 4: Remaining Useful Life of an aircraft engine</subtitle-level-1>
|
||||
<paragraph><location><page_29><loc_22><loc_65><loc_89><loc_68></location>In this use case, we describe how an airline can deploy an AI model for inferencing by using IBMfi zSystems.</paragraph>
|
||||
<paragraph><location><page_29><loc_22><loc_58><loc_89><loc_64></location>Remaining Useful Life (RUL) is the remaining time or cycles that an aircraft engine is likely to operate without any failure. In this case, it is the equivalent of the number of flights remaining for the engine after the last flight. By estimating RUL, the operator can decide on the next maintenance schedule and avoid unplanned downtime.</paragraph>
|
||||
@@ -342,11 +342,11 @@
|
||||
<caption>Figure 23 In-depth architectural view</caption>
|
||||
</figure>
|
||||
<paragraph><location><page_31><loc_22><loc_39><loc_82><loc_41></location>In summary, consider the following points while developing an AI-based predictive maintenance application:</paragraph>
|
||||
<paragraph><location><page_31><loc_22><loc_33><loc_89><loc_38></location>GLYPH<SM590000> CP4D offers a Python run time to build a custom solution stack, but also supports different components like Watson Studio, WML, Db2, Data Refinery, OpenScale, AI Factsheets, and OpenPages.</paragraph>
|
||||
<paragraph><location><page_31><loc_22><loc_31><loc_80><loc_33></location>GLYPH<SM590000> The trustworthiness of the predicted output is important for critical use cases.</paragraph>
|
||||
<paragraph><location><page_31><loc_22><loc_28><loc_87><loc_30></location>GLYPH<SM590000> IBM Z provides high data security and low latency requirements at scale for the critical applications.</paragraph>
|
||||
<paragraph><location><page_31><loc_22><loc_24><loc_89><loc_27></location>GLYPH<SM590000> A data scientist can choose to train the model and deploy it on CP4D seamlessly with the latest tech stack that is available.</paragraph>
|
||||
<paragraph><location><page_31><loc_22><loc_20><loc_82><loc_23></location>GLYPH<SM590000> The AIOps and MLOps supported by CP4D to track AI model and data lifecycle throughout the application lifecycle.</paragraph>
|
||||
<paragraph><location><page_31><loc_22><loc_33><loc_89><loc_38></location>- GLYPH<SM590000> CP4D offers a Python run time to build a custom solution stack, but also supports different components like Watson Studio, WML, Db2, Data Refinery, OpenScale, AI Factsheets, and OpenPages.</paragraph>
|
||||
<paragraph><location><page_31><loc_22><loc_31><loc_80><loc_33></location>- GLYPH<SM590000> The trustworthiness of the predicted output is important for critical use cases.</paragraph>
|
||||
<paragraph><location><page_31><loc_22><loc_28><loc_87><loc_30></location>- GLYPH<SM590000> IBM Z provides high data security and low latency requirements at scale for the critical applications.</paragraph>
|
||||
<paragraph><location><page_31><loc_22><loc_24><loc_89><loc_27></location>- GLYPH<SM590000> A data scientist can choose to train the model and deploy it on CP4D seamlessly with the latest tech stack that is available.</paragraph>
|
||||
<paragraph><location><page_31><loc_22><loc_20><loc_82><loc_23></location>- GLYPH<SM590000> The AIOps and MLOps supported by CP4D to track AI model and data lifecycle throughout the application lifecycle.</paragraph>
|
||||
<subtitle-level-1><location><page_32><loc_11><loc_87><loc_89><loc_91></location>Use case 5: AI-powered video analytics on an infant's motions for health prediction</subtitle-level-1>
|
||||
<paragraph><location><page_32><loc_22><loc_77><loc_89><loc_85></location>Each year, approximately 5 million newborns worldwide are suffering from a neuro-developmental disorder. Due to the lack of early diagnoses and intervention, many infants are disabled and abandoned, especially in countries with limited numbers of pediatricians with extensive experience in neuro-developmental disorders. This situation is a conundrum that plagues many families around the world.</paragraph>
|
||||
<paragraph><location><page_32><loc_22><loc_70><loc_89><loc_76></location>Infant motion analysis plays critical importance to understanding and comprehending healthy childhood development. In infants, monitoring their poses provides information about their health that can lead to a better prediction of early developmental risk assessment and diagnosis.</paragraph>
|
||||
@@ -369,18 +369,18 @@
|
||||
<paragraph><location><page_33><loc_22><loc_35><loc_89><loc_45></location>Live camera feeds or recorded videos of an infant's movement are the inputs for a pose detection model. This video streaming data was stored in IBM Cloudfi Object Storage for image processing. Video data must be transformed into frames so that the infant's body poses can be detected. These post-estimation components of the pipeline predict the location of all 17-person key points with 3 degrees of freedom each (x, y location and visibility) plus two virtual alignment key points. This approach also embraces a compute-intensive heat map prediction of infant body posture.</paragraph>
|
||||
<paragraph><location><page_33><loc_22><loc_24><loc_88><loc_33></location>When changes in body posture or movement happen, analytics can be performed, and a threshold can be set for the angle of the body and posture movements. An analysis can be performed on movement that is based on that threshold to help to predict an infant's health index in the output video stream by leveraging the IBM z16 on-chip AI acceleration, which provides an execution speed in real time on an edge device, which cannot be achieved by other means.</paragraph>
|
||||
<paragraph><location><page_33><loc_22><loc_22><loc_72><loc_23></location>We can leverage the following AI technology stack for this use case:</paragraph>
<paragraph><location><page_33><loc_22><loc_18><loc_89><loc_21></location>GLYPH<SM590000> Convolutional neural network: Build an artificial neural network model on video streaming and images.</paragraph>
<paragraph><location><page_33><loc_22><loc_16><loc_74><loc_17></location>GLYPH<SM590000> TensorFlow: A DL back-end framework that is based on TensorFlow.</paragraph>
<paragraph><location><page_33><loc_22><loc_12><loc_89><loc_15></location>GLYPH<SM590000> Mediapipe: A library that helps with video streaming processing and prediction of human pose estimation.</paragraph>
<paragraph><location><page_33><loc_22><loc_10><loc_84><loc_11></location>GLYPH<SM590000> OpenCV: A real-time computer vision library that helps perform image processing.</paragraph>
<paragraph><location><page_33><loc_22><loc_18><loc_89><loc_21></location>- GLYPH<SM590000> Convolutional neural network: Build an artificial neural network model on video streaming and images.</paragraph>
<paragraph><location><page_33><loc_22><loc_16><loc_74><loc_17></location>- GLYPH<SM590000> TensorFlow: A DL back-end framework that is based on TensorFlow.</paragraph>
<paragraph><location><page_33><loc_22><loc_12><loc_89><loc_15></location>- GLYPH<SM590000> Mediapipe: A library that helps with video streaming processing and prediction of human pose estimation.</paragraph>
<paragraph><location><page_33><loc_22><loc_10><loc_84><loc_11></location>- GLYPH<SM590000> OpenCV: A real-time computer vision library that helps perform image processing.</paragraph>
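As a purely illustrative sketch (not part of the ground-truth data in this diff), the fragment below shows how the Mediapipe and OpenCV pieces of this stack are typically wired together to pull per-frame pose landmarks out of a recorded video. The file name and confidence thresholds are placeholders, the downstream angle/movement analytics are omitted, and MediaPipe's stock Pose solution uses its own landmark topology rather than the 17+2 keypoint layout described above.

```python
# Hedged sketch: per-frame pose landmarks from a video using the legacy
# MediaPipe "solutions" API and OpenCV. File name and thresholds are
# placeholders; posture/movement analytics are left out.
import cv2
import mediapipe as mp

mp_pose = mp.solutions.pose

cap = cv2.VideoCapture("infant_motion.mp4")
with mp_pose.Pose(min_detection_confidence=0.5,
                  min_tracking_confidence=0.5) as pose:
    while cap.isOpened():
        ok, frame_bgr = cap.read()
        if not ok:
            break
        # MediaPipe expects RGB frames; OpenCV decodes to BGR.
        results = pose.process(cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB))
        if results.pose_landmarks:
            # Each landmark carries normalized x, y, z and a visibility score:
            # the raw material for the posture analytics described above.
            for lm in results.pose_landmarks.landmark:
                print(lm.x, lm.y, lm.visibility)
cap.release()
```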
<paragraph><location><page_34><loc_22><loc_87><loc_89><loc_91></location>WML was used for deployment of the pose detection model and generated notifications to users with web and mobile applications, and it integrates with Fitbit for push notifications so that hospitals and parents can take preventive actions.</paragraph>
|
||||
<subtitle-level-1><location><page_34><loc_11><loc_81><loc_37><loc_83></location>Additional resources</subtitle-level-1>
|
||||
<paragraph><location><page_34><loc_22><loc_76><loc_89><loc_79></location>GLYPH<SM590000> The Cloud Pak for Data 4.5 on IBM Z Overview Demo video provides an overview of some of the more important features of CP4D on IBM Z.</paragraph>
|
||||
<paragraph><location><page_34><loc_22><loc_74><loc_49><loc_76></location>GLYPH<SM590000> IBM Cloud Pak for Data Tutorials.</paragraph>
|
||||
<paragraph><location><page_34><loc_22><loc_71><loc_85><loc_73></location>GLYPH<SM590000> Here are some additional use cases that use the data science frameworks that are available as part of CP4D on IBM Z and IBM LinuxONE:</paragraph>
|
||||
<paragraph><location><page_34><loc_25><loc_67><loc_86><loc_70></location>-Payment Card Fraud Detection by using TensorFlow on CP4D on IBM Z and IBM LinuxONE is a payment card fraud detection use case.</paragraph>
|
||||
<paragraph><location><page_34><loc_25><loc_63><loc_88><loc_66></location>-Fashion-MNIST clothing classification with PyTorch on Cloud Pak for Data on IBM Z and IBM LinuxONE is a Fashion-MNIST clothing classification use case.</paragraph>
|
||||
<paragraph><location><page_34><loc_25><loc_57><loc_89><loc_62></location>-Payment Card Fraud Prevention by using Snap ML on IBM Cloud Pak for Data on Red Hat OpenShift on a virtual machine on IBM Z and IBM LinuxONE, which leverage the z16 integrated AI accelerator describes a use case that uses Snap Machine Learning in Cloud Pak for Data on IBM Z and IBM LinuxONE. It is a Snap ML use case.</paragraph>
|
||||
<paragraph><location><page_34><loc_22><loc_76><loc_89><loc_79></location>- GLYPH<SM590000> The Cloud Pak for Data 4.5 on IBM Z Overview Demo video provides an overview of some of the more important features of CP4D on IBM Z.</paragraph>
|
||||
<paragraph><location><page_34><loc_22><loc_74><loc_49><loc_76></location>- GLYPH<SM590000> IBM Cloud Pak for Data Tutorials.</paragraph>
|
||||
<paragraph><location><page_34><loc_22><loc_71><loc_85><loc_73></location>- GLYPH<SM590000> Here are some additional use cases that use the data science frameworks that are available as part of CP4D on IBM Z and IBM LinuxONE:</paragraph>
|
||||
<paragraph><location><page_34><loc_25><loc_67><loc_86><loc_70></location>- -Payment Card Fraud Detection by using TensorFlow on CP4D on IBM Z and IBM LinuxONE is a payment card fraud detection use case.</paragraph>
|
||||
<paragraph><location><page_34><loc_25><loc_63><loc_88><loc_66></location>- -Fashion-MNIST clothing classification with PyTorch on Cloud Pak for Data on IBM Z and IBM LinuxONE is a Fashion-MNIST clothing classification use case.</paragraph>
|
||||
<paragraph><location><page_34><loc_25><loc_57><loc_89><loc_62></location>- -Payment Card Fraud Prevention by using Snap ML on IBM Cloud Pak for Data on Red Hat OpenShift on a virtual machine on IBM Z and IBM LinuxONE, which leverage the z16 integrated AI accelerator describes a use case that uses Snap Machine Learning in Cloud Pak for Data on IBM Z and IBM LinuxONE. It is a Snap ML use case.</paragraph>
|
||||
<paragraph><location><page_34><loc_27><loc_53><loc_89><loc_56></location>A companion video can be found at Credit Card Fraud Detection by using Snap ML on IBM Cloud Pak for Data on IBM Z and IBM LinuxONE.</paragraph>
|
||||
<subtitle-level-1><location><page_34><loc_11><loc_47><loc_23><loc_49></location>Summary</subtitle-level-1>
|
||||
<paragraph><location><page_34><loc_22><loc_32><loc_89><loc_45></location>This IBM Redbooksfi publication presented an overview of how IBM Cloud Pak for Data on IBM Z can modernize your data infrastructure; develop and deploy ML and AI models; and instantiate highly efficient analytics deployment on IBM LinuxONE. This publication demonstrated these tasks by guiding the reader through five common use cases where CP4D on IBM Z and IBM LinuxONE uses the different features that are supported on the platform, and showing how the associated features can help an enterprise to build AI and ML models with core transactional data, which results in a highly efficient analytics deployment that minimizes latency, cost inefficiencies, and potential security exposures that are connected with data transportation.</paragraph>
|
||||
@@ -399,11 +399,11 @@
|
||||
<paragraph><location><page_35><loc_22><loc_21><loc_89><loc_22></location>Find out more about the residency program, browse the residency index, and apply online at:</paragraph>
|
||||
<paragraph><location><page_35><loc_22><loc_19><loc_49><loc_20></location>ibm.com /redbooks/residencies.html</paragraph>
|
||||
<subtitle-level-1><location><page_36><loc_11><loc_89><loc_44><loc_91></location>Stay connected to IBM Redbooks</subtitle-level-1>
|
||||
<paragraph><location><page_36><loc_22><loc_87><loc_39><loc_88></location>GLYPH<SM590000> Find us on LinkedIn:</paragraph>
|
||||
<paragraph><location><page_36><loc_22><loc_87><loc_39><loc_88></location>- GLYPH<SM590000> Find us on LinkedIn:</paragraph>
|
||||
<paragraph><location><page_36><loc_25><loc_84><loc_64><loc_86></location>http://www.linkedin.com/groups?home=&gid=2130806</paragraph>
|
||||
<paragraph><location><page_36><loc_22><loc_81><loc_89><loc_83></location>GLYPH<SM590000> Explore new Redbooks publications, residencies, and workshops with the IBM Redbooks weekly newsletter:</paragraph>
|
||||
<paragraph><location><page_36><loc_25><loc_79><loc_74><loc_80></location>https://www.redbooks.ibm.com/Redbooks.nsf/subscribe?OpenForm</paragraph>
|
||||
<paragraph><location><page_36><loc_22><loc_76><loc_70><loc_78></location>GLYPH<SM590000> Stay current on recent Redbooks publications with RSS Feeds:</paragraph>
|
||||
<paragraph><location><page_36><loc_22><loc_81><loc_89><loc_83></location>- GLYPH<SM590000> Explore new Redbooks publications, residencies, and workshops with the IBM Redbooks weekly newsletter:</paragraph>
|
||||
<paragraph><location><page_36><loc_25><loc_79><loc_74><loc_80></location>- https://www.redbooks.ibm.com/Redbooks.nsf/subscribe?OpenForm</paragraph>
|
||||
<paragraph><location><page_36><loc_22><loc_76><loc_70><loc_78></location>- GLYPH<SM590000> Stay current on recent Redbooks publications with RSS Feeds:</paragraph>
|
||||
<paragraph><location><page_36><loc_25><loc_74><loc_54><loc_76></location>http://www.redbooks.ibm.com/rss.html</paragraph>
|
||||
<subtitle-level-1><location><page_37><loc_11><loc_88><loc_25><loc_91></location>Notices</subtitle-level-1>
|
||||
<paragraph><location><page_37><loc_10><loc_80><loc_89><loc_83></location>This information was developed for products and services offered in the US. This material might be available from IBM in other languages. However, you may be required to own a copy of the product or product version in that language in order to access it.</paragraph>
|
||||
|
File diff suppressed because one or more lines are too long
@@ -40,17 +40,17 @@ Ever wonder how many transactions a bank processes per day? What about the pace
|
||||
|
||||
The most recent platform for IBM Z is IBM z16™. The IBM z16 supports the following features:
|
||||
|
||||
GLYPH<SM590000> On-chip AI acceleration
|
||||
- GLYPH<SM590000> On-chip AI acceleration
|
||||
|
||||
GLYPH<SM590000> Quantum-safe crypto discovery
|
||||
- GLYPH<SM590000> Quantum-safe crypto discovery
|
||||
|
||||
GLYPH<SM590000> Simplified compliance
|
||||
- GLYPH<SM590000> Simplified compliance
|
||||
|
||||
GLYPH<SM590000> Flexible capacity
|
||||
- GLYPH<SM590000> Flexible capacity
|
||||
|
||||
GLYPH<SM590000> Modernization of applications
|
||||
- GLYPH<SM590000> Modernization of applications
|
||||
|
||||
GLYPH<SM590000> Sustainability
|
||||
- GLYPH<SM590000> Sustainability
|
||||
|
||||
With these features, enterprises can upgrade applications while preserving secure and resilient data.
|
||||
|
||||
@@ -106,13 +106,13 @@ Figure 6 Solution overview of Cloud Pak for Data
|
||||
|
||||
We highlight the four main pillars that make IBM Z the correct infrastructure for CP4D:
|
||||
|
||||
GLYPH<SM590000> Performance and Scale
|
||||
- GLYPH<SM590000> Performance and Scale
|
||||
|
||||
GLYPH<SM590000> Embedded Accelerators
|
||||
- GLYPH<SM590000> Embedded Accelerators
|
||||
|
||||
GLYPH<SM590000> Reliability and Availability
|
||||
- GLYPH<SM590000> Reliability and Availability
|
||||
|
||||
GLYPH<SM590000> Security and Governance.
|
||||
- GLYPH<SM590000> Security and Governance.
|
||||
|
||||
From a performance perspective, CP4D on IBM Z provides your data and AI with high transaction processing and a powerful infrastructure. From the embedded accelerators perspective, CP4D on IBM Z can investigate each transaction thanks to a cutting-edge DL inference technology even in the most demanding, sensitive, and latency-prone real-time workloads. From a reliability perspective, CP4D on IBM Z provides high availability and resiliency. Lastly from the security perspective, CP4D on IBM Z is suitable for protecting sensitive data and AI models for enterprises in highly regulated industries or those industries that are worried about security.
|
||||
|
||||
@@ -120,17 +120,17 @@ From a performance perspective, CP4D on IBM Z provides your data and AI with hig
|
||||
|
||||
With CP4D on IBM Z and IBM LinuxONE, users can develop, train, and deploy AI and ML models. Users can accomplish this task by using the CP4D IBM Watsonfi Studio and IBM Watson Machine Learning (WLM) services. By using these two fundamental services, users can accomplish the following tasks:
|
||||
|
||||
GLYPH<SM590000> Provision various containerized databases.
|
||||
- GLYPH<SM590000> Provision various containerized databases.
|
||||
|
||||
GLYPH<SM590000> Explore, clean, shape, and alter data by using Data Refinery.
|
||||
- GLYPH<SM590000> Explore, clean, shape, and alter data by using Data Refinery.
|
||||
|
||||
GLYPH<SM590000> Use project-specific data that is uploaded, or connect to distant data.
|
||||
- GLYPH<SM590000> Use project-specific data that is uploaded, or connect to distant data.
|
||||
|
||||
GLYPH<SM590000> Create Spark run times and applications.
|
||||
- GLYPH<SM590000> Create Spark run times and applications.
|
||||
|
||||
GLYPH<SM590000> Create, build, evaluate, and deploy analytics and ML models with trust and transparency.
|
||||
- GLYPH<SM590000> Create, build, evaluate, and deploy analytics and ML models with trust and transparency.
|
||||
|
||||
GLYPH<SM590000> Leverage the AI Integrated Accelerator for TensorFlow 2.7.2 and Snap ML 1.9.
|
||||
- GLYPH<SM590000> Leverage the AI Integrated Accelerator for TensorFlow 2.7.2 and Snap ML 1.9.
|
||||
|
||||
For more information about the specifics of these capabilities, see Capabilities on Linux on IBM Z and IBM LinuxONE.
|
||||
|
||||
@@ -167,17 +167,17 @@ Figure 7 Developing, training, and deploying an AI model on Cloud Pak for Data o
|
||||
|
||||
In summary, here are some of the reasons why you should choose AI on IBM Z:
|
||||
|
||||
GLYPH<SM590000> World-class AI inference platform for enterprise workloads:
|
||||
- GLYPH<SM590000> World-class AI inference platform for enterprise workloads:
|
||||
|
||||
-Embedded accelerators: A centralized on-chip AI accelerator that is shared by all cores.
|
||||
- -Embedded accelerators: A centralized on-chip AI accelerator that is shared by all cores.
|
||||
|
||||
-Industry standard AI ecosystem: Many industry open-source data science frameworks are available on the platform.
|
||||
- -Industry standard AI ecosystem: Many industry open-source data science frameworks are available on the platform.
|
||||
|
||||
-Seamlessly integrate AI into existing enterprise workload stacks: Train anywhere, and then deploy on IBM Z.
|
||||
- -Seamlessly integrate AI into existing enterprise workload stacks: Train anywhere, and then deploy on IBM Z.
|
||||
|
||||
GLYPH<SM590000> Security: Encrypted memory, and improved trusted execution environments.
|
||||
- GLYPH<SM590000> Security: Encrypted memory, and improved trusted execution environments.
|
||||
|
||||
GLYPH<SM590000> Sustainability: Reduce your energy consumption with real-time monitoring tools about the energy consumption of the system.
|
||||
- GLYPH<SM590000> Sustainability: Reduce your energy consumption with real-time monitoring tools about the energy consumption of the system.
|
||||
|
||||
## AI use cases
|
||||
|
||||
@@ -193,23 +193,23 @@ For the airline industry, processes such as air traffic management, flight manag
|
||||
|
||||
In the following sections, we describe the following use cases:
|
||||
|
||||
GLYPH<SM590000> "Use case 1: Responsible AI augmented with risk and regulatory compliance" on page 12 AI model lifecycle governance, risk management, and regulatory compliance are key to the success of the enterprises. It is imperative to adopt a typical AI model lifecycle to protect new end-to-end risks.
|
||||
- GLYPH<SM590000> "Use case 1: Responsible AI augmented with risk and regulatory compliance" on page 12 AI model lifecycle governance, risk management, and regulatory compliance are key to the success of the enterprises. It is imperative to adopt a typical AI model lifecycle to protect new end-to-end risks.
|
||||
|
||||
GLYPH<SM590000> "Use case 2: Credit default risk assessment" on page 22
|
||||
- GLYPH<SM590000> "Use case 2: Credit default risk assessment" on page 22
|
||||
|
||||
Core banking solutions running on IBM Z that are involved in processing inbound transactions need real-time fraud detection to prevent fraud. Other types of possible use cases might be credit risk analysis, anti-money laundering, loan approval, fraud detection in payments, and instant payments.
|
||||
- Core banking solutions running on IBM Z that are involved in processing inbound transactions need real-time fraud detection to prevent fraud. Other types of possible use cases might be credit risk analysis, anti-money laundering, loan approval, fraud detection in payments, and instant payments.
|
||||
|
||||
GLYPH<SM590000> "Use case 3: Clearing and settlement" on page 25
|
||||
- GLYPH<SM590000> "Use case 3: Clearing and settlement" on page 25
|
||||
|
||||
The use of AI can help to predict which trades or transactions have high risk exposures, and propose solutions for a more efficient settlement process.
|
||||
- The use of AI can help to predict which trades or transactions have high risk exposures, and propose solutions for a more efficient settlement process.
|
||||
|
||||
GLYPH<SM590000> "Use case 4: Remaining Useful Life of an aircraft engine" on page 27
|
||||
- GLYPH<SM590000> "Use case 4: Remaining Useful Life of an aircraft engine" on page 27
|
||||
|
||||
We describe how AI can help to avoid unplanned aircraft downtime by determining the remaining time or cycles that an aircraft engine is likely to operate before failure.
|
||||
- We describe how AI can help to avoid unplanned aircraft downtime by determining the remaining time or cycles that an aircraft engine is likely to operate before failure.
|
||||
|
||||
GLYPH<SM590000> "Use case 5: AI-powered video analytics on an infant's motions for health prediction" on page 30
|
||||
- GLYPH<SM590000> "Use case 5: AI-powered video analytics on an infant's motions for health prediction" on page 30
|
||||
|
||||
In this section, we describe how AI can predict an infant's health conditions by monitoring real-time body movements.
|
||||
- In this section, we describe how AI can predict an infant's health conditions by monitoring real-time body movements.
|
||||
|
||||
## Use case 1: Responsible AI augmented with risk and regulatory compliance
@@ -221,11 +221,11 @@ How mature is your AI governance? In this section, we provide a use case demonst
Here are the three main reasons why organizations struggle with the adoption of AI:
|
||||
|
||||
GLYPH<SM590000> Scaling with growing regulations
|
||||
- GLYPH<SM590000> Scaling with growing regulations
|
||||
|
||||
GLYPH<SM590000> Lack of confidence in operationalized AI (making responsible AI)
|
||||
- GLYPH<SM590000> Lack of confidence in operationalized AI (making responsible AI)
|
||||
|
||||
GLYPH<SM590000> Challenges around managing the risk throughout the entire AI workflow
|
||||
- GLYPH<SM590000> Challenges around managing the risk throughout the entire AI workflow
|
||||
|
||||
## Scaling with growing regulations
@@ -239,17 +239,17 @@ Responsible AI protects against loss of data privacy, and reduced customer loyal
Organizations need to mitigate risk of the following items:
|
||||
|
||||
GLYPH<SM590000> Deciding not to use certain technologies or practices
|
||||
- GLYPH<SM590000> Deciding not to use certain technologies or practices
|
||||
|
||||
GLYPH<SM590000> Using personal information when needed and with a user's consent
|
||||
- GLYPH<SM590000> Using personal information when needed and with a user's consent
|
||||
|
||||
GLYPH<SM590000> Ensuring automated decisions are free from bias
|
||||
- GLYPH<SM590000> Ensuring automated decisions are free from bias
|
||||
|
||||
GLYPH<SM590000> Customer confidence by providing explanations for business decisions
|
||||
- GLYPH<SM590000> Customer confidence by providing explanations for business decisions
|
||||
|
||||
GLYPH<SM590000> Fraud to the organization and to customer's accounts
|
||||
- GLYPH<SM590000> Fraud to the organization and to customer's accounts
|
||||
|
||||
GLYPH<SM590000> Delays in putting models into production
|
||||
- GLYPH<SM590000> Delays in putting models into production
|
||||
|
||||
In fact, in a recent survey, these concerns were echoed by real AI adopters when asked what aspects of trust are most important to them. Although explaining how AI decides is the primary concern, all of these concerns are important.
@@ -274,39 +274,39 @@ In a world where trust, transparency and explainable AI matters, every organizat
Lifecycle governance helps you manage your business information throughout its lifecycle, that is, from creation to deletion. IBM AI governance addresses the problems that challenge records managements:
|
||||
|
||||
GLYPH<SM590000> Monitor, catalog, and govern AI models from anywhere throughout the AI lifecycle.
|
||||
- GLYPH<SM590000> Monitor, catalog, and govern AI models from anywhere throughout the AI lifecycle.
|
||||
|
||||
GLYPH<SM590000> Automate the capture of model metadata for report generation.
|
||||
- GLYPH<SM590000> Automate the capture of model metadata for report generation.
|
||||
|
||||
GLYPH<SM590000> Drive transparent and explainable AI at scale.
|
||||
- GLYPH<SM590000> Drive transparent and explainable AI at scale.
|
||||
|
||||
GLYPH<SM590000> Increase accuracy of predictions by identifying how AI is used and where it is lagging.
|
||||
- GLYPH<SM590000> Increase accuracy of predictions by identifying how AI is used and where it is lagging.
|
||||
|
||||
## Risk management
|
||||
|
||||
Risk management is used in IBM AI governance to identify, manage, monitor, and report on risk and compliance initiatives at scale:
|
||||
|
||||
GLYPH<SM590000> Automate facts and workflow management to comply with business standards.
|
||||
- GLYPH<SM590000> Automate facts and workflow management to comply with business standards.
|
||||
|
||||
GLYPH<SM590000> Use dynamic dashboards for clear and concise customizable results.
|
||||
- GLYPH<SM590000> Use dynamic dashboards for clear and concise customizable results.
|
||||
|
||||
GLYPH<SM590000> Enhanced collaboration across multiple regions and geographies.
|
||||
- GLYPH<SM590000> Enhanced collaboration across multiple regions and geographies.
|
||||
|
||||
## Regulatory compliance
|
||||
|
||||
Regulatory compliance is a set of rules that organizations must follow to protect sensitive information and ensure human safety. Any business that works with digital assets, consumer data, health regulations, employee safety, and private communications is subject to regulatory compliance.$^{3}$ The IBM AI governance solution for IBM Z includes the following tasks:
|
||||
|
||||
GLYPH<SM590000> Help adhere to external AI regulations for audit and compliance.
|
||||
- GLYPH<SM590000> Help adhere to external AI regulations for audit and compliance.
|
||||
|
||||
GLYPH<SM590000> Convert external AI regulations into policies for automatic enforcement.
|
||||
- GLYPH<SM590000> Convert external AI regulations into policies for automatic enforcement.
|
||||
|
||||
GLYPH<SM590000> Use dynamic dashboards for compliance status across policies and regulations.
|
||||
- GLYPH<SM590000> Use dynamic dashboards for compliance status across policies and regulations.
|
||||
|
||||
Enterprises can develop AI models and deploy them by using IBM Watson Studio or WML on CP4D on Red Hat OpenShift on a virtual machine that is based on IBM z/VM or Red Hat Enterprise Linux KVM on IBM Z. AI governance on IBM LinuxONE is supported in the following two ways:
|
||||
|
||||
GLYPH<SM590000> Monitor the AI models with Watson OpenScale on CP4D on Red Hat OpenShift on a virtual machine on IBM Z.
|
||||
- GLYPH<SM590000> Monitor the AI models with Watson OpenScale on CP4D on Red Hat OpenShift on a virtual machine on IBM Z.
|
||||
|
||||
GLYPH<SM590000> Enterprises can develop AI models by creating and training models by using Watson Studio and development tools such as Jupyter Notebook or JupyterLab, and then deploying the model onto WML on CP4D on Red Hat OpenShift on a virtual machine on IBM Z. Then, these enterprises can achieve end-end AI governance by running AI Factsheets, IBM Watson OpenScale, and IBM Watson OpenPagesfi on CP4D on x86.
|
||||
- GLYPH<SM590000> Enterprises can develop AI models by creating and training models by using Watson Studio and development tools such as Jupyter Notebook or JupyterLab, and then deploying the model onto WML on CP4D on Red Hat OpenShift on a virtual machine on IBM Z. Then, these enterprises can achieve end-end AI governance by running AI Factsheets, IBM Watson OpenScale, and IBM Watson OpenPagesfi on CP4D on x86.
|
||||
|
||||
Figure 9 on page 16 shows the end-to-end flow for a remote AI governance solution.
@@ -315,22 +315,22 @@ Figure 9 Remote AI governance solution end-to-end flow
To achieve end-to-end AI governance, complete the following steps:
|
||||
|
||||
1. Create a model entry in IBM OpenPages by using CP4D on a x86 platform, as shown in Figure 10.
|
||||
- 1. Create a model entry in IBM OpenPages by using CP4D on a x86 platform, as shown in Figure 10.
|
||||
|
||||
Figure 10 Creating a model entry in IBM OpenPages
|
||||
<!-- image -->
|
||||
|
||||
2. Train a model by using Watson Studio and by using development tools such as Jupyter Notebook or JupyterLab on CP4D on Red Hat OpenShift on a virtual machine on IBM Z, as shown in Figure 11.
|
||||
- 2. Train a model by using Watson Studio and by using development tools such as Jupyter Notebook or JupyterLab on CP4D on Red Hat OpenShift on a virtual machine on IBM Z, as shown in Figure 11.
|
||||
|
||||
Figure 11 Training an AI model by using Watson Studio
|
||||
<!-- image -->
|
||||
|
||||
3. Deploy the model by using WML on CP4D on Red Hat OpenShift on a virtual machine on IBM Z, as shown in Figure 12.
|
||||
- 3. Deploy the model by using WML on CP4D on Red Hat OpenShift on a virtual machine on IBM Z, as shown in Figure 12.
|
||||
|
||||
Figure 12 Deploying an AI model by using WML on Cloud Pak for Data
|
||||
<!-- image -->
|
||||
|
||||
4. Track the external model lifecycle by browsing through the Catalogs/Platform assets catalog by using AI Factsheets and OpenPages while using CP4D on an x86 platform, as shown in Figure 13. The external model (deployed on CP4D on Red Hat OpenShift on a virtual machine on IBM Z) is saved as a platform asset catalog on the x86 platform.
|
||||
- 4. Track the external model lifecycle by browsing through the Catalogs/Platform assets catalog by using AI Factsheets and OpenPages while using CP4D on an x86 platform, as shown in Figure 13. The external model (deployed on CP4D on Red Hat OpenShift on a virtual machine on IBM Z) is saved as a platform asset catalog on the x86 platform.
|
||||
|
||||
Figure 13 External model
|
||||
<!-- image -->
@@ -345,7 +345,7 @@ You can see that the model facts are tracked and synchronized to IBM OpenPages f
Figure 15 Model facts that are tracked and synchronized to IBM OpenPages on an x86 platform
|
||||
<!-- image -->
|
||||
|
||||
5. Create an external model by using IBM OpenScale on the x86 platform, as shown in Figure 16.
|
||||
- 5. Create an external model by using IBM OpenScale on the x86 platform, as shown in Figure 16.
|
||||
|
||||
Figure 16 Creating an external model on an x86 platform
|
||||
<!-- image -->
@@ -398,15 +398,15 @@ Data scientists can start creating and training a DL AI model by using a Jupyter
In summary, here are some considerations for developing real-time AI models, such as credit risk assessment:
|
||||
|
||||
GLYPH<SM590000> A preference for in-platform run times of the model, such as faster execution results.
|
||||
- GLYPH<SM590000> A preference for in-platform run times of the model, such as faster execution results.
|
||||
|
||||
GLYPH<SM590000> Less overhead in the end-to-end flows might improve scoring time.
|
||||
- GLYPH<SM590000> Less overhead in the end-to-end flows might improve scoring time.
|
||||
|
||||
GLYPH<SM590000> If you are using models that are not deployable, CP4D offers a custom Python run time to build your own stack if they are not available on the platform.
|
||||
- GLYPH<SM590000> If you are using models that are not deployable, CP4D offers a custom Python run time to build your own stack if they are not available on the platform.
|
||||
|
||||
GLYPH<SM590000> AI inferencing based on ML or DL models can increase the accuracy of better credit risk assessment.
|
||||
- GLYPH<SM590000> AI inferencing based on ML or DL models can increase the accuracy of better credit risk assessment.
|
||||
|
||||
GLYPH<SM590000> Using IBM z16 and on-chip AI acceleration with the Telum chip that is embedded with regular Integrated Facility for Linux (IFLs) provides an execution speed for your transactions that cannot be achieved by other means.
|
||||
- GLYPH<SM590000> Using IBM z16 and on-chip AI acceleration with the Telum chip that is embedded with regular Integrated Facility for Linux (IFLs) provides an execution speed for your transactions that cannot be achieved by other means.
|
||||
|
||||
## Use case 3: Clearing and settlement
@@ -433,49 +433,49 @@ Figure 21 Clearing and settlement use case for financial transactions by using C
Here are the steps of the high-level process flow:
|
||||
|
||||
1. Create a connection to a database (for example, an IBM Db2fi database) where the historical data will be used for ML model building.
|
||||
- 1. Create a connection to a database (for example, an IBM Db2fi database) where the historical data will be used for ML model building.
|
||||
|
||||
2. Read the data from the database and prepare the data for AI by using the Data Refinery tool in CP4D.
|
||||
- 2. Read the data from the database and prepare the data for AI by using the Data Refinery tool in CP4D.
|
||||
|
||||
3. A Jupyter Notebook or JupyterLab IDE that is provided by the Watson Studio component in CP4D helps us build and train the AI model. The trained model can be saved into a WML repository.
|
||||
- 3. A Jupyter Notebook or JupyterLab IDE that is provided by the Watson Studio component in CP4D helps us build and train the AI model. The trained model can be saved into a WML repository.
|
||||
|
||||
4. Deploy the saved model into a deployment space for batch deployment.
|
||||
- 4. Deploy the saved model into a deployment space for batch deployment.
|
||||
|
||||
5. Create a batch deployment by using any of these interfaces:
|
||||
- 5. Create a batch deployment by using any of these interfaces:
|
||||
|
||||
a. Watson Studio user interface from an Analytics deployment space.
|
||||
- a. Watson Studio user interface from an Analytics deployment space.
|
||||
|
||||
b. WML Python client.
|
||||
- b. WML Python client.
|
||||
|
||||
c. WML REST APIs.
|
||||
- c. WML REST APIs.
|
||||
|
||||
6. A hardware configuration can be chosen for the deployment.
|
||||
- 6. A hardware configuration can be chosen for the deployment.
|
||||
|
||||
7. A batch deployment processes input data from a file, data connection, or connected data in a storage bucket, and writes the output to a selected destination.
|
||||
- 7. A batch deployment processes input data from a file, data connection, or connected data in a storage bucket, and writes the output to a selected destination.
|
||||
|
||||
8. One way to run batch deployment to predict or score is to create and run a batch deployment job.
|
||||
- 8. One way to run batch deployment to predict or score is to create and run a batch deployment job.
|
||||
|
||||
9. Provide an input data type:
|
||||
- 9. Provide an input data type:
|
||||
|
||||
a. Inline data for entering a JSON format payload.
|
||||
- a. Inline data for entering a JSON format payload.
|
||||
|
||||
b. Select Data asset , click Select data source , and then specify your asset.
|
||||
- b. Select Data asset , click Select data source , and then specify your asset.
|
||||
|
||||
10.The output data type can be a new output file or a connected data asset.
|
||||
- 10.The output data type can be a new output file or a connected data asset.
|
||||
|
||||
11.A Kubernetes admin can change the maximum number of concurrent batch jobs that can be run.
|
||||
- 11.A Kubernetes admin can change the maximum number of concurrent batch jobs that can be run.
12.Get the deployment endpoint URL. For more information, see Getting the deployment endpoint URL.
- 12.Get the deployment endpoint URL. For more information, see Getting the deployment endpoint URL.
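As a rough, hedged illustration of the inline JSON option from step 9a together with the endpoint URL from step 12 (the host, deployment ID, token, field names, and even the exact REST path and version parameter below are placeholders that can vary by CP4D/WML release), a scoring payload could be posted from Python as follows:

```python
import requests

# Placeholder values: the real endpoint URL comes from the deployment details
# in CP4D (step 12), and the bearer token from your CP4D credentials.
DEPLOYMENT_URL = (
    "https://<cpd-host>/ml/v4/deployments/<deployment-id>/predictions"
    "?version=2020-09-01"
)
TOKEN = "<bearer-token>"

payload = {
    "input_data": [
        {
            # field names are illustrative only
            "fields": ["amount", "merchant_id", "hour_of_day"],
            "values": [[182.50, "M1023", 14]],
        }
    ]
}

response = requests.post(
    DEPLOYMENT_URL,
    json=payload,
    headers={"Authorization": f"Bearer {TOKEN}"},
)
response.raise_for_status()
print(response.json()["predictions"])
```

For a batch deployment, the equivalent payload would instead be supplied when the deployment job is created (steps 8 and 9).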
## Summary
|
||||
|
||||
With this use case, we attempted to demonstrate how to predict, in real time, whether the transaction that is being processed might be a fraudulent transaction or not. By using the method, you have the following advantages:
|
||||
|
||||
GLYPH<SM590000> No Impact to SLAs and the batch process window.
|
||||
- GLYPH<SM590000> No Impact to SLAs and the batch process window.
|
||||
|
||||
GLYPH<SM590000> Proactively stop losses, and lower operational, regulatory, and compliance costs.
|
||||
- GLYPH<SM590000> Proactively stop losses, and lower operational, regulatory, and compliance costs.
|
||||
|
||||
GLYPH<SM590000> The solution is using a DL framework like TensorFlow for high-performing, low latency scoring.
|
||||
- GLYPH<SM590000> The solution is using a DL framework like TensorFlow for high-performing, low latency scoring.
|
||||
|
||||
## Use case 4: Remaining Useful Life of an aircraft engine
@@ -511,15 +511,15 @@ Figure 23 In-depth architectural view
In summary, consider the following points while developing an AI-based predictive maintenance application:
|
||||
|
||||
GLYPH<SM590000> CP4D offers a Python run time to build a custom solution stack, but also supports different components like Watson Studio, WML, Db2, Data Refinery, OpenScale, AI Factsheets, and OpenPages.
|
||||
- GLYPH<SM590000> CP4D offers a Python run time to build a custom solution stack, but also supports different components like Watson Studio, WML, Db2, Data Refinery, OpenScale, AI Factsheets, and OpenPages.
|
||||
|
||||
GLYPH<SM590000> The trustworthiness of the predicted output is important for critical use cases.
|
||||
- GLYPH<SM590000> The trustworthiness of the predicted output is important for critical use cases.
|
||||
|
||||
GLYPH<SM590000> IBM Z provides high data security and low latency requirements at scale for the critical applications.
|
||||
- GLYPH<SM590000> IBM Z provides high data security and low latency requirements at scale for the critical applications.
|
||||
|
||||
GLYPH<SM590000> A data scientist can choose to train the model and deploy it on CP4D seamlessly with the latest tech stack that is available.
|
||||
- GLYPH<SM590000> A data scientist can choose to train the model and deploy it on CP4D seamlessly with the latest tech stack that is available.
|
||||
|
||||
GLYPH<SM590000> The AIOps and MLOps supported by CP4D to track AI model and data lifecycle throughout the application lifecycle.
|
||||
- GLYPH<SM590000> The AIOps and MLOps supported by CP4D to track AI model and data lifecycle throughout the application lifecycle.
|
||||
|
||||
## Use case 5: AI-powered video analytics on an infant's motions for health prediction
@@ -558,29 +558,29 @@ When changes in body posture or movement happen, analytics can be performed, and
We can leverage the following AI technology stack for this use case:
|
||||
|
||||
GLYPH<SM590000> Convolutional neural network: Build an artificial neural network model on video streaming and images.
|
||||
- GLYPH<SM590000> Convolutional neural network: Build an artificial neural network model on video streaming and images.
|
||||
|
||||
GLYPH<SM590000> TensorFlow: A DL back-end framework that is based on TensorFlow.
|
||||
- GLYPH<SM590000> TensorFlow: A DL back-end framework that is based on TensorFlow.
|
||||
|
||||
GLYPH<SM590000> Mediapipe: A library that helps with video streaming processing and prediction of human pose estimation.
|
||||
- GLYPH<SM590000> Mediapipe: A library that helps with video streaming processing and prediction of human pose estimation.
|
||||
|
||||
GLYPH<SM590000> OpenCV: A real-time computer vision library that helps perform image processing.
|
||||
- GLYPH<SM590000> OpenCV: A real-time computer vision library that helps perform image processing.
|
||||
|
||||
WML was used for deployment of the pose detection model and generated notifications to users with web and mobile applications, and it integrates with Fitbit for push notifications so that hospitals and parents can take preventive actions.
|
||||
|
||||
## Additional resources
|
||||
|
||||
GLYPH<SM590000> The Cloud Pak for Data 4.5 on IBM Z Overview Demo video provides an overview of some of the more important features of CP4D on IBM Z.
|
||||
- GLYPH<SM590000> The Cloud Pak for Data 4.5 on IBM Z Overview Demo video provides an overview of some of the more important features of CP4D on IBM Z.
|
||||
|
||||
GLYPH<SM590000> IBM Cloud Pak for Data Tutorials.
|
||||
- GLYPH<SM590000> IBM Cloud Pak for Data Tutorials.
|
||||
|
||||
GLYPH<SM590000> Here are some additional use cases that use the data science frameworks that are available as part of CP4D on IBM Z and IBM LinuxONE:
|
||||
- GLYPH<SM590000> Here are some additional use cases that use the data science frameworks that are available as part of CP4D on IBM Z and IBM LinuxONE:
|
||||
|
||||
-Payment Card Fraud Detection by using TensorFlow on CP4D on IBM Z and IBM LinuxONE is a payment card fraud detection use case.
|
||||
- -Payment Card Fraud Detection by using TensorFlow on CP4D on IBM Z and IBM LinuxONE is a payment card fraud detection use case.
|
||||
|
||||
-Fashion-MNIST clothing classification with PyTorch on Cloud Pak for Data on IBM Z and IBM LinuxONE is a Fashion-MNIST clothing classification use case.
|
||||
- -Fashion-MNIST clothing classification with PyTorch on Cloud Pak for Data on IBM Z and IBM LinuxONE is a Fashion-MNIST clothing classification use case.
|
||||
|
||||
-Payment Card Fraud Prevention by using Snap ML on IBM Cloud Pak for Data on Red Hat OpenShift on a virtual machine on IBM Z and IBM LinuxONE, which leverage the z16 integrated AI accelerator describes a use case that uses Snap Machine Learning in Cloud Pak for Data on IBM Z and IBM LinuxONE. It is a Snap ML use case.
|
||||
- -Payment Card Fraud Prevention by using Snap ML on IBM Cloud Pak for Data on Red Hat OpenShift on a virtual machine on IBM Z and IBM LinuxONE, which leverage the z16 integrated AI accelerator describes a use case that uses Snap Machine Learning in Cloud Pak for Data on IBM Z and IBM LinuxONE. It is a Snap ML use case.
|
||||
|
||||
A companion video can be found at Credit Card Fraud Detection by using Snap ML on IBM Cloud Pak for Data on IBM Z and IBM LinuxONE.
@@ -618,15 +618,15 @@ ibm.com /redbooks/residencies.html
## Stay connected to IBM Redbooks
|
||||
|
||||
GLYPH<SM590000> Find us on LinkedIn:
|
||||
- GLYPH<SM590000> Find us on LinkedIn:
|
||||
|
||||
http://www.linkedin.com/groups?home=&gid=2130806
|
||||
|
||||
GLYPH<SM590000> Explore new Redbooks publications, residencies, and workshops with the IBM Redbooks weekly newsletter:
|
||||
- GLYPH<SM590000> Explore new Redbooks publications, residencies, and workshops with the IBM Redbooks weekly newsletter:
|
||||
|
||||
https://www.redbooks.ibm.com/Redbooks.nsf/subscribe?OpenForm
|
||||
- https://www.redbooks.ibm.com/Redbooks.nsf/subscribe?OpenForm
|
||||
|
||||
GLYPH<SM590000> Stay current on recent Redbooks publications with RSS Feeds:
|
||||
- GLYPH<SM590000> Stay current on recent Redbooks publications with RSS Feeds:
|
||||
|
||||
http://www.redbooks.ibm.com/rss.html
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@@ -14,8 +14,6 @@ The occurrence of tables in documents is ubiquitous. They often summarise quanti
Tables organize valuable content in a concise and compact representation. This content is extremely valuable for systems such as search engines, Knowledge Graph's, etc, since they enhance their predictive capabilities. Unfortunately, tables come in a large variety of shapes and sizes. Furthermore, they can have complex column/row-header configurations, multiline rows, different variety of separation lines, missing entries, etc. As such, the correct identification of the table-structure from an image is a nontrivial task. In this paper, we present a new table-structure identification model. The latter improves the latest end-toend deep learning model (i.e. encoder-dual-decoder from PubTabNet) in two significant ways. First, we introduce a new object detection decoder for table-cells. In this way, we can obtain the content of the table-cells from programmatic PDF's directly from the PDF source and avoid the training of the custom OCR decoders. This architectural change leads to more accurate table-content extraction and allows us to tackle non-english tables. Second, we replace the LSTM decoders with transformer based decoders. This upgrade improves significantly the previous state-of-the-art tree-editing-distance-score (TEDS) from 91% to 98.5% on simple tables and from 88.7% to 95% on complex tables.
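For reference, the tree-editing-distance-based score (TEDS) quoted above is, as defined in the PubTabNet work, a similarity between the HTML trees of the predicted and ground-truth tables, roughly TEDS(T_a, T_b) = 1 - EditDist(T_a, T_b) / max(|T_a|, |T_b|), so 100% corresponds to an exact structural match.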
b. Red-annotation of bounding boxes, Blue-predictions by TableFormer
<!-- image -->
@@ -26,7 +24,6 @@ Structure predicted by TableFormer:
Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: 'PMC2944238 004 02'.
| 0 | 1 | 1 | 2 1 | 2 1 | |
|-----|-----|-----|-------|-------|----|
| 3 | 4 | 5 3 | 6 | 7 | |
@@ -46,13 +43,10 @@ In this paper, we want to address these weaknesses and present a robust table-st
To meet the design criteria listed above, we developed a new model called TableFormer and a synthetically generated table structure dataset called SynthTabNet $^{1}$. In particular, our contributions in this work can be summarised as follows:
· We propose TableFormer , a transformer based model that predicts tables structure and bounding boxes for the table content simultaneously in an end-to-end approach.
· Across all benchmark datasets TableFormer significantly outperforms existing state-of-the-art metrics, while being much more efficient in training and inference to existing works.
· We present SynthTabNet a synthetically generated dataset, with various appearance styles and complexity.
· An augmented dataset based on PubTabNet [37], FinTabNet [36], and TableBank [17] with generated ground-truth for reproducibility.
- · We propose TableFormer , a transformer based model that predicts tables structure and bounding boxes for the table content simultaneously in an end-to-end approach.
- · Across all benchmark datasets TableFormer significantly outperforms existing state-of-the-art metrics, while being much more efficient in training and inference to existing works.
- · We present SynthTabNet a synthetically generated dataset, with various appearance styles and complexity.
- · An augmented dataset based on PubTabNet [37], FinTabNet [36], and TableBank [17] with generated ground-truth for reproducibility.
The paper is structured as follows. In Sec. 2, we give a brief overview of the current state-of-the-art. In Sec. 3, we describe the datasets on which we train. In Sec. 4, we introduce the TableFormer model-architecture and describe
@@ -80,7 +74,6 @@ We rely on large-scale datasets such as PubTabNet [37], FinTabNet [36], and Tabl
Figure 2: Distribution of the tables across different table dimensions in PubTabNet + FinTabNet datasets
<!-- image -->
balance in the previous datasets.
@@ -101,7 +94,6 @@ In this regard, we have prepared four synthetic datasets, each one containing 15
Table 1: Both "Combined-Tabnet" and "CombinedTabnet" are variations of the following: (*) The CombinedTabnet dataset is the processed combination of PubTabNet and Fintabnet. (**) The combined dataset is the processed combination of PubTabNet, Fintabnet and TableBank.
| | Tags | Bbox | Size | Format |
|--------------------|--------|--------|--------|----------|
| PubTabNet | 3 | 3 | 509k | PNG |
@@ -127,12 +119,10 @@ CNN Backbone Network. A ResNet-18 CNN is the backbone that receives the table im
Figure 3: TableFormer takes in an image of the PDF and creates bounding box and HTML structure predictions that are synchronized. The bounding boxes grabs the content from the PDF and inserts it in the structure.
<!-- image -->
Figure 4: Given an input image of a table, the Encoder produces fixed-length features that represent the input image. The features are then passed to both the Structure Decoder and Cell BBox Decoder . During training, the Structure Decoder receives 'tokenized tags' of the HTML code that represent the table structure. Afterwards, a transformer encoder and decoder architecture is employed to produce features that are received by a linear layer, and the Cell BBox Decoder. The linear layer is applied to the features to predict the tags. Simultaneously, the Cell BBox Decoder selects features referring to the data cells (' < td > ', ' < ') and passes them through an attention network, an MLP, and a linear layer to predict the bounding boxes.
<!-- image -->
forming classification, and adding an adaptive pooling layer of size 28*28. ResNet by default downsamples the image resolution by 32 and then the encoded image is provided to both the Structure Decoder , and Cell BBox Decoder .
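As a minimal sketch of the encoder shape described here (not the authors' implementation; it assumes torchvision's stock resnet18), the backbone plus the 28x28 adaptive pooling looks roughly like this:

```python
import torch
from torch import nn
from torchvision.models import resnet18

class TableImageEncoder(nn.Module):
    """Sketch of the encoder: ResNet-18 features followed by 28x28 adaptive pooling."""

    def __init__(self) -> None:
        super().__init__()
        backbone = resnet18()  # no classification head is needed here
        # keep the conv stem and the four residual stages; drop avgpool and fc
        self.features = nn.Sequential(*list(backbone.children())[:-2])
        self.pool = nn.AdaptiveAvgPool2d((28, 28))

    def forward(self, image: torch.Tensor) -> torch.Tensor:
        return self.pool(self.features(image))

encoder = TableImageEncoder()
# 896 / 32 = 28, matching the fixed 28x28 feature grid fed to both decoders
features = encoder(torch.randn(1, 3, 896, 896))
print(features.shape)  # torch.Size([1, 512, 28, 28])
```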
@@ -195,7 +185,6 @@ Structure. As shown in Tab. 2, TableFormer outperforms all SOTA methods across d
Table 2: Structure results on PubTabNet (PTN), FinTabNet (FTN), TableBank (TB) and SynthTabNet (STN).
| Model | Dataset | Simple | TEDS Complex | All |
|-------------|-----------|----------|----------------|-------|
| EDD | PTN | 91.1 | 88.7 | 89.9 |
@@ -217,7 +206,6 @@ our Cell BBox Decoder accuracy for cells with a class label of 'content' only us
Table 3: Cell Bounding Box detection results on PubTabNet, and FinTabNet. PP: Post-processing.
| Model | Dataset | mAP | mAP (PP) |
|-------------|-------------|-------|------------|
| EDD+BBox | PubTabNet | 79.2 | 82.7 |
@@ -228,7 +216,6 @@ Cell Content. In this section, we evaluate the entire pipeline of recovering a t
Table 4: Results of structure with content retrieved using cell detection on PubTabNet. In all cases the input is PDF documents with cropped tables.
| Model | Simple | TEDS Complex | All |
|-------------|----------|----------------|-------|
| Tabula | 78 | 57.8 | 67.9 |
@@ -264,7 +251,6 @@ Structure predicted by TableFormer, with superimposed matched PDF cell text:
Text is aligned to match original for ease of viewing
| | Shares (in millions) | Shares (in millions) | Weighted Average Grant Date Fair Value | Weighted Average Grant Date Fair Value |
|--------------------------|------------------------|------------------------|------------------------------------------|------------------------------------------|
| | RS U s | PSUs | RSUs | PSUs |
@@ -276,14 +262,12 @@ Text is aligned to match original for ease of viewing
Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.
<!-- image -->
<!-- image -->
Figure 6: An example of TableFormer predictions (bounding boxes and structure) from generated SynthTabNet table.
<!-- image -->
## 5.5. Qualitative Analysis
@@ -296,89 +280,54 @@ In this paper, we presented TableFormer an end-to-end transformer based approach
## References
[1] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-
- [1] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-
<!-- image -->
end object detection with transformers. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision - ECCV 2020 , pages 213-229, Cham, 2020. Springer International Publishing. 5
|
||||
- end object detection with transformers. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision - ECCV 2020 , pages 213-229, Cham, 2020. Springer International Publishing. 5
|
||||
- [2] Zewen Chi, Heyan Huang, Heng-Da Xu, Houjin Yu, Wanxuan Yin, and Xian-Ling Mao. Complicated table structure recognition. arXiv preprint arXiv:1908.04729 , 2019. 3
|
||||
- [3] Bertrand Couasnon and Aurelie Lemaitre. Recognition of Tables and Forms , pages 647-677. Springer London, London, 2014. 2
|
||||
- [4] Herv'e D'ejean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), Apr. 2019. http://sac.founderit.com/. 2
|
||||
- [5] Basilios Gatos, Dimitrios Danatsas, Ioannis Pratikakis, and Stavros J Perantonis. Automatic table detection in document images. In International Conference on Pattern Recognition and Image Analysis , pages 609-618. Springer, 2005. 2
|
||||
- [6] Max Gobel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. 2
|
||||
- [7] EA Green and M Krishnamoorthy. Recognition of tables using table grammars. procs. In Symposium on Document Analysis and Recognition (SDAIR'95) , pages 261-277. 2
|
||||
- [8] Khurram Azeem Hashmi, Alain Pagani, Marcus Liwicki, Didier Stricker, and Muhammad Zeshan Afzal. Castabdetectors: Cascade network for table detection in document images with recursive feature pyramid and switchable atrous convolution. Journal of Imaging , 7(10), 2021. 1
|
||||
- [9] Kaiming He, Georgia Gkioxari, Piotr Dollar, and Ross Girshick. Mask r-cnn. In Proceedings of the IEEE International Conference on Computer Vision (ICCV) , Oct 2017. 1
|
||||
- [10] Yelin He, X. Qi, Jiaquan Ye, Peng Gao, Yihao Chen, Bingcong Li, Xin Tang, and Rong Xiao. Pingan-vcgroup's solution for icdar 2021 competition on scientific table image recognition to latex. ArXiv , abs/2105.01846, 2021. 2
|
||||
- [11] Jianying Hu, Ramanujan S Kashi, Daniel P Lopresti, and Gordon Wilfong. Medium-independent table detection. In Document Recognition and Retrieval VII , volume 3967, pages 291-302. International Society for Optics and Photonics, 1999. 2
|
||||
- [12] Matthew Hurst. A constraint-based approach to table structure derivation. In Proceedings of the Seventh International Conference on Document Analysis and Recognition - Volume 2 , ICDAR '03, page 911, USA, 2003. IEEE Computer Society. 2
|
||||
- [13] Thotreingam Kasar, Philippine Barlas, Sebastien Adam, Cl'ement Chatelain, and Thierry Paquet. Learning to detect tables in scanned document images using line information. In 2013 12th International Conference on Document Analysis and Recognition , pages 1185-1189. IEEE, 2013. 2
|
||||
- [14] Pratik Kayal, Mrinal Anand, Harsh Desai, and Mayank Singh. Icdar 2021 competition on scientific table image recognition to latex, 2021. 2
|
||||
- [15] Harold W Kuhn. The hungarian method for the assignment problem. Naval research logistics quarterly , 2(1-2):83-97, 1955. 6
|
||||
|
||||
[2] Zewen Chi, Heyan Huang, Heng-Da Xu, Houjin Yu, Wanxuan Yin, and Xian-Ling Mao. Complicated table structure recognition. arXiv preprint arXiv:1908.04729 , 2019. 3
|
||||
|
||||
[3] Bertrand Couasnon and Aurelie Lemaitre. Recognition of Tables and Forms , pages 647-677. Springer London, London, 2014. 2
|
||||
|
||||
[4] Herv'e D'ejean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), Apr. 2019. http://sac.founderit.com/. 2
|
||||
|
||||
[5] Basilios Gatos, Dimitrios Danatsas, Ioannis Pratikakis, and Stavros J Perantonis. Automatic table detection in document images. In International Conference on Pattern Recognition and Image Analysis , pages 609-618. Springer, 2005. 2
|
||||
|
||||
[6] Max Gobel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. 2
|
||||
|
||||
[7] EA Green and M Krishnamoorthy. Recognition of tables using table grammars. procs. In Symposium on Document Analysis and Recognition (SDAIR'95) , pages 261-277. 2
|
||||
|
||||
[8] Khurram Azeem Hashmi, Alain Pagani, Marcus Liwicki, Didier Stricker, and Muhammad Zeshan Afzal. Castabdetectors: Cascade network for table detection in document images with recursive feature pyramid and switchable atrous convolution. Journal of Imaging , 7(10), 2021. 1
|
||||
|
||||
[9] Kaiming He, Georgia Gkioxari, Piotr Dollar, and Ross Girshick. Mask r-cnn. In Proceedings of the IEEE International Conference on Computer Vision (ICCV) , Oct 2017. 1
|
||||
|
||||
[10] Yelin He, X. Qi, Jiaquan Ye, Peng Gao, Yihao Chen, Bingcong Li, Xin Tang, and Rong Xiao. Pingan-vcgroup's solution for icdar 2021 competition on scientific table image recognition to latex. ArXiv , abs/2105.01846, 2021. 2
|
||||
|
||||
[11] Jianying Hu, Ramanujan S Kashi, Daniel P Lopresti, and Gordon Wilfong. Medium-independent table detection. In Document Recognition and Retrieval VII , volume 3967, pages 291-302. International Society for Optics and Photonics, 1999. 2
|
||||
|
||||
[12] Matthew Hurst. A constraint-based approach to table structure derivation. In Proceedings of the Seventh International Conference on Document Analysis and Recognition - Volume 2 , ICDAR '03, page 911, USA, 2003. IEEE Computer Society. 2
|
||||
|
||||
[13] Thotreingam Kasar, Philippine Barlas, Sebastien Adam, Cl'ement Chatelain, and Thierry Paquet. Learning to detect tables in scanned document images using line information. In 2013 12th International Conference on Document Analysis and Recognition , pages 1185-1189. IEEE, 2013. 2
|
||||
|
||||
[14] Pratik Kayal, Mrinal Anand, Harsh Desai, and Mayank Singh. Icdar 2021 competition on scientific table image recognition to latex, 2021. 2
|
||||
|
||||
[15] Harold W Kuhn. The hungarian method for the assignment problem. Naval research logistics quarterly , 2(1-2):83-97, 1955. 6
|
||||
|
||||
[16] Girish Kulkarni, Visruth Premraj, Vicente Ordonez, Sagnik Dhar, Siming Li, Yejin Choi, Alexander C. Berg, and Tamara L. Berg. Babytalk: Understanding and generating simple image descriptions. IEEE Transactions on Pattern Analysis and Machine Intelligence , 35(12):2891-2903, 2013. 4
|
||||
|
||||
[17] Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou, and Zhoujun Li. Tablebank: A benchmark dataset for table detection and recognition, 2019. 2, 3
|
||||
|
||||
[18] Yiren Li, Zheng Huang, Junchi Yan, Yi Zhou, Fan Ye, and Xianhui Liu. Gfte: Graph-based financial table extraction. In Alberto Del Bimbo, Rita Cucchiara, Stan Sclaroff, Giovanni Maria Farinella, Tao Mei, Marco Bertini, Hugo Jair Escalante, and Roberto Vezzani, editors, Pattern Recognition. ICPR International Workshops and Challenges , pages 644-658, Cham, 2021. Springer International Publishing. 2, 3
|
||||
|
||||
[19] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter Staar. Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence , 35(17):15137-15145, May 2021. 1
|
||||
|
||||
[20] Rujiao Long, Wen Wang, Nan Xue, Feiyu Gao, Zhibo Yang, Yongpan Wang, and Gui-Song Xia. Parsing table structures in the wild. In Proceedings of the IEEE/CVF International Conference on Computer Vision , pages 944-952, 2021. 2
|
||||
|
||||
[21] Shubham Singh Paliwal, D Vishwanath, Rohit Rahul, Monika Sharma, and Lovekesh Vig. Tablenet: Deep learning model for end-to-end table detection and tabular data extraction from scanned document images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 128-133. IEEE, 2019. 1
|
||||
|
||||
[22] Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. Pytorch: An imperative style, high-performance deep learning library. In H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch'e-Buc, E. Fox, and R. Garnett, editors, Advances in Neural Information Processing Systems 32 , pages 8024-8035. Curran Associates, Inc., 2019. 6
|
||||
|
||||
[23] Devashish Prasad, Ayan Gadpal, Kshitij Kapadni, Manish Visave, and Kavita Sultanpure. Cascadetabnet: An approach for end to end table detection and structure recognition from image-based documents. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops , pages 572-573, 2020. 1
|
||||
|
||||
[24] Shah Rukh Qasim, Hassan Mahmood, and Faisal Shafait. Rethinking table recognition using graph neural networks. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 142-147. IEEE, 2019. 3
|
||||
|
||||
[25] Hamid Rezatofighi, Nathan Tsoi, JunYoung Gwak, Amir Sadeghian, Ian Reid, and Silvio Savarese. Generalized intersection over union: A metric and a loss for bounding box regression. In Proceedings of the IEEE/CVF Conference on
|
||||
- [16] Girish Kulkarni, Visruth Premraj, Vicente Ordonez, Sagnik Dhar, Siming Li, Yejin Choi, Alexander C. Berg, and Tamara L. Berg. Babytalk: Understanding and generating simple image descriptions. IEEE Transactions on Pattern Analysis and Machine Intelligence , 35(12):2891-2903, 2013. 4
|
||||
- [17] Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou, and Zhoujun Li. Tablebank: A benchmark dataset for table detection and recognition, 2019. 2, 3
|
||||
- [18] Yiren Li, Zheng Huang, Junchi Yan, Yi Zhou, Fan Ye, and Xianhui Liu. Gfte: Graph-based financial table extraction. In Alberto Del Bimbo, Rita Cucchiara, Stan Sclaroff, Giovanni Maria Farinella, Tao Mei, Marco Bertini, Hugo Jair Escalante, and Roberto Vezzani, editors, Pattern Recognition. ICPR International Workshops and Challenges , pages 644-658, Cham, 2021. Springer International Publishing. 2, 3
|
||||
- [19] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter Staar. Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence , 35(17):15137-15145, May 2021. 1
|
||||
- [20] Rujiao Long, Wen Wang, Nan Xue, Feiyu Gao, Zhibo Yang, Yongpan Wang, and Gui-Song Xia. Parsing table structures in the wild. In Proceedings of the IEEE/CVF International Conference on Computer Vision , pages 944-952, 2021. 2
|
||||
- [21] Shubham Singh Paliwal, D Vishwanath, Rohit Rahul, Monika Sharma, and Lovekesh Vig. Tablenet: Deep learning model for end-to-end table detection and tabular data extraction from scanned document images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 128-133. IEEE, 2019. 1
|
||||
- [22] Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. Pytorch: An imperative style, high-performance deep learning library. In H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch'e-Buc, E. Fox, and R. Garnett, editors, Advances in Neural Information Processing Systems 32 , pages 8024-8035. Curran Associates, Inc., 2019. 6
|
||||
- [23] Devashish Prasad, Ayan Gadpal, Kshitij Kapadni, Manish Visave, and Kavita Sultanpure. Cascadetabnet: An approach for end to end table detection and structure recognition from image-based documents. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops , pages 572-573, 2020. 1
|
||||
- [24] Shah Rukh Qasim, Hassan Mahmood, and Faisal Shafait. Rethinking table recognition using graph neural networks. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 142-147. IEEE, 2019. 3
|
||||
- [25] Hamid Rezatofighi, Nathan Tsoi, JunYoung Gwak, Amir Sadeghian, Ian Reid, and Silvio Savarese. Generalized intersection over union: A metric and a loss for bounding box regression. In Proceedings of the IEEE/CVF Conference on
|
||||
|
||||
Computer Vision and Pattern Recognition , pages 658-666, 2019. 6
|
||||
|
||||
[26] Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 11621167, 2017. 1
|
||||
- [26] Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 11621167, 2017. 1
|
||||
- [27] Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR international conference on document analysis and recognition (ICDAR) , volume 1, pages 1162-1167. IEEE, 2017. 3
|
||||
- [28] Faisal Shafait and Ray Smith. Table detection in heterogeneous documents. In Proceedings of the 9th IAPR International Workshop on Document Analysis Systems , pages 6572, 2010. 2
|
||||
- [29] Shoaib Ahmed Siddiqui, Imran Ali Fateh, Syed Tahseen Raza Rizvi, Andreas Dengel, and Sheraz Ahmed. Deeptabstr: Deep learning based table structure recognition. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1403-1409. IEEE, 2019. 3
|
||||
- [30] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD , KDD '18, pages 774-782, New York, NY, USA, 2018. ACM. 1
|
||||
- [31] Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Ł ukasz Kaiser, and Illia Polosukhin. Attention is all you need. In I. Guyon, U. V. Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett, editors, Advances in Neural Information Processing Systems 30 , pages 5998-6008. Curran Associates, Inc., 2017. 5
|
||||
- [32] Oriol Vinyals, Alexander Toshev, Samy Bengio, and Dumitru Erhan. Show and tell: A neural image caption generator. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) , June 2015. 2
|
||||
- [33] Wenyuan Xue, Qingyong Li, and Dacheng Tao. Res2tim: reconstruct syntactic structures from table images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 749-755. IEEE, 2019. 3
|
||||
- [34] Wenyuan Xue, Baosheng Yu, Wen Wang, Dacheng Tao, and Qingyong Li. Tgrnet: A table graph reconstruction network for table structure recognition. arXiv preprint arXiv:2106.10598 , 2021. 3
|
||||
- [35] Quanzeng You, Hailin Jin, Zhaowen Wang, Chen Fang, and Jiebo Luo. Image captioning with semantic attention. In Proceedings of the IEEE conference on computer vision and pattern recognition , pages 4651-4659, 2016. 4
|
||||
- [36] Xinyi Zheng, Doug Burdick, Lucian Popa, Peter Zhong, and Nancy Xin Ru Wang. Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. Winter Conference for Applications in Computer Vision (WACV) , 2021. 2, 3
|
||||
- [37] Xu Zhong, Elaheh ShafieiBavani, and Antonio Jimeno Yepes. Image-based table recognition: Data, model,
|
||||
|
||||
[27] Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR international conference on document analysis and recognition (ICDAR) , volume 1, pages 1162-1167. IEEE, 2017. 3
|
||||
|
||||
[28] Faisal Shafait and Ray Smith. Table detection in heterogeneous documents. In Proceedings of the 9th IAPR International Workshop on Document Analysis Systems , pages 6572, 2010. 2
|
||||
|
||||
[29] Shoaib Ahmed Siddiqui, Imran Ali Fateh, Syed Tahseen Raza Rizvi, Andreas Dengel, and Sheraz Ahmed. Deeptabstr: Deep learning based table structure recognition. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1403-1409. IEEE, 2019. 3
|
||||
|
||||
[30] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD , KDD '18, pages 774-782, New York, NY, USA, 2018. ACM. 1
|
||||
|
||||
[31] Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Ł ukasz Kaiser, and Illia Polosukhin. Attention is all you need. In I. Guyon, U. V. Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett, editors, Advances in Neural Information Processing Systems 30 , pages 5998-6008. Curran Associates, Inc., 2017. 5
|
||||
|
||||
[32] Oriol Vinyals, Alexander Toshev, Samy Bengio, and Dumitru Erhan. Show and tell: A neural image caption generator. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) , June 2015. 2
|
||||
|
||||
[33] Wenyuan Xue, Qingyong Li, and Dacheng Tao. Res2tim: reconstruct syntactic structures from table images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 749-755. IEEE, 2019. 3
|
||||
|
||||
[34] Wenyuan Xue, Baosheng Yu, Wen Wang, Dacheng Tao, and Qingyong Li. Tgrnet: A table graph reconstruction network for table structure recognition. arXiv preprint arXiv:2106.10598 , 2021. 3
|
||||
|
||||
[35] Quanzeng You, Hailin Jin, Zhaowen Wang, Chen Fang, and Jiebo Luo. Image captioning with semantic attention. In Proceedings of the IEEE conference on computer vision and pattern recognition , pages 4651-4659, 2016. 4
|
||||
|
||||
[36] Xinyi Zheng, Doug Burdick, Lucian Popa, Peter Zhong, and Nancy Xin Ru Wang. Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. Winter Conference for Applications in Computer Vision (WACV) , 2021. 2, 3
|
||||
|
||||
[37] Xu Zhong, Elaheh ShafieiBavani, and Antonio Jimeno Yepes. Image-based table recognition: Data, model,
|
||||
|
||||
and evaluation. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision ECCV 2020 , pages 564-580, Cham, 2020. Springer International Publishing. 2, 3, 7
|
||||
|
||||
[38] Xu Zhong, Jianbin Tang, and Antonio Jimeno Yepes. Publaynet: Largest dataset ever for document layout analysis. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1015-1022, 2019. 1
|
||||
- and evaluation. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision ECCV 2020 , pages 564-580, Cham, 2020. Springer International Publishing. 2, 3, 7
|
||||
- [38] Xu Zhong, Jianbin Tang, and Antonio Jimeno Yepes. Publaynet: Largest dataset ever for document layout analysis. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1015-1022, 2019. 1
|
||||
|
||||
## TableFormer: Table Structure Understanding with Transformers Supplementary Material
@@ -400,15 +349,11 @@ ances in regard to their size, structure, style and content. Every synthetic dat
The process of generating a synthetic dataset can be decomposed into the following steps:
1. Prepare styling and content templates: The styling templates have been manually designed and organized into groups of scope specific appearances (e.g. financial data, marketing data, etc.) Additionally, we have prepared curated collections of content templates by extracting the most frequently used terms out of non-synthetic datasets (e.g. PubTabNet, FinTabNet, etc.).
2. Generate table structures: The structure of each synthetic dataset assumes a horizontal table header which potentially spans over multiple rows and a table body that may contain a combination of row spans and column spans. However, spans are not allowed to cross the header - body boundary. The table structure is described by the parameters: Total number of table rows and columns, number of header rows, type of spans (header only spans, row only spans, column only spans, both row and column spans), maximum span size and the ratio of the table area covered by spans.
3. Generate content: Based on the dataset theme , a set of suitable content templates is chosen first. Then, this content can be combined with purely random text to produce the synthetic content.
4. Apply styling templates: Depending on the domain of the synthetic dataset, a set of styling templates is first manually selected. Then, a style is randomly selected to format the appearance of the synthesized table.
5. Render the complete tables: The synthetic table is finally rendered by a web browser engine to generate the bounding boxes for each table cell. A batching technique is utilized to optimize the runtime overhead of the rendering process.
- 1. Prepare styling and content templates: The styling templates have been manually designed and organized into groups of scope specific appearances (e.g. financial data, marketing data, etc.) Additionally, we have prepared curated collections of content templates by extracting the most frequently used terms out of non-synthetic datasets (e.g. PubTabNet, FinTabNet, etc.).
- 2. Generate table structures: The structure of each synthetic dataset assumes a horizontal table header which potentially spans over multiple rows and a table body that may contain a combination of row spans and column spans. However, spans are not allowed to cross the header - body boundary. The table structure is described by the parameters: Total number of table rows and columns, number of header rows, type of spans (header only spans, row only spans, column only spans, both row and column spans), maximum span size and the ratio of the table area covered by spans.
|
||||
- 3. Generate content: Based on the dataset theme , a set of suitable content templates is chosen first. Then, this content can be combined with purely random text to produce the synthetic content.
|
||||
- 4. Apply styling templates: Depending on the domain of the synthetic dataset, a set of styling templates is first manually selected. Then, a style is randomly selected to format the appearance of the synthesized table.
|
||||
- 5. Render the complete tables: The synthetic table is finally rendered by a web browser engine to generate the bounding boxes for each table cell. A batching technique is utilized to optimize the runtime overhead of the rendering process.
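Below is a rough illustration of step 2 above, not the authors' actual generator: structure parameters are sampled and spans that would cross the header-body boundary are rejected. All function names, parameters, and default values are hypothetical.

```python
import random

def sample_table_structure(n_rows=8, n_cols=5, n_header_rows=2, max_span=3, n_spans=4):
    """Toy structure sampler: returns a list of spans as (row, col, row_span, col_span).

    Spans are re-drawn if they would cross the header-body boundary or extend past the
    table edge; overlaps between spans are not checked in this simplified sketch.
    """
    spans = []
    while len(spans) < n_spans:
        r, c = random.randrange(n_rows), random.randrange(n_cols)
        rs, cs = random.randint(1, max_span), random.randint(1, max_span)
        if r < n_header_rows and r + rs > n_header_rows:
            continue  # span would cross the header-body boundary
        if r + rs > n_rows or c + cs > n_cols:
            continue  # span would extend past the table edge
        spans.append((r, c, rs, cs))
    return spans
```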
|
||||
|
||||
## 2. Prediction post-processing for PDF documents
|
||||
|
||||
@ -416,52 +361,40 @@ Although TableFormer can predict the table structure and the bounding boxes for
|
||||
|
||||
Figure 7: Distribution of the tables across different dimensions per dataset. Simple vs. complex tables per dataset and split, strict vs. non-strict HTML structures per dataset and table complexity, and missing bboxes per dataset and table complexity.
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
· TableFormer output does not include the table cell content.
|
||||
|
||||
· There are occasional inaccuracies in the predictions of the bounding boxes.
|
||||
- · TableFormer output does not include the table cell content.
|
||||
- · There are occasional inaccuracies in the predictions of the bounding boxes.
|
||||
|
||||
However, it is possible to mitigate those limitations by combining the TableFormer predictions with the information already present inside a programmatic PDF document. More specifically, PDF documents can be seen as a sequence of PDF cells where each cell is described by its content and bounding box. If we are able to associate the PDF cells with the predicted table cells, we can directly link the PDF cell content to the table cell structure and use the PDF bounding boxes to correct misalignments in the predicted table cell bounding boxes.
|
||||
|
||||
Here is a step-by-step description of the prediction postprocessing:
|
||||
|
||||
1. Get the minimal grid dimensions - number of rows and columns for the predicted table structure. This represents the most granular grid for the underlying table structure.
|
||||
|
||||
2. Generate pair-wise matches between the bounding boxes of the PDF cells and the predicted cells. The Intersection Over Union (IOU) metric is used to evaluate the quality of the matches.
|
||||
|
||||
3. Use a carefully selected IOU threshold to designate the matches as "good" ones and "bad" ones.
|
||||
|
||||
3.a. If all IOU scores in a column are below the threshold, discard all predictions (structure and bounding boxes) for that column.
|
||||
|
||||
4. Find the best-fitting content alignment for the predicted cells with good IOU per each column. The alignment of the column can be identified by the following formula:
|
||||
- 1. Get the minimal grid dimensions - number of rows and columns for the predicted table structure. This represents the most granular grid for the underlying table structure.
|
||||
- 2. Generate pair-wise matches between the bounding boxes of the PDF cells and the predicted cells. The Intersection Over Union (IOU) metric is used to evaluate the quality of the matches.
|
||||
- 3. Use a carefully selected IOU threshold to designate the matches as "good" ones and "bad" ones.
|
||||
- 3.a. If all IOU scores in a column are below the threshold, discard all predictions (structure and bounding boxes) for that column.
|
||||
- 4. Find the best-fitting content alignment for the predicted cells with good IOU per each column. The alignment of the column can be identified by the following formula:
|
||||
|
||||
$$\text{alignment} = \arg\min_{c} \{ D_{c} \}, \qquad D_{c} = \max \{ x_{c} \} - \min \{ x_{c} \} \qquad (4)$$
|
||||
|
||||
where $c$ is one of { left, centroid, right } and $x_{c}$ is the x-coordinate for the corresponding point.
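A minimal sketch of how Equation (4) can be evaluated for a single column, assuming cell bounding boxes are given as (x_min, y_min, x_max, y_max) tuples; the function name and box format are illustrative and not taken from the paper's code:

```python
def column_alignment(bboxes):
    """Pick the alignment c in {left, centroid, right} with the smallest spread D_c."""
    candidates = {
        "left": [b[0] for b in bboxes],
        "centroid": [(b[0] + b[2]) / 2.0 for b in bboxes],
        "right": [b[2] for b in bboxes],
    }
    spreads = {c: max(xs) - min(xs) for c, xs in candidates.items()}  # D_c per candidate
    return min(spreads, key=spreads.get)  # arg min_c { D_c }
```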
|
||||
|
||||
5. Use the alignment computed in step 4 to compute the median x-coordinate for all table columns and the median cell size for all table cells. The usage of the median during these computations helps to eliminate outliers caused by occasional column spans, which are usually wider than normal.
|
||||
- 5. Use the alignment computed in step 4, to compute the median x -coordinate for all table columns and the me-
|
||||
|
||||
|
||||
|
||||
6. Snap all cells with bad IOU to their corresponding median x -coordinates and cell sizes.
|
||||
|
||||
7. Generate a new set of pair-wise matches between the corrected bounding boxes and PDF cells. This time use a modified version of the IOU metric, where the area of the intersection between the predicted and PDF cells is divided by the PDF cell area. In case there are multiple matches for the same PDF cell, the prediction with the higher score is preferred. This covers the cases where the PDF cells are smaller than the area of predicted or corrected prediction cells.
|
||||
|
||||
8. On some rare occasions, we have noticed that TableFormer can confuse a single column as two. When the post-processing steps are applied, this results in two predicted columns pointing to the same PDF column. In such cases, we de-duplicate the columns according to the highest total column intersection score.
|
||||
|
||||
9. Pick up the remaining orphan cells. There can be cases where, after applying all the previous post-processing steps, some PDF cells still remain without any match to predicted cells. However, it is still possible to deduce the correct matching for an orphan PDF cell by mapping its bounding box onto the geometry of the grid. This mapping decides whether the content of the orphan cell will be appended to an already matched table cell, or whether a new table cell should be created to match the orphan.
|
||||
- 6. Snap all cells with bad IOU to their corresponding median x -coordinates and cell sizes.
|
||||
- 7. Generate a new set of pair-wise matches between the corrected bounding boxes and PDF cells. This time use a modified version of the IOU metric, where the area of the intersection between the predicted and PDF cells is divided by the PDF cell area. In case there are multiple matches for the same PDF cell, the prediction with the higher score is preferred. This covers the cases where the PDF cells are smaller than the area of predicted or corrected prediction cells.
|
||||
- 8. In some rare occasions, we have noticed that TableFormer can confuse a single column as two. When the postprocessing steps are applied, this results with two predicted columns pointing to the same PDF column. In such case we must de-duplicate the columns according to highest total column intersection score.
|
||||
- 9. Pick up the remaining orphan cells. There could be cases, when after applying all the previous post-processing steps, some PDF cells could still remain without any match to predicted cells. However, it is still possible to deduce the correct matching for an orphan PDF cell by mapping its bounding box on the geometry of the grid. This mapping decides if the content of the orphan cell will be appended to an already matched table cell, or a new table cell should be created to match with the orphan.
|
||||
|
||||
9a. Compute the top and bottom boundary of the horizontal band for each grid row (min/max y coordinates per row).
|
||||
|
||||
9b. Intersect the orphan's bounding box with the row bands, and map the cell to the closest grid row.
|
||||
|
||||
9c. Compute the left and right boundary of the vertical band for each grid column (min/max x coordinates per column).
|
||||
|
||||
9d. Intersect the orphan's bounding box with the column bands, and map the cell to the closest grid column.
|
||||
|
||||
9e. If the table cell under the identified row and column is not empty, extend its content with the content of the orphan cell.
|
||||
- 9b. Intersect the orphan's bounding box with the row bands, and map the cell to the closest grid row.
|
||||
- 9c. Compute the left and right boundary of the vertical band for each grid column (min/max x coordinates per column).
|
||||
- 9d. Intersect the orphan's bounding box with the column bands, and map the cell to the closest grid column.
|
||||
- 9e. If the table cell under the identified row and column is not empty, extend its content with the content of the or-
|
||||
|
||||
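The sketch below illustrates the core of the matching procedure (steps 2-3 and step 7), assuming axis-aligned boxes as (x_min, y_min, x_max, y_max) tuples. The threshold value and all function names are illustrative and not taken from the actual implementation.

```python
def _intersection(a, b):
    w = min(a[2], b[2]) - max(a[0], b[0])
    h = min(a[3], b[3]) - max(a[1], b[1])
    return max(0.0, w) * max(0.0, h)

def _area(b):
    return (b[2] - b[0]) * (b[3] - b[1])

def iou(a, b):
    inter = _intersection(a, b)
    return inter / (_area(a) + _area(b) - inter) if inter > 0 else 0.0

def match_pdf_cells(pdf_cells, pred_cells, iou_threshold=0.5):
    """Steps 2-3: keep only the "good" matches whose IOU exceeds the threshold."""
    matches = {}
    for i, pdf_box in enumerate(pdf_cells):
        scores = [iou(pdf_box, pred_box) for pred_box in pred_cells]
        j = max(range(len(scores)), key=scores.__getitem__)
        if scores[j] >= iou_threshold:
            matches[i] = j
    return matches

def rematch_corrected_cells(pdf_cells, corrected_cells):
    """Step 7: modified metric = intersection area / PDF-cell area; keep the best
    prediction when several corrected cells overlap the same PDF cell."""
    matches = {}
    for i, pdf_box in enumerate(pdf_cells):
        scores = [_intersection(pdf_box, box) / _area(pdf_box) for box in corrected_cells]
        j = max(range(len(scores)), key=scores.__getitem__)
        if scores[j] > 0:
            matches[i] = j
    return matches
```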
|
||||
|
||||
@ -473,27 +406,22 @@ Figure 8: Example of a table with multi-line header.
|
||||
|
||||
Figure 9: Example of a table with big empty distance between cells.
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
Figure 10: Example of a complex table with empty cells.
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
Figure 14: Example with multi-line text.
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
Figure 11: Simple table with different style and empty cells.
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
Figure 12: Simple table predictions and post processing.
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
<!-- image -->
|
||||
@ -502,22 +430,18 @@ Figure 12: Simple table predictions and post processing.
|
||||
|
||||
Figure 13: Table predictions example on colorful table.
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
Figure 16: Example of how post-processing helps to restore mis-aligned bounding boxes prediction artifact.
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
<!-- image -->
|
||||
|
||||
Figure 15: Example with triangular table.
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
Figure 17: Example of long table. End-to-end example from initial PDF cells to prediction of bounding boxes, post processing and prediction of structure.
|
||||
|
||||
|
||||
<!-- image -->
|
File diff suppressed because one or more lines are too long
@ -26,7 +26,6 @@ KDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the o
|
||||
|
||||
Figure 1: Four examples of complex page layouts across different document categories
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
<!-- image -->
|
||||
@ -59,17 +58,14 @@ A key problem in the process of document conversion is to understand the structu
|
||||
|
||||
In this paper, we present the DocLayNet dataset. It provides page-by-page layout annotation ground-truth using bounding-boxes for 11 distinct class labels on 80863 unique document pages, of which a fraction carry double- or triple-annotations. DocLayNet is similar in spirit to PubLayNet and DocBank and will likewise be made available to the public 1 in order to stimulate the document-layout analysis community. It distinguishes itself in the following aspects:
|
||||
|
||||
(1) Human Annotation : In contrast to PubLayNet and DocBank, we relied on human annotation instead of automation approaches to generate the data set.
|
||||
|
||||
(2) Large Layout Variability : We include diverse and complex layouts from a large variety of public sources.
|
||||
|
||||
(3) Detailed Label Set : We define 11 class labels to distinguish layout features in high detail. PubLayNet provides 5 labels; DocBank provides 13, although not a superset of ours.
|
||||
|
||||
(4) Redundant Annotations : A fraction of the pages in the DocLayNet data set carry more than one human annotation.
|
||||
- (1) Human Annotation : In contrast to PubLayNet and DocBank, we relied on human annotation instead of automation approaches to generate the data set.
|
||||
- (2) Large Layout Variability : We include diverse and complex layouts from a large variety of public sources.
|
||||
- (3) Detailed Label Set : We define 11 class labels to distinguish layout features in high detail. PubLayNet provides 5 labels; DocBank provides 13, although not a superset of ours.
|
||||
- (4) Redundant Annotations : A fraction of the pages in the DocLayNet data set carry more than one human annotation.
|
||||
|
||||
This enables experimentation with annotation uncertainty and quality control analysis.
|
||||
|
||||
(5) Pre-defined Train-, Test- & Validation-set : Like DocBank, we provide fixed train-, test- & validation-sets to ensure proportional representation of the class-labels. Further, we prevent leakage of unique layouts across sets, which has a large effect on model accuracy scores.
|
||||
- (5) Pre-defined Train-, Test- & Validation-set : Like DocBank, we provide fixed train-, test- & validation-sets to ensure proportional representation of the class-labels. Further, we prevent leakage of unique layouts across sets, which has a large effect on model accuracy scores.
|
||||
|
||||
All aspects outlined above are detailed in Section 3. In Section 4, we will elaborate on how we designed and executed this large-scale human annotation campaign. We will also share key insights and lessons learned that might prove helpful for other parties planning to set up annotation campaigns.
|
||||
|
||||
@ -89,7 +85,6 @@ In addition to open intellectual property constraints for the source documents,
|
||||
|
||||
Figure 2: Distribution of DocLayNet pages across document categories.
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
to a minimum, since they introduce difficulties in annotation (see Section 4). As a second condition, we focussed on medium to large documents ( > 10 pages) with technical content, dense in complex tables, figures, plots and captions. Such documents carry a lot of information value, but are often hard to analyse with high accuracy due to their challenging layouts. Counterexamples of documents not included in the dataset are receipts, invoices, hand-written documents or photographs showing "text in the wild".
|
||||
@ -112,7 +107,6 @@ The annotation campaign was carried out in four phases. In phase one, we identif
|
||||
|
||||
Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as % of row "Total") in the train, test and validation sets. The inter-annotator agreement is computed as the mAP@0.5-0.95 metric between pairwise annotations from the triple-annotated pages, from which we obtain accuracy ranges.
|
||||
|
||||
|
||||
| | | % of Total | % of Total | % of Total | % of Total | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) | triple inter-annotator mAP @ 0.5-0.95 (%) |
|
||||
|----------------|---------|--------------|--------------|--------------|--------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|---------------------------------------------|
|
||||
| class label | Count | Train | Test | Val | All | Fin | Man | Sci | Law | Pat | Ten |
|
||||
@ -131,7 +125,6 @@ Table 1: DocLayNet dataset overview. Along with the frequency of each class labe
|
||||
|
||||
Figure 3: Corpus Conversion Service annotation user interface. The PDF page is shown in the background, with overlaid text-cells (in darker shades). The annotation boxes can be drawn by dragging a rectangle over each segment with the respective label from the palette on the right.
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
we distributed the annotation workload and performed continuous quality controls. Phase one and two required a small team of experts only. For phases three and four, a group of 40 dedicated annotators were assembled and supervised.
|
||||
@ -150,17 +143,12 @@ At first sight, the task of visual document-layout interpretation appears intuit
|
||||
|
||||
Obviously, this inconsistency in annotations is not desirable for datasets which are intended to be used for model training. To minimise these inconsistencies, we created a detailed annotation guideline. While perfect consistency across 40 annotation staff members is clearly not possible to achieve, we saw a huge improvement in annotation consistency after the introduction of our annotation guideline. A few selected, non-trivial highlights of the guideline are:
|
||||
|
||||
(1) Every list-item is an individual object instance with class label List-item . This definition is different from PubLayNet and DocBank, where all list-items are grouped together into one List object.
|
||||
|
||||
(2) A List-item is a paragraph with hanging indentation. Single-line elements can qualify as List-item if the neighbour elements expose hanging indentation. Bullet or enumeration symbols are not a requirement.
|
||||
|
||||
(3) For every Caption , there must be exactly one corresponding Picture or Table .
|
||||
|
||||
(4) Connected sub-pictures are grouped together in one Picture object.
|
||||
|
||||
(5) Formula numbers are included in a Formula object.
|
||||
|
||||
(6) Emphasised text (e.g. in italic or bold) at the beginning of a paragraph is not considered a Section-header , unless it appears exclusively on its own line.
|
||||
- (1) Every list-item is an individual object instance with class label List-item . This definition is different from PubLayNet and DocBank, where all list-items are grouped together into one List object.
|
||||
- (2) A List-item is a paragraph with hanging indentation. Singleline elements can qualify as List-item if the neighbour elements expose hanging indentation. Bullet or enumeration symbols are not a requirement.
|
||||
- (3) For every Caption , there must be exactly one corresponding Picture or Table .
|
||||
- (4) Connected sub-pictures are grouped together in one Picture object.
|
||||
- (5) Formula numbers are included in a Formula object.
|
||||
- (6) Emphasised text (e.g. in italic or bold) at the beginning of a paragraph is not considered a Section-header , unless it appears exclusively on its own line.
|
||||
|
||||
The complete annotation guideline is over 100 pages long and a detailed description is obviously out of scope for this paper. Nevertheless, it will be made publicly available alongside with DocLayNet for future reference.
|
||||
|
||||
@ -168,7 +156,6 @@ Phase 3: Training. After a first trial with a small group of people, we realised
|
||||
|
||||
Figure 4: Examples of plausible annotation alternatives for the same page. Criteria in our annotation guideline can resolve cases A to C, while the case D remains ambiguous.
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
were carried out over a timeframe of 12 weeks, after which 8 of the 40 initially allocated annotators did not pass the bar.
|
||||
@ -177,7 +164,6 @@ Phase 4: Production annotation. The previously selected 80K pages were annotated
|
||||
|
||||
Table 2: Prediction performance (mAP@0.5-0.95) of object detection networks on DocLayNet test set. The MRCNN (Mask R-CNN) and FRCNN (Faster R-CNN) models with ResNet-50 or ResNet-101 backbone were trained based on the network architectures from the detectron2 model zoo (Mask R-CNN R50, R101-FPN 3x, Faster R-CNN R101-FPN 3x), with default configurations. The YOLO implementation utilized was YOLOv5x6 [13]. All models were initialised using pre-trained weights from the COCO 2017 dataset.
|
||||
|
||||
|
||||
| | human | MRCNN | MRCNN | FRCNN | YOLO |
|
||||
|----------------|---------|---------|---------|---------|--------|
|
||||
| | human | R50 | R101 | R101 | v5x6 |
|
||||
@ -202,7 +188,6 @@ The primary goal of DocLayNet is to obtain high-quality ML models capable of acc
|
||||
|
||||
Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask R-CNN network with ResNet50 backbone trained on increasing fractions of the DocLayNet dataset. The learning curve flattens around the 80% mark, indicating that increasing the size of the DocLayNet dataset with similar data will not yield significantly better predictions.
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
paper and leave the detailed evaluation of more recent methods mentioned in Section 2 for future work.
|
||||
@ -217,7 +202,6 @@ Table 3: Performance of a Mask R-CNN R50 network in mAP@0.5-0.95 scores trained
|
||||
|
||||
Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wise split for different label sets. Naive page-wise split will result in ~10% point improvement.
|
||||
|
||||
|
||||
| Class-count | 11 | 6 | 5 | 4 |
|
||||
|----------------|------|---------|---------|---------|
|
||||
| Caption | 68 | Text | Text | Text |
|
||||
@ -269,7 +253,6 @@ Throughout this paper, we claim that DocLayNet's wider variety of document layou
|
||||
|
||||
Table 5: Prediction Performance (mAP@0.5-0.95) of a Mask R-CNN R50 network across the PubLayNet, DocBank & DocLayNet data-sets. By evaluating on common label classes of each dataset, we observe that the DocLayNet-trained model has much less pronounced variations in performance across all datasets.
|
||||
|
||||
|
||||
| | | Testing on | Testing on | Testing on |
|
||||
|-----------------|------------|--------------|--------------|--------------|
|
||||
| Training on | labels | PLN | DB | DLN |
|
||||
@ -305,57 +288,35 @@ To date, there is still a significant gap between human and ML accuracy on the l
|
||||
|
||||
## REFERENCES
|
||||
|
||||
[1] Max Göbel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013.
|
||||
|
||||
[2] Christian Clausner, Apostolos Antonacopoulos, and Stefan Pletschacher. Icdar2017 competition on recognition of documents with complex layouts rdcl2017. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 1404-1410, 2017.
|
||||
|
||||
[3] Hervé Déjean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), April 2019. http://sac.founderit.com/.
|
||||
|
||||
[4] Antonio Jimeno Yepes, Peter Zhong, and Douglas Burdick. Competition on scientific literature parsing. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 605-617. LNCS 12824, SpringerVerlag, sep 2021.
|
||||
|
||||
[5] Logan Markewich, Hao Zhang, Yubin Xing, Navid Lambert-Shirzad, Jiang Zhexin, Roy Lee, Zhi Li, and Seok-Bum Ko. Segmentation for document layout analysis: not dead yet. International Journal on Document Analysis and Recognition (IJDAR) , pages 1-11, 01 2022.
|
||||
|
||||
[6] Xu Zhong, Jianbin Tang, and Antonio Jimeno-Yepes. Publaynet: Largest dataset ever for document layout analysis. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 1015-1022, sep 2019.
|
||||
|
||||
[7] Minghao Li, Yiheng Xu, Lei Cui, Shaohan Huang, Furu Wei, Zhoujun Li, and Ming Zhou. Docbank: A benchmark dataset for document layout analysis. In Proceedings of the 28th International Conference on Computational Linguistics , COLING, pages 949-960. International Committee on Computational Linguistics, dec 2020.
|
||||
|
||||
[8] Riaz Ahmad, Muhammad Tanvir Afzal, and M. Qadir. Information extraction from pdf sources based on rule-based system using integrated formats. In SemWebEval@ESWC , 2016.
|
||||
|
||||
[9] Ross B. Girshick, Jeff Donahue, Trevor Darrell, and Jitendra Malik. Rich feature hierarchies for accurate object detection and semantic segmentation. In IEEE Conference on Computer Vision and Pattern Recognition , CVPR, pages 580-587. IEEE Computer Society, jun 2014.
|
||||
|
||||
[10] Ross B. Girshick. Fast R-CNN. In 2015 IEEE International Conference on Computer Vision , ICCV, pages 1440-1448. IEEE Computer Society, dec 2015.
|
||||
|
||||
[11] Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. Faster r-cnn: Towards real-time object detection with region proposal networks. IEEE Transactions on Pattern Analysis and Machine Intelligence , 39(6):1137-1149, 2017.
|
||||
|
||||
[12] Kaiming He, Georgia Gkioxari, Piotr Dollár, and Ross B. Girshick. Mask R-CNN. In IEEE International Conference on Computer Vision , ICCV, pages 2980-2988. IEEE Computer Society, Oct 2017.
|
||||
|
||||
[13] Glenn Jocher, Alex Stoken, Ayush Chaurasia, Jirka Borovec, NanoCode012, TaoXie, Yonghye Kwon, Kalen Michael, Liu Changyu, Jiacong Fang, Abhiram V, Laughing, tkianai, yxNONG, Piotr Skalski, Adam Hogan, Jebastin Nadar, imyhxy, Lorenzo Mammana, Alex Wang, Cristi Fati, Diego Montes, Jan Hajek, Laurentiu
|
||||
- [1] Max Göbel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013.
|
||||
- [2] Christian Clausner, Apostolos Antonacopoulos, and Stefan Pletschacher. Icdar2017 competition on recognition of documents with complex layouts rdcl2017. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 1404-1410, 2017.
|
||||
- [3] Hervé Déjean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), April 2019. http://sac.founderit.com/.
|
||||
- [4] Antonio Jimeno Yepes, Peter Zhong, and Douglas Burdick. Competition on scientific literature parsing. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 605-617. LNCS 12824, SpringerVerlag, sep 2021.
|
||||
- [5] Logan Markewich, Hao Zhang, Yubin Xing, Navid Lambert-Shirzad, Jiang Zhexin, Roy Lee, Zhi Li, and Seok-Bum Ko. Segmentation for document layout analysis: not dead yet. International Journal on Document Analysis and Recognition (IJDAR) , pages 1-11, 01 2022.
|
||||
- [6] Xu Zhong, Jianbin Tang, and Antonio Jimeno-Yepes. Publaynet: Largest dataset ever for document layout analysis. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 1015-1022, sep 2019.
|
||||
- [7] Minghao Li, Yiheng Xu, Lei Cui, Shaohan Huang, Furu Wei, Zhoujun Li, and Ming Zhou. Docbank: A benchmark dataset for document layout analysis. In Proceedings of the 28th International Conference on Computational Linguistics , COLING, pages 949-960. International Committee on Computational Linguistics, dec 2020.
|
||||
- [8] Riaz Ahmad, Muhammad Tanvir Afzal, and M. Qadir. Information extraction from pdf sources based on rule-based system using integrated formats. In SemWebEval@ESWC , 2016.
|
||||
- [9] Ross B. Girshick, Jeff Donahue, Trevor Darrell, and Jitendra Malik. Rich feature hierarchies for accurate object detection and semantic segmentation. In IEEE Conference on Computer Vision and Pattern Recognition , CVPR, pages 580-587. IEEE Computer Society, jun 2014.
|
||||
- [10] Ross B. Girshick. Fast R-CNN. In 2015 IEEE International Conference on Computer Vision , ICCV, pages 1440-1448. IEEE Computer Society, dec 2015.
|
||||
- [11] Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. Faster r-cnn: Towards real-time object detection with region proposal networks. IEEE Transactions on Pattern Analysis and Machine Intelligence , 39(6):1137-1149, 2017.
|
||||
- [12] Kaiming He, Georgia Gkioxari, Piotr Dollár, and Ross B. Girshick. Mask R-CNN. In IEEE International Conference on Computer Vision , ICCV, pages 2980-2988. IEEE Computer Society, Oct 2017.
|
||||
- [13] Glenn Jocher, Alex Stoken, Ayush Chaurasia, Jirka Borovec, NanoCode012, TaoXie, Yonghye Kwon, Kalen Michael, Liu Changyu, Jiacong Fang, Abhiram V, Laughing, tkianai, yxNONG, Piotr Skalski, Adam Hogan, Jebastin Nadar, imyhxy, Lorenzo Mammana, Alex Wang, Cristi Fati, Diego Montes, Jan Hajek, Laurentiu
|
||||
|
||||
Figure 6 legend (class labels): Text, Caption, List-Item, Formula, Table, Section-Header, Picture, Page-Header, Page-Footer, Title
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
Figure 6: Example layout predictions on selected pages from the DocLayNet test-set. (A, D) exhibit favourable results on coloured backgrounds. (B, C) show accurate list-item and paragraph differentiation despite densely-spaced lines. (E) demonstrates good table and figure distinction. (F) shows predictions on a Chinese patent with multiple overlaps, label confusion and missing boxes.
|
||||
|
||||
Diaconu, Mai Thanh Minh, Marc, albinxavi, fatih, oleg, and wanghao yang. ultralytics/yolov5: v6.0 - yolov5n nano models, roboflow integration, tensorflow export, opencv dnn support, October 2021.
|
||||
|
||||
[14] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-end object detection with transformers. CoRR , abs/2005.12872, 2020.
|
||||
|
||||
[15] Mingxing Tan, Ruoming Pang, and Quoc V. Le. Efficientdet: Scalable and efficient object detection. CoRR , abs/1911.09070, 2019.
|
||||
|
||||
[16] Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev, Ross B. Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Dollár, and C. Lawrence Zitnick. Microsoft COCO: common objects in context, 2014.
|
||||
|
||||
[17] Yuxin Wu, Alexander Kirillov, Francisco Massa, Wan-Yen Lo, and Ross Girshick. Detectron2, 2019.
|
||||
|
||||
[18] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter W. J. Staar. Robust pdf document conversion using recurrent neural networks. In Proceedings of the 35th Conference on Artificial Intelligence , AAAI, pages 15137-15145, feb 2021.
|
||||
|
||||
[19] Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and Ming Zhou. Layoutlm: Pre-training of text and layout for document image understanding. In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 1192-1200, New York, USA, 2020. Association for Computing Machinery.
|
||||
|
||||
[20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021.
|
||||
|
||||
[21] Peng Zhang, Can Li, Liang Qiao, Zhanzhan Cheng, Shiliang Pu, Yi Niu, and Fei Wu. Vsr: A unified framework for document layout analysis combining vision, semantics and relations, 2021.
|
||||
|
||||
[22] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 774-782. ACM, 2018.
|
||||
|
||||
[23] Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation for deep learning. Journal of Big Data , 6(1):60, 2019.
|
||||
- [14] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-end object detection with transformers. CoRR , abs/2005.12872, 2020.
|
||||
- [15] Mingxing Tan, Ruoming Pang, and Quoc V. Le. Efficientdet: Scalable and efficient object detection. CoRR , abs/1911.09070, 2019.
|
||||
- [16] Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev, Ross B. Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Dollár, and C. Lawrence Zitnick. Microsoft COCO: common objects in context, 2014.
|
||||
- [17] Yuxin Wu, Alexander Kirillov, Francisco Massa, Wan-Yen Lo, and Ross Girshick. Detectron2, 2019.
|
||||
- [18] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter W. J. Staar. Robust pdf document conversion using recurrent neural networks. In Proceedings of the 35th Conference on Artificial Intelligence , AAAI, pages 1513715145, feb 2021.
|
||||
- [19] Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and Ming Zhou. Layoutlm: Pre-training of text and layout for document image understanding. In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 1192-1200, New York, USA, 2020. Association for Computing Machinery.
|
||||
- [20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021.
|
||||
- [21] Peng Zhang, Can Li, Liang Qiao, Zhanzhan Cheng, Shiliang Pu, Yi Niu, and Fei Wu. Vsr: A unified framework for document layout analysis combining vision, semantics and relations, 2021.
|
||||
- [22] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 774-782. ACM, 2018.
|
||||
- [23] Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation for deep learning. Journal of Big Data , 6(1):60, 2019.
|
File diff suppressed because one or more lines are too long
@ -6,7 +6,6 @@ We have chosen the PubTabNet data set to perform HPO, since it includes a highly
|
||||
|
||||
Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart.
|
||||
|
||||
|
||||
| # | # | Language | TEDs | TEDs | TEDs | mAP | Inference |
|
||||
|------------|------------|------------|-------------|-------------|-------------|-------------|-------------|
|
||||
| enc-layers | dec-layers | Language | simple | complex | all | (0.75) | time (secs) |
|
||||
|
File diff suppressed because one or more lines are too long
@ -16,7 +16,6 @@ In modern document understanding systems [1,15], table extraction is typically a
|
||||
|
||||
Fig. 1. Comparison between HTML and OTSL table structure representation: (A) table-example with complex row and column headers, including a 2D empty span, (B) minimal graphical representation of table structure using rectangular layout, (C) HTML representation, (D) OTSL representation. This example demonstrates many of the key features of OTSL, namely its reduced vocabulary size (12 versus 5 in this case), its reduced sequence length (55 versus 30) and an enhanced internal structure (variable token sequence length per row in HTML versus a fixed length of rows in OTSL).
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
today, table detection in documents is a well understood problem, and the latest state-of-the-art (SOTA) object detection methods provide an accuracy comparable to human observers [7,8,10,14,23]. On the other hand, the problem of table structure recognition (TSR) is a lot more challenging and remains a very active area of research, in which many novel machine learning algorithms are being explored [3,4,5,9,11,12,13,14,17,18,21,22].
|
||||
@ -47,7 +46,6 @@ ulary and can be interpreted as a table structure. For example, with the HTML to
|
||||
|
||||
Fig. 2. Frequency of tokens in HTML and OTSL as they appear in PubTabNet.
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
Obviously, HTML and other general-purpose markup languages were not designed for Im2Seq models. As such, they have some serious drawbacks. First, the token vocabulary needs to be artificially large in order to describe all plausible tabular structures. Since most Im2Seq models use an autoregressive approach, they generate the sequence token by token. Therefore, to reduce inference time, a shorter sequence length is critical. Every table-cell is represented by at least two tokens ( <td> and </td> ). Furthermore, when tokenizing the HTML structure, one needs to explicitly enumerate possible column-spans and row-spans as words. In practice, this ends up requiring 28 different HTML tokens (when including column- and row-spans up to 10 cells) just to describe every table in the PubTabNet dataset. Clearly, not every token is equally represented, as is depicted in Figure 2. This skewed distribution of tokens in combination with variable token row-length makes it challenging for models to learn the HTML structure.
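A small back-of-the-envelope comparison makes the sequence-length argument concrete. Assuming one token per HTML tag and ignoring spans and the outer table/thead/tbody wrappers (a simplification, not the paper's exact tokenization), the counts for a plain table are:

```python
def html_token_count(rows, cols):
    # <tr> and </tr> per row, plus <td> and </td> per cell
    return rows * (2 + 2 * cols)

def otsl_token_count(rows, cols):
    # one "C" token per cell plus one "NL" token per row
    return rows * (cols + 1)

print(html_token_count(5, 4), otsl_token_count(5, 4))  # 50 vs. 25 for a 5x4 table
```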
|
||||
@ -70,40 +68,31 @@ In Figure 3, we illustrate how the OTSL is defined. In essence, the OTSL defines
|
||||
|
||||
The OTSL vocabulary is comprised of the following tokens:
|
||||
|
||||
-"C" cell a new table cell that either has or does not have cell content
|
||||
|
||||
-"L" cell left-looking cell , merging with the left neighbor cell to create a span
|
||||
|
||||
-"U" cell up-looking cell , merging with the upper neighbor cell to create a span
|
||||
|
||||
-"X" cell cross cell , to merge with both left and upper neighbor cells
|
||||
|
||||
-"NL" new-line , switch to the next row.
|
||||
- -"C" cell a new table cell that either has or does not have cell content
|
||||
- -"L" cell left-looking cell , merging with the left neighbor cell to create a span
|
||||
- -"U" cell up-looking cell , merging with the upper neighbor cell to create a span
|
||||
- -"X" cell cross cell , to merge with both left and upper neighbor cells
|
||||
- -"NL" new-line , switch to the next row.
|
||||
|
||||
A notable attribute of OTSL is that it has the capability of achieving lossless conversion to HTML.
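A minimal sketch of this recovery, mapping a flat OTSL token sequence back to a rectangular grid in which every position points to the top-left "C" cell of its span; the function name and output convention are illustrative, not the library's converter:

```python
def otsl_to_grid(tokens):
    """Resolve "C"/"L"/"U"/"X" tokens into the (row, col) origin of their spanning cell."""
    rows, current = [], []
    for tok in tokens:
        if tok == "NL":
            rows.append(current)
            current = []
        else:
            current.append(tok)
    grid = [[None] * len(rows[0]) for _ in rows]
    for r, row in enumerate(rows):
        for c, tok in enumerate(row):
            if tok == "C":
                grid[r][c] = (r, c)           # a new cell starts here
            elif tok == "L":
                grid[r][c] = grid[r][c - 1]   # merge with the left neighbour
            elif tok == "U":
                grid[r][c] = grid[r - 1][c]   # merge with the upper neighbour
            elif tok == "X":
                grid[r][c] = grid[r][c - 1]   # for valid OTSL, left and upper share one origin
    return grid

# Example: a 2x2 table whose two header cells are merged horizontally.
print(otsl_to_grid(["C", "L", "NL", "C", "C", "NL"]))
# [[(0, 0), (0, 0)], [(1, 0), (1, 1)]]
```

From such a grid, the rowspan and colspan attributes of the HTML cells can be read off by counting how many grid positions share the same origin.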
|
||||
|
||||
Fig. 3. OTSL description of table structure: A - table example; B - graphical representation of table structure; C - mapping structure on a grid; D - OTSL structure encoding; E - explanation on cell encoding
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
## 4.2 Language Syntax
|
||||
|
||||
The OTSL representation follows these syntax rules:
|
||||
|
||||
1. Left-looking cell rule : The left neighbour of an "L" cell must be either another "L" cell or a "C" cell.
|
||||
|
||||
2. Up-looking cell rule : The upper neighbour of a "U" cell must be either another "U" cell or a "C" cell.
|
||||
- 1. Left-looking cell rule : The left neighbour of an "L" cell must be either another "L" cell or a "C" cell.
|
||||
- 2. Up-looking cell rule : The upper neighbour of a "U" cell must be either another "U" cell or a "C" cell.
|
||||
|
||||
3. Cross cell rule : The left neighbour of an "X" cell must be either another "X" cell or a "U" cell, and the upper neighbour of an "X" cell must be either another "X" cell or an "L" cell.
|
||||
|
||||
4. First row rule : Only "L" cells and "C" cells are allowed in the first row.
|
||||
|
||||
5. First column rule : Only "U" cells and "C" cells are allowed in the first column.
|
||||
|
||||
6. Rectangular rule : The table representation is always rectangular - all rows must have an equal number of tokens, terminated with "NL" token.
|
||||
- The left neighbour of an "X" cell must be either another "X" cell or a "U" cell, and the upper neighbour of an "X" cell must be either another "X" cell or an "L" cell.
|
||||
- 4. First row rule : Only "L" cells and "C" cells are allowed in the first row.
|
||||
- 5. First column rule : Only "U" cells and "C" cells are allowed in the first column.
|
||||
- 6. Rectangular rule : The table representation is always rectangular - all rows must have an equal number of tokens, terminated with "NL" token.
|
||||
|
||||
The application of these rules gives OTSL a set of unique properties. First of all, the OTSL enforces a strictly rectangular structure representation, where every new-line token starts a new row. As a consequence, all rows and all columns have exactly the same number of tokens, irrespective of cell spans. Secondly, the OTSL representation is unambiguous: every table structure is represented in exactly one way. In this representation every table cell corresponds to a "C"-cell token, which in case of spans is always located in the top-left corner of the table cell definition. Third, OTSL syntax rules are only backward-looking. As a consequence, every predicted token can be validated directly during sequence generation by looking at the previously predicted sequence. As such, OTSL can guarantee that every predicted sequence is syntactically valid.
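Because the rules are backward-looking, a left-to-right validator only ever needs the current row and the previous row. The following sketch checks a token sequence against rules 1-6; the function and argument names are illustrative and not taken from the paper's code:

```python
def is_valid_otsl(tokens, n_cols):
    """Backward-looking check of an OTSL sequence against syntax rules 1-6."""
    grid, row = [], []
    for tok in tokens:
        if tok == "NL":
            if len(row) != n_cols:
                return False                      # rule 6: rows must be rectangular
            grid.append(row)
            row = []
            continue
        if tok not in ("C", "L", "U", "X"):
            return False                          # unknown token
        r, c = len(grid), len(row)
        left = row[c - 1] if c > 0 else None
        up = grid[r - 1][c] if r > 0 and c < len(grid[r - 1]) else None
        if r == 0 and tok in ("U", "X"):
            return False                          # rule 4: first row
        if c == 0 and tok in ("L", "X"):
            return False                          # rule 5: first column
        if tok == "L" and left not in ("L", "C"):
            return False                          # rule 1: left-looking cell
        if tok == "U" and up not in ("U", "C"):
            return False                          # rule 2: up-looking cell
        if tok == "X" and (left not in ("X", "U") or up not in ("X", "L")):
            return False                          # rule 3: cross cell
        row.append(tok)
    return not row                                # sequence must end with an "NL"

print(is_valid_otsl(["C", "L", "NL", "C", "C", "NL"], n_cols=2))  # True
print(is_valid_otsl(["L", "C", "NL"], n_cols=2))                  # False: "L" may not start a row
```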
|
||||
|
||||
@ -121,7 +110,6 @@ To evaluate the impact of OTSL on prediction accuracy and inference times, we co
|
||||
|
||||
Fig. 4. Architecture sketch of the TableFormer model, which is a representative for the Im2Seq approach.
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
We rely on standard metrics such as Tree Edit Distance score (TEDs) for table structure prediction, and Mean Average Precision (mAP) with 0.75 Intersection Over Union (IOU) threshold for the bounding-box predictions of table cells. The predicted OTSL structures were converted back to HTML format in
|
||||
@ -134,7 +122,6 @@ We have chosen the PubTabNet data set to perform HPO, since it includes a highly
|
||||
|
||||
Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart.
|
||||
|
||||
|
||||
| # | # | Language | TEDs | TEDs | TEDs | mAP | Inference |
|
||||
|------------|------------|------------|-------------|-------------|-------------|-------------|-------------|
|
||||
| enc-layers | dec-layers | Language | simple | complex | all | (0.75) | time (secs) |
|
||||
@ -152,7 +139,6 @@ Additionally, the results show that OTSL has an advantage over HTML when applied
|
||||
|
||||
Table 2. TSR and cell detection results compared between OTSL and HTML on the PubTabNet [22], FinTabNet [21] and PubTables-1M [14] data sets using TableFormer [9] (with enc=6, dec=6, heads=8).
|
||||
|
||||
|
||||
| | Language | TEDs | TEDs | TEDs | mAP(0.75) | Inference time (secs) |
|
||||
|--------------|------------|--------|---------|--------|-------------|-------------------------|
|
||||
| | Language | simple | complex | all | mAP(0.75) | Inference time (secs) |
|
||||
@ -169,7 +155,6 @@ To illustrate the qualitative differences between OTSL and HTML, Figure 5 demons
|
||||
|
||||
Fig. 5. The OTSL model produces more accurate bounding boxes with less overlap (E) than the HTML model (D), when predicting the structure of a sparse table (A), at twice the inference speed because of shorter sequence length (B), (C). "PMC2807444_006_00.png", PubTabNet.
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
|
||||
@ -178,7 +163,6 @@ Fig. 5. The OTSL model produces more accurate bounding boxes with less overlap (
|
||||
|
||||
Fig. 6. Visualization of predicted structure and detected bounding boxes on a complex table with many rows. The OTSL model (B) captured the repeating pattern of horizontally merged cells from the GT (A), unlike the HTML model (C). The HTML model also did not complete the HTML sequence correctly and displayed much more drift and overlap of bounding boxes. "PMC5406406_003_01.png", PubTabNet.
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
## 6 Conclusion
|
||||
@ -191,48 +175,28 @@ Secondly, OTSL has more inherent structure and a significantly restricted vocabu
|
||||
|
||||
## References
|
||||
|
||||
1. Auer, C., Dolfi, M., Carvalho, A., Ramis, C.B., Staar, P.W.J.: Delivering document conversion as a cloud service with high throughput and responsiveness. CoRR abs/2206.00785 (2022). https://doi.org/10.48550/arXiv.2206.00785
|
||||
- 1. Auer, C., Dolfi, M., Carvalho, A., Ramis, C.B., Staar, P.W.J.: Delivering document conversion as a cloud service with high throughput and responsiveness. CoRR abs/2206.00785 (2022). https://doi.org/10.48550/arXiv.2206.00785 , https://doi.org/10.48550/arXiv.2206.00785
|
||||
- 2. Chen, B., Peng, D., Zhang, J., Ren, Y., Jin, L.: Complex table structure recognition in the wild using transformer and identity matrix-based augmentation. In: Porwal, U., Fornés, A., Shafait, F. (eds.) Frontiers in Handwriting Recognition. pp. 545561. Springer International Publishing, Cham (2022)
|
||||
- 3. Chi, Z., Huang, H., Xu, H.D., Yu, H., Yin, W., Mao, X.L.: Complicated table structure recognition. arXiv preprint arXiv:1908.04729 (2019)
|
||||
- 4. Deng, Y., Rosenberg, D., Mann, G.: Challenges in end-to-end neural scientific table recognition. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 894-901. IEEE (2019)
|
||||
|
||||
2. Chen, B., Peng, D., Zhang, J., Ren, Y., Jin, L.: Complex table structure recognition in the wild using transformer and identity matrix-based augmentation. In: Porwal, U., Fornés, A., Shafait, F. (eds.) Frontiers in Handwriting Recognition. pp. 545-561. Springer International Publishing, Cham (2022)
|
||||
- 5. Kayal, P., Anand, M., Desai, H., Singh, M.: Tables to latex: structure and content extraction from scientific tables. International Journal on Document Analysis and Recognition (IJDAR) pp. 1-10 (2022)
|
||||
- 6. Lee, E., Kwon, J., Yang, H., Park, J., Lee, S., Koo, H.I., Cho, N.I.: Table structure recognition based on grid shape graph. In: 2022 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC). pp. 18681873. IEEE (2022)
|
||||
- 7. Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: A benchmark dataset for table detection and recognition (2019)
|
||||
- 8. Livathinos, N., Berrospi, C., Lysak, M., Kuropiatnyk, V., Nassar, A., Carvalho, A., Dolfi, M., Auer, C., Dinkla, K., Staar, P.: Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence 35 (17), 15137-15145 (May 2021), https://ojs.aaai.org/index.php/ AAAI/article/view/17777
|
||||
- 9. Nassar, A., Livathinos, N., Lysak, M., Staar, P.: Tableformer: Table structure understanding with transformers. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 4614-4623 (June 2022)
|
||||
- 10. Pfitzmann, B., Auer, C., Dolfi, M., Nassar, A.S., Staar, P.W.J.: Doclaynet: A large human-annotated dataset for document-layout segmentation. In: Zhang, A., Rangwala, H. (eds.) KDD '22: The 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, Washington, DC, USA, August 14 - 18, 2022. pp. 3743-3751. ACM (2022). https://doi.org/10.1145/3534678.3539043 , https:// doi.org/10.1145/3534678.3539043
|
||||
- 11. Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet: An approach for end to end table detection and structure recognition from imagebased documents. In: Proceedings of the IEEE/CVF conference on computer vision and pattern recognition workshops. pp. 572-573 (2020)
|
||||
- 12. Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162-1167. IEEE (2017)
|
||||
- 13. Siddiqui, S.A., Fateh, I.A., Rizvi, S.T.R., Dengel, A., Ahmed, S.: Deeptabstr: Deep learning based table structure recognition. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1403-1409 (2019). https:// doi.org/10.1109/ICDAR.2019.00226
|
||||
- 14. Smock, B., Pesala, R., Abraham, R.: PubTables-1M: Towards comprehensive table extraction from unstructured documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 4634-4642 (June 2022)
|
||||
- 15. Staar, P.W.J., Dolfi, M., Auer, C., Bekas, C.: Corpus conversion service: A machine learning platform to ingest documents at scale. In: Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. pp. 774-782. KDD '18, Association for Computing Machinery, New York, NY, USA (2018). https://doi.org/10.1145/3219819.3219834 , https://doi.org/10. 1145/3219819.3219834
|
||||
- 16. Wang, X.: Tabular Abstraction, Editing, and Formatting. Ph.D. thesis, CAN (1996), aAINN09397
|
||||
- 17. Xue, W., Li, Q., Tao, D.: Res2tim: Reconstruct syntactic structures from table images. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 749-755. IEEE (2019)
|
||||
|
||||
3. Chi, Z., Huang, H., Xu, H.D., Yu, H., Yin, W., Mao, X.L.: Complicated table structure recognition. arXiv preprint arXiv:1908.04729 (2019)
|
||||
|
||||
4. Deng, Y., Rosenberg, D., Mann, G.: Challenges in end-to-end neural scientific table recognition. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 894-901. IEEE (2019)
|
||||
|
||||
5. Kayal, P., Anand, M., Desai, H., Singh, M.: Tables to latex: structure and content extraction from scientific tables. International Journal on Document Analysis and Recognition (IJDAR) pp. 1-10 (2022)
|
||||
|
||||
6. Lee, E., Kwon, J., Yang, H., Park, J., Lee, S., Koo, H.I., Cho, N.I.: Table structure recognition based on grid shape graph. In: 2022 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC). pp. 1868-1873. IEEE (2022)
|
||||
|
||||
7. Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: A benchmark dataset for table detection and recognition (2019)
|
||||
|
||||
8. Livathinos, N., Berrospi, C., Lysak, M., Kuropiatnyk, V., Nassar, A., Carvalho, A., Dolfi, M., Auer, C., Dinkla, K., Staar, P.: Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence 35(17), 15137-15145 (May 2021), https://ojs.aaai.org/index.php/AAAI/article/view/17777
|
||||
|
||||
9. Nassar, A., Livathinos, N., Lysak, M., Staar, P.: Tableformer: Table structure understanding with transformers. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 4614-4623 (June 2022)
|
||||
|
||||
10. Pfitzmann, B., Auer, C., Dolfi, M., Nassar, A.S., Staar, P.W.J.: Doclaynet: A large human-annotated dataset for document-layout segmentation. In: Zhang, A., Rangwala, H. (eds.) KDD '22: The 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, Washington, DC, USA, August 14 - 18, 2022. pp. 3743-3751. ACM (2022). https://doi.org/10.1145/3534678.3539043
|
||||
|
||||
11. Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet: An approach for end to end table detection and structure recognition from imagebased documents. In: Proceedings of the IEEE/CVF conference on computer vision and pattern recognition workshops. pp. 572-573 (2020)
|
||||
|
||||
12. Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162-1167. IEEE (2017)
|
||||
|
||||
13. Siddiqui, S.A., Fateh, I.A., Rizvi, S.T.R., Dengel, A., Ahmed, S.: Deeptabstr: Deep learning based table structure recognition. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1403-1409 (2019). https://doi.org/10.1109/ICDAR.2019.00226
|
||||
|
||||
14. Smock, B., Pesala, R., Abraham, R.: PubTables-1M: Towards comprehensive table extraction from unstructured documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 4634-4642 (June 2022)
|
||||
|
||||
15. Staar, P.W.J., Dolfi, M., Auer, C., Bekas, C.: Corpus conversion service: A machine learning platform to ingest documents at scale. In: Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. pp. 774-782. KDD '18, Association for Computing Machinery, New York, NY, USA (2018). https://doi.org/10.1145/3219819.3219834
|
||||
|
||||
16. Wang, X.: Tabular Abstraction, Editing, and Formatting. Ph.D. thesis, CAN (1996), aAINN09397
|
||||
|
||||
17. Xue, W., Li, Q., Tao, D.: Res2tim: Reconstruct syntactic structures from table images. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 749-755. IEEE (2019)
|
||||
|
||||
18. Xue, W., Yu, B., Wang, W., Tao, D., Li, Q.: Tgrnet: A table graph reconstruction network for table structure recognition. In: Proceedings of the IEEE/CVF International Conference on Computer Vision. pp. 1295-1304 (2021)
|
||||
|
||||
19. Ye, J., Qi, X., He, Y., Chen, Y., Gu, D., Gao, P., Xiao, R.: Pingan-vcgroup's solution for icdar 2021 competition on scientific literature parsing task b: Table recognition to html (2021). https://doi.org/10.48550/ARXIV.2105.01848 , https://arxiv.org/abs/2105.01848
|
||||
|
||||
20. Zhang, Z., Zhang, J., Du, J., Wang, F.: Split, embed and merge: An accurate table structure recognizer. Pattern Recognition 126 , 108565 (2022)
|
||||
|
||||
21. Zheng, X., Burdick, D., Popa, L., Zhong, X., Wang, N.X.R.: Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. In: 2021 IEEE Winter Conference on Applications of Computer Vision (WACV). pp. 697-706 (2021). https://doi.org/10.1109/WACV48630.2021.00074
|
||||
|
||||
22. Zhong, X., ShafieiBavani, E., Jimeno Yepes, A.: Image-based table recognition: Data, model, and evaluation. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.M. (eds.) Computer Vision - ECCV 2020. pp. 564-580. Springer International Publishing, Cham (2020)
|
||||
|
||||
23. Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for document layout analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1015-1022. IEEE (2019)
|
||||
- 18. Xue, W., Yu, B., Wang, W., Tao, D., Li, Q.: Tgrnet: A table graph reconstruction network for table structure recognition. In: Proceedings of the IEEE/CVF International Conference on Computer Vision. pp. 1295-1304 (2021)
|
||||
- 19. Ye, J., Qi, X., He, Y., Chen, Y., Gu, D., Gao, P., Xiao, R.: Pingan-vcgroup's solution for icdar 2021 competition on scientific literature parsing task b: Table recognition to html (2021). https://doi.org/10.48550/ARXIV.2105.01848 , https://arxiv.org/abs/2105.01848
|
||||
- 20. Zhang, Z., Zhang, J., Du, J., Wang, F.: Split, embed and merge: An accurate table structure recognizer. Pattern Recognition 126 , 108565 (2022)
|
||||
- 21. Zheng, X., Burdick, D., Popa, L., Zhong, X., Wang, N.X.R.: Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. In: 2021 IEEE Winter Conference on Applications of Computer Vision (WACV). pp. 697-706 (2021). https://doi.org/10.1109/WACV48630.2021. 00074
|
||||
- 22. Zhong, X., ShafieiBavani, E., Jimeno Yepes, A.: Image-based table recognition: Data, model, and evaluation. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.M. (eds.) Computer Vision - ECCV 2020. pp. 564-580. Springer International Publishing, Cham (2020)
|
||||
- 23. Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for document layout analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1015-1022. IEEE (2019)
|
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
File diff suppressed because one or more lines are too long
@ -36,17 +36,12 @@ Ever wonder how many transactions a bank processes per day? What about the pace
|
||||
|
||||
The most recent platform for IBM Z is IBM z16™. The IBM z16 supports the following features:
|
||||
|
||||
- On-chip AI acceleration
- Quantum-safe crypto discovery
- Simplified compliance
- Flexible capacity
- Modernization of applications
- Sustainability
|
||||
- GLYPH<SM590000> On-chip AI acceleration
|
||||
- GLYPH<SM590000> Quantum-safe crypto discovery
|
||||
- GLYPH<SM590000> Simplified compliance
|
||||
- GLYPH<SM590000> Flexible capacity
|
||||
- GLYPH<SM590000> Modernization of applications
|
||||
- GLYPH<SM590000> Sustainability
|
||||
|
||||
With these features, enterprises can upgrade applications while preserving secure and resilient data.
|
||||
|
||||
@ -56,7 +51,6 @@ Figure 1 on page 3 shows a picture of the IBM z16 mainframe.
|
||||
|
||||
Figure 1 IBM z16
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
## IBM z16 and IBM LinuxONE Emperor 4 features
|
||||
@ -67,7 +61,6 @@ Figure 2 provides a snapshot of the IBM Z processor roadmap, which depicts the j
|
||||
|
||||
Figure 2 IBM Z: Processor roadmap
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
The IBM z16 and IBM LinuxONE Emperor 4 are the latest of the IBM Z, and they are developed with a 'built to build' focus to provide a powerful, cyber-resilient, open, and secure platform for business with an extra focus on sustainability to help build sustainable data centers. Although the z16 server can host both IBM z/OS® and Linux workloads, LinuxONE Emperor 4 is built to host Linux-only workloads with a focus on consolidation and resiliency. Depending on the workload, consolidation from numerous x86 servers into a LinuxONE Emperor 4 can help reduce energy consumption by 75% and data center floor space by 50%, which helps to achieve the sustainability goals of the organization.
|
||||
@ -76,7 +69,6 @@ Figure 3 on page 5 shows a summary of the system design of IBM LinuxONE Emperor
|
||||
|
||||
Figure 3 System design of IBM z16 LinuxONE Emperor 4
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
The IBM z16 and IBM LinuxONE Emperor 4 servers are built with 7-nm technology at a 5.2 GHz speed. They consist of four dual-chip modules (DCMs) per central processor complex (CPC) drawer, each of which is built with two 8-core Telum processor chips that has "first in the industry" on-chip acceleration for mid-transaction, real-time AI inferencing, which supports many different use cases, including fraud detection.
|
||||
@ -87,7 +79,6 @@ Figure 4 provides more information about the features of AI Accelerator integrat
|
||||
|
||||
Figure 4 IBM z16 on-chip AI Accelerator integration with IBM Z processor cores
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
The IBM z16 and IBM LinuxONE Emperor 4 server platforms are built with the hardware features that are shown in Figure 4 with addressing data and AI workloads in mind. Regardless of where the ML and deep learning (DL) frameworks are used to build and train data and AI models, the inferencing on existing enterprise application data can happen along currently running enterprise business applications. CP4D 4.6 supports Tensorflow and IBM Snap ML frameworks, which are optimized to use the on-chip AI Accelerator during inferencing. Support for various other frameworks is planned for future releases.
|
||||
@ -96,7 +87,6 @@ Figure 5 on page 7 shows the seamless integration of AI into existing enterprise
|
||||
|
||||
Figure 5 Seamless integration
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
## What is Cloud Pak for Data on IBM Z
|
||||
@ -109,18 +99,14 @@ Figure 6 shows a solution overview of CP4D. The infrastructure alternatives are
|
||||
|
||||
Figure 6 Solution overview of Cloud Pak for Data
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
We highlight the four main pillars that make IBM Z the correct infrastructure for CP4D:
|
||||
|
||||
GLYPH<SM590000> Performance and Scale
|
||||
|
||||
GLYPH<SM590000> Embedded Accelerators
|
||||
|
||||
GLYPH<SM590000> Reliability and Availability
|
||||
|
||||
GLYPH<SM590000> Security and Governance.
|
||||
- GLYPH<SM590000> Performance and Scale
|
||||
- GLYPH<SM590000> Embedded Accelerators
|
||||
- GLYPH<SM590000> Reliability and Availability
|
||||
- GLYPH<SM590000> Security and Governance.
|
||||
|
||||
From a performance perspective, CP4D on IBM Z provides your data and AI with high transaction processing and a powerful infrastructure. From the embedded accelerators perspective, CP4D on IBM Z can investigate each transaction thanks to a cutting-edge DL inference technology even in the most demanding, sensitive, and latency-prone real-time workloads. From a reliability perspective, CP4D on IBM Z provides high availability and resiliency. Lastly from the security perspective, CP4D on IBM Z is suitable for protecting sensitive data and AI models for enterprises in highly regulated industries or those industries that are worried about security.
|
||||
|
||||
@ -128,17 +114,12 @@ From a performance perspective, CP4D on IBM Z provides your data and AI with hig
|
||||
|
||||
With CP4D on IBM Z and IBM LinuxONE, users can develop, train, and deploy AI and ML models. Users can accomplish this task by using the CP4D IBM Watsonfi Studio and IBM Watson Machine Learning (WLM) services. By using these two fundamental services, users can accomplish the following tasks:
|
||||
|
||||
GLYPH<SM590000> Provision various containerized databases.
|
||||
|
||||
GLYPH<SM590000> Explore, clean, shape, and alter data by using Data Refinery.
|
||||
|
||||
GLYPH<SM590000> Use project-specific data that is uploaded, or connect to distant data.
|
||||
|
||||
GLYPH<SM590000> Create Spark run times and applications.
|
||||
|
||||
GLYPH<SM590000> Create, build, evaluate, and deploy analytics and ML models with trust and transparency.
|
||||
|
||||
GLYPH<SM590000> Leverage the AI Integrated Accelerator for TensorFlow 2.7.2 and Snap ML 1.9.
|
||||
- GLYPH<SM590000> Provision various containerized databases.
|
||||
- GLYPH<SM590000> Explore, clean, shape, and alter data by using Data Refinery.
|
||||
- GLYPH<SM590000> Use project-specific data that is uploaded, or connect to distant data.
|
||||
- GLYPH<SM590000> Create Spark run times and applications.
|
||||
- GLYPH<SM590000> Create, build, evaluate, and deploy analytics and ML models with trust and transparency.
|
||||
- GLYPH<SM590000> Leverage the AI Integrated Accelerator for TensorFlow 2.7.2 and Snap ML 1.9.
|
||||
|
||||
For more information about the specifics of these capabilities, see Capabilities on Linux on IBM Z and IBM LinuxONE.
|
||||
|
||||
@ -172,22 +153,16 @@ Figure 7 on page 11 provides an overview of the components that are supported on
|
||||
|
||||
Figure 7 Developing, training, and deploying an AI model on Cloud Pak for Data on IBM Z and IBM LinuxONE
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
In summary, here are some of the reasons why you should choose AI on IBM Z:
|
||||
|
||||
GLYPH<SM590000> World-class AI inference platform for enterprise workloads:
|
||||
|
||||
-Embedded accelerators: A centralized on-chip AI accelerator that is shared by all cores.
|
||||
|
||||
-Industry standard AI ecosystem: Many industry open-source data science frameworks are available on the platform.
|
||||
|
||||
-Seamlessly integrate AI into existing enterprise workload stacks: Train anywhere, and then deploy on IBM Z.
|
||||
|
||||
GLYPH<SM590000> Security: Encrypted memory, and improved trusted execution environments.
|
||||
|
||||
GLYPH<SM590000> Sustainability: Reduce your energy consumption with real-time monitoring tools about the energy consumption of the system.
|
||||
- GLYPH<SM590000> World-class AI inference platform for enterprise workloads:
|
||||
- -Embedded accelerators: A centralized on-chip AI accelerator that is shared by all cores.
|
||||
- -Industry standard AI ecosystem: Many industry open-source data science frameworks are available on the platform.
|
||||
- -Seamlessly integrate AI into existing enterprise workload stacks: Train anywhere, and then deploy on IBM Z.
|
||||
- GLYPH<SM590000> Security: Encrypted memory, and improved trusted execution environments.
|
||||
- GLYPH<SM590000> Sustainability: Reduce your energy consumption with real-time monitoring tools about the energy consumption of the system.
|
||||
|
||||
## AI use cases
|
||||
|
||||
@ -203,23 +178,15 @@ For the airline industry, processes such as air traffic management, flight manag
|
||||
|
||||
In the following sections, we describe the following use cases:
|
||||
|
||||
GLYPH<SM590000> "Use case 1: Responsible AI augmented with risk and regulatory compliance" on page 12 AI model lifecycle governance, risk management, and regulatory compliance are key to the success of the enterprises. It is imperative to adopt a typical AI model lifecycle to protect new end-to-end risks.
|
||||
|
||||
GLYPH<SM590000> "Use case 2: Credit default risk assessment" on page 22
|
||||
|
||||
Core banking solutions running on IBM Z that are involved in processing inbound transactions need real-time fraud detection to prevent fraud. Other types of possible use cases might be credit risk analysis, anti-money laundering, loan approval, fraud detection in payments, and instant payments.
|
||||
|
||||
GLYPH<SM590000> "Use case 3: Clearing and settlement" on page 25
|
||||
|
||||
The use of AI can help to predict which trades or transactions have high risk exposures, and propose solutions for a more efficient settlement process.
|
||||
|
||||
GLYPH<SM590000> "Use case 4: Remaining Useful Life of an aircraft engine" on page 27
|
||||
|
||||
We describe how AI can help to avoid unplanned aircraft downtime by determining the remaining time or cycles that an aircraft engine is likely to operate before failure.
|
||||
|
||||
GLYPH<SM590000> "Use case 5: AI-powered video analytics on an infant's motions for health prediction" on page 30
|
||||
|
||||
In this section, we describe how AI can predict an infant's health conditions by monitoring real-time body movements.
|
||||
- GLYPH<SM590000> "Use case 1: Responsible AI augmented with risk and regulatory compliance" on page 12 AI model lifecycle governance, risk management, and regulatory compliance are key to the success of the enterprises. It is imperative to adopt a typical AI model lifecycle to protect new end-to-end risks.
|
||||
- GLYPH<SM590000> "Use case 2: Credit default risk assessment" on page 22
|
||||
- Core banking solutions running on IBM Z that are involved in processing inbound transactions need real-time fraud detection to prevent fraud. Other types of possible use cases might be credit risk analysis, anti-money laundering, loan approval, fraud detection in payments, and instant payments.
|
||||
- GLYPH<SM590000> "Use case 3: Clearing and settlement" on page 25
|
||||
- The use of AI can help to predict which trades or transactions have high risk exposures, and propose solutions for a more efficient settlement process.
|
||||
- GLYPH<SM590000> "Use case 4: Remaining Useful Life of an aircraft engine" on page 27
|
||||
- We describe how AI can help to avoid unplanned aircraft downtime by determining the remaining time or cycles that an aircraft engine is likely to operate before failure.
|
||||
- GLYPH<SM590000> "Use case 5: AI-powered video analytics on an infant's motions for health prediction" on page 30
|
||||
- In this section, we describe how AI can predict an infant's health conditions by monitoring real-time body movements.
|
||||
|
||||
## Use case 1: Responsible AI augmented with risk and regulatory compliance
|
||||
|
||||
@ -231,11 +198,9 @@ How mature is your AI governance? In this section, we provide a use case demonst
|
||||
|
||||
Here are the three main reasons why organizations struggle with the adoption of AI:
|
||||
|
||||
GLYPH<SM590000> Scaling with growing regulations
|
||||
|
||||
GLYPH<SM590000> Lack of confidence in operationalized AI (making responsible AI)
|
||||
|
||||
GLYPH<SM590000> Challenges around managing the risk throughout the entire AI workflow
|
||||
- GLYPH<SM590000> Scaling with growing regulations
|
||||
- GLYPH<SM590000> Lack of confidence in operationalized AI (making responsible AI)
|
||||
- GLYPH<SM590000> Challenges around managing the risk throughout the entire AI workflow
|
||||
|
||||
## Scaling with growing regulations
|
||||
|
||||
@ -249,17 +214,12 @@ Responsible AI protects against loss of data privacy, and reduced customer loyal
|
||||
|
||||
Organizations need to mitigate risk of the following items:
|
||||
|
||||
GLYPH<SM590000> Deciding not to use certain technologies or practices
|
||||
|
||||
GLYPH<SM590000> Using personal information when needed and with a user's consent
|
||||
|
||||
GLYPH<SM590000> Ensuring automated decisions are free from bias
|
||||
|
||||
GLYPH<SM590000> Customer confidence by providing explanations for business decisions
|
||||
|
||||
GLYPH<SM590000> Fraud to the organization and to customer's accounts
|
||||
|
||||
GLYPH<SM590000> Delays in putting models into production
|
||||
- GLYPH<SM590000> Deciding not to use certain technologies or practices
|
||||
- GLYPH<SM590000> Using personal information when needed and with a user's consent
|
||||
- GLYPH<SM590000> Ensuring automated decisions are free from bias
|
||||
- GLYPH<SM590000> Customer confidence by providing explanations for business decisions
|
||||
- GLYPH<SM590000> Fraud to the organization and to customer's accounts
|
||||
- GLYPH<SM590000> Delays in putting models into production
|
||||
|
||||
In fact, in a recent survey, these concerns were echoed by real AI adopters when asked what aspects of trust are most important to them. Although explaining how AI decides is the primary concern, all of these concerns are important.
|
||||
|
||||
@ -269,7 +229,6 @@ For example, a business can start testing a model before production for fairness
|
||||
|
||||
Figure 8 Typical AI model lifecycle
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
Due to regulations, more stakeholders adopt the typical AI model lifecycle to protect their brand from new end-to-end risks. To ensure various aspects of both regulatory compliance and security, the personas that must be involved include the chief financial officer (CFO), chief marketing officer (CMO), chief data officer (CDO), HR, and chief regulatory officer (CRO), along with the data engineers, data scientists, and business analysts, who build AI workflows.
|
||||
@ -286,110 +245,92 @@ In a world where trust, transparency and explainable AI matters, every organizat
|
||||
|
||||
Lifecycle governance helps you manage your business information throughout its lifecycle, that is, from creation to deletion. IBM AI governance addresses the problems that challenge records managements:
|
||||
|
||||
GLYPH<SM590000> Monitor, catalog, and govern AI models from anywhere throughout the AI lifecycle.
|
||||
|
||||
GLYPH<SM590000> Automate the capture of model metadata for report generation.
|
||||
|
||||
GLYPH<SM590000> Drive transparent and explainable AI at scale.
|
||||
|
||||
GLYPH<SM590000> Increase accuracy of predictions by identifying how AI is used and where it is lagging.
|
||||
- GLYPH<SM590000> Monitor, catalog, and govern AI models from anywhere throughout the AI lifecycle.
|
||||
- GLYPH<SM590000> Automate the capture of model metadata for report generation.
|
||||
- GLYPH<SM590000> Drive transparent and explainable AI at scale.
|
||||
- GLYPH<SM590000> Increase accuracy of predictions by identifying how AI is used and where it is lagging.
|
||||
|
||||
## Risk management
|
||||
|
||||
Risk management is used in IBM AI governance to identify, manage, monitor, and report on risk and compliance initiatives at scale:
|
||||
|
||||
GLYPH<SM590000> Automate facts and workflow management to comply with business standards.
|
||||
|
||||
GLYPH<SM590000> Use dynamic dashboards for clear and concise customizable results.
|
||||
|
||||
GLYPH<SM590000> Enhanced collaboration across multiple regions and geographies.
|
||||
- GLYPH<SM590000> Automate facts and workflow management to comply with business standards.
|
||||
- GLYPH<SM590000> Use dynamic dashboards for clear and concise customizable results.
|
||||
- GLYPH<SM590000> Enhanced collaboration across multiple regions and geographies.
|
||||
|
||||
## Regulatory compliance
|
||||
|
||||
Regulatory compliance is a set of rules that organizations must follow to protect sensitive information and ensure human safety. Any business that works with digital assets, consumer data, health regulations, employee safety, and private communications is subject to regulatory compliance.$^{3}$ The IBM AI governance solution for IBM Z includes the following tasks:
|
||||
|
||||
GLYPH<SM590000> Help adhere to external AI regulations for audit and compliance.
|
||||
|
||||
GLYPH<SM590000> Convert external AI regulations into policies for automatic enforcement.
|
||||
|
||||
GLYPH<SM590000> Use dynamic dashboards for compliance status across policies and regulations.
|
||||
- GLYPH<SM590000> Help adhere to external AI regulations for audit and compliance.
|
||||
- GLYPH<SM590000> Convert external AI regulations into policies for automatic enforcement.
|
||||
- GLYPH<SM590000> Use dynamic dashboards for compliance status across policies and regulations.
|
||||
|
||||
Enterprises can develop AI models and deploy them by using IBM Watson Studio or WML on CP4D on Red Hat OpenShift on a virtual machine that is based on IBM z/VM or Red Hat Enterprise Linux KVM on IBM Z. AI governance on IBM LinuxONE is supported in the following two ways:
|
||||
|
||||
GLYPH<SM590000> Monitor the AI models with Watson OpenScale on CP4D on Red Hat OpenShift on a virtual machine on IBM Z.
|
||||
|
||||
GLYPH<SM590000> Enterprises can develop AI models by creating and training models by using Watson Studio and development tools such as Jupyter Notebook or JupyterLab, and then deploying the model onto WML on CP4D on Red Hat OpenShift on a virtual machine on IBM Z. Then, these enterprises can achieve end-end AI governance by running AI Factsheets, IBM Watson OpenScale, and IBM Watson OpenPagesfi on CP4D on x86.
|
||||
- GLYPH<SM590000> Monitor the AI models with Watson OpenScale on CP4D on Red Hat OpenShift on a virtual machine on IBM Z.
|
||||
- GLYPH<SM590000> Enterprises can develop AI models by creating and training models by using Watson Studio and development tools such as Jupyter Notebook or JupyterLab, and then deploying the model onto WML on CP4D on Red Hat OpenShift on a virtual machine on IBM Z. Then, these enterprises can achieve end-end AI governance by running AI Factsheets, IBM Watson OpenScale, and IBM Watson OpenPagesfi on CP4D on x86.
|
||||
|
||||
Figure 9 on page 16 shows the end-to-end flow for a remote AI governance solution.
|
||||
|
||||
Figure 9 Remote AI governance solution end-to-end flow
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
To achieve end-to-end AI governance, complete the following steps:
|
||||
|
||||
1. Create a model entry in IBM OpenPages by using CP4D on a x86 platform, as shown in Figure 10.
|
||||
- 1. Create a model entry in IBM OpenPages by using CP4D on a x86 platform, as shown in Figure 10.
|
||||
|
||||
Figure 10 Creating a model entry in IBM OpenPages
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
2. Train a model by using Watson Studio and by using development tools such as Jupyter Notebook or JupyterLab on CP4D on Red Hat OpenShift on a virtual machine on IBM Z, as shown in Figure 11.
|
||||
- 2. Train a model by using Watson Studio and by using development tools such as Jupyter Notebook or JupyterLab on CP4D on Red Hat OpenShift on a virtual machine on IBM Z, as shown in Figure 11.
|
||||
|
||||
Figure 11 Training an AI model by using Watson Studio
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
3. Deploy the model by using WML on CP4D on Red Hat OpenShift on a virtual machine on IBM Z, as shown in Figure 12.
|
||||
- 3. Deploy the model by using WML on CP4D on Red Hat OpenShift on a virtual machine on IBM Z, as shown in Figure 12.
|
||||
|
||||
Figure 12 Deploying an AI model by using WML on Cloud Pak for Data
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
4. Track the external model lifecycle by browsing through the Catalogs/Platform assets catalog by using AI Factsheets and OpenPages while using CP4D on an x86 platform, as shown in Figure 13. The external model (deployed on CP4D on Red Hat OpenShift on a virtual machine on IBM Z) is saved as a platform asset catalog on the x86 platform.
|
||||
- 4. Track the external model lifecycle by browsing through the Catalogs/Platform assets catalog by using AI Factsheets and OpenPages while using CP4D on an x86 platform, as shown in Figure 13. The external model (deployed on CP4D on Red Hat OpenShift on a virtual machine on IBM Z) is saved as a platform asset catalog on the x86 platform.
|
||||
|
||||
Figure 13 External model
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
You can track the model through each stage of the model lifecycle, as shown in Figure 14, by using AI Factsheets and OpenPages.
|
||||
|
||||
Figure 14 Tracking the model
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
You can see that the model facts are tracked and synchronized to IBM OpenPages for risk management, as shown in Figure 15.
|
||||
|
||||
Figure 15 Model facts that are tracked and synchronized to IBM OpenPages on an x86 platform
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
5. Create an external model by using IBM OpenScale on the x86 platform, as shown in Figure 16.
|
||||
- 5. Create an external model by using IBM OpenScale on the x86 platform, as shown in Figure 16.
|
||||
|
||||
Figure 16 Creating an external model on an x86 platform
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
IBM OpenScale provides a comprehensive dashboard that tracks fairness, quality monitoring, drift, and explainability of a model. Fairness determines whether your model produces biased outcomes. Quality determines how well your model predicts outcomes. Drift is the degradation of predictive performance over time. A sample is shown in Figure 17 on page 21.
|
||||
|
||||
Figure 17 IBM OpenScale dashboard that is used to monitor the external model
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
You developed and deployed the AI model by using Watson Studio, WML on CP4D on Red Hat OpenShift on a virtual machine on IBM Z, and end-to-end AI model governance by leveraging AI Factsheets, OpenScale, and OpenPages on CP4D on a x86 platform. Figure 18 shows end-to-end AI governance when using IBM OpenPages, AI Factsheets, and OpenScale.
|
||||
|
||||
Figure 18 Final result: End-to-end AI governance when using IBM OpenPages, AI Factsheets, and OpenScale
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
## Use case 2: Credit default risk assessment
|
||||
@ -410,7 +351,6 @@ Figure 19 on page 23 shows a sample architecture about how to design and develop
|
||||
|
||||
Figure 19 Architecture for credit risk prediction by using an ML AI model on IBM Z
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
A data scientist can leverage Watson Studio to develop and train an AI model and WML to deploy and score the model. In this sample architecture, the WML Python run time leverages the ML framework, IBM Snap Machine Learning (Snap ML), for scoring, can leverage an integrated AI accelerator at the time of model import.
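As an illustrative aside that is not part of the Redbooks text: Snap ML exposes scikit-learn-style estimators through the snapml Python package, so a minimal credit-scoring sketch could look as follows. The class name, constructor defaults, and the synthetic data are assumptions and may differ by snapml version.

```python
import numpy as np
from snapml import LogisticRegression  # scikit-learn-style estimator from the snapml package

# Synthetic stand-in for tabular credit features; the real pipeline would read
# prepared data from Data Refinery or Db2 as described in this architecture.
rng = np.random.default_rng(0)
X = rng.normal(size=(10_000, 20)).astype(np.float32)
y = (X[:, 0] + 0.5 * X[:, 1] > 0).astype(np.float32)

clf = LogisticRegression()           # backend selection is handled by the library
clf.fit(X, y)
scores = clf.predict_proba(X[:5])    # per-transaction default probabilities
print(scores)
```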
|
||||
@ -427,22 +367,17 @@ Figure 20 shows an architecture for predicting credit risk by using DL on IBM Z.
|
||||
|
||||
Figure 20 Architecture for credit risk prediction by using DL on IBM Z
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
Data scientists can start creating and training a DL AI model by using a Jupyter Notebook instance and Watson Studio. Then, they can deploy the model by using WML on CP4D running on IBM Z, which provides an endpoint. Other applications, including the IBM WebSphere server, can produce credit risk results by using the model's endpoint.
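For illustration only (not part of the Redbooks text), a consuming application could call such a WML scoring endpoint over REST. The host, deployment ID, token, and feature names below are placeholders, and the payload follows the commonly used WML v4 "input_data" layout, which should be checked against the deployed model's schema.

```python
import requests

# Placeholder values: the deployment URL, API token, and feature names depend
# on the actual WML deployment and the model's input schema.
SCORING_URL = "https://<cp4d-host>/ml/v4/deployments/<deployment-id>/predictions?version=2021-05-01"
TOKEN = "<bearer-token-obtained-from-cp4d>"

payload = {
    "input_data": [
        {
            "fields": ["age", "credit_amount", "duration_months"],  # hypothetical features
            "values": [[35, 5000, 24]],
        }
    ]
}

response = requests.post(
    SCORING_URL,
    json=payload,
    headers={"Authorization": f"Bearer {TOKEN}"},
)
print(response.json())  # predictions and probabilities returned by the model
```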
|
||||
|
||||
In summary, here are some considerations for developing real-time AI models, such as credit risk assessment:
|
||||
|
||||
GLYPH<SM590000> A preference for in-platform run times of the model, such as faster execution results.
|
||||
|
||||
GLYPH<SM590000> Less overhead in the end-to-end flows might improve scoring time.
|
||||
|
||||
GLYPH<SM590000> If you are using models that are not deployable, CP4D offers a custom Python run time to build your own stack if they are not available on the platform.
|
||||
|
||||
GLYPH<SM590000> AI inferencing based on ML or DL models can increase the accuracy of better credit risk assessment.
|
||||
|
||||
GLYPH<SM590000> Using IBM z16 and on-chip AI acceleration with the Telum chip that is embedded with regular Integrated Facility for Linux (IFLs) provides an execution speed for your transactions that cannot be achieved by other means.
|
||||
- GLYPH<SM590000> A preference for in-platform run times of the model, such as faster execution results.
|
||||
- GLYPH<SM590000> Less overhead in the end-to-end flows might improve scoring time.
|
||||
- GLYPH<SM590000> If you are using models that are not deployable, CP4D offers a custom Python run time to build your own stack if they are not available on the platform.
|
||||
- GLYPH<SM590000> AI inferencing based on ML or DL models can increase the accuracy of better credit risk assessment.
|
||||
- GLYPH<SM590000> Using IBM z16 and on-chip AI acceleration with the Telum chip that is embedded with regular Integrated Facility for Linux (IFLs) provides an execution speed for your transactions that cannot be achieved by other means.
|
||||
|
||||
## Use case 3: Clearing and settlement
|
||||
|
||||
@ -466,54 +401,35 @@ Figure 21 provides a high-level diagram of a clearing and settlement use case fo
|
||||
|
||||
Figure 21 Clearing and settlement use case for financial transactions by using Cloud Pak for Data
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
Here are the steps of the high-level process flow:
|
||||
|
||||
1. Create a connection to a database (for example, an IBM Db2fi database) where the historical data will be used for ML model building.
|
||||
|
||||
2. Read the data from the database and prepare the data for AI by using the Data Refinery tool in CP4D.
|
||||
|
||||
3. A Jupyter Notebook or JupyterLab IDE that is provided by the Watson Studio component in CP4D helps us build and train the AI model. The trained model can be saved into a WML repository.
|
||||
|
||||
4. Deploy the saved model into a deployment space for batch deployment.
|
||||
|
||||
5. Create a batch deployment by using any of these interfaces:
|
||||
|
||||
a. Watson Studio user interface from an Analytics deployment space.
|
||||
|
||||
b. WML Python client.
|
||||
|
||||
c. WML REST APIs.
|
||||
|
||||
6. A hardware configuration can be chosen for the deployment.
|
||||
|
||||
7. A batch deployment processes input data from a file, data connection, or connected data in a storage bucket, and writes the output to a selected destination.
|
||||
|
||||
8. One way to run batch deployment to predict or score is to create and run a batch deployment job.
|
||||
|
||||
9. Provide an input data type:
|
||||
|
||||
a. Inline data for entering a JSON format payload.
|
||||
|
||||
b. Select Data asset , click Select data source , and then specify your asset.
|
||||
|
||||
10.The output data type can be a new output file or a connected data asset.
|
||||
|
||||
11.A Kubernetes admin can change the maximum number of concurrent batch jobs that can be run.
|
||||
|
||||
12.Get the deployment endpoint URL. For more information, see Getting the deployment endpoint URL.
|
||||
- 1. Create a connection to a database (for example, an IBM Db2fi database) where the historical data will be used for ML model building.
|
||||
- 2. Read the data from the database and prepare the data for AI by using the Data Refinery tool in CP4D.
|
||||
- 3. A Jupyter Notebook or JupyterLab IDE that is provided by the Watson Studio component in CP4D helps us build and train the AI model. The trained model can be saved into a WML repository.
|
||||
- 4. Deploy the saved model into a deployment space for batch deployment.
|
||||
- 5. Create a batch deployment by using any of these interfaces:
|
||||
- a. Watson Studio user interface from an Analytics deployment space.
|
||||
- b. WML Python client.
|
||||
- c. WML REST APIs.
|
||||
- 6. A hardware configuration can be chosen for the deployment.
|
||||
- 7. A batch deployment processes input data from a file, data connection, or connected data in a storage bucket, and writes the output to a selected destination.
|
||||
- 8. One way to run batch deployment to predict or score is to create and run a batch deployment job.
|
||||
- 9. Provide an input data type:
|
||||
- a. Inline data for entering a JSON format payload.
|
||||
- b. Select Data asset , click Select data source , and then specify your asset.
|
||||
- 10.The output data type can be a new output file or a connected data asset.
|
||||
- 11.A Kubernetes admin can change the maximum number of concurrent batch jobs that can be run.
|
||||
- 12.Get the deployment endpoint URL. For more information, see Getting the deployment endpoint URL.
|
||||
|
||||
## Summary
|
||||
|
||||
With this use case, we attempted to demonstrate how to predict, in real time, whether the transaction that is being processed might be a fraudulent transaction or not. By using the method, you have the following advantages:
|
||||
|
||||
GLYPH<SM590000> No Impact to SLAs and the batch process window.
|
||||
|
||||
GLYPH<SM590000> Proactively stop losses, and lower operational, regulatory, and compliance costs.
|
||||
|
||||
GLYPH<SM590000> The solution is using a DL framework like TensorFlow for high-performing, low latency scoring.
|
||||
- GLYPH<SM590000> No Impact to SLAs and the batch process window.
|
||||
- GLYPH<SM590000> Proactively stop losses, and lower operational, regulatory, and compliance costs.
|
||||
- GLYPH<SM590000> The solution is using a DL framework like TensorFlow for high-performing, low latency scoring.
|
||||
|
||||
## Use case 4: Remaining Useful Life of an aircraft engine
|
||||
|
||||
@ -525,7 +441,6 @@ Figure 22 provides an overview of the inferencing architecture for the RUL of an
|
||||
|
||||
Figure 22 Inferencing architecture on IBM Z
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
Because we are looking into data-driven model development, the data set of our target is the run-to-failure data of the engine. We are looking into a supervised learning problem, and we use regression techniques to learn from the data. DL techniques such as Long Short-Term Memory (LSTM) or Gated Recurrent Units (GRU) are our choice because we are looking into a time series data set. TensorFlow or PyTorch frameworks are leveraged to create models. AI governance monitors the data and model drift to maintain the model quality throughout the model's life.
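Purely as an illustration of the modeling choice described above (it is not part of the Redbooks text), a small TensorFlow/Keras LSTM regressor for RUL over windowed sensor data might be sketched as follows; the window length, sensor count, and synthetic data are placeholders.

```python
import numpy as np
import tensorflow as tf

# Illustrative shapes only: windows of 50 time steps with 14 sensor channels,
# and a single regression target (remaining useful life in cycles).
window_len, n_sensors = 50, 14
x_train = np.random.rand(1000, window_len, n_sensors).astype("float32")
y_train = np.random.rand(1000, 1).astype("float32")

model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(window_len, n_sensors)),
    tf.keras.layers.LSTM(64, return_sequences=True),
    tf.keras.layers.LSTM(32),
    tf.keras.layers.Dense(16, activation="relu"),
    tf.keras.layers.Dense(1),  # predicted RUL
])
model.compile(optimizer="adam", loss="mse", metrics=["mae"])
model.fit(x_train, y_train, epochs=5, batch_size=64)
```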
|
||||
@ -548,20 +463,15 @@ Figure 23 on page 29 provides a more in-depth view of the architecture of an AI-
|
||||
|
||||
Figure 23 In-depth architectural view
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
In summary, consider the following points while developing an AI-based predictive maintenance application:
|
||||
|
||||
GLYPH<SM590000> CP4D offers a Python run time to build a custom solution stack, but also supports different components like Watson Studio, WML, Db2, Data Refinery, OpenScale, AI Factsheets, and OpenPages.
|
||||
|
||||
GLYPH<SM590000> The trustworthiness of the predicted output is important for critical use cases.
|
||||
|
||||
GLYPH<SM590000> IBM Z provides high data security and low latency requirements at scale for the critical applications.
|
||||
|
||||
GLYPH<SM590000> A data scientist can choose to train the model and deploy it on CP4D seamlessly with the latest tech stack that is available.
|
||||
|
||||
GLYPH<SM590000> The AIOps and MLOps supported by CP4D to track AI model and data lifecycle throughout the application lifecycle.
|
||||
- GLYPH<SM590000> CP4D offers a Python run time to build a custom solution stack, but also supports different components like Watson Studio, WML, Db2, Data Refinery, OpenScale, AI Factsheets, and OpenPages.
|
||||
- GLYPH<SM590000> The trustworthiness of the predicted output is important for critical use cases.
|
||||
- GLYPH<SM590000> IBM Z provides high data security and low latency requirements at scale for the critical applications.
|
||||
- GLYPH<SM590000> A data scientist can choose to train the model and deploy it on CP4D seamlessly with the latest tech stack that is available.
|
||||
- GLYPH<SM590000> The AIOps and MLOps supported by CP4D to track AI model and data lifecycle throughout the application lifecycle.
|
||||
|
||||
## Use case 5: AI-powered video analytics on an infant's motions for health prediction
|
||||
|
||||
@ -593,7 +503,6 @@ Figure 24 shows an architectural diagram about how to design and develop an AI m
|
||||
|
||||
Figure 24 Architecture for AI-powered video analytics
|
||||
|
||||
|
||||
<!-- image -->
|
||||
|
||||
Live camera feeds or recorded videos of an infant's movement are the inputs for a pose detection model. This video streaming data was stored in IBM Cloudfi Object Storage for image processing. Video data must be transformed into frames so that the infant's body poses can be detected. These post-estimation components of the pipeline predict the location of all 17-person key points with 3 degrees of freedom each (x, y location and visibility) plus two virtual alignment key points. This approach also embraces a compute-intensive heat map prediction of infant body posture.
|
||||
@ -602,29 +511,21 @@ When changes in body posture or movement happen, analytics can be performed, and
|
||||
|
||||
We can leverage the following AI technology stack for this use case:
|
||||
|
||||
GLYPH<SM590000> Convolutional neural network: Build an artificial neural network model on video streaming and images.
|
||||
|
||||
GLYPH<SM590000> TensorFlow: A DL back-end framework that is based on TensorFlow.
|
||||
|
||||
GLYPH<SM590000> Mediapipe: A library that helps with video streaming processing and prediction of human pose estimation.
|
||||
|
||||
GLYPH<SM590000> OpenCV: A real-time computer vision library that helps perform image processing.
|
||||
- GLYPH<SM590000> Convolutional neural network: Build an artificial neural network model on video streaming and images.
|
||||
- GLYPH<SM590000> TensorFlow: A DL back-end framework that is based on TensorFlow.
|
||||
- GLYPH<SM590000> Mediapipe: A library that helps with video streaming processing and prediction of human pose estimation.
|
||||
- GLYPH<SM590000> OpenCV: A real-time computer vision library that helps perform image processing.
|
||||
|
||||
WML was used for deployment of the pose detection model and generated notifications to users with web and mobile applications, and it integrates with Fitbit for push notifications so that hospitals and parents can take preventive actions.
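As an illustrative sketch that is not part of the Redbooks text, the pose-estimation stage could be prototyped with OpenCV and MediaPipe as shown below. MediaPipe's pose solution returns its own landmark set rather than exactly the 17 key points mentioned above, and the video path is a placeholder.

```python
import cv2
import mediapipe as mp

# Read frames from a recorded video and extract pose landmarks with MediaPipe.
cap = cv2.VideoCapture("infant_motion_sample.mp4")
mp_pose = mp.solutions.pose

with mp_pose.Pose(static_image_mode=False) as pose:
    while cap.isOpened():
        ok, frame_bgr = cap.read()
        if not ok:
            break
        # MediaPipe expects RGB input, while OpenCV delivers BGR frames.
        results = pose.process(cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB))
        if results.pose_landmarks:
            # Each landmark carries x, y coordinates and a visibility score,
            # which downstream analytics could track over time.
            first = results.pose_landmarks.landmark[0]
            print(round(first.x, 3), round(first.y, 3), round(first.visibility, 3))

cap.release()
```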
|
||||
|
||||
## Additional resources
|
||||
|
||||
GLYPH<SM590000> The Cloud Pak for Data 4.5 on IBM Z Overview Demo video provides an overview of some of the more important features of CP4D on IBM Z.
|
||||
|
||||
GLYPH<SM590000> IBM Cloud Pak for Data Tutorials.
|
||||
|
||||
GLYPH<SM590000> Here are some additional use cases that use the data science frameworks that are available as part of CP4D on IBM Z and IBM LinuxONE:
|
||||
|
||||
-Payment Card Fraud Detection by using TensorFlow on CP4D on IBM Z and IBM LinuxONE is a payment card fraud detection use case.
|
||||
|
||||
-Fashion-MNIST clothing classification with PyTorch on Cloud Pak for Data on IBM Z and IBM LinuxONE is a Fashion-MNIST clothing classification use case.
|
||||
|
||||
-Payment Card Fraud Prevention by using Snap ML on IBM Cloud Pak for Data on Red Hat OpenShift on a virtual machine on IBM Z and IBM LinuxONE, which leverage the z16 integrated AI accelerator describes a use case that uses Snap Machine Learning in Cloud Pak for Data on IBM Z and IBM LinuxONE. It is a Snap ML use case.
|
||||
- GLYPH<SM590000> The Cloud Pak for Data 4.5 on IBM Z Overview Demo video provides an overview of some of the more important features of CP4D on IBM Z.
|
||||
- GLYPH<SM590000> IBM Cloud Pak for Data Tutorials.
|
||||
- GLYPH<SM590000> Here are some additional use cases that use the data science frameworks that are available as part of CP4D on IBM Z and IBM LinuxONE:
|
||||
- -Payment Card Fraud Detection by using TensorFlow on CP4D on IBM Z and IBM LinuxONE is a payment card fraud detection use case.
|
||||
- -Fashion-MNIST clothing classification with PyTorch on Cloud Pak for Data on IBM Z and IBM LinuxONE is a Fashion-MNIST clothing classification use case.
|
||||
- -Payment Card Fraud Prevention by using Snap ML on IBM Cloud Pak for Data on Red Hat OpenShift on a virtual machine on IBM Z and IBM LinuxONE, which leverage the z16 integrated AI accelerator describes a use case that uses Snap Machine Learning in Cloud Pak for Data on IBM Z and IBM LinuxONE. It is a Snap ML use case.
|
||||
|
||||
A companion video can be found at Credit Card Fraud Detection by using Snap ML on IBM Cloud Pak for Data on IBM Z and IBM LinuxONE.
|
||||
|
||||
@ -662,15 +563,13 @@ ibm.com /redbooks/residencies.html
|
||||
|
||||
## Stay connected to IBM Redbooks
|
||||
|
||||
GLYPH<SM590000> Find us on LinkedIn:
|
||||
- GLYPH<SM590000> Find us on LinkedIn:
|
||||
|
||||
http://www.linkedin.com/groups?home=&gid=2130806
|
||||
|
||||
GLYPH<SM590000> Explore new Redbooks publications, residencies, and workshops with the IBM Redbooks weekly newsletter:
|
||||
|
||||
https://www.redbooks.ibm.com/Redbooks.nsf/subscribe?OpenForm
|
||||
|
||||
GLYPH<SM590000> Stay current on recent Redbooks publications with RSS Feeds:
|
||||
- GLYPH<SM590000> Explore new Redbooks publications, residencies, and workshops with the IBM Redbooks weekly newsletter:
|
||||
- https://www.redbooks.ibm.com/Redbooks.nsf/subscribe?OpenForm
|
||||
- GLYPH<SM590000> Stay current on recent Redbooks publications with RSS Feeds:
|
||||
|
||||
http://www.redbooks.ibm.com/rss.html
|
||||
|
||||
|
24
tests/data/groundtruth/docling_v2/test_01.asciidoc.md
Normal file
@ -0,0 +1,24 @@
# Sample Document Title

## Section 1

This is some introductory text in section 1.

## Subsection 1.1

- * First list item

- * Second list item

This is some introductory text in section 1.1.

- - A dash list item

## Section 2

This is some text in section 2.

| Header 1 | Header 2 |
|------------|------------|
| Value 1 | Value 2 |
| Value 3 | Value 4 |
83
tests/data/groundtruth/docling_v2/test_02.asciidoc.md
Normal file
@ -0,0 +1,83 @@
2nd Sample Document Title

This is an abstract.

Section 1: Testing nestedlists

- First item
- Nested item 1
- Nested item 2
- Second item
- Nested ordered item 1
- Nested ordered item 2
- Deeper nested unordered item
- Third item
- Nested ordered item 1
- Nested ordered item 2
- Deeper nested unordered item
- Nested ordered item 2

Section 2

bla bla

bla bla bla

Section 3: test image

image::images/example1.png[Example Image, width=200, height=150, align=center]

.An example caption for the image

image::images/example2.png[Example Image, width=200, height=150, align=center]

Section 4: test tables

| Header 1 | Header 2 |
|------------|------------|
| Value 1 | Value 2 |
| Value 3 | Value 4 |

.Caption for the table 1

|===

| Header 1 | Header 2 |
|------------|------------|
| Value 1 | Value 2 |
| Value 3 | Value 4 |

.Caption for the table 2

|===

| Column 1 Heading | Column 2 Heading | Column 3 Heading |
|--------------------|--------------------|------------------------|
| Cell 1 | Cell 2 | Cell 3 |
| Cell 4 | Cell 5 colspan=2 | Cell spans two columns |

.Caption for the table 3

|===

| Column 1 Heading | Column 2 Heading | Column 3 Heading |
|--------------------|--------------------|--------------------|
| Rowspan=2 | Cell 2 | Cell 3 |
| Cell 5 | Cell 6 | |

.Caption for the table 4

|===

| Col 1 | Col 2 | Col 3 | Col 4 |
|---------------------|------------------------------------|---------|---------|
| Rowspan=2.Colspan=2 | Cell spanning 2 rows and 2 columns | Col 3 | Col 4 |
| Col 3 | Col 4 | | |
| Col 1 | Col 2 | Col 3 | Col 4 |

SubSubSection 2.1.1
Binary file not shown.
25
tests/data/test_01.asciidoc
Normal file
@ -0,0 +1,25 @@
= 1st Sample Document Title

This is an abstract.

== Section 1

This is some introductory text in section 1.

This spans multiple lines but should be treated
as a single paragraph.

=== Subsection 1.1
* First list item
* Second list item

This is some introductory text in section 1.1.

- A dash list item

== Section 2
This is some text in section 2.

|Header 1|Header 2|
|Value 1|Value 2|
|Value 3|Value 4|
69
tests/data/test_02.asciidoc
Normal file
@ -0,0 +1,69 @@
= 2nd Sample Document Title

This is an abstract.

== Section 1: Testing nestedlists

* First item
* Nested item 1
* Nested item 2
* Second item
1. Nested ordered item 1
2. Nested ordered item 2
* Deeper nested unordered item
* Third item
1. Nested ordered item 1
2. Nested ordered item 2
* Deeper nested unordered item
3. Nested ordered item 2

== Section 2

bla bla

==== SubSubSection 2.1.1

bla bla bla
bli bla ble

== Section 3: test image

image::images/example1.png[Example Image, width=200, height=150, align=center]

.An example caption for the image
image::images/example2.png[Example Image, width=200, height=150, align=center]

== Section 4: test tables

|Header 1|Header 2|
|Value 1|Value 2|
|Value 3|Value 4|

.Caption for the table 1
|===
|Header 1 |Header 2
|Value 1 |Value 2
|Value 3 |Value 4
|===

.Caption for the table 2
|===
|Column 1 Heading |Column 2 Heading |Column 3 Heading
|Cell 1 |Cell 2 |Cell 3
|Cell 4 |Cell 5 colspan=2|Cell spans two columns
|===

.Caption for the table 3
|===
|Column 1 Heading |Column 2 Heading |Column 3 Heading
|Rowspan=2 |Cell 2 |Cell 3
| |Cell 5 |Cell 6
|===

.Caption for the table 4
|===
|Col 1 |Col 2 |Col 3 |Col 4
|Rowspan=2.Colspan=2|Cell spanning 2 rows and 2 columns |Col 3 |Col 4
| | |Col 3 |Col 4
|Col 1 |Col 2 |Col 3 |Col 4
|===
54
tests/test_backend_asciidoc.py
Normal file
@ -0,0 +1,54 @@
import glob
import os
from pathlib import Path

import pytest
from docling_core.types.doc import BoundingBox

from docling.backend.asciidoc_backend import AsciiDocBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument


def _get_backend(fname):
    in_doc = InputDocument(
        path_or_stream=fname,
        format=InputFormat.ASCIIDOC,
        backend=AsciiDocBackend,
    )

    doc_backend = in_doc._backend
    return doc_backend


def test_asciidocs_examples():

    fnames = sorted(glob.glob("./tests/data/*.asciidoc"))

    for fname in fnames:
        print(f"reading {fname}")

        bname = os.path.basename(fname)
        gname = os.path.join("./tests/data/groundtruth/docling_v2/", bname + ".md")

        doc_backend = _get_backend(Path(fname))
        doc = doc_backend.convert()

        pred_itdoc = doc._export_to_indented_text(max_text_len=16)
        print("\n\n", pred_itdoc)

        pred_mddoc = doc.export_to_markdown()
        print("\n\n", pred_mddoc)

        if os.path.exists(gname):
            with open(gname, "r") as fr:
                true_mddoc = fr.read()

            # assert pred_mddoc == true_mddoc, "pred_mddoc!=true_mddoc for asciidoc"
        else:
            with open(gname, "w") as fw:
                fw.write(pred_mddoc)

        # print("\n\n", doc.export_to_markdown())

    assert True
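For context on how these new test inputs are meant to be consumed, a minimal end-to-end sketch with docling's high-level converter could look like this; it assumes the DocumentConverter API of docling v2 and one of the AsciiDoc file paths added in this change.

```python
from docling.document_converter import DocumentConverter

# Convert one of the new AsciiDoc test inputs and print its Markdown export.
converter = DocumentConverter()
result = converter.convert("tests/data/test_01.asciidoc")
print(result.document.export_to_markdown())
```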