diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 7c71690..3b9a55a 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -17,6 +17,7 @@ from docling_core.types.doc import ( TableData, ) from docling_core.types.doc.document import ContentLayer +from pydantic import BaseModel from typing_extensions import override from docling.backend.abstract_backend import DeclarativeDocumentBackend @@ -48,6 +49,11 @@ TAGS_FOR_NODE_ITEMS: Final = [ ] +class _Context(BaseModel): + list_ordered_flag_by_ref: dict[str, bool] = {} + list_start_by_ref: dict[str, int] = {} + + class HTMLDocumentBackend(DeclarativeDocumentBackend): @override def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): @@ -59,6 +65,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.max_levels = 10 self.level = 0 self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {} + self.ctx = _Context() for i in range(self.max_levels): self.parents[i] = None @@ -121,6 +128,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.content_layer = ( ContentLayer.BODY if headers is None else ContentLayer.FURNITURE ) + self.ctx = _Context() # reset context self.walk(content, doc) else: raise RuntimeError( @@ -294,28 +302,25 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): def handle_list(self, element: Tag, doc: DoclingDocument) -> None: """Handles list tags (ul, ol) and their list items.""" - if element.name == "ul": - # create a list group - self.parents[self.level + 1] = doc.add_group( - parent=self.parents[self.level], - name="list", - label=GroupLabel.LIST, - content_layer=self.content_layer, - ) - elif element.name == "ol": + start: Optional[int] = None + if is_ordered := element.name == "ol": start_attr = element.get("start") - start: int = ( - int(start_attr) - if isinstance(start_attr, str) and start_attr.isnumeric() - else 1 - ) - # create a list group - self.parents[self.level + 1] = doc.add_group( - parent=self.parents[self.level], - name="ordered list" + (f" start {start}" if start != 1 else ""), - label=GroupLabel.ORDERED_LIST, - content_layer=self.content_layer, - ) + if isinstance(start_attr, str) and start_attr.isnumeric(): + start = int(start_attr) + name = "ordered list" + (f" start {start}" if start is not None else "") + else: + name = "list" + # create a list group + list_group = doc.add_list_group( + name=name, + parent=self.parents[self.level], + content_layer=self.content_layer, + ) + self.parents[self.level + 1] = list_group + self.ctx.list_ordered_flag_by_ref[list_group.self_ref] = is_ordered + if is_ordered and start is not None: + self.ctx.list_start_by_ref[list_group.self_ref] = start + self.level += 1 self.walk(element, doc) @@ -331,16 +336,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): if parent is None: _log.debug(f"list-item has no parent in DoclingDocument: {element}") return - parent_label: str = parent.label - index_in_list = len(parent.children) + 1 - if ( - parent_label == GroupLabel.ORDERED_LIST - and isinstance(parent, GroupItem) - and parent.name - ): - start_in_list: str = parent.name.split(" ")[-1] - start: int = int(start_in_list) if start_in_list.isnumeric() else 1 - index_in_list += start - 1 + enumerated = self.ctx.list_ordered_flag_by_ref.get(parent.self_ref, False) + if enumerated and (start := self.ctx.list_start_by_ref.get(parent.self_ref)): + marker = f"{start + len(parent.children)}." 
+ else: + marker = "" if nested_list: # Text in list item can be hidden within hierarchy, hence @@ -350,12 +350,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): text = text.replace("\n", "").replace("\r", "") text = " ".join(text.split()).strip() - marker = "" - enumerated = False - if parent_label == GroupLabel.ORDERED_LIST: - marker = str(index_in_list) - enumerated = True - if len(text) > 0: # create a list-item self.parents[self.level + 1] = doc.add_list_item( @@ -375,11 +369,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): elif element.text.strip(): text = element.text.strip() - marker = "" - enumerated = False - if parent_label == GroupLabel.ORDERED_LIST: - marker = f"{index_in_list!s}." - enumerated = True doc.add_list_item( text=text, enumerated=enumerated, diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py index 58c0e6e..fb42547 100644 --- a/docling/backend/md_backend.py +++ b/docling/backend/md_backend.py @@ -14,13 +14,12 @@ from docling_core.types.doc import ( DocItemLabel, DoclingDocument, DocumentOrigin, - GroupLabel, NodeItem, TableCell, TableData, TextItem, ) -from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList +from docling_core.types.doc.document import Formatting from marko import Markdown from pydantic import AnyUrl, BaseModel, Field, TypeAdapter from typing_extensions import Annotated @@ -51,6 +50,7 @@ class _HeadingCreationPayload(BaseModel): class _ListItemCreationPayload(BaseModel): kind: Literal["list_item"] = "list_item" + enumerated: bool _CreationPayload = Annotated[ @@ -187,15 +187,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): doc: DoclingDocument, parent_item: Optional[NodeItem], text: str, + enumerated: bool, formatting: Optional[Formatting] = None, hyperlink: Optional[Union[AnyUrl, Path]] = None, ): - if not isinstance(parent_item, (OrderedList, UnorderedList)): - _log.warning("ListItem would have not had a list parent, adding one.") - parent_item = doc.add_unordered_list(parent=parent_item) item = doc.add_list_item( text=text, - enumerated=(isinstance(parent_item, OrderedList)), + enumerated=enumerated, parent=parent_item, formatting=formatting, hyperlink=hyperlink, @@ -238,6 +236,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): creation_stack: list[ _CreationPayload ], # stack for lazy item creation triggered deep in marko's AST (on RawText) + list_ordered_flag_by_ref: dict[str, bool], parent_item: Optional[NodeItem] = None, formatting: Optional[Formatting] = None, hyperlink: Optional[Union[AnyUrl, Path]] = None, @@ -275,10 +274,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): self._close_table(doc) _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}") if has_non_empty_list_items: - label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST - parent_item = doc.add_group( - label=label, name="list", parent=parent_item - ) + parent_item = doc.add_list_group(name="list", parent=parent_item) + list_ordered_flag_by_ref[parent_item.self_ref] = element.ordered elif ( isinstance(element, marko.block.ListItem) @@ -289,16 +286,22 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): self._close_table(doc) _log.debug(" - List item") + enumerated = ( + list_ordered_flag_by_ref.get(parent_item.self_ref, False) + if parent_item + else False + ) if len(child.children) > 1: # inline group will be created further down parent_item = self._create_list_item( doc=doc, parent_item=parent_item, text="", + 
enumerated=enumerated, formatting=formatting, hyperlink=hyperlink, ) else: - creation_stack.append(_ListItemCreationPayload()) + creation_stack.append(_ListItemCreationPayload(enumerated=enumerated)) elif isinstance(element, marko.inline.Image): self._close_table(doc) @@ -349,10 +352,18 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): while len(creation_stack) > 0: to_create = creation_stack.pop() if isinstance(to_create, _ListItemCreationPayload): + enumerated = ( + list_ordered_flag_by_ref.get( + parent_item.self_ref, False + ) + if parent_item + else False + ) parent_item = self._create_list_item( doc=doc, parent_item=parent_item, text=snippet_text, + enumerated=enumerated, formatting=formatting, hyperlink=hyperlink, ) @@ -453,6 +464,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): doc=doc, visited=visited, creation_stack=creation_stack, + list_ordered_flag_by_ref=list_ordered_flag_by_ref, parent_item=parent_item, formatting=formatting, hyperlink=hyperlink, @@ -497,6 +509,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): parent_item=None, visited=set(), creation_stack=[], + list_ordered_flag_by_ref={}, ) self._close_table(doc=doc) # handle any last hanging table diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index 63aa9e9..f512fb7 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -121,7 +121,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB return prov - def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size): + def handle_text_elements( + self, shape, parent_slide, slide_ind, doc: DoclingDocument, slide_size + ): is_list_group_created = False enum_list_item_value = 0 new_list = None @@ -165,10 +167,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB enumerated = bullet_type == "Numbered" if not is_list_group_created: - new_list = doc.add_group( - label=GroupLabel.ORDERED_LIST - if enumerated - else GroupLabel.LIST, + new_list = doc.add_list_group( name="list", parent=parent_slide, ) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 3e84d64..abbcc6f 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -10,11 +10,12 @@ from docling_core.types.doc import ( DocumentOrigin, GroupLabel, ImageRef, + ListGroup, NodeItem, TableCell, TableData, ) -from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList +from docling_core.types.doc.document import Formatting from docx import Document from docx.document import Document as DocxDocument from docx.oxml.table import CT_Tc @@ -688,7 +689,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): paragraph_elements: list, ) -> Optional[NodeItem]: return ( - doc.add_group(label=GroupLabel.INLINE, parent=prev_parent) + doc.add_inline_group(parent=prev_parent) if len(paragraph_elements) > 1 else prev_parent ) @@ -781,9 +782,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): else: # Inline equation level = self._get_level() - inline_equation = doc.add_group( - label=GroupLabel.INLINE, parent=self.parents[level - 1] - ) + inline_equation = doc.add_inline_group(parent=self.parents[level - 1]) text_tmp = text for eq in equations: if len(text_tmp) == 0: @@ -931,18 +930,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): level: int, ) -> None: # This should not happen by construction - if not 
isinstance(self.parents[level], (OrderedList, UnorderedList)): + if not isinstance(self.parents[level], ListGroup): return + if not elements: + return + if len(elements) == 1: text, format, hyperlink = elements[0] - doc.add_list_item( - marker=marker, - enumerated=enumerated, - parent=self.parents[level], - text=text, - formatting=format, - hyperlink=hyperlink, - ) + if text: + doc.add_list_item( + marker=marker, + enumerated=enumerated, + parent=self.parents[level], + text=text, + formatting=format, + hyperlink=hyperlink, + ) else: new_item = doc.add_list_item( marker=marker, @@ -950,15 +953,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): parent=self.parents[level], text="", ) - new_parent = doc.add_group(label=GroupLabel.INLINE, parent=new_item) + new_parent = doc.add_inline_group(parent=new_item) for text, format, hyperlink in elements: - doc.add_text( - label=DocItemLabel.TEXT, - parent=new_parent, - text=text, - formatting=format, - hyperlink=hyperlink, - ) + if text: + doc.add_text( + label=DocItemLabel.TEXT, + parent=new_parent, + text=text, + formatting=format, + hyperlink=hyperlink, + ) def _add_list_item( self, @@ -979,8 +983,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if self._prev_numid() is None: # Open new list self.level_at_new_list = level - self.parents[level] = doc.add_group( - label=GroupLabel.LIST, name="list", parent=self.parents[level - 1] + self.parents[level] = doc.add_list_group( + name="list", parent=self.parents[level - 1] ) # Set marker and enumerated arguments if this is an enumeration element. @@ -1001,19 +1005,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.level_at_new_list + prev_indent + 1, self.level_at_new_list + ilevel + 1, ): - # Determine if this is an unordered list or an ordered list. - # Set GroupLabel.ORDERED_LIST when it fits. self.listIter = 0 - if is_numbered: - self.parents[i] = doc.add_group( - label=GroupLabel.ORDERED_LIST, - name="list", - parent=self.parents[i - 1], - ) - else: - self.parents[i] = doc.add_group( - label=GroupLabel.LIST, name="list", parent=self.parents[i - 1] - ) + self.parents[i] = doc.add_list_group( + name="list", parent=self.parents[i - 1] + ) # TODO: Set marker and enumerated arguments if this is an enumeration element. self.listIter += 1 diff --git a/pyproject.toml b/pyproject.toml index 8ba5951..2218a27 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,7 +44,7 @@ authors = [ requires-python = '>=3.9,<4.0' dependencies = [ 'pydantic (>=2.0.0,<3.0.0)', - 'docling-core[chunking] (>=2.29.0,<3.0.0)', + 'docling-core[chunking] (>=2.39.0,<3.0.0)', 'docling-ibm-models (>=3.4.4,<4.0.0)', 'docling-parse (>=4.0.0,<5.0.0)', 'filetype (>=1.2.0,<2.0.0)', diff --git a/tests/data/groundtruth/docling_v1/2203.01017v2.doctags.txt b/tests/data/groundtruth/docling_v1/2203.01017v2.doctags.txt index 94fd99a..d047d93 100644 --- a/tests/data/groundtruth/docling_v1/2203.01017v2.doctags.txt +++ b/tests/data/groundtruth/docling_v1/2203.01017v2.doctags.txt @@ -14,8 +14,8 @@ 1 -- b. Red-annotation of bounding boxes, Blue-predictions by TableFormer -- c. Structure predicted by TableFormer: +b. Red-annotation of bounding boxes, Blue-predictions by TableFormer +c. Structure predicted by TableFormer:
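Aside: the backend changes above all share one pattern, since docling-core 2.39 replaces label-based list groups (GroupLabel.LIST / GroupLabel.ORDERED_LIST) with add_list_group and leaves orderedness to the caller. A minimal sketch of that pattern, assuming the docling-core >= 2.39.0 API pinned in pyproject.toml above; the demo document and values are hypothetical:

    # Track orderedness and an optional start value per list group, keyed by
    # self_ref, instead of encoding them in the group label or name.
    from docling_core.types.doc import DoclingDocument

    doc = DoclingDocument(name="demo")
    list_ordered_flag_by_ref: dict[str, bool] = {}
    list_start_by_ref: dict[str, int] = {}

    group = doc.add_list_group(name="ordered list start 3")  # e.g. <ol start="3">
    list_ordered_flag_by_ref[group.self_ref] = True
    list_start_by_ref[group.self_ref] = 3

    for text in ("alpha", "beta"):
        enumerated = list_ordered_flag_by_ref.get(group.self_ref, False)
        start = list_start_by_ref.get(group.self_ref)
        # Same marker rule as handle_list_item: an explicit start plus the
        # number of children already added; otherwise leave the marker empty.
        marker = (
            f"{start + len(group.children)}."
            if enumerated and start is not None
            else ""
        )
        doc.add_list_item(text=text, enumerated=enumerated, marker=marker, parent=group)
    # Items carry markers "3." and "4."; without an explicit start, markers
    # stay empty and numbering is left to serialization.

Deriving the marker from the parent's current child count is what lets handle_list_item drop the old logic that parsed the start value back out of the group name.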
@@ -38,10 +38,10 @@ The second problem is called table-structure decomposition. The latter is a long standing problem in the community of document understanding [6, 4, 14]. Contrary to the table-location problem, there are no commonly used approaches that can easily be re-purposed to solve this problem. Lately, a set of new model-architectures has been proposed by the community to address table-structure decomposition [37, 36, 18, 20]. All these models have some weaknesses (see Sec. 2). The common denominator here is the reliance on textual features and/or the inability to provide the bounding box of each table-cell in the original image. In this paper, we want to address these weaknesses and present a robust table-structure decomposition algorithm. The design criteria for our model are the following. First, we want our algorithm to be language agnostic. In this way, we can obtain the structure of any table, irregardless of the language. Second, we want our algorithm to leverage as much data as possible from the original PDF document. For programmatic PDF documents, the text-cells can often be extracted much faster and with higher accuracy compared to OCR methods. Last but not least, we want to have a direct link between the table-cell and its bounding box in the image. To meet the design criteria listed above, we developed a new model called TableFormer and a synthetically generated table structure dataset called SynthTabNet $^{1}$. In particular, our contributions in this work can be summarised as follows: -- · We propose TableFormer , a transformer based model that predicts tables structure and bounding boxes for the table content simultaneously in an end-to-end approach. -- · Across all benchmark datasets TableFormer significantly outperforms existing state-of-the-art metrics, while being much more efficient in training and inference to existing works. -- · We present SynthTabNet a synthetically generated dataset, with various appearance styles and complexity. -- · An augmented dataset based on PubTabNet [37], FinTabNet [36], and TableBank [17] with generated ground-truth for reproducibility. +· We propose TableFormer , a transformer based model that predicts tables structure and bounding boxes for the table content simultaneously in an end-to-end approach. +· Across all benchmark datasets TableFormer significantly outperforms existing state-of-the-art metrics, while being much more efficient in training and inference to existing works. +· We present SynthTabNet a synthetically generated dataset, with various appearance styles and complexity. +· An augmented dataset based on PubTabNet [37], FinTabNet [36], and TableBank [17] with generated ground-truth for reproducibility. The paper is structured as follows. In Sec. 2, we give a brief overview of the current state-of-the-art. In Sec. 3, we describe the datasets on which we train. In Sec. 4, we introduce the TableFormer model-architecture and describe its results & performance in Sec. 5. As a conclusion, we describe how this new model-architecture can be re-purposed for other tasks in the computer-vision community. 2. Previous work and State of the Art @@ -160,8 +160,8 @@ TableFormer95.490.193.6 Table 4: Results of structure with content retrieved using cell detection on PubTabNet. In all cases the input is PDF documents with cropped tables. -- a. -- Red - PDF cells, Green - predicted bounding boxes, Blue - post-processed predictions matched to PDF cells +a. 
+Red - PDF cells, Green - predicted bounding boxes, Blue - post-processed predictions matched to PDF cells Japanese language (previously unseen by TableFormer): Example table from FinTabNet:
@@ -215,47 +215,47 @@ We showcase several visualizations for the different components of our network on various "complex" tables within datasets presented in this work in Fig. 5 and Fig. 6 As it is shown, our model is able to predict bounding boxes for all table cells, even for the empty ones. Additionally, our post-processing techniques can extract the cell content by matching the predicted bounding boxes to the PDF cells based on their overlap and spatial proximity. The left part of Fig. 5 demonstrates also the adaptability of our method to any language, as it can successfully extract Japanese text, although the training set contains only English content. We provide more visualizations including the intermediate steps in the supplementary material. Overall these illustrations justify the versatility of our method across a diverse range of table appearances and content type. In this paper, we presented TableFormer an end-to-end transformer based approach to predict table structures and bounding boxes of cells from an image. This approach enables us to recreate the table structure, and extract the cell content from PDF or OCR by using bounding boxes. Additionally, it provides the versatility required in real-world scenarios when dealing with various types of PDF documents, and languages. Furthermore, our method outperforms all state-of-the-arts with a wide margin. Finally, we introduce "SynthTabNet" a challenging synthetically generated dataset that reinforces missing characteristics from other datasets. References -- [1] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to- -- end object detection with transformers. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision - ECCV 2020 , pages 213-229, Cham, 2020. Springer International Publishing. 5 -- [2] Zewen Chi, Heyan Huang, Heng-Da Xu, Houjin Yu, Wanxuan Yin, and Xian-Ling Mao. Complicated table structure recognition. arXiv preprint arXiv:1908.04729 , 2019. 3 -- [3] Bertrand Couasnon and Aurelie Lemaitre. Recognition of Tables and Forms , pages 647-677. Springer London, London, 2014. 2 -- [4] Herv´e D´ejean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), Apr. 2019. http://sac.founderit.com/. 2 -- [5] Basilios Gatos, Dimitrios Danatsas, Ioannis Pratikakis, and Stavros J Perantonis. Automatic table detection in document images. In International Conference on Pattern Recognition and Image Analysis , pages 609-618. Springer, 2005. 2 -- [6] Max G¨obel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. 2 -- [7] EA Green and M Krishnamoorthy. Recognition of tables using table grammars. procs. In Symposium on Document Analysis and Recognition (SDAIR'95) , pages 261-277. 2 -- [8] Khurram Azeem Hashmi, Alain Pagani, Marcus Liwicki, Didier Stricker, and Muhammad Zeshan Afzal. Castabdetectors: Cascade network for table detection in document images with recursive feature pyramid and switchable atrous convolution. Journal of Imaging , 7(10), 2021. 1 -- [9] Kaiming He, Georgia Gkioxari, Piotr Dollar, and Ross Girshick. Mask r-cnn. In Proceedings of the IEEE International Conference on Computer Vision (ICCV) , Oct 2017. 1 -- [10] Yelin He, X. Qi, Jiaquan Ye, Peng Gao, Yihao Chen, Bingcong Li, Xin Tang, and Rong Xiao. 
Pingan-vcgroup's solution for icdar 2021 competition on scientific table image recognition to latex. ArXiv , abs/2105.01846, 2021. 2 -- [11] Jianying Hu, Ramanujan S Kashi, Daniel P Lopresti, and Gordon Wilfong. Medium-independent table detection. In Document Recognition and Retrieval VII , volume 3967, pages 291-302. International Society for Optics and Photonics, 1999. 2 -- [12] Matthew Hurst. A constraint-based approach to table structure derivation. In Proceedings of the Seventh International Conference on Document Analysis and Recognition - Volume 2 , ICDAR '03, page 911, USA, 2003. IEEE Computer Society. 2 -- [13] Thotreingam Kasar, Philippine Barlas, Sebastien Adam, Cl´ement Chatelain, and Thierry Paquet. Learning to detect tables in scanned document images using line information. In 2013 12th International Conference on Document Analysis and Recognition , pages 1185-1189. IEEE, 2013. 2 -- [14] Pratik Kayal, Mrinal Anand, Harsh Desai, and Mayank Singh. Icdar 2021 competition on scientific table image recognition to latex, 2021. 2 -- [15] Harold W Kuhn. The hungarian method for the assignment problem. Naval research logistics quarterly , 2(1-2):83-97, 1955. 6 -- [16] Girish Kulkarni, Visruth Premraj, Vicente Ordonez, Sagnik Dhar, Siming Li, Yejin Choi, Alexander C. Berg, and Tamara L. Berg. Babytalk: Understanding and generating simple image descriptions. IEEE Transactions on Pattern Analysis and Machine Intelligence , 35(12):2891-2903, 2013. 4 -- [17] Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou, and Zhoujun Li. Tablebank: A benchmark dataset for table detection and recognition, 2019. 2, 3 -- [18] Yiren Li, Zheng Huang, Junchi Yan, Yi Zhou, Fan Ye, and Xianhui Liu. Gfte: Graph-based financial table extraction. In Alberto Del Bimbo, Rita Cucchiara, Stan Sclaroff, Giovanni Maria Farinella, Tao Mei, Marco Bertini, Hugo Jair Escalante, and Roberto Vezzani, editors, Pattern Recognition. ICPR International Workshops and Challenges , pages 644-658, Cham, 2021. Springer International Publishing. 2, 3 -- [19] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter Staar. Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence , 35(17):15137-15145, May 2021. 1 -- [20] Rujiao Long, Wen Wang, Nan Xue, Feiyu Gao, Zhibo Yang, Yongpan Wang, and Gui-Song Xia. Parsing table structures in the wild. In Proceedings of the IEEE/CVF International Conference on Computer Vision , pages 944-952, 2021. 2 -- [21] Shubham Singh Paliwal, D Vishwanath, Rohit Rahul, Monika Sharma, and Lovekesh Vig. Tablenet: Deep learning model for end-to-end table detection and tabular data extraction from scanned document images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 128-133. IEEE, 2019. 1 -- [22] Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. Pytorch: An imperative style, high-performance deep learning library. In H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch´e-Buc, E. Fox, and R. Garnett, editors, Advances in Neural Information Processing Systems 32 , pages 8024-8035. Curran Associates, Inc., 2019. 
6 -- [23] Devashish Prasad, Ayan Gadpal, Kshitij Kapadni, Manish Visave, and Kavita Sultanpure. Cascadetabnet: An approach for end to end table detection and structure recognition from image-based documents. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops , pages 572-573, 2020. 1 -- [24] Shah Rukh Qasim, Hassan Mahmood, and Faisal Shafait. Rethinking table recognition using graph neural networks. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 142-147. IEEE, 2019. 3 -- [25] Hamid Rezatofighi, Nathan Tsoi, JunYoung Gwak, Amir Sadeghian, Ian Reid, and Silvio Savarese. Generalized intersection over union: A metric and a loss for bounding box regression. In Proceedings of the IEEE/CVF Conference on +[1] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to- +end object detection with transformers. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision - ECCV 2020 , pages 213-229, Cham, 2020. Springer International Publishing. 5 +[2] Zewen Chi, Heyan Huang, Heng-Da Xu, Houjin Yu, Wanxuan Yin, and Xian-Ling Mao. Complicated table structure recognition. arXiv preprint arXiv:1908.04729 , 2019. 3 +[3] Bertrand Couasnon and Aurelie Lemaitre. Recognition of Tables and Forms , pages 647-677. Springer London, London, 2014. 2 +[4] Herv´e D´ejean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), Apr. 2019. http://sac.founderit.com/. 2 +[5] Basilios Gatos, Dimitrios Danatsas, Ioannis Pratikakis, and Stavros J Perantonis. Automatic table detection in document images. In International Conference on Pattern Recognition and Image Analysis , pages 609-618. Springer, 2005. 2 +[6] Max G¨obel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. 2 +[7] EA Green and M Krishnamoorthy. Recognition of tables using table grammars. procs. In Symposium on Document Analysis and Recognition (SDAIR'95) , pages 261-277. 2 +[8] Khurram Azeem Hashmi, Alain Pagani, Marcus Liwicki, Didier Stricker, and Muhammad Zeshan Afzal. Castabdetectors: Cascade network for table detection in document images with recursive feature pyramid and switchable atrous convolution. Journal of Imaging , 7(10), 2021. 1 +[9] Kaiming He, Georgia Gkioxari, Piotr Dollar, and Ross Girshick. Mask r-cnn. In Proceedings of the IEEE International Conference on Computer Vision (ICCV) , Oct 2017. 1 +[10] Yelin He, X. Qi, Jiaquan Ye, Peng Gao, Yihao Chen, Bingcong Li, Xin Tang, and Rong Xiao. Pingan-vcgroup's solution for icdar 2021 competition on scientific table image recognition to latex. ArXiv , abs/2105.01846, 2021. 2 +[11] Jianying Hu, Ramanujan S Kashi, Daniel P Lopresti, and Gordon Wilfong. Medium-independent table detection. In Document Recognition and Retrieval VII , volume 3967, pages 291-302. International Society for Optics and Photonics, 1999. 2 +[12] Matthew Hurst. A constraint-based approach to table structure derivation. In Proceedings of the Seventh International Conference on Document Analysis and Recognition - Volume 2 , ICDAR '03, page 911, USA, 2003. IEEE Computer Society. 2 +[13] Thotreingam Kasar, Philippine Barlas, Sebastien Adam, Cl´ement Chatelain, and Thierry Paquet. 
Learning to detect tables in scanned document images using line information. In 2013 12th International Conference on Document Analysis and Recognition , pages 1185-1189. IEEE, 2013. 2 +[14] Pratik Kayal, Mrinal Anand, Harsh Desai, and Mayank Singh. Icdar 2021 competition on scientific table image recognition to latex, 2021. 2 +[15] Harold W Kuhn. The hungarian method for the assignment problem. Naval research logistics quarterly , 2(1-2):83-97, 1955. 6 +[16] Girish Kulkarni, Visruth Premraj, Vicente Ordonez, Sagnik Dhar, Siming Li, Yejin Choi, Alexander C. Berg, and Tamara L. Berg. Babytalk: Understanding and generating simple image descriptions. IEEE Transactions on Pattern Analysis and Machine Intelligence , 35(12):2891-2903, 2013. 4 +[17] Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou, and Zhoujun Li. Tablebank: A benchmark dataset for table detection and recognition, 2019. 2, 3 +[18] Yiren Li, Zheng Huang, Junchi Yan, Yi Zhou, Fan Ye, and Xianhui Liu. Gfte: Graph-based financial table extraction. In Alberto Del Bimbo, Rita Cucchiara, Stan Sclaroff, Giovanni Maria Farinella, Tao Mei, Marco Bertini, Hugo Jair Escalante, and Roberto Vezzani, editors, Pattern Recognition. ICPR International Workshops and Challenges , pages 644-658, Cham, 2021. Springer International Publishing. 2, 3 +[19] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter Staar. Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence , 35(17):15137-15145, May 2021. 1 +[20] Rujiao Long, Wen Wang, Nan Xue, Feiyu Gao, Zhibo Yang, Yongpan Wang, and Gui-Song Xia. Parsing table structures in the wild. In Proceedings of the IEEE/CVF International Conference on Computer Vision , pages 944-952, 2021. 2 +[21] Shubham Singh Paliwal, D Vishwanath, Rohit Rahul, Monika Sharma, and Lovekesh Vig. Tablenet: Deep learning model for end-to-end table detection and tabular data extraction from scanned document images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 128-133. IEEE, 2019. 1 +[22] Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. Pytorch: An imperative style, high-performance deep learning library. In H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch´e-Buc, E. Fox, and R. Garnett, editors, Advances in Neural Information Processing Systems 32 , pages 8024-8035. Curran Associates, Inc., 2019. 6 +[23] Devashish Prasad, Ayan Gadpal, Kshitij Kapadni, Manish Visave, and Kavita Sultanpure. Cascadetabnet: An approach for end to end table detection and structure recognition from image-based documents. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops , pages 572-573, 2020. 1 +[24] Shah Rukh Qasim, Hassan Mahmood, and Faisal Shafait. Rethinking table recognition using graph neural networks. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 142-147. IEEE, 2019. 3 +[25] Hamid Rezatofighi, Nathan Tsoi, JunYoung Gwak, Amir Sadeghian, Ian Reid, and Silvio Savarese. Generalized intersection over union: A metric and a loss for bounding box regression. 
In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition , pages 658-666, 2019. 6 -- [26] Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 11621167, 2017. 1 -- [27] Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR international conference on document analysis and recognition (ICDAR) , volume 1, pages 1162-1167. IEEE, 2017. 3 -- [28] Faisal Shafait and Ray Smith. Table detection in heterogeneous documents. In Proceedings of the 9th IAPR International Workshop on Document Analysis Systems , pages 6572, 2010. 2 -- [29] Shoaib Ahmed Siddiqui, Imran Ali Fateh, Syed Tahseen Raza Rizvi, Andreas Dengel, and Sheraz Ahmed. Deeptabstr: Deep learning based table structure recognition. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1403-1409. IEEE, 2019. 3 -- [30] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD , KDD '18, pages 774-782, New York, NY, USA, 2018. ACM. 1 -- [31] Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Ł ukasz Kaiser, and Illia Polosukhin. Attention is all you need. In I. Guyon, U. V. Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett, editors, Advances in Neural Information Processing Systems 30 , pages 5998-6008. Curran Associates, Inc., 2017. 5 -- [32] Oriol Vinyals, Alexander Toshev, Samy Bengio, and Dumitru Erhan. Show and tell: A neural image caption generator. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) , June 2015. 2 -- [33] Wenyuan Xue, Qingyong Li, and Dacheng Tao. Res2tim: reconstruct syntactic structures from table images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 749-755. IEEE, 2019. 3 -- [34] Wenyuan Xue, Baosheng Yu, Wen Wang, Dacheng Tao, and Qingyong Li. Tgrnet: A table graph reconstruction network for table structure recognition. arXiv preprint arXiv:2106.10598 , 2021. 3 -- [35] Quanzeng You, Hailin Jin, Zhaowen Wang, Chen Fang, and Jiebo Luo. Image captioning with semantic attention. In Proceedings of the IEEE conference on computer vision and pattern recognition , pages 4651-4659, 2016. 4 -- [36] Xinyi Zheng, Doug Burdick, Lucian Popa, Peter Zhong, and Nancy Xin Ru Wang. Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. Winter Conference for Applications in Computer Vision (WACV) , 2021. 2, 3 -- [37] Xu Zhong, Elaheh ShafieiBavani, and Antonio Jimeno Yepes. Image-based table recognition: Data, model, -- and evaluation. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision ECCV 2020 , pages 564-580, Cham, 2020. Springer International Publishing. 2, 3, 7 -- [38] Xu Zhong, Jianbin Tang, and Antonio Jimeno Yepes. Publaynet: Largest dataset ever for document layout analysis. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1015-1022, 2019. 
1 +[26] Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 11621167, 2017. 1 +[27] Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR international conference on document analysis and recognition (ICDAR) , volume 1, pages 1162-1167. IEEE, 2017. 3 +[28] Faisal Shafait and Ray Smith. Table detection in heterogeneous documents. In Proceedings of the 9th IAPR International Workshop on Document Analysis Systems , pages 6572, 2010. 2 +[29] Shoaib Ahmed Siddiqui, Imran Ali Fateh, Syed Tahseen Raza Rizvi, Andreas Dengel, and Sheraz Ahmed. Deeptabstr: Deep learning based table structure recognition. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1403-1409. IEEE, 2019. 3 +[30] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD , KDD '18, pages 774-782, New York, NY, USA, 2018. ACM. 1 +[31] Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Ł ukasz Kaiser, and Illia Polosukhin. Attention is all you need. In I. Guyon, U. V. Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett, editors, Advances in Neural Information Processing Systems 30 , pages 5998-6008. Curran Associates, Inc., 2017. 5 +[32] Oriol Vinyals, Alexander Toshev, Samy Bengio, and Dumitru Erhan. Show and tell: A neural image caption generator. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) , June 2015. 2 +[33] Wenyuan Xue, Qingyong Li, and Dacheng Tao. Res2tim: reconstruct syntactic structures from table images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 749-755. IEEE, 2019. 3 +[34] Wenyuan Xue, Baosheng Yu, Wen Wang, Dacheng Tao, and Qingyong Li. Tgrnet: A table graph reconstruction network for table structure recognition. arXiv preprint arXiv:2106.10598 , 2021. 3 +[35] Quanzeng You, Hailin Jin, Zhaowen Wang, Chen Fang, and Jiebo Luo. Image captioning with semantic attention. In Proceedings of the IEEE conference on computer vision and pattern recognition , pages 4651-4659, 2016. 4 +[36] Xinyi Zheng, Doug Burdick, Lucian Popa, Peter Zhong, and Nancy Xin Ru Wang. Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. Winter Conference for Applications in Computer Vision (WACV) , 2021. 2, 3 +[37] Xu Zhong, Elaheh ShafieiBavani, and Antonio Jimeno Yepes. Image-based table recognition: Data, model, +and evaluation. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision ECCV 2020 , pages 564-580, Cham, 2020. Springer International Publishing. 2, 3, 7 +[38] Xu Zhong, Jianbin Tang, and Antonio Jimeno Yepes. Publaynet: Largest dataset ever for document layout analysis. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1015-1022, 2019. 1 TableFormer: Table Structure Understanding with Transformers Supplementary Material 1. Details on the datasets 1.1. Data preparation @@ -265,11 +265,11 @@ 1.2. 
Synthetic datasets Aiming to train and evaluate our models in a broader spectrum of table data we have synthesized four types of datasets. Each one contains tables with different appear- ances in regard to their size, structure, style and content. Every synthetic dataset contains 150k examples, summing up to 600k synthetic examples. All datasets are divided into Train, Test and Val splits (80%, 10%, 10%). The process of generating a synthetic dataset can be decomposed into the following steps: -- 1. Prepare styling and content templates: The styling templates have been manually designed and organized into groups of scope specific appearances (e.g. financial data, marketing data, etc.) Additionally, we have prepared curated collections of content templates by extracting the most frequently used terms out of non-synthetic datasets (e.g. PubTabNet, FinTabNet, etc.). -- 2. Generate table structures: The structure of each synthetic dataset assumes a horizontal table header which potentially spans over multiple rows and a table body that may contain a combination of row spans and column spans. However, spans are not allowed to cross the header - body boundary. The table structure is described by the parameters: Total number of table rows and columns, number of header rows, type of spans (header only spans, row only spans, column only spans, both row and column spans), maximum span size and the ratio of the table area covered by spans. -- 3. Generate content: Based on the dataset theme , a set of suitable content templates is chosen first. Then, this content can be combined with purely random text to produce the synthetic content. -- 4. Apply styling templates: Depending on the domain of the synthetic dataset, a set of styling templates is first manually selected. Then, a style is randomly selected to format the appearance of the synthesized table. -- 5. Render the complete tables: The synthetic table is finally rendered by a web browser engine to generate the bounding boxes for each table cell. A batching technique is utilized to optimize the runtime overhead of the rendering process. +1. Prepare styling and content templates: The styling templates have been manually designed and organized into groups of scope specific appearances (e.g. financial data, marketing data, etc.) Additionally, we have prepared curated collections of content templates by extracting the most frequently used terms out of non-synthetic datasets (e.g. PubTabNet, FinTabNet, etc.). +2. Generate table structures: The structure of each synthetic dataset assumes a horizontal table header which potentially spans over multiple rows and a table body that may contain a combination of row spans and column spans. However, spans are not allowed to cross the header - body boundary. The table structure is described by the parameters: Total number of table rows and columns, number of header rows, type of spans (header only spans, row only spans, column only spans, both row and column spans), maximum span size and the ratio of the table area covered by spans. +3. Generate content: Based on the dataset theme , a set of suitable content templates is chosen first. Then, this content can be combined with purely random text to produce the synthetic content. +4. Apply styling templates: Depending on the domain of the synthetic dataset, a set of styling templates is first manually selected. Then, a style is randomly selected to format the appearance of the synthesized table. +5. 
Render the complete tables: The synthetic table is finally rendered by a web browser engine to generate the bounding boxes for each table cell. A batching technique is utilized to optimize the runtime overhead of the rendering process. 2. Prediction post-processing for PDF documents Although TableFormer can predict the table structure and the bounding boxes for tables recognized inside PDF documents, this is not enough when a full reconstruction of the original table is required. This happens mainly due the following reasons:
@@ -277,27 +277,27 @@ Figure 7: Distribution of the tables across different dimensions per dataset. Simple vs complex tables per dataset and split, strict vs non strict html structures per dataset and table complexity, missing bboxes per dataset and table complexity.
Figure 7: Distribution of the tables across different dimensions per dataset. Simple vs complex tables per dataset and split, strict vs non strict html structures per dataset and table complexity, missing bboxes per dataset and table complexity. -- · TableFormer output does not include the table cell content. -- · There are occasional inaccuracies in the predictions of the bounding boxes. +· TableFormer output does not include the table cell content. +· There are occasional inaccuracies in the predictions of the bounding boxes. dian cell size for all table cells. The usage of median during the computations, helps to eliminate outliers caused by occasional column spans which are usually wider than the normal. However, it is possible to mitigate those limitations by combining the TableFormer predictions with the information already present inside a programmatic PDF document. More specifically, PDF documents can be seen as a sequence of PDF cells where each cell is described by its content and bounding box. If we are able to associate the PDF cells with the predicted table cells, we can directly link the PDF cell content to the table cell structure and use the PDF bounding boxes to correct misalignments in the predicted table cell bounding boxes. Here is a step-by-step description of the prediction postprocessing: -- 1. Get the minimal grid dimensions - number of rows and columns for the predicted table structure. This represents the most granular grid for the underlying table structure. -- 2. Generate pair-wise matches between the bounding boxes of the PDF cells and the predicted cells. The Intersection Over Union (IOU) metric is used to evaluate the quality of the matches. -- 3. Use a carefully selected IOU threshold to designate the matches as "good" ones and "bad" ones. -- 3.a. If all IOU scores in a column are below the threshold, discard all predictions (structure and bounding boxes) for that column. -- 4. Find the best-fitting content alignment for the predicted cells with good IOU per each column. The alignment of the column can be identified by the following formula: +1. Get the minimal grid dimensions - number of rows and columns for the predicted table structure. This represents the most granular grid for the underlying table structure. +2. Generate pair-wise matches between the bounding boxes of the PDF cells and the predicted cells. The Intersection Over Union (IOU) metric is used to evaluate the quality of the matches. +3. Use a carefully selected IOU threshold to designate the matches as "good" ones and "bad" ones. +3.a. If all IOU scores in a column are below the threshold, discard all predictions (structure and bounding boxes) for that column. +4. Find the best-fitting content alignment for the predicted cells with good IOU per each column. The alignment of the column can be identified by the following formula: where c is one of { left, centroid, right } and x$_{c}$ is the xcoordinate for the corresponding point. -- 5. Use the alignment computed in step 4, to compute the median x -coordinate for all table columns and the me- -- 6. Snap all cells with bad IOU to their corresponding median x -coordinates and cell sizes. -- 7. Generate a new set of pair-wise matches between the corrected bounding boxes and PDF cells. This time use a modified version of the IOU metric, where the area of the intersection between the predicted and PDF cells is divided by the PDF cell area. In case there are multiple matches for the same PDF cell, the prediction with the higher score is preferred. 
This covers the cases where the PDF cells are smaller than the area of predicted or corrected prediction cells. -- 8. In some rare occasions, we have noticed that TableFormer can confuse a single column as two. When the postprocessing steps are applied, this results with two predicted columns pointing to the same PDF column. In such case we must de-duplicate the columns according to highest total column intersection score. -- 9. Pick up the remaining orphan cells. There could be cases, when after applying all the previous post-processing steps, some PDF cells could still remain without any match to predicted cells. However, it is still possible to deduce the correct matching for an orphan PDF cell by mapping its bounding box on the geometry of the grid. This mapping decides if the content of the orphan cell will be appended to an already matched table cell, or a new table cell should be created to match with the orphan. +5. Use the alignment computed in step 4, to compute the median x -coordinate for all table columns and the me- +6. Snap all cells with bad IOU to their corresponding median x -coordinates and cell sizes. +7. Generate a new set of pair-wise matches between the corrected bounding boxes and PDF cells. This time use a modified version of the IOU metric, where the area of the intersection between the predicted and PDF cells is divided by the PDF cell area. In case there are multiple matches for the same PDF cell, the prediction with the higher score is preferred. This covers the cases where the PDF cells are smaller than the area of predicted or corrected prediction cells. +8. In some rare occasions, we have noticed that TableFormer can confuse a single column as two. When the postprocessing steps are applied, this results with two predicted columns pointing to the same PDF column. In such case we must de-duplicate the columns according to highest total column intersection score. +9. Pick up the remaining orphan cells. There could be cases, when after applying all the previous post-processing steps, some PDF cells could still remain without any match to predicted cells. However, it is still possible to deduce the correct matching for an orphan PDF cell by mapping its bounding box on the geometry of the grid. This mapping decides if the content of the orphan cell will be appended to an already matched table cell, or a new table cell should be created to match with the orphan. 9a. Compute the top and bottom boundary of the horizontal band for each grid row (min/max y coordinates per row). -- 9b. Intersect the orphan's bounding box with the row bands, and map the cell to the closest grid row. -- 9c. Compute the left and right boundary of the vertical band for each grid column (min/max x coordinates per column). -- 9d. Intersect the orphan's bounding box with the column bands, and map the cell to the closest grid column. -- 9e. If the table cell under the identified row and column is not empty, extend its content with the content of the or- +9b. Intersect the orphan's bounding box with the row bands, and map the cell to the closest grid row. +9c. Compute the left and right boundary of the vertical band for each grid column (min/max x coordinates per column). +9d. Intersect the orphan's bounding box with the column bands, and map the cell to the closest grid column. +9e. If the table cell under the identified row and column is not empty, extend its content with the content of the or- phan cell. 9f. Otherwise create a new structural cell and match it wit the orphan cell. 
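Aside (illustrative only, not part of this diff or the groundtruth): the post-processing steps above use two overlap metrics, plain IOU for the initial matching in steps 2-3 and, in step 7, the intersection divided by the PDF cell area, e.g.:

    # Boxes are (x0, y0, x1, y1) tuples.
    def intersection(a, b):
        w = min(a[2], b[2]) - max(a[0], b[0])
        h = min(a[3], b[3]) - max(a[1], b[1])
        return max(w, 0.0) * max(h, 0.0)

    def area(box):
        return (box[2] - box[0]) * (box[3] - box[1])

    def iou(pred, pdf):  # steps 2-3
        union = area(pred) + area(pdf) - intersection(pred, pdf)
        return intersection(pred, pdf) / union if union else 0.0

    def iopdf(pred, pdf):  # step 7
        # A PDF cell fully contained in a larger predicted cell scores 1.0,
        # covering the case described above where PDF cells are smaller
        # than the predicted or corrected prediction cells.
        return intersection(pred, pdf) / area(pdf) if area(pdf) else 0.0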
Aditional images with examples of TableFormer predictions and post-processing can be found below. diff --git a/tests/data/groundtruth/docling_v1/2203.01017v2.json b/tests/data/groundtruth/docling_v1/2203.01017v2.json index ac10879..20027e8 100644 --- a/tests/data/groundtruth/docling_v1/2203.01017v2.json +++ b/tests/data/groundtruth/docling_v1/2203.01017v2.json @@ -326,7 +326,7 @@ "__ref_s3_data": null } ], - "text": "- b. Red-annotation of bounding boxes, Blue-predictions by TableFormer", + "text": "b. Red-annotation of bounding boxes, Blue-predictions by TableFormer", "type": "paragraph", "payload": null, "name": "List-item", @@ -349,7 +349,7 @@ "__ref_s3_data": null } ], - "text": "- c. Structure predicted by TableFormer:", + "text": "c. Structure predicted by TableFormer:", "type": "paragraph", "payload": null, "name": "List-item", @@ -548,7 +548,7 @@ "__ref_s3_data": null } ], - "text": "- \u00b7 We propose TableFormer , a transformer based model that predicts tables structure and bounding boxes for the table content simultaneously in an end-to-end approach.", + "text": "\u00b7 We propose TableFormer , a transformer based model that predicts tables structure and bounding boxes for the table content simultaneously in an end-to-end approach.", "type": "paragraph", "payload": null, "name": "List-item", @@ -571,7 +571,7 @@ "__ref_s3_data": null } ], - "text": "- \u00b7 Across all benchmark datasets TableFormer significantly outperforms existing state-of-the-art metrics, while being much more efficient in training and inference to existing works.", + "text": "\u00b7 Across all benchmark datasets TableFormer significantly outperforms existing state-of-the-art metrics, while being much more efficient in training and inference to existing works.", "type": "paragraph", "payload": null, "name": "List-item", @@ -594,7 +594,7 @@ "__ref_s3_data": null } ], - "text": "- \u00b7 We present SynthTabNet a synthetically generated dataset, with various appearance styles and complexity.", + "text": "\u00b7 We present SynthTabNet a synthetically generated dataset, with various appearance styles and complexity.", "type": "paragraph", "payload": null, "name": "List-item", @@ -617,7 +617,7 @@ "__ref_s3_data": null } ], - "text": "- \u00b7 An augmented dataset based on PubTabNet [37], FinTabNet [36], and TableBank [17] with generated ground-truth for reproducibility.", + "text": "\u00b7 An augmented dataset based on PubTabNet [37], FinTabNet [36], and TableBank [17] with generated ground-truth for reproducibility.", "type": "paragraph", "payload": null, "name": "List-item", @@ -2221,7 +2221,7 @@ "__ref_s3_data": null } ], - "text": "- a.", + "text": "a.", "type": "paragraph", "payload": null, "name": "List-item", @@ -2244,7 +2244,7 @@ "__ref_s3_data": null } ], - "text": "- Red - PDF cells, Green - predicted bounding boxes, Blue - post-processed predictions matched to PDF cells", + "text": "Red - PDF cells, Green - predicted bounding boxes, Blue - post-processed predictions matched to PDF cells", "type": "paragraph", "payload": null, "name": "List-item", @@ -2555,7 +2555,7 @@ "__ref_s3_data": null } ], - "text": "- [1] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-", + "text": "[1] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. 
End-to-", "type": "paragraph", "payload": null, "name": "List-item", @@ -2578,7 +2578,7 @@ "__ref_s3_data": null } ], - "text": "- end object detection with transformers. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision - ECCV 2020 , pages 213-229, Cham, 2020. Springer International Publishing. 5", + "text": "end object detection with transformers. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision - ECCV 2020 , pages 213-229, Cham, 2020. Springer International Publishing. 5", "type": "paragraph", "payload": null, "name": "List-item", @@ -2601,7 +2601,7 @@ "__ref_s3_data": null } ], - "text": "- [2] Zewen Chi, Heyan Huang, Heng-Da Xu, Houjin Yu, Wanxuan Yin, and Xian-Ling Mao. Complicated table structure recognition. arXiv preprint arXiv:1908.04729 , 2019. 3", + "text": "[2] Zewen Chi, Heyan Huang, Heng-Da Xu, Houjin Yu, Wanxuan Yin, and Xian-Ling Mao. Complicated table structure recognition. arXiv preprint arXiv:1908.04729 , 2019. 3", "type": "paragraph", "payload": null, "name": "List-item", @@ -2624,7 +2624,7 @@ "__ref_s3_data": null } ], - "text": "- [3] Bertrand Couasnon and Aurelie Lemaitre. Recognition of Tables and Forms , pages 647-677. Springer London, London, 2014. 2", + "text": "[3] Bertrand Couasnon and Aurelie Lemaitre. Recognition of Tables and Forms , pages 647-677. Springer London, London, 2014. 2", "type": "paragraph", "payload": null, "name": "List-item", @@ -2647,7 +2647,7 @@ "__ref_s3_data": null } ], - "text": "- [4] Herv\u00b4e D\u00b4ejean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), Apr. 2019. http://sac.founderit.com/. 2", + "text": "[4] Herv\u00b4e D\u00b4ejean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), Apr. 2019. http://sac.founderit.com/. 2", "type": "paragraph", "payload": null, "name": "List-item", @@ -2670,7 +2670,7 @@ "__ref_s3_data": null } ], - "text": "- [5] Basilios Gatos, Dimitrios Danatsas, Ioannis Pratikakis, and Stavros J Perantonis. Automatic table detection in document images. In International Conference on Pattern Recognition and Image Analysis , pages 609-618. Springer, 2005. 2", + "text": "[5] Basilios Gatos, Dimitrios Danatsas, Ioannis Pratikakis, and Stavros J Perantonis. Automatic table detection in document images. In International Conference on Pattern Recognition and Image Analysis , pages 609-618. Springer, 2005. 2", "type": "paragraph", "payload": null, "name": "List-item", @@ -2693,7 +2693,7 @@ "__ref_s3_data": null } ], - "text": "- [6] Max G\u00a8obel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. 2", + "text": "[6] Max G\u00a8obel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. 2", "type": "paragraph", "payload": null, "name": "List-item", @@ -2716,7 +2716,7 @@ "__ref_s3_data": null } ], - "text": "- [7] EA Green and M Krishnamoorthy. Recognition of tables using table grammars. procs. In Symposium on Document Analysis and Recognition (SDAIR'95) , pages 261-277. 2", + "text": "[7] EA Green and M Krishnamoorthy. Recognition of tables using table grammars. procs. 
In Symposium on Document Analysis and Recognition (SDAIR'95) , pages 261-277. 2", "type": "paragraph", "payload": null, "name": "List-item", @@ -2739,7 +2739,7 @@ "__ref_s3_data": null } ], - "text": "- [8] Khurram Azeem Hashmi, Alain Pagani, Marcus Liwicki, Didier Stricker, and Muhammad Zeshan Afzal. Castabdetectors: Cascade network for table detection in document images with recursive feature pyramid and switchable atrous convolution. Journal of Imaging , 7(10), 2021. 1", + "text": "[8] Khurram Azeem Hashmi, Alain Pagani, Marcus Liwicki, Didier Stricker, and Muhammad Zeshan Afzal. Castabdetectors: Cascade network for table detection in document images with recursive feature pyramid and switchable atrous convolution. Journal of Imaging , 7(10), 2021. 1", "type": "paragraph", "payload": null, "name": "List-item", @@ -2762,7 +2762,7 @@ "__ref_s3_data": null } ], - "text": "- [9] Kaiming He, Georgia Gkioxari, Piotr Dollar, and Ross Girshick. Mask r-cnn. In Proceedings of the IEEE International Conference on Computer Vision (ICCV) , Oct 2017. 1", + "text": "[9] Kaiming He, Georgia Gkioxari, Piotr Dollar, and Ross Girshick. Mask r-cnn. In Proceedings of the IEEE International Conference on Computer Vision (ICCV) , Oct 2017. 1", "type": "paragraph", "payload": null, "name": "List-item", @@ -2785,7 +2785,7 @@ "__ref_s3_data": null } ], - "text": "- [10] Yelin He, X. Qi, Jiaquan Ye, Peng Gao, Yihao Chen, Bingcong Li, Xin Tang, and Rong Xiao. Pingan-vcgroup's solution for icdar 2021 competition on scientific table image recognition to latex. ArXiv , abs/2105.01846, 2021. 2", + "text": "[10] Yelin He, X. Qi, Jiaquan Ye, Peng Gao, Yihao Chen, Bingcong Li, Xin Tang, and Rong Xiao. Pingan-vcgroup's solution for icdar 2021 competition on scientific table image recognition to latex. ArXiv , abs/2105.01846, 2021. 2", "type": "paragraph", "payload": null, "name": "List-item", @@ -2808,7 +2808,7 @@ "__ref_s3_data": null } ], - "text": "- [11] Jianying Hu, Ramanujan S Kashi, Daniel P Lopresti, and Gordon Wilfong. Medium-independent table detection. In Document Recognition and Retrieval VII , volume 3967, pages 291-302. International Society for Optics and Photonics, 1999. 2", + "text": "[11] Jianying Hu, Ramanujan S Kashi, Daniel P Lopresti, and Gordon Wilfong. Medium-independent table detection. In Document Recognition and Retrieval VII , volume 3967, pages 291-302. International Society for Optics and Photonics, 1999. 2", "type": "paragraph", "payload": null, "name": "List-item", @@ -2831,7 +2831,7 @@ "__ref_s3_data": null } ], - "text": "- [12] Matthew Hurst. A constraint-based approach to table structure derivation. In Proceedings of the Seventh International Conference on Document Analysis and Recognition - Volume 2 , ICDAR '03, page 911, USA, 2003. IEEE Computer Society. 2", + "text": "[12] Matthew Hurst. A constraint-based approach to table structure derivation. In Proceedings of the Seventh International Conference on Document Analysis and Recognition - Volume 2 , ICDAR '03, page 911, USA, 2003. IEEE Computer Society. 2", "type": "paragraph", "payload": null, "name": "List-item", @@ -2854,7 +2854,7 @@ "__ref_s3_data": null } ], - "text": "- [13] Thotreingam Kasar, Philippine Barlas, Sebastien Adam, Cl\u00b4ement Chatelain, and Thierry Paquet. Learning to detect tables in scanned document images using line information. In 2013 12th International Conference on Document Analysis and Recognition , pages 1185-1189. IEEE, 2013. 
2", + "text": "[13] Thotreingam Kasar, Philippine Barlas, Sebastien Adam, Cl\u00b4ement Chatelain, and Thierry Paquet. Learning to detect tables in scanned document images using line information. In 2013 12th International Conference on Document Analysis and Recognition , pages 1185-1189. IEEE, 2013. 2", "type": "paragraph", "payload": null, "name": "List-item", @@ -2877,7 +2877,7 @@ "__ref_s3_data": null } ], - "text": "- [14] Pratik Kayal, Mrinal Anand, Harsh Desai, and Mayank Singh. Icdar 2021 competition on scientific table image recognition to latex, 2021. 2", + "text": "[14] Pratik Kayal, Mrinal Anand, Harsh Desai, and Mayank Singh. Icdar 2021 competition on scientific table image recognition to latex, 2021. 2", "type": "paragraph", "payload": null, "name": "List-item", @@ -2900,7 +2900,7 @@ "__ref_s3_data": null } ], - "text": "- [15] Harold W Kuhn. The hungarian method for the assignment problem. Naval research logistics quarterly , 2(1-2):83-97, 1955. 6", + "text": "[15] Harold W Kuhn. The hungarian method for the assignment problem. Naval research logistics quarterly , 2(1-2):83-97, 1955. 6", "type": "paragraph", "payload": null, "name": "List-item", @@ -2923,7 +2923,7 @@ "__ref_s3_data": null } ], - "text": "- [16] Girish Kulkarni, Visruth Premraj, Vicente Ordonez, Sagnik Dhar, Siming Li, Yejin Choi, Alexander C. Berg, and Tamara L. Berg. Babytalk: Understanding and generating simple image descriptions. IEEE Transactions on Pattern Analysis and Machine Intelligence , 35(12):2891-2903, 2013. 4", + "text": "[16] Girish Kulkarni, Visruth Premraj, Vicente Ordonez, Sagnik Dhar, Siming Li, Yejin Choi, Alexander C. Berg, and Tamara L. Berg. Babytalk: Understanding and generating simple image descriptions. IEEE Transactions on Pattern Analysis and Machine Intelligence , 35(12):2891-2903, 2013. 4", "type": "paragraph", "payload": null, "name": "List-item", @@ -2946,7 +2946,7 @@ "__ref_s3_data": null } ], - "text": "- [17] Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou, and Zhoujun Li. Tablebank: A benchmark dataset for table detection and recognition, 2019. 2, 3", + "text": "[17] Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou, and Zhoujun Li. Tablebank: A benchmark dataset for table detection and recognition, 2019. 2, 3", "type": "paragraph", "payload": null, "name": "List-item", @@ -2969,7 +2969,7 @@ "__ref_s3_data": null } ], - "text": "- [18] Yiren Li, Zheng Huang, Junchi Yan, Yi Zhou, Fan Ye, and Xianhui Liu. Gfte: Graph-based financial table extraction. In Alberto Del Bimbo, Rita Cucchiara, Stan Sclaroff, Giovanni Maria Farinella, Tao Mei, Marco Bertini, Hugo Jair Escalante, and Roberto Vezzani, editors, Pattern Recognition. ICPR International Workshops and Challenges , pages 644-658, Cham, 2021. Springer International Publishing. 2, 3", + "text": "[18] Yiren Li, Zheng Huang, Junchi Yan, Yi Zhou, Fan Ye, and Xianhui Liu. Gfte: Graph-based financial table extraction. In Alberto Del Bimbo, Rita Cucchiara, Stan Sclaroff, Giovanni Maria Farinella, Tao Mei, Marco Bertini, Hugo Jair Escalante, and Roberto Vezzani, editors, Pattern Recognition. ICPR International Workshops and Challenges , pages 644-658, Cham, 2021. Springer International Publishing. 2, 3", "type": "paragraph", "payload": null, "name": "List-item", @@ -2992,7 +2992,7 @@ "__ref_s3_data": null } ], - "text": "- [19] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter Staar. 
Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence , 35(17):15137-15145, May 2021. 1", + "text": "[19] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter Staar. Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence , 35(17):15137-15145, May 2021. 1", "type": "paragraph", "payload": null, "name": "List-item", @@ -3015,7 +3015,7 @@ "__ref_s3_data": null } ], - "text": "- [20] Rujiao Long, Wen Wang, Nan Xue, Feiyu Gao, Zhibo Yang, Yongpan Wang, and Gui-Song Xia. Parsing table structures in the wild. In Proceedings of the IEEE/CVF International Conference on Computer Vision , pages 944-952, 2021. 2", + "text": "[20] Rujiao Long, Wen Wang, Nan Xue, Feiyu Gao, Zhibo Yang, Yongpan Wang, and Gui-Song Xia. Parsing table structures in the wild. In Proceedings of the IEEE/CVF International Conference on Computer Vision , pages 944-952, 2021. 2", "type": "paragraph", "payload": null, "name": "List-item", @@ -3038,7 +3038,7 @@ "__ref_s3_data": null } ], - "text": "- [21] Shubham Singh Paliwal, D Vishwanath, Rohit Rahul, Monika Sharma, and Lovekesh Vig. Tablenet: Deep learning model for end-to-end table detection and tabular data extraction from scanned document images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 128-133. IEEE, 2019. 1", + "text": "[21] Shubham Singh Paliwal, D Vishwanath, Rohit Rahul, Monika Sharma, and Lovekesh Vig. Tablenet: Deep learning model for end-to-end table detection and tabular data extraction from scanned document images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 128-133. IEEE, 2019. 1", "type": "paragraph", "payload": null, "name": "List-item", @@ -3061,7 +3061,7 @@ "__ref_s3_data": null } ], - "text": "- [22] Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. Pytorch: An imperative style, high-performance deep learning library. In H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch\u00b4e-Buc, E. Fox, and R. Garnett, editors, Advances in Neural Information Processing Systems 32 , pages 8024-8035. Curran Associates, Inc., 2019. 6", + "text": "[22] Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. Pytorch: An imperative style, high-performance deep learning library. In H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch\u00b4e-Buc, E. Fox, and R. Garnett, editors, Advances in Neural Information Processing Systems 32 , pages 8024-8035. Curran Associates, Inc., 2019. 6", "type": "paragraph", "payload": null, "name": "List-item", @@ -3084,7 +3084,7 @@ "__ref_s3_data": null } ], - "text": "- [23] Devashish Prasad, Ayan Gadpal, Kshitij Kapadni, Manish Visave, and Kavita Sultanpure. Cascadetabnet: An approach for end to end table detection and structure recognition from image-based documents. 
In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops , pages 572-573, 2020. 1", + "text": "[23] Devashish Prasad, Ayan Gadpal, Kshitij Kapadni, Manish Visave, and Kavita Sultanpure. Cascadetabnet: An approach for end to end table detection and structure recognition from image-based documents. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops , pages 572-573, 2020. 1", "type": "paragraph", "payload": null, "name": "List-item", @@ -3107,7 +3107,7 @@ "__ref_s3_data": null } ], - "text": "- [24] Shah Rukh Qasim, Hassan Mahmood, and Faisal Shafait. Rethinking table recognition using graph neural networks. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 142-147. IEEE, 2019. 3", + "text": "[24] Shah Rukh Qasim, Hassan Mahmood, and Faisal Shafait. Rethinking table recognition using graph neural networks. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 142-147. IEEE, 2019. 3", "type": "paragraph", "payload": null, "name": "List-item", @@ -3130,7 +3130,7 @@ "__ref_s3_data": null } ], - "text": "- [25] Hamid Rezatofighi, Nathan Tsoi, JunYoung Gwak, Amir Sadeghian, Ian Reid, and Silvio Savarese. Generalized intersection over union: A metric and a loss for bounding box regression. In Proceedings of the IEEE/CVF Conference on", + "text": "[25] Hamid Rezatofighi, Nathan Tsoi, JunYoung Gwak, Amir Sadeghian, Ian Reid, and Silvio Savarese. Generalized intersection over union: A metric and a loss for bounding box regression. In Proceedings of the IEEE/CVF Conference on", "type": "paragraph", "payload": null, "name": "List-item", @@ -3176,7 +3176,7 @@ "__ref_s3_data": null } ], - "text": "- [26] Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 11621167, 2017. 1", + "text": "[26] Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 11621167, 2017. 1", "type": "paragraph", "payload": null, "name": "List-item", @@ -3199,7 +3199,7 @@ "__ref_s3_data": null } ], - "text": "- [27] Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR international conference on document analysis and recognition (ICDAR) , volume 1, pages 1162-1167. IEEE, 2017. 3", + "text": "[27] Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR international conference on document analysis and recognition (ICDAR) , volume 1, pages 1162-1167. IEEE, 2017. 3", "type": "paragraph", "payload": null, "name": "List-item", @@ -3222,7 +3222,7 @@ "__ref_s3_data": null } ], - "text": "- [28] Faisal Shafait and Ray Smith. Table detection in heterogeneous documents. In Proceedings of the 9th IAPR International Workshop on Document Analysis Systems , pages 6572, 2010. 2", + "text": "[28] Faisal Shafait and Ray Smith. Table detection in heterogeneous documents. 
In Proceedings of the 9th IAPR International Workshop on Document Analysis Systems , pages 6572, 2010. 2", "type": "paragraph", "payload": null, "name": "List-item", @@ -3245,7 +3245,7 @@ "__ref_s3_data": null } ], - "text": "- [29] Shoaib Ahmed Siddiqui, Imran Ali Fateh, Syed Tahseen Raza Rizvi, Andreas Dengel, and Sheraz Ahmed. Deeptabstr: Deep learning based table structure recognition. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1403-1409. IEEE, 2019. 3", + "text": "[29] Shoaib Ahmed Siddiqui, Imran Ali Fateh, Syed Tahseen Raza Rizvi, Andreas Dengel, and Sheraz Ahmed. Deeptabstr: Deep learning based table structure recognition. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1403-1409. IEEE, 2019. 3", "type": "paragraph", "payload": null, "name": "List-item", @@ -3268,7 +3268,7 @@ "__ref_s3_data": null } ], - "text": "- [30] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD , KDD '18, pages 774-782, New York, NY, USA, 2018. ACM. 1", + "text": "[30] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD , KDD '18, pages 774-782, New York, NY, USA, 2018. ACM. 1", "type": "paragraph", "payload": null, "name": "List-item", @@ -3291,7 +3291,7 @@ "__ref_s3_data": null } ], - "text": "- [31] Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141 ukasz Kaiser, and Illia Polosukhin. Attention is all you need. In I. Guyon, U. V. Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett, editors, Advances in Neural Information Processing Systems 30 , pages 5998-6008. Curran Associates, Inc., 2017. 5", + "text": "[31] Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141 ukasz Kaiser, and Illia Polosukhin. Attention is all you need. In I. Guyon, U. V. Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett, editors, Advances in Neural Information Processing Systems 30 , pages 5998-6008. Curran Associates, Inc., 2017. 5", "type": "paragraph", "payload": null, "name": "List-item", @@ -3314,7 +3314,7 @@ "__ref_s3_data": null } ], - "text": "- [32] Oriol Vinyals, Alexander Toshev, Samy Bengio, and Dumitru Erhan. Show and tell: A neural image caption generator. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) , June 2015. 2", + "text": "[32] Oriol Vinyals, Alexander Toshev, Samy Bengio, and Dumitru Erhan. Show and tell: A neural image caption generator. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) , June 2015. 2", "type": "paragraph", "payload": null, "name": "List-item", @@ -3337,7 +3337,7 @@ "__ref_s3_data": null } ], - "text": "- [33] Wenyuan Xue, Qingyong Li, and Dacheng Tao. Res2tim: reconstruct syntactic structures from table images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 749-755. IEEE, 2019. 3", + "text": "[33] Wenyuan Xue, Qingyong Li, and Dacheng Tao. Res2tim: reconstruct syntactic structures from table images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 749-755. IEEE, 2019. 
3", "type": "paragraph", "payload": null, "name": "List-item", @@ -3360,7 +3360,7 @@ "__ref_s3_data": null } ], - "text": "- [34] Wenyuan Xue, Baosheng Yu, Wen Wang, Dacheng Tao, and Qingyong Li. Tgrnet: A table graph reconstruction network for table structure recognition. arXiv preprint arXiv:2106.10598 , 2021. 3", + "text": "[34] Wenyuan Xue, Baosheng Yu, Wen Wang, Dacheng Tao, and Qingyong Li. Tgrnet: A table graph reconstruction network for table structure recognition. arXiv preprint arXiv:2106.10598 , 2021. 3", "type": "paragraph", "payload": null, "name": "List-item", @@ -3383,7 +3383,7 @@ "__ref_s3_data": null } ], - "text": "- [35] Quanzeng You, Hailin Jin, Zhaowen Wang, Chen Fang, and Jiebo Luo. Image captioning with semantic attention. In Proceedings of the IEEE conference on computer vision and pattern recognition , pages 4651-4659, 2016. 4", + "text": "[35] Quanzeng You, Hailin Jin, Zhaowen Wang, Chen Fang, and Jiebo Luo. Image captioning with semantic attention. In Proceedings of the IEEE conference on computer vision and pattern recognition , pages 4651-4659, 2016. 4", "type": "paragraph", "payload": null, "name": "List-item", @@ -3406,7 +3406,7 @@ "__ref_s3_data": null } ], - "text": "- [36] Xinyi Zheng, Doug Burdick, Lucian Popa, Peter Zhong, and Nancy Xin Ru Wang. Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. Winter Conference for Applications in Computer Vision (WACV) , 2021. 2, 3", + "text": "[36] Xinyi Zheng, Doug Burdick, Lucian Popa, Peter Zhong, and Nancy Xin Ru Wang. Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. Winter Conference for Applications in Computer Vision (WACV) , 2021. 2, 3", "type": "paragraph", "payload": null, "name": "List-item", @@ -3429,7 +3429,7 @@ "__ref_s3_data": null } ], - "text": "- [37] Xu Zhong, Elaheh ShafieiBavani, and Antonio Jimeno Yepes. Image-based table recognition: Data, model,", + "text": "[37] Xu Zhong, Elaheh ShafieiBavani, and Antonio Jimeno Yepes. Image-based table recognition: Data, model,", "type": "paragraph", "payload": null, "name": "List-item", @@ -3452,7 +3452,7 @@ "__ref_s3_data": null } ], - "text": "- and evaluation. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision ECCV 2020 , pages 564-580, Cham, 2020. Springer International Publishing. 2, 3, 7", + "text": "and evaluation. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision ECCV 2020 , pages 564-580, Cham, 2020. Springer International Publishing. 2, 3, 7", "type": "paragraph", "payload": null, "name": "List-item", @@ -3475,7 +3475,7 @@ "__ref_s3_data": null } ], - "text": "- [38] Xu Zhong, Jianbin Tang, and Antonio Jimeno Yepes. Publaynet: Largest dataset ever for document layout analysis. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1015-1022, 2019. 1", + "text": "[38] Xu Zhong, Jianbin Tang, and Antonio Jimeno Yepes. Publaynet: Largest dataset ever for document layout analysis. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1015-1022, 2019. 1", "type": "paragraph", "payload": null, "name": "List-item", @@ -3719,7 +3719,7 @@ "__ref_s3_data": null } ], - "text": "- 1. Prepare styling and content templates: The styling templates have been manually designed and organized into groups of scope specific appearances (e.g. 
financial data, marketing data, etc.) Additionally, we have prepared curated collections of content templates by extracting the most frequently used terms out of non-synthetic datasets (e.g. PubTabNet, FinTabNet, etc.).", + "text": "1. Prepare styling and content templates: The styling templates have been manually designed and organized into groups of scope specific appearances (e.g. financial data, marketing data, etc.) Additionally, we have prepared curated collections of content templates by extracting the most frequently used terms out of non-synthetic datasets (e.g. PubTabNet, FinTabNet, etc.).", "type": "paragraph", "payload": null, "name": "List-item", @@ -3742,7 +3742,7 @@ "__ref_s3_data": null } ], - "text": "- 2. Generate table structures: The structure of each synthetic dataset assumes a horizontal table header which potentially spans over multiple rows and a table body that may contain a combination of row spans and column spans. However, spans are not allowed to cross the header - body boundary. The table structure is described by the parameters: Total number of table rows and columns, number of header rows, type of spans (header only spans, row only spans, column only spans, both row and column spans), maximum span size and the ratio of the table area covered by spans.", + "text": "2. Generate table structures: The structure of each synthetic dataset assumes a horizontal table header which potentially spans over multiple rows and a table body that may contain a combination of row spans and column spans. However, spans are not allowed to cross the header - body boundary. The table structure is described by the parameters: Total number of table rows and columns, number of header rows, type of spans (header only spans, row only spans, column only spans, both row and column spans), maximum span size and the ratio of the table area covered by spans.", "type": "paragraph", "payload": null, "name": "List-item", @@ -3765,7 +3765,7 @@ "__ref_s3_data": null } ], - "text": "- 3. Generate content: Based on the dataset theme , a set of suitable content templates is chosen first. Then, this content can be combined with purely random text to produce the synthetic content.", + "text": "3. Generate content: Based on the dataset theme , a set of suitable content templates is chosen first. Then, this content can be combined with purely random text to produce the synthetic content.", "type": "paragraph", "payload": null, "name": "List-item", @@ -3788,7 +3788,7 @@ "__ref_s3_data": null } ], - "text": "- 4. Apply styling templates: Depending on the domain of the synthetic dataset, a set of styling templates is first manually selected. Then, a style is randomly selected to format the appearance of the synthesized table.", + "text": "4. Apply styling templates: Depending on the domain of the synthetic dataset, a set of styling templates is first manually selected. Then, a style is randomly selected to format the appearance of the synthesized table.", "type": "paragraph", "payload": null, "name": "List-item", @@ -3811,7 +3811,7 @@ "__ref_s3_data": null } ], - "text": "- 5. Render the complete tables: The synthetic table is finally rendered by a web browser engine to generate the bounding boxes for each table cell. A batching technique is utilized to optimize the runtime overhead of the rendering process.", + "text": "5. Render the complete tables: The synthetic table is finally rendered by a web browser engine to generate the bounding boxes for each table cell. 
A batching technique is utilized to optimize the runtime overhead of the rendering process.", "type": "paragraph", "payload": null, "name": "List-item", @@ -3908,7 +3908,7 @@ "__ref_s3_data": null } ], - "text": "- \u00b7 TableFormer output does not include the table cell content.", + "text": "\u00b7 TableFormer output does not include the table cell content.", "type": "paragraph", "payload": null, "name": "List-item", @@ -3931,7 +3931,7 @@ "__ref_s3_data": null } ], - "text": "- \u00b7 There are occasional inaccuracies in the predictions of the bounding boxes.", + "text": "\u00b7 There are occasional inaccuracies in the predictions of the bounding boxes.", "type": "paragraph", "payload": null, "name": "List-item", @@ -4023,7 +4023,7 @@ "__ref_s3_data": null } ], - "text": "- 1. Get the minimal grid dimensions - number of rows and columns for the predicted table structure. This represents the most granular grid for the underlying table structure.", + "text": "1. Get the minimal grid dimensions - number of rows and columns for the predicted table structure. This represents the most granular grid for the underlying table structure.", "type": "paragraph", "payload": null, "name": "List-item", @@ -4046,7 +4046,7 @@ "__ref_s3_data": null } ], - "text": "- 2. Generate pair-wise matches between the bounding boxes of the PDF cells and the predicted cells. The Intersection Over Union (IOU) metric is used to evaluate the quality of the matches.", + "text": "2. Generate pair-wise matches between the bounding boxes of the PDF cells and the predicted cells. The Intersection Over Union (IOU) metric is used to evaluate the quality of the matches.", "type": "paragraph", "payload": null, "name": "List-item", @@ -4069,7 +4069,7 @@ "__ref_s3_data": null } ], - "text": "- 3. Use a carefully selected IOU threshold to designate the matches as \"good\" ones and \"bad\" ones.", + "text": "3. Use a carefully selected IOU threshold to designate the matches as \"good\" ones and \"bad\" ones.", "type": "paragraph", "payload": null, "name": "List-item", @@ -4092,7 +4092,7 @@ "__ref_s3_data": null } ], - "text": "- 3.a. If all IOU scores in a column are below the threshold, discard all predictions (structure and bounding boxes) for that column.", + "text": "3.a. If all IOU scores in a column are below the threshold, discard all predictions (structure and bounding boxes) for that column.", "type": "paragraph", "payload": null, "name": "List-item", @@ -4115,7 +4115,7 @@ "__ref_s3_data": null } ], - "text": "- 4. Find the best-fitting content alignment for the predicted cells with good IOU per each column. The alignment of the column can be identified by the following formula:", + "text": "4. Find the best-fitting content alignment for the predicted cells with good IOU per each column. The alignment of the column can be identified by the following formula:", "type": "paragraph", "payload": null, "name": "List-item", @@ -4184,7 +4184,7 @@ "__ref_s3_data": null } ], - "text": "- 5. Use the alignment computed in step 4, to compute the median x -coordinate for all table columns and the me-", + "text": "5. Use the alignment computed in step 4, to compute the median x -coordinate for all table columns and the me-", "type": "paragraph", "payload": null, "name": "List-item", @@ -4207,7 +4207,7 @@ "__ref_s3_data": null } ], - "text": "- 6. Snap all cells with bad IOU to their corresponding median x -coordinates and cell sizes.", + "text": "6. 
Snap all cells with bad IOU to their corresponding median x -coordinates and cell sizes.", "type": "paragraph", "payload": null, "name": "List-item", @@ -4230,7 +4230,7 @@ "__ref_s3_data": null } ], - "text": "- 7. Generate a new set of pair-wise matches between the corrected bounding boxes and PDF cells. This time use a modified version of the IOU metric, where the area of the intersection between the predicted and PDF cells is divided by the PDF cell area. In case there are multiple matches for the same PDF cell, the prediction with the higher score is preferred. This covers the cases where the PDF cells are smaller than the area of predicted or corrected prediction cells.", + "text": "7. Generate a new set of pair-wise matches between the corrected bounding boxes and PDF cells. This time use a modified version of the IOU metric, where the area of the intersection between the predicted and PDF cells is divided by the PDF cell area. In case there are multiple matches for the same PDF cell, the prediction with the higher score is preferred. This covers the cases where the PDF cells are smaller than the area of predicted or corrected prediction cells.", "type": "paragraph", "payload": null, "name": "List-item", @@ -4253,7 +4253,7 @@ "__ref_s3_data": null } ], - "text": "- 8. In some rare occasions, we have noticed that TableFormer can confuse a single column as two. When the postprocessing steps are applied, this results with two predicted columns pointing to the same PDF column. In such case we must de-duplicate the columns according to highest total column intersection score.", + "text": "8. In some rare occasions, we have noticed that TableFormer can confuse a single column as two. When the postprocessing steps are applied, this results with two predicted columns pointing to the same PDF column. In such case we must de-duplicate the columns according to highest total column intersection score.", "type": "paragraph", "payload": null, "name": "List-item", @@ -4276,7 +4276,7 @@ "__ref_s3_data": null } ], - "text": "- 9. Pick up the remaining orphan cells. There could be cases, when after applying all the previous post-processing steps, some PDF cells could still remain without any match to predicted cells. However, it is still possible to deduce the correct matching for an orphan PDF cell by mapping its bounding box on the geometry of the grid. This mapping decides if the content of the orphan cell will be appended to an already matched table cell, or a new table cell should be created to match with the orphan.", + "text": "9. Pick up the remaining orphan cells. There could be cases, when after applying all the previous post-processing steps, some PDF cells could still remain without any match to predicted cells. However, it is still possible to deduce the correct matching for an orphan PDF cell by mapping its bounding box on the geometry of the grid. This mapping decides if the content of the orphan cell will be appended to an already matched table cell, or a new table cell should be created to match with the orphan.", "type": "paragraph", "payload": null, "name": "List-item", @@ -4322,7 +4322,7 @@ "__ref_s3_data": null } ], - "text": "- 9b. Intersect the orphan's bounding box with the row bands, and map the cell to the closest grid row.", + "text": "9b. Intersect the orphan's bounding box with the row bands, and map the cell to the closest grid row.", "type": "paragraph", "payload": null, "name": "List-item", @@ -4345,7 +4345,7 @@ "__ref_s3_data": null } ], - "text": "- 9c. 
Compute the left and right boundary of the vertical band for each grid column (min/max x coordinates per column).", + "text": "9c. Compute the left and right boundary of the vertical band for each grid column (min/max x coordinates per column).", "type": "paragraph", "payload": null, "name": "List-item", @@ -4368,7 +4368,7 @@ "__ref_s3_data": null } ], - "text": "- 9d. Intersect the orphan's bounding box with the column bands, and map the cell to the closest grid column.", + "text": "9d. Intersect the orphan's bounding box with the column bands, and map the cell to the closest grid column.", "type": "paragraph", "payload": null, "name": "List-item", @@ -4391,7 +4391,7 @@ "__ref_s3_data": null } ], - "text": "- 9e. If the table cell under the identified row and column is not empty, extend its content with the content of the or-", + "text": "9e. If the table cell under the identified row and column is not empty, extend its content with the content of the or-", "type": "paragraph", "payload": null, "name": "List-item", diff --git a/tests/data/groundtruth/docling_v1/2203.01017v2.md b/tests/data/groundtruth/docling_v1/2203.01017v2.md index 3c7a8da..2eb1843 100644 --- a/tests/data/groundtruth/docling_v1/2203.01017v2.md +++ b/tests/data/groundtruth/docling_v1/2203.01017v2.md @@ -16,9 +16,9 @@ The occurrence of tables in documents is ubiquitous. They often summarise quanti -- b. Red-annotation of bounding boxes, Blue-predictions by TableFormer +b. Red-annotation of bounding boxes, Blue-predictions by TableFormer -- c. Structure predicted by TableFormer: +c. Structure predicted by TableFormer: @@ -44,13 +44,13 @@ In this paper, we want to address these weaknesses and present a robust table-st To meet the design criteria listed above, we developed a new model called TableFormer and a synthetically generated table structure dataset called SynthTabNet $^{1}$. In particular, our contributions in this work can be summarised as follows: -- · We propose TableFormer , a transformer based model that predicts tables structure and bounding boxes for the table content simultaneously in an end-to-end approach. +· We propose TableFormer , a transformer based model that predicts tables structure and bounding boxes for the table content simultaneously in an end-to-end approach. -- · Across all benchmark datasets TableFormer significantly outperforms existing state-of-the-art metrics, while being much more efficient in training and inference to existing works. +· Across all benchmark datasets TableFormer significantly outperforms existing state-of-the-art metrics, while being much more efficient in training and inference to existing works. -- · We present SynthTabNet a synthetically generated dataset, with various appearance styles and complexity. +· We present SynthTabNet a synthetically generated dataset, with various appearance styles and complexity. -- · An augmented dataset based on PubTabNet [37], FinTabNet [36], and TableBank [17] with generated ground-truth for reproducibility. +· An augmented dataset based on PubTabNet [37], FinTabNet [36], and TableBank [17] with generated ground-truth for reproducibility. The paper is structured as follows. In Sec. 2, we give a brief overview of the current state-of-the-art. In Sec. 3, we describe the datasets on which we train. In Sec. 4, we introduce the TableFormer model-architecture and describe @@ -216,9 +216,9 @@ Table 4: Results of structure with content retrieved using cell detection on Pub | EDD | 91.2 | 85.4 | 88.3 | | TableFormer | 95.4 | 90.1 | 93.6 | -- a. 
+a. -- Red - PDF cells, Green - predicted bounding boxes, Blue - post-processed predictions matched to PDF cells +Red - PDF cells, Green - predicted bounding boxes, Blue - post-processed predictions matched to PDF cells ## Japanese language (previously unseen by TableFormer): @@ -270,87 +270,87 @@ In this paper, we presented TableFormer an end-to-end transformer based approach ## References -- [1] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to- +[1] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to- -- end object detection with transformers. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision - ECCV 2020 , pages 213-229, Cham, 2020. Springer International Publishing. 5 +end object detection with transformers. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision - ECCV 2020 , pages 213-229, Cham, 2020. Springer International Publishing. 5 -- [2] Zewen Chi, Heyan Huang, Heng-Da Xu, Houjin Yu, Wanxuan Yin, and Xian-Ling Mao. Complicated table structure recognition. arXiv preprint arXiv:1908.04729 , 2019. 3 +[2] Zewen Chi, Heyan Huang, Heng-Da Xu, Houjin Yu, Wanxuan Yin, and Xian-Ling Mao. Complicated table structure recognition. arXiv preprint arXiv:1908.04729 , 2019. 3 -- [3] Bertrand Couasnon and Aurelie Lemaitre. Recognition of Tables and Forms , pages 647-677. Springer London, London, 2014. 2 +[3] Bertrand Couasnon and Aurelie Lemaitre. Recognition of Tables and Forms , pages 647-677. Springer London, London, 2014. 2 -- [4] Herv´e D´ejean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), Apr. 2019. http://sac.founderit.com/. 2 +[4] Herv´e D´ejean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), Apr. 2019. http://sac.founderit.com/. 2 -- [5] Basilios Gatos, Dimitrios Danatsas, Ioannis Pratikakis, and Stavros J Perantonis. Automatic table detection in document images. In International Conference on Pattern Recognition and Image Analysis , pages 609-618. Springer, 2005. 2 +[5] Basilios Gatos, Dimitrios Danatsas, Ioannis Pratikakis, and Stavros J Perantonis. Automatic table detection in document images. In International Conference on Pattern Recognition and Image Analysis , pages 609-618. Springer, 2005. 2 -- [6] Max G¨obel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. 2 +[6] Max G¨obel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. 2 -- [7] EA Green and M Krishnamoorthy. Recognition of tables using table grammars. procs. In Symposium on Document Analysis and Recognition (SDAIR'95) , pages 261-277. 2 +[7] EA Green and M Krishnamoorthy. Recognition of tables using table grammars. procs. In Symposium on Document Analysis and Recognition (SDAIR'95) , pages 261-277. 2 -- [8] Khurram Azeem Hashmi, Alain Pagani, Marcus Liwicki, Didier Stricker, and Muhammad Zeshan Afzal. Castabdetectors: Cascade network for table detection in document images with recursive feature pyramid and switchable atrous convolution. 
Journal of Imaging , 7(10), 2021. 1 +[8] Khurram Azeem Hashmi, Alain Pagani, Marcus Liwicki, Didier Stricker, and Muhammad Zeshan Afzal. Castabdetectors: Cascade network for table detection in document images with recursive feature pyramid and switchable atrous convolution. Journal of Imaging , 7(10), 2021. 1 -- [9] Kaiming He, Georgia Gkioxari, Piotr Dollar, and Ross Girshick. Mask r-cnn. In Proceedings of the IEEE International Conference on Computer Vision (ICCV) , Oct 2017. 1 +[9] Kaiming He, Georgia Gkioxari, Piotr Dollar, and Ross Girshick. Mask r-cnn. In Proceedings of the IEEE International Conference on Computer Vision (ICCV) , Oct 2017. 1 -- [10] Yelin He, X. Qi, Jiaquan Ye, Peng Gao, Yihao Chen, Bingcong Li, Xin Tang, and Rong Xiao. Pingan-vcgroup's solution for icdar 2021 competition on scientific table image recognition to latex. ArXiv , abs/2105.01846, 2021. 2 +[10] Yelin He, X. Qi, Jiaquan Ye, Peng Gao, Yihao Chen, Bingcong Li, Xin Tang, and Rong Xiao. Pingan-vcgroup's solution for icdar 2021 competition on scientific table image recognition to latex. ArXiv , abs/2105.01846, 2021. 2 -- [11] Jianying Hu, Ramanujan S Kashi, Daniel P Lopresti, and Gordon Wilfong. Medium-independent table detection. In Document Recognition and Retrieval VII , volume 3967, pages 291-302. International Society for Optics and Photonics, 1999. 2 +[11] Jianying Hu, Ramanujan S Kashi, Daniel P Lopresti, and Gordon Wilfong. Medium-independent table detection. In Document Recognition and Retrieval VII , volume 3967, pages 291-302. International Society for Optics and Photonics, 1999. 2 -- [12] Matthew Hurst. A constraint-based approach to table structure derivation. In Proceedings of the Seventh International Conference on Document Analysis and Recognition - Volume 2 , ICDAR '03, page 911, USA, 2003. IEEE Computer Society. 2 +[12] Matthew Hurst. A constraint-based approach to table structure derivation. In Proceedings of the Seventh International Conference on Document Analysis and Recognition - Volume 2 , ICDAR '03, page 911, USA, 2003. IEEE Computer Society. 2 -- [13] Thotreingam Kasar, Philippine Barlas, Sebastien Adam, Cl´ement Chatelain, and Thierry Paquet. Learning to detect tables in scanned document images using line information. In 2013 12th International Conference on Document Analysis and Recognition , pages 1185-1189. IEEE, 2013. 2 +[13] Thotreingam Kasar, Philippine Barlas, Sebastien Adam, Cl´ement Chatelain, and Thierry Paquet. Learning to detect tables in scanned document images using line information. In 2013 12th International Conference on Document Analysis and Recognition , pages 1185-1189. IEEE, 2013. 2 -- [14] Pratik Kayal, Mrinal Anand, Harsh Desai, and Mayank Singh. Icdar 2021 competition on scientific table image recognition to latex, 2021. 2 +[14] Pratik Kayal, Mrinal Anand, Harsh Desai, and Mayank Singh. Icdar 2021 competition on scientific table image recognition to latex, 2021. 2 -- [15] Harold W Kuhn. The hungarian method for the assignment problem. Naval research logistics quarterly , 2(1-2):83-97, 1955. 6 +[15] Harold W Kuhn. The hungarian method for the assignment problem. Naval research logistics quarterly , 2(1-2):83-97, 1955. 6 -- [16] Girish Kulkarni, Visruth Premraj, Vicente Ordonez, Sagnik Dhar, Siming Li, Yejin Choi, Alexander C. Berg, and Tamara L. Berg. Babytalk: Understanding and generating simple image descriptions. IEEE Transactions on Pattern Analysis and Machine Intelligence , 35(12):2891-2903, 2013. 
4 +[16] Girish Kulkarni, Visruth Premraj, Vicente Ordonez, Sagnik Dhar, Siming Li, Yejin Choi, Alexander C. Berg, and Tamara L. Berg. Babytalk: Understanding and generating simple image descriptions. IEEE Transactions on Pattern Analysis and Machine Intelligence , 35(12):2891-2903, 2013. 4 -- [17] Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou, and Zhoujun Li. Tablebank: A benchmark dataset for table detection and recognition, 2019. 2, 3 +[17] Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou, and Zhoujun Li. Tablebank: A benchmark dataset for table detection and recognition, 2019. 2, 3 -- [18] Yiren Li, Zheng Huang, Junchi Yan, Yi Zhou, Fan Ye, and Xianhui Liu. Gfte: Graph-based financial table extraction. In Alberto Del Bimbo, Rita Cucchiara, Stan Sclaroff, Giovanni Maria Farinella, Tao Mei, Marco Bertini, Hugo Jair Escalante, and Roberto Vezzani, editors, Pattern Recognition. ICPR International Workshops and Challenges , pages 644-658, Cham, 2021. Springer International Publishing. 2, 3 +[18] Yiren Li, Zheng Huang, Junchi Yan, Yi Zhou, Fan Ye, and Xianhui Liu. Gfte: Graph-based financial table extraction. In Alberto Del Bimbo, Rita Cucchiara, Stan Sclaroff, Giovanni Maria Farinella, Tao Mei, Marco Bertini, Hugo Jair Escalante, and Roberto Vezzani, editors, Pattern Recognition. ICPR International Workshops and Challenges , pages 644-658, Cham, 2021. Springer International Publishing. 2, 3 -- [19] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter Staar. Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence , 35(17):15137-15145, May 2021. 1 +[19] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter Staar. Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence , 35(17):15137-15145, May 2021. 1 -- [20] Rujiao Long, Wen Wang, Nan Xue, Feiyu Gao, Zhibo Yang, Yongpan Wang, and Gui-Song Xia. Parsing table structures in the wild. In Proceedings of the IEEE/CVF International Conference on Computer Vision , pages 944-952, 2021. 2 +[20] Rujiao Long, Wen Wang, Nan Xue, Feiyu Gao, Zhibo Yang, Yongpan Wang, and Gui-Song Xia. Parsing table structures in the wild. In Proceedings of the IEEE/CVF International Conference on Computer Vision , pages 944-952, 2021. 2 -- [21] Shubham Singh Paliwal, D Vishwanath, Rohit Rahul, Monika Sharma, and Lovekesh Vig. Tablenet: Deep learning model for end-to-end table detection and tabular data extraction from scanned document images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 128-133. IEEE, 2019. 1 +[21] Shubham Singh Paliwal, D Vishwanath, Rohit Rahul, Monika Sharma, and Lovekesh Vig. Tablenet: Deep learning model for end-to-end table detection and tabular data extraction from scanned document images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 128-133. IEEE, 2019. 1 -- [22] Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. 
Pytorch: An imperative style, high-performance deep learning library. In H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch´e-Buc, E. Fox, and R. Garnett, editors, Advances in Neural Information Processing Systems 32 , pages 8024-8035. Curran Associates, Inc., 2019. 6 +[22] Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. Pytorch: An imperative style, high-performance deep learning library. In H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch´e-Buc, E. Fox, and R. Garnett, editors, Advances in Neural Information Processing Systems 32 , pages 8024-8035. Curran Associates, Inc., 2019. 6 -- [23] Devashish Prasad, Ayan Gadpal, Kshitij Kapadni, Manish Visave, and Kavita Sultanpure. Cascadetabnet: An approach for end to end table detection and structure recognition from image-based documents. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops , pages 572-573, 2020. 1 +[23] Devashish Prasad, Ayan Gadpal, Kshitij Kapadni, Manish Visave, and Kavita Sultanpure. Cascadetabnet: An approach for end to end table detection and structure recognition from image-based documents. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops , pages 572-573, 2020. 1 -- [24] Shah Rukh Qasim, Hassan Mahmood, and Faisal Shafait. Rethinking table recognition using graph neural networks. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 142-147. IEEE, 2019. 3 +[24] Shah Rukh Qasim, Hassan Mahmood, and Faisal Shafait. Rethinking table recognition using graph neural networks. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 142-147. IEEE, 2019. 3 -- [25] Hamid Rezatofighi, Nathan Tsoi, JunYoung Gwak, Amir Sadeghian, Ian Reid, and Silvio Savarese. Generalized intersection over union: A metric and a loss for bounding box regression. In Proceedings of the IEEE/CVF Conference on +[25] Hamid Rezatofighi, Nathan Tsoi, JunYoung Gwak, Amir Sadeghian, Ian Reid, and Silvio Savarese. Generalized intersection over union: A metric and a loss for bounding box regression. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition , pages 658-666, 2019. 6 -- [26] Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 11621167, 2017. 1 +[26] Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 11621167, 2017. 1 -- [27] Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR international conference on document analysis and recognition (ICDAR) , volume 1, pages 1162-1167. IEEE, 2017. 3 +[27] Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. 
Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR international conference on document analysis and recognition (ICDAR) , volume 1, pages 1162-1167. IEEE, 2017. 3 -- [28] Faisal Shafait and Ray Smith. Table detection in heterogeneous documents. In Proceedings of the 9th IAPR International Workshop on Document Analysis Systems , pages 6572, 2010. 2 +[28] Faisal Shafait and Ray Smith. Table detection in heterogeneous documents. In Proceedings of the 9th IAPR International Workshop on Document Analysis Systems , pages 6572, 2010. 2 -- [29] Shoaib Ahmed Siddiqui, Imran Ali Fateh, Syed Tahseen Raza Rizvi, Andreas Dengel, and Sheraz Ahmed. Deeptabstr: Deep learning based table structure recognition. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1403-1409. IEEE, 2019. 3 +[29] Shoaib Ahmed Siddiqui, Imran Ali Fateh, Syed Tahseen Raza Rizvi, Andreas Dengel, and Sheraz Ahmed. Deeptabstr: Deep learning based table structure recognition. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1403-1409. IEEE, 2019. 3 -- [30] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD , KDD '18, pages 774-782, New York, NY, USA, 2018. ACM. 1 +[30] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD , KDD '18, pages 774-782, New York, NY, USA, 2018. ACM. 1 -- [31] Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Ł ukasz Kaiser, and Illia Polosukhin. Attention is all you need. In I. Guyon, U. V. Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett, editors, Advances in Neural Information Processing Systems 30 , pages 5998-6008. Curran Associates, Inc., 2017. 5 +[31] Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, Ł ukasz Kaiser, and Illia Polosukhin. Attention is all you need. In I. Guyon, U. V. Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett, editors, Advances in Neural Information Processing Systems 30 , pages 5998-6008. Curran Associates, Inc., 2017. 5 -- [32] Oriol Vinyals, Alexander Toshev, Samy Bengio, and Dumitru Erhan. Show and tell: A neural image caption generator. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) , June 2015. 2 +[32] Oriol Vinyals, Alexander Toshev, Samy Bengio, and Dumitru Erhan. Show and tell: A neural image caption generator. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) , June 2015. 2 -- [33] Wenyuan Xue, Qingyong Li, and Dacheng Tao. Res2tim: reconstruct syntactic structures from table images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 749-755. IEEE, 2019. 3 +[33] Wenyuan Xue, Qingyong Li, and Dacheng Tao. Res2tim: reconstruct syntactic structures from table images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 749-755. IEEE, 2019. 3 -- [34] Wenyuan Xue, Baosheng Yu, Wen Wang, Dacheng Tao, and Qingyong Li. Tgrnet: A table graph reconstruction network for table structure recognition. arXiv preprint arXiv:2106.10598 , 2021. 3 +[34] Wenyuan Xue, Baosheng Yu, Wen Wang, Dacheng Tao, and Qingyong Li. 
Tgrnet: A table graph reconstruction network for table structure recognition. arXiv preprint arXiv:2106.10598 , 2021. 3 -- [35] Quanzeng You, Hailin Jin, Zhaowen Wang, Chen Fang, and Jiebo Luo. Image captioning with semantic attention. In Proceedings of the IEEE conference on computer vision and pattern recognition , pages 4651-4659, 2016. 4 +[35] Quanzeng You, Hailin Jin, Zhaowen Wang, Chen Fang, and Jiebo Luo. Image captioning with semantic attention. In Proceedings of the IEEE conference on computer vision and pattern recognition , pages 4651-4659, 2016. 4 -- [36] Xinyi Zheng, Doug Burdick, Lucian Popa, Peter Zhong, and Nancy Xin Ru Wang. Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. Winter Conference for Applications in Computer Vision (WACV) , 2021. 2, 3 +[36] Xinyi Zheng, Doug Burdick, Lucian Popa, Peter Zhong, and Nancy Xin Ru Wang. Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. Winter Conference for Applications in Computer Vision (WACV) , 2021. 2, 3 -- [37] Xu Zhong, Elaheh ShafieiBavani, and Antonio Jimeno Yepes. Image-based table recognition: Data, model, +[37] Xu Zhong, Elaheh ShafieiBavani, and Antonio Jimeno Yepes. Image-based table recognition: Data, model, -- and evaluation. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision ECCV 2020 , pages 564-580, Cham, 2020. Springer International Publishing. 2, 3, 7 +and evaluation. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision ECCV 2020 , pages 564-580, Cham, 2020. Springer International Publishing. 2, 3, 7 -- [38] Xu Zhong, Jianbin Tang, and Antonio Jimeno Yepes. Publaynet: Largest dataset ever for document layout analysis. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1015-1022, 2019. 1 +[38] Xu Zhong, Jianbin Tang, and Antonio Jimeno Yepes. Publaynet: Largest dataset ever for document layout analysis. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1015-1022, 2019. 1 ## TableFormer: Table Structure Understanding with Transformers Supplementary Material @@ -370,15 +370,15 @@ Aiming to train and evaluate our models in a broader spectrum of table data we h The process of generating a synthetic dataset can be decomposed into the following steps: -- 1. Prepare styling and content templates: The styling templates have been manually designed and organized into groups of scope specific appearances (e.g. financial data, marketing data, etc.) Additionally, we have prepared curated collections of content templates by extracting the most frequently used terms out of non-synthetic datasets (e.g. PubTabNet, FinTabNet, etc.). +1. Prepare styling and content templates: The styling templates have been manually designed and organized into groups of scope specific appearances (e.g. financial data, marketing data, etc.) Additionally, we have prepared curated collections of content templates by extracting the most frequently used terms out of non-synthetic datasets (e.g. PubTabNet, FinTabNet, etc.). -- 2. Generate table structures: The structure of each synthetic dataset assumes a horizontal table header which potentially spans over multiple rows and a table body that may contain a combination of row spans and column spans. However, spans are not allowed to cross the header - body boundary. 
The table structure is described by the parameters: Total number of table rows and columns, number of header rows, type of spans (header only spans, row only spans, column only spans, both row and column spans), maximum span size and the ratio of the table area covered by spans. +2. Generate table structures: The structure of each synthetic dataset assumes a horizontal table header which potentially spans over multiple rows and a table body that may contain a combination of row spans and column spans. However, spans are not allowed to cross the header - body boundary. The table structure is described by the parameters: Total number of table rows and columns, number of header rows, type of spans (header only spans, row only spans, column only spans, both row and column spans), maximum span size and the ratio of the table area covered by spans. -- 3. Generate content: Based on the dataset theme , a set of suitable content templates is chosen first. Then, this content can be combined with purely random text to produce the synthetic content. +3. Generate content: Based on the dataset theme , a set of suitable content templates is chosen first. Then, this content can be combined with purely random text to produce the synthetic content. -- 4. Apply styling templates: Depending on the domain of the synthetic dataset, a set of styling templates is first manually selected. Then, a style is randomly selected to format the appearance of the synthesized table. +4. Apply styling templates: Depending on the domain of the synthetic dataset, a set of styling templates is first manually selected. Then, a style is randomly selected to format the appearance of the synthesized table. -- 5. Render the complete tables: The synthetic table is finally rendered by a web browser engine to generate the bounding boxes for each table cell. A batching technique is utilized to optimize the runtime overhead of the rendering process. +5. Render the complete tables: The synthetic table is finally rendered by a web browser engine to generate the bounding boxes for each table cell. A batching technique is utilized to optimize the runtime overhead of the rendering process. ## 2. Prediction post-processing for PDF documents @@ -387,9 +387,9 @@ Although TableFormer can predict the table structure and the bounding boxes for Figure 7: Distribution of the tables across different dimensions per dataset. Simple vs complex tables per dataset and split, strict vs non strict html structures per dataset and table complexity, missing bboxes per dataset and table complexity. -- · TableFormer output does not include the table cell content. +· TableFormer output does not include the table cell content. -- · There are occasional inaccuracies in the predictions of the bounding boxes. +· There are occasional inaccuracies in the predictions of the bounding boxes. dian cell size for all table cells. The usage of median during the computations, helps to eliminate outliers caused by occasional column spans which are usually wider than the normal. @@ -397,37 +397,37 @@ However, it is possible to mitigate those limitations by combining the TableForm Here is a step-by-step description of the prediction postprocessing: -- 1. Get the minimal grid dimensions - number of rows and columns for the predicted table structure. This represents the most granular grid for the underlying table structure. +1. Get the minimal grid dimensions - number of rows and columns for the predicted table structure. 
This represents the most granular grid for the underlying table structure. -- 2. Generate pair-wise matches between the bounding boxes of the PDF cells and the predicted cells. The Intersection Over Union (IOU) metric is used to evaluate the quality of the matches. +2. Generate pair-wise matches between the bounding boxes of the PDF cells and the predicted cells. The Intersection Over Union (IOU) metric is used to evaluate the quality of the matches. -- 3. Use a carefully selected IOU threshold to designate the matches as "good" ones and "bad" ones. +3. Use a carefully selected IOU threshold to designate the matches as "good" ones and "bad" ones. -- 3.a. If all IOU scores in a column are below the threshold, discard all predictions (structure and bounding boxes) for that column. +3.a. If all IOU scores in a column are below the threshold, discard all predictions (structure and bounding boxes) for that column. -- 4. Find the best-fitting content alignment for the predicted cells with good IOU per each column. The alignment of the column can be identified by the following formula: +4. Find the best-fitting content alignment for the predicted cells with good IOU per each column. The alignment of the column can be identified by the following formula: where c is one of { left, centroid, right } and x$_{c}$ is the xcoordinate for the corresponding point. -- 5. Use the alignment computed in step 4, to compute the median x -coordinate for all table columns and the me- +5. Use the alignment computed in step 4, to compute the median x -coordinate for all table columns and the me- -- 6. Snap all cells with bad IOU to their corresponding median x -coordinates and cell sizes. +6. Snap all cells with bad IOU to their corresponding median x -coordinates and cell sizes. -- 7. Generate a new set of pair-wise matches between the corrected bounding boxes and PDF cells. This time use a modified version of the IOU metric, where the area of the intersection between the predicted and PDF cells is divided by the PDF cell area. In case there are multiple matches for the same PDF cell, the prediction with the higher score is preferred. This covers the cases where the PDF cells are smaller than the area of predicted or corrected prediction cells. +7. Generate a new set of pair-wise matches between the corrected bounding boxes and PDF cells. This time use a modified version of the IOU metric, where the area of the intersection between the predicted and PDF cells is divided by the PDF cell area. In case there are multiple matches for the same PDF cell, the prediction with the higher score is preferred. This covers the cases where the PDF cells are smaller than the area of predicted or corrected prediction cells. -- 8. In some rare occasions, we have noticed that TableFormer can confuse a single column as two. When the postprocessing steps are applied, this results with two predicted columns pointing to the same PDF column. In such case we must de-duplicate the columns according to highest total column intersection score. +8. In some rare occasions, we have noticed that TableFormer can confuse a single column as two. When the postprocessing steps are applied, this results with two predicted columns pointing to the same PDF column. In such case we must de-duplicate the columns according to highest total column intersection score. -- 9. Pick up the remaining orphan cells. 
There could be cases, when after applying all the previous post-processing steps, some PDF cells could still remain without any match to predicted cells. However, it is still possible to deduce the correct matching for an orphan PDF cell by mapping its bounding box on the geometry of the grid. This mapping decides if the content of the orphan cell will be appended to an already matched table cell, or a new table cell should be created to match with the orphan. +9. Pick up the remaining orphan cells. There could be cases, when after applying all the previous post-processing steps, some PDF cells could still remain without any match to predicted cells. However, it is still possible to deduce the correct matching for an orphan PDF cell by mapping its bounding box on the geometry of the grid. This mapping decides if the content of the orphan cell will be appended to an already matched table cell, or a new table cell should be created to match with the orphan. 9a. Compute the top and bottom boundary of the horizontal band for each grid row (min/max y coordinates per row). -- 9b. Intersect the orphan's bounding box with the row bands, and map the cell to the closest grid row. +9b. Intersect the orphan's bounding box with the row bands, and map the cell to the closest grid row. -- 9c. Compute the left and right boundary of the vertical band for each grid column (min/max x coordinates per column). +9c. Compute the left and right boundary of the vertical band for each grid column (min/max x coordinates per column). -- 9d. Intersect the orphan's bounding box with the column bands, and map the cell to the closest grid column. +9d. Intersect the orphan's bounding box with the column bands, and map the cell to the closest grid column. -- 9e. If the table cell under the identified row and column is not empty, extend its content with the content of the or- +9e. If the table cell under the identified row and column is not empty, extend its content with the content of the or- phan cell. diff --git a/tests/data/groundtruth/docling_v1/2206.01062.doctags.txt b/tests/data/groundtruth/docling_v1/2206.01062.doctags.txt index 0028eca..cbd93fb 100644 --- a/tests/data/groundtruth/docling_v1/2206.01062.doctags.txt +++ b/tests/data/groundtruth/docling_v1/2206.01062.doctags.txt @@ -27,12 +27,12 @@ Despite the substantial improvements achieved with machine-learning (ML) approaches and deep neural networks in recent years, document conversion remains a challenging problem, as demonstrated by the numerous public competitions held on this topic [1-4]. The challenge originates from the huge variability in PDF documents regarding layout, language and formats (scanned, programmatic or a combination of both). Engineering a single ML model that can be applied on all types of documents and provides high-quality layout segmentation remains to this day extremely challenging [5]. To highlight the variability in document layouts, we show a few example documents from the DocLayNet dataset in Figure 1. A key problem in the process of document conversion is to understand the structure of a single document page, i.e. which segments of text should be grouped together in a unit. To train models for this task, there are currently two large datasets available to the community, PubLayNet [6] and DocBank [7]. They were introduced in 2019 and 2020 respectively and significantly accelerated the implementation of layout detection and segmentation models due to their sizes of 300K and 500K ground-truth pages. 
These sizes were achieved by leveraging an automation approach. The benefit of automated ground-truth generation is obvious: one can generate large ground-truth datasets at virtually no cost. However, the automation introduces a constraint on the variability in the dataset, because corresponding structured source data must be available. PubLayNet and DocBank were both generated from scientific document repositories (PubMed and arXiv), which provide XML or L A T E X sources. Those scientific documents present a limited variability in their layouts, because they are typeset in uniform templates provided by the publishers. Obviously, documents such as technical manuals, annual company reports, legal text, government tenders, etc. have very different and partially unique layouts. As a consequence, the layout predictions obtained from models trained on PubLayNet or DocBank is very reasonable when applied on scientific documents. However, for more artistic or free-style layouts, we see sub-par prediction quality from these models, which we demonstrate in Section 5. In this paper, we present the DocLayNet dataset. It provides pageby-page layout annotation ground-truth using bounding-boxes for 11 distinct class labels on 80863 unique document pages, of which a fraction carry double- or triple-annotations. DocLayNet is similar in spirit to PubLayNet and DocBank and will likewise be made available to the public 1 in order to stimulate the document-layout analysis community. It distinguishes itself in the following aspects: -- (1) Human Annotation : In contrast to PubLayNet and DocBank, we relied on human annotation instead of automation approaches to generate the data set. -- (2) Large Layout Variability : We include diverse and complex layouts from a large variety of public sources. -- (3) Detailed Label Set : We define 11 class labels to distinguish layout features in high detail. PubLayNet provides 5 labels; DocBank provides 13, although not a superset of ours. -- (4) Redundant Annotations : A fraction of the pages in the DocLayNet data set carry more than one human annotation. +(1) Human Annotation : In contrast to PubLayNet and DocBank, we relied on human annotation instead of automation approaches to generate the data set. +(2) Large Layout Variability : We include diverse and complex layouts from a large variety of public sources. +(3) Detailed Label Set : We define 11 class labels to distinguish layout features in high detail. PubLayNet provides 5 labels; DocBank provides 13, although not a superset of ours. +(4) Redundant Annotations : A fraction of the pages in the DocLayNet data set carry more than one human annotation. This enables experimentation with annotation uncertainty and quality control analysis. -- (5) Pre-defined Train-, Test- & Validation-set : Like DocBank, we provide fixed train-, test- & validation-sets to ensure proportional representation of the class-labels. Further, we prevent leakage of unique layouts across sets, which has a large effect on model accuracy scores. +(5) Pre-defined Train-, Test- & Validation-set : Like DocBank, we provide fixed train-, test- & validation-sets to ensure proportional representation of the class-labels. Further, we prevent leakage of unique layouts across sets, which has a large effect on model accuracy scores. All aspects outlined above are detailed in Section 3. In Section 4, we will elaborate on how we designed and executed this large-scale human annotation campaign. 
We will also share key insights and lessons learned that might prove helpful for other parties planning to set up annotation campaigns. In Section 5, we will present baseline accuracy numbers for a variety of object detection methods (Faster R-CNN, Mask R-CNN and YOLOv5) trained on DocLayNet. We further show how the model performance is impacted by varying the DocLayNet dataset size, reducing the label set and modifying the train/test-split. Last but not least, we compare the performance of models trained on PubLayNet, DocBank and DocLayNet and demonstrate that a model trained on DocLayNet provides overall more robust layout recovery. 2 RELATED WORK @@ -86,12 +86,12 @@ the textual content of an element, which goes beyond visual layout recognition, in particular outside the Scientific Articles category. At first sight, the task of visual document-layout interpretation appears intuitive enough to obtain plausible annotations in most cases. However, during early trial-runs in the core team, we observed many cases in which annotators use different annotation styles, especially for documents with challenging layouts. For example, if a figure is presented with subfigures, one annotator might draw a single figure bounding-box, while another might annotate each subfigure separately. The same applies for lists, where one might annotate all list items in one block or each list item separately. In essence, we observed that challenging layouts would be annotated in different but plausible ways. To illustrate this, we show in Figure 4 multiple examples of plausible but inconsistent annotations on the same pages. Obviously, this inconsistency in annotations is not desirable for datasets which are intended to be used for model training. To minimise these inconsistencies, we created a detailed annotation guideline. While perfect consistency across 40 annotation staff members is clearly not possible to achieve, we saw a huge improvement in annotation consistency after the introduction of our annotation guideline. A few selected, non-trivial highlights of the guideline are: -- (1) Every list-item is an individual object instance with class label List-item . This definition is different from PubLayNet and DocBank, where all list-items are grouped together into one List object. -- (2) A List-item is a paragraph with hanging indentation. Singleline elements can qualify as List-item if the neighbour elements expose hanging indentation. Bullet or enumeration symbols are not a requirement. -- (3) For every Caption , there must be exactly one corresponding Picture or Table . -- (4) Connected sub-pictures are grouped together in one Picture object. -- (5) Formula numbers are included in a Formula object. -- (6) Emphasised text (e.g. in italic or bold) at the beginning of a paragraph is not considered a Section-header , unless it appears exclusively on its own line. +(1) Every list-item is an individual object instance with class label List-item . This definition is different from PubLayNet and DocBank, where all list-items are grouped together into one List object. +(2) A List-item is a paragraph with hanging indentation. Singleline elements can qualify as List-item if the neighbour elements expose hanging indentation. Bullet or enumeration symbols are not a requirement. +(3) For every Caption , there must be exactly one corresponding Picture or Table . +(4) Connected sub-pictures are grouped together in one Picture object. +(5) Formula numbers are included in a Formula object. +(6) Emphasised text (e.g. 
in italic or bold) at the beginning of a paragraph is not considered a Section-header , unless it appears exclusively on its own line. The complete annotation guideline is over 100 pages long and a detailed description is obviously out of scope for this paper. Nevertheless, it will be made publicly available alongside with DocLayNet for future reference. Phase 3: Training. After a first trial with a small group of people, we realised that providing the annotation guideline and a set of random practice pages did not yield the desired quality level for layout annotation. Therefore we prepared a subset of pages with two different complexity levels, each with a practice and an exam part. 974 pages were reference-annotated by one proficient core team member. Annotation staff were then given the task to annotate the same subsets (blinded from the reference). By comparing the annotations of each staff member with the reference annotations, we could quantify how closely their annotations matched the reference. Only after passing two exam levels with high annotation quality, staff were admitted into the production phase. Practice iterations
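Note: every ground-truth hunk in this file makes the same mechanical change — the legacy "- " marker that the old serializer embedded into each list-item's text is dropped, leaving only the content (e.g. `- (1) Human Annotation : ...` becomes `(1) Human Annotation : ...`). If fixtures like these ever need to be refreshed by hand rather than regenerated, the text-level delta is a one-liner. The sketch below is illustrative only: `strip_legacy_marker` is a hypothetical helper name, not part of docling.

```python
import re

# Strips the legacy "- " marker that the old serializer prepended to
# list-item text. This only mirrors the textual change visible in the
# hunks above; the real fix lives in the backends, not in the fixtures.
_LEGACY_MARKER = re.compile(r"^- (?=\S)")

def strip_legacy_marker(text: str) -> str:
    """Drop a leading '- ' marker from a serialized list-item."""
    return _LEGACY_MARKER.sub("", text)

assert strip_legacy_marker("- (1) Human Annotation : ...") == "(1) Human Annotation : ..."
assert strip_legacy_marker("(2) already clean") == "(2) already clean"
```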
@@ -203,19 +203,19 @@ From the dataset, we have derived on the one hand reference metrics for human performance on document-layout annotation (through double and triple annotations) and on the other hand evaluated the baseline performance of commonly used object detection methods. We also illustrated the impact of various dataset-related aspects on model performance through data-ablation experiments, both from a size and class-label perspective. Last but not least, we compared the accuracy of models trained on other public datasets and showed that DocLayNet trained models are more robust. To date, there is still a significant gap between human and ML accuracy on the layout interpretation task, and we hope that this work will inspire the research community to close that gap. REFERENCES -- [1] Max Göbel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. -- [2] Christian Clausner, Apostolos Antonacopoulos, and Stefan Pletschacher. Icdar2017 competition on recognition of documents with complex layouts rdcl2017. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 1404-1410, 2017. -- [3] Hervé Déjean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), April 2019. http://sac.founderit.com/. -- [4] Antonio Jimeno Yepes, Peter Zhong, and Douglas Burdick. Competition on scientific literature parsing. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 605-617. LNCS 12824, SpringerVerlag, sep 2021. -- [5] Logan Markewich, Hao Zhang, Yubin Xing, Navid Lambert-Shirzad, Jiang Zhexin, Roy Lee, Zhi Li, and Seok-Bum Ko. Segmentation for document layout analysis: not dead yet. International Journal on Document Analysis and Recognition (IJDAR) , pages 1-11, 01 2022. -- [6] Xu Zhong, Jianbin Tang, and Antonio Jimeno-Yepes. Publaynet: Largest dataset ever for document layout analysis. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 1015-1022, sep 2019. -- [7] Minghao Li, Yiheng Xu, Lei Cui, Shaohan Huang, Furu Wei, Zhoujun Li, and Ming Zhou. Docbank: A benchmark dataset for document layout analysis. In Proceedings of the 28th International Conference on Computational Linguistics , COLING, pages 949-960. International Committee on Computational Linguistics, dec 2020. -- [8] Riaz Ahmad, Muhammad Tanvir Afzal, and M. Qadir. Information extraction from pdf sources based on rule-based system using integrated formats. In SemWebEval@ESWC , 2016. -- [9] Ross B. Girshick, Jeff Donahue, Trevor Darrell, and Jitendra Malik. Rich feature hierarchies for accurate object detection and semantic segmentation. In IEEE Conference on Computer Vision and Pattern Recognition , CVPR, pages 580-587. IEEE Computer Society, jun 2014. -- [10] Ross B. Girshick. Fast R-CNN. In 2015 IEEE International Conference on Computer Vision , ICCV, pages 1440-1448. IEEE Computer Society, dec 2015. -- [11] Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. Faster r-cnn: Towards real-time object detection with region proposal networks. IEEE Transactions on Pattern Analysis and Machine Intelligence , 39(6):1137-1149, 2017. -- [12] Kaiming He, Georgia Gkioxari, Piotr Dollár, and Ross B. Girshick. Mask R-CNN. 
In IEEE International Conference on Computer Vision , ICCV, pages 2980-2988. IEEE Computer Society, Oct 2017. -- [13] Glenn Jocher, Alex Stoken, Ayush Chaurasia, Jirka Borovec, NanoCode012, TaoXie, Yonghye Kwon, Kalen Michael, Liu Changyu, Jiacong Fang, Abhiram V, Laughing, tkianai, yxNONG, Piotr Skalski, Adam Hogan, Jebastin Nadar, imyhxy, Lorenzo Mammana, Alex Wang, Cristi Fati, Diego Montes, Jan Hajek, Laurentiu +[1] Max Göbel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. +[2] Christian Clausner, Apostolos Antonacopoulos, and Stefan Pletschacher. Icdar2017 competition on recognition of documents with complex layouts rdcl2017. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 1404-1410, 2017. +[3] Hervé Déjean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), April 2019. http://sac.founderit.com/. +[4] Antonio Jimeno Yepes, Peter Zhong, and Douglas Burdick. Competition on scientific literature parsing. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 605-617. LNCS 12824, SpringerVerlag, sep 2021. +[5] Logan Markewich, Hao Zhang, Yubin Xing, Navid Lambert-Shirzad, Jiang Zhexin, Roy Lee, Zhi Li, and Seok-Bum Ko. Segmentation for document layout analysis: not dead yet. International Journal on Document Analysis and Recognition (IJDAR) , pages 1-11, 01 2022. +[6] Xu Zhong, Jianbin Tang, and Antonio Jimeno-Yepes. Publaynet: Largest dataset ever for document layout analysis. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 1015-1022, sep 2019. +[7] Minghao Li, Yiheng Xu, Lei Cui, Shaohan Huang, Furu Wei, Zhoujun Li, and Ming Zhou. Docbank: A benchmark dataset for document layout analysis. In Proceedings of the 28th International Conference on Computational Linguistics , COLING, pages 949-960. International Committee on Computational Linguistics, dec 2020. +[8] Riaz Ahmad, Muhammad Tanvir Afzal, and M. Qadir. Information extraction from pdf sources based on rule-based system using integrated formats. In SemWebEval@ESWC , 2016. +[9] Ross B. Girshick, Jeff Donahue, Trevor Darrell, and Jitendra Malik. Rich feature hierarchies for accurate object detection and semantic segmentation. In IEEE Conference on Computer Vision and Pattern Recognition , CVPR, pages 580-587. IEEE Computer Society, jun 2014. +[10] Ross B. Girshick. Fast R-CNN. In 2015 IEEE International Conference on Computer Vision , ICCV, pages 1440-1448. IEEE Computer Society, dec 2015. +[11] Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. Faster r-cnn: Towards real-time object detection with region proposal networks. IEEE Transactions on Pattern Analysis and Machine Intelligence , 39(6):1137-1149, 2017. +[12] Kaiming He, Georgia Gkioxari, Piotr Dollár, and Ross B. Girshick. Mask R-CNN. In IEEE International Conference on Computer Vision , ICCV, pages 2980-2988. IEEE Computer Society, Oct 2017. +[13] Glenn Jocher, Alex Stoken, Ayush Chaurasia, Jirka Borovec, NanoCode012, TaoXie, Yonghye Kwon, Kalen Michael, Liu Changyu, Jiacong Fang, Abhiram V, Laughing, tkianai, yxNONG, Piotr Skalski, Adam Hogan, Jebastin Nadar, imyhxy, Lorenzo Mammana, Alex Wang, Cristi Fati, Diego Montes, Jan Hajek, Laurentiu
Text Caption List-Item Formula Table Section-Header Picture Page-Header Page-Footer Title @@ -223,14 +223,14 @@ Text Caption List-Item Formula Table Section-Header Picture Page-Header Page-Footer Title Figure 6: Example layout predictions on selected pages from the DocLayNet test-set. (A, D) exhibit favourable results on coloured backgrounds. (B, C) show accurate list-item and paragraph differentiation despite densely-spaced lines. (E) demonstrates good table and figure distinction. (F) shows predictions on a Chinese patent with multiple overlaps, label confusion and missing boxes. Diaconu, Mai Thanh Minh, Marc, albinxavi, fatih, oleg, and wanghao yang. ultralytics/yolov5: v6.0 - yolov5n nano models, roboflow integration, tensorflow export, opencv dnn support, October 2021. -- [20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021. -- [14] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-end object detection with transformers. CoRR , abs/2005.12872, 2020. -- [15] Mingxing Tan, Ruoming Pang, and Quoc V. Le. Efficientdet: Scalable and efficient object detection. CoRR , abs/1911.09070, 2019. -- [16] Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev, Ross B. Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Dollár, and C. Lawrence Zitnick. Microsoft COCO: common objects in context, 2014. -- [17] Yuxin Wu, Alexander Kirillov, Francisco Massa, Wan-Yen Lo, and Ross Girshick. Detectron2, 2019. -- [18] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter W. J. Staar. Robust pdf document conversion using recurrent neural networks. In Proceedings of the 35th Conference on Artificial Intelligence , AAAI, pages 1513715145, feb 2021. -- [19] Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and Ming Zhou. Layoutlm: Pre-training of text and layout for document image understanding. In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 1192-1200, New York, USA, 2020. Association for Computing Machinery. -- [21] Peng Zhang, Can Li, Liang Qiao, Zhanzhan Cheng, Shiliang Pu, Yi Niu, and Fei Wu. Vsr: A unified framework for document layout analysis combining vision, semantics and relations, 2021. -- [22] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 774-782. ACM, 2018. -- [23] Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation for deep learning. Journal of Big Data , 6(1):60, 2019. +[20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021. +[14] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-end object detection with transformers. CoRR , abs/2005.12872, 2020. +[15] Mingxing Tan, Ruoming Pang, and Quoc V. Le. Efficientdet: Scalable and efficient object detection. CoRR , abs/1911.09070, 2019. +[16] Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev, Ross B. Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Dollár, and C. 
Lawrence Zitnick. Microsoft COCO: common objects in context, 2014. +[17] Yuxin Wu, Alexander Kirillov, Francisco Massa, Wan-Yen Lo, and Ross Girshick. Detectron2, 2019. +[18] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter W. J. Staar. Robust pdf document conversion using recurrent neural networks. In Proceedings of the 35th Conference on Artificial Intelligence , AAAI, pages 1513715145, feb 2021. +[19] Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and Ming Zhou. Layoutlm: Pre-training of text and layout for document image understanding. In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 1192-1200, New York, USA, 2020. Association for Computing Machinery. +[21] Peng Zhang, Can Li, Liang Qiao, Zhanzhan Cheng, Shiliang Pu, Yi Niu, and Fei Wu. Vsr: A unified framework for document layout analysis combining vision, semantics and relations, 2021. +[22] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 774-782. ACM, 2018. +[23] Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation for deep learning. Journal of Big Data , 6(1):60, 2019. \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v1/2206.01062.json b/tests/data/groundtruth/docling_v1/2206.01062.json index 9a07af4..789e01c 100644 --- a/tests/data/groundtruth/docling_v1/2206.01062.json +++ b/tests/data/groundtruth/docling_v1/2206.01062.json @@ -654,7 +654,7 @@ "__ref_s3_data": null } ], - "text": "- (1) Human Annotation : In contrast to PubLayNet and DocBank, we relied on human annotation instead of automation approaches to generate the data set.", + "text": "(1) Human Annotation : In contrast to PubLayNet and DocBank, we relied on human annotation instead of automation approaches to generate the data set.", "type": "paragraph", "payload": null, "name": "List-item", @@ -677,7 +677,7 @@ "__ref_s3_data": null } ], - "text": "- (2) Large Layout Variability : We include diverse and complex layouts from a large variety of public sources.", + "text": "(2) Large Layout Variability : We include diverse and complex layouts from a large variety of public sources.", "type": "paragraph", "payload": null, "name": "List-item", @@ -700,7 +700,7 @@ "__ref_s3_data": null } ], - "text": "- (3) Detailed Label Set : We define 11 class labels to distinguish layout features in high detail. PubLayNet provides 5 labels; DocBank provides 13, although not a superset of ours.", + "text": "(3) Detailed Label Set : We define 11 class labels to distinguish layout features in high detail. 
PubLayNet provides 5 labels; DocBank provides 13, although not a superset of ours.", "type": "paragraph", "payload": null, "name": "List-item", @@ -723,7 +723,7 @@ "__ref_s3_data": null } ], - "text": "- (4) Redundant Annotations : A fraction of the pages in the DocLayNet data set carry more than one human annotation.", + "text": "(4) Redundant Annotations : A fraction of the pages in the DocLayNet data set carry more than one human annotation.", "type": "paragraph", "payload": null, "name": "List-item", @@ -792,7 +792,7 @@ "__ref_s3_data": null } ], - "text": "- (5) Pre-defined Train-, Test- & Validation-set : Like DocBank, we provide fixed train-, test- & validation-sets to ensure proportional representation of the class-labels. Further, we prevent leakage of unique layouts across sets, which has a large effect on model accuracy scores.", + "text": "(5) Pre-defined Train-, Test- & Validation-set : Like DocBank, we provide fixed train-, test- & validation-sets to ensure proportional representation of the class-labels. Further, we prevent leakage of unique layouts across sets, which has a large effect on model accuracy scores.", "type": "paragraph", "payload": null, "name": "List-item", @@ -1511,7 +1511,7 @@ "__ref_s3_data": null } ], - "text": "- (1) Every list-item is an individual object instance with class label List-item . This definition is different from PubLayNet and DocBank, where all list-items are grouped together into one List object.", + "text": "(1) Every list-item is an individual object instance with class label List-item . This definition is different from PubLayNet and DocBank, where all list-items are grouped together into one List object.", "type": "paragraph", "payload": null, "name": "List-item", @@ -1534,7 +1534,7 @@ "__ref_s3_data": null } ], - "text": "- (2) A List-item is a paragraph with hanging indentation. Singleline elements can qualify as List-item if the neighbour elements expose hanging indentation. Bullet or enumeration symbols are not a requirement.", + "text": "(2) A List-item is a paragraph with hanging indentation. Singleline elements can qualify as List-item if the neighbour elements expose hanging indentation. Bullet or enumeration symbols are not a requirement.", "type": "paragraph", "payload": null, "name": "List-item", @@ -1557,7 +1557,7 @@ "__ref_s3_data": null } ], - "text": "- (3) For every Caption , there must be exactly one corresponding Picture or Table .", + "text": "(3) For every Caption , there must be exactly one corresponding Picture or Table .", "type": "paragraph", "payload": null, "name": "List-item", @@ -1580,7 +1580,7 @@ "__ref_s3_data": null } ], - "text": "- (4) Connected sub-pictures are grouped together in one Picture object.", + "text": "(4) Connected sub-pictures are grouped together in one Picture object.", "type": "paragraph", "payload": null, "name": "List-item", @@ -1603,7 +1603,7 @@ "__ref_s3_data": null } ], - "text": "- (5) Formula numbers are included in a Formula object.", + "text": "(5) Formula numbers are included in a Formula object.", "type": "paragraph", "payload": null, "name": "List-item", @@ -1626,7 +1626,7 @@ "__ref_s3_data": null } ], - "text": "- (6) Emphasised text (e.g. in italic or bold) at the beginning of a paragraph is not considered a Section-header , unless it appears exclusively on its own line.", + "text": "(6) Emphasised text (e.g. 
in italic or bold) at the beginning of a paragraph is not considered a Section-header , unless it appears exclusively on its own line.", "type": "paragraph", "payload": null, "name": "List-item", @@ -2507,7 +2507,7 @@ "__ref_s3_data": null } ], - "text": "- [1] Max G\u00f6bel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013.", + "text": "[1] Max G\u00f6bel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013.", "type": "paragraph", "payload": null, "name": "List-item", @@ -2530,7 +2530,7 @@ "__ref_s3_data": null } ], - "text": "- [2] Christian Clausner, Apostolos Antonacopoulos, and Stefan Pletschacher. Icdar2017 competition on recognition of documents with complex layouts rdcl2017. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 1404-1410, 2017.", + "text": "[2] Christian Clausner, Apostolos Antonacopoulos, and Stefan Pletschacher. Icdar2017 competition on recognition of documents with complex layouts rdcl2017. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 1404-1410, 2017.", "type": "paragraph", "payload": null, "name": "List-item", @@ -2553,7 +2553,7 @@ "__ref_s3_data": null } ], - "text": "- [3] Herv\u00e9 D\u00e9jean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), April 2019. http://sac.founderit.com/.", + "text": "[3] Herv\u00e9 D\u00e9jean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), April 2019. http://sac.founderit.com/.", "type": "paragraph", "payload": null, "name": "List-item", @@ -2576,7 +2576,7 @@ "__ref_s3_data": null } ], - "text": "- [4] Antonio Jimeno Yepes, Peter Zhong, and Douglas Burdick. Competition on scientific literature parsing. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 605-617. LNCS 12824, SpringerVerlag, sep 2021.", + "text": "[4] Antonio Jimeno Yepes, Peter Zhong, and Douglas Burdick. Competition on scientific literature parsing. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 605-617. LNCS 12824, SpringerVerlag, sep 2021.", "type": "paragraph", "payload": null, "name": "List-item", @@ -2599,7 +2599,7 @@ "__ref_s3_data": null } ], - "text": "- [5] Logan Markewich, Hao Zhang, Yubin Xing, Navid Lambert-Shirzad, Jiang Zhexin, Roy Lee, Zhi Li, and Seok-Bum Ko. Segmentation for document layout analysis: not dead yet. International Journal on Document Analysis and Recognition (IJDAR) , pages 1-11, 01 2022.", + "text": "[5] Logan Markewich, Hao Zhang, Yubin Xing, Navid Lambert-Shirzad, Jiang Zhexin, Roy Lee, Zhi Li, and Seok-Bum Ko. Segmentation for document layout analysis: not dead yet. International Journal on Document Analysis and Recognition (IJDAR) , pages 1-11, 01 2022.", "type": "paragraph", "payload": null, "name": "List-item", @@ -2622,7 +2622,7 @@ "__ref_s3_data": null } ], - "text": "- [6] Xu Zhong, Jianbin Tang, and Antonio Jimeno-Yepes. Publaynet: Largest dataset ever for document layout analysis. 
In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 1015-1022, sep 2019.", + "text": "[6] Xu Zhong, Jianbin Tang, and Antonio Jimeno-Yepes. Publaynet: Largest dataset ever for document layout analysis. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 1015-1022, sep 2019.", "type": "paragraph", "payload": null, "name": "List-item", @@ -2645,7 +2645,7 @@ "__ref_s3_data": null } ], - "text": "- [7] Minghao Li, Yiheng Xu, Lei Cui, Shaohan Huang, Furu Wei, Zhoujun Li, and Ming Zhou. Docbank: A benchmark dataset for document layout analysis. In Proceedings of the 28th International Conference on Computational Linguistics , COLING, pages 949-960. International Committee on Computational Linguistics, dec 2020.", + "text": "[7] Minghao Li, Yiheng Xu, Lei Cui, Shaohan Huang, Furu Wei, Zhoujun Li, and Ming Zhou. Docbank: A benchmark dataset for document layout analysis. In Proceedings of the 28th International Conference on Computational Linguistics , COLING, pages 949-960. International Committee on Computational Linguistics, dec 2020.", "type": "paragraph", "payload": null, "name": "List-item", @@ -2668,7 +2668,7 @@ "__ref_s3_data": null } ], - "text": "- [8] Riaz Ahmad, Muhammad Tanvir Afzal, and M. Qadir. Information extraction from pdf sources based on rule-based system using integrated formats. In SemWebEval@ESWC , 2016.", + "text": "[8] Riaz Ahmad, Muhammad Tanvir Afzal, and M. Qadir. Information extraction from pdf sources based on rule-based system using integrated formats. In SemWebEval@ESWC , 2016.", "type": "paragraph", "payload": null, "name": "List-item", @@ -2691,7 +2691,7 @@ "__ref_s3_data": null } ], - "text": "- [9] Ross B. Girshick, Jeff Donahue, Trevor Darrell, and Jitendra Malik. Rich feature hierarchies for accurate object detection and semantic segmentation. In IEEE Conference on Computer Vision and Pattern Recognition , CVPR, pages 580-587. IEEE Computer Society, jun 2014.", + "text": "[9] Ross B. Girshick, Jeff Donahue, Trevor Darrell, and Jitendra Malik. Rich feature hierarchies for accurate object detection and semantic segmentation. In IEEE Conference on Computer Vision and Pattern Recognition , CVPR, pages 580-587. IEEE Computer Society, jun 2014.", "type": "paragraph", "payload": null, "name": "List-item", @@ -2714,7 +2714,7 @@ "__ref_s3_data": null } ], - "text": "- [10] Ross B. Girshick. Fast R-CNN. In 2015 IEEE International Conference on Computer Vision , ICCV, pages 1440-1448. IEEE Computer Society, dec 2015.", + "text": "[10] Ross B. Girshick. Fast R-CNN. In 2015 IEEE International Conference on Computer Vision , ICCV, pages 1440-1448. IEEE Computer Society, dec 2015.", "type": "paragraph", "payload": null, "name": "List-item", @@ -2737,7 +2737,7 @@ "__ref_s3_data": null } ], - "text": "- [11] Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. Faster r-cnn: Towards real-time object detection with region proposal networks. IEEE Transactions on Pattern Analysis and Machine Intelligence , 39(6):1137-1149, 2017.", + "text": "[11] Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. Faster r-cnn: Towards real-time object detection with region proposal networks. IEEE Transactions on Pattern Analysis and Machine Intelligence , 39(6):1137-1149, 2017.", "type": "paragraph", "payload": null, "name": "List-item", @@ -2760,7 +2760,7 @@ "__ref_s3_data": null } ], - "text": "- [12] Kaiming He, Georgia Gkioxari, Piotr Doll\u00e1r, and Ross B. Girshick. Mask R-CNN. 
In IEEE International Conference on Computer Vision , ICCV, pages 2980-2988. IEEE Computer Society, Oct 2017.", + "text": "[12] Kaiming He, Georgia Gkioxari, Piotr Doll\u00e1r, and Ross B. Girshick. Mask R-CNN. In IEEE International Conference on Computer Vision , ICCV, pages 2980-2988. IEEE Computer Society, Oct 2017.", "type": "paragraph", "payload": null, "name": "List-item", @@ -2783,7 +2783,7 @@ "__ref_s3_data": null } ], - "text": "- [13] Glenn Jocher, Alex Stoken, Ayush Chaurasia, Jirka Borovec, NanoCode012, TaoXie, Yonghye Kwon, Kalen Michael, Liu Changyu, Jiacong Fang, Abhiram V, Laughing, tkianai, yxNONG, Piotr Skalski, Adam Hogan, Jebastin Nadar, imyhxy, Lorenzo Mammana, Alex Wang, Cristi Fati, Diego Montes, Jan Hajek, Laurentiu", + "text": "[13] Glenn Jocher, Alex Stoken, Ayush Chaurasia, Jirka Borovec, NanoCode012, TaoXie, Yonghye Kwon, Kalen Michael, Liu Changyu, Jiacong Fang, Abhiram V, Laughing, tkianai, yxNONG, Piotr Skalski, Adam Hogan, Jebastin Nadar, imyhxy, Lorenzo Mammana, Alex Wang, Cristi Fati, Diego Montes, Jan Hajek, Laurentiu", "type": "paragraph", "payload": null, "name": "List-item", @@ -2880,7 +2880,7 @@ "__ref_s3_data": null } ], - "text": "- [20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021.", + "text": "[20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021.", "type": "paragraph", "payload": null, "name": "List-item", @@ -2903,7 +2903,7 @@ "__ref_s3_data": null } ], - "text": "- [14] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-end object detection with transformers. CoRR , abs/2005.12872, 2020.", + "text": "[14] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-end object detection with transformers. CoRR , abs/2005.12872, 2020.", "type": "paragraph", "payload": null, "name": "List-item", @@ -2926,7 +2926,7 @@ "__ref_s3_data": null } ], - "text": "- [15] Mingxing Tan, Ruoming Pang, and Quoc V. Le. Efficientdet: Scalable and efficient object detection. CoRR , abs/1911.09070, 2019.", + "text": "[15] Mingxing Tan, Ruoming Pang, and Quoc V. Le. Efficientdet: Scalable and efficient object detection. CoRR , abs/1911.09070, 2019.", "type": "paragraph", "payload": null, "name": "List-item", @@ -2949,7 +2949,7 @@ "__ref_s3_data": null } ], - "text": "- [16] Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev, Ross B. Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C. Lawrence Zitnick. Microsoft COCO: common objects in context, 2014.", + "text": "[16] Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev, Ross B. Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C. Lawrence Zitnick. Microsoft COCO: common objects in context, 2014.", "type": "paragraph", "payload": null, "name": "List-item", @@ -2972,7 +2972,7 @@ "__ref_s3_data": null } ], - "text": "- [17] Yuxin Wu, Alexander Kirillov, Francisco Massa, Wan-Yen Lo, and Ross Girshick. Detectron2, 2019.", + "text": "[17] Yuxin Wu, Alexander Kirillov, Francisco Massa, Wan-Yen Lo, and Ross Girshick. 
Detectron2, 2019.", "type": "paragraph", "payload": null, "name": "List-item", @@ -2995,7 +2995,7 @@ "__ref_s3_data": null } ], - "text": "- [18] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter W. J. Staar. Robust pdf document conversion using recurrent neural networks. In Proceedings of the 35th Conference on Artificial Intelligence , AAAI, pages 1513715145, feb 2021.", + "text": "[18] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter W. J. Staar. Robust pdf document conversion using recurrent neural networks. In Proceedings of the 35th Conference on Artificial Intelligence , AAAI, pages 1513715145, feb 2021.", "type": "paragraph", "payload": null, "name": "List-item", @@ -3018,7 +3018,7 @@ "__ref_s3_data": null } ], - "text": "- [19] Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and Ming Zhou. Layoutlm: Pre-training of text and layout for document image understanding. In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 1192-1200, New York, USA, 2020. Association for Computing Machinery.", + "text": "[19] Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and Ming Zhou. Layoutlm: Pre-training of text and layout for document image understanding. In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 1192-1200, New York, USA, 2020. Association for Computing Machinery.", "type": "paragraph", "payload": null, "name": "List-item", @@ -3041,7 +3041,7 @@ "__ref_s3_data": null } ], - "text": "- [21] Peng Zhang, Can Li, Liang Qiao, Zhanzhan Cheng, Shiliang Pu, Yi Niu, and Fei Wu. Vsr: A unified framework for document layout analysis combining vision, semantics and relations, 2021.", + "text": "[21] Peng Zhang, Can Li, Liang Qiao, Zhanzhan Cheng, Shiliang Pu, Yi Niu, and Fei Wu. Vsr: A unified framework for document layout analysis combining vision, semantics and relations, 2021.", "type": "paragraph", "payload": null, "name": "List-item", @@ -3064,7 +3064,7 @@ "__ref_s3_data": null } ], - "text": "- [22] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 774-782. ACM, 2018.", + "text": "[22] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 774-782. ACM, 2018.", "type": "paragraph", "payload": null, "name": "List-item", @@ -3087,7 +3087,7 @@ "__ref_s3_data": null } ], - "text": "- [23] Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation for deep learning. Journal of Big Data , 6(1):60, 2019.", + "text": "[23] Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation for deep learning. 
Journal of Big Data , 6(1):60, 2019.", "type": "paragraph", "payload": null, "name": "List-item", diff --git a/tests/data/groundtruth/docling_v1/2206.01062.md b/tests/data/groundtruth/docling_v1/2206.01062.md index f53dd58..ef8fb31 100644 --- a/tests/data/groundtruth/docling_v1/2206.01062.md +++ b/tests/data/groundtruth/docling_v1/2206.01062.md @@ -47,17 +47,17 @@ A key problem in the process of document conversion is to understand the structu In this paper, we present the DocLayNet dataset. It provides pageby-page layout annotation ground-truth using bounding-boxes for 11 distinct class labels on 80863 unique document pages, of which a fraction carry double- or triple-annotations. DocLayNet is similar in spirit to PubLayNet and DocBank and will likewise be made available to the public 1 in order to stimulate the document-layout analysis community. It distinguishes itself in the following aspects: -- (1) Human Annotation : In contrast to PubLayNet and DocBank, we relied on human annotation instead of automation approaches to generate the data set. +(1) Human Annotation : In contrast to PubLayNet and DocBank, we relied on human annotation instead of automation approaches to generate the data set. -- (2) Large Layout Variability : We include diverse and complex layouts from a large variety of public sources. +(2) Large Layout Variability : We include diverse and complex layouts from a large variety of public sources. -- (3) Detailed Label Set : We define 11 class labels to distinguish layout features in high detail. PubLayNet provides 5 labels; DocBank provides 13, although not a superset of ours. +(3) Detailed Label Set : We define 11 class labels to distinguish layout features in high detail. PubLayNet provides 5 labels; DocBank provides 13, although not a superset of ours. -- (4) Redundant Annotations : A fraction of the pages in the DocLayNet data set carry more than one human annotation. +(4) Redundant Annotations : A fraction of the pages in the DocLayNet data set carry more than one human annotation. This enables experimentation with annotation uncertainty and quality control analysis. -- (5) Pre-defined Train-, Test- & Validation-set : Like DocBank, we provide fixed train-, test- & validation-sets to ensure proportional representation of the class-labels. Further, we prevent leakage of unique layouts across sets, which has a large effect on model accuracy scores. +(5) Pre-defined Train-, Test- & Validation-set : Like DocBank, we provide fixed train-, test- & validation-sets to ensure proportional representation of the class-labels. Further, we prevent leakage of unique layouts across sets, which has a large effect on model accuracy scores. All aspects outlined above are detailed in Section 3. In Section 4, we will elaborate on how we designed and executed this large-scale human annotation campaign. We will also share key insights and lessons learned that might prove helpful for other parties planning to set up annotation campaigns. @@ -131,17 +131,17 @@ At first sight, the task of visual document-layout interpretation appears intuit Obviously, this inconsistency in annotations is not desirable for datasets which are intended to be used for model training. To minimise these inconsistencies, we created a detailed annotation guideline. While perfect consistency across 40 annotation staff members is clearly not possible to achieve, we saw a huge improvement in annotation consistency after the introduction of our annotation guideline. 
A few selected, non-trivial highlights of the guideline are: -- (1) Every list-item is an individual object instance with class label List-item . This definition is different from PubLayNet and DocBank, where all list-items are grouped together into one List object. +(1) Every list-item is an individual object instance with class label List-item . This definition is different from PubLayNet and DocBank, where all list-items are grouped together into one List object. -- (2) A List-item is a paragraph with hanging indentation. Singleline elements can qualify as List-item if the neighbour elements expose hanging indentation. Bullet or enumeration symbols are not a requirement. +(2) A List-item is a paragraph with hanging indentation. Singleline elements can qualify as List-item if the neighbour elements expose hanging indentation. Bullet or enumeration symbols are not a requirement. -- (3) For every Caption , there must be exactly one corresponding Picture or Table . +(3) For every Caption , there must be exactly one corresponding Picture or Table . -- (4) Connected sub-pictures are grouped together in one Picture object. +(4) Connected sub-pictures are grouped together in one Picture object. -- (5) Formula numbers are included in a Formula object. +(5) Formula numbers are included in a Formula object. -- (6) Emphasised text (e.g. in italic or bold) at the beginning of a paragraph is not considered a Section-header , unless it appears exclusively on its own line. +(6) Emphasised text (e.g. in italic or bold) at the beginning of a paragraph is not considered a Section-header , unless it appears exclusively on its own line. The complete annotation guideline is over 100 pages long and a detailed description is obviously out of scope for this paper. Nevertheless, it will be made publicly available alongside with DocLayNet for future reference. @@ -282,31 +282,31 @@ To date, there is still a significant gap between human and ML accuracy on the l ## REFERENCES -- [1] Max Göbel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. +[1] Max Göbel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. -- [2] Christian Clausner, Apostolos Antonacopoulos, and Stefan Pletschacher. Icdar2017 competition on recognition of documents with complex layouts rdcl2017. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 1404-1410, 2017. +[2] Christian Clausner, Apostolos Antonacopoulos, and Stefan Pletschacher. Icdar2017 competition on recognition of documents with complex layouts rdcl2017. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 1404-1410, 2017. -- [3] Hervé Déjean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), April 2019. http://sac.founderit.com/. +[3] Hervé Déjean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), April 2019. http://sac.founderit.com/. -- [4] Antonio Jimeno Yepes, Peter Zhong, and Douglas Burdick. Competition on scientific literature parsing. 
In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 605-617. LNCS 12824, SpringerVerlag, sep 2021. +[4] Antonio Jimeno Yepes, Peter Zhong, and Douglas Burdick. Competition on scientific literature parsing. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 605-617. LNCS 12824, SpringerVerlag, sep 2021. -- [5] Logan Markewich, Hao Zhang, Yubin Xing, Navid Lambert-Shirzad, Jiang Zhexin, Roy Lee, Zhi Li, and Seok-Bum Ko. Segmentation for document layout analysis: not dead yet. International Journal on Document Analysis and Recognition (IJDAR) , pages 1-11, 01 2022. +[5] Logan Markewich, Hao Zhang, Yubin Xing, Navid Lambert-Shirzad, Jiang Zhexin, Roy Lee, Zhi Li, and Seok-Bum Ko. Segmentation for document layout analysis: not dead yet. International Journal on Document Analysis and Recognition (IJDAR) , pages 1-11, 01 2022. -- [6] Xu Zhong, Jianbin Tang, and Antonio Jimeno-Yepes. Publaynet: Largest dataset ever for document layout analysis. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 1015-1022, sep 2019. +[6] Xu Zhong, Jianbin Tang, and Antonio Jimeno-Yepes. Publaynet: Largest dataset ever for document layout analysis. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 1015-1022, sep 2019. -- [7] Minghao Li, Yiheng Xu, Lei Cui, Shaohan Huang, Furu Wei, Zhoujun Li, and Ming Zhou. Docbank: A benchmark dataset for document layout analysis. In Proceedings of the 28th International Conference on Computational Linguistics , COLING, pages 949-960. International Committee on Computational Linguistics, dec 2020. +[7] Minghao Li, Yiheng Xu, Lei Cui, Shaohan Huang, Furu Wei, Zhoujun Li, and Ming Zhou. Docbank: A benchmark dataset for document layout analysis. In Proceedings of the 28th International Conference on Computational Linguistics , COLING, pages 949-960. International Committee on Computational Linguistics, dec 2020. -- [8] Riaz Ahmad, Muhammad Tanvir Afzal, and M. Qadir. Information extraction from pdf sources based on rule-based system using integrated formats. In SemWebEval@ESWC , 2016. +[8] Riaz Ahmad, Muhammad Tanvir Afzal, and M. Qadir. Information extraction from pdf sources based on rule-based system using integrated formats. In SemWebEval@ESWC , 2016. -- [9] Ross B. Girshick, Jeff Donahue, Trevor Darrell, and Jitendra Malik. Rich feature hierarchies for accurate object detection and semantic segmentation. In IEEE Conference on Computer Vision and Pattern Recognition , CVPR, pages 580-587. IEEE Computer Society, jun 2014. +[9] Ross B. Girshick, Jeff Donahue, Trevor Darrell, and Jitendra Malik. Rich feature hierarchies for accurate object detection and semantic segmentation. In IEEE Conference on Computer Vision and Pattern Recognition , CVPR, pages 580-587. IEEE Computer Society, jun 2014. -- [10] Ross B. Girshick. Fast R-CNN. In 2015 IEEE International Conference on Computer Vision , ICCV, pages 1440-1448. IEEE Computer Society, dec 2015. +[10] Ross B. Girshick. Fast R-CNN. In 2015 IEEE International Conference on Computer Vision , ICCV, pages 1440-1448. IEEE Computer Society, dec 2015. -- [11] Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. Faster r-cnn: Towards real-time object detection with region proposal networks. IEEE Transactions on Pattern Analysis and Machine Intelligence , 39(6):1137-1149, 2017. +[11] Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. 
Faster r-cnn: Towards real-time object detection with region proposal networks. IEEE Transactions on Pattern Analysis and Machine Intelligence , 39(6):1137-1149, 2017. -- [12] Kaiming He, Georgia Gkioxari, Piotr Dollár, and Ross B. Girshick. Mask R-CNN. In IEEE International Conference on Computer Vision , ICCV, pages 2980-2988. IEEE Computer Society, Oct 2017. +[12] Kaiming He, Georgia Gkioxari, Piotr Dollár, and Ross B. Girshick. Mask R-CNN. In IEEE International Conference on Computer Vision , ICCV, pages 2980-2988. IEEE Computer Society, Oct 2017. -- [13] Glenn Jocher, Alex Stoken, Ayush Chaurasia, Jirka Borovec, NanoCode012, TaoXie, Yonghye Kwon, Kalen Michael, Liu Changyu, Jiacong Fang, Abhiram V, Laughing, tkianai, yxNONG, Piotr Skalski, Adam Hogan, Jebastin Nadar, imyhxy, Lorenzo Mammana, Alex Wang, Cristi Fati, Diego Montes, Jan Hajek, Laurentiu +[13] Glenn Jocher, Alex Stoken, Ayush Chaurasia, Jirka Borovec, NanoCode012, TaoXie, Yonghye Kwon, Kalen Michael, Liu Changyu, Jiacong Fang, Abhiram V, Laughing, tkianai, yxNONG, Piotr Skalski, Adam Hogan, Jebastin Nadar, imyhxy, Lorenzo Mammana, Alex Wang, Cristi Fati, Diego Montes, Jan Hajek, Laurentiu Text Caption List-Item Formula Table Section-Header Picture Page-Header Page-Footer Title @@ -315,22 +315,22 @@ Figure 6: Example layout predictions on selected pages from the DocLayNet test-s Diaconu, Mai Thanh Minh, Marc, albinxavi, fatih, oleg, and wanghao yang. ultralytics/yolov5: v6.0 - yolov5n nano models, roboflow integration, tensorflow export, opencv dnn support, October 2021. -- [20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021. +[20] Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021. -- [14] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-end object detection with transformers. CoRR , abs/2005.12872, 2020. +[14] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-end object detection with transformers. CoRR , abs/2005.12872, 2020. -- [15] Mingxing Tan, Ruoming Pang, and Quoc V. Le. Efficientdet: Scalable and efficient object detection. CoRR , abs/1911.09070, 2019. +[15] Mingxing Tan, Ruoming Pang, and Quoc V. Le. Efficientdet: Scalable and efficient object detection. CoRR , abs/1911.09070, 2019. -- [16] Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev, Ross B. Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Dollár, and C. Lawrence Zitnick. Microsoft COCO: common objects in context, 2014. +[16] Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev, Ross B. Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Dollár, and C. Lawrence Zitnick. Microsoft COCO: common objects in context, 2014. -- [17] Yuxin Wu, Alexander Kirillov, Francisco Massa, Wan-Yen Lo, and Ross Girshick. Detectron2, 2019. +[17] Yuxin Wu, Alexander Kirillov, Francisco Massa, Wan-Yen Lo, and Ross Girshick. Detectron2, 2019. -- [18] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter W. J. Staar. Robust pdf document conversion using recurrent neural networks. In Proceedings of the 35th Conference on Artificial Intelligence , AAAI, pages 1513715145, feb 2021. 
+[18] Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter W. J. Staar. Robust pdf document conversion using recurrent neural networks. In Proceedings of the 35th Conference on Artificial Intelligence , AAAI, pages 1513715145, feb 2021.
-- [19] Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and Ming Zhou. Layoutlm: Pre-training of text and layout for document image understanding. In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 1192-1200, New York, USA, 2020. Association for Computing Machinery.
+[19] Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and Ming Zhou. Layoutlm: Pre-training of text and layout for document image understanding. In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 1192-1200, New York, USA, 2020. Association for Computing Machinery.
-- [21] Peng Zhang, Can Li, Liang Qiao, Zhanzhan Cheng, Shiliang Pu, Yi Niu, and Fei Wu. Vsr: A unified framework for document layout analysis combining vision, semantics and relations, 2021.
+[21] Peng Zhang, Can Li, Liang Qiao, Zhanzhan Cheng, Shiliang Pu, Yi Niu, and Fei Wu. Vsr: A unified framework for document layout analysis combining vision, semantics and relations, 2021.
-- [22] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 774-782. ACM, 2018.
+[22] Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 774-782. ACM, 2018.
-- [23] Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation for deep learning. Journal of Big Data , 6(1):60, 2019.
\ No newline at end of file
+[23] Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation for deep learning. Journal of Big Data , 6(1):60, 2019.
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v1/2305.03393v1.doctags.txt b/tests/data/groundtruth/docling_v1/2305.03393v1.doctags.txt
index 847f387..8d9385d 100644
--- a/tests/data/groundtruth/docling_v1/2305.03393v1.doctags.txt
+++ b/tests/data/groundtruth/docling_v1/2305.03393v1.doctags.txt
@@ -42,11 +42,11 @@
4.1 Language Definition
In Figure 3, we illustrate how the OTSL is defined. In essence, the OTSL defines only 5 tokens that directly describe a tabular structure based on an atomic 2D grid. The OTSL vocabulary is comprised of the following tokens:
-- -"C" cell a new table cell that either has or does not have cell content
-- -"L" cell left-looking cell , merging with the left neighbor cell to create a span
-- -"U" cell up-looking cell , merging with the upper neighbor cell to create a span
-- -"X" cell cross cell , to merge with both left and upper neighbor cells
-- -"NL" new-line , switch to the next row.
+-"C" cell a new table cell that either has or does not have cell content
+-"L" cell left-looking cell , merging with the left neighbor cell to create a span
+-"U" cell up-looking cell , merging with the upper neighbor cell to create a span
+-"X" cell cross cell , to merge with both left and upper neighbor cells
+-"NL" new-line , switch to the next row.
A notable attribute of OTSL is that it has the capability of achieving lossless conversion to HTML.
@@ -55,13 +55,13 @@
Fig. 3. OTSL description of table structure: A - table example; B - graphical representation of table structure; C - mapping structure on a grid; D - OTSL structure encoding; E - explanation on cell encoding
4.2 Language Syntax
The OTSL representation follows these syntax rules:
-- 1. Left-looking cell rule : The left neighbour of an "L" cell must be either another "L" cell or a "C" cell.
-- 2. Up-looking cell rule : The upper neighbour of a "U" cell must be either another "U" cell or a "C" cell.
+1. Left-looking cell rule : The left neighbour of an "L" cell must be either another "L" cell or a "C" cell.
+2. Up-looking cell rule : The upper neighbour of a "U" cell must be either another "U" cell or a "C" cell.
3. Cross cell rule :
-- The left neighbour of an "X" cell must be either another "X" cell or a "U" cell, and the upper neighbour of an "X" cell must be either another "X" cell or an "L" cell.
-- 4. First row rule : Only "L" cells and "C" cells are allowed in the first row.
-- 5. First column rule : Only "U" cells and "C" cells are allowed in the first column.
-- 6. Rectangular rule : The table representation is always rectangular - all rows must have an equal number of tokens, terminated with "NL" token.
+The left neighbour of an "X" cell must be either another "X" cell or a "U" cell, and the upper neighbour of an "X" cell must be either another "X" cell or an "L" cell.
+4. First row rule : Only "L" cells and "C" cells are allowed in the first row.
+5. First column rule : Only "U" cells and "C" cells are allowed in the first column.
+6. Rectangular rule : The table representation is always rectangular - all rows must have an equal number of tokens, terminated with "NL" token.
The application of these rules gives OTSL a set of unique properties. First of all, the OTSL enforces a strictly rectangular structure representation, where every new-line token starts a new row. As a consequence, all rows and all columns have exactly the same number of tokens, irrespective of cell spans. Secondly, the OTSL representation is unambiguous: Every table structure is represented in one way. In this representation every table cell corresponds to a "C"-cell token, which in case of spans is always located in the top-left corner of the table cell definition. Third, OTSL syntax rules are only backward-looking. As a consequence, every predicted token can be validated straight during sequence generation by looking at the previously predicted sequence. As such, OTSL can guarantee that every predicted sequence is syntactically valid. These characteristics can be easily learned by sequence generator networks, as we demonstrate further below. We find strong indications that this pattern reduces significantly the column drift seen in the HTML based models (see Figure 5).
@@ -121,27 +121,27 @@
First and foremost, given the same network configuration, inference time for a table-structure prediction is about 2 times faster compared to the conventional HTML approach. This is primarily owed to the shorter sequence length of the OTSL representation. Additional performance benefits can be obtained with HPO (hyper parameter optimization). As we demonstrate in our experiments, models trained on OTSL can be significantly smaller, e.g. by reducing the number of encoder and decoder layers, while preserving comparatively good prediction quality. This can further improve inference performance, yielding 5-6 times faster inference speed in OTSL with prediction quality comparable to models trained on HTML (see Table 1).
Secondly, OTSL has more inherent structure and a significantly restricted vocabulary size. This allows autoregressive models to perform better in the TED metric, but especially with regards to prediction accuracy of the table-cell bounding boxes (see Table 2). As shown in Figure 5, we observe that the OTSL drastically reduces the drift for table cell bounding boxes at high row count and in sparse tables. This leads to more accurate predictions and a significant reduction in post-processing complexity, which is an undesired necessity in HTML-based Im2Seq models.
Significant novelty lies in OTSL syntactical rules, which are few, simple and always backwards looking. Each new token can be validated only by analyzing the sequence of previous tokens, without requiring the entire sequence to detect mistakes. This in return allows to perform structural error detection and correction on-the-fly during sequence generation.
References
-- 1. Auer, C., Dolfi, M., Carvalho, A., Ramis, C.B., Staar, P.W.J.: Delivering document conversion as a cloud service with high throughput and responsiveness. CoRR abs/2206.00785 (2022). https://doi.org/10.48550/arXiv.2206.00785 , https://doi.org/10.48550/arXiv.2206.00785
-- 2. Chen, B., Peng, D., Zhang, J., Ren, Y., Jin, L.: Complex table structure recognition in the wild using transformer and identity matrix-based augmentation. In: Porwal, U., Fornés, A., Shafait, F. (eds.) Frontiers in Handwriting Recognition. pp. 545561. Springer International Publishing, Cham (2022)
-- 3. Chi, Z., Huang, H., Xu, H.D., Yu, H., Yin, W., Mao, X.L.: Complicated table structure recognition. arXiv preprint arXiv:1908.04729 (2019)
-- 4. Deng, Y., Rosenberg, D., Mann, G.: Challenges in end-to-end neural scientific table recognition. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 894-901. IEEE (2019)
-- 5. Kayal, P., Anand, M., Desai, H., Singh, M.: Tables to latex: structure and content extraction from scientific tables. International Journal on Document Analysis and Recognition (IJDAR) pp. 1-10 (2022)
-- 6. Lee, E., Kwon, J., Yang, H., Park, J., Lee, S., Koo, H.I., Cho, N.I.: Table structure recognition based on grid shape graph. In: 2022 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC). pp. 18681873. IEEE (2022)
-- 7. Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: A benchmark dataset for table detection and recognition (2019)
-- 8. Livathinos, N., Berrospi, C., Lysak, M., Kuropiatnyk, V., Nassar, A., Carvalho, A., Dolfi, M., Auer, C., Dinkla, K., Staar, P.: Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence 35 (17), 15137-15145 (May 2021), https://ojs.aaai.org/index.php/ AAAI/article/view/17777
-- 9. Nassar, A., Livathinos, N., Lysak, M., Staar, P.: Tableformer: Table structure understanding with transformers. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 4614-4623 (June 2022)
-- 10. Pfitzmann, B., Auer, C., Dolfi, M., Nassar, A.S., Staar, P.W.J.: Doclaynet: A large human-annotated dataset for document-layout segmentation. In: Zhang, A., Rangwala, H. (eds.) KDD '22: The 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, Washington, DC, USA, August 14 - 18, 2022. pp. 3743-3751. ACM (2022). https://doi.org/10.1145/3534678.3539043 , https:// doi.org/10.1145/3534678.3539043
-- 11. Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet: An approach for end to end table detection and structure recognition from imagebased documents. In: Proceedings of the IEEE/CVF conference on computer vision and pattern recognition workshops. pp. 572-573 (2020)
-- 12. Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162-1167. IEEE (2017)
-- 13. Siddiqui, S.A., Fateh, I.A., Rizvi, S.T.R., Dengel, A., Ahmed, S.: Deeptabstr: Deep learning based table structure recognition. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1403-1409 (2019). https:// doi.org/10.1109/ICDAR.2019.00226
-- 14. Smock, B., Pesala, R., Abraham, R.: PubTables-1M: Towards comprehensive table extraction from unstructured documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 4634-4642 (June 2022)
-- 15. Staar, P.W.J., Dolfi, M., Auer, C., Bekas, C.: Corpus conversion service: A machine learning platform to ingest documents at scale. In: Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. pp. 774-782. KDD '18, Association for Computing Machinery, New York, NY, USA (2018). https://doi.org/10.1145/3219819.3219834 , https://doi.org/10. 1145/3219819.3219834
-- 16. Wang, X.: Tabular Abstraction, Editing, and Formatting. Ph.D. thesis, CAN (1996), aAINN09397
-- 17. Xue, W., Li, Q., Tao, D.: Res2tim: Reconstruct syntactic structures from table images. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 749-755. IEEE (2019)
-- 18. Xue, W., Yu, B., Wang, W., Tao, D., Li, Q.: Tgrnet: A table graph reconstruction network for table structure recognition. In: Proceedings of the IEEE/CVF International Conference on Computer Vision. pp. 1295-1304 (2021)
-- 19. Ye, J., Qi, X., He, Y., Chen, Y., Gu, D., Gao, P., Xiao, R.: Pingan-vcgroup's solution for icdar 2021 competition on scientific literature parsing task b: Table recognition to html (2021). https://doi.org/10.48550/ARXIV.2105.01848 , https://arxiv.org/abs/2105.01848
-- 20. Zhang, Z., Zhang, J., Du, J., Wang, F.: Split, embed and merge: An accurate table structure recognizer. Pattern Recognition 126 , 108565 (2022)
-- 21. Zheng, X., Burdick, D., Popa, L., Zhong, X., Wang, N.X.R.: Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. In: 2021 IEEE Winter Conference on Applications of Computer Vision (WACV). pp. 697-706 (2021). https://doi.org/10.1109/WACV48630.2021. 00074
-- 22. Zhong, X., ShafieiBavani, E., Jimeno Yepes, A.: Image-based table recognition: Data, model, and evaluation. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.M. (eds.) Computer Vision - ECCV 2020. pp. 564-580. Springer International Publishing, Cham (2020)
-- 23. Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for document layout analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1015-1022. IEEE (2019)
+1. Auer, C., Dolfi, M., Carvalho, A., Ramis, C.B., Staar, P.W.J.: Delivering document conversion as a cloud service with high throughput and responsiveness. CoRR abs/2206.00785 (2022). https://doi.org/10.48550/arXiv.2206.00785 , https://doi.org/10.48550/arXiv.2206.00785
+2. Chen, B., Peng, D., Zhang, J., Ren, Y., Jin, L.: Complex table structure recognition in the wild using transformer and identity matrix-based augmentation. In: Porwal, U., Fornés, A., Shafait, F. (eds.) Frontiers in Handwriting Recognition. pp. 545561. Springer International Publishing, Cham (2022)
+3. Chi, Z., Huang, H., Xu, H.D., Yu, H., Yin, W., Mao, X.L.: Complicated table structure recognition. arXiv preprint arXiv:1908.04729 (2019)
+4. Deng, Y., Rosenberg, D., Mann, G.: Challenges in end-to-end neural scientific table recognition. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 894-901. IEEE (2019)
+5. Kayal, P., Anand, M., Desai, H., Singh, M.: Tables to latex: structure and content extraction from scientific tables. International Journal on Document Analysis and Recognition (IJDAR) pp. 1-10 (2022)
+6. Lee, E., Kwon, J., Yang, H., Park, J., Lee, S., Koo, H.I., Cho, N.I.: Table structure recognition based on grid shape graph. In: 2022 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC). pp. 18681873. IEEE (2022)
+7. Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: A benchmark dataset for table detection and recognition (2019)
+8. Livathinos, N., Berrospi, C., Lysak, M., Kuropiatnyk, V., Nassar, A., Carvalho, A., Dolfi, M., Auer, C., Dinkla, K., Staar, P.: Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence 35 (17), 15137-15145 (May 2021), https://ojs.aaai.org/index.php/ AAAI/article/view/17777
+9. Nassar, A., Livathinos, N., Lysak, M., Staar, P.: Tableformer: Table structure understanding with transformers. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 4614-4623 (June 2022)
+10. Pfitzmann, B., Auer, C., Dolfi, M., Nassar, A.S., Staar, P.W.J.: Doclaynet: A large human-annotated dataset for document-layout segmentation. In: Zhang, A., Rangwala, H. (eds.) KDD '22: The 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, Washington, DC, USA, August 14 - 18, 2022. pp. 3743-3751. ACM (2022). https://doi.org/10.1145/3534678.3539043 , https:// doi.org/10.1145/3534678.3539043
+11. Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet: An approach for end to end table detection and structure recognition from imagebased documents. In: Proceedings of the IEEE/CVF conference on computer vision and pattern recognition workshops. pp. 572-573 (2020)
+12. Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162-1167. IEEE (2017)
+13. Siddiqui, S.A., Fateh, I.A., Rizvi, S.T.R., Dengel, A., Ahmed, S.: Deeptabstr: Deep learning based table structure recognition. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1403-1409 (2019). https:// doi.org/10.1109/ICDAR.2019.00226
+14. Smock, B., Pesala, R., Abraham, R.: PubTables-1M: Towards comprehensive table extraction from unstructured documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 4634-4642 (June 2022)
+15. Staar, P.W.J., Dolfi, M., Auer, C., Bekas, C.: Corpus conversion service: A machine learning platform to ingest documents at scale. In: Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. pp. 774-782. KDD '18, Association for Computing Machinery, New York, NY, USA (2018). https://doi.org/10.1145/3219819.3219834 , https://doi.org/10. 1145/3219819.3219834
+16. Wang, X.: Tabular Abstraction, Editing, and Formatting. Ph.D. thesis, CAN (1996), aAINN09397
+17. Xue, W., Li, Q., Tao, D.: Res2tim: Reconstruct syntactic structures from table images. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 749-755. IEEE (2019)
+18. Xue, W., Yu, B., Wang, W., Tao, D., Li, Q.: Tgrnet: A table graph reconstruction network for table structure recognition. In: Proceedings of the IEEE/CVF International Conference on Computer Vision. pp. 1295-1304 (2021)
+19. Ye, J., Qi, X., He, Y., Chen, Y., Gu, D., Gao, P., Xiao, R.: Pingan-vcgroup's solution for icdar 2021 competition on scientific literature parsing task b: Table recognition to html (2021). https://doi.org/10.48550/ARXIV.2105.01848 , https://arxiv.org/abs/2105.01848
+20. Zhang, Z., Zhang, J., Du, J., Wang, F.: Split, embed and merge: An accurate table structure recognizer. Pattern Recognition 126 , 108565 (2022)
+21. Zheng, X., Burdick, D., Popa, L., Zhong, X., Wang, N.X.R.: Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. In: 2021 IEEE Winter Conference on Applications of Computer Vision (WACV). pp. 697-706 (2021). https://doi.org/10.1109/WACV48630.2021. 00074
+22. Zhong, X., ShafieiBavani, E., Jimeno Yepes, A.: Image-based table recognition: Data, model, and evaluation. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.M. (eds.) Computer Vision - ECCV 2020. pp. 564-580. Springer International Publishing, Cham (2020)
+23. Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for document layout analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1015-1022. IEEE (2019)
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v1/2305.03393v1.json b/tests/data/groundtruth/docling_v1/2305.03393v1.json
index 140b25b..b5282aa 100644
--- a/tests/data/groundtruth/docling_v1/2305.03393v1.json
+++ b/tests/data/groundtruth/docling_v1/2305.03393v1.json
@@ -937,7 +937,7 @@
"__ref_s3_data": null
}
],
- "text": "- -\"C\" cell a new table cell that either has or does not have cell content",
+ "text": "-\"C\" cell a new table cell that either has or does not have cell content",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -960,7 +960,7 @@
"__ref_s3_data": null
}
],
- "text": "- -\"L\" cell left-looking cell , merging with the left neighbor cell to create a span",
+ "text": "-\"L\" cell left-looking cell , merging with the left neighbor cell to create a span",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -983,7 +983,7 @@
"__ref_s3_data": null
}
],
- "text": "- -\"U\" cell up-looking cell , merging with the upper neighbor cell to create a span",
+ "text": "-\"U\" cell up-looking cell , merging with the upper neighbor cell to create a span",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -1006,7 +1006,7 @@
"__ref_s3_data": null
}
],
- "text": "- -\"X\" cell cross cell , to merge with both left and upper neighbor cells",
+ "text": "-\"X\" cell cross cell , to merge with both left and upper neighbor cells",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -1029,7 +1029,7 @@
"__ref_s3_data": null
}
],
- "text": "- -\"NL\" new-line , switch to the next row.",
+ "text": "-\"NL\" new-line , switch to the next row.",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -1149,7 +1149,7 @@
"__ref_s3_data": null
}
],
- "text": "- 1. Left-looking cell rule : The left neighbour of an \"L\" cell must be either another \"L\" cell or a \"C\" cell.",
+ "text": "1. Left-looking cell rule : The left neighbour of an \"L\" cell must be either another \"L\" cell or a \"C\" cell.",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -1172,7 +1172,7 @@
"__ref_s3_data": null
}
],
- "text": "- 2. Up-looking cell rule : The upper neighbour of a \"U\" cell must be either another \"U\" cell or a \"C\" cell.",
+ "text": "2. Up-looking cell rule : The upper neighbour of a \"U\" cell must be either another \"U\" cell or a \"C\" cell.",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -1218,7 +1218,7 @@
"__ref_s3_data": null
}
],
- "text": "- The left neighbour of an \"X\" cell must be either another \"X\" cell or a \"U\" cell, and the upper neighbour of an \"X\" cell must be either another \"X\" cell or an \"L\" cell.",
+ "text": "The left neighbour of an \"X\" cell must be either another \"X\" cell or a \"U\" cell, and the upper neighbour of an \"X\" cell must be either another \"X\" cell or an \"L\" cell.",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -1241,7 +1241,7 @@
"__ref_s3_data": null
}
],
- "text": "- 4. First row rule : Only \"L\" cells and \"C\" cells are allowed in the first row.",
+ "text": "4. First row rule : Only \"L\" cells and \"C\" cells are allowed in the first row.",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -1264,7 +1264,7 @@
"__ref_s3_data": null
}
],
- "text": "- 5. First column rule : Only \"U\" cells and \"C\" cells are allowed in the first column.",
+ "text": "5. First column rule : Only \"U\" cells and \"C\" cells are allowed in the first column.",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -1287,7 +1287,7 @@
"__ref_s3_data": null
}
],
- "text": "- 6. Rectangular rule : The table representation is always rectangular - all rows must have an equal number of tokens, terminated with \"NL\" token.",
+ "text": "6. Rectangular rule : The table representation is always rectangular - all rows must have an equal number of tokens, terminated with \"NL\" token.",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -1979,7 +1979,7 @@
"__ref_s3_data": null
}
],
- "text": "- 1. Auer, C., Dolfi, M., Carvalho, A., Ramis, C.B., Staar, P.W.J.: Delivering document conversion as a cloud service with high throughput and responsiveness. CoRR abs/2206.00785 (2022). https://doi.org/10.48550/arXiv.2206.00785 , https://doi.org/10.48550/arXiv.2206.00785",
+ "text": "1. Auer, C., Dolfi, M., Carvalho, A., Ramis, C.B., Staar, P.W.J.: Delivering document conversion as a cloud service with high throughput and responsiveness. CoRR abs/2206.00785 (2022). https://doi.org/10.48550/arXiv.2206.00785 , https://doi.org/10.48550/arXiv.2206.00785",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -2002,7 +2002,7 @@
"__ref_s3_data": null
}
],
- "text": "- 2. Chen, B., Peng, D., Zhang, J., Ren, Y., Jin, L.: Complex table structure recognition in the wild using transformer and identity matrix-based augmentation. In: Porwal, U., Forn\u00e9s, A., Shafait, F. (eds.) Frontiers in Handwriting Recognition. pp. 545561. Springer International Publishing, Cham (2022)",
+ "text": "2. Chen, B., Peng, D., Zhang, J., Ren, Y., Jin, L.: Complex table structure recognition in the wild using transformer and identity matrix-based augmentation. In: Porwal, U., Forn\u00e9s, A., Shafait, F. (eds.) Frontiers in Handwriting Recognition. pp. 545561. Springer International Publishing, Cham (2022)",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -2025,7 +2025,7 @@
"__ref_s3_data": null
}
],
- "text": "- 3. Chi, Z., Huang, H., Xu, H.D., Yu, H., Yin, W., Mao, X.L.: Complicated table structure recognition. arXiv preprint arXiv:1908.04729 (2019)",
+ "text": "3. Chi, Z., Huang, H., Xu, H.D., Yu, H., Yin, W., Mao, X.L.: Complicated table structure recognition. arXiv preprint arXiv:1908.04729 (2019)",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -2048,7 +2048,7 @@
"__ref_s3_data": null
}
],
- "text": "- 4. Deng, Y., Rosenberg, D., Mann, G.: Challenges in end-to-end neural scientific table recognition. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 894-901. IEEE (2019)",
+ "text": "4. Deng, Y., Rosenberg, D., Mann, G.: Challenges in end-to-end neural scientific table recognition. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 894-901. IEEE (2019)",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -2071,7 +2071,7 @@
"__ref_s3_data": null
}
],
- "text": "- 5. Kayal, P., Anand, M., Desai, H., Singh, M.: Tables to latex: structure and content extraction from scientific tables. International Journal on Document Analysis and Recognition (IJDAR) pp. 1-10 (2022)",
+ "text": "5. Kayal, P., Anand, M., Desai, H., Singh, M.: Tables to latex: structure and content extraction from scientific tables. International Journal on Document Analysis and Recognition (IJDAR) pp. 1-10 (2022)",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -2094,7 +2094,7 @@
"__ref_s3_data": null
}
],
- "text": "- 6. Lee, E., Kwon, J., Yang, H., Park, J., Lee, S., Koo, H.I., Cho, N.I.: Table structure recognition based on grid shape graph. In: 2022 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC). pp. 18681873. IEEE (2022)",
+ "text": "6. Lee, E., Kwon, J., Yang, H., Park, J., Lee, S., Koo, H.I., Cho, N.I.: Table structure recognition based on grid shape graph. In: 2022 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC). pp. 18681873. IEEE (2022)",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -2117,7 +2117,7 @@
"__ref_s3_data": null
}
],
- "text": "- 7. Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: A benchmark dataset for table detection and recognition (2019)",
+ "text": "7. Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: A benchmark dataset for table detection and recognition (2019)",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -2140,7 +2140,7 @@
"__ref_s3_data": null
}
],
- "text": "- 8. Livathinos, N., Berrospi, C., Lysak, M., Kuropiatnyk, V., Nassar, A., Carvalho, A., Dolfi, M., Auer, C., Dinkla, K., Staar, P.: Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence 35 (17), 15137-15145 (May 2021), https://ojs.aaai.org/index.php/ AAAI/article/view/17777",
+ "text": "8. Livathinos, N., Berrospi, C., Lysak, M., Kuropiatnyk, V., Nassar, A., Carvalho, A., Dolfi, M., Auer, C., Dinkla, K., Staar, P.: Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence 35 (17), 15137-15145 (May 2021), https://ojs.aaai.org/index.php/ AAAI/article/view/17777",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -2163,7 +2163,7 @@
"__ref_s3_data": null
}
],
- "text": "- 9. Nassar, A., Livathinos, N., Lysak, M., Staar, P.: Tableformer: Table structure understanding with transformers. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 4614-4623 (June 2022)",
+ "text": "9. Nassar, A., Livathinos, N., Lysak, M., Staar, P.: Tableformer: Table structure understanding with transformers. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 4614-4623 (June 2022)",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -2186,7 +2186,7 @@
"__ref_s3_data": null
}
],
- "text": "- 10. Pfitzmann, B., Auer, C., Dolfi, M., Nassar, A.S., Staar, P.W.J.: Doclaynet: A large human-annotated dataset for document-layout segmentation. In: Zhang, A., Rangwala, H. (eds.) KDD '22: The 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, Washington, DC, USA, August 14 - 18, 2022. pp. 3743-3751. ACM (2022). https://doi.org/10.1145/3534678.3539043 , https:// doi.org/10.1145/3534678.3539043",
+ "text": "10. Pfitzmann, B., Auer, C., Dolfi, M., Nassar, A.S., Staar, P.W.J.: Doclaynet: A large human-annotated dataset for document-layout segmentation. In: Zhang, A., Rangwala, H. (eds.) KDD '22: The 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, Washington, DC, USA, August 14 - 18, 2022. pp. 3743-3751. ACM (2022). https://doi.org/10.1145/3534678.3539043 , https:// doi.org/10.1145/3534678.3539043",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -2209,7 +2209,7 @@
"__ref_s3_data": null
}
],
- "text": "- 11. Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet: An approach for end to end table detection and structure recognition from imagebased documents. In: Proceedings of the IEEE/CVF conference on computer vision and pattern recognition workshops. pp. 572-573 (2020)",
+ "text": "11. Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet: An approach for end to end table detection and structure recognition from imagebased documents. In: Proceedings of the IEEE/CVF conference on computer vision and pattern recognition workshops. pp. 572-573 (2020)",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -2232,7 +2232,7 @@
"__ref_s3_data": null
}
],
- "text": "- 12. Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162-1167. IEEE (2017)",
+ "text": "12. Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162-1167. IEEE (2017)",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -2255,7 +2255,7 @@
"__ref_s3_data": null
}
],
- "text": "- 13. Siddiqui, S.A., Fateh, I.A., Rizvi, S.T.R., Dengel, A., Ahmed, S.: Deeptabstr: Deep learning based table structure recognition. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1403-1409 (2019). https:// doi.org/10.1109/ICDAR.2019.00226",
+ "text": "13. Siddiqui, S.A., Fateh, I.A., Rizvi, S.T.R., Dengel, A., Ahmed, S.: Deeptabstr: Deep learning based table structure recognition. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1403-1409 (2019). https:// doi.org/10.1109/ICDAR.2019.00226",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -2278,7 +2278,7 @@
"__ref_s3_data": null
}
],
- "text": "- 14. Smock, B., Pesala, R., Abraham, R.: PubTables-1M: Towards comprehensive table extraction from unstructured documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 4634-4642 (June 2022)",
+ "text": "14. Smock, B., Pesala, R., Abraham, R.: PubTables-1M: Towards comprehensive table extraction from unstructured documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 4634-4642 (June 2022)",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -2301,7 +2301,7 @@
"__ref_s3_data": null
}
],
- "text": "- 15. Staar, P.W.J., Dolfi, M., Auer, C., Bekas, C.: Corpus conversion service: A machine learning platform to ingest documents at scale. In: Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. pp. 774-782. KDD '18, Association for Computing Machinery, New York, NY, USA (2018). https://doi.org/10.1145/3219819.3219834 , https://doi.org/10. 1145/3219819.3219834",
+ "text": "15. Staar, P.W.J., Dolfi, M., Auer, C., Bekas, C.: Corpus conversion service: A machine learning platform to ingest documents at scale. In: Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. pp. 774-782. KDD '18, Association for Computing Machinery, New York, NY, USA (2018). https://doi.org/10.1145/3219819.3219834 , https://doi.org/10. 1145/3219819.3219834",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -2324,7 +2324,7 @@
"__ref_s3_data": null
}
],
- "text": "- 16. Wang, X.: Tabular Abstraction, Editing, and Formatting. Ph.D. thesis, CAN (1996), aAINN09397",
+ "text": "16. Wang, X.: Tabular Abstraction, Editing, and Formatting. Ph.D. thesis, CAN (1996), aAINN09397",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -2347,7 +2347,7 @@
"__ref_s3_data": null
}
],
- "text": "- 17. Xue, W., Li, Q., Tao, D.: Res2tim: Reconstruct syntactic structures from table images. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 749-755. IEEE (2019)",
+ "text": "17. Xue, W., Li, Q., Tao, D.: Res2tim: Reconstruct syntactic structures from table images. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 749-755. IEEE (2019)",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -2370,7 +2370,7 @@
"__ref_s3_data": null
}
],
- "text": "- 18. Xue, W., Yu, B., Wang, W., Tao, D., Li, Q.: Tgrnet: A table graph reconstruction network for table structure recognition. In: Proceedings of the IEEE/CVF International Conference on Computer Vision. pp. 1295-1304 (2021)",
+ "text": "18. Xue, W., Yu, B., Wang, W., Tao, D., Li, Q.: Tgrnet: A table graph reconstruction network for table structure recognition. In: Proceedings of the IEEE/CVF International Conference on Computer Vision. pp. 1295-1304 (2021)",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -2393,7 +2393,7 @@
"__ref_s3_data": null
}
],
- "text": "- 19. Ye, J., Qi, X., He, Y., Chen, Y., Gu, D., Gao, P., Xiao, R.: Pingan-vcgroup's solution for icdar 2021 competition on scientific literature parsing task b: Table recognition to html (2021). https://doi.org/10.48550/ARXIV.2105.01848 , https://arxiv.org/abs/2105.01848",
+ "text": "19. Ye, J., Qi, X., He, Y., Chen, Y., Gu, D., Gao, P., Xiao, R.: Pingan-vcgroup's solution for icdar 2021 competition on scientific literature parsing task b: Table recognition to html (2021). https://doi.org/10.48550/ARXIV.2105.01848 , https://arxiv.org/abs/2105.01848",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -2416,7 +2416,7 @@
"__ref_s3_data": null
}
],
- "text": "- 20. Zhang, Z., Zhang, J., Du, J., Wang, F.: Split, embed and merge: An accurate table structure recognizer. Pattern Recognition 126 , 108565 (2022)",
+ "text": "20. Zhang, Z., Zhang, J., Du, J., Wang, F.: Split, embed and merge: An accurate table structure recognizer. Pattern Recognition 126 , 108565 (2022)",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -2439,7 +2439,7 @@
"__ref_s3_data": null
}
],
- "text": "- 21. Zheng, X., Burdick, D., Popa, L., Zhong, X., Wang, N.X.R.: Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. In: 2021 IEEE Winter Conference on Applications of Computer Vision (WACV). pp. 697-706 (2021). https://doi.org/10.1109/WACV48630.2021. 00074",
+ "text": "21. Zheng, X., Burdick, D., Popa, L., Zhong, X., Wang, N.X.R.: Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. In: 2021 IEEE Winter Conference on Applications of Computer Vision (WACV). pp. 697-706 (2021). https://doi.org/10.1109/WACV48630.2021. 00074",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -2462,7 +2462,7 @@
"__ref_s3_data": null
}
],
- "text": "- 22. Zhong, X., ShafieiBavani, E., Jimeno Yepes, A.: Image-based table recognition: Data, model, and evaluation. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.M. (eds.) Computer Vision - ECCV 2020. pp. 564-580. Springer International Publishing, Cham (2020)",
+ "text": "22. Zhong, X., ShafieiBavani, E., Jimeno Yepes, A.: Image-based table recognition: Data, model, and evaluation. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.M. (eds.) Computer Vision - ECCV 2020. pp. 564-580. Springer International Publishing, Cham (2020)",
"type": "paragraph",
"payload": null,
"name": "List-item",
@@ -2485,7 +2485,7 @@
"__ref_s3_data": null
}
],
- "text": "- 23. Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for document layout analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1015-1022. IEEE (2019)",
+ "text": "23. Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for document layout analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1015-1022. IEEE (2019)",
"type": "paragraph",
"payload": null,
"name": "List-item",
diff --git a/tests/data/groundtruth/docling_v1/2305.03393v1.md b/tests/data/groundtruth/docling_v1/2305.03393v1.md
index 5d1d7b8..91dd0e8 100644
--- a/tests/data/groundtruth/docling_v1/2305.03393v1.md
+++ b/tests/data/groundtruth/docling_v1/2305.03393v1.md
@@ -70,15 +70,15 @@ In Figure 3, we illustrate how the OTSL is defined. In essence, the OTSL defines
The OTSL vocabulary is comprised of the following tokens:
-- -"C" cell a new table cell that either has or does not have cell content
+-"C" cell a new table cell that either has or does not have cell content
-- -"L" cell left-looking cell , merging with the left neighbor cell to create a span
+-"L" cell left-looking cell , merging with the left neighbor cell to create a span
-- -"U" cell up-looking cell , merging with the upper neighbor cell to create a span
+-"U" cell up-looking cell , merging with the upper neighbor cell to create a span
-- -"X" cell cross cell , to merge with both left and upper neighbor cells
+-"X" cell cross cell , to merge with both left and upper neighbor cells
-- -"NL" new-line , switch to the next row.
+-"NL" new-line , switch to the next row.
A notable attribute of OTSL is that it has the capability of achieving lossless conversion to HTML.
@@ -89,19 +89,19 @@ Fig. 3. OTSL description of table structure: A - table example; B - graphical re
The OTSL representation follows these syntax rules:
-- 1. Left-looking cell rule : The left neighbour of an "L" cell must be either another "L" cell or a "C" cell.
+1. Left-looking cell rule : The left neighbour of an "L" cell must be either another "L" cell or a "C" cell.
-- 2. Up-looking cell rule : The upper neighbour of a "U" cell must be either another "U" cell or a "C" cell.
+2. Up-looking cell rule : The upper neighbour of a "U" cell must be either another "U" cell or a "C" cell.
## 3. Cross cell rule :
-- The left neighbour of an "X" cell must be either another "X" cell or a "U" cell, and the upper neighbour of an "X" cell must be either another "X" cell or an "L" cell.
+The left neighbour of an "X" cell must be either another "X" cell or a "U" cell, and the upper neighbour of an "X" cell must be either another "X" cell or an "L" cell.
-- 4. First row rule : Only "L" cells and "C" cells are allowed in the first row.
+4. First row rule : Only "L" cells and "C" cells are allowed in the first row.
-- 5. First column rule : Only "U" cells and "C" cells are allowed in the first column.
+5. First column rule : Only "U" cells and "C" cells are allowed in the first column.
-- 6. Rectangular rule : The table representation is always rectangular - all rows must have an equal number of tokens, terminated with "NL" token.
+6. Rectangular rule : The table representation is always rectangular - all rows must have an equal number of tokens, terminated with "NL" token.
The application of these rules gives OTSL a set of unique properties. First of all, the OTSL enforces a strictly rectangular structure representation, where every new-line token starts a new row. As a consequence, all rows and all columns have exactly the same number of tokens, irrespective of cell spans. Secondly, the OTSL representation is unambiguous: Every table structure is represented in one way. In this representation every table cell corresponds to a "C"-cell token, which in case of spans is always located in the top-left corner of the table cell definition. Third, OTSL syntax rules are only backward-looking. As a consequence, every predicted token can be validated straight during sequence generation by looking at the previously predicted sequence. As such, OTSL can guarantee that every predicted sequence is syntactically valid.
@@ -177,48 +177,48 @@ Secondly, OTSL has more inherent structure and a significantly restricted vocabu
## References
-- 1. Auer, C., Dolfi, M., Carvalho, A., Ramis, C.B., Staar, P.W.J.: Delivering document conversion as a cloud service with high throughput and responsiveness. CoRR abs/2206.00785 (2022). https://doi.org/10.48550/arXiv.2206.00785 , https://doi.org/10.48550/arXiv.2206.00785
+1. Auer, C., Dolfi, M., Carvalho, A., Ramis, C.B., Staar, P.W.J.: Delivering document conversion as a cloud service with high throughput and responsiveness. CoRR abs/2206.00785 (2022). https://doi.org/10.48550/arXiv.2206.00785 , https://doi.org/10.48550/arXiv.2206.00785
-- 2. Chen, B., Peng, D., Zhang, J., Ren, Y., Jin, L.: Complex table structure recognition in the wild using transformer and identity matrix-based augmentation. In: Porwal, U., Fornés, A., Shafait, F. (eds.) Frontiers in Handwriting Recognition. pp. 545561. Springer International Publishing, Cham (2022)
+2. Chen, B., Peng, D., Zhang, J., Ren, Y., Jin, L.: Complex table structure recognition in the wild using transformer and identity matrix-based augmentation. In: Porwal, U., Fornés, A., Shafait, F. (eds.) Frontiers in Handwriting Recognition. pp. 545561. Springer International Publishing, Cham (2022)
-- 3. Chi, Z., Huang, H., Xu, H.D., Yu, H., Yin, W., Mao, X.L.: Complicated table structure recognition. arXiv preprint arXiv:1908.04729 (2019)
+3. Chi, Z., Huang, H., Xu, H.D., Yu, H., Yin, W., Mao, X.L.: Complicated table structure recognition. arXiv preprint arXiv:1908.04729 (2019)
-- 4. Deng, Y., Rosenberg, D., Mann, G.: Challenges in end-to-end neural scientific table recognition. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 894-901. IEEE (2019)
+4. Deng, Y., Rosenberg, D., Mann, G.: Challenges in end-to-end neural scientific table recognition. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 894-901. IEEE (2019)
-- 5. Kayal, P., Anand, M., Desai, H., Singh, M.: Tables to latex: structure and content extraction from scientific tables. International Journal on Document Analysis and Recognition (IJDAR) pp. 1-10 (2022)
+5. Kayal, P., Anand, M., Desai, H., Singh, M.: Tables to latex: structure and content extraction from scientific tables. International Journal on Document Analysis and Recognition (IJDAR) pp. 1-10 (2022)
-- 6. Lee, E., Kwon, J., Yang, H., Park, J., Lee, S., Koo, H.I., Cho, N.I.: Table structure recognition based on grid shape graph. In: 2022 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC). pp. 18681873. IEEE (2022)
+6. Lee, E., Kwon, J., Yang, H., Park, J., Lee, S., Koo, H.I., Cho, N.I.: Table structure recognition based on grid shape graph. In: 2022 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC). pp. 18681873. IEEE (2022)
-- 7. Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: A benchmark dataset for table detection and recognition (2019)
+7. Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: A benchmark dataset for table detection and recognition (2019)
-- 8. Livathinos, N., Berrospi, C., Lysak, M., Kuropiatnyk, V., Nassar, A., Carvalho, A., Dolfi, M., Auer, C., Dinkla, K., Staar, P.: Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence 35 (17), 15137-15145 (May 2021), https://ojs.aaai.org/index.php/ AAAI/article/view/17777
+8. Livathinos, N., Berrospi, C., Lysak, M., Kuropiatnyk, V., Nassar, A., Carvalho, A., Dolfi, M., Auer, C., Dinkla, K., Staar, P.: Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence 35 (17), 15137-15145 (May 2021), https://ojs.aaai.org/index.php/ AAAI/article/view/17777
-- 9. Nassar, A., Livathinos, N., Lysak, M., Staar, P.: Tableformer: Table structure understanding with transformers. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 4614-4623 (June 2022)
+9. Nassar, A., Livathinos, N., Lysak, M., Staar, P.: Tableformer: Table structure understanding with transformers. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 4614-4623 (June 2022)
-- 10. Pfitzmann, B., Auer, C., Dolfi, M., Nassar, A.S., Staar, P.W.J.: Doclaynet: A large human-annotated dataset for document-layout segmentation. In: Zhang, A., Rangwala, H. (eds.) KDD '22: The 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, Washington, DC, USA, August 14 - 18, 2022. pp. 3743-3751. ACM (2022). https://doi.org/10.1145/3534678.3539043 , https:// doi.org/10.1145/3534678.3539043
+10. Pfitzmann, B., Auer, C., Dolfi, M., Nassar, A.S., Staar, P.W.J.: Doclaynet: A large human-annotated dataset for document-layout segmentation. In: Zhang, A., Rangwala, H. (eds.) KDD '22: The 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, Washington, DC, USA, August 14 - 18, 2022. pp. 3743-3751. ACM (2022). https://doi.org/10.1145/3534678.3539043 , https:// doi.org/10.1145/3534678.3539043
-- 11. Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet: An approach for end to end table detection and structure recognition from imagebased documents. In: Proceedings of the IEEE/CVF conference on computer vision and pattern recognition workshops. pp. 572-573 (2020)
+11. Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet: An approach for end to end table detection and structure recognition from imagebased documents. In: Proceedings of the IEEE/CVF conference on computer vision and pattern recognition workshops. pp. 572-573 (2020)
-- 12. Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162-1167. IEEE (2017)
+12. Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162-1167. IEEE (2017)
-- 13. Siddiqui, S.A., Fateh, I.A., Rizvi, S.T.R., Dengel, A., Ahmed, S.: Deeptabstr: Deep learning based table structure recognition. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1403-1409 (2019). https:// doi.org/10.1109/ICDAR.2019.00226
+13. Siddiqui, S.A., Fateh, I.A., Rizvi, S.T.R., Dengel, A., Ahmed, S.: Deeptabstr: Deep learning based table structure recognition. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1403-1409 (2019). https:// doi.org/10.1109/ICDAR.2019.00226
-- 14. Smock, B., Pesala, R., Abraham, R.: PubTables-1M: Towards comprehensive table extraction from unstructured documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 4634-4642 (June 2022)
+14. Smock, B., Pesala, R., Abraham, R.: PubTables-1M: Towards comprehensive table extraction from unstructured documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 4634-4642 (June 2022)
-- 15. Staar, P.W.J., Dolfi, M., Auer, C., Bekas, C.: Corpus conversion service: A machine learning platform to ingest documents at scale. In: Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. pp. 774-782. KDD '18, Association for Computing Machinery, New York, NY, USA (2018). https://doi.org/10.1145/3219819.3219834 , https://doi.org/10. 1145/3219819.3219834
+15. Staar, P.W.J., Dolfi, M., Auer, C., Bekas, C.: Corpus conversion service: A machine learning platform to ingest documents at scale. In: Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. pp. 774-782. KDD '18, Association for Computing Machinery, New York, NY, USA (2018). https://doi.org/10.1145/3219819.3219834 , https://doi.org/10. 1145/3219819.3219834
-- 16. Wang, X.: Tabular Abstraction, Editing, and Formatting. Ph.D. thesis, CAN (1996), aAINN09397
+16. Wang, X.: Tabular Abstraction, Editing, and Formatting. Ph.D. thesis, CAN (1996), aAINN09397
-- 17. Xue, W., Li, Q., Tao, D.: Res2tim: Reconstruct syntactic structures from table images. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 749-755. IEEE (2019)
+17. Xue, W., Li, Q., Tao, D.: Res2tim: Reconstruct syntactic structures from table images. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 749-755. IEEE (2019)
-- 18. Xue, W., Yu, B., Wang, W., Tao, D., Li, Q.: Tgrnet: A table graph reconstruction network for table structure recognition. In: Proceedings of the IEEE/CVF International Conference on Computer Vision. pp. 1295-1304 (2021)
+18. Xue, W., Yu, B., Wang, W., Tao, D., Li, Q.: Tgrnet: A table graph reconstruction network for table structure recognition. In: Proceedings of the IEEE/CVF International Conference on Computer Vision. pp. 1295-1304 (2021)
-- 19. Ye, J., Qi, X., He, Y., Chen, Y., Gu, D., Gao, P., Xiao, R.: Pingan-vcgroup's solution for icdar 2021 competition on scientific literature parsing task b: Table recognition to html (2021). https://doi.org/10.48550/ARXIV.2105.01848 , https://arxiv.org/abs/2105.01848
+19. Ye, J., Qi, X., He, Y., Chen, Y., Gu, D., Gao, P., Xiao, R.: Pingan-vcgroup's solution for icdar 2021 competition on scientific literature parsing task b: Table recognition to html (2021). https://doi.org/10.48550/ARXIV.2105.01848 , https://arxiv.org/abs/2105.01848
-- 20. Zhang, Z., Zhang, J., Du, J., Wang, F.: Split, embed and merge: An accurate table structure recognizer. Pattern Recognition 126 , 108565 (2022)
+20. Zhang, Z., Zhang, J., Du, J., Wang, F.: Split, embed and merge: An accurate table structure recognizer. Pattern Recognition 126 , 108565 (2022)
-- 21. Zheng, X., Burdick, D., Popa, L., Zhong, X., Wang, N.X.R.: Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. In: 2021 IEEE Winter Conference on Applications of Computer Vision (WACV). pp. 697-706 (2021). https://doi.org/10.1109/WACV48630.2021. 00074
+21. Zheng, X., Burdick, D., Popa, L., Zhong, X., Wang, N.X.R.: Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. In: 2021 IEEE Winter Conference on Applications of Computer Vision (WACV). pp. 697-706 (2021). https://doi.org/10.1109/WACV48630.2021. 00074
-- 22. Zhong, X., ShafieiBavani, E., Jimeno Yepes, A.: Image-based table recognition: Data, model, and evaluation. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.M. (eds.) Computer Vision - ECCV 2020. pp. 564-580. Springer International Publishing, Cham (2020)
+22. Zhong, X., ShafieiBavani, E., Jimeno Yepes, A.: Image-based table recognition: Data, model, and evaluation. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.M. (eds.) Computer Vision - ECCV 2020. pp. 564-580. Springer International Publishing, Cham (2020)
-- 23. Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for document layout analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1015-1022. IEEE (2019)
\ No newline at end of file
+23. Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for document layout analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1015-1022. IEEE (2019)
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v1/multi_page.doctags.txt b/tests/data/groundtruth/docling_v1/multi_page.doctags.txt
index f58abcf..5e1e064 100644
--- a/tests/data/groundtruth/docling_v1/multi_page.doctags.txt
+++ b/tests/data/groundtruth/docling_v1/multi_page.doctags.txt
@@ -6,50 +6,50 @@
During this period, the term "word processing" didn't exist, but the typewriter laid the groundwork for future developments. Over time, advancements such as carbon paper (for copies) and the electric typewriter (introduced by IBM in 1935) improved the speed and convenience of document creation.
The Birth of Word Processing (1960s - 1970s)
The term "word processor" first emerged in the 1960s and referred to any system designed to streamline written communication and document production. Early word processors were not software programs but rather standalone machines.
-- · IBM MT/ST (Magnetic Tape/Selectric Typewriter) : Introduced in 1964, this machine combined IBM's Selectric typewriter with magnetic tape storage. It allowed users to record, edit, and replay typed content-an early example of digital text storage.
-- · Wang Laboratories : In the 1970s, Wang introduced dedicated word processing machines. These devices, like the Wang 1200, featured small screens and floppy disks, making them revolutionary for their time.
+· IBM MT/ST (Magnetic Tape/Selectric Typewriter) : Introduced in 1964, this machine combined IBM's Selectric typewriter with magnetic tape storage. It allowed users to record, edit, and replay typed content-an early example of digital text storage.
+· Wang Laboratories : In the 1970s, Wang introduced dedicated word processing machines. These devices, like the Wang 1200, featured small screens and floppy disks, making them revolutionary for their time.
These machines were primarily used in offices, where secretarial pools benefited from their ability to make revisions without retyping entire documents.
The Rise of Personal Computers (1980s)
The advent of personal computers in the late 1970s and early 1980s transformed word processing from a niche tool to an essential technology for businesses and individuals alike.
-- · WordStar (1978) : Developed for the CP/M operating system, WordStar was one of the first widely used word processing programs. It featured early examples of modern features like cut, copy, and paste.
-- · Microsoft Word (1983) : Microsoft launched Word for MS-DOS in 1983, introducing a graphical user interface (GUI) and mouse support. Over the years, Microsoft Word became the industry standard for word processing.
+· WordStar (1978) : Developed for the CP/M operating system, WordStar was one of the first widely used word processing programs. It featured early examples of modern features like cut, copy, and paste.
+· Microsoft Word (1983) : Microsoft launched Word for MS-DOS in 1983, introducing a graphical user interface (GUI) and mouse support. Over the years, Microsoft Word became the industry standard for word processing.
Other notable software from this era included WordPerfect, which was popular among legal professionals, and Apple's MacWrite, which leveraged the Macintosh's graphical capabilities.
The Modern Era (1990s - Present)
By the 1990s, word processing software had become more sophisticated, with features like spell check, grammar check, templates, and collaborative tools.
-- · Microsoft Office Suite : Microsoft continued to dominate with its Office Suite, integrating Word with other productivity tools like Excel and PowerPoint.
-- · OpenOffice and LibreOffice : Open-source alternatives emerged in the early 2000s, offering free and flexible word processing options.
-- · Google Docs (2006) : The introduction of cloud-based word processing revolutionized collaboration. Google Docs enabled real-time editing and sharing, making it a staple for teams and remote work.
+· Microsoft Office Suite : Microsoft continued to dominate with its Office Suite, integrating Word with other productivity tools like Excel and PowerPoint.
+· OpenOffice and LibreOffice : Open-source alternatives emerged in the early 2000s, offering free and flexible word processing options.
+· Google Docs (2006) : The introduction of cloud-based word processing revolutionized collaboration. Google Docs enabled real-time editing and sharing, making it a staple for teams and remote work.
Future of Word Processing
Today, word processors are more than just tools for typing. They integrate artificial intelligence for grammar and style suggestions (e.g., Grammarly), voice-to-text features, and advanced layout options.
As AI continues to advance, word processors may evolve into even more intuitive tools that predict user needs, automate repetitive tasks, and support richer multimedia integration.
From the clunky typewriters of the 19th century to the AI-powered cloud tools of today, the word processor has come a long way. It remains an essential tool for communication and creativity, shaping how we write and share ideas.
Specialized Word Processing Tools
In addition to general-purpose word processors, specialized tools have emerged to cater to specific industries and needs. These tools incorporate unique features tailored to their users' workflows:
-- · Academic and Technical Writing : Tools like LaTeX gained popularity among academics, scientists, and engineers. Unlike traditional word processors, LaTeX focuses on precise formatting, particularly for complex mathematical equations, scientific papers, and technical documents. It relies on a markup language to produce polished documents suitable for publishing.
-- · Screenwriting Software : For screenwriters, tools like Final Draft and Celtx are specialized to handle scripts for film and television. These programs automate the formatting of dialogue, scene descriptions, and other elements unique to screenwriting.
-- · Legal Document Processors : Word processors tailored for legal professionals, like WordPerfect, offered features such as redlining (early version tracking) and document comparison. Even today, many law firms rely on these tools due to their robust formatting options for contracts and legal briefs.
+· Academic and Technical Writing : Tools like LaTeX gained popularity among academics, scientists, and engineers. Unlike traditional word processors, LaTeX focuses on precise formatting, particularly for complex mathematical equations, scientific papers, and technical documents. It relies on a markup language to produce polished documents suitable for publishing.
+· Screenwriting Software : For screenwriters, tools like Final Draft and Celtx are specialized to handle scripts for film and television. These programs automate the formatting of dialogue, scene descriptions, and other elements unique to screenwriting.
+· Legal Document Processors : Word processors tailored for legal professionals, like WordPerfect, offered features such as redlining (early version tracking) and document comparison. Even today, many law firms rely on these tools due to their robust formatting options for contracts and legal briefs.
Key Features That Changed Word Processing
The evolution of word processors wasn't just about hardware or software improvements-it was about the features that revolutionized how people wrote and edited. Some of these transformative features include:
-- 1. Undo/Redo : Introduced in the 1980s, the ability to undo mistakes and redo actions made experimentation and error correction much easier.
-- 2. Spell Check and Grammar Check : By the 1990s, these became standard, allowing users to spot errors automatically.
-- 3. Templates : Pre-designed formats for documents, such as resumes, letters, and invoices, helped users save time.
-- 4. Track Changes : A game-changer for collaboration, this feature allowed multiple users to suggest edits while maintaining the original text.
-- 5. Real-Time Collaboration : Tools like Google Docs and Microsoft 365 enabled multiple users to edit the same document simultaneously, forever changing teamwork dynamics.
+1. Undo/Redo : Introduced in the 1980s, the ability to undo mistakes and redo actions made experimentation and error correction much easier.
+2. Spell Check and Grammar Check : By the 1990s, these became standard, allowing users to spot errors automatically.
+3. Templates : Pre-designed formats for documents, such as resumes, letters, and invoices, helped users save time.
+4. Track Changes : A game-changer for collaboration, this feature allowed multiple users to suggest edits while maintaining the original text.
+5. Real-Time Collaboration : Tools like Google Docs and Microsoft 365 enabled multiple users to edit the same document simultaneously, forever changing teamwork dynamics.
The Cultural Impact of Word Processors
The word processor didn't just change workplaces-it changed culture. It democratized writing, enabling anyone with access to a computer to produce professional-quality documents. This shift had profound implications for education, business, and creative fields:
-- · Accessibility : Writers no longer needed expensive publishing equipment or training in typesetting to create polished work. This accessibility paved the way for selfpublishing, blogging, and even fan fiction communities.
-- · Education : Word processors became a cornerstone of education, teaching students not only how to write essays but also how to use technology effectively. Features like bibliography generators and integrated research tools enhanced learning.
-- · Creative Writing : Writers gained powerful tools to organize their ideas. Programs like Scrivener allowed authors to manage large projects, from novels to screenplays, with features like chapter outlines and character notes.
+· Accessibility : Writers no longer needed expensive publishing equipment or training in typesetting to create polished work. This accessibility paved the way for selfpublishing, blogging, and even fan fiction communities.
+· Education : Word processors became a cornerstone of education, teaching students not only how to write essays but also how to use technology effectively. Features like bibliography generators and integrated research tools enhanced learning.
+· Creative Writing : Writers gained powerful tools to organize their ideas. Programs like Scrivener allowed authors to manage large projects, from novels to screenplays, with features like chapter outlines and character notes.
Word Processors in a Post-Digital Era
As we move further into the 21st century, the role of the word processor continues to evolve:
-- 1. Artificial Intelligence : Modern word processors are leveraging AI to suggest content improvements. Tools like Grammarly, ProWritingAid, and even native features in Word now analyze tone, conciseness, and clarity. Some AI systems can even generate entire paragraphs or rewrite sentences.
-- 2. Integration with Other Tools : Word processors are no longer standalone. They integrate with task managers, cloud storage, and project management platforms. For instance, Google Docs syncs with Google Drive, while Microsoft Word integrates seamlessly with OneDrive and Teams.
-- 3. Voice Typing : Speech-to-text capabilities have made word processing more accessible, particularly for those with disabilities.
Tools like Dragon NaturallySpeaking and built-in options in Google Docs and Microsoft Word have made dictation mainstream. -- 4. Multimedia Documents : Word processing has expanded beyond text. Modern tools allow users to embed images, videos, charts, and interactive elements, transforming simple documents into rich multimedia experiences. -- 5. Cross-Platform Accessibility : Thanks to cloud computing, documents can now be accessed and edited across devices. Whether you're on a desktop, tablet, or smartphone, you can continue working seamlessly. +1. Artificial Intelligence : Modern word processors are leveraging AI to suggest content improvements. Tools like Grammarly, ProWritingAid, and even native features in Word now analyze tone, conciseness, and clarity. Some AI systems can even generate entire paragraphs or rewrite sentences. +2. Integration with Other Tools : Word processors are no longer standalone. They integrate with task managers, cloud storage, and project management platforms. For instance, Google Docs syncs with Google Drive, while Microsoft Word integrates seamlessly with OneDrive and Teams. +3. Voice Typing : Speech-to-text capabilities have made word processing more accessible, particularly for those with disabilities. Tools like Dragon NaturallySpeaking and built-in options in Google Docs and Microsoft Word have made dictation mainstream. +4. Multimedia Documents : Word processing has expanded beyond text. Modern tools allow users to embed images, videos, charts, and interactive elements, transforming simple documents into rich multimedia experiences. +5. Cross-Platform Accessibility : Thanks to cloud computing, documents can now be accessed and edited across devices. Whether you're on a desktop, tablet, or smartphone, you can continue working seamlessly. A Glimpse Into the Future The word processor's future lies in adaptability and intelligence. Some exciting possibilities include: -- · Fully AI-Assisted Writing : Imagine a word processor that understands your writing style, drafts emails, or creates entire essays based on minimal input. -- · Immersive Interfaces : As augmented reality (AR) and virtual reality (VR) technology advance, users may be able to write and edit in 3D spaces, collaborating in virtual environments. -- · Hyper-Personalization : Word processors could offer dynamic suggestions based on industry-specific needs, user habits, or even regional language variations. +· Fully AI-Assisted Writing : Imagine a word processor that understands your writing style, drafts emails, or creates entire essays based on minimal input. +· Immersive Interfaces : As augmented reality (AR) and virtual reality (VR) technology advance, users may be able to write and edit in 3D spaces, collaborating in virtual environments. +· Hyper-Personalization : Word processors could offer dynamic suggestions based on industry-specific needs, user habits, or even regional language variations. The journey of the word processor-from clunky typewriters to AI-powered platformsreflects humanity's broader technological progress. What began as a tool to simply replace handwriting has transformed into a powerful ally for creativity, communication, and collaboration. As technology continues to advance, the word processor will undoubtedly remain at the heart of how we express ideas and connect with one another. 
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v1/multi_page.json b/tests/data/groundtruth/docling_v1/multi_page.json
index 36f82cd..059f629 100644
--- a/tests/data/groundtruth/docling_v1/multi_page.json
+++ b/tests/data/groundtruth/docling_v1/multi_page.json
@@ -238,7 +238,7 @@
 "__ref_s3_data": null
 }
 ],
- "text": "- \u00b7 IBM MT/ST (Magnetic Tape/Selectric Typewriter) : Introduced in 1964, this machine combined IBM's Selectric typewriter with magnetic tape storage. It allowed users to record, edit, and replay typed content-an early example of digital text storage.",
+ "text": "\u00b7 IBM MT/ST (Magnetic Tape/Selectric Typewriter) : Introduced in 1964, this machine combined IBM's Selectric typewriter with magnetic tape storage. It allowed users to record, edit, and replay typed content-an early example of digital text storage.",
 "type": "paragraph",
 "payload": null,
 "name": "List-item",
@@ -261,7 +261,7 @@
 "__ref_s3_data": null
 }
 ],
- "text": "- \u00b7 Wang Laboratories : In the 1970s, Wang introduced dedicated word processing machines. These devices, like the Wang 1200, featured small screens and floppy disks, making them revolutionary for their time.",
+ "text": "\u00b7 Wang Laboratories : In the 1970s, Wang introduced dedicated word processing machines. These devices, like the Wang 1200, featured small screens and floppy disks, making them revolutionary for their time.",
 "type": "paragraph",
 "payload": null,
 "name": "List-item",
@@ -353,7 +353,7 @@
 "__ref_s3_data": null
 }
 ],
- "text": "- \u00b7 WordStar (1978) : Developed for the CP/M operating system, WordStar was one of the first widely used word processing programs. It featured early examples of modern features like cut, copy, and paste.",
+ "text": "\u00b7 WordStar (1978) : Developed for the CP/M operating system, WordStar was one of the first widely used word processing programs. It featured early examples of modern features like cut, copy, and paste.",
 "type": "paragraph",
 "payload": null,
 "name": "List-item",
@@ -376,7 +376,7 @@
 "__ref_s3_data": null
 }
 ],
- "text": "- \u00b7 Microsoft Word (1983) : Microsoft launched Word for MS-DOS in 1983, introducing a graphical user interface (GUI) and mouse support. Over the years, Microsoft Word became the industry standard for word processing.",
+ "text": "\u00b7 Microsoft Word (1983) : Microsoft launched Word for MS-DOS in 1983, introducing a graphical user interface (GUI) and mouse support. Over the years, Microsoft Word became the industry standard for word processing.",
 "type": "paragraph",
 "payload": null,
 "name": "List-item",
@@ -468,7 +468,7 @@
 "__ref_s3_data": null
 }
 ],
- "text": "- \u00b7 Microsoft Office Suite : Microsoft continued to dominate with its Office Suite, integrating Word with other productivity tools like Excel and PowerPoint.",
+ "text": "\u00b7 Microsoft Office Suite : Microsoft continued to dominate with its Office Suite, integrating Word with other productivity tools like Excel and PowerPoint.",
 "type": "paragraph",
 "payload": null,
 "name": "List-item",
@@ -491,7 +491,7 @@
 "__ref_s3_data": null
 }
 ],
- "text": "- \u00b7 OpenOffice and LibreOffice : Open-source alternatives emerged in the early 2000s, offering free and flexible word processing options.",
+ "text": "\u00b7 OpenOffice and LibreOffice : Open-source alternatives emerged in the early 2000s, offering free and flexible word processing options.",
 "type": "paragraph",
 "payload": null,
 "name": "List-item",
@@ -514,7 +514,7 @@
 "__ref_s3_data": null
 }
 ],
- "text": "- \u00b7 Google Docs (2006) : The introduction of cloud-based word processing revolutionized collaboration. Google Docs enabled real-time editing and sharing, making it a staple for teams and remote work.",
+ "text": "\u00b7 Google Docs (2006) : The introduction of cloud-based word processing revolutionized collaboration. Google Docs enabled real-time editing and sharing, making it a staple for teams and remote work.",
 "type": "paragraph",
 "payload": null,
 "name": "List-item",
@@ -652,7 +652,7 @@
 "__ref_s3_data": null
 }
 ],
- "text": "- \u00b7 Academic and Technical Writing : Tools like LaTeX gained popularity among academics, scientists, and engineers. Unlike traditional word processors, LaTeX focuses on precise formatting, particularly for complex mathematical equations, scientific papers, and technical documents. It relies on a markup language to produce polished documents suitable for publishing.",
+ "text": "\u00b7 Academic and Technical Writing : Tools like LaTeX gained popularity among academics, scientists, and engineers. Unlike traditional word processors, LaTeX focuses on precise formatting, particularly for complex mathematical equations, scientific papers, and technical documents. It relies on a markup language to produce polished documents suitable for publishing.",
 "type": "paragraph",
 "payload": null,
 "name": "List-item",
@@ -675,7 +675,7 @@
 "__ref_s3_data": null
 }
 ],
- "text": "- \u00b7 Screenwriting Software : For screenwriters, tools like Final Draft and Celtx are specialized to handle scripts for film and television. These programs automate the formatting of dialogue, scene descriptions, and other elements unique to screenwriting.",
+ "text": "\u00b7 Screenwriting Software : For screenwriters, tools like Final Draft and Celtx are specialized to handle scripts for film and television. These programs automate the formatting of dialogue, scene descriptions, and other elements unique to screenwriting.",
 "type": "paragraph",
 "payload": null,
 "name": "List-item",
@@ -698,7 +698,7 @@
 "__ref_s3_data": null
 }
 ],
- "text": "- \u00b7 Legal Document Processors : Word processors tailored for legal professionals, like WordPerfect, offered features such as redlining (early version tracking) and document comparison. Even today, many law firms rely on these tools due to their robust formatting options for contracts and legal briefs.",
+ "text": "\u00b7 Legal Document Processors : Word processors tailored for legal professionals, like WordPerfect, offered features such as redlining (early version tracking) and document comparison. Even today, many law firms rely on these tools due to their robust formatting options for contracts and legal briefs.",
 "type": "paragraph",
 "payload": null,
 "name": "List-item",
@@ -767,7 +767,7 @@
 "__ref_s3_data": null
 }
 ],
- "text": "- 1. Undo/Redo : Introduced in the 1980s, the ability to undo mistakes and redo actions made experimentation and error correction much easier.",
+ "text": "1. Undo/Redo : Introduced in the 1980s, the ability to undo mistakes and redo actions made experimentation and error correction much easier.",
 "type": "paragraph",
 "payload": null,
 "name": "List-item",
@@ -790,7 +790,7 @@
 "__ref_s3_data": null
 }
 ],
- "text": "- 2. Spell Check and Grammar Check : By the 1990s, these became standard, allowing users to spot errors automatically.",
+ "text": "2. Spell Check and Grammar Check : By the 1990s, these became standard, allowing users to spot errors automatically.",
 "type": "paragraph",
 "payload": null,
 "name": "List-item",
@@ -813,7 +813,7 @@
 "__ref_s3_data": null
 }
 ],
- "text": "- 3. Templates : Pre-designed formats for documents, such as resumes, letters, and invoices, helped users save time.",
+ "text": "3. Templates : Pre-designed formats for documents, such as resumes, letters, and invoices, helped users save time.",
 "type": "paragraph",
 "payload": null,
 "name": "List-item",
@@ -836,7 +836,7 @@
 "__ref_s3_data": null
 }
 ],
- "text": "- 4. Track Changes : A game-changer for collaboration, this feature allowed multiple users to suggest edits while maintaining the original text.",
+ "text": "4. Track Changes : A game-changer for collaboration, this feature allowed multiple users to suggest edits while maintaining the original text.",
 "type": "paragraph",
 "payload": null,
 "name": "List-item",
@@ -859,7 +859,7 @@
 "__ref_s3_data": null
 }
 ],
- "text": "- 5. Real-Time Collaboration : Tools like Google Docs and Microsoft 365 enabled multiple users to edit the same document simultaneously, forever changing teamwork dynamics.",
+ "text": "5. Real-Time Collaboration : Tools like Google Docs and Microsoft 365 enabled multiple users to edit the same document simultaneously, forever changing teamwork dynamics.",
 "type": "paragraph",
 "payload": null,
 "name": "List-item",
@@ -928,7 +928,7 @@
 "__ref_s3_data": null
 }
 ],
- "text": "- \u00b7 Accessibility : Writers no longer needed expensive publishing equipment or training in typesetting to create polished work. This accessibility paved the way for selfpublishing, blogging, and even fan fiction communities.",
+ "text": "\u00b7 Accessibility : Writers no longer needed expensive publishing equipment or training in typesetting to create polished work. This accessibility paved the way for selfpublishing, blogging, and even fan fiction communities.",
 "type": "paragraph",
 "payload": null,
 "name": "List-item",
@@ -951,7 +951,7 @@
 "__ref_s3_data": null
 }
 ],
- "text": "- \u00b7 Education : Word processors became a cornerstone of education, teaching students not only how to write essays but also how to use technology effectively. Features like bibliography generators and integrated research tools enhanced learning.",
+ "text": "\u00b7 Education : Word processors became a cornerstone of education, teaching students not only how to write essays but also how to use technology effectively. Features like bibliography generators and integrated research tools enhanced learning.",
 "type": "paragraph",
 "payload": null,
 "name": "List-item",
@@ -974,7 +974,7 @@
 "__ref_s3_data": null
 }
 ],
- "text": "- \u00b7 Creative Writing : Writers gained powerful tools to organize their ideas. Programs like Scrivener allowed authors to manage large projects, from novels to screenplays, with features like chapter outlines and character notes.",
+ "text": "\u00b7 Creative Writing : Writers gained powerful tools to organize their ideas. Programs like Scrivener allowed authors to manage large projects, from novels to screenplays, with features like chapter outlines and character notes.",
 "type": "paragraph",
 "payload": null,
 "name": "List-item",
@@ -1043,7 +1043,7 @@
 "__ref_s3_data": null
 }
 ],
- "text": "- 1. Artificial Intelligence : Modern word processors are leveraging AI to suggest content improvements. Tools like Grammarly, ProWritingAid, and even native features in Word now analyze tone, conciseness, and clarity. Some AI systems can even generate entire paragraphs or rewrite sentences.",
+ "text": "1. Artificial Intelligence : Modern word processors are leveraging AI to suggest content improvements. Tools like Grammarly, ProWritingAid, and even native features in Word now analyze tone, conciseness, and clarity. Some AI systems can even generate entire paragraphs or rewrite sentences.",
 "type": "paragraph",
 "payload": null,
 "name": "List-item",
@@ -1066,7 +1066,7 @@
 "__ref_s3_data": null
 }
 ],
- "text": "- 2. Integration with Other Tools : Word processors are no longer standalone. They integrate with task managers, cloud storage, and project management platforms. For instance, Google Docs syncs with Google Drive, while Microsoft Word integrates seamlessly with OneDrive and Teams.",
+ "text": "2. Integration with Other Tools : Word processors are no longer standalone. They integrate with task managers, cloud storage, and project management platforms. For instance, Google Docs syncs with Google Drive, while Microsoft Word integrates seamlessly with OneDrive and Teams.",
 "type": "paragraph",
 "payload": null,
 "name": "List-item",
@@ -1089,7 +1089,7 @@
 "__ref_s3_data": null
 }
 ],
- "text": "- 3. Voice Typing : Speech-to-text capabilities have made word processing more accessible, particularly for those with disabilities. Tools like Dragon NaturallySpeaking and built-in options in Google Docs and Microsoft Word have made dictation mainstream.",
+ "text": "3. Voice Typing : Speech-to-text capabilities have made word processing more accessible, particularly for those with disabilities. Tools like Dragon NaturallySpeaking and built-in options in Google Docs and Microsoft Word have made dictation mainstream.",
 "type": "paragraph",
 "payload": null,
 "name": "List-item",
@@ -1112,7 +1112,7 @@
 "__ref_s3_data": null
 }
 ],
- "text": "- 4. Multimedia Documents : Word processing has expanded beyond text. Modern tools allow users to embed images, videos, charts, and interactive elements, transforming simple documents into rich multimedia experiences.",
+ "text": "4. Multimedia Documents : Word processing has expanded beyond text. Modern tools allow users to embed images, videos, charts, and interactive elements, transforming simple documents into rich multimedia experiences.",
 "type": "paragraph",
 "payload": null,
 "name": "List-item",
@@ -1135,7 +1135,7 @@
 "__ref_s3_data": null
 }
 ],
- "text": "- 5. Cross-Platform Accessibility : Thanks to cloud computing, documents can now be accessed and edited across devices. Whether you're on a desktop, tablet, or smartphone, you can continue working seamlessly.",
+ "text": "5. Cross-Platform Accessibility : Thanks to cloud computing, documents can now be accessed and edited across devices. Whether you're on a desktop, tablet, or smartphone, you can continue working seamlessly.",
 "type": "paragraph",
 "payload": null,
 "name": "List-item",
@@ -1204,7 +1204,7 @@
 "__ref_s3_data": null
 }
 ],
- "text": "- \u00b7 Fully AI-Assisted Writing : Imagine a word processor that understands your writing style, drafts emails, or creates entire essays based on minimal input.",
+ "text": "\u00b7 Fully AI-Assisted Writing : Imagine a word processor that understands your writing style, drafts emails, or creates entire essays based on minimal input.",
 "type": "paragraph",
 "payload": null,
 "name": "List-item",
@@ -1227,7 +1227,7 @@
 "__ref_s3_data": null
 }
 ],
- "text": "- \u00b7 Immersive Interfaces : As augmented reality (AR) and virtual reality (VR) technology advance, users may be able to write and edit in 3D spaces, collaborating in virtual environments.",
+ "text": "\u00b7 Immersive Interfaces : As augmented reality (AR) and virtual reality (VR) technology advance, users may be able to write and edit in 3D spaces, collaborating in virtual environments.",
 "type": "paragraph",
 "payload": null,
 "name": "List-item",
@@ -1250,7 +1250,7 @@
 "__ref_s3_data": null
 }
 ],
- "text": "- \u00b7 Hyper-Personalization : Word processors could offer dynamic suggestions based on industry-specific needs, user habits, or even regional language variations.",
+ "text": "\u00b7 Hyper-Personalization : Word processors could offer dynamic suggestions based on industry-specific needs, user habits, or even regional language variations.",
 "type": "paragraph",
 "payload": null,
 "name": "List-item",
diff --git a/tests/data/groundtruth/docling_v1/multi_page.md b/tests/data/groundtruth/docling_v1/multi_page.md
index cefba83..dc643b2 100644
--- a/tests/data/groundtruth/docling_v1/multi_page.md
+++ b/tests/data/groundtruth/docling_v1/multi_page.md
@@ -12,9 +12,9 @@ During this period, the term "word processing" didn't exist, but the typewriter
 
 The term "word processor" first emerged in the 1960s and referred to any system designed to streamline written communication and document production. Early word processors were not software programs but rather standalone machines.
 
-- · IBM MT/ST (Magnetic Tape/Selectric Typewriter) : Introduced in 1964, this machine combined IBM's Selectric typewriter with magnetic tape storage. It allowed users to record, edit, and replay typed content-an early example of digital text storage.
+· IBM MT/ST (Magnetic Tape/Selectric Typewriter) : Introduced in 1964, this machine combined IBM's Selectric typewriter with magnetic tape storage. It allowed users to record, edit, and replay typed content-an early example of digital text storage.
 
-- · Wang Laboratories : In the 1970s, Wang introduced dedicated word processing machines. These devices, like the Wang 1200, featured small screens and floppy disks, making them revolutionary for their time.
+· Wang Laboratories : In the 1970s, Wang introduced dedicated word processing machines. These devices, like the Wang 1200, featured small screens and floppy disks, making them revolutionary for their time.
 
 These machines were primarily used in offices, where secretarial pools benefited from their ability to make revisions without retyping entire documents.
 
@@ -22,9 +22,9 @@ These machines were primarily used in offices, where secretarial pools benefited
 
 The advent of personal computers in the late 1970s and early 1980s transformed word processing from a niche tool to an essential technology for businesses and individuals alike.
 
-- · WordStar (1978) : Developed for the CP/M operating system, WordStar was one of the first widely used word processing programs. It featured early examples of modern features like cut, copy, and paste.
+· WordStar (1978) : Developed for the CP/M operating system, WordStar was one of the first widely used word processing programs. It featured early examples of modern features like cut, copy, and paste.
 
-- · Microsoft Word (1983) : Microsoft launched Word for MS-DOS in 1983, introducing a graphical user interface (GUI) and mouse support. Over the years, Microsoft Word became the industry standard for word processing.
+· Microsoft Word (1983) : Microsoft launched Word for MS-DOS in 1983, introducing a graphical user interface (GUI) and mouse support. Over the years, Microsoft Word became the industry standard for word processing.
 
 Other notable software from this era included WordPerfect, which was popular among legal professionals, and Apple's MacWrite, which leveraged the Macintosh's graphical capabilities.
 
@@ -32,11 +32,11 @@ Other notable software from this era included WordPerfect, which was popular amo
 
 By the 1990s, word processing software had become more sophisticated, with features like spell check, grammar check, templates, and collaborative tools.
 
-- · Microsoft Office Suite : Microsoft continued to dominate with its Office Suite, integrating Word with other productivity tools like Excel and PowerPoint.
+· Microsoft Office Suite : Microsoft continued to dominate with its Office Suite, integrating Word with other productivity tools like Excel and PowerPoint.
 
-- · OpenOffice and LibreOffice : Open-source alternatives emerged in the early 2000s, offering free and flexible word processing options.
+· OpenOffice and LibreOffice : Open-source alternatives emerged in the early 2000s, offering free and flexible word processing options.
 
-- · Google Docs (2006) : The introduction of cloud-based word processing revolutionized collaboration. Google Docs enabled real-time editing and sharing, making it a staple for teams and remote work.
+· Google Docs (2006) : The introduction of cloud-based word processing revolutionized collaboration. Google Docs enabled real-time editing and sharing, making it a staple for teams and remote work.
 
 ## Future of Word Processing
 
@@ -48,58 +48,58 @@ From the clunky typewriters of the 19th century to the AI-powered cloud tools of
 
 In addition to general-purpose word processors, specialized tools have emerged to cater to specific industries and needs. These tools incorporate unique features tailored to their users' workflows:
 
-- · Academic and Technical Writing : Tools like LaTeX gained popularity among academics, scientists, and engineers. Unlike traditional word processors, LaTeX focuses on precise formatting, particularly for complex mathematical equations, scientific papers, and technical documents. It relies on a markup language to produce polished documents suitable for publishing.
+· Academic and Technical Writing : Tools like LaTeX gained popularity among academics, scientists, and engineers. Unlike traditional word processors, LaTeX focuses on precise formatting, particularly for complex mathematical equations, scientific papers, and technical documents. It relies on a markup language to produce polished documents suitable for publishing.
 
-- · Screenwriting Software : For screenwriters, tools like Final Draft and Celtx are specialized to handle scripts for film and television. These programs automate the formatting of dialogue, scene descriptions, and other elements unique to screenwriting.
+· Screenwriting Software : For screenwriters, tools like Final Draft and Celtx are specialized to handle scripts for film and television. These programs automate the formatting of dialogue, scene descriptions, and other elements unique to screenwriting.
 
-- · Legal Document Processors : Word processors tailored for legal professionals, like WordPerfect, offered features such as redlining (early version tracking) and document comparison. Even today, many law firms rely on these tools due to their robust formatting options for contracts and legal briefs.
+· Legal Document Processors : Word processors tailored for legal professionals, like WordPerfect, offered features such as redlining (early version tracking) and document comparison. Even today, many law firms rely on these tools due to their robust formatting options for contracts and legal briefs.
 
 ## Key Features That Changed Word Processing
 
 The evolution of word processors wasn't just about hardware or software improvements-it was about the features that revolutionized how people wrote and edited. Some of these transformative features include:
 
-- 1. Undo/Redo : Introduced in the 1980s, the ability to undo mistakes and redo actions made experimentation and error correction much easier.
+1. Undo/Redo : Introduced in the 1980s, the ability to undo mistakes and redo actions made experimentation and error correction much easier.
 
-- 2. Spell Check and Grammar Check : By the 1990s, these became standard, allowing users to spot errors automatically.
+2. Spell Check and Grammar Check : By the 1990s, these became standard, allowing users to spot errors automatically.
 
-- 3. Templates : Pre-designed formats for documents, such as resumes, letters, and invoices, helped users save time.
+3. Templates : Pre-designed formats for documents, such as resumes, letters, and invoices, helped users save time.
 
-- 4. Track Changes : A game-changer for collaboration, this feature allowed multiple users to suggest edits while maintaining the original text.
+4. Track Changes : A game-changer for collaboration, this feature allowed multiple users to suggest edits while maintaining the original text.
 
-- 5. Real-Time Collaboration : Tools like Google Docs and Microsoft 365 enabled multiple users to edit the same document simultaneously, forever changing teamwork dynamics.
+5. Real-Time Collaboration : Tools like Google Docs and Microsoft 365 enabled multiple users to edit the same document simultaneously, forever changing teamwork dynamics.
 
 ## The Cultural Impact of Word Processors
 
 The word processor didn't just change workplaces-it changed culture. It democratized writing, enabling anyone with access to a computer to produce professional-quality documents. This shift had profound implications for education, business, and creative fields:
 
-- · Accessibility : Writers no longer needed expensive publishing equipment or training in typesetting to create polished work. This accessibility paved the way for selfpublishing, blogging, and even fan fiction communities.
+· Accessibility : Writers no longer needed expensive publishing equipment or training in typesetting to create polished work. This accessibility paved the way for selfpublishing, blogging, and even fan fiction communities.
 
-- · Education : Word processors became a cornerstone of education, teaching students not only how to write essays but also how to use technology effectively. Features like bibliography generators and integrated research tools enhanced learning.
+· Education : Word processors became a cornerstone of education, teaching students not only how to write essays but also how to use technology effectively. Features like bibliography generators and integrated research tools enhanced learning.
 
-- · Creative Writing : Writers gained powerful tools to organize their ideas. Programs like Scrivener allowed authors to manage large projects, from novels to screenplays, with features like chapter outlines and character notes.
+· Creative Writing : Writers gained powerful tools to organize their ideas. Programs like Scrivener allowed authors to manage large projects, from novels to screenplays, with features like chapter outlines and character notes.
 
 ## Word Processors in a Post-Digital Era
 
 As we move further into the 21st century, the role of the word processor continues to evolve:
 
-- 1. Artificial Intelligence : Modern word processors are leveraging AI to suggest content improvements. Tools like Grammarly, ProWritingAid, and even native features in Word now analyze tone, conciseness, and clarity. Some AI systems can even generate entire paragraphs or rewrite sentences.
+1. Artificial Intelligence : Modern word processors are leveraging AI to suggest content improvements. Tools like Grammarly, ProWritingAid, and even native features in Word now analyze tone, conciseness, and clarity. Some AI systems can even generate entire paragraphs or rewrite sentences.
 
-- 2. Integration with Other Tools : Word processors are no longer standalone. They integrate with task managers, cloud storage, and project management platforms. For instance, Google Docs syncs with Google Drive, while Microsoft Word integrates seamlessly with OneDrive and Teams.
+2. Integration with Other Tools : Word processors are no longer standalone. They integrate with task managers, cloud storage, and project management platforms. For instance, Google Docs syncs with Google Drive, while Microsoft Word integrates seamlessly with OneDrive and Teams.
 
-- 3. Voice Typing : Speech-to-text capabilities have made word processing more accessible, particularly for those with disabilities. Tools like Dragon NaturallySpeaking and built-in options in Google Docs and Microsoft Word have made dictation mainstream.
+3. Voice Typing : Speech-to-text capabilities have made word processing more accessible, particularly for those with disabilities. Tools like Dragon NaturallySpeaking and built-in options in Google Docs and Microsoft Word have made dictation mainstream.
 
-- 4. Multimedia Documents : Word processing has expanded beyond text. Modern tools allow users to embed images, videos, charts, and interactive elements, transforming simple documents into rich multimedia experiences.
+4. Multimedia Documents : Word processing has expanded beyond text. Modern tools allow users to embed images, videos, charts, and interactive elements, transforming simple documents into rich multimedia experiences.
 
-- 5. Cross-Platform Accessibility : Thanks to cloud computing, documents can now be accessed and edited across devices. Whether you're on a desktop, tablet, or smartphone, you can continue working seamlessly.
+5. Cross-Platform Accessibility : Thanks to cloud computing, documents can now be accessed and edited across devices. Whether you're on a desktop, tablet, or smartphone, you can continue working seamlessly.
 
 ## A Glimpse Into the Future
 
 The word processor's future lies in adaptability and intelligence. Some exciting possibilities include:
 
-- · Fully AI-Assisted Writing : Imagine a word processor that understands your writing style, drafts emails, or creates entire essays based on minimal input.
+· Fully AI-Assisted Writing : Imagine a word processor that understands your writing style, drafts emails, or creates entire essays based on minimal input.
 
-- · Immersive Interfaces : As augmented reality (AR) and virtual reality (VR) technology advance, users may be able to write and edit in 3D spaces, collaborating in virtual environments.
+· Immersive Interfaces : As augmented reality (AR) and virtual reality (VR) technology advance, users may be able to write and edit in 3D spaces, collaborating in virtual environments.
 
-- · Hyper-Personalization : Word processors could offer dynamic suggestions based on industry-specific needs, user habits, or even regional language variations.
+· Hyper-Personalization : Word processors could offer dynamic suggestions based on industry-specific needs, user habits, or even regional language variations.
 
 The journey of the word processor-from clunky typewriters to AI-powered platformsreflects humanity's broader technological progress. What began as a tool to simply replace handwriting has transformed into a powerful ally for creativity, communication, and collaboration. As technology continues to advance, the word processor will undoubtedly remain at the heart of how we express ideas and connect with one another.
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v1/redp5110_sampled.doctags.txt b/tests/data/groundtruth/docling_v1/redp5110_sampled.doctags.txt
index d90ad51..a2a164c 100644
--- a/tests/data/groundtruth/docling_v1/redp5110_sampled.doctags.txt
+++ b/tests/data/groundtruth/docling_v1/redp5110_sampled.doctags.txt
@@ -17,10 +17,10 @@
 Highlights
-- GLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPH GLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH
-- GLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH
-- GLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH
-- GLYPHGLYPH GLYPH GLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPH GLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH
+GLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPH GLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH
+GLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH
+GLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH
+GLYPHGLYPH GLYPH GLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPH GLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH
@@ -33,15 +33,15 @@ With combined experiences and direct access to development groups, we're the experts in IBM DB2® for i. The DB2 for i Center of Excellence (CoE) can help you achieve-perhaps reexamine and exceed-your business requirements and gain more confidence and satisfaction in IBM product data management products and solutions. Who we are, some of what we do Global CoE engagements cover topics including: -- r Database performance and scalability -- r Advanced SQL knowledge and skills transfer -- r Business intelligence and analytics -- r DB2 Web Query -- r Query/400 modernization for better reporting and analysis capabilities -- r Database modernization and re-engineering -- r Data-centric architecture and design -- r Extremely large database and overcoming limits to growth -- r ISV education and enablement +r Database performance and scalability +r Advanced SQL knowledge and skills transfer +r Business intelligence and analytics +r DB2 Web Query +r Query/400 modernization for better reporting and analysis capabilities +r Database modernization and re-engineering +r Data-centric architecture and design +r Extremely large database and overcoming limits to growth +r ISV education and enablement Preface This IBMfi Redpaper™ publication provides information about the IBM i 7.2 feature of IBM DB2fi for i Row and Column Access Control (RCAC). It offers a broad description of the function and advantages of controlling access to data in a comprehensive and transparent way. This publication helps you understand the capabilities of RCAC and provides examples of defining, creating, and implementing the row permissions and column masks in a relational database environment. This paper is intended for database engineers, data-centric application developers, and security officers who want to design and implement RCAC as a part of their data control and governance policy. A solid background in IBM i object level security, DB2 for i relational database concepts, and SQL is assumed. @@ -64,15 +64,15 @@ Recent news headlines are filled with reports of data breaches and cyber-attacks impacting global businesses of all sizes. The Identity Theft Resource Center$^{1}$ reports that almost 5000 data breaches have occurred since 2005, exposing over 600 million records of data. The financial cost of these data breaches is skyrocketing. Studies from the Ponemon Institute$^{2}$ revealed that the average cost of a data breach increased in 2013 by 15% globally and resulted in a brand equity loss of $9.4 million per attack. The average cost that is incurred for each lost record containing sensitive information increased more than 9% to $145 per record. Businesses must make a serious effort to secure their data and recognize that securing information assets is a cost of doing business. In many parts of the world and in many industries, securing the data is required by law and subject to audits. Data security is no longer an option; it is a requirement. This chapter describes how you can secure and protect data in DB2 for i. 
The following topics are covered in this chapter: -- GLYPH Security fundamentals -- GLYPH Current state of IBM i security -- GLYPH DB2 for i security controls +GLYPH Security fundamentals +GLYPH Current state of IBM i security +GLYPH DB2 for i security controls 1.1 Security fundamentals Before reviewing database security techniques, there are two fundamental steps in securing information assets that must be described: -- GLYPH First, and most important, is the definition of a company's security policy . Without a security policy, there is no definition of what are acceptable practices for using, accessing, and storing information by who, what, when, where, and how. A security policy should minimally address three things: confidentiality, integrity, and availability. -- The monitoring and assessment of adherence to the security policy determines whether your security strategy is working. Often, IBM security consultants are asked to perform security assessments for companies without regard to the security policy. Although these assessments can be useful for observing how the system is defined and how data is being accessed, they cannot determine the level of security without a security policy. Without a security policy, it really is not an assessment as much as it is a baseline for monitoring the changes in the security settings that are captured. +GLYPH First, and most important, is the definition of a company's security policy . Without a security policy, there is no definition of what are acceptable practices for using, accessing, and storing information by who, what, when, where, and how. A security policy should minimally address three things: confidentiality, integrity, and availability. +The monitoring and assessment of adherence to the security policy determines whether your security strategy is working. Often, IBM security consultants are asked to perform security assessments for companies without regard to the security policy. Although these assessments can be useful for observing how the system is defined and how data is being accessed, they cannot determine the level of security without a security policy. Without a security policy, it really is not an assessment as much as it is a baseline for monitoring the changes in the security settings that are captured. A security policy is what defines whether the system and its settings are secure (or not). -- GLYPH The second fundamental in securing data assets is the use of resource security . If implemented properly, resource security prevents data breaches from both internal and external intrusions. Resource security controls are closely tied to the part of the security policy that defines who should have access to what information resources. A hacker might be good enough to get through your company firewalls and sift his way through to your system, but if they do not have explicit access to your database, the hacker cannot compromise your information assets. +GLYPH The second fundamental in securing data assets is the use of resource security . If implemented properly, resource security prevents data breaches from both internal and external intrusions. Resource security controls are closely tied to the part of the security policy that defines who should have access to what information resources. A hacker might be good enough to get through your company firewalls and sift his way through to your system, but if they do not have explicit access to your database, the hacker cannot compromise your information assets. 
With your eyes now open to the importance of securing information assets, the rest of this chapter reviews the methods that are available for securing database resources on IBM i. 1.2 Current state of IBM i security Because of the inherently secure nature of IBM i, many clients rely on the default system settings to protect their business data that is stored in DB2 for i. In most cases, this means no data protection because the default setting for the Create default public authority (QCRTAUT) system value is *CHANGE. @@ -90,9 +90,9 @@ Figure 1-2 Existing row and column controls 2.1.6 Change Function Usage CL command The following CL commands can be used to work with, display, or change function usage IDs: -- GLYPH Work Function Usage ( WRKFCNUSG ) -- GLYPH Change Function Usage ( CHGFCNUSG ) -- GLYPH Display Function Usage ( DSPFCNUSG ) +GLYPH Work Function Usage ( WRKFCNUSG ) +GLYPH Change Function Usage ( CHGFCNUSG ) +GLYPH Display Function Usage ( DSPFCNUSG ) For example, the following CHGFCNUSG command shows granting authorization to user HBEDOYA to administer and manage RCAC rules: CHGFCNUSG FCNID(QIBM_DB_SECADM) USER(HBEDOYA) USAGE(*ALLOWED) 2.1.7 Verifying function usage IDs for RCAC with the FUNCTION_USAGE view @@ -165,11 +165,11 @@ Table 3-1 Special registers and their corresponding values Figure 3-5 shows the difference in the special register values when an adopted authority is used: -- GLYPH A user connects to the server using the user profile ALICE. -- GLYPH USER and CURRENT USER initially have the same value of ALICE. -- GLYPH ALICE calls an SQL procedure that is named proc1, which is owned by user profile JOE and was created to adopt JOE's authority when it is called. -- GLYPH While the procedure is running, the special register USER still contains the value of ALICE because it excludes any adopted authority. The special register CURRENT USER contains the value of JOE because it includes any adopted authority. -- GLYPH When proc1 ends, the session reverts to its original state with both USER and CURRENT USER having the value of ALICE. +GLYPH A user connects to the server using the user profile ALICE. +GLYPH USER and CURRENT USER initially have the same value of ALICE. +GLYPH ALICE calls an SQL procedure that is named proc1, which is owned by user profile JOE and was created to adopt JOE's authority when it is called. +GLYPH While the procedure is running, the special register USER still contains the value of ALICE because it excludes any adopted authority. The special register CURRENT USER contains the value of JOE because it includes any adopted authority. +GLYPH When proc1 ends, the session reverts to its original state with both USER and CURRENT USER having the value of ALICE.
Figure 3-5 Special registers and adopted authority @@ -198,22 +198,22 @@ The VERIFY_GROUP_FOR_USER function was added in IBM i 7.2. Although it is primarily intended for use with RCAC permissions and masks, it can be used in other SQL statements. The first parameter must be one of these three special registers: SESSION_USER, USER, or CURRENT_USER. The second and subsequent parameters are a list of user or group profiles. Each of these values must be 1 - 10 characters in length. These values are not validated for their existence, which means that you can specify the names of user profiles that do not exist without receiving any kind of error. If a special register value is in the list of user profiles or it is a member of a group profile included in the list, the function returns a long integer value of 1. Otherwise, it returns a value of 0. It never returns the null value. Here is an example of using the VERIFY_GROUP_FOR_USER function: -- 1. There are user profiles for MGR, JANE, JUDY, and TONY. -- 2. The user profile JANE specifies a group profile of MGR. -- 3. If a user is connected to the server using user profile JANE, all of the following function invocations return a value of 1: +1. There are user profiles for MGR, JANE, JUDY, and TONY. +2. The user profile JANE specifies a group profile of MGR. +3. If a user is connected to the server using user profile JANE, all of the following function invocations return a value of 1: VERIFY_GROUP_FOR_USER (CURRENT_USER, 'MGR') VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JANE', 'MGR') VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JANE', 'MGR', 'STEVE') The following function invocation returns a value of 0: VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JUDY', 'TONY') RETURN CASE WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'HR', 'EMP' ) = 1 THEN EMPLOYEES . DATE_OF_BIRTH WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'MGR' ) = 1 AND SESSION_USER = EMPLOYEES . USER_ID THEN EMPLOYEES . DATE_OF_BIRTH WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'MGR' ) = 1 AND SESSION_USER <> EMPLOYEES . USER_ID THEN ( 9999 || '-' || MONTH ( EMPLOYEES . DATE_OF_BIRTH ) || '-' || DAY (EMPLOYEES.DATE_OF_BIRTH )) ELSE NULL END ENABLE ; -- 2. The other column to mask in this example is the TAX_ID information. In this example, the rules to enforce include the following ones: -- -Human Resources can see the unmasked TAX_ID of the employees. -- -Employees can see only their own unmasked TAX_ID. -- -Managers see a masked version of TAX_ID with the first five characters replaced with the X character (for example, XXX-XX-1234). -- -Any other person sees the entire TAX_ID as masked, for example, XXX-XX-XXXX. -- To implement this column mask, run the SQL statement that is shown in Example 3-9. +2. The other column to mask in this example is the TAX_ID information. In this example, the rules to enforce include the following ones: +-Human Resources can see the unmasked TAX_ID of the employees. +-Employees can see only their own unmasked TAX_ID. +-Managers see a masked version of TAX_ID with the first five characters replaced with the X character (for example, XXX-XX-1234). +-Any other person sees the entire TAX_ID as masked, for example, XXX-XX-XXXX. +To implement this column mask, run the SQL statement that is shown in Example 3-9. CREATE MASK HR_SCHEMA.MASK_TAX_ID_ON_EMPLOYEES ON HR_SCHEMA.EMPLOYEES AS EMPLOYEES FOR COLUMN TAX_ID RETURN CASE WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'HR' ) = 1 THEN EMPLOYEES . TAX_ID WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'MGR' ) = 1 AND SESSION_USER = EMPLOYEES . 
USER_ID THEN EMPLOYEES . TAX_ID WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'MGR' ) = 1 AND SESSION_USER <> EMPLOYEES . USER_ID THEN ( 'XXX-XX-' CONCAT QSYS2 . SUBSTR ( EMPLOYEES . TAX_ID , 8 , 4 ) ) WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'EMP' ) = 1 THEN EMPLOYEES . TAX_ID ELSE 'XXX-XX-XXXX' END ENABLE ; Example 3-9 Creating a mask on the TAX_ID column -- 3. Figure 3-10 shows the masks that are created in the HR_SCHEMA. +3. Figure 3-10 shows the masks that are created in the HR_SCHEMA.
Figure 3-10 Column masks shown in System i Navigator @@ -221,22 +221,22 @@ Figure 3-10 Column masks shown in System i Navigator 3.6.6 Activating RCAC Now that you have created the row permission and the two column masks, RCAC must be activated. The row permission and the two column masks are enabled (last clause in the scripts), but now you must activate RCAC on the table. To do so, complete the following steps: -- 1. Run the SQL statements that are shown in Example 3-10. +1. Run the SQL statements that are shown in Example 3-10. Example 3-10 Activating RCAC on the EMPLOYEES table -- /* Active Row Access Control (permissions) */ -- /* Active Column Access Control (masks) +/* Active Row Access Control (permissions) */ +/* Active Column Access Control (masks) */ ALTER TABLE HR_SCHEMA.EMPLOYEES ACTIVATE ROW ACCESS CONTROL ACTIVATE COLUMN ACCESS CONTROL; -- 2. Look at the definition of the EMPLOYEE table, as shown in Figure 3-11. To do this, from the main navigation pane of System i Navigator, click Schemas  HR_SCHEMA  Tables , right-click the EMPLOYEES table, and click Definition . +2. Look at the definition of the EMPLOYEE table, as shown in Figure 3-11. To do this, from the main navigation pane of System i Navigator, click Schemas  HR_SCHEMA  Tables , right-click the EMPLOYEES table, and click Definition .
Figure 3-11 Selecting the EMPLOYEES table from System i Navigator
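For completeness, the ACTIVATE clauses in Example 3-10 have DEACTIVATE counterparts using the same ALTER TABLE syntax, and the permission and masks seen in Figures 3-10 and 3-11 can also be listed with plain SQL instead of System i Navigator. Treat the following as a sketch: the QSYS2.SYSCONTROLS catalog view and its column names are an assumption here, not something shown in this excerpt.

-- Switch RCAC off again; the permission and masks stay defined, only
-- their enforcement on the table stops (counterpart to Example 3-10).
ALTER TABLE HR_SCHEMA.EMPLOYEES
    DEACTIVATE ROW ACCESS CONTROL
    DEACTIVATE COLUMN ACCESS CONTROL;

-- List the RCAC objects defined over the table (view and column names
-- are assumed; adjust to your catalog).
SELECT * FROM QSYS2.SYSCONTROLS
    WHERE TABLE_NAME = 'EMPLOYEES';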
-- 2. Figure 4-68 shows the Visual Explain of the same SQL statement, but with RCAC enabled. It is clear that the implementation of the SQL statement is more complex because the row permission rule becomes part of the WHERE clause. -- 3. Compare the advised indexes that are provided by the Optimizer without RCAC and with RCAC enabled. Figure 4-69 shows the index advice for the SQL statement without RCAC enabled. The index being advised is for the ORDER BY clause. +2. Figure 4-68 shows the Visual Explain of the same SQL statement, but with RCAC enabled. It is clear that the implementation of the SQL statement is more complex because the row permission rule becomes part of the WHERE clause. +3. Compare the advised indexes that are provided by the Optimizer without RCAC and with RCAC enabled. Figure 4-69 shows the index advice for the SQL statement without RCAC enabled. The index being advised is for the ORDER BY clause.
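The excerpt does not show the SQL statement whose plans are compared in Figures 4-68 and 4-69, so the following sketch is purely illustrative: a query of the analyzed shape and an index matching the ORDER BY advice. The table name reuses the chapter's EMPLOYEES example; the LAST_NAME key column and the index name are hypothetical. With RCAC active, the row permission predicate is folded into the WHERE clause, which is why the advised indexes then differ from the ORDER BY-only advice.

-- Hypothetical statement of the kind analyzed in Figures 4-68 and 4-69.
SELECT * FROM HR_SCHEMA.EMPLOYEES ORDER BY LAST_NAME;

-- Index matching the ORDER BY advice produced without RCAC
-- (index name and key column are illustrative only).
CREATE INDEX HR_SCHEMA.EMPLOYEES_BY_LAST_NAME
    ON HR_SCHEMA.EMPLOYEES ( LAST_NAME );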
Figure 4-68 Visual Explain with RCAC enabled diff --git a/tests/data/groundtruth/docling_v1/redp5110_sampled.json b/tests/data/groundtruth/docling_v1/redp5110_sampled.json index 36386c4..076f32c 100644 --- a/tests/data/groundtruth/docling_v1/redp5110_sampled.json +++ b/tests/data/groundtruth/docling_v1/redp5110_sampled.json @@ -305,7 +305,7 @@ "__ref_s3_data": null } ], - "text": "- GLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPH GLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH", + "text": "GLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPH GLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH", "type": "paragraph", "payload": null, "name": "List-item", @@ -328,7 +328,7 @@ "__ref_s3_data": null } ], - "text": "- GLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH", + "text": "GLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH", "type": "paragraph", "payload": null, "name": "List-item", @@ -351,7 +351,7 @@ "__ref_s3_data": null } ], - "text": "- GLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH", + "text": "GLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH", "type": "paragraph", "payload": null, "name": "List-item", @@ -374,7 +374,7 @@ "__ref_s3_data": null } ], - "text": "- GLYPHGLYPH GLYPH GLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPH GLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH", + "text": "GLYPHGLYPH GLYPH GLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPH GLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH 
GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH", "type": "paragraph", "payload": null, "name": "List-item", @@ -609,7 +609,7 @@ "__ref_s3_data": null } ], - "text": "- r Database performance and scalability", + "text": "r Database performance and scalability", "type": "paragraph", "payload": null, "name": "List-item", @@ -632,7 +632,7 @@ "__ref_s3_data": null } ], - "text": "- r Advanced SQL knowledge and skills transfer", + "text": "r Advanced SQL knowledge and skills transfer", "type": "paragraph", "payload": null, "name": "List-item", @@ -655,7 +655,7 @@ "__ref_s3_data": null } ], - "text": "- r Business intelligence and analytics", + "text": "r Business intelligence and analytics", "type": "paragraph", "payload": null, "name": "List-item", @@ -678,7 +678,7 @@ "__ref_s3_data": null } ], - "text": "- r DB2 Web Query", + "text": "r DB2 Web Query", "type": "paragraph", "payload": null, "name": "List-item", @@ -701,7 +701,7 @@ "__ref_s3_data": null } ], - "text": "- r Query/400 modernization for better reporting and analysis capabilities", + "text": "r Query/400 modernization for better reporting and analysis capabilities", "type": "paragraph", "payload": null, "name": "List-item", @@ -724,7 +724,7 @@ "__ref_s3_data": null } ], - "text": "- r Database modernization and re-engineering", + "text": "r Database modernization and re-engineering", "type": "paragraph", "payload": null, "name": "List-item", @@ -747,7 +747,7 @@ "__ref_s3_data": null } ], - "text": "- r Data-centric architecture and design", + "text": "r Data-centric architecture and design", "type": "paragraph", "payload": null, "name": "List-item", @@ -770,7 +770,7 @@ "__ref_s3_data": null } ], - "text": "- r Extremely large database and overcoming limits to growth", + "text": "r Extremely large database and overcoming limits to growth", "type": "paragraph", "payload": null, "name": "List-item", @@ -793,7 +793,7 @@ "__ref_s3_data": null } ], - "text": "- r ISV education and enablement", + "text": "r ISV education and enablement", "type": "paragraph", "payload": null, "name": "List-item", @@ -1130,7 +1130,7 @@ "__ref_s3_data": null } ], - "text": "- GLYPH Security fundamentals", + "text": "GLYPH Security fundamentals", "type": "paragraph", "payload": null, "name": "List-item", @@ -1153,7 +1153,7 @@ "__ref_s3_data": null } ], - "text": "- GLYPH Current state of IBM i security", + "text": "GLYPH Current state of IBM i security", "type": "paragraph", "payload": null, "name": "List-item", @@ -1176,7 +1176,7 @@ "__ref_s3_data": null } ], - "text": "- GLYPH DB2 for i security controls", + "text": "GLYPH DB2 for i security controls", "type": "paragraph", "payload": null, "name": "List-item", @@ -1291,7 +1291,7 @@ "__ref_s3_data": null } ], - "text": "- GLYPH First, and most important, is the definition of a company's security policy . Without a security policy, there is no definition of what are acceptable practices for using, accessing, and storing information by who, what, when, where, and how. A security policy should minimally address three things: confidentiality, integrity, and availability.", + "text": "GLYPH First, and most important, is the definition of a company's security policy . Without a security policy, there is no definition of what are acceptable practices for using, accessing, and storing information by who, what, when, where, and how. 
A security policy should minimally address three things: confidentiality, integrity, and availability.", "type": "paragraph", "payload": null, "name": "List-item", @@ -1314,7 +1314,7 @@ "__ref_s3_data": null } ], - "text": "- The monitoring and assessment of adherence to the security policy determines whether your security strategy is working. Often, IBM security consultants are asked to perform security assessments for companies without regard to the security policy. Although these assessments can be useful for observing how the system is defined and how data is being accessed, they cannot determine the level of security without a security policy. Without a security policy, it really is not an assessment as much as it is a baseline for monitoring the changes in the security settings that are captured.", + "text": "The monitoring and assessment of adherence to the security policy determines whether your security strategy is working. Often, IBM security consultants are asked to perform security assessments for companies without regard to the security policy. Although these assessments can be useful for observing how the system is defined and how data is being accessed, they cannot determine the level of security without a security policy. Without a security policy, it really is not an assessment as much as it is a baseline for monitoring the changes in the security settings that are captured.", "type": "paragraph", "payload": null, "name": "List-item", @@ -1360,7 +1360,7 @@ "__ref_s3_data": null } ], - "text": "- GLYPH The second fundamental in securing data assets is the use of resource security . If implemented properly, resource security prevents data breaches from both internal and external intrusions. Resource security controls are closely tied to the part of the security policy that defines who should have access to what information resources. A hacker might be good enough to get through your company firewalls and sift his way through to your system, but if they do not have explicit access to your database, the hacker cannot compromise your information assets.", + "text": "GLYPH The second fundamental in securing data assets is the use of resource security . If implemented properly, resource security prevents data breaches from both internal and external intrusions. Resource security controls are closely tied to the part of the security policy that defines who should have access to what information resources. 
A hacker might be good enough to get through your company firewalls and sift his way through to your system, but if they do not have explicit access to your database, the hacker cannot compromise your information assets.", "type": "paragraph", "payload": null, "name": "List-item", @@ -1687,7 +1687,7 @@ "__ref_s3_data": null } ], - "text": "- GLYPH Work Function Usage ( WRKFCNUSG )", + "text": "GLYPH Work Function Usage ( WRKFCNUSG )", "type": "paragraph", "payload": null, "name": "List-item", @@ -1710,7 +1710,7 @@ "__ref_s3_data": null } ], - "text": "- GLYPH Change Function Usage ( CHGFCNUSG )", + "text": "GLYPH Change Function Usage ( CHGFCNUSG )", "type": "paragraph", "payload": null, "name": "List-item", @@ -1733,7 +1733,7 @@ "__ref_s3_data": null } ], - "text": "- GLYPH Display Function Usage ( DSPFCNUSG )", + "text": "GLYPH Display Function Usage ( DSPFCNUSG )", "type": "paragraph", "payload": null, "name": "List-item", @@ -2558,7 +2558,7 @@ "__ref_s3_data": null } ], - "text": "- GLYPH A user connects to the server using the user profile ALICE.", + "text": "GLYPH A user connects to the server using the user profile ALICE.", "type": "paragraph", "payload": null, "name": "List-item", @@ -2581,7 +2581,7 @@ "__ref_s3_data": null } ], - "text": "- GLYPH USER and CURRENT USER initially have the same value of ALICE.", + "text": "GLYPH USER and CURRENT USER initially have the same value of ALICE.", "type": "paragraph", "payload": null, "name": "List-item", @@ -2604,7 +2604,7 @@ "__ref_s3_data": null } ], - "text": "- GLYPH ALICE calls an SQL procedure that is named proc1, which is owned by user profile JOE and was created to adopt JOE's authority when it is called.", + "text": "GLYPH ALICE calls an SQL procedure that is named proc1, which is owned by user profile JOE and was created to adopt JOE's authority when it is called.", "type": "paragraph", "payload": null, "name": "List-item", @@ -2627,7 +2627,7 @@ "__ref_s3_data": null } ], - "text": "- GLYPH While the procedure is running, the special register USER still contains the value of ALICE because it excludes any adopted authority. The special register CURRENT USER contains the value of JOE because it includes any adopted authority.", + "text": "GLYPH While the procedure is running, the special register USER still contains the value of ALICE because it excludes any adopted authority. The special register CURRENT USER contains the value of JOE because it includes any adopted authority.", "type": "paragraph", "payload": null, "name": "List-item", @@ -2650,7 +2650,7 @@ "__ref_s3_data": null } ], - "text": "- GLYPH When proc1 ends, the session reverts to its original state with both USER and CURRENT USER having the value of ALICE.", + "text": "GLYPH When proc1 ends, the session reverts to its original state with both USER and CURRENT USER having the value of ALICE.", "type": "paragraph", "payload": null, "name": "List-item", @@ -2913,7 +2913,7 @@ "__ref_s3_data": null } ], - "text": "- 1. There are user profiles for MGR, JANE, JUDY, and TONY.", + "text": "1. There are user profiles for MGR, JANE, JUDY, and TONY.", "type": "paragraph", "payload": null, "name": "List-item", @@ -2936,7 +2936,7 @@ "__ref_s3_data": null } ], - "text": "- 2. The user profile JANE specifies a group profile of MGR.", + "text": "2. The user profile JANE specifies a group profile of MGR.", "type": "paragraph", "payload": null, "name": "List-item", @@ -2959,7 +2959,7 @@ "__ref_s3_data": null } ], - "text": "- 3. 
If a user is connected to the server using user profile JANE, all of the following function invocations return a value of 1:", + "text": "3. If a user is connected to the server using user profile JANE, all of the following function invocations return a value of 1:", "type": "paragraph", "payload": null, "name": "List-item", @@ -3074,7 +3074,7 @@ "__ref_s3_data": null } ], - "text": "- 2. The other column to mask in this example is the TAX_ID information. In this example, the rules to enforce include the following ones:", + "text": "2. The other column to mask in this example is the TAX_ID information. In this example, the rules to enforce include the following ones:", "type": "paragraph", "payload": null, "name": "List-item", @@ -3097,7 +3097,7 @@ "__ref_s3_data": null } ], - "text": "- -Human Resources can see the unmasked TAX_ID of the employees.", + "text": "-Human Resources can see the unmasked TAX_ID of the employees.", "type": "paragraph", "payload": null, "name": "List-item", @@ -3120,7 +3120,7 @@ "__ref_s3_data": null } ], - "text": "- -Employees can see only their own unmasked TAX_ID.", + "text": "-Employees can see only their own unmasked TAX_ID.", "type": "paragraph", "payload": null, "name": "List-item", @@ -3143,7 +3143,7 @@ "__ref_s3_data": null } ], - "text": "- -Managers see a masked version of TAX_ID with the first five characters replaced with the X character (for example, XXX-XX-1234).", + "text": "-Managers see a masked version of TAX_ID with the first five characters replaced with the X character (for example, XXX-XX-1234).", "type": "paragraph", "payload": null, "name": "List-item", @@ -3166,7 +3166,7 @@ "__ref_s3_data": null } ], - "text": "- -Any other person sees the entire TAX_ID as masked, for example, XXX-XX-XXXX.", + "text": "-Any other person sees the entire TAX_ID as masked, for example, XXX-XX-XXXX.", "type": "paragraph", "payload": null, "name": "List-item", @@ -3189,7 +3189,7 @@ "__ref_s3_data": null } ], - "text": "- To implement this column mask, run the SQL statement that is shown in Example 3-9.", + "text": "To implement this column mask, run the SQL statement that is shown in Example 3-9.", "type": "paragraph", "payload": null, "name": "List-item", @@ -3258,7 +3258,7 @@ "__ref_s3_data": null } ], - "text": "- 3. Figure 3-10 shows the masks that are created in the HR_SCHEMA.", + "text": "3. Figure 3-10 shows the masks that are created in the HR_SCHEMA.", "type": "paragraph", "payload": null, "name": "List-item", @@ -3355,7 +3355,7 @@ "__ref_s3_data": null } ], - "text": "- 1. Run the SQL statements that are shown in Example 3-10.", + "text": "1. Run the SQL statements that are shown in Example 3-10.", "type": "paragraph", "payload": null, "name": "List-item", @@ -3401,7 +3401,7 @@ "__ref_s3_data": null } ], - "text": "- /* Active Row Access Control (permissions) */", + "text": "/* Active Row Access Control (permissions) */", "type": "paragraph", "payload": null, "name": "List-item", @@ -3424,7 +3424,7 @@ "__ref_s3_data": null } ], - "text": "- /* Active Column Access Control (masks)", + "text": "/* Active Column Access Control (masks)", "type": "paragraph", "payload": null, "name": "List-item", @@ -3539,7 +3539,7 @@ "__ref_s3_data": null } ], - "text": "- 2. Look at the definition of the EMPLOYEE table, as shown in Figure 3-11. To do this, from the main navigation pane of System i Navigator, click Schemas \uf0ae HR_SCHEMA \uf0ae Tables , right-click the EMPLOYEES table, and click Definition .", + "text": "2. 
Look at the definition of the EMPLOYEE table, as shown in Figure 3-11. To do this, from the main navigation pane of System i Navigator, click Schemas \uf0ae HR_SCHEMA \uf0ae Tables , right-click the EMPLOYEES table, and click Definition .", "type": "paragraph", "payload": null, "name": "List-item", @@ -3590,7 +3590,7 @@ "__ref_s3_data": null } ], - "text": "- 2. Figure 4-68 shows the Visual Explain of the same SQL statement, but with RCAC enabled. It is clear that the implementation of the SQL statement is more complex because the row permission rule becomes part of the WHERE clause.", + "text": "2. Figure 4-68 shows the Visual Explain of the same SQL statement, but with RCAC enabled. It is clear that the implementation of the SQL statement is more complex because the row permission rule becomes part of the WHERE clause.", "type": "paragraph", "payload": null, "name": "List-item", @@ -3613,7 +3613,7 @@ "__ref_s3_data": null } ], - "text": "- 3. Compare the advised indexes that are provided by the Optimizer without RCAC and with RCAC enabled. Figure 4-69 shows the index advice for the SQL statement without RCAC enabled. The index being advised is for the ORDER BY clause.", + "text": "3. Compare the advised indexes that are provided by the Optimizer without RCAC and with RCAC enabled. Figure 4-69 shows the index advice for the SQL statement without RCAC enabled. The index being advised is for the ORDER BY clause.", "type": "paragraph", "payload": null, "name": "List-item", diff --git a/tests/data/groundtruth/docling_v1/redp5110_sampled.md b/tests/data/groundtruth/docling_v1/redp5110_sampled.md index 8c6709b..186595e 100644 --- a/tests/data/groundtruth/docling_v1/redp5110_sampled.md +++ b/tests/data/groundtruth/docling_v1/redp5110_sampled.md @@ -18,13 +18,13 @@ Solution Brief IBM Systems Lab Services and Training ## Highlights -- GLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPH GLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH +GLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPH GLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH -- GLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH +GLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH -- GLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPH 
GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH +GLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH -- GLYPHGLYPH GLYPH GLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPH GLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH +GLYPHGLYPH GLYPH GLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPH GLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH GLYPHGLYPHGLYPH GLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPHGLYPH @@ -46,23 +46,23 @@ With combined experiences and direct access to development groups, we're the exp Global CoE engagements cover topics including: -- r Database performance and scalability +r Database performance and scalability -- r Advanced SQL knowledge and skills transfer +r Advanced SQL knowledge and skills transfer -- r Business intelligence and analytics +r Business intelligence and analytics -- r DB2 Web Query +r DB2 Web Query -- r Query/400 modernization for better reporting and analysis capabilities +r Query/400 modernization for better reporting and analysis capabilities -- r Database modernization and re-engineering +r Database modernization and re-engineering -- r Data-centric architecture and design +r Data-centric architecture and design -- r Extremely large database and overcoming limits to growth +r Extremely large database and overcoming limits to growth -- r ISV education and enablement +r ISV education and enablement ## Preface @@ -96,23 +96,23 @@ Businesses must make a serious effort to secure their data and recognize that se This chapter describes how you can secure and protect data in DB2 for i. The following topics are covered in this chapter: -- GLYPH Security fundamentals +GLYPH Security fundamentals -- GLYPH Current state of IBM i security +GLYPH Current state of IBM i security -- GLYPH DB2 for i security controls +GLYPH DB2 for i security controls ## 1.1 Security fundamentals Before reviewing database security techniques, there are two fundamental steps in securing information assets that must be described: -- GLYPH First, and most important, is the definition of a company's security policy . Without a security policy, there is no definition of what are acceptable practices for using, accessing, and storing information by who, what, when, where, and how. A security policy should minimally address three things: confidentiality, integrity, and availability. +GLYPH First, and most important, is the definition of a company's security policy . Without a security policy, there is no definition of what are acceptable practices for using, accessing, and storing information by who, what, when, where, and how. A security policy should minimally address three things: confidentiality, integrity, and availability. -- The monitoring and assessment of adherence to the security policy determines whether your security strategy is working. 
Often, IBM security consultants are asked to perform security assessments for companies without regard to the security policy. Although these assessments can be useful for observing how the system is defined and how data is being accessed, they cannot determine the level of security without a security policy. Without a security policy, it really is not an assessment as much as it is a baseline for monitoring the changes in the security settings that are captured. +The monitoring and assessment of adherence to the security policy determines whether your security strategy is working. Often, IBM security consultants are asked to perform security assessments for companies without regard to the security policy. Although these assessments can be useful for observing how the system is defined and how data is being accessed, they cannot determine the level of security without a security policy. Without a security policy, it really is not an assessment as much as it is a baseline for monitoring the changes in the security settings that are captured. A security policy is what defines whether the system and its settings are secure (or not). -- GLYPH The second fundamental in securing data assets is the use of resource security . If implemented properly, resource security prevents data breaches from both internal and external intrusions. Resource security controls are closely tied to the part of the security policy that defines who should have access to what information resources. A hacker might be good enough to get through your company firewalls and sift his way through to your system, but if they do not have explicit access to your database, the hacker cannot compromise your information assets. +GLYPH The second fundamental in securing data assets is the use of resource security . If implemented properly, resource security prevents data breaches from both internal and external intrusions. Resource security controls are closely tied to the part of the security policy that defines who should have access to what information resources. A hacker might be good enough to get through your company firewalls and sift his way through to your system, but if they do not have explicit access to your database, the hacker cannot compromise your information assets. With your eyes now open to the importance of securing information assets, the rest of this chapter reviews the methods that are available for securing database resources on IBM i. @@ -141,11 +141,11 @@ Figure 1-2 Existing row and column controls The following CL commands can be used to work with, display, or change function usage IDs: -- GLYPH Work Function Usage ( WRKFCNUSG ) +GLYPH Work Function Usage ( WRKFCNUSG ) -- GLYPH Change Function Usage ( CHGFCNUSG ) +GLYPH Change Function Usage ( CHGFCNUSG ) -- GLYPH Display Function Usage ( DSPFCNUSG ) +GLYPH Display Function Usage ( DSPFCNUSG ) For example, the following CHGFCNUSG command shows granting authorization to user HBEDOYA to administer and manage RCAC rules: @@ -244,15 +244,15 @@ Table 3-1 Special registers and their corresponding values Figure 3-5 shows the difference in the special register values when an adopted authority is used: -- GLYPH A user connects to the server using the user profile ALICE. +GLYPH A user connects to the server using the user profile ALICE. -- GLYPH USER and CURRENT USER initially have the same value of ALICE. +GLYPH USER and CURRENT USER initially have the same value of ALICE. 
-- GLYPH ALICE calls an SQL procedure that is named proc1, which is owned by user profile JOE and was created to adopt JOE's authority when it is called. +GLYPH ALICE calls an SQL procedure that is named proc1, which is owned by user profile JOE and was created to adopt JOE's authority when it is called. -- GLYPH While the procedure is running, the special register USER still contains the value of ALICE because it excludes any adopted authority. The special register CURRENT USER contains the value of JOE because it includes any adopted authority. +GLYPH While the procedure is running, the special register USER still contains the value of ALICE because it excludes any adopted authority. The special register CURRENT USER contains the value of JOE because it includes any adopted authority. -- GLYPH When proc1 ends, the session reverts to its original state with both USER and CURRENT USER having the value of ALICE. +GLYPH When proc1 ends, the session reverts to its original state with both USER and CURRENT USER having the value of ALICE. Figure 3-5 Special registers and adopted authority @@ -287,11 +287,11 @@ If a special register value is in the list of user profiles or it is a member of Here is an example of using the VERIFY_GROUP_FOR_USER function: -- 1. There are user profiles for MGR, JANE, JUDY, and TONY. +1. There are user profiles for MGR, JANE, JUDY, and TONY. -- 2. The user profile JANE specifies a group profile of MGR. +2. The user profile JANE specifies a group profile of MGR. -- 3. If a user is connected to the server using user profile JANE, all of the following function invocations return a value of 1: +3. If a user is connected to the server using user profile JANE, all of the following function invocations return a value of 1: VERIFY_GROUP_FOR_USER (CURRENT_USER, 'MGR') VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JANE', 'MGR') VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JANE', 'MGR', 'STEVE') The following function invocation returns a value of 0: VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JUDY', 'TONY') @@ -301,23 +301,23 @@ CASE WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'HR', 'EMP' ) = 1 THEN EMPLOYEES . DATE_OF_BIRTH WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'MGR' ) = 1 AND SESSION_USER = EMPLOYEES . USER_ID THEN EMPLOYEES . DATE_OF_BIRTH WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'MGR' ) = 1 AND SESSION_USER <> EMPLOYEES . USER_ID THEN ( 9999 || '-' || MONTH ( EMPLOYEES . DATE_OF_BIRTH ) || '-' || DAY (EMPLOYEES.DATE_OF_BIRTH )) ELSE NULL END ENABLE ; -- 2. The other column to mask in this example is the TAX_ID information. In this example, the rules to enforce include the following ones: +2. The other column to mask in this example is the TAX_ID information. In this example, the rules to enforce include the following ones: -- -Human Resources can see the unmasked TAX_ID of the employees. +-Human Resources can see the unmasked TAX_ID of the employees. -- -Employees can see only their own unmasked TAX_ID. +-Employees can see only their own unmasked TAX_ID. -- -Managers see a masked version of TAX_ID with the first five characters replaced with the X character (for example, XXX-XX-1234). +-Managers see a masked version of TAX_ID with the first five characters replaced with the X character (for example, XXX-XX-1234). -- -Any other person sees the entire TAX_ID as masked, for example, XXX-XX-XXXX. +-Any other person sees the entire TAX_ID as masked, for example, XXX-XX-XXXX. -- To implement this column mask, run the SQL statement that is shown in Example 3-9. 
+To implement this column mask, run the SQL statement that is shown in Example 3-9. CREATE MASK HR_SCHEMA.MASK_TAX_ID_ON_EMPLOYEES ON HR_SCHEMA.EMPLOYEES AS EMPLOYEES FOR COLUMN TAX_ID RETURN CASE WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'HR' ) = 1 THEN EMPLOYEES . TAX_ID WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'MGR' ) = 1 AND SESSION_USER = EMPLOYEES . USER_ID THEN EMPLOYEES . TAX_ID WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'MGR' ) = 1 AND SESSION_USER <> EMPLOYEES . USER_ID THEN ( 'XXX-XX-' CONCAT QSYS2 . SUBSTR ( EMPLOYEES . TAX_ID , 8 , 4 ) ) WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'EMP' ) = 1 THEN EMPLOYEES . TAX_ID ELSE 'XXX-XX-XXXX' END ENABLE ; Example 3-9 Creating a mask on the TAX_ID column -- 3. Figure 3-10 shows the masks that are created in the HR_SCHEMA. +3. Figure 3-10 shows the masks that are created in the HR_SCHEMA. Figure 3-10 Column masks shown in System i Navigator @@ -326,13 +326,13 @@ Figure 3-10 Column masks shown in System i Navigator Now that you have created the row permission and the two column masks, RCAC must be activated. The row permission and the two column masks are enabled (last clause in the scripts), but now you must activate RCAC on the table. To do so, complete the following steps: -- 1. Run the SQL statements that are shown in Example 3-10. +1. Run the SQL statements that are shown in Example 3-10. ## Example 3-10 Activating RCAC on the EMPLOYEES table -- /* Active Row Access Control (permissions) */ +/* Active Row Access Control (permissions) */ -- /* Active Column Access Control (masks) +/* Active Column Access Control (masks) */ @@ -342,14 +342,14 @@ ACTIVATE ROW ACCESS CONTROL ACTIVATE COLUMN ACCESS CONTROL; -- 2. Look at the definition of the EMPLOYEE table, as shown in Figure 3-11. To do this, from the main navigation pane of System i Navigator, click Schemas  HR_SCHEMA  Tables , right-click the EMPLOYEES table, and click Definition . +2. Look at the definition of the EMPLOYEE table, as shown in Figure 3-11. To do this, from the main navigation pane of System i Navigator, click Schemas  HR_SCHEMA  Tables , right-click the EMPLOYEES table, and click Definition . Figure 3-11 Selecting the EMPLOYEES table from System i Navigator -- 2. Figure 4-68 shows the Visual Explain of the same SQL statement, but with RCAC enabled. It is clear that the implementation of the SQL statement is more complex because the row permission rule becomes part of the WHERE clause. +2. Figure 4-68 shows the Visual Explain of the same SQL statement, but with RCAC enabled. It is clear that the implementation of the SQL statement is more complex because the row permission rule becomes part of the WHERE clause. -- 3. Compare the advised indexes that are provided by the Optimizer without RCAC and with RCAC enabled. Figure 4-69 shows the index advice for the SQL statement without RCAC enabled. The index being advised is for the ORDER BY clause. +3. Compare the advised indexes that are provided by the Optimizer without RCAC and with RCAC enabled. Figure 4-69 shows the index advice for the SQL statement without RCAC enabled. The index being advised is for the ORDER BY clause. 
Figure 4-68 Visual Explain with RCAC enabled diff --git a/tests/data/groundtruth/docling_v2/2203.01017v2.json b/tests/data/groundtruth/docling_v2/2203.01017v2.json index eabd1a0..3d72e7e 100644 --- a/tests/data/groundtruth/docling_v2/2203.01017v2.json +++ b/tests/data/groundtruth/docling_v2/2203.01017v2.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "2203.01017v2", "origin": { "mimetype": "application/pdf", @@ -1340,7 +1340,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/13", @@ -2096,7 +2096,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/39", @@ -3055,7 +3055,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/72", @@ -3086,7 +3086,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/73", @@ -3117,7 +3117,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/74", @@ -3148,7 +3148,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/75", @@ -9249,7 +9249,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/284", @@ -9280,7 +9280,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/285", @@ -11288,7 +11288,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/354", @@ -11348,7 +11348,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/356", @@ -11379,7 +11379,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/357", @@ -11410,7 +11410,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/358", @@ -11441,7 +11441,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/359", @@ -11472,7 +11472,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/360", @@ -11503,7 +11503,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/361", @@ -11534,7 +11534,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/362", @@ -11565,7 +11565,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/363", @@ -11596,7 +11596,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/364", @@ -11627,7 +11627,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/365", @@ -11658,7 +11658,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/366", @@ -11689,7 +11689,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/367", @@ -11720,7 
+11720,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/368", @@ -11751,7 +11751,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/369", @@ -11782,7 +11782,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/370", @@ -11813,7 +11813,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/371", @@ -11844,7 +11844,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/372", @@ -11875,7 +11875,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/373", @@ -11906,7 +11906,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/374", @@ -11937,7 +11937,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/375", @@ -11968,7 +11968,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/376", @@ -11999,7 +11999,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/377", @@ -12030,7 +12030,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/378", @@ -12061,7 +12061,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/379", @@ -12092,7 +12092,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/380", @@ -12181,7 +12181,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/383", @@ -12212,7 +12212,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/384", @@ -12243,7 +12243,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/385", @@ -12274,7 +12274,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/386", @@ -12305,7 +12305,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/387", @@ -12336,7 +12336,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/388", @@ -12367,7 +12367,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/389", @@ -12398,7 +12398,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/390", @@ -12429,7 +12429,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/391", @@ -12460,7 +12460,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/392", @@ -12491,7 +12491,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/393", @@ -12522,7 +12522,7 @@ "formatting": 
null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/394", @@ -12553,7 +12553,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/395", @@ -12584,7 +12584,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/396", @@ -12923,7 +12923,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/407", @@ -12954,7 +12954,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/408", @@ -12985,7 +12985,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/409", @@ -13016,7 +13016,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/410", @@ -13047,7 +13047,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/411", @@ -14906,7 +14906,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/475", @@ -14937,7 +14937,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/476", @@ -15055,7 +15055,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/480", @@ -15086,7 +15086,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/481", @@ -15117,7 +15117,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/482", @@ -15148,7 +15148,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/483", @@ -15179,7 +15179,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/484", @@ -15268,7 +15268,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/487", @@ -15299,7 +15299,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/488", @@ -15330,7 +15330,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/489", @@ -15361,7 +15361,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/490", @@ -15392,7 +15392,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/491", @@ -15452,7 +15452,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/493", @@ -15483,7 +15483,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/494", @@ -15514,7 +15514,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/495", @@ -15545,7 +15545,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/496", diff --git a/tests/data/groundtruth/docling_v2/2206.01062.json 
b/tests/data/groundtruth/docling_v2/2206.01062.json index 940d304..6ea8a61 100644 --- a/tests/data/groundtruth/docling_v2/2206.01062.json +++ b/tests/data/groundtruth/docling_v2/2206.01062.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "2206.01062", "origin": { "mimetype": "application/pdf", @@ -10866,7 +10866,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/356", @@ -10897,7 +10897,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/357", @@ -10928,7 +10928,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/358", @@ -10959,7 +10959,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/359", @@ -11048,7 +11048,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/362", @@ -12430,7 +12430,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/409", @@ -12461,7 +12461,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/410", @@ -12492,7 +12492,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/411", @@ -12523,7 +12523,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/412", @@ -12554,7 +12554,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/413", @@ -12585,7 +12585,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/414", @@ -14713,7 +14713,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/487", @@ -14744,7 +14744,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/488", @@ -14775,7 +14775,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/489", @@ -14806,7 +14806,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/490", @@ -14837,7 +14837,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/491", @@ -14868,7 +14868,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/492", @@ -14899,7 +14899,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/493", @@ -14930,7 +14930,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/494", @@ -14961,7 +14961,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/495", @@ -14992,7 +14992,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/496", @@ -15023,7 +15023,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": 
"" }, { "self_ref": "#/texts/497", @@ -15054,7 +15054,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/498", @@ -15085,7 +15085,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/499", @@ -15580,7 +15580,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/516", @@ -15611,7 +15611,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/517", @@ -15642,7 +15642,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/518", @@ -15673,7 +15673,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/519", @@ -15704,7 +15704,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/520", @@ -15735,7 +15735,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/521", @@ -15766,7 +15766,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/522", @@ -15797,7 +15797,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/523", @@ -15828,7 +15828,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/524", @@ -15859,7 +15859,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" } ], "pictures": [ diff --git a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json index 8ce7f74..bdf39cc 100644 --- a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json +++ b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "2305.03393v1-pg9", "origin": { "mimetype": "application/pdf", diff --git a/tests/data/groundtruth/docling_v2/2305.03393v1.doctags.txt b/tests/data/groundtruth/docling_v2/2305.03393v1.doctags.txt index e462c32..167c872 100644 --- a/tests/data/groundtruth/docling_v2/2305.03393v1.doctags.txt +++ b/tests/data/groundtruth/docling_v2/2305.03393v1.doctags.txt @@ -60,8 +60,6 @@ Optimized Table Tokenization for Table Structure Recognition 7 Fig. 3. OTSL description of table structure: A - table example; B - graphical representation of table structure; C - mapping structure on a grid; D - OTSL structure encoding; E - explanation on cell encoding -4 - 2d merges: "C", "L", "U", "X" - 4.2 Language Syntax The OTSL representation follows these syntax rules: 1. Left-looking cell rule : The left neighbour of an "L" cell must be either another "L" cell or a "C" cell. 
diff --git a/tests/data/groundtruth/docling_v2/2305.03393v1.json b/tests/data/groundtruth/docling_v2/2305.03393v1.json index dd3f173..c64ce68 100644 --- a/tests/data/groundtruth/docling_v2/2305.03393v1.json +++ b/tests/data/groundtruth/docling_v2/2305.03393v1.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "2305.03393v1", "origin": { "mimetype": "application/pdf", @@ -164,23 +164,20 @@ { "cref": "#/pictures/2" }, - { - "cref": "#/texts/228" - }, { "cref": "#/texts/229" }, { - "cref": "#/groups/2" - }, - { - "cref": "#/texts/232" + "cref": "#/texts/230" }, { "cref": "#/groups/3" }, { - "cref": "#/texts/237" + "cref": "#/texts/233" + }, + { + "cref": "#/groups/4" }, { "cref": "#/texts/238" @@ -207,10 +204,10 @@ "cref": "#/texts/245" }, { - "cref": "#/pictures/3" + "cref": "#/texts/246" }, { - "cref": "#/texts/319" + "cref": "#/pictures/3" }, { "cref": "#/texts/320" @@ -228,10 +225,10 @@ "cref": "#/texts/324" }, { - "cref": "#/tables/0" + "cref": "#/texts/325" }, { - "cref": "#/texts/326" + "cref": "#/tables/0" }, { "cref": "#/texts/327" @@ -246,19 +243,19 @@ "cref": "#/texts/330" }, { - "cref": "#/tables/1" + "cref": "#/texts/331" }, { - "cref": "#/texts/332" + "cref": "#/tables/1" }, { "cref": "#/texts/333" }, { - "cref": "#/pictures/4" + "cref": "#/texts/334" }, { - "cref": "#/texts/432" + "cref": "#/pictures/4" }, { "cref": "#/texts/433" @@ -270,10 +267,10 @@ "cref": "#/texts/435" }, { - "cref": "#/pictures/5" + "cref": "#/texts/436" }, { - "cref": "#/texts/448" + "cref": "#/pictures/5" }, { "cref": "#/texts/449" @@ -294,25 +291,28 @@ "cref": "#/texts/454" }, { - "cref": "#/groups/4" - }, - { - "cref": "#/texts/459" - }, - { - "cref": "#/texts/460" + "cref": "#/texts/455" }, { "cref": "#/groups/5" }, { - "cref": "#/texts/474" + "cref": "#/texts/460" + }, + { + "cref": "#/texts/461" + }, + { + "cref": "#/groups/6" }, { "cref": "#/texts/475" }, { - "cref": "#/groups/6" + "cref": "#/texts/476" + }, + { + "cref": "#/groups/7" } ], "content_layer": "body", @@ -372,18 +372,15 @@ { "self_ref": "#/groups/2", "parent": { - "cref": "#/body" + "cref": "#/pictures/2" }, "children": [ { - "cref": "#/texts/230" - }, - { - "cref": "#/texts/231" + "cref": "#/texts/215" } ], "content_layer": "body", - "name": "list", + "name": "group", "label": "list" }, { @@ -393,16 +390,10 @@ }, "children": [ { - "cref": "#/texts/233" + "cref": "#/texts/231" }, { - "cref": "#/texts/234" - }, - { - "cref": "#/texts/235" - }, - { - "cref": "#/texts/236" + "cref": "#/texts/232" } ], "content_layer": "body", @@ -416,16 +407,16 @@ }, "children": [ { - "cref": "#/texts/455" + "cref": "#/texts/234" }, { - "cref": "#/texts/456" + "cref": "#/texts/235" }, { - "cref": "#/texts/457" + "cref": "#/texts/236" }, { - "cref": "#/texts/458" + "cref": "#/texts/237" } ], "content_layer": "body", @@ -439,8 +430,28 @@ }, "children": [ { - "cref": "#/texts/461" + "cref": "#/texts/456" }, + { + "cref": "#/texts/457" + }, + { + "cref": "#/texts/458" + }, + { + "cref": "#/texts/459" + } + ], + "content_layer": "body", + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/6", + "parent": { + "cref": "#/body" + }, + "children": [ { "cref": "#/texts/462" }, @@ -476,6 +487,9 @@ }, { "cref": "#/texts/473" + }, + { + "cref": "#/texts/474" } ], "content_layer": "body", @@ -483,14 +497,11 @@ "label": "list" }, { - "self_ref": "#/groups/6", + "self_ref": "#/groups/7", "parent": { "cref": "#/body" }, "children": [ - { - "cref": "#/texts/476" - }, { "cref": "#/texts/477" }, @@ 
-505,24 +516,13 @@ }, { "cref": "#/texts/481" - } - ], - "content_layer": "body", - "name": "list", - "label": "list" - }, - { - "self_ref": "#/groups/7", - "parent": { - "cref": "#/pictures/2" - }, - "children": [ + }, { "cref": "#/texts/482" } ], "content_layer": "body", - "name": "group", + "name": "list", "label": "list" } ], @@ -4970,7 +4970,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/153", @@ -5001,7 +5001,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/154", @@ -5032,7 +5032,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/155", @@ -5063,7 +5063,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/156", @@ -5094,7 +5094,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/157", @@ -6780,6 +6780,37 @@ }, { "self_ref": "#/texts/215", + "parent": { + "cref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [ + { + "page_no": 7, + "bbox": { + "l": 334.51135, + "t": 519.19159, + "r": 426.59875, + "b": 512.97711, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 33 + ] + } + ], + "orig": "4 - 2d merges: \"C\", \"L\", \"U\", \"X\"", + "text": "4 - 2d merges: \"C\", \"L\", \"U\", \"X\"", + "formatting": null, + "hyperlink": null, + "enumerated": false, + "marker": "" + }, + { + "self_ref": "#/texts/216", "parent": { "cref": "#/pictures/2" }, @@ -6808,7 +6839,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/216", + "self_ref": "#/texts/217", "parent": { "cref": "#/pictures/2" }, @@ -6837,7 +6868,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/217", + "self_ref": "#/texts/218", "parent": { "cref": "#/pictures/2" }, @@ -6866,7 +6897,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/218", + "self_ref": "#/texts/219", "parent": { "cref": "#/pictures/2" }, @@ -6895,7 +6926,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/219", + "self_ref": "#/texts/220", "parent": { "cref": "#/pictures/2" }, @@ -6924,7 +6955,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/220", + "self_ref": "#/texts/221", "parent": { "cref": "#/pictures/2" }, @@ -6953,7 +6984,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/221", + "self_ref": "#/texts/222", "parent": { "cref": "#/pictures/2" }, @@ -6982,7 +7013,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/222", + "self_ref": "#/texts/223", "parent": { "cref": "#/pictures/2" }, @@ -7011,7 +7042,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/223", + "self_ref": "#/texts/224", "parent": { "cref": "#/pictures/2" }, @@ -7040,7 +7071,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/224", + "self_ref": "#/texts/225", "parent": { "cref": "#/pictures/2" }, @@ -7069,7 +7100,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/225", + "self_ref": "#/texts/226", "parent": { "cref": "#/pictures/2" }, @@ -7098,7 +7129,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/226", + "self_ref": "#/texts/227", "parent": { "cref": "#/pictures/2" }, @@ -7127,7 +7158,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/227", + "self_ref": "#/texts/228", "parent": { "cref": "#/pictures/2" }, @@ -7156,7 +7187,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/228", + "self_ref": "#/texts/229", "parent": { "cref": "#/body" }, @@ -7186,7 +7217,7 @@ "level": 1 }, { - 
"self_ref": "#/texts/229", + "self_ref": "#/texts/230", "parent": { "cref": "#/body" }, @@ -7215,9 +7246,9 @@ "hyperlink": null }, { - "self_ref": "#/texts/230", + "self_ref": "#/texts/231", "parent": { - "cref": "#/groups/2" + "cref": "#/groups/3" }, "children": [], "content_layer": "body", @@ -7243,12 +7274,12 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { - "self_ref": "#/texts/231", + "self_ref": "#/texts/232", "parent": { - "cref": "#/groups/2" + "cref": "#/groups/3" }, "children": [], "content_layer": "body", @@ -7274,10 +7305,10 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { - "self_ref": "#/texts/232", + "self_ref": "#/texts/233", "parent": { "cref": "#/body" }, @@ -7307,9 +7338,9 @@ "level": 1 }, { - "self_ref": "#/texts/233", + "self_ref": "#/texts/234", "parent": { - "cref": "#/groups/3" + "cref": "#/groups/4" }, "children": [], "content_layer": "body", @@ -7335,12 +7366,12 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { - "self_ref": "#/texts/234", + "self_ref": "#/texts/235", "parent": { - "cref": "#/groups/3" + "cref": "#/groups/4" }, "children": [], "content_layer": "body", @@ -7366,12 +7397,12 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { - "self_ref": "#/texts/235", + "self_ref": "#/texts/236", "parent": { - "cref": "#/groups/3" + "cref": "#/groups/4" }, "children": [], "content_layer": "body", @@ -7397,12 +7428,12 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { - "self_ref": "#/texts/236", + "self_ref": "#/texts/237", "parent": { - "cref": "#/groups/3" + "cref": "#/groups/4" }, "children": [], "content_layer": "body", @@ -7428,10 +7459,10 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { - "self_ref": "#/texts/237", + "self_ref": "#/texts/238", "parent": { "cref": "#/body" }, @@ -7460,7 +7491,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/238", + "self_ref": "#/texts/239", "parent": { "cref": "#/body" }, @@ -7489,7 +7520,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/239", + "self_ref": "#/texts/240", "parent": { "cref": "#/body" }, @@ -7518,7 +7549,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/240", + "self_ref": "#/texts/241", "parent": { "cref": "#/body" }, @@ -7547,7 +7578,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/241", + "self_ref": "#/texts/242", "parent": { "cref": "#/body" }, @@ -7576,7 +7607,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/242", + "self_ref": "#/texts/243", "parent": { "cref": "#/body" }, @@ -7606,7 +7637,7 @@ "level": 1 }, { - "self_ref": "#/texts/243", + "self_ref": "#/texts/244", "parent": { "cref": "#/body" }, @@ -7635,7 +7666,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/244", + "self_ref": "#/texts/245", "parent": { "cref": "#/body" }, @@ -7665,7 +7696,7 @@ "level": 1 }, { - "self_ref": "#/texts/245", + "self_ref": "#/texts/246", "parent": { "cref": "#/body" }, @@ -7694,7 +7725,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/246", + "self_ref": "#/texts/247", "parent": { "cref": "#/pictures/3" }, @@ -7723,7 +7754,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/247", + "self_ref": "#/texts/248", "parent": { "cref": "#/pictures/3" }, @@ -7752,7 +7783,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/248", + "self_ref": "#/texts/249", "parent": { "cref": "#/pictures/3" }, @@ -7781,7 +7812,7 @@ "hyperlink": 
null }, { - "self_ref": "#/texts/249", + "self_ref": "#/texts/250", "parent": { "cref": "#/pictures/3" }, @@ -7810,7 +7841,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/250", + "self_ref": "#/texts/251", "parent": { "cref": "#/pictures/3" }, @@ -7839,7 +7870,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/251", + "self_ref": "#/texts/252", "parent": { "cref": "#/pictures/3" }, @@ -7868,7 +7899,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/252", + "self_ref": "#/texts/253", "parent": { "cref": "#/pictures/3" }, @@ -7897,7 +7928,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/253", + "self_ref": "#/texts/254", "parent": { "cref": "#/pictures/3" }, @@ -7926,7 +7957,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/254", + "self_ref": "#/texts/255", "parent": { "cref": "#/pictures/3" }, @@ -7955,7 +7986,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/255", + "self_ref": "#/texts/256", "parent": { "cref": "#/pictures/3" }, @@ -7984,7 +8015,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/256", + "self_ref": "#/texts/257", "parent": { "cref": "#/pictures/3" }, @@ -8013,7 +8044,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/257", + "self_ref": "#/texts/258", "parent": { "cref": "#/pictures/3" }, @@ -8042,7 +8073,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/258", + "self_ref": "#/texts/259", "parent": { "cref": "#/pictures/3" }, @@ -8071,7 +8102,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/259", + "self_ref": "#/texts/260", "parent": { "cref": "#/pictures/3" }, @@ -8100,7 +8131,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/260", + "self_ref": "#/texts/261", "parent": { "cref": "#/pictures/3" }, @@ -8129,7 +8160,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/261", + "self_ref": "#/texts/262", "parent": { "cref": "#/pictures/3" }, @@ -8158,7 +8189,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/262", + "self_ref": "#/texts/263", "parent": { "cref": "#/pictures/3" }, @@ -8187,7 +8218,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/263", + "self_ref": "#/texts/264", "parent": { "cref": "#/pictures/3" }, @@ -8216,7 +8247,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/264", + "self_ref": "#/texts/265", "parent": { "cref": "#/pictures/3" }, @@ -8245,7 +8276,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/265", + "self_ref": "#/texts/266", "parent": { "cref": "#/pictures/3" }, @@ -8274,7 +8305,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/266", + "self_ref": "#/texts/267", "parent": { "cref": "#/pictures/3" }, @@ -8303,7 +8334,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/267", + "self_ref": "#/texts/268", "parent": { "cref": "#/pictures/3" }, @@ -8332,7 +8363,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/268", + "self_ref": "#/texts/269", "parent": { "cref": "#/pictures/3" }, @@ -8361,7 +8392,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/269", + "self_ref": "#/texts/270", "parent": { "cref": "#/pictures/3" }, @@ -8390,7 +8421,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/270", + "self_ref": "#/texts/271", "parent": { "cref": "#/pictures/3" }, @@ -8419,7 +8450,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/271", + "self_ref": "#/texts/272", "parent": { "cref": "#/pictures/3" }, @@ -8448,7 +8479,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/272", + "self_ref": "#/texts/273", "parent": { "cref": "#/pictures/3" }, @@ -8477,7 +8508,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/273", + "self_ref": "#/texts/274", "parent": { "cref": "#/pictures/3" }, @@ -8506,7 +8537,7 @@ "hyperlink": null }, { - "self_ref": 
"#/texts/274", + "self_ref": "#/texts/275", "parent": { "cref": "#/pictures/3" }, @@ -8535,7 +8566,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/275", + "self_ref": "#/texts/276", "parent": { "cref": "#/pictures/3" }, @@ -8564,7 +8595,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/276", + "self_ref": "#/texts/277", "parent": { "cref": "#/pictures/3" }, @@ -8593,7 +8624,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/277", + "self_ref": "#/texts/278", "parent": { "cref": "#/pictures/3" }, @@ -8622,7 +8653,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/278", + "self_ref": "#/texts/279", "parent": { "cref": "#/pictures/3" }, @@ -8651,7 +8682,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/279", + "self_ref": "#/texts/280", "parent": { "cref": "#/pictures/3" }, @@ -8680,7 +8711,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/280", + "self_ref": "#/texts/281", "parent": { "cref": "#/pictures/3" }, @@ -8709,7 +8740,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/281", + "self_ref": "#/texts/282", "parent": { "cref": "#/pictures/3" }, @@ -8738,7 +8769,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/282", + "self_ref": "#/texts/283", "parent": { "cref": "#/pictures/3" }, @@ -8767,7 +8798,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/283", + "self_ref": "#/texts/284", "parent": { "cref": "#/pictures/3" }, @@ -8796,7 +8827,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/284", + "self_ref": "#/texts/285", "parent": { "cref": "#/pictures/3" }, @@ -8825,7 +8856,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/285", + "self_ref": "#/texts/286", "parent": { "cref": "#/pictures/3" }, @@ -8854,7 +8885,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/286", + "self_ref": "#/texts/287", "parent": { "cref": "#/pictures/3" }, @@ -8883,7 +8914,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/287", + "self_ref": "#/texts/288", "parent": { "cref": "#/pictures/3" }, @@ -8912,7 +8943,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/288", + "self_ref": "#/texts/289", "parent": { "cref": "#/pictures/3" }, @@ -8941,7 +8972,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/289", + "self_ref": "#/texts/290", "parent": { "cref": "#/pictures/3" }, @@ -8970,7 +9001,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/290", + "self_ref": "#/texts/291", "parent": { "cref": "#/pictures/3" }, @@ -8999,7 +9030,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/291", + "self_ref": "#/texts/292", "parent": { "cref": "#/pictures/3" }, @@ -9028,7 +9059,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/292", + "self_ref": "#/texts/293", "parent": { "cref": "#/pictures/3" }, @@ -9057,7 +9088,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/293", + "self_ref": "#/texts/294", "parent": { "cref": "#/pictures/3" }, @@ -9086,7 +9117,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/294", + "self_ref": "#/texts/295", "parent": { "cref": "#/pictures/3" }, @@ -9115,7 +9146,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/295", + "self_ref": "#/texts/296", "parent": { "cref": "#/pictures/3" }, @@ -9144,7 +9175,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/296", + "self_ref": "#/texts/297", "parent": { "cref": "#/pictures/3" }, @@ -9173,7 +9204,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/297", + "self_ref": "#/texts/298", "parent": { "cref": "#/pictures/3" }, @@ -9202,7 +9233,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/298", + "self_ref": "#/texts/299", "parent": { "cref": "#/pictures/3" }, @@ -9231,7 +9262,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/299", + "self_ref": 
"#/texts/300", "parent": { "cref": "#/pictures/3" }, @@ -9260,7 +9291,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/300", + "self_ref": "#/texts/301", "parent": { "cref": "#/pictures/3" }, @@ -9289,7 +9320,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/301", + "self_ref": "#/texts/302", "parent": { "cref": "#/pictures/3" }, @@ -9318,7 +9349,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/302", + "self_ref": "#/texts/303", "parent": { "cref": "#/pictures/3" }, @@ -9347,7 +9378,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/303", + "self_ref": "#/texts/304", "parent": { "cref": "#/pictures/3" }, @@ -9376,7 +9407,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/304", + "self_ref": "#/texts/305", "parent": { "cref": "#/pictures/3" }, @@ -9405,7 +9436,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/305", + "self_ref": "#/texts/306", "parent": { "cref": "#/pictures/3" }, @@ -9434,7 +9465,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/306", + "self_ref": "#/texts/307", "parent": { "cref": "#/pictures/3" }, @@ -9463,7 +9494,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/307", + "self_ref": "#/texts/308", "parent": { "cref": "#/pictures/3" }, @@ -9492,7 +9523,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/308", + "self_ref": "#/texts/309", "parent": { "cref": "#/pictures/3" }, @@ -9521,7 +9552,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/309", + "self_ref": "#/texts/310", "parent": { "cref": "#/pictures/3" }, @@ -9550,7 +9581,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/310", + "self_ref": "#/texts/311", "parent": { "cref": "#/pictures/3" }, @@ -9579,7 +9610,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/311", + "self_ref": "#/texts/312", "parent": { "cref": "#/pictures/3" }, @@ -9608,7 +9639,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/312", + "self_ref": "#/texts/313", "parent": { "cref": "#/pictures/3" }, @@ -9637,7 +9668,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/313", + "self_ref": "#/texts/314", "parent": { "cref": "#/pictures/3" }, @@ -9666,7 +9697,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/314", + "self_ref": "#/texts/315", "parent": { "cref": "#/pictures/3" }, @@ -9695,7 +9726,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/315", + "self_ref": "#/texts/316", "parent": { "cref": "#/pictures/3" }, @@ -9724,7 +9755,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/316", + "self_ref": "#/texts/317", "parent": { "cref": "#/pictures/3" }, @@ -9753,7 +9784,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/317", + "self_ref": "#/texts/318", "parent": { "cref": "#/pictures/3" }, @@ -9782,7 +9813,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/318", + "self_ref": "#/texts/319", "parent": { "cref": "#/pictures/3" }, @@ -9811,7 +9842,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/319", + "self_ref": "#/texts/320", "parent": { "cref": "#/body" }, @@ -9840,7 +9871,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/320", + "self_ref": "#/texts/321", "parent": { "cref": "#/body" }, @@ -9869,7 +9900,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/321", + "self_ref": "#/texts/322", "parent": { "cref": "#/body" }, @@ -9898,7 +9929,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/322", + "self_ref": "#/texts/323", "parent": { "cref": "#/body" }, @@ -9927,7 +9958,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/323", + "self_ref": "#/texts/324", "parent": { "cref": "#/body" }, @@ -9957,7 +9988,7 @@ "level": 1 }, { - "self_ref": "#/texts/324", + "self_ref": "#/texts/325", "parent": { "cref": "#/body" }, @@ -9986,7 +10017,7 
@@ "hyperlink": null }, { - "self_ref": "#/texts/325", + "self_ref": "#/texts/326", "parent": { "cref": "#/tables/0" }, @@ -10015,7 +10046,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/326", + "self_ref": "#/texts/327", "parent": { "cref": "#/body" }, @@ -10045,7 +10076,7 @@ "level": 1 }, { - "self_ref": "#/texts/327", + "self_ref": "#/texts/328", "parent": { "cref": "#/body" }, @@ -10074,7 +10105,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/328", + "self_ref": "#/texts/329", "parent": { "cref": "#/body" }, @@ -10103,7 +10134,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/329", + "self_ref": "#/texts/330", "parent": { "cref": "#/body" }, @@ -10132,7 +10163,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/330", + "self_ref": "#/texts/331", "parent": { "cref": "#/body" }, @@ -10161,7 +10192,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/331", + "self_ref": "#/texts/332", "parent": { "cref": "#/tables/1" }, @@ -10190,7 +10221,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/332", + "self_ref": "#/texts/333", "parent": { "cref": "#/body" }, @@ -10220,7 +10251,7 @@ "level": 1 }, { - "self_ref": "#/texts/333", + "self_ref": "#/texts/334", "parent": { "cref": "#/body" }, @@ -10249,7 +10280,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/334", + "self_ref": "#/texts/335", "parent": { "cref": "#/pictures/4" }, @@ -10278,7 +10309,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/335", + "self_ref": "#/texts/336", "parent": { "cref": "#/pictures/4" }, @@ -10307,7 +10338,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/336", + "self_ref": "#/texts/337", "parent": { "cref": "#/pictures/4" }, @@ -10336,7 +10367,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/337", + "self_ref": "#/texts/338", "parent": { "cref": "#/pictures/4" }, @@ -10365,7 +10396,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/338", + "self_ref": "#/texts/339", "parent": { "cref": "#/pictures/4" }, @@ -10394,7 +10425,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/339", + "self_ref": "#/texts/340", "parent": { "cref": "#/pictures/4" }, @@ -10423,7 +10454,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/340", + "self_ref": "#/texts/341", "parent": { "cref": "#/pictures/4" }, @@ -10452,7 +10483,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/341", + "self_ref": "#/texts/342", "parent": { "cref": "#/pictures/4" }, @@ -10481,7 +10512,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/342", + "self_ref": "#/texts/343", "parent": { "cref": "#/pictures/4" }, @@ -10510,7 +10541,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/343", + "self_ref": "#/texts/344", "parent": { "cref": "#/pictures/4" }, @@ -10539,7 +10570,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/344", + "self_ref": "#/texts/345", "parent": { "cref": "#/pictures/4" }, @@ -10568,7 +10599,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/345", + "self_ref": "#/texts/346", "parent": { "cref": "#/pictures/4" }, @@ -10597,7 +10628,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/346", + "self_ref": "#/texts/347", "parent": { "cref": "#/pictures/4" }, @@ -10626,7 +10657,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/347", + "self_ref": "#/texts/348", "parent": { "cref": "#/pictures/4" }, @@ -10655,7 +10686,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/348", + "self_ref": "#/texts/349", "parent": { "cref": "#/pictures/4" }, @@ -10683,35 +10714,6 @@ "formatting": null, "hyperlink": null }, - { - "self_ref": "#/texts/349", - "parent": { - "cref": "#/pictures/4" - }, - "children": [], - "content_layer": "body", - "label": 
"text", - "prov": [ - { - "page_no": 10, - "bbox": { - "l": 410.25699, - "t": 269.80075, - "r": 450.48605, - "b": 267.08404999999993, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 28 - ] - } - ], - "orig": "C C C C C C C C C C C C C NL", - "text": "C C C C C C C C C C C C C NL", - "formatting": null, - "hyperlink": null - }, { "self_ref": "#/texts/350", "parent": { @@ -10724,20 +10726,20 @@ { "page_no": 10, "bbox": { - "l": 407.38348, - "t": 265.90192, - "r": 408.82025, - "b": 263.18521, + "l": 410.25699, + "t": 269.80075, + "r": 450.48605, + "b": 267.08404999999993, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, - 1 + 28 ] } ], - "orig": "C", - "text": "C", + "orig": "C C C C C C C C C C C C C NL", + "text": "C C C C C C C C C C C C C NL", "formatting": null, "hyperlink": null }, @@ -10749,6 +10751,35 @@ "children": [], "content_layer": "body", "label": "text", + "prov": [ + { + "page_no": 10, + "bbox": { + "l": 407.38348, + "t": 265.90192, + "r": 408.82025, + "b": 263.18521, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 1 + ] + } + ], + "orig": "C", + "text": "C", + "formatting": null, + "hyperlink": null + }, + { + "self_ref": "#/texts/352", + "parent": { + "cref": "#/pictures/4" + }, + "children": [], + "content_layer": "body", + "label": "text", "prov": [ { "page_no": 10, @@ -10771,7 +10802,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/352", + "self_ref": "#/texts/353", "parent": { "cref": "#/pictures/4" }, @@ -10799,35 +10830,6 @@ "formatting": null, "hyperlink": null }, - { - "self_ref": "#/texts/353", - "parent": { - "cref": "#/pictures/4" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 10, - "bbox": { - "l": 410.25699, - "t": 262.00305000000003, - "r": 450.48605, - "b": 259.2863500000001, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 28 - ] - } - ], - "orig": "C C C C C C C C C C C C C NL", - "text": "C C C C C C C C C C C C C NL", - "formatting": null, - "hyperlink": null - }, { "self_ref": "#/texts/354", "parent": { @@ -10840,20 +10842,20 @@ { "page_no": 10, "bbox": { - "l": 407.38348, - "t": 258.10421999999994, - "r": 408.82025, - "b": 255.38750000000005, + "l": 410.25699, + "t": 262.00305000000003, + "r": 450.48605, + "b": 259.2863500000001, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, - 1 + 28 ] } ], - "orig": "C", - "text": "C", + "orig": "C C C C C C C C C C C C C NL", + "text": "C C C C C C C C C C C C C NL", "formatting": null, "hyperlink": null }, @@ -10865,6 +10867,35 @@ "children": [], "content_layer": "body", "label": "text", + "prov": [ + { + "page_no": 10, + "bbox": { + "l": 407.38348, + "t": 258.10421999999994, + "r": 408.82025, + "b": 255.38750000000005, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 1 + ] + } + ], + "orig": "C", + "text": "C", + "formatting": null, + "hyperlink": null + }, + { + "self_ref": "#/texts/356", + "parent": { + "cref": "#/pictures/4" + }, + "children": [], + "content_layer": "body", + "label": "text", "prov": [ { "page_no": 10, @@ -10887,7 +10918,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/356", + "self_ref": "#/texts/357", "parent": { "cref": "#/pictures/4" }, @@ -10915,35 +10946,6 @@ "formatting": null, "hyperlink": null }, - { - "self_ref": "#/texts/357", - "parent": { - "cref": "#/pictures/4" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 10, - "bbox": { - "l": 410.25699, - "t": 254.20537000000002, - "r": 450.48605, - "b": 251.48865, - "coord_origin": "BOTTOMLEFT" - 
}, - "charspan": [ - 0, - 28 - ] - } - ], - "orig": "C C C C C C C C C C C C C NL", - "text": "C C C C C C C C C C C C C NL", - "formatting": null, - "hyperlink": null - }, { "self_ref": "#/texts/358", "parent": { @@ -10956,20 +10958,20 @@ { "page_no": 10, "bbox": { - "l": 407.38348, - "t": 250.30651999999998, - "r": 408.82025, - "b": 247.58979999999997, + "l": 410.25699, + "t": 254.20537000000002, + "r": 450.48605, + "b": 251.48865, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, - 1 + 28 ] } ], - "orig": "C", - "text": "C", + "orig": "C C C C C C C C C C C C C NL", + "text": "C C C C C C C C C C C C C NL", "formatting": null, "hyperlink": null }, @@ -10981,6 +10983,35 @@ "children": [], "content_layer": "body", "label": "text", + "prov": [ + { + "page_no": 10, + "bbox": { + "l": 407.38348, + "t": 250.30651999999998, + "r": 408.82025, + "b": 247.58979999999997, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 1 + ] + } + ], + "orig": "C", + "text": "C", + "formatting": null, + "hyperlink": null + }, + { + "self_ref": "#/texts/360", + "parent": { + "cref": "#/pictures/4" + }, + "children": [], + "content_layer": "body", + "label": "text", "prov": [ { "page_no": 10, @@ -11003,7 +11034,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/360", + "self_ref": "#/texts/361", "parent": { "cref": "#/pictures/4" }, @@ -11031,35 +11062,6 @@ "formatting": null, "hyperlink": null }, - { - "self_ref": "#/texts/361", - "parent": { - "cref": "#/pictures/4" - }, - "children": [], - "content_layer": "body", - "label": "text", - "prov": [ - { - "page_no": 10, - "bbox": { - "l": 410.25699, - "t": 246.40767000000005, - "r": 450.48605, - "b": 243.69094999999993, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 28 - ] - } - ], - "orig": "C C C C C C C C C C C C C NL", - "text": "C C C C C C C C C C C C C NL", - "formatting": null, - "hyperlink": null - }, { "self_ref": "#/texts/362", "parent": { @@ -11068,6 +11070,35 @@ "children": [], "content_layer": "body", "label": "text", + "prov": [ + { + "page_no": 10, + "bbox": { + "l": 410.25699, + "t": 246.40767000000005, + "r": 450.48605, + "b": 243.69094999999993, + "coord_origin": "BOTTOMLEFT" + }, + "charspan": [ + 0, + 28 + ] + } + ], + "orig": "C C C C C C C C C C C C C NL", + "text": "C C C C C C C C C C C C C NL", + "formatting": null, + "hyperlink": null + }, + { + "self_ref": "#/texts/363", + "parent": { + "cref": "#/pictures/4" + }, + "children": [], + "content_layer": "body", + "label": "text", "prov": [ { "page_no": 10, @@ -11090,7 +11121,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/363", + "self_ref": "#/texts/364", "parent": { "cref": "#/pictures/4" }, @@ -11119,7 +11150,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/364", + "self_ref": "#/texts/365", "parent": { "cref": "#/pictures/4" }, @@ -11148,7 +11179,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/365", + "self_ref": "#/texts/366", "parent": { "cref": "#/pictures/4" }, @@ -11177,7 +11208,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/366", + "self_ref": "#/texts/367", "parent": { "cref": "#/pictures/4" }, @@ -11206,7 +11237,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/367", + "self_ref": "#/texts/368", "parent": { "cref": "#/pictures/4" }, @@ -11235,7 +11266,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/368", + "self_ref": "#/texts/369", "parent": { "cref": "#/pictures/4" }, @@ -11264,7 +11295,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/369", + "self_ref": "#/texts/370", "parent": { "cref": "#/pictures/4" }, @@ -11293,7 +11324,7 @@ 
"hyperlink": null }, { - "self_ref": "#/texts/370", + "self_ref": "#/texts/371", "parent": { "cref": "#/pictures/4" }, @@ -11322,7 +11353,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/371", + "self_ref": "#/texts/372", "parent": { "cref": "#/pictures/4" }, @@ -11351,7 +11382,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/372", + "self_ref": "#/texts/373", "parent": { "cref": "#/pictures/4" }, @@ -11380,7 +11411,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/373", + "self_ref": "#/texts/374", "parent": { "cref": "#/pictures/4" }, @@ -11409,7 +11440,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/374", + "self_ref": "#/texts/375", "parent": { "cref": "#/pictures/4" }, @@ -11438,7 +11469,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/375", + "self_ref": "#/texts/376", "parent": { "cref": "#/pictures/4" }, @@ -11467,7 +11498,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/376", + "self_ref": "#/texts/377", "parent": { "cref": "#/pictures/4" }, @@ -11496,7 +11527,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/377", + "self_ref": "#/texts/378", "parent": { "cref": "#/pictures/4" }, @@ -11525,7 +11556,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/378", + "self_ref": "#/texts/379", "parent": { "cref": "#/pictures/4" }, @@ -11554,7 +11585,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/379", + "self_ref": "#/texts/380", "parent": { "cref": "#/pictures/4" }, @@ -11583,7 +11614,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/380", + "self_ref": "#/texts/381", "parent": { "cref": "#/pictures/4" }, @@ -11612,7 +11643,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/381", + "self_ref": "#/texts/382", "parent": { "cref": "#/pictures/4" }, @@ -11641,7 +11672,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/382", + "self_ref": "#/texts/383", "parent": { "cref": "#/pictures/4" }, @@ -11670,7 +11701,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/383", + "self_ref": "#/texts/384", "parent": { "cref": "#/pictures/4" }, @@ -11699,7 +11730,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/384", + "self_ref": "#/texts/385", "parent": { "cref": "#/pictures/4" }, @@ -11728,7 +11759,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/385", + "self_ref": "#/texts/386", "parent": { "cref": "#/pictures/4" }, @@ -11757,7 +11788,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/386", + "self_ref": "#/texts/387", "parent": { "cref": "#/pictures/4" }, @@ -11786,7 +11817,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/387", + "self_ref": "#/texts/388", "parent": { "cref": "#/pictures/4" }, @@ -11815,7 +11846,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/388", + "self_ref": "#/texts/389", "parent": { "cref": "#/pictures/4" }, @@ -11844,7 +11875,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/389", + "self_ref": "#/texts/390", "parent": { "cref": "#/pictures/4" }, @@ -11873,7 +11904,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/390", + "self_ref": "#/texts/391", "parent": { "cref": "#/pictures/4" }, @@ -11902,7 +11933,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/391", + "self_ref": "#/texts/392", "parent": { "cref": "#/pictures/4" }, @@ -11931,7 +11962,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/392", + "self_ref": "#/texts/393", "parent": { "cref": "#/pictures/4" }, @@ -11960,7 +11991,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/393", + "self_ref": "#/texts/394", "parent": { "cref": "#/pictures/4" }, @@ -11989,7 +12020,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/394", + "self_ref": "#/texts/395", "parent": { "cref": "#/pictures/4" }, @@ 
-12018,7 +12049,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/395", + "self_ref": "#/texts/396", "parent": { "cref": "#/pictures/4" }, @@ -12047,7 +12078,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/396", + "self_ref": "#/texts/397", "parent": { "cref": "#/pictures/4" }, @@ -12076,7 +12107,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/397", + "self_ref": "#/texts/398", "parent": { "cref": "#/pictures/4" }, @@ -12105,7 +12136,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/398", + "self_ref": "#/texts/399", "parent": { "cref": "#/pictures/4" }, @@ -12134,7 +12165,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/399", + "self_ref": "#/texts/400", "parent": { "cref": "#/pictures/4" }, @@ -12163,7 +12194,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/400", + "self_ref": "#/texts/401", "parent": { "cref": "#/pictures/4" }, @@ -12192,7 +12223,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/401", + "self_ref": "#/texts/402", "parent": { "cref": "#/pictures/4" }, @@ -12221,7 +12252,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/402", + "self_ref": "#/texts/403", "parent": { "cref": "#/pictures/4" }, @@ -12250,7 +12281,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/403", + "self_ref": "#/texts/404", "parent": { "cref": "#/pictures/4" }, @@ -12279,7 +12310,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/404", + "self_ref": "#/texts/405", "parent": { "cref": "#/pictures/4" }, @@ -12308,7 +12339,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/405", + "self_ref": "#/texts/406", "parent": { "cref": "#/pictures/4" }, @@ -12337,7 +12368,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/406", + "self_ref": "#/texts/407", "parent": { "cref": "#/pictures/4" }, @@ -12366,7 +12397,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/407", + "self_ref": "#/texts/408", "parent": { "cref": "#/pictures/4" }, @@ -12395,7 +12426,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/408", + "self_ref": "#/texts/409", "parent": { "cref": "#/pictures/4" }, @@ -12424,7 +12455,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/409", + "self_ref": "#/texts/410", "parent": { "cref": "#/pictures/4" }, @@ -12453,7 +12484,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/410", + "self_ref": "#/texts/411", "parent": { "cref": "#/pictures/4" }, @@ -12482,7 +12513,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/411", + "self_ref": "#/texts/412", "parent": { "cref": "#/pictures/4" }, @@ -12511,7 +12542,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/412", + "self_ref": "#/texts/413", "parent": { "cref": "#/pictures/4" }, @@ -12540,7 +12571,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/413", + "self_ref": "#/texts/414", "parent": { "cref": "#/pictures/4" }, @@ -12569,7 +12600,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/414", + "self_ref": "#/texts/415", "parent": { "cref": "#/pictures/4" }, @@ -12598,7 +12629,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/415", + "self_ref": "#/texts/416", "parent": { "cref": "#/pictures/4" }, @@ -12627,7 +12658,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/416", + "self_ref": "#/texts/417", "parent": { "cref": "#/pictures/4" }, @@ -12656,7 +12687,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/417", + "self_ref": "#/texts/418", "parent": { "cref": "#/pictures/4" }, @@ -12685,7 +12716,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/418", + "self_ref": "#/texts/419", "parent": { "cref": "#/pictures/4" }, @@ -12714,7 +12745,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/419", + "self_ref": "#/texts/420", "parent": { "cref": 
"#/pictures/4" }, @@ -12743,7 +12774,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/420", + "self_ref": "#/texts/421", "parent": { "cref": "#/pictures/4" }, @@ -12772,7 +12803,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/421", + "self_ref": "#/texts/422", "parent": { "cref": "#/pictures/4" }, @@ -12801,7 +12832,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/422", + "self_ref": "#/texts/423", "parent": { "cref": "#/pictures/4" }, @@ -12830,7 +12861,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/423", + "self_ref": "#/texts/424", "parent": { "cref": "#/pictures/4" }, @@ -12859,7 +12890,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/424", + "self_ref": "#/texts/425", "parent": { "cref": "#/pictures/4" }, @@ -12888,7 +12919,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/425", + "self_ref": "#/texts/426", "parent": { "cref": "#/pictures/4" }, @@ -12917,7 +12948,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/426", + "self_ref": "#/texts/427", "parent": { "cref": "#/pictures/4" }, @@ -12946,7 +12977,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/427", + "self_ref": "#/texts/428", "parent": { "cref": "#/pictures/4" }, @@ -12975,7 +13006,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/428", + "self_ref": "#/texts/429", "parent": { "cref": "#/pictures/4" }, @@ -13004,7 +13035,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/429", + "self_ref": "#/texts/430", "parent": { "cref": "#/pictures/4" }, @@ -13033,7 +13064,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/430", + "self_ref": "#/texts/431", "parent": { "cref": "#/pictures/4" }, @@ -13062,7 +13093,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/431", + "self_ref": "#/texts/432", "parent": { "cref": "#/pictures/4" }, @@ -13091,7 +13122,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/432", + "self_ref": "#/texts/433", "parent": { "cref": "#/body" }, @@ -13120,7 +13151,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/433", + "self_ref": "#/texts/434", "parent": { "cref": "#/body" }, @@ -13149,7 +13180,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/434", + "self_ref": "#/texts/435", "parent": { "cref": "#/body" }, @@ -13178,7 +13209,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/435", + "self_ref": "#/texts/436", "parent": { "cref": "#/body" }, @@ -13207,7 +13238,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/436", + "self_ref": "#/texts/437", "parent": { "cref": "#/pictures/5" }, @@ -13236,7 +13267,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/437", + "self_ref": "#/texts/438", "parent": { "cref": "#/pictures/5" }, @@ -13265,7 +13296,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/438", + "self_ref": "#/texts/439", "parent": { "cref": "#/pictures/5" }, @@ -13294,7 +13325,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/439", + "self_ref": "#/texts/440", "parent": { "cref": "#/pictures/5" }, @@ -13323,7 +13354,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/440", + "self_ref": "#/texts/441", "parent": { "cref": "#/pictures/5" }, @@ -13352,7 +13383,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/441", + "self_ref": "#/texts/442", "parent": { "cref": "#/pictures/5" }, @@ -13381,7 +13412,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/442", + "self_ref": "#/texts/443", "parent": { "cref": "#/pictures/5" }, @@ -13410,7 +13441,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/443", + "self_ref": "#/texts/444", "parent": { "cref": "#/pictures/5" }, @@ -13439,7 +13470,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/444", + "self_ref": "#/texts/445", "parent": { "cref": 
"#/pictures/5" }, @@ -13468,7 +13499,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/445", + "self_ref": "#/texts/446", "parent": { "cref": "#/pictures/5" }, @@ -13497,7 +13528,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/446", + "self_ref": "#/texts/447", "parent": { "cref": "#/pictures/5" }, @@ -13526,7 +13557,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/447", + "self_ref": "#/texts/448", "parent": { "cref": "#/pictures/5" }, @@ -13555,7 +13586,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/448", + "self_ref": "#/texts/449", "parent": { "cref": "#/body" }, @@ -13584,7 +13615,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/449", + "self_ref": "#/texts/450", "parent": { "cref": "#/body" }, @@ -13613,7 +13644,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/450", + "self_ref": "#/texts/451", "parent": { "cref": "#/body" }, @@ -13643,7 +13674,7 @@ "level": 1 }, { - "self_ref": "#/texts/451", + "self_ref": "#/texts/452", "parent": { "cref": "#/body" }, @@ -13672,7 +13703,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/452", + "self_ref": "#/texts/453", "parent": { "cref": "#/body" }, @@ -13701,7 +13732,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/453", + "self_ref": "#/texts/454", "parent": { "cref": "#/body" }, @@ -13730,7 +13761,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/454", + "self_ref": "#/texts/455", "parent": { "cref": "#/body" }, @@ -13760,9 +13791,9 @@ "level": 1 }, { - "self_ref": "#/texts/455", + "self_ref": "#/texts/456", "parent": { - "cref": "#/groups/4" + "cref": "#/groups/5" }, "children": [], "content_layer": "body", @@ -13788,12 +13819,12 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { - "self_ref": "#/texts/456", + "self_ref": "#/texts/457", "parent": { - "cref": "#/groups/4" + "cref": "#/groups/5" }, "children": [], "content_layer": "body", @@ -13819,12 +13850,12 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { - "self_ref": "#/texts/457", + "self_ref": "#/texts/458", "parent": { - "cref": "#/groups/4" + "cref": "#/groups/5" }, "children": [], "content_layer": "body", @@ -13850,12 +13881,12 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { - "self_ref": "#/texts/458", + "self_ref": "#/texts/459", "parent": { - "cref": "#/groups/4" + "cref": "#/groups/5" }, "children": [], "content_layer": "body", @@ -13881,10 +13912,10 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { - "self_ref": "#/texts/459", + "self_ref": "#/texts/460", "parent": { "cref": "#/body" }, @@ -13913,7 +13944,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/460", + "self_ref": "#/texts/461", "parent": { "cref": "#/body" }, @@ -13942,9 +13973,9 @@ "hyperlink": null }, { - "self_ref": "#/texts/461", + "self_ref": "#/texts/462", "parent": { - "cref": "#/groups/5" + "cref": "#/groups/6" }, "children": [], "content_layer": "body", @@ -13970,12 +14001,12 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { - "self_ref": "#/texts/462", + "self_ref": "#/texts/463", "parent": { - "cref": "#/groups/5" + "cref": "#/groups/6" }, "children": [], "content_layer": "body", @@ -14001,12 +14032,12 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { - "self_ref": "#/texts/463", + "self_ref": "#/texts/464", "parent": { - "cref": "#/groups/5" + "cref": "#/groups/6" }, "children": [], 
"content_layer": "body", @@ -14032,12 +14063,12 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { - "self_ref": "#/texts/464", + "self_ref": "#/texts/465", "parent": { - "cref": "#/groups/5" + "cref": "#/groups/6" }, "children": [], "content_layer": "body", @@ -14063,12 +14094,12 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { - "self_ref": "#/texts/465", + "self_ref": "#/texts/466", "parent": { - "cref": "#/groups/5" + "cref": "#/groups/6" }, "children": [], "content_layer": "body", @@ -14094,12 +14125,12 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { - "self_ref": "#/texts/466", + "self_ref": "#/texts/467", "parent": { - "cref": "#/groups/5" + "cref": "#/groups/6" }, "children": [], "content_layer": "body", @@ -14125,12 +14156,12 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { - "self_ref": "#/texts/467", + "self_ref": "#/texts/468", "parent": { - "cref": "#/groups/5" + "cref": "#/groups/6" }, "children": [], "content_layer": "body", @@ -14156,12 +14187,12 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { - "self_ref": "#/texts/468", + "self_ref": "#/texts/469", "parent": { - "cref": "#/groups/5" + "cref": "#/groups/6" }, "children": [], "content_layer": "body", @@ -14187,12 +14218,12 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { - "self_ref": "#/texts/469", + "self_ref": "#/texts/470", "parent": { - "cref": "#/groups/5" + "cref": "#/groups/6" }, "children": [], "content_layer": "body", @@ -14218,12 +14249,12 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { - "self_ref": "#/texts/470", + "self_ref": "#/texts/471", "parent": { - "cref": "#/groups/5" + "cref": "#/groups/6" }, "children": [], "content_layer": "body", @@ -14249,12 +14280,12 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { - "self_ref": "#/texts/471", + "self_ref": "#/texts/472", "parent": { - "cref": "#/groups/5" + "cref": "#/groups/6" }, "children": [], "content_layer": "body", @@ -14280,12 +14311,12 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { - "self_ref": "#/texts/472", + "self_ref": "#/texts/473", "parent": { - "cref": "#/groups/5" + "cref": "#/groups/6" }, "children": [], "content_layer": "body", @@ -14311,12 +14342,12 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { - "self_ref": "#/texts/473", + "self_ref": "#/texts/474", "parent": { - "cref": "#/groups/5" + "cref": "#/groups/6" }, "children": [], "content_layer": "body", @@ -14342,10 +14373,10 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { - "self_ref": "#/texts/474", + "self_ref": "#/texts/475", "parent": { "cref": "#/body" }, @@ -14374,7 +14405,7 @@ "hyperlink": null }, { - "self_ref": "#/texts/475", + "self_ref": "#/texts/476", "parent": { "cref": "#/body" }, @@ -14403,9 +14434,9 @@ "hyperlink": null }, { - "self_ref": "#/texts/476", + "self_ref": "#/texts/477", "parent": { - "cref": "#/groups/6" + "cref": "#/groups/7" }, "children": [], "content_layer": "body", @@ -14431,12 +14462,12 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { - "self_ref": "#/texts/477", + 
"self_ref": "#/texts/478", "parent": { - "cref": "#/groups/6" + "cref": "#/groups/7" }, "children": [], "content_layer": "body", @@ -14462,12 +14493,12 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { - "self_ref": "#/texts/478", + "self_ref": "#/texts/479", "parent": { - "cref": "#/groups/6" + "cref": "#/groups/7" }, "children": [], "content_layer": "body", @@ -14493,12 +14524,12 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { - "self_ref": "#/texts/479", + "self_ref": "#/texts/480", "parent": { - "cref": "#/groups/6" + "cref": "#/groups/7" }, "children": [], "content_layer": "body", @@ -14524,12 +14555,12 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { - "self_ref": "#/texts/480", + "self_ref": "#/texts/481", "parent": { - "cref": "#/groups/6" + "cref": "#/groups/7" }, "children": [], "content_layer": "body", @@ -14555,12 +14586,12 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { - "self_ref": "#/texts/481", + "self_ref": "#/texts/482", "parent": { - "cref": "#/groups/6" + "cref": "#/groups/7" }, "children": [], "content_layer": "body", @@ -14586,38 +14617,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" - }, - { - "self_ref": "#/texts/482", - "parent": { - "cref": "#/groups/7" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [ - { - "page_no": 7, - "bbox": { - "l": 334.51135, - "t": 519.19159, - "r": 426.59875, - "b": 512.97711, - "coord_origin": "BOTTOMLEFT" - }, - "charspan": [ - 0, - 33 - ] - } - ], - "orig": "4 - 2d merges: \"C\", \"L\", \"U\", \"X\"", - "text": "4 - 2d merges: \"C\", \"L\", \"U\", \"X\"", - "formatting": null, - "hyperlink": null, - "enumerated": false, - "marker": "-" + "marker": "" } ], "pictures": [ @@ -15187,10 +15187,7 @@ "cref": "#/texts/214" }, { - "cref": "#/groups/7" - }, - { - "cref": "#/texts/215" + "cref": "#/groups/2" }, { "cref": "#/texts/216" @@ -15227,6 +15224,9 @@ }, { "cref": "#/texts/227" + }, + { + "cref": "#/texts/228" } ], "content_layer": "body", @@ -15263,9 +15263,6 @@ "cref": "#/body" }, "children": [ - { - "cref": "#/texts/246" - }, { "cref": "#/texts/247" }, @@ -15481,6 +15478,9 @@ }, { "cref": "#/texts/318" + }, + { + "cref": "#/texts/319" } ], "content_layer": "body", @@ -15503,7 +15503,7 @@ ], "captions": [ { - "cref": "#/texts/246" + "cref": "#/texts/247" } ], "references": [], @@ -15517,9 +15517,6 @@ "cref": "#/body" }, "children": [ - { - "cref": "#/texts/334" - }, { "cref": "#/texts/335" }, @@ -15810,6 +15807,9 @@ }, { "cref": "#/texts/431" + }, + { + "cref": "#/texts/432" } ], "content_layer": "body", @@ -15832,7 +15832,7 @@ ], "captions": [ { - "cref": "#/texts/334" + "cref": "#/texts/335" } ], "references": [], @@ -15846,9 +15846,6 @@ "cref": "#/body" }, "children": [ - { - "cref": "#/texts/436" - }, { "cref": "#/texts/437" }, @@ -15881,6 +15878,9 @@ }, { "cref": "#/texts/447" + }, + { + "cref": "#/texts/448" } ], "content_layer": "body", @@ -15903,7 +15903,7 @@ ], "captions": [ { - "cref": "#/texts/436" + "cref": "#/texts/437" } ], "references": [], @@ -15920,7 +15920,7 @@ }, "children": [ { - "cref": "#/texts/325" + "cref": "#/texts/326" } ], "content_layer": "body", @@ -15943,7 +15943,7 @@ ], "captions": [ { - "cref": "#/texts/325" + "cref": "#/texts/326" } ], "references": [], @@ -17707,7 +17707,7 @@ }, "children": [ { - "cref": "#/texts/331" + "cref": 
"#/texts/332" } ], "content_layer": "body", @@ -17730,7 +17730,7 @@ ], "captions": [ { - "cref": "#/texts/331" + "cref": "#/texts/332" } ], "references": [], diff --git a/tests/data/groundtruth/docling_v2/2305.03393v1.md b/tests/data/groundtruth/docling_v2/2305.03393v1.md index e0fdb89..5885330 100644 --- a/tests/data/groundtruth/docling_v2/2305.03393v1.md +++ b/tests/data/groundtruth/docling_v2/2305.03393v1.md @@ -84,8 +84,6 @@ Fig. 3. OTSL description of table structure: A - table example; B - graphical re -- 4 - 2d merges: "C", "L", "U", "X" - ## 4.2 Language Syntax The OTSL representation follows these syntax rules: diff --git a/tests/data/groundtruth/docling_v2/amt_handbook_sample.json b/tests/data/groundtruth/docling_v2/amt_handbook_sample.json index 0cbe240..ccc6598 100644 --- a/tests/data/groundtruth/docling_v2/amt_handbook_sample.json +++ b/tests/data/groundtruth/docling_v2/amt_handbook_sample.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "amt_handbook_sample", "origin": { "mimetype": "application/pdf", diff --git a/tests/data/groundtruth/docling_v2/code_and_formula.json b/tests/data/groundtruth/docling_v2/code_and_formula.json index b296106..6398de4 100644 --- a/tests/data/groundtruth/docling_v2/code_and_formula.json +++ b/tests/data/groundtruth/docling_v2/code_and_formula.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "code_and_formula", "origin": { "mimetype": "application/pdf", diff --git a/tests/data/groundtruth/docling_v2/csv-comma-in-cell.csv.json b/tests/data/groundtruth/docling_v2/csv-comma-in-cell.csv.json index 82747ca..90dffc7 100644 --- a/tests/data/groundtruth/docling_v2/csv-comma-in-cell.csv.json +++ b/tests/data/groundtruth/docling_v2/csv-comma-in-cell.csv.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "csv-comma-in-cell", "origin": { "mimetype": "text/csv", diff --git a/tests/data/groundtruth/docling_v2/csv-comma.csv.json b/tests/data/groundtruth/docling_v2/csv-comma.csv.json index db7f3e3..79cec5e 100644 --- a/tests/data/groundtruth/docling_v2/csv-comma.csv.json +++ b/tests/data/groundtruth/docling_v2/csv-comma.csv.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "csv-comma", "origin": { "mimetype": "text/csv", diff --git a/tests/data/groundtruth/docling_v2/csv-inconsistent-header.csv.json b/tests/data/groundtruth/docling_v2/csv-inconsistent-header.csv.json index d4a3305..17b13d2 100644 --- a/tests/data/groundtruth/docling_v2/csv-inconsistent-header.csv.json +++ b/tests/data/groundtruth/docling_v2/csv-inconsistent-header.csv.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "csv-inconsistent-header", "origin": { "mimetype": "text/csv", diff --git a/tests/data/groundtruth/docling_v2/csv-pipe.csv.json b/tests/data/groundtruth/docling_v2/csv-pipe.csv.json index 73566a8..8df1fbe 100644 --- a/tests/data/groundtruth/docling_v2/csv-pipe.csv.json +++ b/tests/data/groundtruth/docling_v2/csv-pipe.csv.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "csv-pipe", "origin": { "mimetype": "text/csv", diff --git a/tests/data/groundtruth/docling_v2/csv-semicolon.csv.json b/tests/data/groundtruth/docling_v2/csv-semicolon.csv.json index 54c28a6..5bc9b41 100644 --- a/tests/data/groundtruth/docling_v2/csv-semicolon.csv.json 
+++ b/tests/data/groundtruth/docling_v2/csv-semicolon.csv.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "csv-semicolon", "origin": { "mimetype": "text/csv", diff --git a/tests/data/groundtruth/docling_v2/csv-tab.csv.json b/tests/data/groundtruth/docling_v2/csv-tab.csv.json index 0f179e4..d77210b 100644 --- a/tests/data/groundtruth/docling_v2/csv-tab.csv.json +++ b/tests/data/groundtruth/docling_v2/csv-tab.csv.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "csv-tab", "origin": { "mimetype": "text/csv", diff --git a/tests/data/groundtruth/docling_v2/csv-too-few-columns.csv.json b/tests/data/groundtruth/docling_v2/csv-too-few-columns.csv.json index 46eec31..1b5e896 100644 --- a/tests/data/groundtruth/docling_v2/csv-too-few-columns.csv.json +++ b/tests/data/groundtruth/docling_v2/csv-too-few-columns.csv.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "csv-too-few-columns", "origin": { "mimetype": "text/csv", diff --git a/tests/data/groundtruth/docling_v2/csv-too-many-columns.csv.json b/tests/data/groundtruth/docling_v2/csv-too-many-columns.csv.json index be17c38..1f8f0b0 100644 --- a/tests/data/groundtruth/docling_v2/csv-too-many-columns.csv.json +++ b/tests/data/groundtruth/docling_v2/csv-too-many-columns.csv.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "csv-too-many-columns", "origin": { "mimetype": "text/csv", diff --git a/tests/data/groundtruth/docling_v2/equations.docx.json b/tests/data/groundtruth/docling_v2/equations.docx.json index 73779bf..a244b68 100644 --- a/tests/data/groundtruth/docling_v2/equations.docx.json +++ b/tests/data/groundtruth/docling_v2/equations.docx.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "equations", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", diff --git a/tests/data/groundtruth/docling_v2/example_01.html.itxt b/tests/data/groundtruth/docling_v2/example_01.html.itxt index fbd7512..5db91c5 100644 --- a/tests/data/groundtruth/docling_v2/example_01.html.itxt +++ b/tests/data/groundtruth/docling_v2/example_01.html.itxt @@ -7,6 +7,9 @@ item-0 at level 0: unspecified: group _root_ item-6 at level 3: list: group list item-7 at level 4: list_item: First item in unordered list item-8 at level 4: list_item: Second item in unordered list - item-9 at level 3: ordered_list: group ordered list + item-9 at level 3: list: group ordered list item-10 at level 4: list_item: First item in ordered list - item-11 at level 4: list_item: Second item in ordered list \ No newline at end of file + item-11 at level 4: list_item: Second item in ordered list + item-12 at level 3: list: group ordered list start 42 + item-13 at level 4: list_item: First item in ordered list with start + item-14 at level 4: list_item: Second item in ordered list with start \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/example_01.html.json b/tests/data/groundtruth/docling_v2/example_01.html.json index c9e9384..2e8e510 100644 --- a/tests/data/groundtruth/docling_v2/example_01.html.json +++ b/tests/data/groundtruth/docling_v2/example_01.html.json @@ -1,10 +1,10 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "example_01", "origin": { "mimetype": "text/html", - "binary_hash": 
13782069548509991617, + "binary_hash": 13726679883013609282, "filename": "example_01.html" }, "furniture": { @@ -58,7 +58,24 @@ ], "content_layer": "body", "name": "ordered list", - "label": "ordered_list" + "label": "list" + }, + { + "self_ref": "#/groups/2", + "parent": { + "$ref": "#/texts/2" + }, + "children": [ + { + "$ref": "#/texts/8" + }, + { + "$ref": "#/texts/9" + } + ], + "content_layer": "body", + "name": "ordered list start 42", + "label": "list" } ], "texts": [ @@ -110,6 +127,9 @@ }, { "$ref": "#/groups/1" + }, + { + "$ref": "#/groups/2" } ], "content_layer": "body", @@ -143,7 +163,7 @@ "orig": "First item in unordered list", "text": "First item in unordered list", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/5", @@ -157,7 +177,7 @@ "orig": "Second item in unordered list", "text": "Second item in unordered list", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/6", @@ -171,7 +191,7 @@ "orig": "First item in ordered list", "text": "First item in ordered list", "enumerated": true, - "marker": "1." + "marker": "" }, { "self_ref": "#/texts/7", @@ -185,7 +205,35 @@ "orig": "Second item in ordered list", "text": "Second item in ordered list", "enumerated": true, - "marker": "2." + "marker": "" + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "First item in ordered list with start", + "text": "First item in ordered list with start", + "enumerated": true, + "marker": "42." + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Second item in ordered list with start", + "text": "Second item in ordered list with start", + "enumerated": true, + "marker": "43." } ], "pictures": [ diff --git a/tests/data/groundtruth/docling_v2/example_01.html.md b/tests/data/groundtruth/docling_v2/example_01.html.md index 73031c0..f36f785 100644 --- a/tests/data/groundtruth/docling_v2/example_01.html.md +++ b/tests/data/groundtruth/docling_v2/example_01.html.md @@ -12,4 +12,7 @@ Some background information here. - Second item in unordered list 1. First item in ordered list -2. Second item in ordered list \ No newline at end of file +2. Second item in ordered list + +42. First item in ordered list with start +43. 
Second item in ordered list with start \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/example_02.html.itxt b/tests/data/groundtruth/docling_v2/example_02.html.itxt index 49ea71e..15586e8 100644 --- a/tests/data/groundtruth/docling_v2/example_02.html.itxt +++ b/tests/data/groundtruth/docling_v2/example_02.html.itxt @@ -6,6 +6,6 @@ item-0 at level 0: unspecified: group _root_ item-5 at level 3: list: group list item-6 at level 4: list_item: First item in unordered list item-7 at level 4: list_item: Second item in unordered list - item-8 at level 3: ordered_list: group ordered list + item-8 at level 3: list: group ordered list item-9 at level 4: list_item: First item in ordered list item-10 at level 4: list_item: Second item in ordered list \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/example_02.html.json b/tests/data/groundtruth/docling_v2/example_02.html.json index bfbf9ec..786a26c 100644 --- a/tests/data/groundtruth/docling_v2/example_02.html.json +++ b/tests/data/groundtruth/docling_v2/example_02.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "example_02", "origin": { "mimetype": "text/html", @@ -58,7 +58,7 @@ ], "content_layer": "body", "name": "ordered list", - "label": "ordered_list" + "label": "list" } ], "texts": [ @@ -140,7 +140,7 @@ "orig": "First item in unordered list", "text": "First item in unordered list", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/5", @@ -154,7 +154,7 @@ "orig": "Second item in unordered list", "text": "Second item in unordered list", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/6", @@ -168,7 +168,7 @@ "orig": "First item in ordered list", "text": "First item in ordered list", "enumerated": true, - "marker": "1." + "marker": "" }, { "self_ref": "#/texts/7", @@ -182,7 +182,7 @@ "orig": "Second item in ordered list", "text": "Second item in ordered list", "enumerated": true, - "marker": "2." 
+ "marker": "" } ], "pictures": [], diff --git a/tests/data/groundtruth/docling_v2/example_03.html.itxt b/tests/data/groundtruth/docling_v2/example_03.html.itxt index 5b31bf8..97285a1 100644 --- a/tests/data/groundtruth/docling_v2/example_03.html.itxt +++ b/tests/data/groundtruth/docling_v2/example_03.html.itxt @@ -10,9 +10,9 @@ item-0 at level 0: unspecified: group _root_ item-9 at level 6: list_item: Nested item 1 item-10 at level 6: list_item: Nested item 2 item-11 at level 4: list_item: Second item in unordered list - item-12 at level 3: ordered_list: group ordered list + item-12 at level 3: list: group ordered list item-13 at level 4: list_item: First item in ordered list - item-14 at level 5: ordered_list: group ordered list + item-14 at level 5: list: group ordered list item-15 at level 6: list_item: Nested ordered item 1 item-16 at level 6: list_item: Nested ordered item 2 item-17 at level 4: list_item: Second item in ordered list diff --git a/tests/data/groundtruth/docling_v2/example_03.html.json b/tests/data/groundtruth/docling_v2/example_03.html.json index 6a9fea2..8c8f91b 100644 --- a/tests/data/groundtruth/docling_v2/example_03.html.json +++ b/tests/data/groundtruth/docling_v2/example_03.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "example_03", "origin": { "mimetype": "text/html", @@ -75,7 +75,7 @@ ], "content_layer": "body", "name": "ordered list", - "label": "ordered_list" + "label": "list" }, { "self_ref": "#/groups/3", @@ -92,7 +92,7 @@ ], "content_layer": "body", "name": "ordered list", - "label": "ordered_list" + "label": "list" } ], "texts": [ @@ -198,7 +198,7 @@ "orig": "First item in unordered list", "text": "First item in unordered list", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/6", @@ -212,7 +212,7 @@ "orig": "Nested item 1", "text": "Nested item 1", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/7", @@ -226,7 +226,7 @@ "orig": "Nested item 2", "text": "Nested item 2", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/8", @@ -240,7 +240,7 @@ "orig": "Second item in unordered list", "text": "Second item in unordered list", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/9", @@ -258,7 +258,7 @@ "orig": "First item in ordered list", "text": "First item in ordered list", "enumerated": true, - "marker": "1" + "marker": "" }, { "self_ref": "#/texts/10", @@ -272,7 +272,7 @@ "orig": "Nested ordered item 1", "text": "Nested ordered item 1", "enumerated": true, - "marker": "1." + "marker": "" }, { "self_ref": "#/texts/11", @@ -286,7 +286,7 @@ "orig": "Nested ordered item 2", "text": "Nested ordered item 2", "enumerated": true, - "marker": "2." + "marker": "" }, { "self_ref": "#/texts/12", @@ -300,7 +300,7 @@ "orig": "Second item in ordered list", "text": "Second item in ordered list", "enumerated": true, - "marker": "2." 
+ "marker": "" }, { "self_ref": "#/texts/13", diff --git a/tests/data/groundtruth/docling_v2/example_04.html.json b/tests/data/groundtruth/docling_v2/example_04.html.json index 40273c4..e65fe01 100644 --- a/tests/data/groundtruth/docling_v2/example_04.html.json +++ b/tests/data/groundtruth/docling_v2/example_04.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "example_04", "origin": { "mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/example_05.html.json b/tests/data/groundtruth/docling_v2/example_05.html.json index e37e43f..7502cf0 100644 --- a/tests/data/groundtruth/docling_v2/example_05.html.json +++ b/tests/data/groundtruth/docling_v2/example_05.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "example_05", "origin": { "mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/example_06.html.json b/tests/data/groundtruth/docling_v2/example_06.html.json index aed598d..5b0e69b 100644 --- a/tests/data/groundtruth/docling_v2/example_06.html.json +++ b/tests/data/groundtruth/docling_v2/example_06.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "example_06", "origin": { "mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/example_07.html.json b/tests/data/groundtruth/docling_v2/example_07.html.json index ac26ba1..ad6b710 100644 --- a/tests/data/groundtruth/docling_v2/example_07.html.json +++ b/tests/data/groundtruth/docling_v2/example_07.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "example_07", "origin": { "mimetype": "text/html", @@ -169,7 +169,7 @@ "orig": "Asia", "text": "Asia", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/1", @@ -183,7 +183,7 @@ "orig": "China", "text": "China", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/2", @@ -197,7 +197,7 @@ "orig": "Japan", "text": "Japan", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/3", @@ -211,7 +211,7 @@ "orig": "Thailand", "text": "Thailand", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/4", @@ -229,7 +229,7 @@ "orig": "Europe", "text": "Europe", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/5", @@ -243,7 +243,7 @@ "orig": "UK", "text": "UK", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/6", @@ -257,7 +257,7 @@ "orig": "Germany", "text": "Germany", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/7", @@ -275,7 +275,7 @@ "orig": "Switzerland", "text": "Switzerland", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/8", @@ -289,7 +289,7 @@ "orig": "Bern", "text": "Bern", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/9", @@ -303,7 +303,7 @@ "orig": "Aargau", "text": "Aargau", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/10", @@ -321,7 +321,7 @@ "orig": "Italy", "text": "Italy", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/11", @@ -335,7 +335,7 @@ "orig": "Piedmont", "text": "Piedmont", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/12", @@ -349,7 +349,7 @@ "orig": "Liguria", "text": "Liguria", "enumerated": false, - "marker": "-" + "marker": "" }, { 
"self_ref": "#/texts/13", @@ -363,7 +363,7 @@ "orig": "Africa", "text": "Africa", "enumerated": false, - "marker": "-" + "marker": "" } ], "pictures": [], diff --git a/tests/data/groundtruth/docling_v2/example_08.html.json b/tests/data/groundtruth/docling_v2/example_08.html.json index 15ac428..02fde3b 100644 --- a/tests/data/groundtruth/docling_v2/example_08.html.json +++ b/tests/data/groundtruth/docling_v2/example_08.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "example_08", "origin": { "mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml b/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml index f04fa50..801d2b7 100644 --- a/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml +++ b/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml @@ -56,7 +56,7 @@ groups: - $ref: '#/texts/27' - $ref: '#/texts/28' content_layer: body - label: ordered_list + label: list name: list parent: $ref: '#/body' @@ -430,7 +430,7 @@ texts: content_layer: body enumerated: true label: list_item - marker: '-' + marker: '' orig: '' parent: $ref: '#/groups/2' @@ -476,7 +476,7 @@ texts: content_layer: body enumerated: true label: list_item - marker: '-' + marker: '' orig: '' parent: $ref: '#/groups/2' @@ -519,7 +519,7 @@ texts: content_layer: body enumerated: true label: list_item - marker: '-' + marker: '' orig: '' parent: $ref: '#/groups/2' @@ -562,7 +562,7 @@ texts: content_layer: body enumerated: true label: list_item - marker: '-' + marker: '' orig: '' parent: $ref: '#/groups/2' @@ -604,7 +604,7 @@ texts: content_layer: body enumerated: true label: list_item - marker: '-' + marker: '' orig: Open a Pull Request parent: $ref: '#/groups/2' @@ -621,7 +621,7 @@ texts: strikethrough: false underline: false label: list_item - marker: '-' + marker: '' orig: Whole list item has same formatting parent: $ref: '#/groups/2' @@ -633,7 +633,7 @@ texts: content_layer: body enumerated: true label: list_item - marker: '-' + marker: '' orig: '' parent: $ref: '#/groups/2' @@ -693,7 +693,7 @@ texts: content_layer: body enumerated: false label: list_item - marker: '-' + marker: '' orig: '' parent: $ref: '#/groups/8' @@ -729,7 +729,7 @@ texts: content_layer: body enumerated: false label: list_item - marker: '-' + marker: '' orig: '' parent: $ref: '#/groups/8' @@ -878,4 +878,4 @@ texts: prov: [] self_ref: '#/texts/48' text: Table Heading -version: 1.4.0 +version: 1.5.0 diff --git a/tests/data/groundtruth/docling_v2/ipa20180000016.json b/tests/data/groundtruth/docling_v2/ipa20180000016.json index 835f3ef..a2c75ea 100644 --- a/tests/data/groundtruth/docling_v2/ipa20180000016.json +++ b/tests/data/groundtruth/docling_v2/ipa20180000016.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "ipa20180000016.xml", "origin": { "mimetype": "application/xml", diff --git a/tests/data/groundtruth/docling_v2/ipa20200022300.json b/tests/data/groundtruth/docling_v2/ipa20200022300.json index 1b86290..815d79d 100644 --- a/tests/data/groundtruth/docling_v2/ipa20200022300.json +++ b/tests/data/groundtruth/docling_v2/ipa20200022300.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "ipa20200022300.xml", "origin": { "mimetype": "application/xml", diff --git a/tests/data/groundtruth/docling_v2/lorem_ipsum.docx.json b/tests/data/groundtruth/docling_v2/lorem_ipsum.docx.json index 866513a..995e0d7 
100644 --- a/tests/data/groundtruth/docling_v2/lorem_ipsum.docx.json +++ b/tests/data/groundtruth/docling_v2/lorem_ipsum.docx.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "lorem_ipsum", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", diff --git a/tests/data/groundtruth/docling_v2/multi_page.json b/tests/data/groundtruth/docling_v2/multi_page.json index 812b130..32c0eb9 100644 --- a/tests/data/groundtruth/docling_v2/multi_page.json +++ b/tests/data/groundtruth/docling_v2/multi_page.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "multi_page", "origin": { "mimetype": "application/pdf", @@ -534,7 +534,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/8", @@ -565,7 +565,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/9", @@ -684,7 +684,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/13", @@ -715,7 +715,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/14", @@ -834,7 +834,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/18", @@ -865,7 +865,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/19", @@ -896,7 +896,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/20", @@ -1074,7 +1074,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/26", @@ -1105,7 +1105,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/27", @@ -1136,7 +1136,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/28", @@ -1226,7 +1226,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/31", @@ -1257,7 +1257,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/32", @@ -1288,7 +1288,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/33", @@ -1319,7 +1319,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/34", @@ -1350,7 +1350,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/35", @@ -1440,7 +1440,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/38", @@ -1471,7 +1471,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/39", @@ -1502,7 +1502,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/40", @@ -1592,7 +1592,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/43", @@ -1623,7 +1623,7 @@ "formatting": null, "hyperlink": null, 
"enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/44", @@ -1654,7 +1654,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/45", @@ -1685,7 +1685,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/46", @@ -1716,7 +1716,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/47", @@ -1806,7 +1806,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/50", @@ -1837,7 +1837,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/51", @@ -1868,7 +1868,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/52", diff --git a/tests/data/groundtruth/docling_v2/pa20010031492.json b/tests/data/groundtruth/docling_v2/pa20010031492.json index 6186e47..b361265 100644 --- a/tests/data/groundtruth/docling_v2/pa20010031492.json +++ b/tests/data/groundtruth/docling_v2/pa20010031492.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "pa20010031492.xml", "origin": { "mimetype": "application/xml", diff --git a/tests/data/groundtruth/docling_v2/pftaps057006474.json b/tests/data/groundtruth/docling_v2/pftaps057006474.json index 94f5bd9..58cf609 100644 --- a/tests/data/groundtruth/docling_v2/pftaps057006474.json +++ b/tests/data/groundtruth/docling_v2/pftaps057006474.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "pftaps057006474.txt", "origin": { "mimetype": "text/plain", diff --git a/tests/data/groundtruth/docling_v2/pg06442728.json b/tests/data/groundtruth/docling_v2/pg06442728.json index c4fa375..9ae8a83 100644 --- a/tests/data/groundtruth/docling_v2/pg06442728.json +++ b/tests/data/groundtruth/docling_v2/pg06442728.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "pg06442728.xml", "origin": { "mimetype": "application/xml", diff --git a/tests/data/groundtruth/docling_v2/picture_classification.json b/tests/data/groundtruth/docling_v2/picture_classification.json index 908274e..cf6ff79 100644 --- a/tests/data/groundtruth/docling_v2/picture_classification.json +++ b/tests/data/groundtruth/docling_v2/picture_classification.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "picture_classification", "origin": { "mimetype": "application/pdf", diff --git a/tests/data/groundtruth/docling_v2/powerpoint_bad_text.pptx.json b/tests/data/groundtruth/docling_v2/powerpoint_bad_text.pptx.json index c4d08e4..4806fe4 100644 --- a/tests/data/groundtruth/docling_v2/powerpoint_bad_text.pptx.json +++ b/tests/data/groundtruth/docling_v2/powerpoint_bad_text.pptx.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "powerpoint_bad_text", "origin": { "mimetype": "application/vnd.ms-powerpoint", diff --git a/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.itxt b/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.itxt index ba86c3b..dcc5f01 100644 --- a/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.itxt +++ b/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.itxt @@ -11,7 +11,7 @@ item-0 
at level 0: unspecified: group _root_ item-10 at level 2: paragraph: And baz things item-11 at level 2: paragraph: A rectangle shape with this text inside. item-12 at level 1: chapter: group slide-2 - item-13 at level 2: ordered_list: group list + item-13 at level 2: list: group list item-14 at level 3: list_item: List item4 item-15 at level 3: list_item: List item5 item-16 at level 3: list_item: List item6 @@ -25,7 +25,7 @@ item-0 at level 0: unspecified: group _root_ item-24 at level 3: list_item: Item A item-25 at level 3: list_item: Item B item-26 at level 2: paragraph: Maybe a list? - item-27 at level 2: ordered_list: group list + item-27 at level 2: list: group list item-28 at level 3: list_item: List1 item-29 at level 3: list_item: List2 item-30 at level 3: list_item: List3 diff --git a/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.json b/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.json index 88a82ae..6cb9a6a 100644 --- a/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.json +++ b/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "powerpoint_sample", "origin": { "mimetype": "application/vnd.ms-powerpoint", @@ -137,7 +137,7 @@ ], "content_layer": "body", "name": "list", - "label": "ordered_list" + "label": "list" }, { "self_ref": "#/groups/4", @@ -197,7 +197,7 @@ ], "content_layer": "body", "name": "list", - "label": "ordered_list" + "label": "list" }, { "self_ref": "#/groups/7", @@ -578,7 +578,7 @@ "orig": "I1", "text": "I1", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/13", @@ -607,7 +607,7 @@ "orig": "I2", "text": "I2", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/14", @@ -636,7 +636,7 @@ "orig": "I3", "text": "I3", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/15", @@ -665,7 +665,7 @@ "orig": "I4", "text": "I4", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/16", @@ -721,7 +721,7 @@ "orig": "Item A", "text": "Item A", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/18", @@ -750,7 +750,7 @@ "orig": "Item B", "text": "Item B", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/19", @@ -893,7 +893,7 @@ "orig": "l1 ", "text": "l1 ", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/24", @@ -922,7 +922,7 @@ "orig": "l2", "text": "l2", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/25", @@ -951,7 +951,7 @@ "orig": "l3", "text": "l3", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/26", diff --git a/tests/data/groundtruth/docling_v2/powerpoint_with_image.pptx.json b/tests/data/groundtruth/docling_v2/powerpoint_with_image.pptx.json index ffc77b6..126fca9 100644 --- a/tests/data/groundtruth/docling_v2/powerpoint_with_image.pptx.json +++ b/tests/data/groundtruth/docling_v2/powerpoint_with_image.pptx.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "powerpoint_with_image", "origin": { "mimetype": "application/vnd.ms-powerpoint", diff --git a/tests/data/groundtruth/docling_v2/redp5110_sampled.json b/tests/data/groundtruth/docling_v2/redp5110_sampled.json index 5fb4ed2..8780723 100644 --- a/tests/data/groundtruth/docling_v2/redp5110_sampled.json +++ b/tests/data/groundtruth/docling_v2/redp5110_sampled.json @@ -1,6 +1,6 
@@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "redp5110_sampled", "origin": { "mimetype": "application/pdf", @@ -1295,7 +1295,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/15", @@ -1326,7 +1326,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/16", @@ -1357,7 +1357,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/17", @@ -1388,7 +1388,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/18", @@ -1683,7 +1683,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/28", @@ -1714,7 +1714,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/29", @@ -1745,7 +1745,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/30", @@ -1776,7 +1776,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/31", @@ -1807,7 +1807,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/32", @@ -1838,7 +1838,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/33", @@ -1869,7 +1869,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/34", @@ -1900,7 +1900,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/35", @@ -1931,7 +1931,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/36", @@ -2400,7 +2400,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/52", @@ -2431,7 +2431,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/53", @@ -2462,7 +2462,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/54", @@ -2668,7 +2668,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/61", @@ -2699,7 +2699,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/62", @@ -2759,7 +2759,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/64", @@ -3344,7 +3344,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/84", @@ -3375,7 +3375,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/85", @@ -3406,7 +3406,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/86", @@ -5992,7 +5992,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/175", @@ -6023,7 +6023,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - 
"marker": "-" + "marker": "" }, { "self_ref": "#/texts/176", @@ -6054,7 +6054,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/177", @@ -6085,7 +6085,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/178", @@ -6116,7 +6116,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/179", @@ -6787,7 +6787,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/202", @@ -6818,7 +6818,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/203", @@ -6849,7 +6849,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/204", @@ -7064,7 +7064,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/211", @@ -7095,7 +7095,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/212", @@ -7126,7 +7126,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/213", @@ -7157,7 +7157,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/214", @@ -7188,7 +7188,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/215", @@ -7219,7 +7219,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/216", @@ -7379,7 +7379,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/221", @@ -7498,7 +7498,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/225", @@ -7559,7 +7559,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/227", @@ -7590,7 +7590,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/228", @@ -7737,7 +7737,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/233", @@ -7855,7 +7855,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/237", @@ -7915,7 +7915,7 @@ "formatting": null, "hyperlink": null, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/239", diff --git a/tests/data/groundtruth/docling_v2/right_to_left_01.json b/tests/data/groundtruth/docling_v2/right_to_left_01.json index d109834..568cc62 100644 --- a/tests/data/groundtruth/docling_v2/right_to_left_01.json +++ b/tests/data/groundtruth/docling_v2/right_to_left_01.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "right_to_left_01", "origin": { "mimetype": "application/pdf", diff --git a/tests/data/groundtruth/docling_v2/right_to_left_02.json b/tests/data/groundtruth/docling_v2/right_to_left_02.json index 3960d24..181691d 100644 --- a/tests/data/groundtruth/docling_v2/right_to_left_02.json +++ b/tests/data/groundtruth/docling_v2/right_to_left_02.json @@ -1,6 
+1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "right_to_left_02", "origin": { "mimetype": "application/pdf", diff --git a/tests/data/groundtruth/docling_v2/right_to_left_03.json b/tests/data/groundtruth/docling_v2/right_to_left_03.json index 2cfdcc7..a3a36fd 100644 --- a/tests/data/groundtruth/docling_v2/right_to_left_03.json +++ b/tests/data/groundtruth/docling_v2/right_to_left_03.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "right_to_left_03", "origin": { "mimetype": "application/pdf", diff --git a/tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.json b/tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.json index b798298..18c3d46 100644 --- a/tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.json +++ b/tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "sample_sales_data", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", diff --git a/tests/data/groundtruth/docling_v2/tablecell.docx.json b/tests/data/groundtruth/docling_v2/tablecell.docx.json index ac1473d..be5af05 100644 --- a/tests/data/groundtruth/docling_v2/tablecell.docx.json +++ b/tests/data/groundtruth/docling_v2/tablecell.docx.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "tablecell", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", @@ -82,7 +82,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/1", @@ -103,7 +103,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/2", diff --git a/tests/data/groundtruth/docling_v2/test-01.xlsx.json b/tests/data/groundtruth/docling_v2/test-01.xlsx.json index 2a23dc4..ea2ab55 100644 --- a/tests/data/groundtruth/docling_v2/test-01.xlsx.json +++ b/tests/data/groundtruth/docling_v2/test-01.xlsx.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "test-01", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", diff --git a/tests/data/groundtruth/docling_v2/test_emf_docx.docx.json b/tests/data/groundtruth/docling_v2/test_emf_docx.docx.json index 88a1002..b24ff85 100644 --- a/tests/data/groundtruth/docling_v2/test_emf_docx.docx.json +++ b/tests/data/groundtruth/docling_v2/test_emf_docx.docx.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "test_emf_docx", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", diff --git a/tests/data/groundtruth/docling_v2/textbox.docx.itxt b/tests/data/groundtruth/docling_v2/textbox.docx.itxt index fa4ae97..4558be5 100644 --- a/tests/data/groundtruth/docling_v2/textbox.docx.itxt +++ b/tests/data/groundtruth/docling_v2/textbox.docx.itxt @@ -29,64 +29,62 @@ item-0 at level 0: unspecified: group _root_ item-24 at level 3: list_item: A report must also be submitted ... d Infectious Disease Reporting System. 
item-25 at level 2: paragraph: item-26 at level 1: list: group list - item-27 at level 2: list_item: + item-27 at level 1: paragraph: item-28 at level 1: paragraph: item-29 at level 1: paragraph: item-30 at level 1: paragraph: item-31 at level 1: paragraph: - item-32 at level 1: paragraph: - item-33 at level 1: section: group textbox - item-34 at level 2: paragraph: Health Bureau: - item-35 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control. - item-36 at level 2: list: group list - item-37 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection. - item-38 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act. - item-39 at level 2: paragraph: - item-40 at level 1: list: group list - item-41 at level 2: list_item: - item-42 at level 1: paragraph: - item-43 at level 1: section: group textbox - item-44 at level 2: paragraph: Department of Education: + item-32 at level 1: section: group textbox + item-33 at level 2: paragraph: Health Bureau: + item-34 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control. + item-35 at level 2: list: group list + item-36 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection. + item-37 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act. + item-38 at level 2: paragraph: + item-39 at level 1: list: group list + item-40 at level 1: paragraph: + item-41 at level 1: section: group textbox + item-42 at level 2: paragraph: Department of Education: Collabo ... vention measures at all school levels. + item-43 at level 1: paragraph: + item-44 at level 1: paragraph: item-45 at level 1: paragraph: item-46 at level 1: paragraph: item-47 at level 1: paragraph: item-48 at level 1: paragraph: item-49 at level 1: paragraph: - item-50 at level 1: paragraph: - item-51 at level 1: paragraph: - item-52 at level 1: section: group textbox - item-53 at level 2: inline: group group - item-54 at level 3: paragraph: The Health Bureau will handle - item-55 at level 3: paragraph: reporting and specimen collection - item-56 at level 3: paragraph: . - item-57 at level 2: paragraph: + item-50 at level 1: section: group textbox + item-51 at level 2: inline: group group + item-52 at level 3: paragraph: The Health Bureau will handle + item-53 at level 3: paragraph: reporting and specimen collection + item-54 at level 3: paragraph: . + item-55 at level 2: paragraph: + item-56 at level 1: paragraph: + item-57 at level 1: paragraph: item-58 at level 1: paragraph: - item-59 at level 1: paragraph: - item-60 at level 1: paragraph: - item-61 at level 1: section: group textbox - item-62 at level 2: paragraph: Whether the epidemic has eased. - item-63 at level 2: paragraph: - item-64 at level 1: paragraph: - item-65 at level 1: section: group textbox - item-66 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease. - item-67 at level 2: paragraph: No - item-68 at level 1: paragraph: - item-69 at level 1: paragraph: - item-70 at level 1: section: group textbox - item-71 at level 2: paragraph: Yes - item-72 at level 1: paragraph: - item-73 at level 1: section: group textbox - item-74 at level 2: paragraph: Yes - item-75 at level 1: paragraph: - item-76 at level 1: paragraph: - item-77 at level 1: section: group textbox - item-78 at level 2: paragraph: Case closed. 
- item-79 at level 2: paragraph: - item-80 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary. - item-81 at level 1: paragraph: - item-82 at level 1: section: group textbox - item-83 at level 2: paragraph: No - item-84 at level 1: paragraph: - item-85 at level 1: paragraph: - item-86 at level 1: paragraph: \ No newline at end of file + item-59 at level 1: section: group textbox + item-60 at level 2: paragraph: Whether the epidemic has eased. + item-61 at level 2: paragraph: + item-62 at level 1: paragraph: + item-63 at level 1: section: group textbox + item-64 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease. + item-65 at level 2: paragraph: No + item-66 at level 1: paragraph: + item-67 at level 1: paragraph: + item-68 at level 1: section: group textbox + item-69 at level 2: paragraph: Yes + item-70 at level 1: paragraph: + item-71 at level 1: section: group textbox + item-72 at level 2: paragraph: Yes + item-73 at level 1: paragraph: + item-74 at level 1: paragraph: + item-75 at level 1: section: group textbox + item-76 at level 2: paragraph: Case closed. + item-77 at level 2: paragraph: + item-78 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary. + item-79 at level 1: paragraph: + item-80 at level 1: section: group textbox + item-81 at level 2: paragraph: No + item-82 at level 1: paragraph: + item-83 at level 1: paragraph: + item-84 at level 1: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/textbox.docx.json b/tests/data/groundtruth/docling_v2/textbox.docx.json index 1e91f06..9300c93 100644 --- a/tests/data/groundtruth/docling_v2/textbox.docx.json +++ b/tests/data/groundtruth/docling_v2/textbox.docx.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "textbox", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", @@ -65,6 +65,9 @@ { "$ref": "#/groups/6" }, + { + "$ref": "#/texts/19" + }, { "$ref": "#/texts/20" }, @@ -77,9 +80,6 @@ { "$ref": "#/texts/23" }, - { - "$ref": "#/texts/24" - }, { "$ref": "#/groups/7" }, @@ -87,11 +87,17 @@ "$ref": "#/groups/9" }, { - "$ref": "#/texts/31" + "$ref": "#/texts/29" }, { "$ref": "#/groups/10" }, + { + "$ref": "#/texts/31" + }, + { + "$ref": "#/texts/32" + }, { "$ref": "#/texts/33" }, @@ -107,71 +113,65 @@ { "$ref": "#/texts/37" }, - { - "$ref": "#/texts/38" - }, - { - "$ref": "#/texts/39" - }, { "$ref": "#/groups/11" }, + { + "$ref": "#/texts/42" + }, + { + "$ref": "#/texts/43" + }, { "$ref": "#/texts/44" }, - { - "$ref": "#/texts/45" - }, - { - "$ref": "#/texts/46" - }, { "$ref": "#/groups/13" }, { - "$ref": "#/texts/49" + "$ref": "#/texts/47" }, { "$ref": "#/groups/14" }, { - "$ref": "#/texts/52" + "$ref": "#/texts/50" }, { - "$ref": "#/texts/53" + "$ref": "#/texts/51" }, { "$ref": "#/groups/15" }, { - "$ref": "#/texts/55" + "$ref": "#/texts/53" }, { "$ref": "#/groups/16" }, { - "$ref": "#/texts/57" + "$ref": "#/texts/55" }, { - "$ref": "#/texts/58" + "$ref": "#/texts/56" }, { "$ref": "#/groups/17" }, { - "$ref": "#/texts/62" + "$ref": "#/texts/60" }, { "$ref": "#/groups/18" }, + { + "$ref": "#/texts/62" + }, + { + "$ref": "#/texts/63" + }, { "$ref": "#/texts/64" - }, - { - "$ref": "#/texts/65" - }, - { - "$ref": "#/texts/66" } ], "content_layer": "body", @@ -280,11 +280,7 @@ "parent": { "$ref": "#/body" }, - "children": [ - { - "$ref": "#/texts/19" - } - ], + 
"children": [], "content_layer": "body", "name": "list", "label": "list" @@ -296,16 +292,16 @@ }, "children": [ { - "$ref": "#/texts/25" + "$ref": "#/texts/24" }, { - "$ref": "#/texts/26" + "$ref": "#/texts/25" }, { "$ref": "#/groups/8" }, { - "$ref": "#/texts/29" + "$ref": "#/texts/28" } ], "content_layer": "body", @@ -319,10 +315,10 @@ }, "children": [ { - "$ref": "#/texts/27" + "$ref": "#/texts/26" }, { - "$ref": "#/texts/28" + "$ref": "#/texts/27" } ], "content_layer": "body", @@ -334,11 +330,7 @@ "parent": { "$ref": "#/body" }, - "children": [ - { - "$ref": "#/texts/30" - } - ], + "children": [], "content_layer": "body", "name": "list", "label": "list" @@ -350,7 +342,7 @@ }, "children": [ { - "$ref": "#/texts/32" + "$ref": "#/texts/30" } ], "content_layer": "body", @@ -367,7 +359,7 @@ "$ref": "#/groups/12" }, { - "$ref": "#/texts/43" + "$ref": "#/texts/41" } ], "content_layer": "body", @@ -380,14 +372,14 @@ "$ref": "#/groups/11" }, "children": [ + { + "$ref": "#/texts/38" + }, + { + "$ref": "#/texts/39" + }, { "$ref": "#/texts/40" - }, - { - "$ref": "#/texts/41" - }, - { - "$ref": "#/texts/42" } ], "content_layer": "body", @@ -401,10 +393,10 @@ }, "children": [ { - "$ref": "#/texts/47" + "$ref": "#/texts/45" }, { - "$ref": "#/texts/48" + "$ref": "#/texts/46" } ], "content_layer": "body", @@ -418,10 +410,10 @@ }, "children": [ { - "$ref": "#/texts/50" + "$ref": "#/texts/48" }, { - "$ref": "#/texts/51" + "$ref": "#/texts/49" } ], "content_layer": "body", @@ -435,7 +427,7 @@ }, "children": [ { - "$ref": "#/texts/54" + "$ref": "#/texts/52" } ], "content_layer": "body", @@ -449,7 +441,7 @@ }, "children": [ { - "$ref": "#/texts/56" + "$ref": "#/texts/54" } ], "content_layer": "body", @@ -462,14 +454,14 @@ "$ref": "#/body" }, "children": [ + { + "$ref": "#/texts/57" + }, + { + "$ref": "#/texts/58" + }, { "$ref": "#/texts/59" - }, - { - "$ref": "#/texts/60" - }, - { - "$ref": "#/texts/61" } ], "content_layer": "body", @@ -483,7 +475,7 @@ }, "children": [ { - "$ref": "#/texts/63" + "$ref": "#/texts/61" } ], "content_layer": "body", @@ -592,7 +584,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/6", @@ -747,7 +739,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/17", @@ -768,7 +760,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/18", @@ -785,16 +777,14 @@ { "self_ref": "#/texts/19", "parent": { - "$ref": "#/groups/6" + "$ref": "#/body" }, "children": [], "content_layer": "body", - "label": "list_item", + "label": "paragraph", "prov": [], "orig": "", - "text": "", - "enumerated": false, - "marker": "-" + "text": "" }, { "self_ref": "#/texts/20", @@ -846,18 +836,6 @@ }, { "self_ref": "#/texts/24", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/25", "parent": { "$ref": "#/groups/7" }, @@ -876,7 +854,7 @@ } }, { - "self_ref": "#/texts/26", + "self_ref": "#/texts/25", "parent": { "$ref": "#/groups/7" }, @@ -895,7 +873,7 @@ } }, { - "self_ref": "#/texts/27", + "self_ref": "#/texts/26", "parent": { "$ref": "#/groups/8" }, @@ -913,10 +891,10 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { - "self_ref": "#/texts/28", + "self_ref": "#/texts/27", "parent": { "$ref": "#/groups/8" }, @@ -934,10 +912,10 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + 
"marker": "" }, { - "self_ref": "#/texts/29", + "self_ref": "#/texts/28", "parent": { "$ref": "#/groups/7" }, @@ -949,21 +927,7 @@ "text": "" }, { - "self_ref": "#/texts/30", - "parent": { - "$ref": "#/groups/9" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [], - "orig": "", - "text": "", - "enumerated": false, - "marker": "-" - }, - { - "self_ref": "#/texts/31", + "self_ref": "#/texts/29", "parent": { "$ref": "#/body" }, @@ -975,7 +939,7 @@ "text": "" }, { - "self_ref": "#/texts/32", + "self_ref": "#/texts/30", "parent": { "$ref": "#/groups/10" }, @@ -993,6 +957,30 @@ "script": "baseline" } }, + { + "self_ref": "#/texts/31", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/32", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/33", "parent": { @@ -1055,30 +1043,6 @@ }, { "self_ref": "#/texts/38", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/39", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/40", "parent": { "$ref": "#/groups/12" }, @@ -1097,7 +1061,7 @@ } }, { - "self_ref": "#/texts/41", + "self_ref": "#/texts/39", "parent": { "$ref": "#/groups/12" }, @@ -1116,7 +1080,7 @@ } }, { - "self_ref": "#/texts/42", + "self_ref": "#/texts/40", "parent": { "$ref": "#/groups/12" }, @@ -1135,7 +1099,7 @@ } }, { - "self_ref": "#/texts/43", + "self_ref": "#/texts/41", "parent": { "$ref": "#/groups/11" }, @@ -1146,6 +1110,30 @@ "orig": "", "text": "" }, + { + "self_ref": "#/texts/42", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/43", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/44", "parent": { @@ -1160,30 +1148,6 @@ }, { "self_ref": "#/texts/45", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/46", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/47", "parent": { "$ref": "#/groups/13" }, @@ -1202,7 +1166,7 @@ } }, { - "self_ref": "#/texts/48", + "self_ref": "#/texts/46", "parent": { "$ref": "#/groups/13" }, @@ -1214,7 +1178,7 @@ "text": "" }, { - "self_ref": "#/texts/49", + "self_ref": "#/texts/47", "parent": { "$ref": "#/body" }, @@ -1226,7 +1190,7 @@ "text": "" }, { - "self_ref": "#/texts/50", + "self_ref": "#/texts/48", "parent": { "$ref": "#/groups/14" }, @@ -1245,7 +1209,7 @@ } }, { - "self_ref": "#/texts/51", + "self_ref": "#/texts/49", "parent": { "$ref": "#/groups/14" }, @@ -1264,7 +1228,7 @@ } }, { - "self_ref": "#/texts/52", + "self_ref": "#/texts/50", "parent": { "$ref": "#/body" }, @@ -1275,6 +1239,37 @@ "orig": "", "text": "" }, + { + "self_ref": "#/texts/51", + "parent": { + "$ref": "#/body" + }, + "children": [], + 
"content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/52", + "parent": { + "$ref": "#/groups/15" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Yes", + "text": "Yes", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, { "self_ref": "#/texts/53", "parent": { @@ -1290,7 +1285,7 @@ { "self_ref": "#/texts/54", "parent": { - "$ref": "#/groups/15" + "$ref": "#/groups/16" }, "children": [], "content_layer": "body", @@ -1321,48 +1316,17 @@ { "self_ref": "#/texts/56", "parent": { - "$ref": "#/groups/16" + "$ref": "#/body" }, "children": [], "content_layer": "body", "label": "paragraph", "prov": [], - "orig": "Yes", - "text": "Yes", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - } + "orig": "", + "text": "" }, { "self_ref": "#/texts/57", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/58", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/59", "parent": { "$ref": "#/groups/17" }, @@ -1381,7 +1345,7 @@ } }, { - "self_ref": "#/texts/60", + "self_ref": "#/texts/58", "parent": { "$ref": "#/groups/17" }, @@ -1393,7 +1357,7 @@ "text": "" }, { - "self_ref": "#/texts/61", + "self_ref": "#/texts/59", "parent": { "$ref": "#/groups/17" }, @@ -1411,6 +1375,37 @@ "script": "baseline" } }, + { + "self_ref": "#/texts/60", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/61", + "parent": { + "$ref": "#/groups/18" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "No", + "text": "No", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, { "self_ref": "#/texts/62", "parent": { @@ -1426,21 +1421,14 @@ { "self_ref": "#/texts/63", "parent": { - "$ref": "#/groups/18" + "$ref": "#/body" }, "children": [], "content_layer": "body", "label": "paragraph", "prov": [], - "orig": "No", - "text": "No", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false, - "script": "baseline" - } + "orig": "", + "text": "" }, { "self_ref": "#/texts/64", @@ -1453,30 +1441,6 @@ "prov": [], "orig": "", "text": "" - }, - { - "self_ref": "#/texts/65", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/66", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" } ], "pictures": [], diff --git a/tests/data/groundtruth/docling_v2/unit_test_01.html.json b/tests/data/groundtruth/docling_v2/unit_test_01.html.json index 08669c2..3a118c8 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_01.html.json +++ b/tests/data/groundtruth/docling_v2/unit_test_01.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "unit_test_01", "origin": { 
"mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json index 4173fc6..2e564e6 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json +++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "unit_test_formatting", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", @@ -429,7 +429,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/14", @@ -450,7 +450,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/15", @@ -471,7 +471,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/16", @@ -489,7 +489,7 @@ "orig": "", "text": "", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/17", @@ -583,7 +583,7 @@ "orig": "", "text": "", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/22", diff --git a/tests/data/groundtruth/docling_v2/unit_test_headers.docx.json b/tests/data/groundtruth/docling_v2/unit_test_headers.docx.json index 32288fe..b70bf7b 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_headers.docx.json +++ b/tests/data/groundtruth/docling_v2/unit_test_headers.docx.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "unit_test_headers", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", diff --git a/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.json b/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.json index a0883e7..340c13d 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.json +++ b/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "unit_test_headers_numbered", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", diff --git a/tests/data/groundtruth/docling_v2/unit_test_lists.docx.json b/tests/data/groundtruth/docling_v2/unit_test_lists.docx.json index 2f0b928..e93085d 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_lists.docx.json +++ b/tests/data/groundtruth/docling_v2/unit_test_lists.docx.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "unit_test_lists", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", @@ -456,7 +456,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/9", @@ -477,7 +477,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/10", @@ -498,7 +498,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/11", @@ -551,7 +551,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/14", @@ -572,7 +572,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/15", @@ -593,7 +593,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": 
"" }, { "self_ref": "#/texts/16", @@ -646,7 +646,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/19", @@ -667,7 +667,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/20", @@ -688,7 +688,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/21", @@ -709,7 +709,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/22", @@ -730,7 +730,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/23", @@ -751,7 +751,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/24", @@ -804,7 +804,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/27", @@ -825,7 +825,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/28", @@ -846,7 +846,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/29", @@ -899,7 +899,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/32", @@ -920,7 +920,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/33", @@ -941,7 +941,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/34", @@ -962,7 +962,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/35", @@ -1021,7 +1021,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/38", @@ -1042,7 +1042,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/39", @@ -1063,7 +1063,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/40", @@ -1084,7 +1084,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/41", @@ -1105,7 +1105,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/42", @@ -1126,7 +1126,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/43", diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt b/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt index c0f5fdc..ca342ea 100644 --- a/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt +++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.itxt @@ -302,7 +302,7 @@ item-0 at level 0: unspecified: group _root_ item-288 at level 4: list_item: Rubber duck item-289 at level 2: section_header: Notes item-290 at level 3: section_header: Citations - item-291 at level 4: ordered_list: group ordered list + item-291 at level 4: list: group ordered list item-292 at level 5: list_item: ^ "Duckling". The American Herit ... n Company. 2006. Retrieved 2015-05-22. item-293 at level 5: list_item: ^ "Duckling". Kernerman English ... Ltd. 2000–2006. Retrieved 2015-05-22. item-294 at level 5: list_item: ^ Dohner, Janet Vorwald (2001). ... University Press. ISBN 978-0300138139. 
diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.json b/tests/data/groundtruth/docling_v2/wiki_duck.html.json index 952c96b..4a46406 100644 --- a/tests/data/groundtruth/docling_v2/wiki_duck.html.json +++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "wiki_duck", "origin": { "mimetype": "text/html", @@ -1309,7 +1309,7 @@ ], "content_layer": "body", "name": "ordered list", - "label": "ordered_list" + "label": "list" }, { "self_ref": "#/groups/40", @@ -1653,7 +1653,7 @@ "orig": "Main page", "text": "Main page", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/3", @@ -1667,7 +1667,7 @@ "orig": "Contents", "text": "Contents", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/4", @@ -1681,7 +1681,7 @@ "orig": "Current events", "text": "Current events", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/5", @@ -1695,7 +1695,7 @@ "orig": "Random article", "text": "Random article", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/6", @@ -1709,7 +1709,7 @@ "orig": "About Wikipedia", "text": "About Wikipedia", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/7", @@ -1723,7 +1723,7 @@ "orig": "Contact us", "text": "Contact us", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/8", @@ -1749,7 +1749,7 @@ "orig": "Help", "text": "Help", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/10", @@ -1763,7 +1763,7 @@ "orig": "Learn to edit", "text": "Learn to edit", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/11", @@ -1777,7 +1777,7 @@ "orig": "Community portal", "text": "Community portal", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/12", @@ -1791,7 +1791,7 @@ "orig": "Recent changes", "text": "Recent changes", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/13", @@ -1805,7 +1805,7 @@ "orig": "Upload file", "text": "Upload file", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/14", @@ -1819,7 +1819,7 @@ "orig": "Donate", "text": "Donate", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/15", @@ -1833,7 +1833,7 @@ "orig": "Create account", "text": "Create account", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/16", @@ -1847,7 +1847,7 @@ "orig": "Log in", "text": "Log in", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/17", @@ -1861,7 +1861,7 @@ "orig": "Create account", "text": "Create account", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/18", @@ -1875,7 +1875,7 @@ "orig": "Log in", "text": "Log in", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/19", @@ -1901,7 +1901,7 @@ "orig": "Contributions", "text": "Contributions", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/21", @@ -1915,7 +1915,7 @@ "orig": "Talk", "text": "Talk", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/22", @@ -1946,7 +1946,7 @@ "orig": "(Top)", "text": "(Top)", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/24", @@ -1964,7 +1964,7 @@ "orig": "1 Etymology", "text": "1 Etymology", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": 
"#/texts/25", @@ -1982,7 +1982,7 @@ "orig": "2 Taxonomy", "text": "2 Taxonomy", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/26", @@ -2000,7 +2000,7 @@ "orig": "3 Morphology", "text": "3 Morphology", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/27", @@ -2018,7 +2018,7 @@ "orig": "4 Distribution and habitat", "text": "4 Distribution and habitat", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/28", @@ -2036,7 +2036,7 @@ "orig": "5 Behaviour Toggle Behaviour subsection", "text": "5 Behaviour Toggle Behaviour subsection", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/29", @@ -2054,7 +2054,7 @@ "orig": "5.1 Feeding", "text": "5.1 Feeding", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/30", @@ -2072,7 +2072,7 @@ "orig": "5.2 Breeding", "text": "5.2 Breeding", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/31", @@ -2090,7 +2090,7 @@ "orig": "5.3 Communication", "text": "5.3 Communication", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/32", @@ -2108,7 +2108,7 @@ "orig": "5.4 Predators", "text": "5.4 Predators", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/33", @@ -2126,7 +2126,7 @@ "orig": "6 Relationship with humans Toggle Relationship with humans subsection", "text": "6 Relationship with humans Toggle Relationship with humans subsection", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/34", @@ -2144,7 +2144,7 @@ "orig": "6.1 Hunting", "text": "6.1 Hunting", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/35", @@ -2162,7 +2162,7 @@ "orig": "6.2 Domestication", "text": "6.2 Domestication", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/36", @@ -2180,7 +2180,7 @@ "orig": "6.3 Heraldry", "text": "6.3 Heraldry", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/37", @@ -2198,7 +2198,7 @@ "orig": "6.4 Cultural references", "text": "6.4 Cultural references", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/38", @@ -2216,7 +2216,7 @@ "orig": "7 See also", "text": "7 See also", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/39", @@ -2234,7 +2234,7 @@ "orig": "8 Notes Toggle Notes subsection", "text": "8 Notes Toggle Notes subsection", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/40", @@ -2252,7 +2252,7 @@ "orig": "8.1 Citations", "text": "8.1 Citations", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/41", @@ -2270,7 +2270,7 @@ "orig": "8.2 Sources", "text": "8.2 Sources", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/42", @@ -2288,7 +2288,7 @@ "orig": "9 External links", "text": "9 External links", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/43", @@ -2408,7 +2408,7 @@ "orig": "Acèh", "text": "Acèh", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/45", @@ -2422,7 +2422,7 @@ "orig": "Afrikaans", "text": "Afrikaans", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/46", @@ -2436,7 +2436,7 @@ "orig": "Alemannisch", "text": "Alemannisch", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/47", @@ -2450,7 +2450,7 @@ "orig": "አማርኛ", "text": "አማርኛ", "enumerated": false, - 
"marker": "-" + "marker": "" }, { "self_ref": "#/texts/48", @@ -2464,7 +2464,7 @@ "orig": "Ænglisc", "text": "Ænglisc", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/49", @@ -2478,7 +2478,7 @@ "orig": "العربية", "text": "العربية", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/50", @@ -2492,7 +2492,7 @@ "orig": "Aragonés", "text": "Aragonés", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/51", @@ -2506,7 +2506,7 @@ "orig": "ܐܪܡܝܐ", "text": "ܐܪܡܝܐ", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/52", @@ -2520,7 +2520,7 @@ "orig": "Armãneashti", "text": "Armãneashti", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/53", @@ -2534,7 +2534,7 @@ "orig": "Asturianu", "text": "Asturianu", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/54", @@ -2548,7 +2548,7 @@ "orig": "Atikamekw", "text": "Atikamekw", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/55", @@ -2562,7 +2562,7 @@ "orig": "Авар", "text": "Авар", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/56", @@ -2576,7 +2576,7 @@ "orig": "Aymar aru", "text": "Aymar aru", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/57", @@ -2590,7 +2590,7 @@ "orig": "تۆرکجه", "text": "تۆرکجه", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/58", @@ -2604,7 +2604,7 @@ "orig": "Basa Bali", "text": "Basa Bali", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/59", @@ -2618,7 +2618,7 @@ "orig": "বাংলা", "text": "বাংলা", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/60", @@ -2632,7 +2632,7 @@ "orig": "閩南語 / Bân-lâm-gú", "text": "閩南語 / Bân-lâm-gú", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/61", @@ -2646,7 +2646,7 @@ "orig": "Беларуская", "text": "Беларуская", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/62", @@ -2660,7 +2660,7 @@ "orig": "Беларуская (тарашкевіца)", "text": "Беларуская (тарашкевіца)", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/63", @@ -2674,7 +2674,7 @@ "orig": "Bikol Central", "text": "Bikol Central", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/64", @@ -2688,7 +2688,7 @@ "orig": "Български", "text": "Български", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/65", @@ -2702,7 +2702,7 @@ "orig": "Brezhoneg", "text": "Brezhoneg", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/66", @@ -2716,7 +2716,7 @@ "orig": "Буряад", "text": "Буряад", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/67", @@ -2730,7 +2730,7 @@ "orig": "Català", "text": "Català", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/68", @@ -2744,7 +2744,7 @@ "orig": "Чӑвашла", "text": "Чӑвашла", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/69", @@ -2758,7 +2758,7 @@ "orig": "Čeština", "text": "Čeština", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/70", @@ -2772,7 +2772,7 @@ "orig": "ChiShona", "text": "ChiShona", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/71", @@ -2786,7 +2786,7 @@ "orig": "Cymraeg", "text": "Cymraeg", "enumerated": false, - "marker": "-" + "marker": "" }, { 
"self_ref": "#/texts/72", @@ -2800,7 +2800,7 @@ "orig": "Dagbanli", "text": "Dagbanli", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/73", @@ -2814,7 +2814,7 @@ "orig": "Dansk", "text": "Dansk", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/74", @@ -2828,7 +2828,7 @@ "orig": "Deitsch", "text": "Deitsch", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/75", @@ -2842,7 +2842,7 @@ "orig": "Deutsch", "text": "Deutsch", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/76", @@ -2856,7 +2856,7 @@ "orig": "डोटेली", "text": "डोटेली", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/77", @@ -2870,7 +2870,7 @@ "orig": "Ελληνικά", "text": "Ελληνικά", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/78", @@ -2884,7 +2884,7 @@ "orig": "Emiliàn e rumagnòl", "text": "Emiliàn e rumagnòl", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/79", @@ -2898,7 +2898,7 @@ "orig": "Español", "text": "Español", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/80", @@ -2912,7 +2912,7 @@ "orig": "Esperanto", "text": "Esperanto", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/81", @@ -2926,7 +2926,7 @@ "orig": "Euskara", "text": "Euskara", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/82", @@ -2940,7 +2940,7 @@ "orig": "فارسی", "text": "فارسی", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/83", @@ -2954,7 +2954,7 @@ "orig": "Français", "text": "Français", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/84", @@ -2968,7 +2968,7 @@ "orig": "Gaeilge", "text": "Gaeilge", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/85", @@ -2982,7 +2982,7 @@ "orig": "Galego", "text": "Galego", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/86", @@ -2996,7 +2996,7 @@ "orig": "ГӀалгӀай", "text": "ГӀалгӀай", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/87", @@ -3010,7 +3010,7 @@ "orig": "贛語", "text": "贛語", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/88", @@ -3024,7 +3024,7 @@ "orig": "گیلکی", "text": "گیلکی", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/89", @@ -3038,7 +3038,7 @@ "orig": "𐌲𐌿𐍄𐌹𐍃𐌺", "text": "𐌲𐌿𐍄𐌹𐍃𐌺", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/90", @@ -3052,7 +3052,7 @@ "orig": "गोंयची कोंकणी / Gõychi Konknni", "text": "गोंयची कोंकणी / Gõychi Konknni", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/91", @@ -3066,7 +3066,7 @@ "orig": "客家語 / Hak-kâ-ngî", "text": "客家語 / Hak-kâ-ngî", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/92", @@ -3080,7 +3080,7 @@ "orig": "한국어", "text": "한국어", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/93", @@ -3094,7 +3094,7 @@ "orig": "Hausa", "text": "Hausa", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/94", @@ -3108,7 +3108,7 @@ "orig": "Հայերեն", "text": "Հայերեն", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/95", @@ -3122,7 +3122,7 @@ "orig": "हिन्दी", "text": "हिन्दी", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/96", @@ -3136,7 +3136,7 @@ "orig": 
"Hrvatski", "text": "Hrvatski", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/97", @@ -3150,7 +3150,7 @@ "orig": "Ido", "text": "Ido", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/98", @@ -3164,7 +3164,7 @@ "orig": "Bahasa Indonesia", "text": "Bahasa Indonesia", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/99", @@ -3178,7 +3178,7 @@ "orig": "Iñupiatun", "text": "Iñupiatun", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/100", @@ -3192,7 +3192,7 @@ "orig": "Íslenska", "text": "Íslenska", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/101", @@ -3206,7 +3206,7 @@ "orig": "Italiano", "text": "Italiano", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/102", @@ -3220,7 +3220,7 @@ "orig": "עברית", "text": "עברית", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/103", @@ -3234,7 +3234,7 @@ "orig": "Jawa", "text": "Jawa", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/104", @@ -3248,7 +3248,7 @@ "orig": "ಕನ್ನಡ", "text": "ಕನ್ನಡ", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/105", @@ -3262,7 +3262,7 @@ "orig": "Kapampangan", "text": "Kapampangan", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/106", @@ -3276,7 +3276,7 @@ "orig": "ქართული", "text": "ქართული", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/107", @@ -3290,7 +3290,7 @@ "orig": "कॉशुर / کٲشُر", "text": "कॉशुर / کٲشُر", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/108", @@ -3304,7 +3304,7 @@ "orig": "Қазақша", "text": "Қазақша", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/109", @@ -3318,7 +3318,7 @@ "orig": "Ikirundi", "text": "Ikirundi", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/110", @@ -3332,7 +3332,7 @@ "orig": "Kongo", "text": "Kongo", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/111", @@ -3346,7 +3346,7 @@ "orig": "Kreyòl ayisyen", "text": "Kreyòl ayisyen", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/112", @@ -3360,7 +3360,7 @@ "orig": "Кырык мары", "text": "Кырык мары", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/113", @@ -3374,7 +3374,7 @@ "orig": "ລາວ", "text": "ລາວ", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/114", @@ -3388,7 +3388,7 @@ "orig": "Latina", "text": "Latina", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/115", @@ -3402,7 +3402,7 @@ "orig": "Latviešu", "text": "Latviešu", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/116", @@ -3416,7 +3416,7 @@ "orig": "Lietuvių", "text": "Lietuvių", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/117", @@ -3430,7 +3430,7 @@ "orig": "Li Niha", "text": "Li Niha", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/118", @@ -3444,7 +3444,7 @@ "orig": "Ligure", "text": "Ligure", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/119", @@ -3458,7 +3458,7 @@ "orig": "Limburgs", "text": "Limburgs", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/120", @@ -3472,7 +3472,7 @@ "orig": "Lingála", "text": "Lingála", "enumerated": false, - "marker": 
"-" + "marker": "" }, { "self_ref": "#/texts/121", @@ -3486,7 +3486,7 @@ "orig": "Malagasy", "text": "Malagasy", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/122", @@ -3500,7 +3500,7 @@ "orig": "മലയാളം", "text": "മലയാളം", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/123", @@ -3514,7 +3514,7 @@ "orig": "मराठी", "text": "मराठी", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/124", @@ -3528,7 +3528,7 @@ "orig": "مازِرونی", "text": "مازِرونی", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/125", @@ -3542,7 +3542,7 @@ "orig": "Bahasa Melayu", "text": "Bahasa Melayu", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/126", @@ -3556,7 +3556,7 @@ "orig": "ꯃꯤꯇꯩ ꯂꯣꯟ", "text": "ꯃꯤꯇꯩ ꯂꯣꯟ", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/127", @@ -3570,7 +3570,7 @@ "orig": "閩東語 / Mìng-dĕ̤ng-ngṳ̄", "text": "閩東語 / Mìng-dĕ̤ng-ngṳ̄", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/128", @@ -3584,7 +3584,7 @@ "orig": "Мокшень", "text": "Мокшень", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/129", @@ -3598,7 +3598,7 @@ "orig": "Монгол", "text": "Монгол", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/130", @@ -3612,7 +3612,7 @@ "orig": "မြန်မာဘာသာ", "text": "မြန်မာဘာသာ", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/131", @@ -3626,7 +3626,7 @@ "orig": "Nederlands", "text": "Nederlands", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/132", @@ -3640,7 +3640,7 @@ "orig": "Nedersaksies", "text": "Nedersaksies", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/133", @@ -3654,7 +3654,7 @@ "orig": "नेपाली", "text": "नेपाली", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/134", @@ -3668,7 +3668,7 @@ "orig": "नेपाल भाषा", "text": "नेपाल भाषा", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/135", @@ -3682,7 +3682,7 @@ "orig": "日本語", "text": "日本語", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/136", @@ -3696,7 +3696,7 @@ "orig": "Нохчийн", "text": "Нохчийн", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/137", @@ -3710,7 +3710,7 @@ "orig": "Norsk nynorsk", "text": "Norsk nynorsk", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/138", @@ -3724,7 +3724,7 @@ "orig": "Occitan", "text": "Occitan", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/139", @@ -3738,7 +3738,7 @@ "orig": "Oromoo", "text": "Oromoo", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/140", @@ -3752,7 +3752,7 @@ "orig": "ਪੰਜਾਬੀ", "text": "ਪੰਜਾਬੀ", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/141", @@ -3766,7 +3766,7 @@ "orig": "Picard", "text": "Picard", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/142", @@ -3780,7 +3780,7 @@ "orig": "Plattdüütsch", "text": "Plattdüütsch", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/143", @@ -3794,7 +3794,7 @@ "orig": "Polski", "text": "Polski", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/144", @@ -3808,7 +3808,7 @@ "orig": "Português", "text": "Português", "enumerated": false, - "marker": "-" + "marker": "" }, { 
"self_ref": "#/texts/145", @@ -3822,7 +3822,7 @@ "orig": "Qırımtatarca", "text": "Qırımtatarca", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/146", @@ -3836,7 +3836,7 @@ "orig": "Română", "text": "Română", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/147", @@ -3850,7 +3850,7 @@ "orig": "Русский", "text": "Русский", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/148", @@ -3864,7 +3864,7 @@ "orig": "Саха тыла", "text": "Саха тыла", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/149", @@ -3878,7 +3878,7 @@ "orig": "ᱥᱟᱱᱛᱟᱲᱤ", "text": "ᱥᱟᱱᱛᱟᱲᱤ", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/150", @@ -3892,7 +3892,7 @@ "orig": "Sardu", "text": "Sardu", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/151", @@ -3906,7 +3906,7 @@ "orig": "Scots", "text": "Scots", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/152", @@ -3920,7 +3920,7 @@ "orig": "Seeltersk", "text": "Seeltersk", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/153", @@ -3934,7 +3934,7 @@ "orig": "Shqip", "text": "Shqip", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/154", @@ -3948,7 +3948,7 @@ "orig": "Sicilianu", "text": "Sicilianu", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/155", @@ -3962,7 +3962,7 @@ "orig": "සිංහල", "text": "සිංහල", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/156", @@ -3976,7 +3976,7 @@ "orig": "Simple English", "text": "Simple English", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/157", @@ -3990,7 +3990,7 @@ "orig": "سنڌي", "text": "سنڌي", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/158", @@ -4004,7 +4004,7 @@ "orig": "کوردی", "text": "کوردی", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/159", @@ -4018,7 +4018,7 @@ "orig": "Српски / srpski", "text": "Српски / srpski", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/160", @@ -4032,7 +4032,7 @@ "orig": "Srpskohrvatski / српскохрватски", "text": "Srpskohrvatski / српскохрватски", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/161", @@ -4046,7 +4046,7 @@ "orig": "Sunda", "text": "Sunda", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/162", @@ -4060,7 +4060,7 @@ "orig": "Svenska", "text": "Svenska", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/163", @@ -4074,7 +4074,7 @@ "orig": "Tagalog", "text": "Tagalog", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/164", @@ -4088,7 +4088,7 @@ "orig": "தமிழ்", "text": "தமிழ்", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/165", @@ -4102,7 +4102,7 @@ "orig": "Taqbaylit", "text": "Taqbaylit", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/166", @@ -4116,7 +4116,7 @@ "orig": "Татарча / tatarça", "text": "Татарча / tatarça", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/167", @@ -4130,7 +4130,7 @@ "orig": "ไทย", "text": "ไทย", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/168", @@ -4144,7 +4144,7 @@ "orig": "Türkçe", "text": "Türkçe", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": 
"#/texts/169", @@ -4158,7 +4158,7 @@ "orig": "Українська", "text": "Українська", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/170", @@ -4172,7 +4172,7 @@ "orig": "ئۇيغۇرچە / Uyghurche", "text": "ئۇيغۇرچە / Uyghurche", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/171", @@ -4186,7 +4186,7 @@ "orig": "Vahcuengh", "text": "Vahcuengh", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/172", @@ -4200,7 +4200,7 @@ "orig": "Tiếng Việt", "text": "Tiếng Việt", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/173", @@ -4214,7 +4214,7 @@ "orig": "Walon", "text": "Walon", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/174", @@ -4228,7 +4228,7 @@ "orig": "文言", "text": "文言", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/175", @@ -4242,7 +4242,7 @@ "orig": "Winaray", "text": "Winaray", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/176", @@ -4256,7 +4256,7 @@ "orig": "吴语", "text": "吴语", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/177", @@ -4270,7 +4270,7 @@ "orig": "粵語", "text": "粵語", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/178", @@ -4284,7 +4284,7 @@ "orig": "Žemaitėška", "text": "Žemaitėška", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/179", @@ -4298,7 +4298,7 @@ "orig": "中文", "text": "中文", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/180", @@ -4312,7 +4312,7 @@ "orig": "Article", "text": "Article", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/181", @@ -4326,7 +4326,7 @@ "orig": "Talk", "text": "Talk", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/182", @@ -4340,7 +4340,7 @@ "orig": "Read", "text": "Read", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/183", @@ -4354,7 +4354,7 @@ "orig": "View source", "text": "View source", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/184", @@ -4368,7 +4368,7 @@ "orig": "View history", "text": "View history", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/185", @@ -4406,7 +4406,7 @@ "orig": "Read", "text": "Read", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/188", @@ -4420,7 +4420,7 @@ "orig": "View source", "text": "View source", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/189", @@ -4434,7 +4434,7 @@ "orig": "View history", "text": "View history", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/190", @@ -4460,7 +4460,7 @@ "orig": "What links here", "text": "What links here", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/192", @@ -4474,7 +4474,7 @@ "orig": "Related changes", "text": "Related changes", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/193", @@ -4488,7 +4488,7 @@ "orig": "Upload file", "text": "Upload file", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/194", @@ -4502,7 +4502,7 @@ "orig": "Special pages", "text": "Special pages", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/195", @@ -4516,7 +4516,7 @@ "orig": "Permanent link", "text": "Permanent link", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": 
"#/texts/196", @@ -4530,7 +4530,7 @@ "orig": "Page information", "text": "Page information", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/197", @@ -4544,7 +4544,7 @@ "orig": "Cite this page", "text": "Cite this page", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/198", @@ -4558,7 +4558,7 @@ "orig": "Get shortened URL", "text": "Get shortened URL", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/199", @@ -4572,7 +4572,7 @@ "orig": "Download QR code", "text": "Download QR code", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/200", @@ -4586,7 +4586,7 @@ "orig": "Wikidata item", "text": "Wikidata item", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/201", @@ -4612,7 +4612,7 @@ "orig": "Download as PDF", "text": "Download as PDF", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/203", @@ -4626,7 +4626,7 @@ "orig": "Printable version", "text": "Printable version", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/204", @@ -4652,7 +4652,7 @@ "orig": "Wikimedia Commons", "text": "Wikimedia Commons", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/206", @@ -4666,7 +4666,7 @@ "orig": "Wikiquote", "text": "Wikiquote", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/207", @@ -5619,7 +5619,7 @@ "orig": "Birds portal", "text": "Birds portal", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/271", @@ -5633,7 +5633,7 @@ "orig": "Domestic duck", "text": "Domestic duck", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/272", @@ -5647,7 +5647,7 @@ "orig": "Duck as food", "text": "Duck as food", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/273", @@ -5661,7 +5661,7 @@ "orig": "Duck test", "text": "Duck test", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/274", @@ -5675,7 +5675,7 @@ "orig": "Duck breeds", "text": "Duck breeds", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/275", @@ -5689,7 +5689,7 @@ "orig": "Fictional ducks", "text": "Fictional ducks", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/276", @@ -5703,7 +5703,7 @@ "orig": "Rubber duck", "text": "Rubber duck", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/277", @@ -5754,7 +5754,7 @@ "orig": "^ \"Duckling\". The American Heritage Dictionary of the English Language, Fourth Edition. Houghton Mifflin Company. 2006. Retrieved 2015-05-22.", "text": "^ \"Duckling\". The American Heritage Dictionary of the English Language, Fourth Edition. Houghton Mifflin Company. 2006. Retrieved 2015-05-22.", "enumerated": true, - "marker": "1." + "marker": "" }, { "self_ref": "#/texts/280", @@ -5768,7 +5768,7 @@ "orig": "^ \"Duckling\". Kernerman English Multilingual Dictionary (Beta Version). K. Dictionaries Ltd. 2000–2006. Retrieved 2015-05-22.", "text": "^ \"Duckling\". Kernerman English Multilingual Dictionary (Beta Version). K. Dictionaries Ltd. 2000–2006. Retrieved 2015-05-22.", "enumerated": true, - "marker": "2." + "marker": "" }, { "self_ref": "#/texts/281", @@ -5782,7 +5782,7 @@ "orig": "^ Dohner, Janet Vorwald (2001). The Encyclopedia of Historic and Endangered Livestock and Poultry Breeds. Yale University Press. ISBN 978-0300138139.", "text": "^ Dohner, Janet Vorwald (2001). 
The Encyclopedia of Historic and Endangered Livestock and Poultry Breeds. Yale University Press. ISBN 978-0300138139.", "enumerated": true, - "marker": "3." + "marker": "" }, { "self_ref": "#/texts/282", @@ -5796,7 +5796,7 @@ "orig": "^ Visca, Curt; Visca, Kelley (2003). How to Draw Cartoon Birds. The Rosen Publishing Group. ISBN 9780823961566.", "text": "^ Visca, Curt; Visca, Kelley (2003). How to Draw Cartoon Birds. The Rosen Publishing Group. ISBN 9780823961566.", "enumerated": true, - "marker": "4." + "marker": "" }, { "self_ref": "#/texts/283", @@ -5810,7 +5810,7 @@ "orig": "^ a b c d Carboneras 1992, p. 536.", "text": "^ a b c d Carboneras 1992, p. 536.", "enumerated": true, - "marker": "5." + "marker": "" }, { "self_ref": "#/texts/284", @@ -5824,7 +5824,7 @@ "orig": "^ Livezey 1986, pp. 737–738.", "text": "^ Livezey 1986, pp. 737–738.", "enumerated": true, - "marker": "6." + "marker": "" }, { "self_ref": "#/texts/285", @@ -5838,7 +5838,7 @@ "orig": "^ Madsen, McHugh & de Kloet 1988, p. 452.", "text": "^ Madsen, McHugh & de Kloet 1988, p. 452.", "enumerated": true, - "marker": "7." + "marker": "" }, { "self_ref": "#/texts/286", @@ -5852,7 +5852,7 @@ "orig": "^ Donne-Goussé, Laudet & Hänni 2002, pp. 353–354.", "text": "^ Donne-Goussé, Laudet & Hänni 2002, pp. 353–354.", "enumerated": true, - "marker": "8." + "marker": "" }, { "self_ref": "#/texts/287", @@ -5866,7 +5866,7 @@ "orig": "^ a b c d e f Carboneras 1992, p. 540.", "text": "^ a b c d e f Carboneras 1992, p. 540.", "enumerated": true, - "marker": "9." + "marker": "" }, { "self_ref": "#/texts/288", @@ -5880,7 +5880,7 @@ "orig": "^ Elphick, Dunning & Sibley 2001, p. 191.", "text": "^ Elphick, Dunning & Sibley 2001, p. 191.", "enumerated": true, - "marker": "10." + "marker": "" }, { "self_ref": "#/texts/289", @@ -5894,7 +5894,7 @@ "orig": "^ Kear 2005, p. 448.", "text": "^ Kear 2005, p. 448.", "enumerated": true, - "marker": "11." + "marker": "" }, { "self_ref": "#/texts/290", @@ -5908,7 +5908,7 @@ "orig": "^ Kear 2005, p. 622–623.", "text": "^ Kear 2005, p. 622–623.", "enumerated": true, - "marker": "12." + "marker": "" }, { "self_ref": "#/texts/291", @@ -5922,7 +5922,7 @@ "orig": "^ Kear 2005, p. 686.", "text": "^ Kear 2005, p. 686.", "enumerated": true, - "marker": "13." + "marker": "" }, { "self_ref": "#/texts/292", @@ -5936,7 +5936,7 @@ "orig": "^ Elphick, Dunning & Sibley 2001, p. 193.", "text": "^ Elphick, Dunning & Sibley 2001, p. 193.", "enumerated": true, - "marker": "14." + "marker": "" }, { "self_ref": "#/texts/293", @@ -5950,7 +5950,7 @@ "orig": "^ a b c d e f g Carboneras 1992, p. 537.", "text": "^ a b c d e f g Carboneras 1992, p. 537.", "enumerated": true, - "marker": "15." + "marker": "" }, { "self_ref": "#/texts/294", @@ -5964,7 +5964,7 @@ "orig": "^ American Ornithologists' Union 1998, p. xix.", "text": "^ American Ornithologists' Union 1998, p. xix.", "enumerated": true, - "marker": "16." + "marker": "" }, { "self_ref": "#/texts/295", @@ -5978,7 +5978,7 @@ "orig": "^ American Ornithologists' Union 1998.", "text": "^ American Ornithologists' Union 1998.", "enumerated": true, - "marker": "17." + "marker": "" }, { "self_ref": "#/texts/296", @@ -5992,7 +5992,7 @@ "orig": "^ Carboneras 1992, p. 538.", "text": "^ Carboneras 1992, p. 538.", "enumerated": true, - "marker": "18." + "marker": "" }, { "self_ref": "#/texts/297", @@ -6006,7 +6006,7 @@ "orig": "^ Christidis & Boles 2008, p. 62.", "text": "^ Christidis & Boles 2008, p. 62.", "enumerated": true, - "marker": "19." 
+ "marker": "" }, { "self_ref": "#/texts/298", @@ -6020,7 +6020,7 @@ "orig": "^ Shirihai 2008, pp. 239, 245.", "text": "^ Shirihai 2008, pp. 239, 245.", "enumerated": true, - "marker": "20." + "marker": "" }, { "self_ref": "#/texts/299", @@ -6034,7 +6034,7 @@ "orig": "^ a b Pratt, Bruner & Berrett 1987, pp. 98–107.", "text": "^ a b Pratt, Bruner & Berrett 1987, pp. 98–107.", "enumerated": true, - "marker": "21." + "marker": "" }, { "self_ref": "#/texts/300", @@ -6048,7 +6048,7 @@ "orig": "^ Fitter, Fitter & Hosking 2000, pp. 52–3.", "text": "^ Fitter, Fitter & Hosking 2000, pp. 52–3.", "enumerated": true, - "marker": "22." + "marker": "" }, { "self_ref": "#/texts/301", @@ -6062,7 +6062,7 @@ "orig": "^ \"Pacific Black Duck\". www.wiresnr.org. Retrieved 2018-04-27.", "text": "^ \"Pacific Black Duck\". www.wiresnr.org. Retrieved 2018-04-27.", "enumerated": true, - "marker": "23." + "marker": "" }, { "self_ref": "#/texts/302", @@ -6076,7 +6076,7 @@ "orig": "^ Ogden, Evans. \"Dabbling Ducks\". CWE. Retrieved 2006-11-02.", "text": "^ Ogden, Evans. \"Dabbling Ducks\". CWE. Retrieved 2006-11-02.", "enumerated": true, - "marker": "24." + "marker": "" }, { "self_ref": "#/texts/303", @@ -6090,7 +6090,7 @@ "orig": "^ Karl Mathiesen (16 March 2015). \"Don't feed the ducks bread, say conservationists\". The Guardian. Retrieved 13 November 2016.", "text": "^ Karl Mathiesen (16 March 2015). \"Don't feed the ducks bread, say conservationists\". The Guardian. Retrieved 13 November 2016.", "enumerated": true, - "marker": "25." + "marker": "" }, { "self_ref": "#/texts/304", @@ -6104,7 +6104,7 @@ "orig": "^ Rohwer, Frank C.; Anderson, Michael G. (1988). \"Female-Biased Philopatry, Monogamy, and the Timing of Pair Formation in Migratory Waterfowl\". Current Ornithology. pp. 187–221. doi:10.1007/978-1-4615-6787-5_4. ISBN 978-1-4615-6789-9.", "text": "^ Rohwer, Frank C.; Anderson, Michael G. (1988). \"Female-Biased Philopatry, Monogamy, and the Timing of Pair Formation in Migratory Waterfowl\". Current Ornithology. pp. 187–221. doi:10.1007/978-1-4615-6787-5_4. ISBN 978-1-4615-6789-9.", "enumerated": true, - "marker": "26." + "marker": "" }, { "self_ref": "#/texts/305", @@ -6118,7 +6118,7 @@ "orig": "^ Smith, Cyndi M.; Cooke, Fred; Robertson, Gregory J.; Goudie, R. Ian; Boyd, W. Sean (2000). \"Long-Term Pair Bonds in Harlequin Ducks\". The Condor. 102 (1): 201–205. doi:10.1093/condor/102.1.201. hdl:10315/13797.", "text": "^ Smith, Cyndi M.; Cooke, Fred; Robertson, Gregory J.; Goudie, R. Ian; Boyd, W. Sean (2000). \"Long-Term Pair Bonds in Harlequin Ducks\". The Condor. 102 (1): 201–205. doi:10.1093/condor/102.1.201. hdl:10315/13797.", "enumerated": true, - "marker": "27." + "marker": "" }, { "self_ref": "#/texts/306", @@ -6132,7 +6132,7 @@ "orig": "^ \"If You Find An Orphaned Duckling - Wildlife Rehabber\". wildliferehabber.com. Archived from the original on 2018-09-23. Retrieved 2018-12-22.", "text": "^ \"If You Find An Orphaned Duckling - Wildlife Rehabber\". wildliferehabber.com. Archived from the original on 2018-09-23. Retrieved 2018-12-22.", "enumerated": true, - "marker": "28." + "marker": "" }, { "self_ref": "#/texts/307", @@ -6146,7 +6146,7 @@ "orig": "^ Carver, Heather (2011). The Duck Bible. Lulu.com. ISBN 9780557901562.[self-published source]", "text": "^ Carver, Heather (2011). The Duck Bible. Lulu.com. ISBN 9780557901562.[self-published source]", "enumerated": true, - "marker": "29." + "marker": "" }, { "self_ref": "#/texts/308", @@ -6160,7 +6160,7 @@ "orig": "^ Titlow, Budd (2013-09-03). 
Bird Brains: Inside the Strange Minds of Our Fine Feathered Friends. Rowman & Littlefield. ISBN 9780762797707.", "text": "^ Titlow, Budd (2013-09-03). Bird Brains: Inside the Strange Minds of Our Fine Feathered Friends. Rowman & Littlefield. ISBN 9780762797707.", "enumerated": true, - "marker": "30." + "marker": "" }, { "self_ref": "#/texts/309", @@ -6174,7 +6174,7 @@ "orig": "^ Amos, Jonathan (2003-09-08). \"Sound science is quackers\". BBC News. Retrieved 2006-11-02.", "text": "^ Amos, Jonathan (2003-09-08). \"Sound science is quackers\". BBC News. Retrieved 2006-11-02.", "enumerated": true, - "marker": "31." + "marker": "" }, { "self_ref": "#/texts/310", @@ -6188,7 +6188,7 @@ "orig": "^ \"Mythbusters Episode 8\". 12 December 2003.", "text": "^ \"Mythbusters Episode 8\". 12 December 2003.", "enumerated": true, - "marker": "32." + "marker": "" }, { "self_ref": "#/texts/311", @@ -6202,7 +6202,7 @@ "orig": "^ Erlandson 1994, p. 171.", "text": "^ Erlandson 1994, p. 171.", "enumerated": true, - "marker": "33." + "marker": "" }, { "self_ref": "#/texts/312", @@ -6216,7 +6216,7 @@ "orig": "^ Jeffries 2008, pp. 168, 243.", "text": "^ Jeffries 2008, pp. 168, 243.", "enumerated": true, - "marker": "34." + "marker": "" }, { "self_ref": "#/texts/313", @@ -6230,7 +6230,7 @@ "orig": "^ a b Sued-Badillo 2003, p. 65.", "text": "^ a b Sued-Badillo 2003, p. 65.", "enumerated": true, - "marker": "35." + "marker": "" }, { "self_ref": "#/texts/314", @@ -6244,7 +6244,7 @@ "orig": "^ Thorpe 1996, p. 68.", "text": "^ Thorpe 1996, p. 68.", "enumerated": true, - "marker": "36." + "marker": "" }, { "self_ref": "#/texts/315", @@ -6258,7 +6258,7 @@ "orig": "^ Maisels 1999, p. 42.", "text": "^ Maisels 1999, p. 42.", "enumerated": true, - "marker": "37." + "marker": "" }, { "self_ref": "#/texts/316", @@ -6272,7 +6272,7 @@ "orig": "^ Rau 1876, p. 133.", "text": "^ Rau 1876, p. 133.", "enumerated": true, - "marker": "38." + "marker": "" }, { "self_ref": "#/texts/317", @@ -6286,7 +6286,7 @@ "orig": "^ Higman 2012, p. 23.", "text": "^ Higman 2012, p. 23.", "enumerated": true, - "marker": "39." + "marker": "" }, { "self_ref": "#/texts/318", @@ -6300,7 +6300,7 @@ "orig": "^ Hume 2012, p. 53.", "text": "^ Hume 2012, p. 53.", "enumerated": true, - "marker": "40." + "marker": "" }, { "self_ref": "#/texts/319", @@ -6314,7 +6314,7 @@ "orig": "^ Hume 2012, p. 52.", "text": "^ Hume 2012, p. 52.", "enumerated": true, - "marker": "41." + "marker": "" }, { "self_ref": "#/texts/320", @@ -6328,7 +6328,7 @@ "orig": "^ Fieldhouse 2002, p. 167.", "text": "^ Fieldhouse 2002, p. 167.", "enumerated": true, - "marker": "42." + "marker": "" }, { "self_ref": "#/texts/321", @@ -6342,7 +6342,7 @@ "orig": "^ Livingston, A. D. (1998-01-01). Guide to Edible Plants and Animals. Wordsworth Editions, Limited. ISBN 9781853263774.", "text": "^ Livingston, A. D. (1998-01-01). Guide to Edible Plants and Animals. Wordsworth Editions, Limited. ISBN 9781853263774.", "enumerated": true, - "marker": "43." + "marker": "" }, { "self_ref": "#/texts/322", @@ -6356,7 +6356,7 @@ "orig": "^ \"Study plan for waterfowl injury assessment: Determining PCB concentrations in Hudson river resident waterfowl\" (PDF). New York State Department of Environmental Conservation. US Department of Commerce. December 2008. p. 3. Archived (PDF) from the original on 2022-10-09. Retrieved 2 July 2019.", "text": "^ \"Study plan for waterfowl injury assessment: Determining PCB concentrations in Hudson river resident waterfowl\" (PDF). New York State Department of Environmental Conservation. 
US Department of Commerce. December 2008. p. 3. Archived (PDF) from the original on 2022-10-09. Retrieved 2 July 2019.", "enumerated": true, - "marker": "44." + "marker": "" }, { "self_ref": "#/texts/323", @@ -6370,7 +6370,7 @@ "orig": "^ \"FAOSTAT\". www.fao.org. Retrieved 2019-10-25.", "text": "^ \"FAOSTAT\". www.fao.org. Retrieved 2019-10-25.", "enumerated": true, - "marker": "45." + "marker": "" }, { "self_ref": "#/texts/324", @@ -6384,7 +6384,7 @@ "orig": "^ \"Anas platyrhynchos, Domestic Duck; DigiMorph Staff - The University of Texas at Austin\". Digimorph.org. Retrieved 2012-12-23.", "text": "^ \"Anas platyrhynchos, Domestic Duck; DigiMorph Staff - The University of Texas at Austin\". Digimorph.org. Retrieved 2012-12-23.", "enumerated": true, - "marker": "46." + "marker": "" }, { "self_ref": "#/texts/325", @@ -6398,7 +6398,7 @@ "orig": "^ Sy Montgomery. \"Mallard; Encyclopædia Britannica\". Britannica.com. Retrieved 2012-12-23.", "text": "^ Sy Montgomery. \"Mallard; Encyclopædia Britannica\". Britannica.com. Retrieved 2012-12-23.", "enumerated": true, - "marker": "47." + "marker": "" }, { "self_ref": "#/texts/326", @@ -6412,7 +6412,7 @@ "orig": "^ Glenday, Craig (2014). Guinness World Records. Guinness World Records Limited. pp. 135. ISBN 978-1-908843-15-9.", "text": "^ Glenday, Craig (2014). Guinness World Records. Guinness World Records Limited. pp. 135. ISBN 978-1-908843-15-9.", "enumerated": true, - "marker": "48." + "marker": "" }, { "self_ref": "#/texts/327", @@ -6426,7 +6426,7 @@ "orig": "^ Suomen kunnallisvaakunat (in Finnish). Suomen Kunnallisliitto. 1982. p. 147. ISBN 951-773-085-3.", "text": "^ Suomen kunnallisvaakunat (in Finnish). Suomen Kunnallisliitto. 1982. p. 147. ISBN 951-773-085-3.", "enumerated": true, - "marker": "49." + "marker": "" }, { "self_ref": "#/texts/328", @@ -6440,7 +6440,7 @@ "orig": "^ \"Lubānas simbolika\" (in Latvian). Retrieved September 9, 2021.", "text": "^ \"Lubānas simbolika\" (in Latvian). Retrieved September 9, 2021.", "enumerated": true, - "marker": "50." + "marker": "" }, { "self_ref": "#/texts/329", @@ -6454,7 +6454,7 @@ "orig": "^ \"Föglö\" (in Swedish). Retrieved September 9, 2021.", "text": "^ \"Föglö\" (in Swedish). Retrieved September 9, 2021.", "enumerated": true, - "marker": "51." + "marker": "" }, { "self_ref": "#/texts/330", @@ -6468,7 +6468,7 @@ "orig": "^ Young, Emma. \"World's funniest joke revealed\". New Scientist. Retrieved 7 January 2019.", "text": "^ Young, Emma. \"World's funniest joke revealed\". New Scientist. Retrieved 7 January 2019.", "enumerated": true, - "marker": "52." + "marker": "" }, { "self_ref": "#/texts/331", @@ -6482,7 +6482,7 @@ "orig": "^ \"Howard the Duck (character)\". Grand Comics Database.", "text": "^ \"Howard the Duck (character)\". Grand Comics Database.", "enumerated": true, - "marker": "53." + "marker": "" }, { "self_ref": "#/texts/332", @@ -6496,7 +6496,7 @@ "orig": "^ Sanderson, Peter; Gilbert, Laura (2008). \"1970s\". Marvel Chronicle A Year by Year History. London, United Kingdom: Dorling Kindersley. p. 161. ISBN 978-0756641238. December saw the debut of the cigar-smoking Howard the Duck. In this story by writer Steve Gerber and artist Val Mayerik, various beings from different realities had begun turning up in the Man-Thing's Florida swamp, including this bad-tempered talking duck.", "text": "^ Sanderson, Peter; Gilbert, Laura (2008). \"1970s\". Marvel Chronicle A Year by Year History. London, United Kingdom: Dorling Kindersley. p. 161. ISBN 978-0756641238. 
December saw the debut of the cigar-smoking Howard the Duck. In this story by writer Steve Gerber and artist Val Mayerik, various beings from different realities had begun turning up in the Man-Thing's Florida swamp, including this bad-tempered talking duck.", "enumerated": true, - "marker": "54." + "marker": "" }, { "self_ref": "#/texts/333", @@ -6510,7 +6510,7 @@ "orig": "^ \"The Duck\". University of Oregon Athletics. Retrieved 2022-01-20.", "text": "^ \"The Duck\". University of Oregon Athletics. Retrieved 2022-01-20.", "enumerated": true, - "marker": "55." + "marker": "" }, { "self_ref": "#/texts/334", @@ -6541,7 +6541,7 @@ "orig": "American Ornithologists' Union (1998). Checklist of North American Birds (PDF). Washington, DC: American Ornithologists' Union. ISBN 978-1-891276-00-2. Archived (PDF) from the original on 2022-10-09.", "text": "American Ornithologists' Union (1998). Checklist of North American Birds (PDF). Washington, DC: American Ornithologists' Union. ISBN 978-1-891276-00-2. Archived (PDF) from the original on 2022-10-09.", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/336", @@ -6555,7 +6555,7 @@ "orig": "Carboneras, Carlos (1992). del Hoyo, Josep; Elliott, Andrew; Sargatal, Jordi (eds.). Handbook of the Birds of the World. Vol. 1: Ostrich to Ducks. Barcelona: Lynx Edicions. ISBN 978-84-87334-10-8.", "text": "Carboneras, Carlos (1992). del Hoyo, Josep; Elliott, Andrew; Sargatal, Jordi (eds.). Handbook of the Birds of the World. Vol. 1: Ostrich to Ducks. Barcelona: Lynx Edicions. ISBN 978-84-87334-10-8.", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/337", @@ -6569,7 +6569,7 @@ "orig": "Christidis, Les; Boles, Walter E., eds. (2008). Systematics and Taxonomy of Australian Birds. Collingwood, VIC: Csiro Publishing. ISBN 978-0-643-06511-6.", "text": "Christidis, Les; Boles, Walter E., eds. (2008). Systematics and Taxonomy of Australian Birds. Collingwood, VIC: Csiro Publishing. ISBN 978-0-643-06511-6.", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/338", @@ -6583,7 +6583,7 @@ "orig": "Donne-Goussé, Carole; Laudet, Vincent; Hänni, Catherine (July 2002). \"A molecular phylogeny of Anseriformes based on mitochondrial DNA analysis\". Molecular Phylogenetics and Evolution. 23 (3): 339–356. Bibcode:2002MolPE..23..339D. doi:10.1016/S1055-7903(02)00019-2. PMID 12099792.", "text": "Donne-Goussé, Carole; Laudet, Vincent; Hänni, Catherine (July 2002). \"A molecular phylogeny of Anseriformes based on mitochondrial DNA analysis\". Molecular Phylogenetics and Evolution. 23 (3): 339–356. Bibcode:2002MolPE..23..339D. doi:10.1016/S1055-7903(02)00019-2. PMID 12099792.", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/339", @@ -6597,7 +6597,7 @@ "orig": "Elphick, Chris; Dunning, John B. Jr.; Sibley, David, eds. (2001). The Sibley Guide to Bird Life and Behaviour. London: Christopher Helm. ISBN 978-0-7136-6250-4.", "text": "Elphick, Chris; Dunning, John B. Jr.; Sibley, David, eds. (2001). The Sibley Guide to Bird Life and Behaviour. London: Christopher Helm. ISBN 978-0-7136-6250-4.", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/340", @@ -6611,7 +6611,7 @@ "orig": "Erlandson, Jon M. (1994). Early Hunter-Gatherers of the California Coast. New York, NY: Springer Science & Business Media. ISBN 978-1-4419-3231-0.", "text": "Erlandson, Jon M. (1994). Early Hunter-Gatherers of the California Coast. 
New York, NY: Springer Science & Business Media. ISBN 978-1-4419-3231-0.", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/341", @@ -6625,7 +6625,7 @@ "orig": "Fieldhouse, Paul (2002). Food, Feasts, and Faith: An Encyclopedia of Food Culture in World Religions. Vol. I: A–K. Santa Barbara: ABC-CLIO. ISBN 978-1-61069-412-4.", "text": "Fieldhouse, Paul (2002). Food, Feasts, and Faith: An Encyclopedia of Food Culture in World Religions. Vol. I: A–K. Santa Barbara: ABC-CLIO. ISBN 978-1-61069-412-4.", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/342", @@ -6639,7 +6639,7 @@ "orig": "Fitter, Julian; Fitter, Daniel; Hosking, David (2000). Wildlife of the Galápagos. Princeton, NJ: Princeton University Press. ISBN 978-0-691-10295-5.", "text": "Fitter, Julian; Fitter, Daniel; Hosking, David (2000). Wildlife of the Galápagos. Princeton, NJ: Princeton University Press. ISBN 978-0-691-10295-5.", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/343", @@ -6653,7 +6653,7 @@ "orig": "Higman, B. W. (2012). How Food Made History. Chichester, UK: John Wiley & Sons. ISBN 978-1-4051-8947-7.", "text": "Higman, B. W. (2012). How Food Made History. Chichester, UK: John Wiley & Sons. ISBN 978-1-4051-8947-7.", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/344", @@ -6667,7 +6667,7 @@ "orig": "Hume, Julian H. (2012). Extinct Birds. London: Christopher Helm. ISBN 978-1-4729-3744-5.", "text": "Hume, Julian H. (2012). Extinct Birds. London: Christopher Helm. ISBN 978-1-4729-3744-5.", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/345", @@ -6681,7 +6681,7 @@ "orig": "Jeffries, Richard (2008). Holocene Hunter-Gatherers of the Lower Ohio River Valley. Tuscaloosa: University of Alabama Press. ISBN 978-0-8173-1658-7.", "text": "Jeffries, Richard (2008). Holocene Hunter-Gatherers of the Lower Ohio River Valley. Tuscaloosa: University of Alabama Press. ISBN 978-0-8173-1658-7.", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/346", @@ -6695,7 +6695,7 @@ "orig": "Kear, Janet, ed. (2005). Ducks, Geese and Swans: Species Accounts (Cairina to Mergus). Bird Families of the World. Oxford: Oxford University Press. ISBN 978-0-19-861009-0.", "text": "Kear, Janet, ed. (2005). Ducks, Geese and Swans: Species Accounts (Cairina to Mergus). Bird Families of the World. Oxford: Oxford University Press. ISBN 978-0-19-861009-0.", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/347", @@ -6709,7 +6709,7 @@ "orig": "Livezey, Bradley C. (October 1986). \"A phylogenetic analysis of recent Anseriform genera using morphological characters\" (PDF). The Auk. 103 (4): 737–754. doi:10.1093/auk/103.4.737. Archived (PDF) from the original on 2022-10-09.", "text": "Livezey, Bradley C. (October 1986). \"A phylogenetic analysis of recent Anseriform genera using morphological characters\" (PDF). The Auk. 103 (4): 737–754. doi:10.1093/auk/103.4.737. Archived (PDF) from the original on 2022-10-09.", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/348", @@ -6723,7 +6723,7 @@ "orig": "Madsen, Cort S.; McHugh, Kevin P.; de Kloet, Siwo R. (July 1988). \"A partial classification of waterfowl (Anatidae) based on single-copy DNA\" (PDF). The Auk. 105 (3): 452–459. doi:10.1093/auk/105.3.452. Archived (PDF) from the original on 2022-10-09.", "text": "Madsen, Cort S.; McHugh, Kevin P.; de Kloet, Siwo R. (July 1988). 
\"A partial classification of waterfowl (Anatidae) based on single-copy DNA\" (PDF). The Auk. 105 (3): 452–459. doi:10.1093/auk/105.3.452. Archived (PDF) from the original on 2022-10-09.", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/349", @@ -6737,7 +6737,7 @@ "orig": "Maisels, Charles Keith (1999). Early Civilizations of the Old World. London: Routledge. ISBN 978-0-415-10975-8.", "text": "Maisels, Charles Keith (1999). Early Civilizations of the Old World. London: Routledge. ISBN 978-0-415-10975-8.", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/350", @@ -6751,7 +6751,7 @@ "orig": "Pratt, H. Douglas; Bruner, Phillip L.; Berrett, Delwyn G. (1987). A Field Guide to the Birds of Hawaii and the Tropical Pacific. Princeton, NJ: Princeton University Press. ISBN 0-691-02399-9.", "text": "Pratt, H. Douglas; Bruner, Phillip L.; Berrett, Delwyn G. (1987). A Field Guide to the Birds of Hawaii and the Tropical Pacific. Princeton, NJ: Princeton University Press. ISBN 0-691-02399-9.", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/351", @@ -6765,7 +6765,7 @@ "orig": "Rau, Charles (1876). Early Man in Europe. New York: Harper & Brothers. LCCN 05040168.", "text": "Rau, Charles (1876). Early Man in Europe. New York: Harper & Brothers. LCCN 05040168.", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/352", @@ -6779,7 +6779,7 @@ "orig": "Shirihai, Hadoram (2008). A Complete Guide to Antarctic Wildlife. Princeton, NJ, US: Princeton University Press. ISBN 978-0-691-13666-0.", "text": "Shirihai, Hadoram (2008). A Complete Guide to Antarctic Wildlife. Princeton, NJ, US: Princeton University Press. ISBN 978-0-691-13666-0.", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/353", @@ -6793,7 +6793,7 @@ "orig": "Sued-Badillo, Jalil (2003). Autochthonous Societies. General History of the Caribbean. Paris: UNESCO. ISBN 978-92-3-103832-7.", "text": "Sued-Badillo, Jalil (2003). Autochthonous Societies. General History of the Caribbean. Paris: UNESCO. ISBN 978-92-3-103832-7.", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/354", @@ -6807,7 +6807,7 @@ "orig": "Thorpe, I. J. (1996). The Origins of Agriculture in Europe. New York: Routledge. ISBN 978-0-415-08009-5.", "text": "Thorpe, I. J. (1996). The Origins of Agriculture in Europe. New York: Routledge. 
ISBN 978-0-415-08009-5.", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/355", @@ -6874,7 +6874,7 @@ "orig": "Definitions from Wiktionary", "text": "Definitions from Wiktionary", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/357", @@ -6888,7 +6888,7 @@ "orig": "Media from Commons", "text": "Media from Commons", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/358", @@ -6902,7 +6902,7 @@ "orig": "Quotations from Wikiquote", "text": "Quotations from Wikiquote", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/359", @@ -6916,7 +6916,7 @@ "orig": "Recipes from Wikibooks", "text": "Recipes from Wikibooks", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/360", @@ -6930,7 +6930,7 @@ "orig": "Taxa from Wikispecies", "text": "Taxa from Wikispecies", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/361", @@ -6944,7 +6944,7 @@ "orig": "Data from Wikidata", "text": "Data from Wikidata", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/362", @@ -6958,7 +6958,7 @@ "orig": "list of books (useful looking abstracts)", "text": "list of books (useful looking abstracts)", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/363", @@ -6972,7 +6972,7 @@ "orig": "Ducks on postage stamps Archived 2013-05-13 at the Wayback Machine", "text": "Ducks on postage stamps Archived 2013-05-13 at the Wayback Machine", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/364", @@ -6986,7 +6986,7 @@ "orig": "Ducks at a Distance, by Rob Hines at Project Gutenberg - A modern illustrated guide to identification of US waterfowl", "text": "Ducks at a Distance, by Rob Hines at Project Gutenberg - A modern illustrated guide to identification of US waterfowl", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/365", @@ -7024,7 +7024,7 @@ "orig": "Ducks", "text": "Ducks", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/368", @@ -7038,7 +7038,7 @@ "orig": "Game birds", "text": "Game birds", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/369", @@ -7052,7 +7052,7 @@ "orig": "Bird common names", "text": "Bird common names", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/370", @@ -7078,7 +7078,7 @@ "orig": "All accuracy disputes", "text": "All accuracy disputes", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/372", @@ -7092,7 +7092,7 @@ "orig": "Accuracy disputes from February 2020", "text": "Accuracy disputes from February 2020", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/373", @@ -7106,7 +7106,7 @@ "orig": "CS1 Finnish-language sources (fi)", "text": "CS1 Finnish-language sources (fi)", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/374", @@ -7120,7 +7120,7 @@ "orig": "CS1 Latvian-language sources (lv)", "text": "CS1 Latvian-language sources (lv)", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/375", @@ -7134,7 +7134,7 @@ "orig": "CS1 Swedish-language sources (sv)", "text": "CS1 Swedish-language sources (sv)", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/376", @@ -7148,7 +7148,7 @@ "orig": "Articles with short description", "text": "Articles with short description", "enumerated": false, - "marker": "-" + 
"marker": "" }, { "self_ref": "#/texts/377", @@ -7162,7 +7162,7 @@ "orig": "Short description is different from Wikidata", "text": "Short description is different from Wikidata", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/378", @@ -7176,7 +7176,7 @@ "orig": "Wikipedia indefinitely move-protected pages", "text": "Wikipedia indefinitely move-protected pages", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/379", @@ -7190,7 +7190,7 @@ "orig": "Wikipedia indefinitely semi-protected pages", "text": "Wikipedia indefinitely semi-protected pages", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/380", @@ -7204,7 +7204,7 @@ "orig": "Articles with 'species' microformats", "text": "Articles with 'species' microformats", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/381", @@ -7218,7 +7218,7 @@ "orig": "Articles containing Old English (ca. 450-1100)-language text", "text": "Articles containing Old English (ca. 450-1100)-language text", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/382", @@ -7232,7 +7232,7 @@ "orig": "Articles containing Dutch-language text", "text": "Articles containing Dutch-language text", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/383", @@ -7246,7 +7246,7 @@ "orig": "Articles containing German-language text", "text": "Articles containing German-language text", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/384", @@ -7260,7 +7260,7 @@ "orig": "Articles containing Norwegian-language text", "text": "Articles containing Norwegian-language text", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/385", @@ -7274,7 +7274,7 @@ "orig": "Articles containing Lithuanian-language text", "text": "Articles containing Lithuanian-language text", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/386", @@ -7288,7 +7288,7 @@ "orig": "Articles containing Ancient Greek (to 1453)-language text", "text": "Articles containing Ancient Greek (to 1453)-language text", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/387", @@ -7302,7 +7302,7 @@ "orig": "All articles with self-published sources", "text": "All articles with self-published sources", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/388", @@ -7316,7 +7316,7 @@ "orig": "Articles with self-published sources from February 2020", "text": "Articles with self-published sources from February 2020", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/389", @@ -7330,7 +7330,7 @@ "orig": "All articles with unsourced statements", "text": "All articles with unsourced statements", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/390", @@ -7344,7 +7344,7 @@ "orig": "Articles with unsourced statements from January 2022", "text": "Articles with unsourced statements from January 2022", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/391", @@ -7358,7 +7358,7 @@ "orig": "CS1: long volume value", "text": "CS1: long volume value", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/392", @@ -7372,7 +7372,7 @@ "orig": "Pages using Sister project links with wikidata mismatch", "text": "Pages using Sister project links with wikidata mismatch", "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/393", @@ 
@@ -7386,7 +7386,7 @@
       "orig": "Pages using Sister project links with hidden wikidata",
       "text": "Pages using Sister project links with hidden wikidata",
       "enumerated": false,
-      "marker": "-"
+      "marker": ""
     },
     {
       "self_ref": "#/texts/394",
@@ -7400,7 +7400,7 @@
       "orig": "Webarchive template wayback links",
       "text": "Webarchive template wayback links",
       "enumerated": false,
-      "marker": "-"
+      "marker": ""
     },
     {
       "self_ref": "#/texts/395",
@@ -7414,7 +7414,7 @@
       "orig": "Articles with Project Gutenberg links",
       "text": "Articles with Project Gutenberg links",
       "enumerated": false,
-      "marker": "-"
+      "marker": ""
     },
     {
       "self_ref": "#/texts/396",
@@ -7428,7 +7428,7 @@
       "orig": "Articles containing video clips",
       "text": "Articles containing video clips",
       "enumerated": false,
-      "marker": "-"
+      "marker": ""
     },
     {
       "self_ref": "#/texts/397",
@@ -7442,7 +7442,7 @@
       "orig": "This page was last edited on 21 September 2024, at 12:11 (UTC).",
       "text": "This page was last edited on 21 September 2024, at 12:11 (UTC).",
       "enumerated": false,
-      "marker": "-"
+      "marker": ""
     },
     {
       "self_ref": "#/texts/398",
@@ -7456,7 +7456,7 @@
       "orig": "Text is available under the Creative Commons Attribution-ShareAlike License 4.0;\nadditional terms may apply. By using this site, you agree to the Terms of Use and Privacy Policy. Wikipedia® is a registered trademark of the Wikimedia Foundation, Inc., a non-profit organization.",
       "text": "Text is available under the Creative Commons Attribution-ShareAlike License 4.0;\nadditional terms may apply. By using this site, you agree to the Terms of Use and Privacy Policy. Wikipedia® is a registered trademark of the Wikimedia Foundation, Inc., a non-profit organization.",
       "enumerated": false,
-      "marker": "-"
+      "marker": ""
     },
     {
       "self_ref": "#/texts/399",
@@ -7470,7 +7470,7 @@
       "orig": "Privacy policy",
       "text": "Privacy policy",
       "enumerated": false,
-      "marker": "-"
+      "marker": ""
     },
     {
       "self_ref": "#/texts/400",
@@ -7484,7 +7484,7 @@
       "orig": "About Wikipedia",
       "text": "About Wikipedia",
       "enumerated": false,
-      "marker": "-"
+      "marker": ""
     },
     {
       "self_ref": "#/texts/401",
@@ -7498,7 +7498,7 @@
       "orig": "Disclaimers",
       "text": "Disclaimers",
       "enumerated": false,
-      "marker": "-"
+      "marker": ""
     },
     {
       "self_ref": "#/texts/402",
@@ -7512,7 +7512,7 @@
       "orig": "Contact Wikipedia",
       "text": "Contact Wikipedia",
       "enumerated": false,
-      "marker": "-"
+      "marker": ""
     },
     {
       "self_ref": "#/texts/403",
@@ -7526,7 +7526,7 @@
       "orig": "Code of Conduct",
       "text": "Code of Conduct",
       "enumerated": false,
-      "marker": "-"
+      "marker": ""
     },
     {
       "self_ref": "#/texts/404",
@@ -7540,7 +7540,7 @@
       "orig": "Developers",
       "text": "Developers",
       "enumerated": false,
-      "marker": "-"
+      "marker": ""
     },
     {
       "self_ref": "#/texts/405",
@@ -7554,7 +7554,7 @@
       "orig": "Statistics",
       "text": "Statistics",
       "enumerated": false,
-      "marker": "-"
+      "marker": ""
     },
     {
       "self_ref": "#/texts/406",
@@ -7568,7 +7568,7 @@
       "orig": "Cookie statement",
       "text": "Cookie statement",
       "enumerated": false,
-      "marker": "-"
+      "marker": ""
     },
     {
       "self_ref": "#/texts/407",
@@ -7582,7 +7582,7 @@
       "orig": "Mobile view",
       "text": "Mobile view",
       "enumerated": false,
-      "marker": "-"
+      "marker": ""
     }
   ],
   "pictures": [
diff --git a/tests/data/groundtruth/docling_v2/word_image_anchors.docx.json b/tests/data/groundtruth/docling_v2/word_image_anchors.docx.json
index 4b75d8d..5de7bc3 100644
--- a/tests/data/groundtruth/docling_v2/word_image_anchors.docx.json
+++ b/tests/data/groundtruth/docling_v2/word_image_anchors.docx.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.4.0",
+  "version": "1.5.0",
"version": "1.5.0", "name": "word_image_anchors", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", diff --git a/tests/data/groundtruth/docling_v2/word_sample.docx.json b/tests/data/groundtruth/docling_v2/word_sample.docx.json index 1f94d91..a424d11 100644 --- a/tests/data/groundtruth/docling_v2/word_sample.docx.json +++ b/tests/data/groundtruth/docling_v2/word_sample.docx.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "word_sample", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", @@ -243,7 +243,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/7", @@ -264,7 +264,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/8", @@ -285,7 +285,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/9", @@ -325,7 +325,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/11", @@ -346,7 +346,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/12", @@ -367,7 +367,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/13", @@ -530,7 +530,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/21", @@ -551,7 +551,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" }, { "self_ref": "#/texts/22", @@ -572,7 +572,7 @@ "script": "baseline" }, "enumerated": false, - "marker": "-" + "marker": "" } ], "pictures": [ diff --git a/tests/data/groundtruth/docling_v2/word_tables.docx.json b/tests/data/groundtruth/docling_v2/word_tables.docx.json index e215c27..66af38b 100644 --- a/tests/data/groundtruth/docling_v2/word_tables.docx.json +++ b/tests/data/groundtruth/docling_v2/word_tables.docx.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.4.0", + "version": "1.5.0", "name": "word_tables", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", diff --git a/tests/data/html/example_01.html b/tests/data/html/example_01.html index 792dc6c..2f86b5b 100644 --- a/tests/data/html/example_01.html +++ b/tests/data/html/example_01.html @@ -13,5 +13,9 @@
     <li>First item in ordered list</li>
     <li>Second item in ordered list</li>
 </ol>
+<ol start="42">
+    <li>First item in ordered list with start</li>
+    <li>Second item in ordered list with start</li>
+</ol>
 </body>
 </html>
diff --git a/tests/data/webp/groundtruth/docling_v2/webp-test.json b/tests/data/webp/groundtruth/docling_v2/webp-test.json
index a53da5c..80e2d78 100644
--- a/tests/data/webp/groundtruth/docling_v2/webp-test.json
+++ b/tests/data/webp/groundtruth/docling_v2/webp-test.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.4.0",
+  "version": "1.5.0",
   "name": "webp-test",
   "origin": {
     "mimetype": "application/pdf",
diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test.json b/tests/data_scanned/groundtruth/docling_v2/ocr_test.json
index 4c796c0..aea68d8 100644
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test.json
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.4.0",
+  "version": "1.5.0",
   "name": "ocr_test",
   "origin": {
     "mimetype": "application/pdf",
diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.json b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.json
index 5ecd3ec..070e16b 100644
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.json
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.4.0",
+  "version": "1.5.0",
   "name": "ocr_test_rotated_180",
   "origin": {
     "mimetype": "application/pdf",
diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.json b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.json
index 17633a7..53d5f85 100644
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.json
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.4.0",
+  "version": "1.5.0",
   "name": "ocr_test_rotated_270",
   "origin": {
     "mimetype": "application/pdf",
diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.json b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.json
index 32e62f7..828c9e5 100644
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.json
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.4.0",
+  "version": "1.5.0",
   "name": "ocr_test_rotated_90",
   "origin": {
     "mimetype": "application/pdf",
diff --git a/tests/test_backend_pptx.py b/tests/test_backend_pptx.py
index 4f73c87..ffffea3 100644
--- a/tests/test_backend_pptx.py
+++ b/tests/test_backend_pptx.py
@@ -41,12 +41,12 @@ def test_e2e_pptx_conversions():
         doc: DoclingDocument = conv_result.document

         pred_md: str = doc.export_to_markdown()
-        assert verify_export(pred_md, str(gt_path) + ".md"), "export to md"
+        assert verify_export(pred_md, str(gt_path) + ".md", GENERATE), "export to md"

         pred_itxt: str = doc._export_to_indented_text(
             max_text_len=70, explicit_tables=False
         )
-        assert verify_export(pred_itxt, str(gt_path) + ".itxt"), (
+        assert verify_export(pred_itxt, str(gt_path) + ".itxt", GENERATE), (
             "export to indented-text"
         )
diff --git a/uv.lock b/uv.lock
index 947ec38..f12ab93 100644
--- a/uv.lock
+++ b/uv.lock
@@ -912,7 +912,7 @@ requires-dist = [
     { name = "accelerate", marker = "extra == 'vlm'", specifier = ">=1.2.1,<2.0.0" },
     { name = "beautifulsoup4", specifier = ">=4.12.3,<5.0.0" },
     { name = "certifi", specifier = ">=2024.7.4" },
-    { name = "docling-core", extras = ["chunking"], specifier = ">=2.29.0,<3.0.0" },
+    { name = "docling-core", extras = ["chunking"], specifier = ">=2.39.0,<3.0.0" },
{ name = "docling-ibm-models", specifier = ">=3.4.4,<4.0.0" }, { name = "docling-parse", specifier = ">=4.0.0,<5.0.0" }, { name = "easyocr", specifier = ">=1.7,<2.0" }, @@ -987,7 +987,7 @@ examples = [ [[package]] name = "docling-core" -version = "2.38.1" +version = "2.39.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "jsonref" }, @@ -1001,9 +1001,9 @@ dependencies = [ { name = "typer" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/38/f7/33bb17aa13e73722bf18ecfb7f13d6fbfb384c22003209bd72708123b33f/docling_core-2.38.1.tar.gz", hash = "sha256:a0566df2316eec4d22953ca7dac839b926dd57549b4c07ac810e87dbbaf91a10", size = 146276, upload-time = "2025-06-20T12:28:48.422Z" } +sdist = { url = "https://files.pythonhosted.org/packages/6a/8b/5613467523bed58d9f2b94220947783914b6d9910a8d20908cf148805427/docling_core-2.39.0.tar.gz", hash = "sha256:77530156c79c9000fe3104894935437d3e2d46dc0f567b5a500974d7c1a8b38b", size = 148005, upload-time = "2025-06-27T12:59:56.694Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/f0/c5/fb2e24602db94ec02cc3ac8eb7b9665f2a5f61ff81866beb67aff95a353a/docling_core-2.38.1-py3-none-any.whl", hash = "sha256:6859313561030503e8b53aec535aa5edb765a679af76ce2e2c60722d78c6c613", size = 151570, upload-time = "2025-06-20T12:28:46.764Z" }, + { url = "https://files.pythonhosted.org/packages/70/85/3d59ac46a47f62a0ed79e187c4163cecd2693d05006f771038db4781f9ff/docling_core-2.39.0-py3-none-any.whl", hash = "sha256:b7ce5142ab95bd8d5cfe5d7df167a96a6eb41d884f00ea42bb3dd8f40ade92ea", size = 152890, upload-time = "2025-06-27T12:59:55.327Z" }, ] [package.optional-dependencies]