{ "schema_name": "DoclingDocument", "version": "1.5.0", "name": "2203.01017v2", "origin": { "mimetype": "application/pdf", "binary_hash": 10763566541725197878, "filename": "2203.01017v2.pdf", "uri": null }, "furniture": { "self_ref": "#/furniture", "parent": null, "children": [], "content_layer": "furniture", "name": "_root_", "label": "unspecified" }, "body": { "self_ref": "#/body", "parent": null, "children": [ { "cref": "#/texts/0" }, { "cref": "#/texts/1" }, { "cref": "#/texts/2" }, { "cref": "#/groups/0" }, { "cref": "#/texts/4" }, { "cref": "#/texts/5" }, { "cref": "#/texts/6" }, { "cref": "#/texts/7" }, { "cref": "#/texts/8" }, { "cref": "#/pictures/0" }, { "cref": "#/tables/0" }, { "cref": "#/groups/1" }, { "cref": "#/pictures/1" }, { "cref": "#/pictures/2" }, { "cref": "#/tables/1" }, { "cref": "#/texts/64" }, { "cref": "#/texts/65" }, { "cref": "#/texts/66" }, { "cref": "#/texts/67" }, { "cref": "#/texts/68" }, { "cref": "#/texts/69" }, { "cref": "#/texts/70" }, { "cref": "#/groups/2" }, { "cref": "#/texts/75" }, { "cref": "#/texts/76" }, { "cref": "#/texts/77" }, { "cref": "#/texts/78" }, { "cref": "#/texts/79" }, { "cref": "#/texts/80" }, { "cref": "#/texts/81" }, { "cref": "#/texts/82" }, { "cref": "#/texts/83" }, { "cref": "#/texts/84" }, { "cref": "#/texts/85" }, { "cref": "#/texts/86" }, { "cref": "#/texts/87" }, { "cref": "#/texts/88" }, { "cref": "#/pictures/3" }, { "cref": "#/texts/123" }, { "cref": "#/texts/124" }, { "cref": "#/texts/125" }, { "cref": "#/texts/126" }, { "cref": "#/texts/127" }, { "cref": "#/texts/128" }, { "cref": "#/texts/129" }, { "cref": "#/texts/130" }, { "cref": "#/texts/131" }, { "cref": "#/tables/2" }, { "cref": "#/texts/133" }, { "cref": "#/texts/134" }, { "cref": "#/texts/135" }, { "cref": "#/texts/136" }, { "cref": "#/texts/137" }, { "cref": "#/texts/138" }, { "cref": "#/texts/139" }, { "cref": "#/texts/140" }, { "cref": "#/pictures/4" }, { "cref": "#/pictures/5" }, { "cref": "#/texts/246" }, { "cref": "#/texts/247" }, { "cref": "#/texts/248" }, { "cref": "#/texts/249" }, { "cref": "#/texts/250" }, { "cref": "#/texts/251" }, { "cref": "#/texts/252" }, { "cref": "#/texts/253" }, { "cref": "#/texts/254" }, { "cref": "#/texts/255" }, { "cref": "#/texts/256" }, { "cref": "#/texts/257" }, { "cref": "#/texts/258" }, { "cref": "#/texts/259" }, { "cref": "#/texts/260" }, { "cref": "#/texts/261" }, { "cref": "#/texts/262" }, { "cref": "#/texts/263" }, { "cref": "#/texts/264" }, { "cref": "#/texts/265" }, { "cref": "#/texts/266" }, { "cref": "#/texts/267" }, { "cref": "#/texts/268" }, { "cref": "#/texts/269" }, { "cref": "#/texts/270" }, { "cref": "#/texts/271" }, { "cref": "#/texts/272" }, { "cref": "#/texts/273" }, { "cref": "#/texts/274" }, { "cref": "#/texts/275" }, { "cref": "#/tables/3" }, { "cref": "#/texts/276" }, { "cref": "#/texts/277" }, { "cref": "#/texts/278" }, { "cref": "#/tables/4" }, { "cref": "#/texts/280" }, { "cref": "#/tables/5" }, { "cref": "#/texts/282" }, { "cref": "#/groups/3" }, { "cref": "#/texts/285" }, { "cref": "#/texts/286" }, { "cref": "#/pictures/6" }, { "cref": "#/pictures/7" }, { "cref": "#/tables/6" }, { "cref": "#/tables/7" }, { "cref": "#/pictures/8" }, { "cref": "#/pictures/9" }, { "cref": "#/pictures/10" }, { "cref": "#/texts/348" }, { "cref": "#/texts/349" }, { "cref": "#/texts/350" }, { "cref": "#/texts/351" }, { "cref": "#/texts/352" }, { "cref": "#/groups/4" }, { "cref": "#/texts/354" }, { "cref": "#/groups/5" }, { "cref": "#/texts/380" }, { "cref": "#/texts/381" }, { "cref": "#/groups/6" }, { "cref": "#/texts/396" }, { "cref": "#/texts/397" }, { "cref": "#/texts/398" }, { "cref": "#/texts/399" }, { "cref": "#/texts/400" }, { "cref": "#/texts/401" }, { "cref": "#/texts/402" }, { "cref": "#/texts/403" }, { "cref": "#/texts/404" }, { "cref": "#/texts/405" }, { "cref": "#/groups/7" }, { "cref": "#/texts/411" }, { "cref": "#/texts/412" }, { "cref": "#/texts/413" }, { "cref": "#/pictures/11" }, { "cref": "#/groups/8" }, { "cref": "#/texts/476" }, { "cref": "#/texts/477" }, { "cref": "#/texts/478" }, { "cref": "#/groups/9" }, { "cref": "#/texts/484" }, { "cref": "#/texts/485" }, { "cref": "#/groups/10" }, { "cref": "#/texts/491" }, { "cref": "#/groups/11" }, { "cref": "#/texts/496" }, { "cref": "#/texts/497" }, { "cref": "#/texts/498" }, { "cref": "#/texts/499" }, { "cref": "#/tables/8" }, { "cref": "#/tables/9" }, { "cref": "#/tables/10" }, { "cref": "#/tables/11" }, { "cref": "#/texts/500" }, { "cref": "#/tables/12" }, { "cref": "#/tables/13" }, { "cref": "#/tables/14" }, { "cref": "#/pictures/12" }, { "cref": "#/tables/15" }, { "cref": "#/tables/16" }, { "cref": "#/tables/17" }, { "cref": "#/tables/18" }, { "cref": "#/pictures/13" }, { "cref": "#/tables/19" }, { "cref": "#/texts/503" }, { "cref": "#/tables/20" }, { "cref": "#/pictures/14" }, { "cref": "#/texts/504" }, { "cref": "#/tables/21" }, { "cref": "#/tables/22" }, { "cref": "#/tables/23" }, { "cref": "#/pictures/15" }, { "cref": "#/texts/505" }, { "cref": "#/tables/24" }, { "cref": "#/tables/25" }, { "cref": "#/tables/26" }, { "cref": "#/pictures/16" }, { "cref": "#/tables/27" }, { "cref": "#/tables/28" }, { "cref": "#/tables/29" }, { "cref": "#/tables/30" }, { "cref": "#/texts/508" }, { "cref": "#/pictures/17" }, { "cref": "#/tables/31" }, { "cref": "#/pictures/18" }, { "cref": "#/tables/32" }, { "cref": "#/pictures/19" }, { "cref": "#/pictures/20" }, { "cref": "#/tables/33" }, { "cref": "#/tables/34" }, { "cref": "#/pictures/21" }, { "cref": "#/tables/35" }, { "cref": "#/pictures/22" }, { "cref": "#/tables/36" }, { "cref": "#/tables/37" }, { "cref": "#/texts/510" }, { "cref": "#/texts/511" }, { "cref": "#/pictures/23" }, { "cref": "#/texts/513" } ], "content_layer": "body", "name": "_root_", "label": "unspecified" }, "groups": [ { "self_ref": "#/groups/0", "parent": { "cref": "#/body" }, "children": [ { "cref": "#/texts/3" } ], "content_layer": "body", "name": "group", "label": "key_value_area" }, { "self_ref": "#/groups/1", "parent": { "cref": "#/body" }, "children": [ { "cref": "#/texts/12" }, { "cref": "#/texts/38" } ], "content_layer": "body", "name": "list", "label": "list" }, { "self_ref": "#/groups/2", "parent": { "cref": "#/body" }, "children": [ { "cref": "#/texts/71" }, { "cref": "#/texts/72" }, { "cref": "#/texts/73" }, { "cref": "#/texts/74" } ], "content_layer": "body", "name": "list", "label": "list" }, { "self_ref": "#/groups/3", "parent": { "cref": "#/body" }, "children": [ { "cref": "#/texts/283" }, { "cref": "#/texts/284" } ], "content_layer": "body", "name": "list", "label": "list" }, { "self_ref": "#/groups/4", "parent": { "cref": "#/body" }, "children": [ { "cref": "#/texts/353" } ], "content_layer": "body", "name": "list", "label": "list" }, { "self_ref": "#/groups/5", "parent": { "cref": "#/body" }, "children": [ { "cref": "#/texts/355" }, { "cref": "#/texts/356" }, { "cref": "#/texts/357" }, { "cref": "#/texts/358" }, { "cref": "#/texts/359" }, { "cref": "#/texts/360" }, { "cref": "#/texts/361" }, { "cref": "#/texts/362" }, { "cref": "#/texts/363" }, { "cref": "#/texts/364" }, { "cref": "#/texts/365" }, { "cref": "#/texts/366" }, { "cref": "#/texts/367" }, { "cref": "#/texts/368" }, { "cref": "#/texts/369" }, { "cref": "#/texts/370" }, { "cref": "#/texts/371" }, { "cref": "#/texts/372" }, { "cref": "#/texts/373" }, { "cref": "#/texts/374" }, { "cref": "#/texts/375" }, { "cref": "#/texts/376" }, { "cref": "#/texts/377" }, { "cref": "#/texts/378" }, { "cref": "#/texts/379" } ], "content_layer": "body", "name": "list", "label": "list" }, { "self_ref": "#/groups/6", "parent": { "cref": "#/body" }, "children": [ { "cref": "#/texts/382" }, { "cref": "#/texts/383" }, { "cref": "#/texts/384" }, { "cref": "#/texts/385" }, { "cref": "#/texts/386" }, { "cref": "#/texts/387" }, { "cref": "#/texts/388" }, { "cref": "#/texts/389" }, { "cref": "#/texts/390" }, { "cref": "#/texts/391" }, { "cref": "#/texts/392" }, { "cref": "#/texts/393" }, { "cref": "#/texts/394" }, { "cref": "#/texts/395" } ], "content_layer": "body", "name": "list", "label": "list" }, { "self_ref": "#/groups/7", "parent": { "cref": "#/body" }, "children": [ { "cref": "#/texts/406" }, { "cref": "#/texts/407" }, { "cref": "#/texts/408" }, { "cref": "#/texts/409" }, { "cref": "#/texts/410" } ], "content_layer": "body", "name": "list", "label": "list" }, { "self_ref": "#/groups/8", "parent": { "cref": "#/body" }, "children": [ { "cref": "#/texts/474" }, { "cref": "#/texts/475" } ], "content_layer": "body", "name": "list", "label": "list" }, { "self_ref": "#/groups/9", "parent": { "cref": "#/body" }, "children": [ { "cref": "#/texts/479" }, { "cref": "#/texts/480" }, { "cref": "#/texts/481" }, { "cref": "#/texts/482" }, { "cref": "#/texts/483" } ], "content_layer": "body", "name": "list", "label": "list" }, { "self_ref": "#/groups/10", "parent": { "cref": "#/body" }, "children": [ { "cref": "#/texts/486" }, { "cref": "#/texts/487" }, { "cref": "#/texts/488" }, { "cref": "#/texts/489" }, { "cref": "#/texts/490" } ], "content_layer": "body", "name": "list", "label": "list" }, { "self_ref": "#/groups/11", "parent": { "cref": "#/body" }, "children": [ { "cref": "#/texts/492" }, { "cref": "#/texts/493" }, { "cref": "#/texts/494" }, { "cref": "#/texts/495" } ], "content_layer": "body", "name": "list", "label": "list" } ], "texts": [ { "self_ref": "#/texts/0", "parent": { "cref": "#/body" }, "children": [], "content_layer": "furniture", "label": "page_header", "prov": [ { "page_no": 1, "bbox": { "l": 18.340221, "t": 584.17999, "r": 36.339779, "b": 231.99996999999996, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 38 ] } ], "orig": "arXiv:2203.01017v2 [cs.CV] 11 Mar 2022", "text": "arXiv:2203.01017v2 [cs.CV] 11 Mar 2022", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/1", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "section_header", "prov": [ { "page_no": 1, "bbox": { "l": 96.301003, "t": 684.96588, "r": 498.92708999999996, "b": 672.06866, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 61 ] } ], "orig": "TableFormer: Table Structure Understanding with Transformers.", "text": "TableFormer: Table Structure Understanding with Transformers.", "formatting": null, "hyperlink": null, "level": 1 }, { "self_ref": "#/texts/2", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "section_header", "prov": [ { "page_no": 1, "bbox": { "l": 142.47701, "t": 645.31464, "r": 452.75027, "b": 620.67963, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 73 ] } ], "orig": "Ahmed Nassar, Nikolaos Livathinos, Maksym Lysak, Peter Staar IBM Research", "text": "Ahmed Nassar, Nikolaos Livathinos, Maksym Lysak, Peter Staar IBM Research", "formatting": null, "hyperlink": null, "level": 1 }, { "self_ref": "#/texts/3", "parent": { "cref": "#/groups/0" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 208.123, "t": 616.03876, "r": 378.73257, "b": 607.57446, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 35 ] } ], "orig": "{ ahn,nli,mly,taa } @zurich.ibm.com", "text": "{ ahn,nli,mly,taa } @zurich.ibm.com", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/4", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "section_header", "prov": [ { "page_no": 1, "bbox": { "l": 145.99498, "t": 576.51703, "r": 190.48029, "b": 565.76929, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 8 ] } ], "orig": "Abstract", "text": "Abstract", "formatting": null, "hyperlink": null, "level": 1 }, { "self_ref": "#/texts/5", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "section_header", "prov": [ { "page_no": 1, "bbox": { "l": 315.56702, "t": 573.99316, "r": 408.4407, "b": 565.24518, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 22 ] } ], "orig": "a. Picture of a table:", "text": "a. Picture of a table:", "formatting": null, "hyperlink": null, "level": 1 }, { "self_ref": "#/texts/6", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 50.111977, "t": 550.60492, "r": 286.36511, "b": 279.00335999999993, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1320 ] } ], "orig": "Tables organize valuable content in a concise and compact representation. This content is extremely valuable for systems such as search engines, Knowledge Graph's, etc, since they enhance their predictive capabilities. Unfortunately, tables come in a large variety of shapes and sizes. Furthermore, they can have complex column/row-header configurations, multiline rows, different variety of separation lines, missing entries, etc. As such, the correct identification of the table-structure from an image is a nontrivial task. In this paper, we present a new table-structure identification model. The latter improves the latest end-toend deep learning model (i.e. encoder-dual-decoder from PubTabNet) in two significant ways. First, we introduce a new object detection decoder for table-cells. In this way, we can obtain the content of the table-cells from programmatic PDF's directly from the PDF source and avoid the training of the custom OCR decoders. This architectural change leads to more accurate table-content extraction and allows us to tackle non-english tables. Second, we replace the LSTM decoders with transformer based decoders. This upgrade improves significantly the previous state-of-the-art tree-editing-distance-score (TEDS) from 91% to 98.5% on simple tables and from 88.7% to 95% on complex tables.", "text": "Tables organize valuable content in a concise and compact representation. This content is extremely valuable for systems such as search engines, Knowledge Graph's, etc, since they enhance their predictive capabilities. Unfortunately, tables come in a large variety of shapes and sizes. Furthermore, they can have complex column/row-header configurations, multiline rows, different variety of separation lines, missing entries, etc. As such, the correct identification of the table-structure from an image is a nontrivial task. In this paper, we present a new table-structure identification model. The latter improves the latest end-toend deep learning model (i.e. encoder-dual-decoder from PubTabNet) in two significant ways. First, we introduce a new object detection decoder for table-cells. In this way, we can obtain the content of the table-cells from programmatic PDF's directly from the PDF source and avoid the training of the custom OCR decoders. This architectural change leads to more accurate table-content extraction and allows us to tackle non-english tables. Second, we replace the LSTM decoders with transformer based decoders. This upgrade improves significantly the previous state-of-the-art tree-editing-distance-score (TEDS) from 91% to 98.5% on simple tables and from 88.7% to 95% on complex tables.", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/7", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "section_header", "prov": [ { "page_no": 1, "bbox": { "l": 50.111977, "t": 252.05723999999998, "r": 126.94804, "b": 241.30951000000005, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 15 ] } ], "orig": "1. Introduction", "text": "1. Introduction", "formatting": null, "hyperlink": null, "level": 1 }, { "self_ref": "#/texts/8", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 50.111977, "t": 231.21680000000003, "r": 286.36508, "b": 78.84822099999997, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 712 ] } ], "orig": "The occurrence of tables in documents is ubiquitous. They often summarise quantitative or factual data, which is cumbersome to describe in verbose text but nevertheless extremely valuable. Unfortunately, this compact representation is often not easy to parse by machines. There are many implicit conventions used to obtain a compact table representation. For example, tables often have complex columnand row-headers in order to reduce duplicated cell content. Lines of different shapes and sizes are leveraged to separate content or indicate a tree structure. Additionally, tables can also have empty/missing table-entries or multi-row textual table-entries. Fig. 1 shows a table which presents all these issues.", "text": "The occurrence of tables in documents is ubiquitous. They often summarise quantitative or factual data, which is cumbersome to describe in verbose text but nevertheless extremely valuable. Unfortunately, this compact representation is often not easy to parse by machines. There are many implicit conventions used to obtain a compact table representation. For example, tables often have complex columnand row-headers in order to reduce duplicated cell content. Lines of different shapes and sizes are leveraged to separate content or indicate a tree structure. Additionally, tables can also have empty/missing table-entries or multi-row textual table-entries. Fig. 1 shows a table which presents all these issues.", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/9", "parent": { "cref": "#/pictures/0" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 451.9457100000001, "t": 556.65295, "r": 457.95050000000003, "b": 546.52252, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "1", "text": "1", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/10", "parent": { "cref": "#/pictures/0" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 331.19681, "t": 522.64734, "r": 337.2016, "b": 512.51691, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "2", "text": "2", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/11", "parent": { "cref": "#/pictures/0" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 384.0329, "t": 539.32104, "r": 390.03769, "b": 529.19061, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "3", "text": "3", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/12", "parent": { "cref": "#/groups/1" }, "children": [], "content_layer": "body", "label": "list_item", "prov": [ { "page_no": 1, "bbox": { "l": 315.56702, "t": 478.30521000000005, "r": 486.40194999999994, "b": 458.7572, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 68 ] } ], "orig": "b. Red-annotation of bounding boxes, Blue-predictions by TableFormer", "text": "Red-annotation of bounding boxes, Blue-predictions by TableFormer", "formatting": null, "hyperlink": null, "enumerated": false, "marker": "b." }, { "self_ref": "#/texts/13", "parent": { "cref": "#/pictures/1" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 408.14752, "t": 449.17172, "r": 412.54001, "b": 440.38678, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "1", "text": "1", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/14", "parent": { "cref": "#/pictures/1" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 356.11011, "t": 450.42783, "r": 360.50259, "b": 441.64288, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "0", "text": "0", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/15", "parent": { "cref": "#/pictures/1" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 500.6777, "t": 451.06232, "r": 505.0701900000001, "b": 442.2773700000001, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "2", "text": "2", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/16", "parent": { "cref": "#/pictures/1" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 356.13382, "t": 440.25211, "r": 360.52631, "b": 431.46716, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "3", "text": "3", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/17", "parent": { "cref": "#/pictures/1" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 402.53992, "t": 436.1235, "r": 406.9324, "b": 427.33856, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "4", "text": "4", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/18", "parent": { "cref": "#/pictures/1" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 448.58178999999996, "t": 439.15982, "r": 452.97427, "b": 430.37488, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "5", "text": "5", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/19", "parent": { "cref": "#/pictures/1" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 491.65161000000006, "t": 438.29343, "r": 496.0441, "b": 429.50848, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "6", "text": "6", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/20", "parent": { "cref": "#/pictures/1" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 535.13843, "t": 438.66031, "r": 539.53088, "b": 429.87537, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "7", "text": "7", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/21", "parent": { "cref": "#/pictures/1" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 348.82822, "t": 404.90219, "r": 353.2207, "b": 396.11725, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "8", "text": "8", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/22", "parent": { "cref": "#/pictures/1" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 389.27151, "t": 416.62772, "r": 393.664, "b": 407.84277, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "9", "text": "9", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/23", "parent": { "cref": "#/pictures/1" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 442.67479999999995, "t": 416.35379, "r": 451.45889000000005, "b": 407.56885, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "10", "text": "10", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/24", "parent": { "cref": "#/pictures/1" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 477.4382299999999, "t": 416.466, "r": 485.90167, "b": 407.68105999999995, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "11", "text": "11", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/25", "parent": { "cref": "#/pictures/1" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 522.57263, "t": 416.35379, "r": 531.35669, "b": 407.56885, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "12", "text": "12", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/26", "parent": { "cref": "#/pictures/1" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 400.22992, "t": 404.88571, "r": 409.01401, "b": 396.10077, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "13", "text": "13", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/27", "parent": { "cref": "#/pictures/1" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 442.30792, "t": 405.01018999999997, "r": 451.0920100000001, "b": 396.22524999999996, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "14", "text": "14", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/28", "parent": { "cref": "#/pictures/1" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 478.21941999999996, "t": 404.62531, "r": 487.00351000000006, "b": 395.84036, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "15", "text": "15", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/29", "parent": { "cref": "#/pictures/1" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 523.2287, "t": 405.01018999999997, "r": 532.01276, "b": 396.22524999999996, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "16", "text": "16", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/30", "parent": { "cref": "#/pictures/1" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 411.57233, "t": 392.57523, "r": 415.96481, "b": 383.79028, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "1", "text": "1", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/31", "parent": { "cref": "#/pictures/1" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 415.96393, "t": 392.57523, "r": 420.35641, "b": 383.79028, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "7", "text": "7", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/32", "parent": { "cref": "#/pictures/1" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 442.30521, "t": 392.9628000000001, "r": 451.08929, "b": 384.17786000000007, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "18", "text": "18", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/33", "parent": { "cref": "#/pictures/1" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 478.77893, "t": 393.00360000000006, "r": 487.56302, "b": 384.21866000000006, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "19", "text": "19", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/34", "parent": { "cref": "#/pictures/1" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 523.97241, "t": 393.3885200000001, "r": 532.75647, "b": 384.60358, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "20", "text": "20", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/35", "parent": { "cref": "#/pictures/1" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 385.09399, "t": 434.23969000000005, "r": 391.09879, "b": 424.10928, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "3", "text": "3", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/36", "parent": { "cref": "#/pictures/1" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 333.43451, "t": 411.2735, "r": 339.4393, "b": 401.14310000000006, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "2", "text": "2", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/37", "parent": { "cref": "#/pictures/1" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 478.07210999999995, "t": 450.9631999999999, "r": 484.0769, "b": 440.83279000000005, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "1", "text": "1", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/38", "parent": { "cref": "#/groups/1" }, "children": [], "content_layer": "body", "label": "list_item", "prov": [ { "page_no": 1, "bbox": { "l": 315.56702, "t": 371.8172, "r": 491.1912500000001, "b": 363.06918, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 38 ] } ], "orig": "c. Structure predicted by TableFormer:", "text": "Structure predicted by TableFormer:", "formatting": null, "hyperlink": null, "enumerated": false, "marker": "c." }, { "self_ref": "#/texts/39", "parent": { "cref": "#/pictures/2" }, "children": [], "content_layer": "body", "label": "caption", "prov": [ { "page_no": 1, "bbox": { "l": 308.862, "t": 277.49963, "r": 545.11517, "b": 232.72709999999995, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 220 ] } ], "orig": "Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: 'PMC2944238 004 02'.", "text": "Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: 'PMC2944238 004 02'.", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/40", "parent": { "cref": "#/pictures/2" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 347.24872, "t": 354.31412, "r": 351.6412, "b": 345.52917, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "1", "text": "1", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/41", "parent": { "cref": "#/pictures/2" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 318.88071, "t": 354.31412, "r": 323.27319, "b": 345.52917, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "0", "text": "0", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/42", "parent": { "cref": "#/pictures/2" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 394.10422, "t": 354.31412, "r": 398.4967, "b": 345.52917, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "2", "text": "2", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/43", "parent": { "cref": "#/pictures/2" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 318.77316, "t": 342.4545, "r": 323.16565, "b": 333.66956, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "3", "text": "3", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/44", "parent": { "cref": "#/pictures/2" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 347.24872, "t": 342.4545, "r": 351.6412, "b": 333.66956, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "4", "text": "4", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/45", "parent": { "cref": "#/pictures/2" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 394.10422, "t": 342.4545, "r": 398.4967, "b": 333.66956, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "5", "text": "5", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/46", "parent": { "cref": "#/pictures/2" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 440.95941000000005, "t": 342.4545, "r": 445.3519, "b": 333.66956, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "6", "text": "6", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/47", "parent": { "cref": "#/pictures/2" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 487.81491, "t": 342.4545, "r": 492.2074, "b": 333.66956, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "7", "text": "7", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/48", "parent": { "cref": "#/pictures/2" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 318.77316, "t": 318.29575, "r": 323.16565, "b": 309.5108, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "8", "text": "8", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/49", "parent": { "cref": "#/pictures/2" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 347.24872, "t": 330.1554, "r": 351.6412, "b": 321.37045, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "9", "text": "9", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/50", "parent": { "cref": "#/pictures/2" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 394.10422, "t": 330.1554, "r": 402.88831, "b": 321.37045, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "10", "text": "10", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/51", "parent": { "cref": "#/pictures/2" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 440.95941000000005, "t": 330.1554, "r": 449.42285, "b": 321.37045, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "11", "text": "11", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/52", "parent": { "cref": "#/pictures/2" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 487.81491, "t": 330.1554, "r": 496.599, "b": 321.37045, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "12", "text": "12", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/53", "parent": { "cref": "#/pictures/2" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 347.24872, "t": 318.29575, "r": 356.03281, "b": 309.5108, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "13", "text": "13", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/54", "parent": { "cref": "#/pictures/2" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 394.10422, "t": 318.29575, "r": 402.88831, "b": 309.5108, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "14", "text": "14", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/55", "parent": { "cref": "#/pictures/2" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 440.95941000000005, "t": 318.29575, "r": 449.7435, "b": 309.5108, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "15", "text": "15", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/56", "parent": { "cref": "#/pictures/2" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 487.81491, "t": 318.29575, "r": 496.599, "b": 309.5108, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "16", "text": "16", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/57", "parent": { "cref": "#/pictures/2" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 347.24872, "t": 306.87531, "r": 356.03281, "b": 298.09036, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "17", "text": "17", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/58", "parent": { "cref": "#/pictures/2" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 394.10422, "t": 306.87531, "r": 402.88831, "b": 298.09036, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "18", "text": "18", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/59", "parent": { "cref": "#/pictures/2" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 440.95941000000005, "t": 306.87531, "r": 449.7435, "b": 298.09036, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "19", "text": "19", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/60", "parent": { "cref": "#/pictures/2" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 487.81491, "t": 306.87531, "r": 496.599, "b": 298.09036, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "20", "text": "20", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/61", "parent": { "cref": "#/pictures/2" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 366.70102, "t": 342.87918, "r": 372.70581, "b": 332.74878, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "3", "text": "3", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/62", "parent": { "cref": "#/pictures/2" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 331.90424, "t": 318.67709, "r": 337.90903, "b": 308.54669, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "2", "text": "2", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/63", "parent": { "cref": "#/pictures/2" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 459.87621999999993, "t": 354.4064, "r": 465.88101, "b": 344.276, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "1", "text": "1", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/64", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 308.862, "t": 207.59064, "r": 545.11517, "b": 126.95307000000003, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 363 ] } ], "orig": "Recently, significant progress has been made with vision based approaches to extract tables in documents. For the sake of completeness, the issue of table extraction from documents is typically decomposed into two separate challenges, i.e. (1) finding the location of the table(s) on a document-page and (2) finding the structure of a given table in the document.", "text": "Recently, significant progress has been made with vision based approaches to extract tables in documents. For the sake of completeness, the issue of table extraction from documents is typically decomposed into two separate challenges, i.e. (1) finding the location of the table(s) on a document-page and (2) finding the structure of a given table in the document.", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/65", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 1, "bbox": { "l": 308.862, "t": 123.61964, "r": 545.11511, "b": 78.84806800000001, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 229 ] } ], "orig": "The first problem is called table-location and has been previously addressed [30, 38, 19, 21, 23, 26, 8] with stateof-the-art object-detection networks (e.g. YOLO and later on Mask-RCNN [9]). For all practical purposes, it can be", "text": "The first problem is called table-location and has been previously addressed [30, 38, 19, 21, 23, 26, 8] with stateof-the-art object-detection networks (e.g. YOLO and later on Mask-RCNN [9]). For all practical purposes, it can be", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/66", "parent": { "cref": "#/body" }, "children": [], "content_layer": "furniture", "label": "page_footer", "prov": [ { "page_no": 1, "bbox": { "l": 295.121, "t": 57.866633999999976, "r": 300.10229, "b": 48.96007199999997, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "1", "text": "1", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/67", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 2, "bbox": { "l": 50.112, "t": 716.79163, "r": 286.36505, "b": 695.93005, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 75 ] } ], "orig": "considered as a solved problem, given enough ground-truth data to train on.", "text": "considered as a solved problem, given enough ground-truth data to train on.", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/68", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 2, "bbox": { "l": 50.112, "t": 692.42859, "r": 286.36514, "b": 563.96991, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 626 ] } ], "orig": "The second problem is called table-structure decomposition. The latter is a long standing problem in the community of document understanding [6, 4, 14]. Contrary to the table-location problem, there are no commonly used approaches that can easily be re-purposed to solve this problem. Lately, a set of new model-architectures has been proposed by the community to address table-structure decomposition [37, 36, 18, 20]. All these models have some weaknesses (see Sec. 2). The common denominator here is the reliance on textual features and/or the inability to provide the bounding box of each table-cell in the original image.", "text": "The second problem is called table-structure decomposition. The latter is a long standing problem in the community of document understanding [6, 4, 14]. Contrary to the table-location problem, there are no commonly used approaches that can easily be re-purposed to solve this problem. Lately, a set of new model-architectures has been proposed by the community to address table-structure decomposition [37, 36, 18, 20]. All these models have some weaknesses (see Sec. 2). The common denominator here is the reliance on textual features and/or the inability to provide the bounding box of each table-cell in the original image.", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/69", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 2, "bbox": { "l": 50.112, "t": 560.46844, "r": 286.36511, "b": 420.05493, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 643 ] } ], "orig": "In this paper, we want to address these weaknesses and present a robust table-structure decomposition algorithm. The design criteria for our model are the following. First, we want our algorithm to be language agnostic. In this way, we can obtain the structure of any table, irregardless of the language. Second, we want our algorithm to leverage as much data as possible from the original PDF document. For programmatic PDF documents, the text-cells can often be extracted much faster and with higher accuracy compared to OCR methods. Last but not least, we want to have a direct link between the table-cell and its bounding box in the image.", "text": "In this paper, we want to address these weaknesses and present a robust table-structure decomposition algorithm. The design criteria for our model are the following. First, we want our algorithm to be language agnostic. In this way, we can obtain the structure of any table, irregardless of the language. Second, we want our algorithm to leverage as much data as possible from the original PDF document. For programmatic PDF documents, the text-cells can often be extracted much faster and with higher accuracy compared to OCR methods. Last but not least, we want to have a direct link between the table-cell and its bounding box in the image.", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/70", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 2, "bbox": { "l": 50.112, "t": 416.5535, "r": 286.36658, "b": 359.827, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 242 ] } ], "orig": "To meet the design criteria listed above, we developed a new model called TableFormer and a synthetically generated table structure dataset called SynthTabNet $^{1}$. In particular, our contributions in this work can be summarised as follows:", "text": "To meet the design criteria listed above, we developed a new model called TableFormer and a synthetically generated table structure dataset called SynthTabNet $^{1}$. In particular, our contributions in this work can be summarised as follows:", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/71", "parent": { "cref": "#/groups/2" }, "children": [], "content_layer": "body", "label": "list_item", "prov": [ { "page_no": 2, "bbox": { "l": 61.569016, "t": 347.56812, "r": 286.3649, "b": 302.67703, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 166 ] } ], "orig": "\u00b7 We propose TableFormer , a transformer based model that predicts tables structure and bounding boxes for the table content simultaneously in an end-to-end approach.", "text": "We propose TableFormer , a transformer based model that predicts tables structure and bounding boxes for the table content simultaneously in an end-to-end approach.", "formatting": null, "hyperlink": null, "enumerated": false, "marker": "\u00b7" }, { "self_ref": "#/texts/72", "parent": { "cref": "#/groups/2" }, "children": [], "content_layer": "body", "label": "list_item", "prov": [ { "page_no": 2, "bbox": { "l": 61.569016, "t": 289.96616, "r": 286.3649, "b": 245.07404999999994, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 181 ] } ], "orig": "\u00b7 Across all benchmark datasets TableFormer significantly outperforms existing state-of-the-art metrics, while being much more efficient in training and inference to existing works.", "text": "Across all benchmark datasets TableFormer significantly outperforms existing state-of-the-art metrics, while being much more efficient in training and inference to existing works.", "formatting": null, "hyperlink": null, "enumerated": false, "marker": "\u00b7" }, { "self_ref": "#/texts/73", "parent": { "cref": "#/groups/2" }, "children": [], "content_layer": "body", "label": "list_item", "prov": [ { "page_no": 2, "bbox": { "l": 61.569, "t": 232.36316, "r": 286.36493, "b": 199.42705, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 106 ] } ], "orig": "\u00b7 We present SynthTabNet a synthetically generated dataset, with various appearance styles and complexity.", "text": "We present SynthTabNet a synthetically generated dataset, with various appearance styles and complexity.", "formatting": null, "hyperlink": null, "enumerated": false, "marker": "\u00b7" }, { "self_ref": "#/texts/74", "parent": { "cref": "#/groups/2" }, "children": [], "content_layer": "body", "label": "list_item", "prov": [ { "page_no": 2, "bbox": { "l": 61.569008000000004, "t": 186.59659999999997, "r": 286.36508, "b": 153.77904999999998, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 131 ] } ], "orig": "\u00b7 An augmented dataset based on PubTabNet [37], FinTabNet [36], and TableBank [17] with generated ground-truth for reproducibility.", "text": "An augmented dataset based on PubTabNet [37], FinTabNet [36], and TableBank [17] with generated ground-truth for reproducibility.", "formatting": null, "hyperlink": null, "enumerated": false, "marker": "\u00b7" }, { "self_ref": "#/texts/75", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 2, "bbox": { "l": 50.112007, "t": 141.40161, "r": 286.36511, "b": 96.630043, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 231 ] } ], "orig": "The paper is structured as follows. In Sec. 2, we give a brief overview of the current state-of-the-art. In Sec. 3, we describe the datasets on which we train. In Sec. 4, we introduce the TableFormer model-architecture and describe", "text": "The paper is structured as follows. In Sec. 2, we give a brief overview of the current state-of-the-art. In Sec. 3, we describe the datasets on which we train. In Sec. 4, we introduce the TableFormer model-architecture and describe", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/76", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "footnote", "prov": [ { "page_no": 2, "bbox": { "l": 60.97100100000001, "t": 86.40372500000001, "r": 183.73055, "b": 79.278458, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 40 ] } ], "orig": "$^{1}$https://github.com/IBM/SynthTabNet", "text": "$^{1}$https://github.com/IBM/SynthTabNet", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/77", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 2, "bbox": { "l": 308.862, "t": 716.79163, "r": 545.11511, "b": 683.9750400000001, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 166 ] } ], "orig": "its results & performance in Sec. 5. As a conclusion, we describe how this new model-architecture can be re-purposed for other tasks in the computer-vision community.", "text": "its results & performance in Sec. 5. As a conclusion, we describe how this new model-architecture can be re-purposed for other tasks in the computer-vision community.", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/78", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "section_header", "prov": [ { "page_no": 2, "bbox": { "l": 308.862, "t": 670.26807, "r": 498.28021, "b": 659.52032, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 37 ] } ], "orig": "2. Previous work and State of the Art", "text": "2. Previous work and State of the Art", "formatting": null, "hyperlink": null, "level": 1 }, { "self_ref": "#/texts/79", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 2, "bbox": { "l": 308.862, "t": 649.77863, "r": 545.11517, "b": 461.54498, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 901 ] } ], "orig": "Identifying the structure of a table has been an outstanding problem in the document-parsing community, that motivates many organised public challenges [6, 4, 14]. The difficulty of the problem can be attributed to a number of factors. First, there is a large variety in the shapes and sizes of tables. Such large variety requires a flexible method. This is especially true for complex column- and row headers, which can be extremely intricate and demanding. A second factor of complexity is the lack of data with regard to table-structure. Until the publication of PubTabNet [37], there were no large datasets (i.e. > 100 K tables) that provided structure information. This happens primarily due to the fact that tables are notoriously time-consuming to annotate by hand. However, this has definitely changed in recent years with the deliverance of PubTabNet [37], FinTabNet [36], TableBank [17] etc.", "text": "Identifying the structure of a table has been an outstanding problem in the document-parsing community, that motivates many organised public challenges [6, 4, 14]. The difficulty of the problem can be attributed to a number of factors. First, there is a large variety in the shapes and sizes of tables. Such large variety requires a flexible method. This is especially true for complex column- and row headers, which can be extremely intricate and demanding. A second factor of complexity is the lack of data with regard to table-structure. Until the publication of PubTabNet [37], there were no large datasets (i.e. > 100 K tables) that provided structure information. This happens primarily due to the fact that tables are notoriously time-consuming to annotate by hand. However, this has definitely changed in recent years with the deliverance of PubTabNet [37], FinTabNet [36], TableBank [17] etc.", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/80", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 2, "bbox": { "l": 308.862, "t": 458.43054, "r": 545.11523, "b": 341.9270900000001, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 552 ] } ], "orig": "Before the rising popularity of deep neural networks, the community relied heavily on heuristic and/or statistical methods to do table structure identification [3, 7, 11, 5, 13, 28]. Although such methods work well on constrained tables [12], a more data-driven approach can be applied due to the advent of convolutional neural networks (CNNs) and the availability of large datasets. To the best-of-our knowledge, there are currently two different types of network architecture that are being pursued for state-of-the-art tablestructure identification.", "text": "Before the rising popularity of deep neural networks, the community relied heavily on heuristic and/or statistical methods to do table structure identification [3, 7, 11, 5, 13, 28]. Although such methods work well on constrained tables [12], a more data-driven approach can be applied due to the advent of convolutional neural networks (CNNs) and the availability of large datasets. To the best-of-our knowledge, there are currently two different types of network architecture that are being pursued for state-of-the-art tablestructure identification.", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/81", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 2, "bbox": { "l": 308.86197, "t": 338.93222, "r": 545.11688, "b": 78.84815200000003, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1262 ] } ], "orig": "Image-to-Text networks : In this type of network, one predicts a sequence of tokens starting from an encoded image. Such sequences of tokens can be HTML table tags [37, 17] or LaTeX symbols[10]. The choice of symbols is ultimately not very important, since one can be transformed into the other. There are however subtle variations in the Image-to-Text networks. The easiest network architectures are \"image-encoder \u2192 text-decoder\" (IETD), similar to network architectures that try to provide captions to images [32]. In these IETD networks, one expects as output the LaTeX/HTML string of the entire table, i.e. the symbols necessary for creating the table with the content of the table. Another approach is the \"image-encoder \u2192 dual decoder\" (IEDD) networks. In these type of networks, one has two consecutive decoders with different purposes. The first decoder is the tag-decoder , i.e. it only produces the HTML/LaTeX tags which construct an empty table. The second content-decoder uses the encoding of the image in combination with the output encoding of each cell-tag (from the tag-decoder ) to generate the textual content of each table cell. The network architecture of IEDD is certainly more elaborate, but it has the advantage that one can pre-train the", "text": "Image-to-Text networks : In this type of network, one predicts a sequence of tokens starting from an encoded image. Such sequences of tokens can be HTML table tags [37, 17] or LaTeX symbols[10]. The choice of symbols is ultimately not very important, since one can be transformed into the other. There are however subtle variations in the Image-to-Text networks. The easiest network architectures are \"image-encoder \u2192 text-decoder\" (IETD), similar to network architectures that try to provide captions to images [32]. In these IETD networks, one expects as output the LaTeX/HTML string of the entire table, i.e. the symbols necessary for creating the table with the content of the table. Another approach is the \"image-encoder \u2192 dual decoder\" (IEDD) networks. In these type of networks, one has two consecutive decoders with different purposes. The first decoder is the tag-decoder , i.e. it only produces the HTML/LaTeX tags which construct an empty table. The second content-decoder uses the encoding of the image in combination with the output encoding of each cell-tag (from the tag-decoder ) to generate the textual content of each table cell. The network architecture of IEDD is certainly more elaborate, but it has the advantage that one can pre-train the", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/82", "parent": { "cref": "#/body" }, "children": [], "content_layer": "furniture", "label": "page_footer", "prov": [ { "page_no": 2, "bbox": { "l": 295.121, "t": 57.86671799999999, "r": 300.10229, "b": 48.960154999999986, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "2", "text": "2", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/83", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 50.112, "t": 716.79163, "r": 250.15102, "b": 707.88507, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 51 ] } ], "orig": "tag-decoder which is constrained to the table-tags.", "text": "tag-decoder which is constrained to the table-tags.", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/84", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 50.112, "t": 704.7806400000001, "r": 286.36514, "b": 516.5459, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 864 ] } ], "orig": "In practice, both network architectures (IETD and IEDD) require an implicit, custom trained object-characterrecognition (OCR) to obtain the content of the table-cells. In the case of IETD, this OCR engine is implicit in the decoder similar to [24]. For the IEDD, the OCR is solely embedded in the content-decoder. This reliance on a custom, implicit OCR decoder is of course problematic. OCR is a well known and extremely tough problem, that often needs custom training for each individual language. However, the limited availability for non-english content in the current datasets, makes it impractical to apply the IETD and IEDD methods on tables with other languages. Additionally, OCR can be completely omitted if the tables originate from programmatic PDF documents with known positions of each cell. The latter was the inspiration for the work of this paper.", "text": "In practice, both network architectures (IETD and IEDD) require an implicit, custom trained object-characterrecognition (OCR) to obtain the content of the table-cells. In the case of IETD, this OCR engine is implicit in the decoder similar to [24]. For the IEDD, the OCR is solely embedded in the content-decoder. This reliance on a custom, implicit OCR decoder is of course problematic. OCR is a well known and extremely tough problem, that often needs custom training for each individual language. However, the limited availability for non-english content in the current datasets, makes it impractical to apply the IETD and IEDD methods on tables with other languages. Additionally, OCR can be completely omitted if the tables originate from programmatic PDF documents with known positions of each cell. The latter was the inspiration for the work of this paper.", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/85", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 50.111992, "t": 513.56104, "r": 286.36511, "b": 301.29712, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1007 ] } ], "orig": "Graph Neural networks : Graph Neural networks (GNN's) take a radically different approach to tablestructure extraction. Note that one table cell can constitute out of multiple text-cells. To obtain the table-structure, one creates an initial graph, where each of the text-cells becomes a node in the graph similar to [33, 34, 2]. Each node is then associated with en embedding vector coming from the encoded image, its coordinates and the encoded text. Furthermore, nodes that represent adjacent text-cells are linked. Graph Convolutional Networks (GCN's) based methods take the image as an input, but also the position of the text-cells and their content [18]. The purpose of a GCN is to transform the input graph into a new graph, which replaces the old links with new ones. The new links then represent the table-structure. With this approach, one can avoid the need to build custom OCR decoders. However, the quality of the reconstructed structure is not comparable to the current state-of-the-art [18].", "text": "Graph Neural networks : Graph Neural networks (GNN's) take a radically different approach to tablestructure extraction. Note that one table cell can constitute out of multiple text-cells. To obtain the table-structure, one creates an initial graph, where each of the text-cells becomes a node in the graph similar to [33, 34, 2]. Each node is then associated with en embedding vector coming from the encoded image, its coordinates and the encoded text. Furthermore, nodes that represent adjacent text-cells are linked. Graph Convolutional Networks (GCN's) based methods take the image as an input, but also the position of the text-cells and their content [18]. The purpose of a GCN is to transform the input graph into a new graph, which replaces the old links with new ones. The new links then represent the table-structure. With this approach, one can avoid the need to build custom OCR decoders. However, the quality of the reconstructed structure is not comparable to the current state-of-the-art [18].", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/86", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 50.111984, "t": 298.31125, "r": 286.36627, "b": 169.73315000000002, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 619 ] } ], "orig": "Hybrid Deep Learning-Rule-Based approach : A popular current model for table-structure identification is the use of a hybrid Deep Learning-Rule-Based approach similar to [27, 29]. In this approach, one first detects the position of the table-cells with object detection (e.g. YoloVx or MaskRCNN), then classifies the table into different types (from its images) and finally uses different rule-sets to obtain its table-structure. Currently, this approach achieves stateof-the-art results, but is not an end-to-end deep-learning method. As such, new rules need to be written if different types of tables are encountered.", "text": "Hybrid Deep Learning-Rule-Based approach : A popular current model for table-structure identification is the use of a hybrid Deep Learning-Rule-Based approach similar to [27, 29]. In this approach, one first detects the position of the table-cells with object detection (e.g. YoloVx or MaskRCNN), then classifies the table into different types (from its images) and finally uses different rule-sets to obtain its table-structure. Currently, this approach achieves stateof-the-art results, but is not an end-to-end deep-learning method. As such, new rules need to be written if different types of tables are encountered.", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/87", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "section_header", "prov": [ { "page_no": 3, "bbox": { "l": 50.111984, "t": 156.05516, "r": 105.22546, "b": 145.30743000000007, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 11 ] } ], "orig": "3. Datasets", "text": "3. Datasets", "formatting": null, "hyperlink": null, "level": 1 }, { "self_ref": "#/texts/88", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 50.111984, "t": 135.57470999999998, "r": 286.36508, "b": 78.84813699999995, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 281 ] } ], "orig": "We rely on large-scale datasets such as PubTabNet [37], FinTabNet [36], and TableBank [17] datasets to train and evaluate our models. These datasets span over various appearance styles and content. We also introduce our own synthetically generated SynthTabNet dataset to fix an im-", "text": "We rely on large-scale datasets such as PubTabNet [37], FinTabNet [36], and TableBank [17] datasets to train and evaluate our models. These datasets span over various appearance styles and content. We also introduce our own synthetically generated SynthTabNet dataset to fix an im-", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/89", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "caption", "prov": [ { "page_no": 3, "bbox": { "l": 308.862, "t": 524.16364, "r": 545.11511, "b": 503.3020900000001, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 104 ] } ], "orig": "Figure 2: Distribution of the tables across different table dimensions in PubTabNet + FinTabNet datasets", "text": "Figure 2: Distribution of the tables across different table dimensions in PubTabNet + FinTabNet datasets", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/90", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "section_header", "prov": [ { "page_no": 3, "bbox": { "l": 380.79849, "t": 712.1882300000001, "r": 486.84909, "b": 703.44025, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 21 ] } ], "orig": "PubTabNet + FinTabNet", "text": "PubTabNet + FinTabNet", "formatting": null, "hyperlink": null, "level": 1 }, { "self_ref": "#/texts/91", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 396.76776, "t": 549.97302, "r": 469.78748, "b": 541.22504, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 14 ] } ], "orig": "Rows / Columns", "text": "Rows / Columns", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/92", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 320.97653, "t": 558.57703, "r": 324.79254, "b": 552.745, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "0", "text": "0", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/93", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 410.483, "t": 558.57703, "r": 418.11319, "b": 552.745, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "20", "text": "20", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/94", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 500.84949, "t": 558.57703, "r": 508.47968000000003, "b": 552.745, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "40", "text": "40", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/95", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 365.29999, "t": 558.57703, "r": 372.93018, "b": 552.745, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "10", "text": "10", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/96", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 455.66626, "t": 558.57703, "r": 463.29645, "b": 552.745, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "30", "text": "30", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/97", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 542.03528, "t": 558.57703, "r": 549.66547, "b": 552.745, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "50", "text": "50", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/98", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 316.04474, "t": 561.55383, "r": 319.86075, "b": 555.7218, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "0", "text": "0", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/99", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 312.62521, "t": 593.30927, "r": 316.44122, "b": 587.47723, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "2", "text": "2", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/100", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 316.43942, "t": 593.30927, "r": 320.2554, "b": 587.47723, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "0", "text": "0", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/101", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 313.14951, "t": 623.90204, "r": 316.96552, "b": 618.07001, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "4", "text": "4", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/102", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 316.96371, "t": 623.90204, "r": 320.77969, "b": 618.07001, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "0", "text": "0", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/103", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 312.92972, "t": 655.41229, "r": 316.74573, "b": 649.58026, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "6", "text": "6", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/104", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 316.74393, "t": 655.41229, "r": 320.55991, "b": 649.58026, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "0", "text": "0", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/105", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 312.48227, "t": 686.39825, "r": 316.29828, "b": 680.56622, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "8", "text": "8", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/106", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 316.29648, "t": 686.39825, "r": 320.11246, "b": 680.56622, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "0", "text": "0", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/107", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 312.48227, "t": 579.74078, "r": 316.29828, "b": 573.90875, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "1", "text": "1", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/108", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 316.29648, "t": 579.74078, "r": 320.11246, "b": 573.90875, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "0", "text": "0", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/109", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 313.07639, "t": 608.27802, "r": 316.8924, "b": 602.44598, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "3", "text": "3", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/110", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 316.89059, "t": 608.27802, "r": 320.70657, "b": 602.44598, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "0", "text": "0", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/111", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 312.76321, "t": 639.526, "r": 316.57922, "b": 633.69397, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "5", "text": "5", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/112", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 316.57742, "t": 639.526, "r": 320.3934, "b": 633.69397, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "0", "text": "0", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/113", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 312.19775, "t": 671.4295, "r": 316.01376, "b": 665.59747, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "7", "text": "7", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/114", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 316.01196, "t": 671.4295, "r": 319.82794, "b": 665.59747, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "0", "text": "0", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/115", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 312.8165, "t": 701.8913, "r": 316.63251, "b": 696.05927, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "9", "text": "9", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/116", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 316.63071, "t": 701.8913, "r": 320.44669, "b": 696.05927, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "0", "text": "0", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/117", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 532.17426, "t": 569.27271, "r": 536.94427, "b": 561.98273, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "0", "text": "0", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/118", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 532.87952, "t": 683.7329700000001, "r": 547.61249, "b": 676.44299, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 3 ] } ], "orig": "10K", "text": "10K", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/119", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 532.7735, "t": 661.21899, "r": 542.73877, "b": 653.92902, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "8K", "text": "8K", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/120", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 532.79901, "t": 638.07648, "r": 542.76428, "b": 630.7865, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "6K", "text": "6K", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/121", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 532.5705, "t": 615.242, "r": 542.53577, "b": 607.95203, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "4K", "text": "4K", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/122", "parent": { "cref": "#/pictures/3" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 532.14551, "t": 592.3537, "r": 542.11078, "b": 585.06372, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "2K", "text": "2K", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/123", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 308.862, "t": 474.52664, "r": 437.27002, "b": 465.62009, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 33 ] } ], "orig": "balance in the previous datasets.", "text": "balance in the previous datasets.", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/124", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 308.862, "t": 460.46863, "r": 545.11517, "b": 164.63825999999995, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1400 ] } ], "orig": "The PubTabNet dataset contains 509k tables delivered as annotated PNG images. The annotations consist of the table structure represented in HTML format, the tokenized text and its bounding boxes per table cell. Fig. 1 shows the appearance style of PubTabNet. Depending on its complexity, a table is characterized as \"simple\" when it does not contain row spans or column spans, otherwise it is \"complex\". The dataset is divided into Train and Val splits (roughly 98% and 2%). The Train split consists of 54% simple and 46% complex tables and the Val split of 51% and 49% respectively. The FinTabNet dataset contains 112k tables delivered as single-page PDF documents with mixed table structures and text content. Similarly to the PubTabNet, the annotations of FinTabNet include the table structure in HTML, the tokenized text and the bounding boxes on a table cell basis. The dataset is divided into Train, Test and Val splits (81%, 9.5%, 9.5%), and each one is almost equally divided into simple and complex tables (Train: 48% simple, 52% complex, Test: 48% simple, 52% complex, Test: 53% simple, 47% complex). Finally the TableBank dataset consists of 145k tables provided as JPEG images. The latter has annotations for the table structure, but only few with bounding boxes of the table cells. The entire dataset consists of simple tables and it is divided into 90% Train, 3% Test and 7% Val splits.", "text": "The PubTabNet dataset contains 509k tables delivered as annotated PNG images. The annotations consist of the table structure represented in HTML format, the tokenized text and its bounding boxes per table cell. Fig. 1 shows the appearance style of PubTabNet. Depending on its complexity, a table is characterized as \"simple\" when it does not contain row spans or column spans, otherwise it is \"complex\". The dataset is divided into Train and Val splits (roughly 98% and 2%). The Train split consists of 54% simple and 46% complex tables and the Val split of 51% and 49% respectively. The FinTabNet dataset contains 112k tables delivered as single-page PDF documents with mixed table structures and text content. Similarly to the PubTabNet, the annotations of FinTabNet include the table structure in HTML, the tokenized text and the bounding boxes on a table cell basis. The dataset is divided into Train, Test and Val splits (81%, 9.5%, 9.5%), and each one is almost equally divided into simple and complex tables (Train: 48% simple, 52% complex, Test: 48% simple, 52% complex, Test: 53% simple, 47% complex). Finally the TableBank dataset consists of 145k tables provided as JPEG images. The latter has annotations for the table structure, but only few with bounding boxes of the table cells. The entire dataset consists of simple tables and it is divided into 90% Train, 3% Test and 7% Val splits.", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/125", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 3, "bbox": { "l": 308.862, "t": 159.48581000000001, "r": 545.11511, "b": 78.84823600000004, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 406 ] } ], "orig": "Due to the heterogeneity across the dataset formats, it was necessary to combine all available data into one homogenized dataset before we could train our models for practical purposes. Given the size of PubTabNet, we adopted its annotation format and we extracted and converted all tables as PNG images with a resolution of 72 dpi. Additionally, we have filtered out tables with extreme sizes due to small", "text": "Due to the heterogeneity across the dataset formats, it was necessary to combine all available data into one homogenized dataset before we could train our models for practical purposes. Given the size of PubTabNet, we adopted its annotation format and we extracted and converted all tables as PNG images with a resolution of 72 dpi. Additionally, we have filtered out tables with extreme sizes due to small", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/126", "parent": { "cref": "#/body" }, "children": [], "content_layer": "furniture", "label": "page_footer", "prov": [ { "page_no": 3, "bbox": { "l": 295.121, "t": 57.86680200000001, "r": 300.10229, "b": 48.960239, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "3", "text": "3", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/127", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 4, "bbox": { "l": 50.112, "t": 716.79163, "r": 286.36511, "b": 695.93005, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 93 ] } ], "orig": "amount of such tables, and kept only those ones ranging between 1*1 and 20*10 (rows/columns).", "text": "amount of such tables, and kept only those ones ranging between 1*1 and 20*10 (rows/columns).", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/128", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 4, "bbox": { "l": 50.112, "t": 691.03961, "r": 286.36514, "b": 478.89493, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 983 ] } ], "orig": "The availability of the bounding boxes for all table cells is essential to train our models. In order to distinguish between empty and non-empty bounding boxes, we have introduced a binary class in the annotation. Unfortunately, the original datasets either omit the bounding boxes for whole tables (e.g. TableBank) or they narrow their scope only to non-empty cells. Therefore, it was imperative to introduce a data pre-processing procedure that generates the missing bounding boxes out of the annotation information. This procedure first parses the provided table structure and calculates the dimensions of the most fine-grained grid that covers the table structure. Notice that each table cell may occupy multiple grid squares due to row or column spans. In case of PubTabNet we had to compute missing bounding boxes for 48% of the simple and 69% of the complex tables. Regarding FinTabNet, 68% of the simple and 98% of the complex tables require the generation of bounding boxes.", "text": "The availability of the bounding boxes for all table cells is essential to train our models. In order to distinguish between empty and non-empty bounding boxes, we have introduced a binary class in the annotation. Unfortunately, the original datasets either omit the bounding boxes for whole tables (e.g. TableBank) or they narrow their scope only to non-empty cells. Therefore, it was imperative to introduce a data pre-processing procedure that generates the missing bounding boxes out of the annotation information. This procedure first parses the provided table structure and calculates the dimensions of the most fine-grained grid that covers the table structure. Notice that each table cell may occupy multiple grid squares due to row or column spans. In case of PubTabNet we had to compute missing bounding boxes for 48% of the simple and 69% of the complex tables. Regarding FinTabNet, 68% of the simple and 98% of the complex tables require the generation of bounding boxes.", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/129", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 4, "bbox": { "l": 50.112, "t": 474.00449000000003, "r": 286.36511, "b": 357.50104, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 571 ] } ], "orig": "As it is illustrated in Fig. 2, the table distributions from all datasets are skewed towards simpler structures with fewer number of rows/columns. Additionally, there is very limited variance in the table styles, which in case of PubTabNet and FinTabNet means one styling format for the majority of the tables. Similar limitations appear also in the type of table content, which in some cases (e.g. FinTabNet) is restricted to a certain domain. Ultimately, the lack of diversity in the training dataset damages the ability of the models to generalize well on unseen data.", "text": "As it is illustrated in Fig. 2, the table distributions from all datasets are skewed towards simpler structures with fewer number of rows/columns. Additionally, there is very limited variance in the table styles, which in case of PubTabNet and FinTabNet means one styling format for the majority of the tables. Similar limitations appear also in the type of table content, which in some cases (e.g. FinTabNet) is restricted to a certain domain. Ultimately, the lack of diversity in the training dataset damages the ability of the models to generalize well on unseen data.", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/130", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 4, "bbox": { "l": 50.112, "t": 352.6106, "r": 286.36655, "b": 164.37611000000004, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 941 ] } ], "orig": "Motivated by those observations we aimed at generating a synthetic table dataset named SynthTabNet . This approach offers control over: 1) the size of the dataset, 2) the table structure, 3) the table style and 4) the type of content. The complexity of the table structure is described by the size of the table header and the table body, as well as the percentage of the table cells covered by row spans and column spans. A set of carefully designed styling templates provides the basis to build a wide range of table appearances. Lastly, the table content is generated out of a curated collection of text corpora. By controlling the size and scope of the synthetic datasets we are able to train and evaluate our models in a variety of different conditions. For example, we can first generate a highly diverse dataset to train our models and then evaluate their performance on other synthetic datasets which are focused on a specific domain.", "text": "Motivated by those observations we aimed at generating a synthetic table dataset named SynthTabNet . This approach offers control over: 1) the size of the dataset, 2) the table structure, 3) the table style and 4) the type of content. The complexity of the table structure is described by the size of the table header and the table body, as well as the percentage of the table cells covered by row spans and column spans. A set of carefully designed styling templates provides the basis to build a wide range of table appearances. Lastly, the table content is generated out of a curated collection of text corpora. By controlling the size and scope of the synthetic datasets we are able to train and evaluate our models in a variety of different conditions. For example, we can first generate a highly diverse dataset to train our models and then evaluate their performance on other synthetic datasets which are focused on a specific domain.", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/131", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 4, "bbox": { "l": 50.112015, "t": 159.48567000000003, "r": 286.36511, "b": 78.84810600000003, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 405 ] } ], "orig": "In this regard, we have prepared four synthetic datasets, each one containing 150k examples. The corpora to generate the table text consists of the most frequent terms appearing in PubTabNet and FinTabNet together with randomly generated text. The first two synthetic datasets have been fine-tuned to mimic the appearance of the original datasets but encompass more complicated table structures. The third", "text": "In this regard, we have prepared four synthetic datasets, each one containing 150k examples. The corpora to generate the table text consists of the most frequent terms appearing in PubTabNet and FinTabNet together with randomly generated text. The first two synthetic datasets have been fine-tuned to mimic the appearance of the original datasets but encompass more complicated table structures. The third", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/132", "parent": { "cref": "#/tables/2" }, "children": [], "content_layer": "body", "label": "caption", "prov": [ { "page_no": 4, "bbox": { "l": 308.862, "t": 624.33862, "r": 545.11505, "b": 567.61102, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 267 ] } ], "orig": "Table 1: Both \"Combined-Tabnet\" and \"CombinedTabnet\" are variations of the following: (*) The CombinedTabnet dataset is the processed combination of PubTabNet and Fintabnet. (**) The combined dataset is the processed combination of PubTabNet, Fintabnet and TableBank.", "text": "Table 1: Both \"Combined-Tabnet\" and \"CombinedTabnet\" are variations of the following: (*) The CombinedTabnet dataset is the processed combination of PubTabNet and Fintabnet. (**) The combined dataset is the processed combination of PubTabNet, Fintabnet and TableBank.", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/133", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 4, "bbox": { "l": 308.862, "t": 542.37958, "r": 545.11517, "b": 497.60803, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 210 ] } ], "orig": "one adopts a colorful appearance with high contrast and the last one contains tables with sparse content. Lastly, we have combined all synthetic datasets into one big unified synthetic dataset of 600k examples.", "text": "one adopts a colorful appearance with high contrast and the last one contains tables with sparse content. Lastly, we have combined all synthetic datasets into one big unified synthetic dataset of 600k examples.", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/134", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 4, "bbox": { "l": 320.81699, "t": 494.22759999999994, "r": 542.74396, "b": 485.32104, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 57 ] } ], "orig": "Tab. 1 summarizes the various attributes of the datasets.", "text": "Tab. 1 summarizes the various attributes of the datasets.", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/135", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "section_header", "prov": [ { "page_no": 4, "bbox": { "l": 308.862, "t": 470.81604, "r": 444.93607000000003, "b": 460.06832999999995, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 24 ] } ], "orig": "4. The TableFormer model", "text": "4. The TableFormer model", "formatting": null, "hyperlink": null, "level": 1 }, { "self_ref": "#/texts/136", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 4, "bbox": { "l": 308.862, "t": 450.06061, "r": 545.11523, "b": 345.51314999999994, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 504 ] } ], "orig": "Given the image of a table, TableFormer is able to predict: 1) a sequence of tokens that represent the structure of a table, and 2) a bounding box coupled to a subset of those tokens. The conversion of an image into a sequence of tokens is a well-known task [35, 16]. While attention is often used as an implicit method to associate each token of the sequence with a position in the original image, an explicit association between the individual table-cells and the image bounding boxes is also required.", "text": "Given the image of a table, TableFormer is able to predict: 1) a sequence of tokens that represent the structure of a table, and 2) a bounding box coupled to a subset of those tokens. The conversion of an image into a sequence of tokens is a well-known task [35, 16]. While attention is often used as an implicit method to associate each token of the sequence with a position in the original image, an explicit association between the individual table-cells and the image bounding boxes is also required.", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/137", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "section_header", "prov": [ { "page_no": 4, "bbox": { "l": 308.862, "t": 334.30573, "r": 420.16058, "b": 324.45367, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 24 ] } ], "orig": "4.1. Model architecture.", "text": "4.1. Model architecture.", "formatting": null, "hyperlink": null, "level": 1 }, { "self_ref": "#/texts/138", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 4, "bbox": { "l": 308.86197, "t": 315.23471, "r": 545.11572, "b": 127.00018999999998, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 907 ] } ], "orig": "We now describe in detail the proposed method, which is composed of three main components, see Fig. 4. Our CNN Backbone Network encodes the input as a feature vector of predefined length. The input feature vector of the encoded image is passed to the Structure Decoder to produce a sequence of HTML tags that represent the structure of the table. With each prediction of an HTML standard data cell (' < td > ') the hidden state of that cell is passed to the Cell BBox Decoder. As for spanning cells, such as row or column span, the tag is broken down to ' < ', 'rowspan=' or 'colspan=', with the number of spanning cells (attribute), and ' > '. The hidden state attached to ' < ' is passed to the Cell BBox Decoder. A shared feed forward network (FFN) receives the hidden states from the Structure Decoder, to provide the final detection predictions of the bounding box coordinates and their classification.", "text": "We now describe in detail the proposed method, which is composed of three main components, see Fig. 4. Our CNN Backbone Network encodes the input as a feature vector of predefined length. The input feature vector of the encoded image is passed to the Structure Decoder to produce a sequence of HTML tags that represent the structure of the table. With each prediction of an HTML standard data cell (' < td > ') the hidden state of that cell is passed to the Cell BBox Decoder. As for spanning cells, such as row or column span, the tag is broken down to ' < ', 'rowspan=' or 'colspan=', with the number of spanning cells (attribute), and ' > '. The hidden state attached to ' < ' is passed to the Cell BBox Decoder. A shared feed forward network (FFN) receives the hidden states from the Structure Decoder, to provide the final detection predictions of the bounding box coordinates and their classification.", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/139", "parent": { "cref": "#/body" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 4, "bbox": { "l": 308.86197, "t": 123.73929999999996, "r": 545.11511, "b": 78.84818300000006, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 223 ] } ], "orig": "CNN Backbone Network. A ResNet-18 CNN is the backbone that receives the table image and encodes it as a vector of predefined length. The network has been modified by removing the linear and pooling layer, as we are not per-", "text": "CNN Backbone Network. A ResNet-18 CNN is the backbone that receives the table image and encodes it as a vector of predefined length. The network has been modified by removing the linear and pooling layer, as we are not per-", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/140", "parent": { "cref": "#/body" }, "children": [], "content_layer": "furniture", "label": "page_footer", "prov": [ { "page_no": 4, "bbox": { "l": 295.12097, "t": 57.86674900000003, "r": 300.10226, "b": 48.96018600000002, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 1 ] } ], "orig": "4", "text": "4", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/141", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "caption", "prov": [ { "page_no": 5, "bbox": { "l": 50.111992, "t": 588.01422, "r": 545.10846, "b": 567.03308, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 212 ] } ], "orig": "Figure 3: TableFormer takes in an image of the PDF and creates bounding box and HTML structure predictions that are synchronized. The bounding boxes grabs the content from the PDF and inserts it in the structure.", "text": "Figure 3: TableFormer takes in an image of the PDF and creates bounding box and HTML structure predictions that are synchronized. The bounding boxes grabs the content from the PDF and inserts it in the structure.", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/142", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 81.688072, "t": 669.5603, "r": 84.927567, "b": 666.37109, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "1.", "text": "1.", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/143", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 86.54731, "t": 669.5603, "r": 93.026291, "b": 666.37109, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 4 ] } ], "orig": "Item", "text": "Item", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/144", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 102.50498, "t": 676.74786, "r": 115.3461, "b": 673.55865, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 6 ] } ], "orig": "Amount", "text": "Amount", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/145", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 82.140205, "t": 676.7851, "r": 93.291527, "b": 673.59589, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 5 ] } ], "orig": "Names", "text": "Names", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/146", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 96.748268, "t": 669.5603, "r": 104.3119, "b": 666.37109, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 4 ] } ], "orig": "1000", "text": "1000", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/147", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 96.748268, "t": 664.2562900000001, "r": 102.42083, "b": 661.06708, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 3 ] } ], "orig": "500", "text": "500", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/148", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 96.748268, "t": 658.54431, "r": 104.3119, "b": 655.3551, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 4 ] } ], "orig": "3500", "text": "3500", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/149", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 96.748268, "t": 652.83228, "r": 102.42083, "b": 649.64307, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 3 ] } ], "orig": "150", "text": "150", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/150", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 110.66107, "t": 669.5603, "r": 116.14391, "b": 666.37109, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 4 ] } ], "orig": "unit", "text": "unit", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/151", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 110.66107, "t": 664.2562900000001, "r": 116.14391, "b": 661.06708, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 4 ] } ], "orig": "unit", "text": "unit", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/152", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 110.66107, "t": 658.54431, "r": 116.14391, "b": 655.3551, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 4 ] } ], "orig": "unit", "text": "unit", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/153", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 110.66107, "t": 652.83228, "r": 116.14391, "b": 649.64307, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 4 ] } ], "orig": "unit", "text": "unit", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/154", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 81.688072, "t": 664.2562900000001, "r": 84.927567, "b": 661.06708, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "2.", "text": "2.", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/155", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 86.54731, "t": 664.2562900000001, "r": 93.026291, "b": 661.06708, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 4 ] } ], "orig": "Item", "text": "Item", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/156", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 81.688072, "t": 658.54431, "r": 84.927567, "b": 655.3551, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "3.", "text": "3.", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/157", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 86.54731, "t": 658.54431, "r": 93.026291, "b": 655.3551, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 4 ] } ], "orig": "Item", "text": "Item", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/158", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 81.688072, "t": 652.83228, "r": 84.927567, "b": 649.64307, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 2 ] } ], "orig": "4.", "text": "4.", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/159", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 86.54731, "t": 652.83228, "r": 93.026291, "b": 649.64307, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 4 ] } ], "orig": "Item", "text": "Item", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/160", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 88.084389, "t": 701.50262, "r": 113.93649, "b": 695.76202, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 9 ] } ], "orig": "Extracted", "text": "Extracted", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/161", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 82.81002, "t": 694.36261, "r": 119.21240000000002, "b": 688.62201, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 12 ] } ], "orig": "Table Images", "text": "Table Images", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/162", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 143.94247, "t": 691.39764, "r": 180.01131, "b": 685.65704, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 12 ] } ], "orig": "Standardized", "text": "Standardized", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/163", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 151.94064, "t": 684.25763, "r": 172.0118, "b": 678.5170299999999, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 6 ] } ], "orig": "Images", "text": "Images", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/164", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 251.76939000000002, "t": 711.0690300000001, "r": 266.39557, "b": 705.32843, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 4 ] } ], "orig": "BBox", "text": "BBox", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/165", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 247.51601, "t": 705.96899, "r": 270.65021, "b": 700.22839, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 7 ] } ], "orig": "Decoder", "text": "Decoder", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/166", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 331.03699, "t": 713.44019, "r": 352.12589, "b": 707.69958, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 6 ] } ], "orig": "BBoxes", "text": "BBoxes", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/167", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 390.56421, "t": 695.96777, "r": 431.7261, "b": 690.2271700000001, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 13 ] } ], "orig": "BBoxes can be", "text": "BBoxes can be", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/168", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 386.82422, "t": 689.8477199999999, "r": 435.46966999999995, "b": 684.10712, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 18 ] } ], "orig": "traced back to the", "text": "traced back to the", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/169", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 388.69589, "t": 683.72772, "r": 433.6032400000001, "b": 677.9871199999999, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 17 ] } ], "orig": "original image to", "text": "original image to", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/170", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 391.07761, "t": 677.60773, "r": 431.22542999999996, "b": 671.8671300000001, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 15 ] } ], "orig": "extract content", "text": "extract content", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/171", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 431.22650000000004, "t": 640.31488, "r": 498.82068, "b": 634.57428, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 23 ] } ], "orig": "Structure Tags sequence", "text": "Structure Tags sequence", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/172", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 431.1738, "t": 634.19482, "r": 498.87753000000004, "b": 628.45422, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 27 ] } ], "orig": "provide full description of", "text": "provide full description of", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/173", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 440.5289, "t": 628.07483, "r": 489.51827999999995, "b": 622.33423, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 19 ] } ], "orig": "the table structure", "text": "the table structure", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/174", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 328.37479, "t": 613.74615, "r": 367.72333, "b": 608.00555, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 14 ] } ], "orig": "Structure Tags", "text": "Structure Tags", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/175", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 331.84451, "t": 668.09113, "r": 373.67963, "b": 662.3505199999998, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 14 ] } ], "orig": "BBoxes in sync", "text": "BBoxes in sync", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/176", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 331.84451, "t": 662.9911499999998, "r": 381.17786, "b": 657.25055, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 17 ] } ], "orig": "with tag sequence", "text": "with tag sequence", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/177", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 196.62633, "t": 703.88379, "r": 219.42332, "b": 698.14319, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 7 ] } ], "orig": "Encoder", "text": "Encoder", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/178", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 246.66771, "t": 662.5053099999999, "r": 271.49899, "b": 656.76471, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 9 ] } ], "orig": "Structure", "text": "Structure", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/179", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 247.51601, "t": 657.40527, "r": 270.65021, "b": 651.66467, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 7 ] } ], "orig": "Decoder", "text": "Decoder", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/180", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 330.63071, "t": 702.98077, "r": 365.55347, "b": 697.24017, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 16 ] } ], "orig": "[x1, y2, x2, y2]", "text": "[x1, y2, x2, y2]", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/181", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 330.63071, "t": 694.82074, "r": 370.22717, "b": 689.08014, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 20 ] } ], "orig": "[x1', y2', x2', y2']", "text": "[x1', y2', x2', y2']", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/182", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 330.63071, "t": 686.6607700000001, "r": 374.51157, "b": 680.92017, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 24 ] } ], "orig": "[x1'', y2'', x2'', y2'']", "text": "[x1'', y2'', x2'', y2'']", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/183", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 330.63071, "t": 678.5007300000001, "r": 335.73233, "b": 672.76013, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 3 ] } ], "orig": "...", "text": "...", "formatting": null, "hyperlink": null }, { "self_ref": "#/texts/184", "parent": { "cref": "#/pictures/4" }, "children": [], "content_layer": "body", "label": "text", "prov": [ { "page_no": 5, "bbox": { "l": 322.30579, "t": 650.20764, "r": 335.05988, "b": 645.42383, "coord_origin": "BOTTOMLEFT" }, "charspan": [ 0, 4 ] } ], "orig": "