Docling/tests/data/groundtruth/docling_v2/csv-comma-in-cell.csv.json
Tobias Strebitzer 00d9405b0a
feat: Add support for CSV input with new backend to transform CSV files to DoclingDocument (#945)
* feat: Implement csv backend and format detection

Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>

* test: Implement csv parsing and format tests

Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>

* docs: Add example and CSV format documentation

Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>

* feat: Add support for various CSV dialects and update documentation

Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>

* feat: Add validation for delimiters and tests for inconsistent csv files

Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>

---------

Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>
2025-02-14 08:55:09 +01:00

546 lines
16 KiB
JSON

{
"schema_name": "DoclingDocument",
"version": "1.1.0",
"name": "csv-comma-in-cell",
"origin": {
"mimetype": "text/csv",
"binary_hash": 17599039665518552414,
"filename": "csv-comma-in-cell.csv"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/tables/0"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [],
"texts": [],
"pictures": [],
"tables": [
{
"self_ref": "#/tables/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "table",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"data": {
"table_cells": [
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "1",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "2",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "3",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 3,
"end_col_offset_idx": 4,
"text": "4",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "a",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "b",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "c",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 3,
"end_col_offset_idx": 4,
"text": "d",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "a",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": ",",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "c",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 3,
"end_col_offset_idx": 4,
"text": "d",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "a",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "b",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "c",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 3,
"end_col_offset_idx": 4,
"text": "d",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 4,
"end_row_offset_idx": 5,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "a",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 4,
"end_row_offset_idx": 5,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "b",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 4,
"end_row_offset_idx": 5,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "c",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 4,
"end_row_offset_idx": 5,
"start_col_offset_idx": 3,
"end_col_offset_idx": 4,
"text": "d",
"column_header": false,
"row_header": false,
"row_section": false
}
],
"num_rows": 5,
"num_cols": 4,
"grid": [
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "1",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "2",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "3",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 3,
"end_col_offset_idx": 4,
"text": "4",
"column_header": false,
"row_header": false,
"row_section": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "a",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "b",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "c",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 3,
"end_col_offset_idx": 4,
"text": "d",
"column_header": false,
"row_header": false,
"row_section": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "a",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": ",",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "c",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 3,
"end_col_offset_idx": 4,
"text": "d",
"column_header": false,
"row_header": false,
"row_section": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "a",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "b",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "c",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 3,
"end_col_offset_idx": 4,
"text": "d",
"column_header": false,
"row_header": false,
"row_section": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 4,
"end_row_offset_idx": 5,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "a",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 4,
"end_row_offset_idx": 5,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "b",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 4,
"end_row_offset_idx": 5,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "c",
"column_header": false,
"row_header": false,
"row_section": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 4,
"end_row_offset_idx": 5,
"start_col_offset_idx": 3,
"end_col_offset_idx": 4,
"text": "d",
"column_header": false,
"row_header": false,
"row_section": false
}
]
]
}
}
],
"key_value_items": [],
"pages": {}
}