
* Support tableformer model choice Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update datamodel structure Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update docs Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Cleanup Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add test unit for table options Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Ensure import backwards-compatibility for PipelineOptions Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update README Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Adjust parameters on custom_convert Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> * Update Dockerfile Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com>
26 lines
773 B
Python
26 lines
773 B
Python
from enum import Enum, auto
|
|
|
|
from pydantic import BaseModel
|
|
|
|
|
|
class TableFormerMode(str, Enum):
    """Selectable TableFormer model variants.

    Members are plain strings (``str`` mixin), so they compare equal to and
    serialize as their value. The values are spelled out explicitly instead
    of using ``auto()``: with a ``str`` mixin ``auto()`` produces the opaque
    strings ``"1"`` and ``"2"``, which are meaningless in serialized options.
    """

    FAST = "fast"
    ACCURATE = "accurate"

    @classmethod
    def _missing_(cls, value: object) -> "TableFormerMode | None":
        """Resolve the legacy ``auto()``-generated values.

        Called by ``Enum`` only when a by-value lookup fails. ``"1"`` and
        ``"2"`` were the values of FAST and ACCURATE when the members were
        declared with ``auto()``; they are still accepted so previously
        stored options keep loading. Returning ``None`` for anything else
        lets ``Enum`` raise its usual ``ValueError``.
        """
        if not isinstance(value, str):
            return None
        legacy_values = {"1": cls.FAST, "2": cls.ACCURATE}
        return legacy_values.get(value)
|
|
|
|
|
|
class TableStructureOptions(BaseModel):
    # Options for the table structure model.

    # Controls how predicted table cells relate to the PDF's own cells.
    #   True:  Matches predictions back to PDF cells. Can break table output
    #          if PDF cells are merged across table columns.
    #   False: Let table structure model define the text cells, ignore PDF
    #          cells.
    do_cell_matching: bool = True

    # Which TableFormer variant to use; FAST unless the caller overrides it.
    mode: TableFormerMode = TableFormerMode.FAST
|
|
|
|
|
|
class PipelineOptions(BaseModel):
    # Top-level options for the processing pipeline.

    # True: perform table structure extraction
    do_table_structure: bool = True

    # True: perform OCR, replace programmatic PDF text
    do_ocr: bool = True

    # Nested options for table structure extraction; a default-constructed
    # TableStructureOptions is used when none is supplied.
    table_structure_options: TableStructureOptions = TableStructureOptions()
|