feat: Optimize table extraction quality, add configuration options (#11)

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
This commit is contained in:
Christoph Auer
2024-07-17 16:13:21 +02:00
committed by GitHub
parent 3e2ede8107
commit e9526bb11e
5 changed files with 87 additions and 27 deletions

View File

@@ -19,18 +19,6 @@ class PageAssembleModel:
# Constructor: stores the supplied `config` object on the instance as `self.config`.
def __init__(self, config):
self.config = config
# self.line_wrap_pattern = re.compile(r'(?<=[^\W_])- \n(?=\w)')
# def sanitize_text_poor(self, lines):
# text = '\n'.join(lines)
#
# # Rejoin words hyphenated across a line break: drop the "- \n" between them.
# sanitized_text = self.line_wrap_pattern.sub('', text)
#
# sanitized_text = sanitized_text.replace('\n', ' ')
#
# return sanitized_text
def sanitize_text(self, lines):
if len(lines) <= 1:
return " ".join(lines)