feat: Optimize table extraction quality, add configuration options (#11)
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
This commit is contained in:
@@ -19,18 +19,6 @@ class PageAssembleModel:
|
||||
def __init__(self, config):
|
||||
self.config = config
|
||||
|
||||
# self.line_wrap_pattern = re.compile(r'(?<=[^\W_])- \n(?=\w)')
|
||||
|
||||
# def sanitize_text_poor(self, lines):
|
||||
# text = '\n'.join(lines)
|
||||
#
|
||||
# # treat line wraps.
|
||||
# sanitized_text = self.line_wrap_pattern.sub('', text)
|
||||
#
|
||||
# sanitized_text = sanitized_text.replace('\n', ' ')
|
||||
#
|
||||
# return sanitized_text
|
||||
|
||||
def sanitize_text(self, lines):
|
||||
if len(lines) <= 1:
|
||||
return " ".join(lines)
|
||||
|
||||
Reference in New Issue
Block a user