fix: handling of long sequence of unescaped underscore chars in markdown (#173)
* Fix for md hanging when encountering long sequence of unescaped underscore chars Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Added comment explaining reason for fix Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * Fixed trailing inline text handling (at the end of a file), and corrected underscore sequence shortening Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> * making fix more rare Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> --------- Signed-off-by: Maksym Lysak <mly@zurich.ibm.com> Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
2cece27208
commit
94d0729c50
@ -1,4 +1,6 @@
|
|||||||
import logging
|
import logging
|
||||||
|
import re
|
||||||
|
import warnings
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Set, Union
|
from typing import Set, Union
|
||||||
@ -25,6 +27,30 @@ _log = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||||
|
|
||||||
|
def shorten_underscore_sequences(self, markdown_text, max_length=10):
|
||||||
|
# This regex will match any sequence of underscores
|
||||||
|
pattern = r"_+"
|
||||||
|
|
||||||
|
def replace_match(match):
|
||||||
|
underscore_sequence = match.group(
|
||||||
|
0
|
||||||
|
) # Get the full match (sequence of underscores)
|
||||||
|
|
||||||
|
# Shorten the sequence if it exceeds max_length
|
||||||
|
if len(underscore_sequence) > max_length:
|
||||||
|
return "_" * max_length
|
||||||
|
else:
|
||||||
|
return underscore_sequence # Leave it unchanged if it is shorter or equal to max_length
|
||||||
|
|
||||||
|
# Use re.sub to replace long underscore sequences
|
||||||
|
shortened_text = re.sub(pattern, replace_match, markdown_text)
|
||||||
|
|
||||||
|
if len(shortened_text) != len(markdown_text):
|
||||||
|
warnings.warn("Detected potentially incorrect Markdown, correcting...")
|
||||||
|
|
||||||
|
return shortened_text
|
||||||
|
|
||||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||||
super().__init__(in_doc, path_or_stream)
|
super().__init__(in_doc, path_or_stream)
|
||||||
|
|
||||||
@ -42,11 +68,19 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
try:
|
try:
|
||||||
if isinstance(self.path_or_stream, BytesIO):
|
if isinstance(self.path_or_stream, BytesIO):
|
||||||
text_stream = self.path_or_stream.getvalue().decode("utf-8")
|
text_stream = self.path_or_stream.getvalue().decode("utf-8")
|
||||||
self.markdown = text_stream
|
# remove invalid sequences
|
||||||
|
# very long sequences of underscores will lead to unnecessary long processing times.
|
||||||
|
# In any proper Markdown files, underscores have to be escaped,
|
||||||
|
# otherwise they represent emphasis (bold or italic)
|
||||||
|
self.markdown = self.shorten_underscore_sequences(text_stream)
|
||||||
if isinstance(self.path_or_stream, Path):
|
if isinstance(self.path_or_stream, Path):
|
||||||
with open(self.path_or_stream, "r", encoding="utf-8") as f:
|
with open(self.path_or_stream, "r", encoding="utf-8") as f:
|
||||||
md_content = f.read()
|
md_content = f.read()
|
||||||
self.markdown = md_content
|
# remove invalid sequences
|
||||||
|
# very long sequences of underscores will lead to unnecessary long processing times.
|
||||||
|
# In any proper Markdown files, underscores have to be escaped,
|
||||||
|
# otherwise they represent emphasis (bold or italic)
|
||||||
|
self.markdown = self.shorten_underscore_sequences(md_content)
|
||||||
self.valid = True
|
self.valid = True
|
||||||
|
|
||||||
_log.debug(self.markdown)
|
_log.debug(self.markdown)
|
||||||
|
Loading…
Reference in New Issue
Block a user