fix: guess HTML content starting with script tag (#1673)
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
parent
3942923125
commit
984cb137f6
@ -412,7 +412,11 @@ class _DocumentConversionInput(BaseModel):
|
|||||||
else:
|
else:
|
||||||
return "application/xml"
|
return "application/xml"
|
||||||
|
|
||||||
if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
|
if re.match(
|
||||||
|
r"(<script.*?>.*?</script>\s*)?(<!doctype\s+html|<html|<head|<body)",
|
||||||
|
content_str,
|
||||||
|
re.DOTALL,
|
||||||
|
):
|
||||||
return "text/html"
|
return "text/html"
|
||||||
|
|
||||||
p = re.compile(
|
p = re.compile(
|
||||||
|
@ -132,6 +132,13 @@ def test_guess_format(tmp_path):
|
|||||||
doc_path = Path("./tests/data/html/wiki_duck.html")
|
doc_path = Path("./tests/data/html/wiki_duck.html")
|
||||||
assert dci._guess_format(doc_path) == InputFormat.HTML
|
assert dci._guess_format(doc_path) == InputFormat.HTML
|
||||||
|
|
||||||
|
html_str = ( # HTML starting with a script
|
||||||
|
"<script>\nconsole.log('foo');\n</script>"
|
||||||
|
'<!doctype html>\n<html lang="en-us class="no-js"></html>'
|
||||||
|
)
|
||||||
|
stream = DocumentStream(name="lorem_ipsum", stream=BytesIO(f"{html_str}".encode()))
|
||||||
|
assert dci._guess_format(stream) == InputFormat.HTML
|
||||||
|
|
||||||
# Valid MD
|
# Valid MD
|
||||||
buf = BytesIO(Path("./tests/data/md/wiki.md").open("rb").read())
|
buf = BytesIO(Path("./tests/data/md/wiki.md").open("rb").read())
|
||||||
stream = DocumentStream(name="wiki.md", stream=buf)
|
stream = DocumentStream(name="wiki.md", stream=buf)
|
||||||
|
Loading…
Reference in New Issue
Block a user