From 984cb137f6a8ae2f3a63623add6c474d97ef8739 Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Date: Mon, 2 Jun 2025 08:43:24 +0200 Subject: [PATCH] fix: guess HTML content starting with script tag (#1673) Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --- docling/datamodel/document.py | 6 +++++- tests/test_input_doc.py | 7 +++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index 5791c0e..4c71f5c 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -412,7 +412,11 @@ class _DocumentConversionInput(BaseModel): else: return "application/xml" - if re.match(r".*?\s*)?(\nconsole.log('foo');\n" + '\n' + ) + stream = DocumentStream(name="lorem_ipsum", stream=BytesIO(f"{html_str}".encode())) + assert dci._guess_format(stream) == InputFormat.HTML + # Valid MD buf = BytesIO(Path("./tests/data/md/wiki.md").open("rb").read()) stream = DocumentStream(name="wiki.md", stream=buf)