From 42856fdf79559188ec4617bc5d3a007286f114d2 Mon Sep 17 00:00:00 2001 From: Christoph Auer <60343111+cau-git@users.noreply.github.com> Date: Tue, 7 Jan 2025 15:49:28 +0100 Subject: [PATCH] fix: Let BeautifulSoup detect the HTML encoding (#695) Signed-off-by: Christoph Auer --- docling/backend/html_backend.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 9cd1e29..ae47888 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -37,10 +37,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): try: if isinstance(self.path_or_stream, BytesIO): - text_stream = self.path_or_stream.getvalue().decode("utf-8") + text_stream = self.path_or_stream.getvalue() self.soup = BeautifulSoup(text_stream, "html.parser") if isinstance(self.path_or_stream, Path): - with open(self.path_or_stream, "r", encoding="utf-8") as f: + with open(self.path_or_stream, "rb") as f: html_content = f.read() self.soup = BeautifulSoup(html_content, "html.parser") except Exception as e: