fix: Let BeautifulSoup detect the HTML encoding (#695)

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2025-01-07 15:49:28 +01:00 committed by GitHub
parent 2d24faecd9
commit 42856fdf79
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -37,10 +37,10 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
try:
if isinstance(self.path_or_stream, BytesIO):
text_stream = self.path_or_stream.getvalue().decode("utf-8")
text_stream = self.path_or_stream.getvalue()
self.soup = BeautifulSoup(text_stream, "html.parser")
if isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, "r", encoding="utf-8") as f:
with open(self.path_or_stream, "rb") as f:
html_content = f.read()
self.soup = BeautifulSoup(html_content, "html.parser")
except Exception as e: