From 51d34509156e2dbec9e697276681d59f9ca7e020 Mon Sep 17 00:00:00 2001 From: DavidLee Date: Tue, 27 May 2025 20:06:05 +0800 Subject: [PATCH] fix: UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd0 in position 0: invalid continuation byte (#1665) Update document.py fix: when the MIME type was neither "application/xml" nor "text/plain", the unconditional UTF-8 decode of the content raised UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd0 in position 0: invalid continuation byte. The content is now decoded only inside those two branches. Signed-off-by: DavidLee --- docling/datamodel/document.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index e464ee3..5791c0e 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -334,9 +334,9 @@ class _DocumentConversionInput(BaseModel): ) -> Optional[InputFormat]: """Guess the input format of a document by checking part of its content.""" input_format: Optional[InputFormat] = None - content_str = content.decode("utf-8") if mime == "application/xml": + content_str = content.decode("utf-8") match_doctype = re.search(r"]+>", content_str) if match_doctype: xml_doctype = match_doctype.group() @@ -358,6 +358,7 @@ class _DocumentConversionInput(BaseModel): input_format = InputFormat.XML_JATS elif mime == "text/plain": + content_str = content.decode("utf-8") if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"): input_format = InputFormat.XML_USPTO