From 51d34509156e2dbec9e697276681d59f9ca7e020 Mon Sep 17 00:00:00 2001
From: DavidLee <yongsheng_li@foxmail.com>
Date: Tue, 27 May 2025 20:06:05 +0800
Subject: [PATCH] fix: UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd0
 in position 0: invalid continuation byte (#1665)

Update document.py

fix: the content was decoded as UTF-8 up front, even when the MIME type was
neither "application/xml" nor "text/plain", so non-UTF-8 content raised
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd0 in position 0: invalid continuation byte

Signed-off-by: DavidLee <yongsheng_li@foxmail.com>
---
 docling/datamodel/document.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
index e464ee3..5791c0e 100644
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -334,9 +334,9 @@ class _DocumentConversionInput(BaseModel):
     ) -> Optional[InputFormat]:
         """Guess the input format of a document by checking part of its content."""
         input_format: Optional[InputFormat] = None
-        content_str = content.decode("utf-8")
 
         if mime == "application/xml":
+            content_str = content.decode("utf-8")
             match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
             if match_doctype:
                 xml_doctype = match_doctype.group()
@@ -358,6 +358,7 @@ class _DocumentConversionInput(BaseModel):
                     input_format = InputFormat.XML_JATS
 
         elif mime == "text/plain":
+            content_str = content.decode("utf-8")
             if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
                 input_format = InputFormat.XML_USPTO