fix: UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd0 in position 0: invalid continuation byte (#1665)

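Update document.py. Fix: the content was decoded as UTF-8 unconditionally, so when the MIME type was neither "application/xml" nor "text/plain" (e.g. binary content) it raised UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd0 in position 0: invalid continuation byte. The content is now decoded only inside the "application/xml" and "text/plain" branches, which are the only ones that use the decoded string. Signed-off-by: DavidLee <yongsheng_li@foxmail.com>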
2025-05-27 20:06:05 +08:00
parent 2579d89510
commit 51d3450915
1 changed files with 2 additions and 1 deletions
@@ -334,9 +334,9 @@ class _DocumentConversionInput(BaseModel):
    ) -> Optional[InputFormat]:
        """Guess the input format of a document by checking part of its content."""
        input_format: Optional[InputFormat] = None
-        content_str = content.decode("utf-8")

        if mime == "application/xml":
+            content_str = content.decode("utf-8")
            match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
            if match_doctype:
                xml_doctype = match_doctype.group()
@@ -358,6 +358,7 @@ class _DocumentConversionInput(BaseModel):
                    input_format = InputFormat.XML_JATS

        elif mime == "text/plain":
+            content_str = content.decode("utf-8")
            if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
                input_format = InputFormat.XML_USPTO