From f4d9d4111b0a6eb87fc1c05a56618fc430d1e7a2 Mon Sep 17 00:00:00 2001 From: MoheyElDin Badr <56153924+MoheyEl-DinBadr@users.noreply.github.com> Date: Tue, 20 May 2025 20:42:37 +0300 Subject: [PATCH] fix: Fix issue with detecting docx files, and files with upper case extensions (#1609) fix detecting files with uppercase extensions Signed-off-by: MoheyElDin Badr --- docling/datamodel/document.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index 2b3aa9b..984cf02 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -302,7 +302,7 @@ class _DocumentConversionInput(BaseModel): if ("." in obj.name and not obj.name.startswith(".")) else "" ) - mime = _DocumentConversionInput._mime_from_extension(ext) + mime = _DocumentConversionInput._mime_from_extension(ext.lower()) if mime is not None and mime.lower() == "application/zip": objname = obj.name.lower() if objname.endswith(".xlsx"): @@ -376,6 +376,13 @@ class _DocumentConversionInput(BaseModel): mime = FormatToMimeType[InputFormat.JSON_DOCLING][0] elif ext in FormatToExtensions[InputFormat.PDF]: mime = FormatToMimeType[InputFormat.PDF][0] + elif ext in FormatToExtensions[InputFormat.DOCX]: + mime = FormatToMimeType[InputFormat.DOCX][0] + elif ext in FormatToExtensions[InputFormat.PPTX]: + mime = FormatToMimeType[InputFormat.PPTX][0] + elif ext in FormatToExtensions[InputFormat.XLSX]: + mime = FormatToMimeType[InputFormat.XLSX][0] + return mime @staticmethod