fix: auto-recognize .xlsx, .docx and .pptx files (#1340)

* bug: auto-recognize .xlsx files

Signed-off-by: Tim Kellogg <timothy.kellogg@gmail.com>

* apply styling

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* apply to other ms office zip formats

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Tim Kellogg <timothy.kellogg@gmail.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Tim Kellogg 2025-04-14 01:45:13 -04:00 committed by GitHub
parent b295da4bfe
commit 0de70e7991
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -283,6 +283,13 @@ class _DocumentConversionInput(BaseModel):
if mime is None: # must guess from
with obj.open("rb") as f:
content = f.read(1024) # Read first 1KB
if mime is not None and mime.lower() == "application/zip":
if obj.suffixes[-1].lower() == ".xlsx":
mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
elif obj.suffixes[-1].lower() == ".docx":
mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
elif obj.suffixes[-1].lower() == ".pptx":
mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
elif isinstance(obj, DocumentStream):
content = obj.stream.read(8192)