diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index 43894b0..93dfd1a 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -283,6 +283,13 @@ class _DocumentConversionInput(BaseModel): if mime is None: # must guess from with obj.open("rb") as f: content = f.read(1024) # Read first 1KB + if mime is not None and mime.lower() == "application/zip": + if obj.suffixes[-1].lower() == ".xlsx": + mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + elif obj.suffixes[-1].lower() == ".docx": + mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + elif obj.suffixes[-1].lower() == ".pptx": + mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation" elif isinstance(obj, DocumentStream): content = obj.stream.read(8192)