From 0de70e799100878b2aa48dfd49858c426f3f1b10 Mon Sep 17 00:00:00 2001
From: Tim Kellogg
Date: Mon, 14 Apr 2025 01:45:13 -0400
Subject: [PATCH] fix: auto-recognize .xlsx, .docx and .pptx files (#1340)

* bug: auto-recognize .xlsx files

Signed-off-by: Tim Kellogg

* apply styling

Signed-off-by: Michele Dolfi

* apply to other ms office zip formats

Signed-off-by: Michele Dolfi

---------

Signed-off-by: Tim Kellogg
Signed-off-by: Michele Dolfi
Co-authored-by: Michele Dolfi
---
 docling/datamodel/document.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
index 43894b0..93dfd1a 100644
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -283,6 +283,13 @@ class _DocumentConversionInput(BaseModel):
             if mime is None:  # must guess from
                 with obj.open("rb") as f:
                     content = f.read(1024)  # Read first 1KB
+            if mime is not None and mime.lower() == "application/zip":
+                if obj.suffixes[-1].lower() == ".xlsx":
+                    mime = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
+                elif obj.suffixes[-1].lower() == ".docx":
+                    mime = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+                elif obj.suffixes[-1].lower() == ".pptx":
+                    mime = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
         elif isinstance(obj, DocumentStream):
             content = obj.stream.read(8192)