From d37bb340c673b867fdf1ec10106bbed1c7ea7668 Mon Sep 17 00:00:00 2001
From: Steve Canny <stcanny@gmail.com>
Date: Fri, 13 Dec 2024 17:10:35 -0800
Subject: [PATCH 1/3] file: refine filetype detection

The eight file-types based on CFB or ZIP compound files can be detected
with 100% accuracy (or at least 99.999%). Try this strategy first,
ignoring the unreliable content-type for these file-types.

This fixes a problem where a CSV file with an asserted XLS content-type
was mistakenly typed as XLS and failed partitioning.
---
 test_unstructured/file_utils/test_filetype.py | 440 +++---------------
 unstructured/file_utils/filetype.py           | 243 +++++-----
 2 files changed, 207 insertions(+), 476 deletions(-)

diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py
index 933882f9e2..c1f7ad1f8d 100644
--- a/test_unstructured/file_utils/test_filetype.py
+++ b/test_unstructured/file_utils/test_filetype.py
@@ -14,15 +14,14 @@
     LogCaptureFixture,
     Mock,
     example_doc_path,
-    function_mock,
     patch,
     property_mock,
 )
 from unstructured.file_utils.filetype import (
     _FileTypeDetectionContext,
-    _OleFileDifferentiator,
+    _OleFileDetector,
     _TextFileDifferentiator,
-    _ZipFileDifferentiator,
+    _ZipFileDetector,
     detect_filetype,
     is_json_processable,
 )
@@ -31,7 +30,41 @@
 is_in_docker = os.path.exists("/.dockerenv")
 
 # ================================================================================================
-# STRATEGY #1 - CONTENT-TYPE ASSERTED IN CALL
+# STRATEGY #1 - DIRECT DETECTION OF CFB/ZIP-BASED BINARY FILE TYPES (8 TYPES)
+# ================================================================================================
+
+
+@pytest.mark.parametrize(
+    ("expected_value", "file_name"),
+    [
+        (FileType.DOC, "simple.doc"),
+        (FileType.DOCX, "simple.docx"),
+        (FileType.EPUB, "winter-sports.epub"),
+        (FileType.ODT, "simple.odt"),
+        (FileType.PPT, "fake-power-point.ppt"),
+        (FileType.PPTX, "fake-power-point.pptx"),
+        (FileType.XLS, "tests-example.xls"),
+        (FileType.XLSX, "stanley-cups.xlsx"),
+    ],
+)
+def test_it_detects_correct_file_type_for_CFB_and_ZIP_subtypes_detected_by_direct_inspection(
+    file_name: str, expected_value: FileType, ctx_mime_type_: Mock
+):
+    # -- disable other strategies; no content-type, guessed MIME-type or extension --
+    ctx_mime_type_.return_value = None
+    with open(example_doc_path(file_name), "rb") as f:
+        file = io.BytesIO(f.read())
+
+    file_type = detect_filetype(file=file)
+
+    # -- Strategy 1 should not need to refer to guessed MIME-type and detection should not
+    # -- fall back to MIME-type guessing for any of these test cases.
+    ctx_mime_type_.assert_not_called()
+    assert file_type == expected_value
+
+
+# ================================================================================================
+# STRATEGY #2 - CONTENT-TYPE ASSERTED IN CALL
 # ================================================================================================
 
 
@@ -40,41 +73,21 @@
     [
         (FileType.BMP, "img/bmp_24.bmp", "image/bmp"),
         (FileType.CSV, "stanley-cups.csv", "text/csv"),
-        (FileType.DOC, "simple.doc", "application/msword"),
-        (
-            FileType.DOCX,
-            "simple.docx",
-            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-        ),
         (FileType.EML, "eml/fake-email.eml", "message/rfc822"),
-        (FileType.EPUB, "winter-sports.epub", "application/epub+zip"),
         (FileType.HEIC, "img/DA-1p.heic", "image/heic"),
         (FileType.HTML, "example-10k-1p.html", "text/html"),
         (FileType.JPG, "img/example.jpg", "image/jpeg"),
         (FileType.JSON, "spring-weather.html.json", "application/json"),
         (FileType.MD, "README.md", "text/markdown"),
-        (FileType.ODT, "simple.odt", "application/vnd.oasis.opendocument.text"),
         (FileType.ORG, "README.org", "text/org"),
         (FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"),
         (FileType.PNG, "img/DA-1p.png", "image/png"),
-        (FileType.PPT, "fake-power-point.ppt", "application/vnd.ms-powerpoint"),
-        (
-            FileType.PPTX,
-            "fake-power-point.pptx",
-            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
-        ),
         (FileType.RST, "README.rst", "text/x-rst"),
         (FileType.RTF, "fake-doc.rtf", "text/rtf"),
         (FileType.TIFF, "img/layout-parser-paper-fast.tiff", "image/tiff"),
         (FileType.TSV, "stanley-cups.tsv", "text/tsv"),
         (FileType.TXT, "norwich-city.txt", "text/plain"),
         (FileType.WAV, "CantinaBand3.wav", "audio/wav"),
-        (FileType.XLS, "tests-example.xls", "application/vnd.ms-excel"),
-        (
-            FileType.XLSX,
-            "stanley-cups.xlsx",
-            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-        ),
         (FileType.XML, "factbook.xml", "application/xml"),
         (FileType.ZIP, "simple.zip", "application/zip"),
     ],
@@ -82,13 +95,13 @@
 def test_it_detects_correct_file_type_from_file_path_with_correct_asserted_content_type(
     file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock
 ):
-    # -- disable strategy #2, leaving only asserted content-type and extension --
+    # -- disable mime-guessing leaving only asserted content-type and extension --
     ctx_mime_type_.return_value = None
 
     file_type = detect_filetype(example_doc_path(file_name), content_type=content_type)
 
-    # -- Strategy 1 should not need to refer to guessed MIME-type and detection should not
-    # -- fall back to strategy 2 for any of these test cases.
+    # -- Content-type strategy should not need to refer to guessed MIME-type and detection should
+    # -- not fall back to MIME-type guessing for any of these test cases.
     ctx_mime_type_.assert_not_called()
     assert file_type == expected_value
 
@@ -98,41 +111,21 @@ def test_it_detects_correct_file_type_from_file_path_with_correct_asserted_conte
     [
         (FileType.BMP, "img/bmp_24.bmp", "image/bmp"),
         (FileType.CSV, "stanley-cups.csv", "text/csv"),
-        (FileType.DOC, "simple.doc", "application/msword"),
-        (
-            FileType.DOCX,
-            "simple.docx",
-            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-        ),
         (FileType.EML, "eml/fake-email.eml", "message/rfc822"),
-        (FileType.EPUB, "winter-sports.epub", "application/epub+zip"),
         (FileType.HEIC, "img/DA-1p.heic", "image/heic"),
         (FileType.HTML, "example-10k-1p.html", "text/html"),
         (FileType.JPG, "img/example.jpg", "image/jpeg"),
         (FileType.JSON, "spring-weather.html.json", "application/json"),
         (FileType.MD, "README.md", "text/markdown"),
-        (FileType.ODT, "simple.odt", "application/vnd.oasis.opendocument.text"),
         (FileType.ORG, "README.org", "text/org"),
         (FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"),
         (FileType.PNG, "img/DA-1p.png", "image/png"),
-        (FileType.PPT, "fake-power-point.ppt", "application/vnd.ms-powerpoint"),
-        (
-            FileType.PPTX,
-            "fake-power-point.pptx",
-            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
-        ),
         (FileType.RST, "README.rst", "text/x-rst"),
         (FileType.RTF, "fake-doc.rtf", "text/rtf"),
         (FileType.TIFF, "img/layout-parser-paper-fast.tiff", "image/tiff"),
         (FileType.TSV, "stanley-cups.tsv", "text/tsv"),
         (FileType.TXT, "norwich-city.txt", "text/plain"),
         (FileType.WAV, "CantinaBand3.wav", "audio/wav"),
-        (FileType.XLS, "tests-example.xls", "application/vnd.ms-excel"),
-        (
-            FileType.XLSX,
-            "stanley-cups.xlsx",
-            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-        ),
         (FileType.XML, "factbook.xml", "application/xml"),
         (FileType.ZIP, "simple.zip", "application/zip"),
     ],
@@ -140,93 +133,22 @@ def test_it_detects_correct_file_type_from_file_path_with_correct_asserted_conte
 def test_it_detects_correct_file_type_from_file_no_name_with_correct_asserted_content_type(
     file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock
 ):
-    # -- disable strategy #2 (guessed mime-type) --
-    ctx_mime_type_.return_value = None
-    # -- disable strategy #3 (filename extension) by supplying no source of file name --
-    with open(example_doc_path(file_name), "rb") as f:
-        file = io.BytesIO(f.read())
-
-    file_type = detect_filetype(file=file, content_type=content_type)
-
-    # -- Strategy 1 should not need to refer to guessed MIME-type and detection should not
-    # -- fall-back to strategy 2 for any of these test cases.
-    ctx_mime_type_.assert_not_called()
-    assert file_type is expected_value
-
-
-@pytest.mark.parametrize(
-    ("expected_value", "file_name"),
-    [
-        (FileType.DOCX, "simple.docx"),
-        (FileType.PPTX, "fake-power-point.pptx"),
-        (FileType.XLSX, "stanley-cups.xlsx"),
-    ],
-)
-@pytest.mark.parametrize(
-    "content_type",
-    [
-        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
-        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-    ],
-)
-def test_it_detects_correct_file_type_from_file_no_name_with_swapped_ms_office_content_type(
-    file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock
-):
-    # -- disable strategies 2 & 3, content-type strategy should get this on its own --
+    # -- disable mime-guessing --
     ctx_mime_type_.return_value = None
+    # -- disable filename extension mapping by supplying no source of file name --
     with open(example_doc_path(file_name), "rb") as f:
         file = io.BytesIO(f.read())
 
     file_type = detect_filetype(file=file, content_type=content_type)
 
-    # -- Strategy 1 should not need to refer to guessed MIME-type and detection should not
-    # -- fall-back to strategy 2 for any of these test cases.
-    ctx_mime_type_.assert_not_called()
-    assert file_type is expected_value
-
-
-@pytest.mark.parametrize(
-    ("expected_value", "file_name"),
-    [
-        (FileType.DOC, "simple.doc"),
-        (FileType.PPT, "fake-power-point.ppt"),
-        (FileType.XLS, "tests-example.xls"),
-    ],
-)
-@pytest.mark.parametrize(
-    "content_type",
-    [
-        "application/msword",
-        "application/vnd.ms-outlook",
-        "application/vnd.ms-powerpoint",
-        "application/vnd.ms-excel",
-        "anything/else",
-    ],
-)
-def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_asserted_content_type(
-    file_name: str, content_type: str, expected_value: FileType, ctx_mime_type_: Mock
-):
-    """Fixes wrong XLS asserted as DOC, PPT, etc.
-
-    Asserted content-type can be anything except `None` and differentiator will fix it if the file
-    is DOC, PPT, or XLS type.
-    """
-    # -- disable strategies 2 & 3, content-type strategy should get this on its own --
-    ctx_mime_type_.return_value = None
-    with open(example_doc_path(file_name), "rb") as f:
-        file = io.BytesIO(f.read())
-
-    file_type = detect_filetype(file=file, content_type=content_type)
-
-    # -- Strategy 1 should not need to refer to guessed MIME-type and detection should not
-    # -- fall-back to strategy 2 for any of these test cases.
+    # -- Content-type strategy should not need to refer to guessed MIME-type and detection should
+    # -- not fall back to MIME-type guessing for any of these test cases.
     ctx_mime_type_.assert_not_called()
     assert file_type is expected_value
 
 
 # ================================================================================================
-# STRATEGY #2 - GUESS MIME-TYPE WITH LIBMAGIC
+# STRATEGY #3 - GUESS MIME-TYPE WITH LIBMAGIC/FILETYPE LIBRARY
 # ================================================================================================
 
 
@@ -237,31 +159,16 @@ def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_asserted_
         (FileType.CSV, "stanley-cups.csv", "text/csv"),
         (FileType.CSV, "stanley-cups.csv", "application/csv"),
         (FileType.CSV, "stanley-cups.csv", "application/x-csv"),
-        (FileType.DOC, "simple.doc", "application/msword"),
-        (
-            FileType.DOCX,
-            "simple.docx",
-            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-        ),
         (FileType.EML, "eml/fake-email.eml", "message/rfc822"),
-        (FileType.EPUB, "winter-sports.epub", "application/epub"),
-        (FileType.EPUB, "winter-sports.epub", "application/epub+zip"),
         (FileType.HEIC, "img/DA-1p.heic", "image/heic"),
         (FileType.HTML, "example-10k-1p.html", "text/html"),
         (FileType.JPG, "img/example.jpg", "image/jpeg"),
         (FileType.JSON, "spring-weather.html.json", "application/json"),
         (FileType.MD, "README.md", "text/markdown"),
         (FileType.MD, "README.md", "text/x-markdown"),
-        (FileType.ODT, "simple.odt", "application/vnd.oasis.opendocument.text"),
         (FileType.ORG, "README.org", "text/org"),
         (FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"),
         (FileType.PNG, "img/DA-1p.png", "image/png"),
-        (FileType.PPT, "fake-power-point.ppt", "application/vnd.ms-powerpoint"),
-        (
-            FileType.PPTX,
-            "fake-power-point.pptx",
-            "application/vnd.openxmlformats-officedocument.presentationml.presentation",
-        ),
         (FileType.RST, "README.rst", "text/x-rst"),
         (FileType.RTF, "fake-doc.rtf", "text/rtf"),
         (FileType.RTF, "fake-doc.rtf", "application/rtf"),
@@ -270,18 +177,11 @@ def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_asserted_
         (FileType.TXT, "norwich-city.txt", "text/plain"),
         (FileType.TXT, "simple.yaml", "text/yaml"),
         (FileType.WAV, "CantinaBand3.wav", "audio/wav"),
-        (FileType.XLS, "tests-example.xls", "application/vnd.ms-excel"),
-        (
-            FileType.XLSX,
-            "stanley-cups.xlsx",
-            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-        ),
         (FileType.XML, "factbook.xml", "application/xml"),
         (FileType.XML, "factbook.xml", "text/xml"),
-        (FileType.ZIP, "simple.zip", "application/zip"),
     ],
 )
-def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_recognized_mime_type(
+def test_it_detects_correct_file_type_by_guessed_MIME_when_libmagic_guesses_recognized_mime_type(
     file_name: str, mime_type: str, expected_value: FileType, ctx_mime_type_: Mock
 ):
     # -- libmagic guesses a MIME-type mapped to a `FileType` --
@@ -290,7 +190,7 @@ def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_rec
     with open(example_doc_path(file_name), "rb") as f:
         file = io.BytesIO(f.read())
 
-    # -- disable strategy #1 by not asserting a content_type in the call --
+    # -- disable content-type strategy by not asserting a content_type in the call --
     file_type = detect_filetype(file=file)
 
     # -- ctx.mime_type may be referenced multiple times, but at least once --
@@ -303,30 +203,22 @@ def test_it_detects_correct_file_type_using_strategy_2_when_libmagic_guesses_rec
     [
         (FileType.BMP, "img/bmp_24.bmp"),
         (FileType.CSV, "stanley-cups.csv"),
-        (FileType.DOC, "simple.doc"),
-        (FileType.DOCX, "simple.docx"),
         (FileType.EML, "eml/fake-email.eml"),
-        (FileType.EPUB, "winter-sports.epub"),
         (FileType.HEIC, "img/DA-1p.heic"),
         (FileType.HTML, "ideas-page.html"),
         (FileType.JPG, "img/example.jpg"),
         (FileType.JSON, "spring-weather.html.json"),
-        (FileType.ODT, "simple.odt"),
         (FileType.PDF, "pdf/layout-parser-paper-fast.pdf"),
         (FileType.PNG, "img/DA-1p.png"),
-        (FileType.PPT, "fake-power-point.ppt"),
-        (FileType.PPTX, "fake-power-point.pptx"),
         (FileType.RTF, "fake-doc.rtf"),
         (FileType.TIFF, "img/layout-parser-paper-fast.tiff"),
         (FileType.TXT, "norwich-city.txt"),
         (FileType.WAV, "CantinaBand3.wav"),
-        (FileType.XLS, "tests-example.xls"),
-        (FileType.XLSX, "stanley-cups.xlsx"),
         (FileType.XML, "factbook.xml"),
         (FileType.ZIP, "simple.zip"),
     ],
 )
-def test_it_detects_most_file_types_using_strategy_2_when_libmagic_guesses_mime_type_for_itself(
+def test_it_detects_most_file_types_using_mime_guessing_when_libmagic_guesses_mime_type_for_itself(
     file_name: str, expected_value: FileType
 ):
     """Does not work for all types, in particular:
@@ -339,90 +231,26 @@ def test_it_detects_most_file_types_using_strategy_2_when_libmagic_guesses_mime_
     - ORG is identified as TXT
     - RST is identified as TXT
     """
-    # -- disable strategy #1 by not asserting a content_type in the call --
-    # -- disable strategy #3 (extension) by passing file-like object with no `.name` attribute --
+    # -- disable content-type strategy by not asserting a content_type in the call --
+    # -- disable extension-mapping strategy by passing file-like object with no `.name` attribute --
     with open(example_doc_path(file_name), "rb") as f:
         file = io.BytesIO(f.read())
 
     assert detect_filetype(file=file) is expected_value
 
 
-@pytest.mark.parametrize(
-    ("expected_value", "file_name"),
-    [
-        (FileType.DOC, "simple.doc"),
-        (FileType.PPT, "fake-power-point.ppt"),
-        (FileType.XLS, "tests-example.xls"),
-    ],
-)
-@pytest.mark.parametrize(
-    "guessed_mime_type",
-    [
-        "application/msword",
-        "application/vnd.ms-excel",
-        "application/vnd.ms-outlook",
-        "application/vnd.ms-powerpoint",
-        "application/x-ole-storage",
-        "anything/else",
-    ],
-)
-def test_it_detects_correct_file_type_from_OLE_file_no_name_with_wrong_guessed_mime_type(
-    file_name: str, guessed_mime_type: str, expected_value: FileType, ctx_mime_type_: Mock
-):
-    """Fixes XLS wrongly-guessed as DOC, PPT, "application/x-ole-storage" etc.
-
-    It's better than that actually, the OLE differentiator will get the right file-type for any DOC,
-    PPT, XLS, or MSG file, regardless of guessed MIME-type.
-    """
-    ctx_mime_type_.return_value = guessed_mime_type
-    # -- disable strategy 3 by not providing a file-name source --
-    with open(example_doc_path(file_name), "rb") as f:
-        file = io.BytesIO(f.read())
-
-    # -- disable strategy 1 by not asserting a content-type --
-    file_type = detect_filetype(file=file)
-
-    ctx_mime_type_.assert_called_with()
-    assert file_type is expected_value
-
-
-@pytest.mark.parametrize(
-    ("filename", "mime_type", "expected"),
-    [
-        ("fake.doc", "application/vnd.ms-excel", FileType.DOC),
-        ("fake-power-point.ppt", "application/vnd.ms-excel", FileType.PPT),
-        ("tests-example.xls", "application/msword", FileType.XLS),
-        ("fake-email.msg", "application/vnd.ms-excel", FileType.MSG),
-    ],
-)
-def test_ole_file_structure_trusted_over_mime_type_guess(filename, mime_type, expected):
-    def _guess_mime(*args, **kwargs):
-        return mime_type
-
-    with patch("filetype.guess_mime", _guess_mime):
-        detect_filetype(example_doc_path(filename)) == expected
-
-
 @pytest.mark.parametrize(
     ("expected_value", "file_name"),
     [
         # -- `filetype` lib recognizes all these binary file-types --
         (FileType.BMP, "img/bmp_24.bmp"),
-        (FileType.DOC, "simple.doc"),
-        (FileType.DOCX, "simple.docx"),
-        (FileType.EPUB, "winter-sports.epub"),
         (FileType.HEIC, "img/DA-1p.heic"),
         (FileType.JPG, "img/example.jpg"),
-        (FileType.ODT, "simple.odt"),
         (FileType.PDF, "pdf/layout-parser-paper-fast.pdf"),
         (FileType.PNG, "img/DA-1p.png"),
-        (FileType.PPT, "fake-power-point.ppt"),
-        (FileType.PPTX, "fake-power-point.pptx"),
         (FileType.RTF, "fake-doc.rtf"),
         (FileType.TIFF, "img/layout-parser-paper-fast.tiff"),
         (FileType.WAV, "CantinaBand3.wav"),
-        (FileType.XLS, "tests-example.xls"),
-        (FileType.XLSX, "stanley-cups.xlsx"),
         (FileType.ZIP, "simple.zip"),
         # -- but it doesn't recognize textual file-types at all --
         (FileType.UNK, "stanley-cups.csv"),
@@ -435,11 +263,9 @@ def _guess_mime(*args, **kwargs):
         (FileType.UNK, "stanley-cups.tsv"),
         (FileType.UNK, "norwich-city.txt"),
         (FileType.UNK, "factbook.xml"),
-        # -- and it doesn't recognize MSG files --
-        (FileType.UNK, "fake-email.msg"),
     ],
 )
-def test_strategy_2_can_detect_only_binary_file_types_when_libmagic_is_unavailable(
+def test_strategy_mime_guessing_can_detect_only_binary_file_types_when_libmagic_is_unavailable(
     file_name: str, expected_value: FileType, LIBMAGIC_AVAILABLE_False: bool
 ):
     """File-type is detected using `filetype` library when libmagic is not available.
@@ -447,7 +273,7 @@ def test_strategy_2_can_detect_only_binary_file_types_when_libmagic_is_unavailab
     `filetype.guess_mime()` does a good job on binary file types (PDF, images, legacy MS-Office),
     but doesn't even try to guess textual file-types.
     """
-    # -- disable strategy #3 (extension) by passing file-like object with no `.name` attribute --
+    # -- disable detection by extension by passing file-like object with no `.name` attribute --
     with open(example_doc_path(file_name), "rb") as f:
         file = io.BytesIO(f.read())
     # -- simulate libmagic is not available --
@@ -470,7 +296,7 @@ def test_detect_filetype_from_file_warns_when_libmagic_is_not_installed(
 
 
 # ================================================================================================
-# STRATEGY #3 - MAP FILENAME EXTENSION TO FILETYPE
+# STRATEGY #4 - MAP FILENAME EXTENSION TO FILETYPE
 # ================================================================================================
 
 
@@ -479,35 +305,25 @@ def test_detect_filetype_from_file_warns_when_libmagic_is_not_installed(
     [
         (FileType.BMP, "img/bmp_24.bmp"),
         (FileType.CSV, "stanley-cups.csv"),
-        (FileType.DOC, "simple.doc"),
-        (FileType.DOCX, "simple.docx"),
         (FileType.EML, "eml/fake-email.eml"),
-        (FileType.EPUB, "winter-sports.epub"),
         (FileType.HEIC, "img/DA-1p.heic"),
         (FileType.HTML, "example-10k-1p.html"),
         (FileType.JPG, "img/example.jpg"),
         (FileType.JSON, "spring-weather.html.json"),
         (FileType.MD, "README.md"),
-        (FileType.MSG, "fake-email.msg"),
-        (FileType.ODT, "simple.odt"),
         (FileType.ORG, "README.org"),
         (FileType.PDF, "pdf/layout-parser-paper-fast.pdf"),
         (FileType.PNG, "img/DA-1p.png"),
-        (FileType.PPT, "fake-power-point.ppt"),
-        (FileType.PPTX, "fake-power-point.pptx"),
         (FileType.RST, "README.rst"),
         (FileType.RTF, "fake-doc.rtf"),
         (FileType.TIFF, "img/layout-parser-paper-fast.tiff"),
         (FileType.TSV, "stanley-cups.tsv"),
         (FileType.TXT, "norwich-city.txt"),
         (FileType.WAV, "CantinaBand3.wav"),
-        (FileType.XLS, "tests-example.xls"),
-        (FileType.XLSX, "stanley-cups.xlsx"),
         (FileType.XML, "factbook.xml"),
-        (FileType.ZIP, "simple.zip"),
     ],
 )
-def test_it_detects_correct_file_type_from_strategy_3_when_extension_maps_to_file_type(
+def test_it_detects_correct_file_type_from_extension_when_that_maps_to_a_file_type(
     file_name: str, expected_value: FileType, ctx_mime_type_: Mock
 ):
     # -- disable strategy #2 by making libmagic always guess `None` --
@@ -525,10 +341,8 @@ def test_it_detects_correct_file_type_from_strategy_3_when_extension_maps_to_fil
 @pytest.mark.parametrize(
     ("expected_value", "file_name", "mime_type"),
     [
-        (FileType.BMP, "img/bmp_24.bmp", "application/zip"),
-        (FileType.DOC, "simple.doc", None),
-        (FileType.EPUB, "winter-sports.epub", "application/x-ole-storage"),
-        (FileType.MSG, "fake-email.msg", "application/octet-stream"),
+        (FileType.BMP, "img/bmp_24.bmp", "application/octet-stream"),
+        (FileType.HEIC, "img/DA-1p.heic", "application/octet-stream"),
     ],
 )
 def test_it_falls_back_to_extension_strategy_when_prior_strategies_fail(
@@ -547,6 +361,12 @@ def test_it_falls_back_to_extension_strategy_when_prior_strategies_fail(
 # ================================================================================================
 
 
+@pytest.mark.parametrize("mime_type", [FileType.XLS.mime_type, FileType.XLSX.mime_type])
+def test_it_ignores_asserted_XLS_content_type_when_file_is_CSV(mime_type: str):
+    file_path = example_doc_path("stanley-cups.csv")
+    assert detect_filetype(file_path, content_type=mime_type) == FileType.CSV
+
+
 @pytest.mark.parametrize("mime_type", ["application/xml", "text/xml"])
 @pytest.mark.parametrize("extension", [".html", ".htm"])
 def test_it_detects_HTML_from_guessed_mime_type_ending_with_xml_and_html_extension(
@@ -563,39 +383,6 @@ def test_it_detects_HTML_from_guessed_mime_type_ending_with_xml_and_html_extensi
     assert file_type is FileType.HTML
 
 
-@pytest.mark.parametrize(
-    "mime_type",
-    [
-        "application/octet-stream",
-        "application/zip",
-        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-        "application/vnd.openxmlformats-officedocument.presentationml.presentation",
-        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-    ],
-)
-@pytest.mark.parametrize(
-    ("expected_value", "file_name"),
-    [
-        (FileType.DOCX, "simple.docx"),
-        (FileType.PPTX, "fake-power-point.pptx"),
-        (FileType.XLSX, "stanley-cups.xlsx"),
-        (FileType.ZIP, "simple.zip"),
-    ],
-)
-def test_it_differentiates_files_when_libmagic_guesses_octet_stream_zip_or_modern_ms_office(
-    mime_type: str, file_name: str, expected_value: FileType, ctx_mime_type_: Mock
-):
-    ctx_mime_type_.return_value = mime_type
-    # -- disable extension-based strategy #3 --
-    with open(example_doc_path(file_name), "rb") as f:
-        file = io.BytesIO(f.read())
-
-    file_type = detect_filetype(file=file)
-
-    ctx_mime_type_.assert_called_with()
-    assert file_type is expected_value
-
-
 @pytest.mark.parametrize(
     ("mime_type", "file_name"),
     [
@@ -1000,29 +787,8 @@ def mime_type_prop_(self, request: FixtureRequest):
         return property_mock(request, _FileTypeDetectionContext, "mime_type")
 
 
-class Describe_OleFileDifferentiator:
-    """Unit-test suite for `unstructured.file_utils.filetype._OleFileDifferentiator`."""
-
-    # -- .applies() ---------------------------------------------
-
-    def it_provides_a_qualifying_alternate_constructor_which_constructs_when_applicable(self):
-        """The constructor determines whether this differentiator is applicable.
-
-        It returns an instance only when differentiating a CFBF file-type is required, which it
-        judges by inspecting the initial bytes of the file for the CFBF magic-bytes.
-        """
-        ctx = _FileTypeDetectionContext(example_doc_path("simple.doc"))
-
-        differentiator = _OleFileDifferentiator.applies(ctx, "foo/bar")
-
-        assert differentiator is not None
-        assert isinstance(differentiator, _OleFileDifferentiator)
-
-    def and_it_returns_None_when_ole_differentiation_is_not_applicable_to_the_mime_type(self):
-        ctx = _FileTypeDetectionContext(example_doc_path("winter-sports.epub"))
-        assert _OleFileDifferentiator.applies(ctx, "application/epub") is None
-
-    # -- .file_type ---------------------------------------------
+class Describe_OleFileDetector:
+    """Unit-test suite for `unstructured.file_utils.filetype._OleFileDetector`."""
 
     @pytest.mark.parametrize(
         ("file_name", "expected_value"),
@@ -1034,59 +800,15 @@ def and_it_returns_None_when_ole_differentiation_is_not_applicable_to_the_mime_t
             ("README.org", None),
         ],
     )
-    def it_distinguishes_the_file_type_of_applicable_OLE_files(
+    def it_distinguishes_the_file_type_of_applicable_CFB_files(
         self, file_name: str, expected_value: FileType | None
     ):
         # -- no file-name available, just to make sure we're not relying on an extension --
         with open(example_doc_path(file_name), "rb") as f:
             file = io.BytesIO(f.read())
         ctx = _FileTypeDetectionContext(file=file)
-        differentiator = _OleFileDifferentiator(ctx)
 
-        assert differentiator.file_type is expected_value
-
-    @pytest.mark.parametrize(
-        ("file_name", "expected_value"),
-        [
-            ("simple.doc", FileType.DOC),
-            ("fake-power-point.ppt", FileType.PPT),
-            ("tests-example.xls", FileType.XLS),
-            ("fake-email.msg", FileType.MSG),
-        ],
-    )
-    def it_distinguishes_the_file_type_of_applicable_OLE_files_from_storage_content(
-        self, file_name: str, expected_value: FileType | None
-    ):
-        # -- no file-name available, just to make sure we're not relying on an extension --
-        with open(example_doc_path(file_name), "rb") as f:
-            file = io.BytesIO(f.read())
-        ctx = _FileTypeDetectionContext(file=file)
-        differentiator = _OleFileDifferentiator(ctx)
-
-        assert differentiator._check_ole_file_type(ctx) is expected_value
-
-    def but_it_returns_None_to_engage_fallback_when_filetype_cannot_guess_mime(
-        self, guess_mime_: Mock
-    ):
-        guess_mime_.return_value = None
-        # -- no file-name available, just to make sure we're not relying on an extension --
-        with open(example_doc_path("fake-email.msg"), "rb") as f:
-            file = io.BytesIO(f.read())
-        ctx = _FileTypeDetectionContext(file=file)
-        differentiator = _OleFileDifferentiator(ctx)
-        # -- force method to return None to trigger the mime type being guessed
-        differentiator._check_ole_file_type = lambda ctx: None
-
-        file_type = differentiator.file_type
-
-        guess_mime_.assert_called_once_with(file)
-        assert file_type is None
-
-    # -- fixtures --------------------------------------------------------------------------------
-
-    @pytest.fixture
-    def guess_mime_(self, request: FixtureRequest):
-        return function_mock(request, "unstructured.file_utils.filetype.ft.guess_mime")
+        assert _OleFileDetector.file_type(ctx) is expected_value
 
 
 class Describe_TextFileDifferentiator:
@@ -1164,33 +886,15 @@ def it_distinguishes_a_JSON_file_from_other_text_files(
         assert differentiator._is_json is expected_value
 
 
-class Describe_ZipFileDifferentiator:
-    """Unit-test suite for `unstructured.file_utils.filetype._ZipFileDifferentiator`."""
-
-    # -- .applies() ---------------------------------------------
-
-    def it_provides_a_qualifying_alternate_constructor_which_constructs_when_applicable(self):
-        """The constructor determines whether this differentiator is applicable.
-
-        It returns an instance only when differentiating a zip file-type is required, which it can
-        judge from the mime-type provided by the context (`ctx`).
-        """
-        ctx = _FileTypeDetectionContext(example_doc_path("simple.docx"))
-
-        differentiator = _ZipFileDifferentiator.applies(ctx, "application/zip")
-
-        assert isinstance(differentiator, _ZipFileDifferentiator)
-
-    def and_it_returns_None_when_zip_differentiation_does_not_apply_to_the_detection_context(self):
-        ctx = _FileTypeDetectionContext(example_doc_path("norwich-city.txt"))
-        assert _ZipFileDifferentiator.applies(ctx, "application/epub") is None
-
-    # -- .file_type ---------------------------------------------
+class Describe_ZipFileDetector:
+    """Unit-test suite for `unstructured.file_utils.filetype._ZipFileDetector`."""
 
     @pytest.mark.parametrize(
         ("file_name", "expected_value"),
         [
             ("simple.docx", FileType.DOCX),
+            ("winter-sports.epub", FileType.EPUB),
+            ("simple.odt", FileType.ODT),
             ("picture.pptx", FileType.PPTX),
             ("vodafone.xlsx", FileType.XLSX),
             ("simple.zip", FileType.ZIP),
@@ -1201,6 +905,4 @@ def it_distinguishes_the_file_type_of_applicable_zip_files(
         self, file_name: str, expected_value: FileType | None
     ):
         ctx = _FileTypeDetectionContext(example_doc_path(file_name))
-        differentiator = _ZipFileDifferentiator(ctx)
-
-        assert differentiator.file_type is expected_value
+        assert _ZipFileDetector.file_type(ctx) is expected_value
diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py
index d109cd7384..fed219cd54 100644
--- a/unstructured/file_utils/filetype.py
+++ b/unstructured/file_utils/filetype.py
@@ -51,7 +51,11 @@
 from unstructured.partition.common.metadata import set_element_hierarchy
 from unstructured.utils import get_call_args_applying_defaults, lazyproperty
 
-LIBMAGIC_AVAILABLE = bool(importlib.util.find_spec("magic"))
+try:
+    importlib.import_module("magic")
+    LIBMAGIC_AVAILABLE = True
+except ImportError:
+    LIBMAGIC_AVAILABLE = False  # pyright: ignore[reportConstantRedefinition]
 
 
 def detect_filetype(
@@ -133,43 +137,57 @@ def file_type(cls, ctx: _FileTypeDetectionContext) -> FileType:
     @property
     def _file_type(self) -> FileType:
         """FileType member corresponding to this document source."""
-        # -- strategy 1: use content-type asserted by caller --
+        # -- An explicit content-type is most commonly asserted by the client/SDK and is therefore
+        # -- inherently unreliable. On the other hand, binary file-types can be detected with 100%
+        # -- accuracy. So start with binary types and only then consider an asserted content-type,
+        # -- generally as a last resort.
+
+        # -- strategy 1: most binary types can be detected with 100% accuracy --
+        if file_type := self._known_binary_file_type:
+            return file_type
+
+        # -- strategy 2: use content-type asserted by caller --
         if file_type := self._file_type_from_content_type:
             return file_type
 
-        # -- strategy 2: guess MIME-type using libmagic and use that --
+        # -- strategy 3: guess MIME-type using libmagic and use that --
         if file_type := self._file_type_from_guessed_mime_type:
             return file_type
 
-        # -- strategy 3: use filename-extension, like ".docx" -> FileType.DOCX --
+        # -- strategy 4: use filename-extension, like ".docx" -> FileType.DOCX --
         if file_type := self._file_type_from_file_extension:
             return file_type
 
-        # -- strategy 4: give up and report FileType.UNK --
+        # -- strategy 5: give up and report FileType.UNK --
         return FileType.UNK
 
     # == STRATEGIES ============================================================
 
+    @property
+    def _known_binary_file_type(self) -> FileType | None:
+        """Detect file-type for binary types we can positively detect."""
+        if file_type := _OleFileDetector.file_type(self._ctx):
+            return file_type
+
+        self._ctx.rule_out_cfb_content_types()
+
+        if file_type := _ZipFileDetector.file_type(self._ctx):
+            return file_type
+
+        self._ctx.rule_out_zip_content_types()
+
+        return None
+
     @property
     def _file_type_from_content_type(self) -> FileType | None:
         """Map passed content-type argument to a file-type, subject to certain rules."""
-        content_type = self._ctx.content_type
 
         # -- when no content-type was asserted by caller, this strategy is not applicable --
-        if not content_type:
+        if not self._ctx.content_type:
             return None
 
-        # -- OLE-based file-format content_type values are sometimes unreliable. These are
-        # -- DOC, PPT, XLS, and MSG.
-        if differentiator := _OleFileDifferentiator.applies(self._ctx, content_type):
-            return differentiator.file_type
-
-        # -- MS-Office 2007+ (OpenXML) content_type value is sometimes unreliable --
-        if differentiator := _ZipFileDifferentiator.applies(self._ctx, content_type):
-            return differentiator.file_type
-
         # -- otherwise we trust the passed `content_type` as long as `FileType` recognizes it --
-        return FileType.from_mime_type(content_type)
+        return FileType.from_mime_type(self._ctx.content_type)
 
     @property
     def _file_type_from_guessed_mime_type(self) -> FileType | None:
@@ -188,24 +206,12 @@ def _file_type_from_guessed_mime_type(self) -> FileType | None:
         if mime_type is None:
             return None
 
-        if differentiator := _OleFileDifferentiator.applies(self._ctx, mime_type):
-            return differentiator.file_type
-
         if mime_type.endswith("xml"):
             return FileType.HTML if extension in (".html", ".htm") else FileType.XML
 
         if differentiator := _TextFileDifferentiator.applies(self._ctx):
             return differentiator.file_type
 
-        # -- applicable to "application/octet-stream", "application/zip", and all Office 2007+
-        # -- document MIME-types, i.e. those for DOCX, PPTX, and XLSX. Note however it does NOT
-        # -- apply to EPUB or ODT documents, even though those are also Zip archives. The zip and
-        # -- octet-stream MIME-types are fed in because they are ambiguous. The MS-Office types are
-        # -- differentiated because they are sometimes mistaken for each other, like DOCX mime-type
-        # -- is actually a PPTX file etc.
-        if differentiator := _ZipFileDifferentiator.applies(self._ctx, mime_type):
-            return differentiator.file_type
-
         # -- All source-code files (e.g. *.py, *.js) are classified as plain text for the moment --
         if self._ctx.has_code_mime_type:
             return FileType.TXT
@@ -214,14 +220,8 @@ def _file_type_from_guessed_mime_type(self) -> FileType | None:
             return FileType.EMPTY
 
         # -- if no more-specific rules apply, use the MIME-type -> FileType mapping when present --
-        if file_type := FileType.from_mime_type(mime_type):
-            return file_type
-
-        logger.warning(
-            f"The MIME type{f' of {self._ctx.file_path!r}' if self._ctx.file_path else ''} is"
-            f" {mime_type!r}. This file type is not currently supported in unstructured.",
-        )
-        return None
+        file_type = FileType.from_mime_type(mime_type)
+        return file_type if file_type != FileType.UNK else None
 
     @lazyproperty
     def _file_type_from_file_extension(self) -> FileType | None:
@@ -236,6 +236,9 @@ def _file_type_from_file_extension(self) -> FileType | None:
 class _FileTypeDetectionContext:
     """Provides all arguments to auto-file detection and values derived from them.
 
+    NOTE that `._content_type` is mutable via `.rule_out_*_content_types()` methods, so it should
+    not be assumed to be a constant value across those calls.
+
     This keeps computation of derived values out of the file-detection code but more importantly
     allows the main filetype-detector to pass the full context to any delegates without coupling
     itself to which values it might need.
@@ -276,7 +279,7 @@ def new(
         self._validate()
         return self
 
-    @lazyproperty
+    @property
     def content_type(self) -> str | None:
         """MIME-type asserted by caller; not based on inspection of file by this process.
 
@@ -284,6 +287,8 @@ def content_type(self) -> str | None:
         present on the response. These are often ambiguous and sometimes just wrong so get some
         further verification. All lower-case when not `None`.
         """
+        # -- Note `._content_type` is mutable via `.rule_out_*_content_types()` methods so this
+        # -- cannot be a `@lazyproperty`.
         return self._content_type.lower() if self._content_type else None
 
     @lazyproperty
@@ -327,12 +332,6 @@ def file_path(self) -> str | None:
 
         return os.path.realpath(file_path) if os.path.islink(file_path) else file_path
 
-    @lazyproperty
-    def is_zipfile(self) -> bool:
-        """True when file is a Zip archive."""
-        with self.open() as file:
-            return zipfile.is_zipfile(file)
-
     @lazyproperty
     def has_code_mime_type(self) -> bool:
         """True when `mime_type` plausibly indicates a programming language source-code file."""
@@ -347,9 +346,27 @@ def has_code_mime_type(self) -> bool:
 
         return any(
             lang in mime_type
-            for lang in "c# c++ cpp csharp java javascript php python ruby swift typescript".split()
+            for lang in [
+                "c#",
+                "c++",
+                "cpp",
+                "csharp",
+                "java",
+                "javascript",
+                "php",
+                "python",
+                "ruby",
+                "swift",
+                "typescript",
+            ]
         )
 
+    @lazyproperty
+    def is_zipfile(self) -> bool:
+        """True when file is a Zip archive."""
+        with self.open() as file:
+            return zipfile.is_zipfile(file)
+
     @lazyproperty
     def mime_type(self) -> str | None:
         """The best MIME-type we can get from `magic` (or `filetype` package).
@@ -401,6 +418,38 @@ def open(self) -> Iterator[IO[bytes]]:
             file.seek(0)
             yield file
 
+    def rule_out_cfb_content_types(self) -> None:
+        """Invalidate content-type when a legacy MS-Office file-type is asserted.
+
+        Used before returning `None`; at that point we know the file is not one of these formats
+        so if the asserted `content-type` is a legacy MS-Office type we know it's wrong and should
+        not be used as a fallback later in the detection process.
+        """
+        if FileType.from_mime_type(self._content_type) in (
+            FileType.DOC,
+            FileType.MSG,
+            FileType.PPT,
+            FileType.XLS,
+        ):
+            self._content_type = None
+
+    def rule_out_zip_content_types(self) -> None:
+        """Invalidate content-type when a Zip-based file-type is asserted.
+
+        Used before returning `None`; at that point we know the file is not one of these formats
+        so if the asserted `content-type` is a Zip-based type (DOCX, EPUB, ODT, PPTX, XLSX, or ZIP)
+        we know it's wrong and should not be used as a fallback later in the detection process.
+        """
+        if FileType.from_mime_type(self._content_type) in (
+            FileType.DOCX,
+            FileType.EPUB,
+            FileType.ODT,
+            FileType.PPTX,
+            FileType.XLSX,
+            FileType.ZIP,
+        ):
+            self._content_type = None
+
     @lazyproperty
     def text_head(self) -> str:
         """The initial characters of the text file for use with text-format differentiation.
@@ -440,27 +489,23 @@ def _validate(self) -> None:
             raise ValueError("either `file_path` or `file` argument must be provided")
 
 
-class _OleFileDifferentiator:
-    """Refine an OLE-storage package (CFBF) file-type that may not be as specific as it could be.
+class _OleFileDetector:
+    """Detect and differentiate a CFB file, aka. "OLE" file.
 
-    Compound File Binary Format (CFBF), aka. OLE file, is use by Microsoft for legacy MS Office
-    files (DOC, PPT, XLS) as well as for Outlook MSG files. `libmagic` tends to identify these as
-    `"application/x-ole-storage"` which is true but too not specific enough for partitioning
-    purposes.
+    Compound File Binary Format (CFB), aka. OLE file, is used by Microsoft for legacy MS Office
+    files (DOC, PPT, XLS) as well as for Outlook MSG files.
     """
 
     def __init__(self, ctx: _FileTypeDetectionContext):
         self._ctx = ctx
 
     @classmethod
-    def applies(
-        cls, ctx: _FileTypeDetectionContext, mime_type: str
-    ) -> _OleFileDifferentiator | None:
-        """Constructs an instance, but only if this differentiator applies for `mime_type`."""
-        return cls(ctx) if cls._is_ole_file(ctx) else None
+    def file_type(cls, ctx: _FileTypeDetectionContext) -> FileType | None:
+        """Specific file-type when file is a CFB file, `None` otherwise."""
+        return cls(ctx)._file_type
 
     @property
-    def file_type(self) -> FileType | None:
+    def _file_type(self) -> FileType | None:
         """Differentiated file-type for Microsoft Compound File Binary Format (CFBF).
 
         Returns one of:
@@ -468,34 +513,27 @@ def file_type(self) -> FileType | None:
         - `FileType.PPT`
         - `FileType.XLS`
         - `FileType.MSG`
+        - `None` when the file is not one of these.
         """
-        # -- if this is not a CFBF file then whatever MIME-type was guessed is wrong, so return
-        # -- `None` to trigger fall-back to next strategy.
-        if not self._is_ole_file(self._ctx):
+        # -- all CFB files share common magic number, start with that --
+        if not self._is_ole_file:
             return None
 
-        # -- check storage contents of the ole file for file type markers
-        if (ole_file_type := self._check_ole_file_type(self._ctx)) is not None:
+        # -- check storage contents of the ole file for file-type specific stream names --
+        if (ole_file_type := self._ole_file_type) is not None:
             return ole_file_type
 
-        # -- `filetype` lib is better at legacy MS-Office files than `libmagic`, so we rely on it
-        # -- to differentiate those. Note `filetype` doesn't detect MSG type and won't always
-        # -- detect DOC, PPT, or XLS, returning `None` instead. We let those fall through and we
-        # -- rely on filename-extension to identify those.
-        with self._ctx.open() as file:
-            mime_type = ft.guess_mime(file)
-
-        return FileType.from_mime_type(mime_type) if mime_type else None
+        return None
 
-    @staticmethod
-    def _is_ole_file(ctx: _FileTypeDetectionContext) -> bool:
-        """True when file has CFBF magic first 8 bytes."""
-        with ctx.open() as file:
+    @lazyproperty
+    def _is_ole_file(self) -> bool:
+        """True when file has CFB magic first 8 bytes."""
+        with self._ctx.open() as file:
             return file.read(8) == b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"
 
-    @staticmethod
-    def _check_ole_file_type(ctx: _FileTypeDetectionContext) -> FileType | None:
-        with ctx.open() as f:
+    @lazyproperty
+    def _ole_file_type(self) -> FileType | None:
+        with self._ctx.open() as f:
             ole = OleFileIO(f)  # pyright: ignore[reportUnknownVariableType]
             root_storage = Storage.from_ole(ole)  # pyright: ignore[reportUnknownMemberType]
 
@@ -616,40 +654,28 @@ def _is_json(self) -> bool:
             return False
 
 
-class _ZipFileDifferentiator:
-    """Refine a Zip-packaged file-type that may be ambiguous or swapped."""
+class _ZipFileDetector:
+    """Detect and differentiate a Zip-archive file."""
 
     def __init__(self, ctx: _FileTypeDetectionContext):
         self._ctx = ctx
 
     @classmethod
-    def applies(
-        cls, ctx: _FileTypeDetectionContext, mime_type: str
-    ) -> _ZipFileDifferentiator | None:
-        """Constructs an instance, but only if this differentiator applies for `mime_type`.
+    def file_type(cls, ctx: _FileTypeDetectionContext) -> FileType | None:
+        """Most specific file-type available when file is a Zip file, `None` otherwise.
 
-        Separate `mime_type` argument allows it to be applied to either asserted content-type or
-        guessed mime-type.
+        MS-Office 2007+ files (DOCX, PPTX, XLSX) are recognized by their main-part name. ODT and
+        EPUB files are recognized by the MIME-type stored in their `mimetype` member. A Zip
+        archive with none of these markers is reported as `FileType.ZIP`.
         """
-        return (
-            cls(ctx)
-            if mime_type
-            in (
-                "application/octet-stream",
-                "application/zip",
-                "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
-                "application/vnd.openxmlformats-officedocument.presentationml.presentation",
-                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
-            )
-            else None
-        )
+        return cls(ctx)._file_type
 
     @lazyproperty
-    def file_type(self) -> FileType | None:
+    def _file_type(self) -> FileType | None:
         """Differentiated file-type for a Zip archive.
 
-        Returns `None` if the file is not a Zip archive. Otherwise it returns `FileType.DOCX`,
-        `FileType.PPTX`, or `FileType.XLSX` when one of those applies and `FileType.ZIP` otherwise.
+        Returns `None` when file is not a Zip archive, else DOCX, XLSX, PPTX, the type named by
+        the archive's `mimetype` member (e.g. ODT, EPUB), or `FileType.ZIP` when none apply.
         """
         if not self._ctx.is_zipfile:
             return None
@@ -657,20 +683,23 @@ def file_type(self) -> FileType | None:
         with self._ctx.open() as file:
             zip = zipfile.ZipFile(file)
 
-            # NOTE(robinson) - .docx and .xlsx files are actually a zip file with a .docx/.xslx
-            # extension. If the MIME type is application/octet-stream, we check if it's a
-            # .docx/.xlsx file by looking for expected filenames within the zip file.
-            filenames = [f.filename for f in zip.filelist]
+            filenames = zip.namelist()
 
-            if all(f in filenames for f in ("word/document.xml",)):
+            if "word/document.xml" in filenames:
                 return FileType.DOCX
 
-            if all(f in filenames for f in ("xl/workbook.xml",)):
+            if "xl/workbook.xml" in filenames:
                 return FileType.XLSX
 
-            if all(f in filenames for f in ("ppt/presentation.xml",)):
+            if "ppt/presentation.xml" in filenames:
                 return FileType.PPTX
 
+            # -- ODT and EPUB files place their MIME-type in `mimetype` in the archive root --
+            if "mimetype" in filenames:
+                with zip.open("mimetype") as f:
+                    mime_type = f.read().decode("utf-8").strip()
+                    return FileType.from_mime_type(mime_type)
+
         return FileType.ZIP
 
 

From e1c5f5b0e615392afea95a2fadadb37f8d233d19 Mon Sep 17 00:00:00 2001
From: Steve Canny <stcanny@gmail.com>
Date: Mon, 16 Dec 2024 09:35:14 -0800
Subject: [PATCH 2/3] fix: recognize `.markdown` extension as MD

---
 unstructured/file_utils/filetype.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py
index fed219cd54..4c8e4d2be8 100644
--- a/unstructured/file_utils/filetype.py
+++ b/unstructured/file_utils/filetype.py
@@ -575,7 +575,20 @@ def file_type(self) -> FileType:
         """
         extension = self._ctx.extension
 
-        if extension in ".csv .eml .html .json .md .org .p7s .rst .rtf .tab .tsv".split():
+        if extension in [
+            ".csv",
+            ".eml",
+            ".html",
+            ".json",
+            ".markdown",
+            ".md",
+            ".org",
+            ".p7s",
+            ".rst",
+            ".rtf",
+            ".tab",
+            ".tsv",
+        ]:
             return FileType.from_extension(extension) or FileType.TXT
 
         # NOTE(crag): for older versions of the OS libmagic package, such as is currently

From b7a0b3b717d289ab4b9c6c964111c54363ba2a8f Mon Sep 17 00:00:00 2001
From: Steve Canny <stcanny@gmail.com>
Date: Fri, 13 Dec 2024 17:16:07 -0800
Subject: [PATCH 3/3] chore: bump CHANGELOG + __version__

---
 CHANGELOG.md                | 3 ++-
 unstructured/__version__.py | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 959f6c581b..d13d859802 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.16.12-dev2
+## 0.16.12-dev3
 
 ### Enhancements
 
@@ -9,6 +9,7 @@
 ### Fixes
 
 - **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted.
+- **CSV with asserted XLS content-type is correctly identified as CSV.** Resolves a bug where a CSV file with an asserted content-type of `application/vnd.ms-excel` was incorrectly identified as an XLS file.
 
 ## 0.16.11
 
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 0dbfa1eb73..d1e3d3bd18 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.16.12-dev2"  # pragma: no cover
+__version__ = "0.16.12-dev3"  # pragma: no cover