diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp
index 2ce67a71a5da57..c5e91369ce80e2 100644
--- a/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp
+++ b/src/plugins/intel_npu/src/al/include/intel_npu/al/config/npuw.hpp
@@ -41,6 +41,7 @@ DEFINE_OPT(NPUW_ONLINE_DUMP_PLAN, std::string, "", npuw::partitioning::online::d
 DEFINE_OPT(NPUW_PLAN, std::string, "", npuw::partitioning::plan, CompileTime);
 DEFINE_OPT(NPUW_FOLD, bool, false, npuw::partitioning::fold, CompileTime);
 DEFINE_OPT(NPUW_CWAI, bool, false, npuw::partitioning::cwai, CompileTime);
+DEFINE_OPT(NPUW_DQ, bool, false, npuw::partitioning::dyn_quant, CompileTime);
 DEFINE_OPT(NPUW_DCOFF_TYPE, std::string, "", npuw::partitioning::dcoff_type, CompileTime);
 DEFINE_OPT(NPUW_DCOFF_SCALE, bool, false, npuw::partitioning::dcoff_with_scale, CompileTime);
 DEFINE_OPT(NPUW_FUNCALL_FOR_ALL, bool, false, npuw::partitioning::funcall_for_all, CompileTime);
diff --git a/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp
index 627b6b957ebfb3..b0cf6f4a608f66 100644
--- a/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp
+++ b/src/plugins/intel_npu/src/al/include/npuw_private_properties.hpp
@@ -159,6 +159,14 @@ static constexpr ov::Property<bool> fold{"NPUW_FOLD"};
  */
 static constexpr ov::Property<bool> cwai{"NPUW_CWAI"};
 
+/**
+ * @brief
+ * Type: bool.
+ * Apply dynamic quantization transformations at the plugin side.
+ * Default value: false.
+ */
+static constexpr ov::Property<bool> dyn_quant{"NPUW_DQ"};
+
 /**
  * @brief
  * Type: std::string.
diff --git a/src/plugins/intel_npu/src/al/src/config/npuw.cpp b/src/plugins/intel_npu/src/al/src/config/npuw.cpp
index a43794e883368c..406fb9e6288fff 100644
--- a/src/plugins/intel_npu/src/al/src/config/npuw.cpp
+++ b/src/plugins/intel_npu/src/al/src/config/npuw.cpp
@@ -26,6 +26,7 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) {
     desc.add<NPUW_PLAN>();
     desc.add<NPUW_FOLD>();
     desc.add<NPUW_CWAI>();
+    desc.add<NPUW_DQ>();
     desc.add<NPUW_DCOFF_TYPE>();
     desc.add<NPUW_DCOFF_SCALE>();
     desc.add<NPUW_FUNCALL_FOR_ALL>();
diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
index 6cfb7fd7ce1dc7..bfb1b474ee0f34 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -797,6 +797,7 @@ void ov::npuw::CompiledModel::implement_properties() {
         BIND(npuw::partitioning::plan, NPUW_PLAN),
         BIND(npuw::partitioning::fold, NPUW_FOLD),
         BIND(npuw::partitioning::cwai, NPUW_CWAI),
+        BIND(npuw::partitioning::dyn_quant, NPUW_DQ),
         BIND(npuw::partitioning::funcall_for_all, NPUW_FUNCALL_FOR_ALL),
         BIND(npuw::partitioning::dcoff_type, NPUW_DCOFF_TYPE),
         BIND(npuw::partitioning::dcoff_with_scale, NPUW_DCOFF_SCALE),
diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
index 6bd1b296c3a9a9..ba1f56b060e0c4 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp
@@ -478,6 +478,8 @@ void ov::npuw::JustInferRequest::unpack_closure(std::size_t idx, RqPtr request)
     // Bind extra parameters from the function's closure
     // First, do easy things & delay heavy stuff
     std::vector<std::size_t> closure_unpack_required;
+    std::vector<std::size_t> closure_copy_required;
+
     for (std::size_t cidx = 0u; cidx < comp_model_desc.closure.size(); cidx++) {
         auto& closure = comp_model_desc.closure[cidx];
@@ -488,14 +490,27 @@ void ov::npuw::JustInferRequest::unpack_closure(std::size_t idx, RqPtr request)
             // Remember where the unpack is required
             closure_unpack_required.push_back(cidx);
         } else if (comp_model_desc.update_required[cidx]) {
-            // Easy case, just set one to another. Copy_to is also possible
-            // and even may be preferrable for some devices, like this:
-            // ```ov::get_tensor_impl(closure)->copy_to(clparam._ptr);'''
-            request->set_tensor(iport, ov::get_tensor_impl(closure));
+            if (needs_copy(idx)) {
+                // Remember where copy is required
+                closure_copy_required.push_back(cidx);
+            } else {
+                // Easy case, just set one to another
+                request->set_tensor(iport, ov::get_tensor_impl(closure));
+            }
         }
     }  // for(closure)
-    // m_ms_unpack += ov::npuw::perf::ms_to_run([&](){
-    //     ov::parallel_for(closure_unpack_required.size(), [&](std::size_t j) {
+
+    // m_ms_unpack += ov::npuw::perf::ms_to_run([&](){
+    ov::parallel_for(closure_copy_required.size(), [&](std::size_t j) {
+        auto cidx = closure_copy_required[j];
+        auto& closure = comp_model_desc.closure[cidx];
+        const auto closure_param_id = comp_model_desc.param_base + cidx;
+        auto& iport = func_desc.compiled_model->inputs()[closure_param_id];
+        auto clparam = request->get_tensor(iport);
+        ov::get_tensor_impl(closure)->copy_to(clparam._ptr);
+    });
+    // }); // ms_to_run
+
     for (std::size_t j = 0; j != closure_unpack_required.size(); j++) {
         // NB: No need to protect anything here as containers are all
         // preallocated and we only access elements under particular (thread
@@ -525,8 +540,6 @@ void ov::npuw::JustInferRequest::unpack_closure(std::size_t idx, RqPtr request)
             ov::npuw::util::unpack(ov::get_tensor_impl(closure), clparam);
         }
     }
-    //}); // ov_parallel_for
-    // }); // ms_to_run
 }
 
 void ov::npuw::JustInferRequest::recreate_subrequests(std::size_t idx) {
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.cpp
index 4e373dda5a24eb..2ee36fcb09361a 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.cpp
@@ -5,6 +5,7 @@
 #include "snapshot.hpp"
 
 #include "../../logging.hpp"
+#include "../../util.hpp"
 #include "../patterns/avoid.hpp"
 #include "../patterns/compute.hpp"
 #include "group.hpp"
@@ -72,6 +73,8 @@ void Snapshot::buildGraph() {
         ++gid;
     }
 
+    using namespace ov::npuw::util::at;
+
     for (const auto& nh : m_graph->sorted()) {
        auto gptr = m_graph->meta(nh).get<Group::GPtr>();
        auto ov_node = gptr->getInitialNode();
@@ -83,7 +86,7 @@ void Snapshot::buildGraph() {
            auto ov_node_child = target_output.get_node()->shared_from_this();

            // Insert readers from other layers
-            m_node_to_prod_cons->at(ov_node).second.insert(ov_node_child);
+            _(m_node_to_prod_cons).at(ov_node).second.insert(ov_node_child);

            // Save ports for repeated blocks pipeline
            m_ports_map.insert({{ov_node, ov_node_child}, {i, target_output.get_index()}});
@@ -91,9 +94,9 @@ void Snapshot::buildGraph() {
            if (!isOp(ov_node_child)) {
                continue;
            }
-
-            if (!m_graph->linked(nh, m_node_to_gr->at(ov_node_child)->getHandle())) {
-                m_graph->link(nh, m_node_to_gr->at(ov_node_child)->getHandle());
+            Group::GPtr gr_child = _(m_node_to_gr).at(ov_node_child);
+            if (!m_graph->linked(nh, gr_child->getHandle())) {
+                m_graph->link(nh, gr_child->getHandle());
            }
        }  // for(outputs)
@@ -103,7 +106,7 @@ void Snapshot::buildGraph() {
            auto ov_node_parent = target_input.get_node()->shared_from_this();

            // Insert writers from other layers
-            m_node_to_prod_cons->at(ov_node).first.insert(ov_node_parent);
+            _(m_node_to_prod_cons).at(ov_node).first.insert(ov_node_parent);

            // Save ports for repeated blocks pipeline
            m_ports_map.insert({{ov_node_parent, ov_node}, {target_input.get_index(), i}});
@@ -112,8 +115,9 @@ void Snapshot::buildGraph() {
                continue;
            }

-            if (!m_graph->linked(m_node_to_gr->at(ov_node_parent)->getHandle(), nh)) {
-                m_graph->link(m_node_to_gr->at(ov_node_parent)->getHandle(), nh);
+            Group::GPtr gr_parent = _(m_node_to_gr).at(ov_node_parent);
+            if (!m_graph->linked(gr_parent->getHandle(), nh)) {
+                m_graph->link(gr_parent->getHandle(), nh);
            }
        }  // for(inputs)
    }  // for(get_ordered_ops)
@@ -1028,11 +1032,11 @@ GPtrSet Snapshot::getRepGroups(const Group::GPtr& group) const {
 }
 
 const OVNodeSet& Snapshot::getNodeProducers(const OVNodePtr& node) const {
-    return m_node_to_prod_cons->at(node).first;
+    return ov::npuw::util::at::_(m_node_to_prod_cons).at(node).first;
 }
 
 const OVNodeSet& Snapshot::getNodeConsumers(const OVNodePtr& node) const {
-    return m_node_to_prod_cons->at(node).second;
+    return ov::npuw::util::at::_(m_node_to_prod_cons).at(node).second;
 }
 
 // Updated within a group during fusion
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
index 6fef5d8b6fdf94..39716df4527236 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp
@@ -9,6 +9,7 @@
 #include "intel_npu/al/config/npuw.hpp"
 #include "online/compiler.hpp"
 #include "online/utils/utils.hpp"  // getMetaDesc
+#include "openvino/core/parallel.hpp"
 #include "openvino/op/convert.hpp"
 #include "openvino/op/slice.hpp"
 #include "openvino/op/util/op_types.hpp"
@@ -16,6 +17,7 @@
 #include "openvino/util/common_util.hpp"
 #include "openvino/util/xml_parse_utils.hpp"
 #include "patterns/dcoff.hpp"
+#include "patterns/opt.hpp"
 
 namespace {
@@ -277,6 +279,7 @@ class Partitioner {
     void matchResults(const std::string& func_name);
     void createFunction(const std::string& func_name);
     void matchRepeatedSubgraphs(const std::string& func_name);
+    void optimize(const std::string& func_name);
     void decompressionCutOff(const std::string& func_name);
 
     // Final steps
@@ -1557,6 +1560,50 @@ void Partitioner::matchRepeatedSubgraphs(const std::string& func_name) {
     LOG_VERB("Done");
 }
 
+void Partitioner::optimize(const std::string& func_name) {
+    if (!cfg.get<::intel_npu::NPUW_DQ>()) {
+        LOG_VERB("No optimizations will be done to " << func_name << " in model " << model->get_friendly_name()
+                 << "...");
+        return;
+    }
+
+    LOG_VERB("Optimize function " << func_name << " in model " << model->get_friendly_name() << "...");
+    LOG_BLOCK();
+
+    ov::npuw::Function& f = P.functions.at(func_name);
+
+    ov::npuw::patterns::opt::Context ctx;
+    ov::pass::GraphRewrite rewr;
+    rewr.add_matcher<ov::npuw::patterns::opt::DQMatMulCWi>();
+    rewr.add_matcher<ov::npuw::patterns::opt::DQMatMulGQi>(std::ref(ctx));
+    rewr.add_matcher<ov::npuw::patterns::opt::DQMatMulGQ2i>(std::ref(ctx));
+    rewr.run_on_model(f._model);
+    ov::pass::Validate().run_on_model(f._model);
+
+    // Permute tensors where required
+    auto& func_group = all_functions.at(func_name);
+    for (auto&& p : ctx.closures_to_permute) {
+        auto param_idx = f._model->get_parameter_index(p.first);
+        auto closure_idx = param_idx - f._param_offset;
+        ov::parallel_for(func_group.refs.size(), [&](std::size_t f_idx) {
+            auto& funcall = func_group.refs[f_idx].get();
+            ov::npuw::util::permute(funcall._closure[closure_idx], p.second);
+        });
+    }
+
+    // Convert tensors where required
+    for (auto&& p : ctx.closures_to_f16) {
+        auto param_idx = f._model->get_parameter_index(p);
+        auto closure_idx = param_idx - f._param_offset;
+        ov::parallel_for(func_group.refs.size(), [&](std::size_t f_idx) {
+            auto& funcall = func_group.refs[f_idx].get();
+            ov::npuw::util::to_f16(funcall._closure[closure_idx]);
+        });
+    }
+
+    LOG_VERB("Done");
+}
+
 void Partitioner::decompressionCutOff(const std::string& func_name) {
     LOG_VERB("Decompression cut-off for function " << func_name << " in model " << model->get_friendly_name()
              << "...");
     LOG_BLOCK();
@@ -1826,6 +1873,7 @@ ov::npuw::Partitioning ov::npuw::getPartitioning(const std::shared_ptr<ov::Model
+            p.optimize(func_name);
@@ -1841,6 +1889,7 @@ ov::npuw::Partitioning ov::npuw::getPartitioning(const std::shared_ptr<ov::Model
+            p.optimize(func_name);
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp
new file mode 100644
--- /dev/null
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "opt.hpp"
+
+namespace ov {
+namespace npuw {
+namespace patterns {
+namespace opt {
+
+namespace opp = ov::pass::pattern;
+
+// FROM:
+//     ???(Act) --------------------------------------->
+//     Param(W) -> to(f16) -> Multiply -> to(f32) -> MatMul
+//     Param(S) ------------>
+//
+// TO:
+//     ???(Act) -> to(f16) ->
+//     Param(W) -> to(f16) -> MatMul -> Multiply -> to(f32)
+//     Param(S) -> Reshape ----------->
+//
+
+DQMatMulCWi::DQMatMulCWi() {
+    auto qweight = opp::wrap_type<ov::op::v0::Parameter>();
+    auto qcoeff = opp::wrap_type<ov::op::v0::Parameter>();
+    auto qcvtw = opp::wrap_type<ov::op::v0::Convert>({qweight});
+    auto qmuls = opp::wrap_type<ov::op::v1::Multiply>({qcvtw, qcoeff});
+    auto qcvtm = opp::wrap_type<ov::op::v0::Convert>({qmuls});
+    auto qmmi = opp::any_input();
+    auto qmm = opp::wrap_type<ov::op::v0::MatMul>({qmmi, qcvtm});
+
+    // Note: Use [=] to make sure the above objects stay alive in the callback
+    auto callback = [=](ov::pass::pattern::Matcher& m) {
+        auto& node_to_output = m.get_pattern_value_map();
+
+        auto matched_node_qweight = node_to_output.at(qweight).get_node_shared_ptr();
+        auto matched_node_qcoeff = node_to_output.at(qcoeff).get_node_shared_ptr();
+        auto matched_node_matmul = node_to_output.at(qmm).get_node_shared_ptr();
+
+        auto matched_qweight = std::static_pointer_cast<ov::op::v0::Parameter>(matched_node_qweight);
+        auto matched_qcoeff = std::static_pointer_cast<ov::op::v0::Parameter>(matched_node_qcoeff);
+        auto matched_matmul = std::static_pointer_cast<ov::op::v0::MatMul>(matched_node_matmul);
+
+        auto qcoeff_shape = matched_qcoeff->output(0).get_shape();
+
+        if (ov::element::i4 == matched_qweight->get_element_type() && qcoeff_shape[1] == 1 &&
+            !matched_matmul->get_transpose_a() && matched_matmul->get_transpose_b()) {
+            auto matched_node_cvtw = node_to_output.at(qcvtw).get_node_shared_ptr();
+            auto matched_node_cvtm = node_to_output.at(qcvtm).get_node_shared_ptr();
+            auto matched_node_muls = node_to_output.at(qmuls).get_node_shared_ptr();
+            auto matched_node_mmi = node_to_output.at(qmmi).get_node_shared_ptr();
+
+            // Reconnect MatMul to read from Convert(W) directly.
+            // Note: ACT is f32 so has to be converted too.
+            auto new_cvt_act = std::make_shared<ov::op::v0::Convert>(matched_node_mmi, ov::element::f16);
+            matched_matmul->input(0).replace_source_output(new_cvt_act);
+            matched_matmul->input(1).replace_source_output(matched_node_cvtw);
+
+            // Store MatMul's readers
+            auto mm_readers = matched_matmul->output(0).get_target_inputs();
+
+            // Introduce a Reshape to alter Scale factor's shape
+            auto new_dims = std::vector<std::size_t>{qcoeff_shape[1], qcoeff_shape[0]};
+            auto new_const = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{2}, new_dims);
+            auto new_reshape = std::make_shared<ov::op::v1::Reshape>(matched_node_qcoeff, new_const, false);
+
+            // Reconnect both Multiply inputs. Drop all outputs
+            matched_node_muls->input(0).replace_source_output(matched_matmul);
+            matched_node_muls->input(1).replace_source_output(new_reshape);
+            for (auto&& r : matched_node_muls->output(0).get_target_inputs()) {
+                matched_node_muls->output(0).remove_target_input(r);
+            }
+
+            // Reconnect Convert(M) to convert the Multiply's result
+            matched_node_cvtm->input(0).replace_source_output(matched_node_muls);
+
+            // Reconnect MatMul's old readers to Convert(Multiply)
+            for (auto&& r : mm_readers) {
+                r.replace_source_output(matched_node_cvtm);
+            }
+        }
+
+        return true;  // root has changed
+    };
+    register_matcher(std::make_shared<opp::Matcher>(qmm, "OptDQMatMulCWi"), std::move(callback));
+}
+
+// FROM:
+//     ???(Act) -------------------------------------------->
+//     Param(W) -> Convert(f16|f32) -> Multiply -> Reshape -> MatMul
+//     Param(S) --------------------->
+//
+// WHERE (example):
+//     Act: [ 1,  1, 4096]
+//     W:   [32,128,11008]
+//     S:   [32,  1,11008]
+//                                             [1, 1 ,128]   x
+// TO:                                         [1,11K,128]T  =
+//                  [32,1,128]                 [1, 1 ,11K]          [32,1,11K]
+//     ???(Act) -> Reshape > Split(/32) ->[to(f16) ->                ]}
+//     Param(W*) -----------> Split(/32) ->[to(f16) -> MatMul        ]} Concat  v
+//     Param(S) ----------------------------------------------> Multiply
+//                                                               Reshape(1,a,b,c)
+//                                                               ReduceSum(1)
+//                                                               Reshape(a,b,c)
+//                                                               to(f32)
+// WHERE:
+//     W* : [32,11008,128]
+
+DQMatMulGQi::DQMatMulGQi(Context::Ref ctx) {
+    auto qweight = opp::wrap_type<ov::op::v0::Parameter>();
+    auto qcoeff = opp::wrap_type<ov::op::v0::Parameter>();
+    auto qcvtw = opp::wrap_type<ov::op::v0::Convert>({qweight});
+    auto qmuls = opp::wrap_type<ov::op::v1::Multiply>({qcvtw, qcoeff});
+    auto qreshp = opp::wrap_type<ov::op::v1::Reshape>({qmuls, opp::any_input()});
+    auto qmmi = opp::any_input();
+    auto qmm = opp::wrap_type<ov::op::v0::MatMul>({qmmi, qreshp});
+
+    // Note: Use [=] to make sure the above objects stay alive in the callback
+    auto callback = [=](ov::pass::pattern::Matcher& m) {
+        auto& node_to_output = m.get_pattern_value_map();
+
+        auto matched_node_qweight = node_to_output.at(qweight).get_node_shared_ptr();
+        auto matched_node_qcoeff = node_to_output.at(qcoeff).get_node_shared_ptr();
+        auto matched_node_matmul = node_to_output.at(qmm).get_node_shared_ptr();
+        auto matched_out_mmi = node_to_output.at(qmmi);
+
+        auto matched_qweight = std::static_pointer_cast<ov::op::v0::Parameter>(matched_node_qweight);
+        auto matched_qcoeff = std::static_pointer_cast<ov::op::v0::Parameter>(matched_node_qcoeff);
+        auto matched_matmul = std::static_pointer_cast<ov::op::v0::MatMul>(matched_node_matmul);
+
+        auto qweight_shape = matched_qweight->output(0).get_shape();
+        auto qcoeff_shape = matched_qcoeff->output(0).get_shape();
+        auto act_shape = matched_out_mmi.get_shape();
+        auto out_shape = matched_node_matmul->output(0).get_shape();
+
+        if (ov::element::i4 == matched_qweight->get_element_type() &&
+            ov::element::f32 == matched_qcoeff->get_element_type() && qcoeff_shape.size() == 3 &&
+            qweight_shape.size() == 3 && act_shape.size() == 3 && qcoeff_shape[0] == qweight_shape[0] &&
+            qcoeff_shape[1] == 1 && qcoeff_shape[2] == qweight_shape[2] && !matched_matmul->get_transpose_a() &&
+            !matched_matmul->get_transpose_b()) {
+            // Mark W closure to transpose, and transpose the respective parameter
+            ctx.get().permute(matched_qweight, {0, 2, 1});
+
+            // Mark S closure to be lowered to f16
+            ctx.get().to_f16(matched_qcoeff);
+
+            ov::Shape tw_shape = {qweight_shape[0], qweight_shape[2], qweight_shape[1]};
+            matched_qweight->set_partial_shape(tw_shape);
+            matched_qweight->validate_and_infer_types();
+
+            matched_qcoeff->set_element_type(ov::element::f16);
+            matched_qcoeff->validate_and_infer_types();
+
+            // Reshape the Act to group format
+            const auto NSPLIT = qweight_shape[0];
+            std::vector<std::size_t> rshp_act_v = {NSPLIT, act_shape[1], act_shape[2] / NSPLIT};
+            auto rshp_act_c = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{3}, rshp_act_v);
+            auto rshp_act = std::make_shared<ov::op::v1::Reshape>(matched_out_mmi, rshp_act_c, false);
+
+            // Split Act and W, and S tensors by NSPLIT
+            auto split_axis = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{}, 0);
+            auto split_a = std::make_shared<ov::op::v1::Split>(rshp_act, split_axis, NSPLIT);
+            auto split_w = std::make_shared<ov::op::v1::Split>(matched_qweight, split_axis, NSPLIT);
+
+            // Do the CW MM for every split
+            std::vector<std::shared_ptr<ov::Node>> to_concat;
+            for (std::size_t i = 0; i < NSPLIT; i++) {
+                auto a_f16 = std::make_shared<ov::op::v0::Convert>(split_a->output(i), ov::element::f16);
+                auto w_f16 = std::make_shared<ov::op::v0::Convert>(split_w->output(i), ov::element::f16);
+                auto m_f16 = std::make_shared<ov::op::v0::MatMul>(a_f16, w_f16, false, true);
+                to_concat.push_back(m_f16);
+            }
+
+            // Now concat and scale the result
+            auto concat = std::make_shared<ov::op::v0::Concat>(to_concat, 0);
+            auto s_f16 = std::make_shared<ov::op::v1::Multiply>(concat, matched_qcoeff);
+
+            // Now reshape to a better shape, ReduceSum, and reshape to the right size again
+            std::vector<std::size_t> rshp_ccat_v = {1, NSPLIT, 1, qweight_shape[2]};
+            auto rshp_ccat_c = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{4}, rshp_ccat_v);
+            auto rshp_ccat = std::make_shared<ov::op::v1::Reshape>(s_f16, rshp_ccat_c, false);
+
+            auto reduce_axis = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{}, 1);
+            auto reduce = std::make_shared<ov::op::v1::ReduceSum>(rshp_ccat, reduce_axis, true);
+
+            auto rshp_out_c = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{3}, out_shape);
+            auto rshp_out = std::make_shared<ov::op::v1::Reshape>(reduce, rshp_out_c, false);
+
+            // Convert the result to f32 to maintain the graph contracts. FIXME should be avoided
+            auto out = std::make_shared<ov::op::v0::Convert>(rshp_out, ov::element::f32);
+
+            // Now.. Reconnect the matmul readers to the new output (reducesum)
+            for (auto&& r : matched_matmul->output(0).get_target_inputs()) {
+                r.replace_source_output(out);
+            }
+            return true;  // root has changed
+        }
+        return false;  // did nothing here
+    };
+    register_matcher(std::make_shared<opp::Matcher>(qmm, "OptDQMatMulGQi"), std::move(callback));
+}
+
+// FROM:
+//     ???(Act) -------------------------------------------------------->
+//     Param(W) -> Convert(f16) -> Multiply -> Reshape -> Convert(f32) -> MatMul
+//     Param(S) ----------------->
+//
+// WHERE (example):
+//     Act: [  1, 1,2048]
+//     W:   [512,16, 128]
+//     S:   [512,16,   1]
+//                                             [1, 1,128]   x
+// TO:                                         [1,512,128]T =
+//                  [16,1,128]                 [1, 1,512]           [16,1,512]
+//     ???(Act) -> Reshape > Split(/16) ->[to(f16) ->                ]}
+//     Param(W*) -----------> Split(/16) ->[to(f16) -> MatMul    v   ]} Concat
+//     Param(S) ------------> Split(/16) ->[---------> Multiply      ]}    v
+//                                                              Reshape(1,16,1,512)
+//                                                              ReduceSum(1)
+//                                                              Reshape(  1,1,512)
+//                                                              to(f32)
+// WHERE:
+//     W* : [16,512,128]
+
+DQMatMulGQ2i::DQMatMulGQ2i(Context::Ref ctx) {
+    auto qweight = opp::wrap_type<ov::op::v0::Parameter>();
+    auto qcoeff = opp::wrap_type<ov::op::v0::Parameter>();
+    auto qcvtw = opp::wrap_type<ov::op::v0::Convert>({qweight});
+    auto qmuls = opp::wrap_type<ov::op::v1::Multiply>({qcvtw, qcoeff});
+    auto qreshp = opp::wrap_type<ov::op::v1::Reshape>({qmuls, opp::any_input()});
+    auto qcvtr = opp::wrap_type<ov::op::v0::Convert>({qreshp});
+    auto qmmi = opp::any_input();
+    auto qmm = opp::wrap_type<ov::op::v0::MatMul>({qmmi, qcvtr});
+
+    // Note: Use [=] to make sure the above objects stay alive in the callback
+    auto callback = [=](ov::pass::pattern::Matcher& m) {
+        auto& node_to_output = m.get_pattern_value_map();
+
+        auto matched_node_qweight = node_to_output.at(qweight).get_node_shared_ptr();
+        auto matched_node_qcoeff = node_to_output.at(qcoeff).get_node_shared_ptr();
+        auto matched_node_matmul = node_to_output.at(qmm).get_node_shared_ptr();
+        auto matched_out_mmi = node_to_output.at(qmmi);
+
+        auto matched_qweight = std::static_pointer_cast<ov::op::v0::Parameter>(matched_node_qweight);
+        auto matched_qcoeff = std::static_pointer_cast<ov::op::v0::Parameter>(matched_node_qcoeff);
+        auto matched_matmul = std::static_pointer_cast<ov::op::v0::MatMul>(matched_node_matmul);
+
+        auto qweight_shape = matched_qweight->output(0).get_shape();
+        auto qcoeff_shape = matched_qcoeff->output(0).get_shape();
+        auto act_shape = matched_out_mmi.get_shape();
+        auto out_shape = matched_node_matmul->output(0).get_shape();
+
+        if (ov::element::i4 == matched_qweight->get_element_type() && qweight_shape.size() == 3 &&
+            ov::element::f16 == matched_qcoeff->get_element_type() && qcoeff_shape.size() == 3 &&
+            act_shape.size() == 3 && qcoeff_shape[0] == qweight_shape[0] && qcoeff_shape[2] == 1 &&
+            qcoeff_shape[1] == qweight_shape[1] && !matched_matmul->get_transpose_a() &&
+            matched_matmul->get_transpose_b()) {
+            // Mark W closure to transpose, and transpose the respective parameter
+            ctx.get().permute(matched_qweight, {1, 0, 2});
+
+            ov::Shape tw_shape = {qweight_shape[1], qweight_shape[0], qweight_shape[2]};
+            matched_qweight->set_partial_shape(tw_shape);
+            matched_qweight->validate_and_infer_types();
+
+            // Reshape the Act to group format
+            const auto NSPLIT = qweight_shape[1];
+            std::vector<std::size_t> rshp_act_v = {NSPLIT, 1, act_shape[2] / NSPLIT};
+            auto rshp_act_c = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{3}, rshp_act_v);
+            auto rshp_act = std::make_shared<ov::op::v1::Reshape>(matched_out_mmi, rshp_act_c, false);
+
+            // Split Act and W, and S tensors by NSPLIT
+            auto split_axis = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{}, 0);
+            auto split_a = std::make_shared<ov::op::v1::Split>(rshp_act, split_axis, NSPLIT);
+            auto split_w = std::make_shared<ov::op::v1::Split>(matched_qweight, split_axis, NSPLIT);
+
+            auto split_axis_s = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{}, 1);
+            auto split_s = std::make_shared<ov::op::v1::Split>(matched_qcoeff, split_axis_s, NSPLIT);
+
+            std::vector<std::size_t> rshp_scale_v = {1, 1, qcoeff_shape[0]};
+            auto rshp_scale_c = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{3}, rshp_scale_v);
+
+            // Do the CW MM for every split
+            std::vector<std::shared_ptr<ov::Node>> to_concat;
+            for (std::size_t i = 0; i < NSPLIT; i++) {
+                auto a_f16 = std::make_shared<ov::op::v0::Convert>(split_a->output(i), ov::element::f16);
+                auto w_f16 = std::make_shared<ov::op::v0::Convert>(split_w->output(i), ov::element::f16);
+                auto m_f16 = std::make_shared<ov::op::v0::MatMul>(a_f16, w_f16, false, true);
+
+                auto r_f16 = std::make_shared<ov::op::v1::Reshape>(split_s->output(i), rshp_scale_c, false);
+                auto s_f16 = std::make_shared<ov::op::v1::Multiply>(m_f16, r_f16);
+                to_concat.push_back(s_f16);
+            }
+
+            // Now concat and scale the result
+            auto concat = std::make_shared<ov::op::v0::Concat>(to_concat, 0);
+
+            // Now reshape to a better shape, ReduceSum, and reshape to the right size again
+            std::vector<std::size_t> rshp_ccat_v = {1, NSPLIT, 1, qweight_shape[0]};
+            auto rshp_ccat_c = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{4}, rshp_ccat_v);
+            auto rshp_ccat = std::make_shared<ov::op::v1::Reshape>(concat, rshp_ccat_c, false);
+
+            auto reduce_axis = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{}, 1);
+            auto reduce = std::make_shared<ov::op::v1::ReduceSum>(rshp_ccat, reduce_axis, true);
+
+            auto rshp_out_c = std::make_shared<ov::op::v0::Constant>(ov::element::i32, ov::Shape{3}, out_shape);
+            auto rshp_out = std::make_shared<ov::op::v1::Reshape>(reduce, rshp_out_c, false);
+
+            // Convert the result to f32 to maintain the graph contracts. FIXME should be avoided
+            auto out = std::make_shared<ov::op::v0::Convert>(rshp_out, ov::element::f32);
+
+            // Now.. Reconnect the matmul readers to the new output (reducesum)
+            for (auto&& r : matched_matmul->output(0).get_target_inputs()) {
+                r.replace_source_output(out);
+            }
+            return true;  // root has changed
+        }
+        return false;  // did nothing here
+    };
+    register_matcher(std::make_shared<opp::Matcher>(qmm, "OptDQMatMulGQ2i"), std::move(callback));
+}
+
+}  // namespace opt
+}  // namespace patterns
+}  // namespace npuw
+}  // namespace ov
diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp
new file mode 100644
index 00000000000000..fee216b706639c
--- /dev/null
+++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.hpp
@@ -0,0 +1,52 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <functional>
+#include <map>
+#include <set>
+
+#include "openvino/openvino.hpp"
+#include "openvino/pass/graph_rewrite.hpp"
+
+namespace ov {
+namespace npuw {
+
+// Model optimization patterns. Triggered by the plugin at the very top
+namespace patterns {
+namespace opt {
+
+class DQMatMulCWi : public ov::pass::MatcherPass {
+public:
+    DQMatMulCWi();
+};
+
+struct Context {
+    using PPtr = std::shared_ptr<ov::op::v0::Parameter>;
+
+    using Axes = std::vector<std::size_t>;
+    std::map<PPtr, Axes> closures_to_permute;
+    void permute(PPtr orig_param, const Axes& order);
+
+    std::set<PPtr> closures_to_f16;
+    void to_f16(PPtr orig_param);
+
+    using Ref = std::reference_wrapper<Context>;
+};
+
+class DQMatMulGQi : public ov::pass::MatcherPass {
+public:
+    explicit DQMatMulGQi(Context::Ref ctx);
+};
+
+class DQMatMulGQ2i : public ov::pass::MatcherPass {
+public:
+    explicit DQMatMulGQ2i(Context::Ref ctx);
+};
+
+}  // namespace opt
+}  // namespace patterns
+}  // namespace npuw
+}  // namespace ov
diff --git a/src/plugins/intel_npu/src/plugin/npuw/util.cpp b/src/plugins/intel_npu/src/plugin/npuw/util.cpp
index 7d74192a01a2b6..5a5f8f733d1a7a 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/util.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/util.cpp
@@ -15,6 +15,7 @@
 #include "logging.hpp"
 #include "openvino/op/constant.hpp"
+#include "openvino/op/transpose.hpp"
 #include "openvino/op/util/op_types.hpp"
 
 #ifdef UNPACK_PROFILING
@@ -1405,3 +1406,105 @@ void ov::npuw::util::to_f32(const ov::Tensor& in, ov::Tensor& out) {
         break;
     }
 }
+
+void ov::npuw::util::to_f16(ov::Tensor& t) {
+    ov::Shape shape = t.get_shape();
+    NPUW_ASSERT(t.get_element_type() == ov::element::f32);
+    NPUW_ASSERT(t.get_size() % 8 == 0);
+    NPUW_ASSERT(t.is_continuous());
+
+    ov::Tensor tnew(ov::element::f16, shape);
+
+    const float* psrc = t.data<float>();
+    uint8_t* pdst = static_cast<uint8_t*>(tnew.data());
+
+    for (std::size_t i = 0; i < t.get_size() / 8; i++) {
+        __m256 vsrc = _mm256_loadu_ps(psrc);
+        __m128i vout = _mm256_cvtps_ph(vsrc, _MM_FROUND_TO_NEAREST_INT);
+        __m128i* pout = reinterpret_cast<__m128i*>(pdst);
+        _mm_storeu_si128(pout, vout);
+        psrc += 8;        // offset in sizeof(float)
+        pdst += (8 * 2);  // offset in bytes
+    }
+
+    t = std::move(tnew);
+}
+
+inline uint8_t tread_4b(const ov::Tensor& t, std::size_t r, std::size_t c, std::size_t COLS) {
+    const uint8_t* tdata = static_cast<const uint8_t*>(t.data());
+    const uint8_t* trow = tdata + r * COLS / 2;
+    const uint8_t* telem = trow + c / 2;
+    if (c % 2 == 0) {
+        return lo4(*telem);
+    }
+    return hi4(*telem);
+}
+
+inline void twrite_4b(ov::Tensor& t, uint8_t value, std::size_t r, std::size_t c, std::size_t COLS) {
+    uint8_t* tdata = static_cast<uint8_t*>(t.data());
+    uint8_t* trow = tdata + r * COLS / 2;
+    uint8_t* telem = trow + c / 2;
+    if (c % 2 == 0) {
+        *telem = (hi4(*telem) << 4) | lo4(value);
+    } else {
+        *telem = (lo4(value) << 4) | lo4(*telem);
+    }
+}
+
+void ov::npuw::util::transpose(ov::Tensor& t) {
+    ov::Shape shape = t.get_shape();
+    NPUW_ASSERT(shape.size() == 3);  // Yes, so far only transpose 3D tensors
+    NPUW_ASSERT(t.get_element_type() == ov::element::i4);
+
+    ov::Shape tshape = {shape[2], shape[0], shape[1]};
+    ov::Tensor tnew(t.get_element_type(), tshape);
+
+    const auto IN_ROWS = shape[0] * shape[1];
+    const auto IN_COLS = shape[2];
+    for (std::size_t i = 0; i < IN_ROWS; i++) {
+        for (std::size_t j = 0; j < IN_COLS; j++) {
+            uint8_t value = tread_4b(t, i, j, IN_COLS);
+            twrite_4b(tnew, value, j, i, IN_ROWS);
+        }
+    }
+    t = std::move(tnew);
+}
+
+void ov::npuw::util::permute(ov::Tensor& t, const std::vector<std::size_t>& axes) {
+    ov::Shape shape = t.get_shape();
+    NPUW_ASSERT(shape.size() == 3);                         // Yes, so far only transpose 3D tensors
+    NPUW_ASSERT(t.get_element_type() == ov::element::i4);   // And, yes, 4bit only!
+
+    if (axes[0] == 2 && axes[1] == 0 && axes[2] == 1) {
+        transpose(t);
+    } else if (axes[0] == 0 && axes[1] == 2 && axes[2] == 1) {
+        ov::Shape tshape = {shape[0], shape[2], shape[1]};
+        ov::Tensor tnew(t.get_element_type(), tshape);
+
+        for (std::size_t p = 0; p < shape[0]; p++) {
+            for (std::size_t r = 0; r < shape[1]; r++) {
+                for (std::size_t c = 0; c < shape[2]; c++) {
+                    uint8_t value = tread_4b(t, p * shape[1] + r, c, shape[2]);
+                    twrite_4b(tnew, value, p * shape[2] + c, r, shape[1]);
+                }
+            }
+        }
+        t = std::move(tnew);
+    } else if (axes[0] == 1 && axes[1] == 0 && axes[2] == 2) {
+        ov::Shape tshape = {shape[1], shape[0], shape[2]};
+        ov::Tensor tnew(t.get_element_type(), tshape);
+
+        // Iterate over output tensor coordinates
+        for (std::size_t p = 0; p < tshape[0]; p++) {
+            for (std::size_t r = 0; r < tshape[1]; r++) {
+                for (std::size_t c = 0; c < tshape[2]; c++) {
+                    uint8_t value = tread_4b(t, r, p * shape[2] + c, shape[1] * shape[2]);
+                    twrite_4b(tnew, value, p * tshape[1] + r, c, tshape[2]);
+                }
+            }
+        }
+        t = std::move(tnew);
+    } else {
+        NPUW_ASSERT(false && "Not supported yet");
+    }
+}
diff --git a/src/plugins/intel_npu/src/plugin/npuw/util.hpp b/src/plugins/intel_npu/src/plugin/npuw/util.hpp
index 88bb7896d94b0f..2a3c0ea7787145 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/util.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/util.hpp
@@ -6,6 +6,7 @@
 
 #include <string>
 
+#include "logging.hpp"
 #include "openvino/runtime/itensor.hpp"
 #include "openvino/runtime/so_ptr.hpp"
 
@@ -50,6 +51,48 @@ void unpack(const ov::SoPtr<ov::ITensor>& from,
             const UnpackOptions& unpack_options = UnpackOptions{true, 16, false});
 
 void to_f32(const ov::Tensor& in, ov::Tensor& out);
+void to_f16(ov::Tensor& t);
+void transpose(ov::Tensor& t);
+void permute(ov::Tensor& t, const std::vector<std::size_t>& axes);
+
+namespace at {
+template <typename M>
+struct Impl {
+    using V = typename M::mapped_type;
+
+    M* m = nullptr;
+    explicit Impl(M* pM) : m(pM) {}
+
+    template <typename K>
+    V& at(const K& k) {
+        const auto iter = m->find(k);
+        if (iter == m->end()) {
+            std::stringstream ss;
+            ss << "Key " << k << " is not found in a map of type " << typeid(m).name();
+            const auto msg = ss.str();
+            LOG_ERROR(msg);
+            throw std::out_of_range(msg);
+        }
+        return iter->second;
+    }
+
+    template <typename K>
+    const V& at(const K& k) const {
+        return const_cast<Impl*>(this)->at(k);
+    }
+};
+
+template <typename M>
+Impl<M> _(M* pM) {
+    return Impl<M>(pM);
+}
+
+template <typename M>
+Impl<M> _(std::shared_ptr<M> pM) {
+    return Impl<M>(pM.get());
+}
+
+}  // namespace at
 }  // namespace util
 }  // namespace npuw
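
Editor's note: for reference, a minimal sketch of how an application could request the new option, assuming the NPU plugin forwards the private "NPUW_DQ" key (added by this patch) as a plain string-valued config entry; the model path and device name below are placeholders, and additional NPUW-enabling options may also be required in a real setup.

```cpp
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    auto model = core.read_model("model.xml");  // placeholder path

    // "NPUW_DQ" is the boolean key introduced by this patch; passing it via an
    // AnyMap is an assumption about how private NPUW options reach the plugin.
    ov::AnyMap config{{"NPUW_DQ", "YES"}};
    auto compiled = core.compile_model(model, "NPU", config);
    return 0;
}
```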
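Editor's note: DQMatMulCWi relies on the fact that a per-output-column scale commutes with the MatMul, i.e. multiplying each weight column by its scale before the MatMul gives the same result as scaling the corresponding output column afterwards, which is why the Multiply can be moved past the (much larger) weight tensor. A tiny standalone sanity check of that identity, with made-up sizes and values:

```cpp
#include <array>
#include <cassert>
#include <cmath>
#include <cstddef>

int main() {
    // A: [1 x 2] activation row, W: [2 x 3] weights, s: per-column scales [3].
    std::array<float, 2> A{1.0f, 2.0f};
    std::array<std::array<float, 3>, 2> W{{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}};
    std::array<float, 3> s{0.5f, 0.25f, 2.0f};

    for (std::size_t c = 0; c < 3; c++) {
        // Original graph: scale the weight column first, then MatMul.
        float scale_then_mm = A[0] * (W[0][c] * s[c]) + A[1] * (W[1][c] * s[c]);
        // Rewritten graph: MatMul first, then scale the (much smaller) output.
        float mm_then_scale = (A[0] * W[0][c] + A[1] * W[1][c]) * s[c];
        assert(std::fabs(scale_then_mm - mm_then_scale) < 1e-5f);
    }
    return 0;
}
```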
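Editor's note: DQMatMulGQi marks the 4-bit weight closure for a {0, 2, 1} permute, i.e. the last two dimensions of each [G, K, N] group tensor are swapped on the host (for the example in the pattern comment, [32,128,11008] becomes [32,11008,128]). A minimal sketch of that index mapping on a plain one-byte-per-element array; the real ov::npuw::util::permute in util.cpp applies the same mapping to packed i4 nibbles via tread_4b/twrite_4b:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Swap the last two axes of a [P, R, C] tensor stored row-major, one byte per
// element; the destination shape is [P, C, R].
std::vector<uint8_t> permute_021(const std::vector<uint8_t>& src, std::size_t P, std::size_t R, std::size_t C) {
    std::vector<uint8_t> dst(src.size());
    for (std::size_t p = 0; p < P; p++) {
        for (std::size_t r = 0; r < R; r++) {
            for (std::size_t c = 0; c < C; c++) {
                // Element (p, r, c) of the source goes to (p, c, r) of the destination.
                dst[(p * C + c) * R + r] = src[(p * R + r) * C + c];
            }
        }
    }
    return dst;
}
```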
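Editor's note: the `ov::npuw::util::at::_` helper added in util.hpp wraps a map (or a shared_ptr to one) so that a failed lookup is logged before throwing, instead of the silent std::out_of_range from a bare map::at. A simplified standalone sketch of the same idea, with std::cerr standing in for LOG_ERROR and hypothetical names:

```cpp
#include <iostream>
#include <map>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <string>
#include <typeinfo>

// Simplified stand-in for ov::npuw::util::at::Impl: look up a key and report
// it on failure rather than throwing a bare std::out_of_range.
template <typename M>
struct Checked {
    M* m = nullptr;
    explicit Checked(M* pM) : m(pM) {}

    template <typename K>
    typename M::mapped_type& at(const K& k) {
        auto iter = m->find(k);
        if (iter == m->end()) {
            std::stringstream ss;
            ss << "Key " << k << " is not found in a map of type " << typeid(m).name();
            std::cerr << ss.str() << std::endl;  // LOG_ERROR in the real helper
            throw std::out_of_range(ss.str());
        }
        return iter->second;
    }
};

template <typename M>
Checked<M> _(const std::shared_ptr<M>& pM) {
    return Checked<M>(pM.get());
}

int main() {
    auto prods = std::make_shared<std::map<std::string, int>>();
    (*prods)["matmul_0"] = 1;
    std::cout << _(prods).at("matmul_0") << "\n";  // a missing key would be logged, then thrown
    return 0;
}
```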