diff --git a/include/heyoka/detail/llvm_helpers.hpp b/include/heyoka/detail/llvm_helpers.hpp
index 4ab3cf43b..f8227066b 100644
--- a/include/heyoka/detail/llvm_helpers.hpp
+++ b/include/heyoka/detail/llvm_helpers.hpp
@@ -57,7 +57,9 @@ HEYOKA_DLL_PUBLIC std::uint64_t get_alignment(llvm::Module &, llvm::Type *);
 
 HEYOKA_DLL_PUBLIC llvm::Value *load_vector_from_memory(ir_builder &, llvm::Value *, std::uint32_t);
 HEYOKA_DLL_PUBLIC void store_vector_to_memory(ir_builder &, llvm::Value *, llvm::Value *);
-llvm::Value *gather_vector_from_memory(ir_builder &, llvm::Type *, llvm::Value *);
+
+HEYOKA_DLL_PUBLIC llvm::Value *gather_vector_from_memory(ir_builder &, llvm::Type *, llvm::Value *);
+HEYOKA_DLL_PUBLIC void scatter_vector_to_memory(ir_builder &, llvm::Value *, llvm::Value *);
 
 HEYOKA_DLL_PUBLIC llvm::Value *vector_splat(ir_builder &, llvm::Value *, std::uint32_t);
 
diff --git a/include/heyoka/expression.hpp b/include/heyoka/expression.hpp
index 516461665..a8bbc4808 100644
--- a/include/heyoka/expression.hpp
+++ b/include/heyoka/expression.hpp
@@ -381,35 +381,9 @@ inline llvm::Value *taylor_diff(llvm_state &s, const expression &ex, const std::
     }
 }
 
-HEYOKA_DLL_PUBLIC llvm::Function *taylor_c_diff_func_dbl(llvm_state &, const expression &, std::uint32_t, std::uint32_t,
-                                                         bool);
-
-HEYOKA_DLL_PUBLIC llvm::Function *taylor_c_diff_func_ldbl(llvm_state &, const expression &, std::uint32_t,
-                                                          std::uint32_t, bool);
-
-#if defined(HEYOKA_HAVE_REAL128)
-
-HEYOKA_DLL_PUBLIC llvm::Function *taylor_c_diff_func_f128(llvm_state &, const expression &, std::uint32_t,
-                                                          std::uint32_t, bool);
-
-#endif
-
 template <typename T>
-inline llvm::Function *taylor_c_diff_func(llvm_state &s, const expression &ex, std::uint32_t n_uvars,
-                                          std::uint32_t batch_size, bool high_accuracy)
-{
-    if constexpr (std::is_same_v<T, double>) {
-        return taylor_c_diff_func_dbl(s, ex, n_uvars, batch_size, high_accuracy);
-    } else if constexpr (std::is_same_v<T, long double>) {
-        return taylor_c_diff_func_ldbl(s, ex, n_uvars, batch_size, high_accuracy);
-#if defined(HEYOKA_HAVE_REAL128)
-    } else if constexpr (std::is_same_v<T, mppp::real128>) {
-        return taylor_c_diff_func_f128(s, ex, n_uvars, batch_size, high_accuracy);
-#endif
-    } else {
-        static_assert(detail::always_false_v<T>, "Unhandled type.");
-    }
-}
+HEYOKA_DLL_PUBLIC llvm::Function *taylor_c_diff_func(llvm_state &, const expression &, std::uint32_t, std::uint32_t,
+                                                     bool, std::uint32_t);
 
 HEYOKA_DLL_PUBLIC std::uint32_t get_param_size(const expression &);
 
diff --git a/include/heyoka/func.hpp b/include/heyoka/func.hpp
index fbd16dd25..d90079439 100644
--- a/include/heyoka/func.hpp
+++ b/include/heyoka/func.hpp
@@ -135,10 +135,13 @@ struct HEYOKA_DLL_PUBLIC func_inner_base {
                                           const std::vector<llvm::Value *> &, llvm::Value *, llvm::Value *,
                                           std::uint32_t, std::uint32_t, std::uint32_t, std::uint32_t, bool) const = 0;
 #endif
-    virtual llvm::Function *taylor_c_diff_func_dbl(llvm_state &, std::uint32_t, std::uint32_t, bool) const = 0;
-    virtual llvm::Function *taylor_c_diff_func_ldbl(llvm_state &, std::uint32_t, std::uint32_t, bool) const = 0;
+    virtual llvm::Function *taylor_c_diff_func_dbl(llvm_state &, std::uint32_t, std::uint32_t, bool,
+                                                   std::uint32_t) const = 0;
+    virtual llvm::Function *taylor_c_diff_func_ldbl(llvm_state &, std::uint32_t, std::uint32_t, bool,
+                                                    std::uint32_t) const = 0;
 #if defined(HEYOKA_HAVE_REAL128)
-    virtual llvm::Function *taylor_c_diff_func_f128(llvm_state &, std::uint32_t, std::uint32_t, bool) const = 0;
+    virtual llvm::Function *taylor_c_diff_func_f128(llvm_state &, std::uint32_t, std::uint32_t, bool,
+                                                    std::uint32_t) const = 0;
 #endif
 
 private:
@@ -310,7 +313,7 @@ template <typename T>
 using func_taylor_c_diff_func_dbl_t
     = decltype(std::declval<std::add_lvalue_reference_t<const T>>().taylor_c_diff_func_dbl(
         std::declval<llvm_state &>(), std::declval<std::uint32_t>(), std::declval<std::uint32_t>(),
-        std::declval<bool>()));
+        std::declval<bool>(), std::declval<std::uint32_t>()));
 
 template <typename T>
 inline constexpr bool func_has_taylor_c_diff_func_dbl_v
@@ -320,7 +323,7 @@ template <typename T>
 using func_taylor_c_diff_func_ldbl_t
     = decltype(std::declval<std::add_lvalue_reference_t<const T>>().taylor_c_diff_func_ldbl(
         std::declval<llvm_state &>(), std::declval<std::uint32_t>(), std::declval<std::uint32_t>(),
-        std::declval<bool>()));
+        std::declval<bool>(), std::declval<std::uint32_t>()));
 
 template <typename T>
 inline constexpr bool func_has_taylor_c_diff_func_ldbl_v
@@ -332,7 +335,7 @@ template <typename T>
 using func_taylor_c_diff_func_f128_t
     = decltype(std::declval<std::add_lvalue_reference_t<const T>>().taylor_c_diff_func_f128(
         std::declval<llvm_state &>(), std::declval<std::uint32_t>(), std::declval<std::uint32_t>(),
-        std::declval<bool>()));
+        std::declval<bool>(), std::declval<std::uint32_t>()));
 
 template <typename T>
 inline constexpr bool func_has_taylor_c_diff_func_f128_v
@@ -593,20 +596,20 @@ struct HEYOKA_DLL_PUBLIC_INLINE_CLASS func_inner final : func_inner_base {
     }
 #endif
     llvm::Function *taylor_c_diff_func_dbl(llvm_state &s, std::uint32_t n_uvars, std::uint32_t batch_size,
-                                           bool high_accuracy) const final
+                                           bool high_accuracy, std::uint32_t vector_size) const final
     {
         if constexpr (func_has_taylor_c_diff_func_dbl_v<T>) {
-            return m_value.taylor_c_diff_func_dbl(s, n_uvars, batch_size, high_accuracy);
+            return m_value.taylor_c_diff_func_dbl(s, n_uvars, batch_size, high_accuracy, vector_size);
         } else {
             throw not_implemented_error("double Taylor diff in compact mode is not implemented for the function '"
                                         + get_name() + "'");
         }
     }
     llvm::Function *taylor_c_diff_func_ldbl(llvm_state &s, std::uint32_t n_uvars, std::uint32_t batch_size,
-                                            bool high_accuracy) const final
+                                            bool high_accuracy, std::uint32_t vector_size) const final
     {
         if constexpr (func_has_taylor_c_diff_func_ldbl_v<T>) {
-            return m_value.taylor_c_diff_func_ldbl(s, n_uvars, batch_size, high_accuracy);
+            return m_value.taylor_c_diff_func_ldbl(s, n_uvars, batch_size, high_accuracy, vector_size);
         } else {
             throw not_implemented_error("long double Taylor diff in compact mode is not implemented for the function '"
                                         + get_name() + "'");
@@ -614,10 +617,10 @@ struct HEYOKA_DLL_PUBLIC_INLINE_CLASS func_inner final : func_inner_base {
     }
 #if defined(HEYOKA_HAVE_REAL128)
     llvm::Function *taylor_c_diff_func_f128(llvm_state &s, std::uint32_t n_uvars, std::uint32_t batch_size,
-                                            bool high_accuracy) const final
+                                            bool high_accuracy, std::uint32_t vector_size) const final
     {
         if constexpr (func_has_taylor_c_diff_func_f128_v<T>) {
-            return m_value.taylor_c_diff_func_f128(s, n_uvars, batch_size, high_accuracy);
+            return m_value.taylor_c_diff_func_f128(s, n_uvars, batch_size, high_accuracy, vector_size);
         } else {
             throw not_implemented_error("float128 Taylor diff in compact mode is not implemented for the function '"
                                         + get_name() + "'");
@@ -784,10 +787,10 @@ class HEYOKA_DLL_PUBLIC func
                                   llvm::Value *, llvm::Value *, std::uint32_t, std::uint32_t, std::uint32_t,
                                   std::uint32_t, bool) const;
 #endif
-    llvm::Function *taylor_c_diff_func_dbl(llvm_state &, std::uint32_t, std::uint32_t, bool) const;
-    llvm::Function *taylor_c_diff_func_ldbl(llvm_state &, std::uint32_t, std::uint32_t, bool) const;
+    llvm::Function *taylor_c_diff_func_dbl(llvm_state &, std::uint32_t, std::uint32_t, bool, std::uint32_t) const;
+    llvm::Function *taylor_c_diff_func_ldbl(llvm_state &, std::uint32_t, std::uint32_t, bool, std::uint32_t) const;
 #if defined(HEYOKA_HAVE_REAL128)
-    llvm::Function *taylor_c_diff_func_f128(llvm_state &, std::uint32_t, std::uint32_t, bool) const;
+    llvm::Function *taylor_c_diff_func_f128(llvm_state &, std::uint32_t, std::uint32_t, bool, std::uint32_t) const;
 #endif
 };
 
diff --git a/include/heyoka/math/binary_op.hpp b/include/heyoka/math/binary_op.hpp
index 022f3ee87..c58c88516 100644
--- a/include/heyoka/math/binary_op.hpp
+++ b/include/heyoka/math/binary_op.hpp
@@ -94,10 +94,10 @@ class HEYOKA_DLL_PUBLIC binary_op : public func_base
                                   std::uint32_t, bool) const;
 #endif
 
-    llvm::Function *taylor_c_diff_func_dbl(llvm_state &, std::uint32_t, std::uint32_t, bool) const;
-    llvm::Function *taylor_c_diff_func_ldbl(llvm_state &, std::uint32_t, std::uint32_t, bool) const;
+    llvm::Function *taylor_c_diff_func_dbl(llvm_state &, std::uint32_t, std::uint32_t, bool, std::uint32_t) const;
+    llvm::Function *taylor_c_diff_func_ldbl(llvm_state &, std::uint32_t, std::uint32_t, bool, std::uint32_t) const;
 #if defined(HEYOKA_HAVE_REAL128)
-    llvm::Function *taylor_c_diff_func_f128(llvm_state &, std::uint32_t, std::uint32_t, bool) const;
+    llvm::Function *taylor_c_diff_func_f128(llvm_state &, std::uint32_t, std::uint32_t, bool, std::uint32_t) const;
 #endif
 };
 
diff --git a/include/heyoka/math/pow.hpp b/include/heyoka/math/pow.hpp
index fa8a0094a..5cac894e5 100644
--- a/include/heyoka/math/pow.hpp
+++ b/include/heyoka/math/pow.hpp
@@ -82,10 +82,10 @@ class HEYOKA_DLL_PUBLIC pow_impl : public func_base
                                   std::uint32_t, bool) const;
 #endif
 
-    llvm::Function *taylor_c_diff_func_dbl(llvm_state &, std::uint32_t, std::uint32_t, bool) const;
-    llvm::Function *taylor_c_diff_func_ldbl(llvm_state &, std::uint32_t, std::uint32_t, bool) const;
+    llvm::Function *taylor_c_diff_func_dbl(llvm_state &, std::uint32_t, std::uint32_t, bool, std::uint32_t) const;
+    llvm::Function *taylor_c_diff_func_ldbl(llvm_state &, std::uint32_t, std::uint32_t, bool, std::uint32_t) const;
 #if defined(HEYOKA_HAVE_REAL128)
-    llvm::Function *taylor_c_diff_func_f128(llvm_state &, std::uint32_t, std::uint32_t, bool) const;
+    llvm::Function *taylor_c_diff_func_f128(llvm_state &, std::uint32_t, std::uint32_t, bool, std::uint32_t) const;
 #endif
 };
 
diff --git a/include/heyoka/math/sum.hpp b/include/heyoka/math/sum.hpp
index 352ec851a..df61cfdd0 100644
--- a/include/heyoka/math/sum.hpp
+++ b/include/heyoka/math/sum.hpp
@@ -54,10 +54,10 @@ class HEYOKA_DLL_PUBLIC sum_impl : public func_base
                                   llvm::Value *, llvm::Value *, std::uint32_t, std::uint32_t, std::uint32_t,
                                   std::uint32_t, bool) const;
 #endif
-    llvm::Function *taylor_c_diff_func_dbl(llvm_state &, std::uint32_t, std::uint32_t, bool) const;
-    llvm::Function *taylor_c_diff_func_ldbl(llvm_state &, std::uint32_t, std::uint32_t, bool) const;
+    llvm::Function *taylor_c_diff_func_dbl(llvm_state &, std::uint32_t, std::uint32_t, bool, std::uint32_t) const;
+    llvm::Function *taylor_c_diff_func_ldbl(llvm_state &, std::uint32_t, std::uint32_t, bool, std::uint32_t) const;
 #if defined(HEYOKA_HAVE_REAL128)
-    llvm::Function *taylor_c_diff_func_f128(llvm_state &, std::uint32_t, std::uint32_t, bool) const;
+    llvm::Function *taylor_c_diff_func_f128(llvm_state &, std::uint32_t, std::uint32_t, bool, std::uint32_t) const;
 #endif
 };
 
diff --git a/include/heyoka/math/sum_sq.hpp b/include/heyoka/math/sum_sq.hpp
index 067d4e16f..d4a0289ca 100644
--- a/include/heyoka/math/sum_sq.hpp
+++ b/include/heyoka/math/sum_sq.hpp
@@ -61,10 +61,10 @@ class HEYOKA_DLL_PUBLIC sum_sq_impl : public func_base
                                   std::uint32_t, bool) const;
 #endif
 
-    llvm::Function *taylor_c_diff_func_dbl(llvm_state &, std::uint32_t, std::uint32_t, bool) const;
-    llvm::Function *taylor_c_diff_func_ldbl(llvm_state &, std::uint32_t, std::uint32_t, bool) const;
+    llvm::Function *taylor_c_diff_func_dbl(llvm_state &, std::uint32_t, std::uint32_t, bool, std::uint32_t) const;
+    llvm::Function *taylor_c_diff_func_ldbl(llvm_state &, std::uint32_t, std::uint32_t, bool, std::uint32_t) const;
 #if defined(HEYOKA_HAVE_REAL128)
-    llvm::Function *taylor_c_diff_func_f128(llvm_state &, std::uint32_t, std::uint32_t, bool) const;
+    llvm::Function *taylor_c_diff_func_f128(llvm_state &, std::uint32_t, std::uint32_t, bool, std::uint32_t) const;
 #endif
 };
 
diff --git a/include/heyoka/taylor.hpp b/include/heyoka/taylor.hpp
index 1a93f3157..99f2014dc 100644
--- a/include/heyoka/taylor.hpp
+++ b/include/heyoka/taylor.hpp
@@ -103,10 +103,15 @@ llvm::Value *taylor_codegen_numparam(llvm_state &s, const U &n, llvm::Value *par
     }
 }
 
+// TODO: remove the old overloads (the ones without the trailing vector_size argument).
 HEYOKA_DLL_PUBLIC llvm::Value *taylor_c_diff_numparam_codegen(llvm_state &, const number &, llvm::Value *,
                                                               llvm::Value *, std::uint32_t);
+HEYOKA_DLL_PUBLIC llvm::Value *taylor_c_diff_numparam_codegen(llvm_state &, const number &, llvm::Value *,
+                                                              llvm::Value *, std::uint32_t, std::uint32_t);
 HEYOKA_DLL_PUBLIC llvm::Value *taylor_c_diff_numparam_codegen(llvm_state &, const param &, llvm::Value *, llvm::Value *,
                                                               std::uint32_t);
+HEYOKA_DLL_PUBLIC llvm::Value *taylor_c_diff_numparam_codegen(llvm_state &, const param &, llvm::Value *, llvm::Value *,
+                                                              std::uint32_t, std::uint32_t);
 
 HEYOKA_DLL_PUBLIC llvm::Value *taylor_fetch_diff(const std::vector<llvm::Value *> &, std::uint32_t, std::uint32_t,
                                                  std::uint32_t);
@@ -172,6 +177,12 @@ taylor_c_diff_func_name_args(llvm::LLVMContext &c, const std::string &name, std:
     return taylor_c_diff_func_name_args_impl(c, name, val_t, n_uvars, args, n_hidden_deps);
 }
 
+// TODO: remove taylor_c_diff_func_name_args() once it is no longer used, then rename this function?
+template <typename T>
+HEYOKA_DLL_PUBLIC std::pair<std::string, std::vector<llvm::Type *>>
+taylor_c_diff_vfunc_name_args(llvm::LLVMContext &, const std::string &, std::uint32_t, std::uint32_t, std::uint32_t,
+                              const std::vector<std::variant<variable, number, param>> &, std::uint32_t = 0);
+
 // Add a function for computing the dense output
 // via polynomial evaluation.
 template <typename T>
diff --git a/src/detail/llvm_helpers.cpp b/src/detail/llvm_helpers.cpp
index 92fca177f..9698e82da 100644
--- a/src/detail/llvm_helpers.cpp
+++ b/src/detail/llvm_helpers.cpp
@@ -276,12 +276,16 @@ void store_vector_to_memory(ir_builder &builder, llvm::Value *ptr, llvm::Value *
     }
 }
 
-// Gather a vector of type vec_tp from the vector of pointers ptrs.
+// Gather a vector of type vec_tp from ptrs. If vec_tp is a vector type, then ptrs
+// must be a vector of pointers of the same size and the returned value is also a vector
+// of that size. Otherwise, ptrs must be a single scalar pointer and the returned value is a scalar.
 llvm::Value *gather_vector_from_memory(ir_builder &builder, llvm::Type *vec_tp, llvm::Value *ptrs)
 {
     if (llvm::isa<llvm_vector_type>(vec_tp)) {
         // LCOV_EXCL_START
         assert(llvm::isa<llvm_vector_type>(ptrs->getType()));
+        assert(llvm::cast<llvm_vector_type>(vec_tp)->getNumElements()
+               == llvm::cast<llvm_vector_type>(ptrs->getType())->getNumElements());
         assert(ptrs->getType()->getScalarType()->getPointerElementType() == vec_tp->getScalarType());
         // LCOV_EXCL_STOP
 
@@ -311,11 +315,48 @@ llvm::Value *gather_vector_from_memory(ir_builder &builder, llvm::Type *vec_tp,
     }
 }
 
+// Scatter val to ptrs. If ptrs is a vector of pointers, then val must be a vector with the same number
+// of elements and matching scalar type, and a vector scatter takes place via CreateMaskedScatter().
+// Otherwise, ptrs must be a single scalar pointer to the type of val and a scalar store takes place.
+void scatter_vector_to_memory(ir_builder &builder, llvm::Value *val, llvm::Value *ptrs)
+{
+    if (llvm::isa<llvm_vector_type>(ptrs->getType())) {
+        // LCOV_EXCL_START
+        assert(llvm::isa<llvm_vector_type>(val->getType()));
+        assert(llvm::cast<llvm_vector_type>(val->getType())->getNumElements()
+               == llvm::cast<llvm_vector_type>(ptrs->getType())->getNumElements());
+        assert(val->getType()->getScalarType() == ptrs->getType()->getScalarType()->getPointerElementType());
+        // LCOV_EXCL_STOP
+
+        // Fetch the alignment of the scalar type (passed as an unsigned on LLVM 10, as an llvm::Align otherwise).
+        const auto align = get_alignment(*builder.GetInsertBlock()->getModule(), val->getType()->getScalarType());
+
+        builder.CreateMaskedScatter(val, ptrs,
+#if LLVM_VERSION_MAJOR == 10
+                                    boost::numeric_cast<unsigned>(align)
+#else
+                                    llvm::Align(align)
+#endif
+        );
+    } else {
+        // LCOV_EXCL_START
+        assert(!llvm::isa<llvm_vector_type>(val->getType()));
+        assert(ptrs->getType()->getPointerElementType() == val->getType());
+        // LCOV_EXCL_STOP
+
+        // ptrs is a single scalar pointer: store val through it directly.
+        builder.CreateStore(val, ptrs);
+    }
+}
+
 // Create a SIMD vector of size vector_size filled with the value c. If vector_size is 1,
 // c will be returned.
 llvm::Value *vector_splat(ir_builder &builder, llvm::Value *c, std::uint32_t vector_size)
 {
+    // LCOV_EXCL_START
     assert(vector_size > 0u);
+    assert(!llvm::isa<llvm_vector_type>(c->getType()));
+    // LCOV_EXCL_STOP
 
     if (vector_size == 1u) {
         return c;
@@ -326,15 +367,18 @@ llvm::Value *vector_splat(ir_builder &builder, llvm::Value *c, std::uint32_t vec
 
 llvm::Type *make_vector_type(llvm::Type *t, std::uint32_t vector_size)
 {
+    // LCOV_EXCL_START
     assert(t != nullptr);
     assert(vector_size > 0u);
+    assert(!llvm::isa<llvm_vector_type>(t));
+    // LCOV_EXCL_STOP
 
     if (vector_size == 1u) {
         return t;
     } else {
         auto retval = llvm_vector_type::get(t, boost::numeric_cast<unsigned>(vector_size));
 
-        assert(retval != nullptr);
+        assert(retval != nullptr); // LCOV_EXCL_LINE
 
         return retval;
     }
@@ -1380,8 +1424,7 @@ llvm::Function *llvm_add_csc_impl(llvm_state &s, llvm::Type *scal_t, std::uint32
                                           vector_splat(builder, builder.getInt32(batch_size), batch_size)));
             assert(llvm_depr_GEP_type_check(cf_ptr_v, scal_t)); // LCOV_EXCL_LINE
             auto last_nz_ptr = builder.CreateInBoundsGEP(scal_t, cf_ptr_v, last_nz_ptr_idx);
-            auto last_nz_cf = batch_size > 1u ? gather_vector_from_memory(builder, cur_cf->getType(), last_nz_ptr)
-                                              : static_cast<llvm::Value *>(builder.CreateLoad(scal_t, last_nz_ptr));
+            auto last_nz_cf = gather_vector_from_memory(builder, cur_cf->getType(), last_nz_ptr);
 
             // Compute the sign of the current coefficient(s).
             auto cur_sgn = llvm_sgn(s, cur_cf);
diff --git a/src/expression.cpp b/src/expression.cpp
index 3548f07be..efca092c6 100644
--- a/src/expression.cpp
+++ b/src/expression.cpp
@@ -40,6 +40,7 @@
 
 #include <heyoka/detail/llvm_fwd.hpp>
 #include <heyoka/detail/type_traits.hpp>
+#include <heyoka/detail/visibility.hpp>
 #include <heyoka/exceptions.hpp>
 #include <heyoka/expression.hpp>
 #include <heyoka/func.hpp>
@@ -1336,24 +1337,18 @@ llvm::Value *taylor_diff_f128(llvm_state &s, const expression &ex, const std::ve
 
 #endif
 
-namespace detail
-{
-
-namespace
-{
-
 template <typename T>
-llvm::Function *taylor_c_diff_func_impl(llvm_state &s, const expression &ex, std::uint32_t n_uvars,
-                                        std::uint32_t batch_size, bool high_accuracy)
+llvm::Function *taylor_c_diff_func(llvm_state &s, const expression &ex, std::uint32_t n_uvars, std::uint32_t batch_size,
+                                   bool high_accuracy, std::uint32_t vector_size)
 {
     if (auto fptr = std::get_if<func>(&ex.value())) {
         if constexpr (std::is_same_v<T, double>) {
-            return fptr->taylor_c_diff_func_dbl(s, n_uvars, batch_size, high_accuracy);
+            return fptr->taylor_c_diff_func_dbl(s, n_uvars, batch_size, high_accuracy, vector_size);
         } else if constexpr (std::is_same_v<T, long double>) {
-            return fptr->taylor_c_diff_func_ldbl(s, n_uvars, batch_size, high_accuracy);
+            return fptr->taylor_c_diff_func_ldbl(s, n_uvars, batch_size, high_accuracy, vector_size);
 #if defined(HEYOKA_HAVE_REAL128)
         } else if constexpr (std::is_same_v<T, mppp::real128>) {
-            return fptr->taylor_c_diff_func_f128(s, n_uvars, batch_size, high_accuracy);
+            return fptr->taylor_c_diff_func_f128(s, n_uvars, batch_size, high_accuracy, vector_size);
 #endif
         } else {
             static_assert(detail::always_false_v<T>, "Unhandled type.");
@@ -1365,29 +1360,16 @@ llvm::Function *taylor_c_diff_func_impl(llvm_state &s, const expression &ex, std
     }
 }
 
-} // namespace
-
-} // namespace detail
-
-llvm::Function *taylor_c_diff_func_dbl(llvm_state &s, const expression &ex, std::uint32_t n_uvars,
-                                       std::uint32_t batch_size, bool high_accuracy)
-{
-    return detail::taylor_c_diff_func_impl<double>(s, ex, n_uvars, batch_size, high_accuracy);
-}
+template HEYOKA_DLL_PUBLIC llvm::Function *taylor_c_diff_func<double>(llvm_state &, const expression &, std::uint32_t,
+                                                                      std::uint32_t, bool, std::uint32_t);
 
-llvm::Function *taylor_c_diff_func_ldbl(llvm_state &s, const expression &ex, std::uint32_t n_uvars,
-                                        std::uint32_t batch_size, bool high_accuracy)
-{
-    return detail::taylor_c_diff_func_impl<long double>(s, ex, n_uvars, batch_size, high_accuracy);
-}
+template HEYOKA_DLL_PUBLIC llvm::Function *
+taylor_c_diff_func<long double>(llvm_state &, const expression &, std::uint32_t, std::uint32_t, bool, std::uint32_t);
 
 #if defined(HEYOKA_HAVE_REAL128)
 
-llvm::Function *taylor_c_diff_func_f128(llvm_state &s, const expression &ex, std::uint32_t n_uvars,
-                                        std::uint32_t batch_size, bool high_accuracy)
-{
-    return detail::taylor_c_diff_func_impl<mppp::real128>(s, ex, n_uvars, batch_size, high_accuracy);
-}
+template HEYOKA_DLL_PUBLIC llvm::Function *
+taylor_c_diff_func<mppp::real128>(llvm_state &, const expression &, std::uint32_t, std::uint32_t, bool, std::uint32_t);
 
 #endif
 
diff --git a/src/func.cpp b/src/func.cpp
index 4ca47a234..82e1d2d61 100644
--- a/src/func.cpp
+++ b/src/func.cpp
@@ -603,7 +603,7 @@ llvm::Value *func::taylor_diff_f128(llvm_state &s, const std::vector<std::uint32
 #endif
 
 llvm::Function *func::taylor_c_diff_func_dbl(llvm_state &s, std::uint32_t n_uvars, std::uint32_t batch_size,
-                                             bool high_accuracy) const
+                                             bool high_accuracy, std::uint32_t vector_size) const
 {
     if (batch_size == 0u) {
         throw std::invalid_argument(
@@ -616,7 +616,12 @@ llvm::Function *func::taylor_c_diff_func_dbl(llvm_state &s, std::uint32_t n_uvar
                 get_name()));
     }
 
-    auto retval = ptr()->taylor_c_diff_func_dbl(s, n_uvars, batch_size, high_accuracy);
+    if (vector_size == 0u || (vector_size > 1u && batch_size > 1u)) {
+        throw std::invalid_argument(fmt::format(
+            "Invalid vector_size detected in func::taylor_c_diff_func_dbl() for the function '{}'", get_name()));
+    }
+
+    auto retval = ptr()->taylor_c_diff_func_dbl(s, n_uvars, batch_size, high_accuracy, vector_size);
 
     if (retval == nullptr) {
         throw std::invalid_argument(
@@ -627,7 +632,7 @@ llvm::Function *func::taylor_c_diff_func_dbl(llvm_state &s, std::uint32_t n_uvar
 }
 
 llvm::Function *func::taylor_c_diff_func_ldbl(llvm_state &s, std::uint32_t n_uvars, std::uint32_t batch_size,
-                                              bool high_accuracy) const
+                                              bool high_accuracy, std::uint32_t vector_size) const
 {
     if (batch_size == 0u) {
         throw std::invalid_argument(
@@ -640,7 +645,12 @@ llvm::Function *func::taylor_c_diff_func_ldbl(llvm_state &s, std::uint32_t n_uva
                 get_name()));
     }
 
-    auto retval = ptr()->taylor_c_diff_func_ldbl(s, n_uvars, batch_size, high_accuracy);
+    if (vector_size == 0u || (vector_size > 1u && batch_size > 1u)) {
+        throw std::invalid_argument(fmt::format(
+            "Invalid vector_size detected in func::taylor_c_diff_func_ldbl() for the function '{}'", get_name()));
+    }
+
+    auto retval = ptr()->taylor_c_diff_func_ldbl(s, n_uvars, batch_size, high_accuracy, vector_size);
 
     if (retval == nullptr) {
         throw std::invalid_argument(
@@ -653,7 +663,7 @@ llvm::Function *func::taylor_c_diff_func_ldbl(llvm_state &s, std::uint32_t n_uva
 #if defined(HEYOKA_HAVE_REAL128)
 
 llvm::Function *func::taylor_c_diff_func_f128(llvm_state &s, std::uint32_t n_uvars, std::uint32_t batch_size,
-                                              bool high_accuracy) const
+                                              bool high_accuracy, std::uint32_t vector_size) const
 {
     if (batch_size == 0u) {
         throw std::invalid_argument(
@@ -666,7 +676,12 @@ llvm::Function *func::taylor_c_diff_func_f128(llvm_state &s, std::uint32_t n_uva
                 get_name()));
     }
 
-    auto retval = ptr()->taylor_c_diff_func_f128(s, n_uvars, batch_size, high_accuracy);
+    if (vector_size == 0u || (vector_size > 1u && batch_size > 1u)) {
+        throw std::invalid_argument(fmt::format(
+            "Invalid vector_size detected in func::taylor_c_diff_func_f128() for the function '{}'", get_name()));
+    }
+
+    auto retval = ptr()->taylor_c_diff_func_f128(s, n_uvars, batch_size, high_accuracy, vector_size);
 
     if (retval == nullptr) {
         throw std::invalid_argument(
diff --git a/src/math/binary_op.cpp b/src/math/binary_op.cpp
index 059aab85e..b11e30c82 100644
--- a/src/math/binary_op.cpp
+++ b/src/math/binary_op.cpp
@@ -26,6 +26,7 @@
 #include <fmt/format.h>
 
 #include <llvm/IR/BasicBlock.h>
+#include <llvm/IR/Constants.h>
 #include <llvm/IR/DerivedTypes.h>
 #include <llvm/IR/Function.h>
 #include <llvm/IR/IRBuilder.h>
@@ -619,17 +620,19 @@ namespace
 template <typename T, typename U, typename V>
 llvm::Function *bo_taylor_c_diff_func_num_num(llvm_state &s, const binary_op &bo, const U &n0, const V &n1,
                                               std::uint32_t n_uvars, std::uint32_t batch_size,
-                                              const std::string &op_name)
+                                              const std::string &op_name, std::uint32_t vector_size)
 {
+    assert(vector_size == 1u || batch_size == 1u); // LCOV_EXCL_LINE
+
     auto &module = s.module();
     auto &builder = s.builder();
     auto &context = s.context();
 
-    // Fetch the floating-point type.
-    auto val_t = to_llvm_vector_type<T>(context, batch_size);
+    // Fetch the return type.
+    auto val_t = to_llvm_vector_type<T>(context, vector_size > 1u ? vector_size : batch_size);
 
     // Fetch the function name and arguments.
-    const auto na_pair = taylor_c_diff_func_name_args<T>(context, op_name, n_uvars, batch_size, {n0, n1});
+    const auto na_pair = taylor_c_diff_vfunc_name_args<T>(context, op_name, n_uvars, batch_size, vector_size, {n0, n1});
     const auto &fname = na_pair.first;
     const auto &fargs = na_pair.second;
 
@@ -664,8 +667,8 @@ llvm::Function *bo_taylor_c_diff_func_num_num(llvm_state &s, const binary_op &bo
             s, builder.CreateICmpEQ(ord, builder.getInt32(0)),
             [&]() {
                 // If the order is zero, run the codegen.
-                auto vnum0 = taylor_c_diff_numparam_codegen(s, n0, num0, par_ptr, batch_size);
-                auto vnum1 = taylor_c_diff_numparam_codegen(s, n1, num1, par_ptr, batch_size);
+                auto vnum0 = taylor_c_diff_numparam_codegen(s, n0, num0, par_ptr, batch_size, vector_size);
+                auto vnum1 = taylor_c_diff_numparam_codegen(s, n1, num1, par_ptr, batch_size, vector_size);
 
                 switch (bo.op()) {
                     case binary_op::type::add:
@@ -683,7 +686,7 @@ llvm::Function *bo_taylor_c_diff_func_num_num(llvm_state &s, const binary_op &bo
             },
             [&]() {
                 // Otherwise, return zero.
-                builder.CreateStore(vector_splat(builder, codegen<T>(s, number{0.}), batch_size), retval);
+                builder.CreateStore(llvm::ConstantFP::get(val_t, 0.), retval);
             });
 
         // Return the result.
@@ -712,26 +715,29 @@ llvm::Function *bo_taylor_c_diff_func_num_num(llvm_state &s, const binary_op &bo
 template <bool AddOrSub, typename T, typename U, typename V,
           std::enable_if_t<std::conjunction_v<is_num_param<U>, is_num_param<V>>, int> = 0>
 llvm::Function *bo_taylor_c_diff_func_addsub_impl(llvm_state &s, const binary_op &bo, const U &num0, const V &num1,
-                                                  std::uint32_t n_uvars, std::uint32_t batch_size)
+                                                  std::uint32_t n_uvars, std::uint32_t batch_size,
+                                                  std::uint32_t vector_size)
 {
-    return bo_taylor_c_diff_func_num_num<T>(s, bo, num0, num1, n_uvars, batch_size, AddOrSub ? "add" : "sub");
+    return bo_taylor_c_diff_func_num_num<T>(s, bo, num0, num1, n_uvars, batch_size, AddOrSub ? "add" : "sub",
+                                            vector_size);
 }
 
 // Derivative of number +- var.
 template <bool AddOrSub, typename T, typename U, std::enable_if_t<is_num_param_v<U>, int> = 0>
 llvm::Function *bo_taylor_c_diff_func_addsub_impl(llvm_state &s, const binary_op &, const U &n, const variable &var,
-                                                  std::uint32_t n_uvars, std::uint32_t batch_size)
+                                                  std::uint32_t n_uvars, std::uint32_t batch_size,
+                                                  std::uint32_t vector_size)
 {
     auto &module = s.module();
     auto &builder = s.builder();
     auto &context = s.context();
 
-    // Fetch the floating-point type.
-    auto val_t = to_llvm_vector_type<T>(context, batch_size);
+    // Fetch the return type.
+    auto val_t = to_llvm_vector_type<T>(context, vector_size > 1u ? vector_size : batch_size);
 
     // Fetch the function name and arguments.
-    const auto na_pair
-        = taylor_c_diff_func_name_args<T>(context, AddOrSub ? "add" : "sub", n_uvars, batch_size, {n, var});
+    const auto na_pair = taylor_c_diff_vfunc_name_args<T>(context, AddOrSub ? "add" : "sub", n_uvars, batch_size,
+                                                          vector_size, {n, var});
     const auto &fname = na_pair.first;
     const auto &fargs = na_pair.second;
 
@@ -767,7 +773,7 @@ llvm::Function *bo_taylor_c_diff_func_addsub_impl(llvm_state &s, const binary_op
             s, builder.CreateICmpEQ(order, builder.getInt32(0)),
             [&]() {
                 // For order zero, run the codegen.
-                auto num_vec = taylor_c_diff_numparam_codegen(s, n, num, par_ptr, batch_size);
+                auto num_vec = taylor_c_diff_numparam_codegen(s, n, num, par_ptr, batch_size, vector_size);
                 auto ret = taylor_c_load_diff(s, diff_arr, n_uvars, builder.getInt32(0), var_idx);
 
                 builder.CreateStore(AddOrSub ? builder.CreateFAdd(num_vec, ret) : builder.CreateFSub(num_vec, ret),
@@ -810,18 +816,19 @@ llvm::Function *bo_taylor_c_diff_func_addsub_impl(llvm_state &s, const binary_op
 // Derivative of var +- number.
 template <bool AddOrSub, typename T, typename U, std::enable_if_t<is_num_param_v<U>, int> = 0>
 llvm::Function *bo_taylor_c_diff_func_addsub_impl(llvm_state &s, const binary_op &, const variable &var, const U &n,
-                                                  std::uint32_t n_uvars, std::uint32_t batch_size)
+                                                  std::uint32_t n_uvars, std::uint32_t batch_size,
+                                                  std::uint32_t vector_size)
 {
     auto &module = s.module();
     auto &builder = s.builder();
     auto &context = s.context();
 
-    // Fetch the floating-point type.
-    auto val_t = to_llvm_vector_type<T>(context, batch_size);
+    // Fetch the return type, sized by vector_size when it is > 1 and by batch_size otherwise.
+    auto val_t = to_llvm_vector_type<T>(context, vector_size > 1u ? vector_size : batch_size);
 
     // Fetch the function name and arguments.
-    const auto na_pair
-        = taylor_c_diff_func_name_args<T>(context, AddOrSub ? "add" : "sub", n_uvars, batch_size, {var, n});
+    const auto na_pair = taylor_c_diff_vfunc_name_args<T>(context, AddOrSub ? "add" : "sub", n_uvars, batch_size,
+                                                          vector_size, {var, n});
     const auto &fname = na_pair.first;
     const auto &fargs = na_pair.second;
 
@@ -858,7 +865,7 @@ llvm::Function *bo_taylor_c_diff_func_addsub_impl(llvm_state &s, const binary_op
             [&]() {
                 // For order zero, run the codegen.
                 auto ret = taylor_c_load_diff(s, diff_arr, n_uvars, builder.getInt32(0), var_idx);
-                auto num_vec = taylor_c_diff_numparam_codegen(s, n, num, par_ptr, batch_size);
+                auto num_vec = taylor_c_diff_numparam_codegen(s, n, num, par_ptr, batch_size, vector_size);
 
                 builder.CreateStore(AddOrSub ? builder.CreateFAdd(ret, num_vec) : builder.CreateFSub(ret, num_vec),
                                     retval);
@@ -893,18 +900,19 @@ llvm::Function *bo_taylor_c_diff_func_addsub_impl(llvm_state &s, const binary_op
 // Derivative of var +- var.
 template <bool AddOrSub, typename T>
 llvm::Function *bo_taylor_c_diff_func_addsub_impl(llvm_state &s, const binary_op &, const variable &var0,
-                                                  const variable &var1, std::uint32_t n_uvars, std::uint32_t batch_size)
+                                                  const variable &var1, std::uint32_t n_uvars, std::uint32_t batch_size,
+                                                  std::uint32_t vector_size)
 {
     auto &module = s.module();
     auto &builder = s.builder();
     auto &context = s.context();
 
-    // Fetch the floating-point type.
-    auto val_t = to_llvm_vector_type<T>(context, batch_size);
+    // Fetch the return type, sized by vector_size when it is > 1 and by batch_size otherwise.
+    auto val_t = to_llvm_vector_type<T>(context, vector_size > 1u ? vector_size : batch_size);
 
     // Fetch the function name and arguments.
-    const auto na_pair
-        = taylor_c_diff_func_name_args<T>(context, AddOrSub ? "add" : "sub", n_uvars, batch_size, {var0, var1});
+    const auto na_pair = taylor_c_diff_vfunc_name_args<T>(context, AddOrSub ? "add" : "sub", n_uvars, batch_size,
+                                                          vector_size, {var0, var1});
     const auto &fname = na_pair.first;
     const auto &fargs = na_pair.second;
 
@@ -966,7 +974,7 @@ llvm::Function *bo_taylor_c_diff_func_addsub_impl(llvm_state &s, const binary_op
 template <bool, typename, typename V1, typename V2,
           std::enable_if_t<!std::conjunction_v<is_num_param<V1>, is_num_param<V2>>, int> = 0>
 llvm::Function *bo_taylor_c_diff_func_addsub_impl(llvm_state &, const binary_op &, const V1 &, const V2 &,
-                                                  std::uint32_t, std::uint32_t)
+                                                  std::uint32_t, std::uint32_t, std::uint32_t)
 {
     throw std::invalid_argument("An invalid argument type was encountered while trying to build the Taylor derivative "
                                 "of add()/sub() in compact mode");
@@ -975,22 +983,22 @@ llvm::Function *bo_taylor_c_diff_func_addsub_impl(llvm_state &, const binary_op
 
 template <typename T>
 llvm::Function *bo_taylor_c_diff_func_add(llvm_state &s, const binary_op &bo, std::uint32_t n_uvars,
-                                          std::uint32_t batch_size)
+                                          std::uint32_t batch_size, std::uint32_t vector_size)
 {
     return std::visit(
         [&](const auto &v1, const auto &v2) {
-            return bo_taylor_c_diff_func_addsub_impl<true, T>(s, bo, v1, v2, n_uvars, batch_size);
+            return bo_taylor_c_diff_func_addsub_impl<true, T>(s, bo, v1, v2, n_uvars, batch_size, vector_size);
         },
         bo.lhs().value(), bo.rhs().value());
 }
 
 template <typename T>
 llvm::Function *bo_taylor_c_diff_func_sub(llvm_state &s, const binary_op &bo, std::uint32_t n_uvars,
-                                          std::uint32_t batch_size)
+                                          std::uint32_t batch_size, std::uint32_t vector_size)
 {
     return std::visit(
         [&](const auto &v1, const auto &v2) {
-            return bo_taylor_c_diff_func_addsub_impl<false, T>(s, bo, v1, v2, n_uvars, batch_size);
+            return bo_taylor_c_diff_func_addsub_impl<false, T>(s, bo, v1, v2, n_uvars, batch_size, vector_size);
         },
         bo.lhs().value(), bo.rhs().value());
 }
@@ -999,25 +1007,27 @@ llvm::Function *bo_taylor_c_diff_func_sub(llvm_state &s, const binary_op &bo, st
 template <typename T, typename U, typename V,
           std::enable_if_t<std::conjunction_v<is_num_param<U>, is_num_param<V>>, int> = 0>
 llvm::Function *bo_taylor_c_diff_func_mul_impl(llvm_state &s, const binary_op &bo, const U &num0, const V &num1,
-                                               std::uint32_t n_uvars, std::uint32_t batch_size)
+                                               std::uint32_t n_uvars, std::uint32_t batch_size,
+                                               std::uint32_t vector_size)
 {
-    return bo_taylor_c_diff_func_num_num<T>(s, bo, num0, num1, n_uvars, batch_size, "mul");
+    return bo_taylor_c_diff_func_num_num<T>(s, bo, num0, num1, n_uvars, batch_size, "mul", vector_size);
 }
 
 // Derivative of var * number.
 template <typename T, typename U, std::enable_if_t<is_num_param_v<U>, int> = 0>
 llvm::Function *bo_taylor_c_diff_func_mul_impl(llvm_state &s, const binary_op &, const variable &var, const U &n,
-                                               std::uint32_t n_uvars, std::uint32_t batch_size)
+                                               std::uint32_t n_uvars, std::uint32_t batch_size,
+                                               std::uint32_t vector_size)
 {
     auto &module = s.module();
     auto &builder = s.builder();
     auto &context = s.context();
 
-    // Fetch the floating-point type.
-    auto val_t = to_llvm_vector_type<T>(context, batch_size);
+    // Fetch the return type, sized by vector_size when it is > 1 and by batch_size otherwise.
+    auto val_t = to_llvm_vector_type<T>(context, vector_size > 1u ? vector_size : batch_size);
 
     // Fetch the function name and arguments.
-    const auto na_pair = taylor_c_diff_func_name_args<T>(context, "mul", n_uvars, batch_size, {var, n});
+    const auto na_pair = taylor_c_diff_vfunc_name_args<T>(context, "mul", n_uvars, batch_size, vector_size, {var, n});
     const auto &fname = na_pair.first;
     const auto &fargs = na_pair.second;
 
@@ -1050,7 +1060,8 @@ llvm::Function *bo_taylor_c_diff_func_mul_impl(llvm_state &s, const binary_op &,
         auto ret = taylor_c_load_diff(s, diff_arr, n_uvars, order, var_idx);
 
         // Create the return value.
-        builder.CreateRet(builder.CreateFMul(ret, taylor_c_diff_numparam_codegen(s, n, num, par_ptr, batch_size)));
+        builder.CreateRet(
+            builder.CreateFMul(ret, taylor_c_diff_numparam_codegen(s, n, num, par_ptr, batch_size, vector_size)));
 
         // Verify.
         s.verify_function(f);
@@ -1074,17 +1085,18 @@ llvm::Function *bo_taylor_c_diff_func_mul_impl(llvm_state &s, const binary_op &,
 // Derivative of number * var.
 template <typename T, typename U, std::enable_if_t<is_num_param_v<U>, int> = 0>
 llvm::Function *bo_taylor_c_diff_func_mul_impl(llvm_state &s, const binary_op &, const U &n, const variable &var,
-                                               std::uint32_t n_uvars, std::uint32_t batch_size)
+                                               std::uint32_t n_uvars, std::uint32_t batch_size,
+                                               std::uint32_t vector_size)
 {
     auto &module = s.module();
     auto &builder = s.builder();
     auto &context = s.context();
 
-    // Fetch the floating-point type.
-    auto val_t = to_llvm_vector_type<T>(context, batch_size);
+    // Fetch the return type, sized by vector_size when it is > 1 and by batch_size otherwise.
+    auto val_t = to_llvm_vector_type<T>(context, vector_size > 1u ? vector_size : batch_size);
 
     // Fetch the function name and arguments.
-    const auto na_pair = taylor_c_diff_func_name_args<T>(context, "mul", n_uvars, batch_size, {n, var});
+    const auto na_pair = taylor_c_diff_vfunc_name_args<T>(context, "mul", n_uvars, batch_size, vector_size, {n, var});
     const auto &fname = na_pair.first;
     const auto &fargs = na_pair.second;
 
@@ -1117,7 +1129,8 @@ llvm::Function *bo_taylor_c_diff_func_mul_impl(llvm_state &s, const binary_op &,
         auto ret = taylor_c_load_diff(s, diff_arr, n_uvars, order, var_idx);
 
         // Create the return value.
-        builder.CreateRet(builder.CreateFMul(ret, taylor_c_diff_numparam_codegen(s, n, num, par_ptr, batch_size)));
+        builder.CreateRet(
+            builder.CreateFMul(ret, taylor_c_diff_numparam_codegen(s, n, num, par_ptr, batch_size, vector_size)));
 
         // Verify.
         s.verify_function(f);
@@ -1141,17 +1154,19 @@ llvm::Function *bo_taylor_c_diff_func_mul_impl(llvm_state &s, const binary_op &,
 // Derivative of var * var.
 template <typename T>
 llvm::Function *bo_taylor_c_diff_func_mul_impl(llvm_state &s, const binary_op &, const variable &var0,
-                                               const variable &var1, std::uint32_t n_uvars, std::uint32_t batch_size)
+                                               const variable &var1, std::uint32_t n_uvars, std::uint32_t batch_size,
+                                               std::uint32_t vector_size)
 {
     auto &module = s.module();
     auto &builder = s.builder();
     auto &context = s.context();
 
-    // Fetch the floating-point type.
-    auto val_t = to_llvm_vector_type<T>(context, batch_size);
+    // Fetch the return type, sized by vector_size when it is > 1 and by batch_size otherwise.
+    auto val_t = to_llvm_vector_type<T>(context, vector_size > 1u ? vector_size : batch_size);
 
     // Fetch the function name and arguments.
-    const auto na_pair = taylor_c_diff_func_name_args<T>(context, "mul", n_uvars, batch_size, {var0, var1});
+    const auto na_pair
+        = taylor_c_diff_vfunc_name_args<T>(context, "mul", n_uvars, batch_size, vector_size, {var0, var1});
     const auto &fname = na_pair.first;
     const auto &fargs = na_pair.second;
 
@@ -1181,7 +1196,7 @@ llvm::Function *bo_taylor_c_diff_func_mul_impl(llvm_state &s, const binary_op &,
 
         // Create the accumulator.
         auto acc = builder.CreateAlloca(val_t);
-        builder.CreateStore(vector_splat(builder, codegen<T>(s, number{0.}), batch_size), acc);
+        builder.CreateStore(llvm::ConstantFP::get(val_t, 0.), acc);
 
         // Run the loop.
         llvm_loop_u32(s, builder.getInt32(0), builder.CreateAdd(ord, builder.getInt32(1)), [&](llvm::Value *j) {
@@ -1217,7 +1232,7 @@ llvm::Function *bo_taylor_c_diff_func_mul_impl(llvm_state &s, const binary_op &,
 template <typename, typename V1, typename V2,
           std::enable_if_t<!std::conjunction_v<is_num_param<V1>, is_num_param<V2>>, int> = 0>
 llvm::Function *bo_taylor_c_diff_func_mul_impl(llvm_state &, const binary_op &, const V1 &, const V2 &, std::uint32_t,
-                                               std::uint32_t)
+                                               std::uint32_t, std::uint32_t)
 {
     throw std::invalid_argument("An invalid argument type was encountered while trying to build the Taylor derivative "
                                 "of mul() in compact mode");
@@ -1226,11 +1241,11 @@ llvm::Function *bo_taylor_c_diff_func_mul_impl(llvm_state &, const binary_op &,
 
 template <typename T>
 llvm::Function *bo_taylor_c_diff_func_mul(llvm_state &s, const binary_op &bo, std::uint32_t n_uvars,
-                                          std::uint32_t batch_size)
+                                          std::uint32_t batch_size, std::uint32_t vector_size)
 {
     return std::visit(
         [&](const auto &v1, const auto &v2) {
-            return bo_taylor_c_diff_func_mul_impl<T>(s, bo, v1, v2, n_uvars, batch_size);
+            return bo_taylor_c_diff_func_mul_impl<T>(s, bo, v1, v2, n_uvars, batch_size, vector_size);
         },
         bo.lhs().value(), bo.rhs().value());
 }
@@ -1239,25 +1254,27 @@ llvm::Function *bo_taylor_c_diff_func_mul(llvm_state &s, const binary_op &bo, st
 template <typename T, typename U, typename V,
           std::enable_if_t<std::conjunction_v<is_num_param<U>, is_num_param<V>>, int> = 0>
 llvm::Function *bo_taylor_c_diff_func_div_impl(llvm_state &s, const binary_op &bo, const U &num0, const V &num1,
-                                               std::uint32_t n_uvars, std::uint32_t batch_size)
+                                               std::uint32_t n_uvars, std::uint32_t batch_size,
+                                               std::uint32_t vector_size)
 {
-    return bo_taylor_c_diff_func_num_num<T>(s, bo, num0, num1, n_uvars, batch_size, "div");
+    return bo_taylor_c_diff_func_num_num<T>(s, bo, num0, num1, n_uvars, batch_size, "div", vector_size);
 }
 
 // Derivative of var / number.
 template <typename T, typename U, std::enable_if_t<is_num_param_v<U>, int> = 0>
 llvm::Function *bo_taylor_c_diff_func_div_impl(llvm_state &s, const binary_op &, const variable &var, const U &n,
-                                               std::uint32_t n_uvars, std::uint32_t batch_size)
+                                               std::uint32_t n_uvars, std::uint32_t batch_size,
+                                               std::uint32_t vector_size)
 {
     auto &module = s.module();
     auto &builder = s.builder();
     auto &context = s.context();
 
-    // Fetch the floating-point type.
-    auto val_t = to_llvm_vector_type<T>(context, batch_size);
+    // Fetch the return type, sized by vector_size when it is > 1 and by batch_size otherwise.
+    auto val_t = to_llvm_vector_type<T>(context, vector_size > 1u ? vector_size : batch_size);
 
     // Fetch the function name and arguments.
-    const auto na_pair = taylor_c_diff_func_name_args<T>(context, "div", n_uvars, batch_size, {var, n});
+    const auto na_pair = taylor_c_diff_vfunc_name_args<T>(context, "div", n_uvars, batch_size, vector_size, {var, n});
     const auto &fname = na_pair.first;
     const auto &fargs = na_pair.second;
 
@@ -1290,7 +1307,8 @@ llvm::Function *bo_taylor_c_diff_func_div_impl(llvm_state &s, const binary_op &,
         auto ret = taylor_c_load_diff(s, diff_arr, n_uvars, order, var_idx);
 
         // Create the return value.
-        builder.CreateRet(builder.CreateFDiv(ret, taylor_c_diff_numparam_codegen(s, n, num, par_ptr, batch_size)));
+        builder.CreateRet(
+            builder.CreateFDiv(ret, taylor_c_diff_numparam_codegen(s, n, num, par_ptr, batch_size, vector_size)));
 
         // Verify.
         s.verify_function(f);
@@ -1314,17 +1332,18 @@ llvm::Function *bo_taylor_c_diff_func_div_impl(llvm_state &s, const binary_op &,
 // Derivative of number / var.
 template <typename T, typename U, std::enable_if_t<is_num_param_v<U>, int> = 0>
 llvm::Function *bo_taylor_c_diff_func_div_impl(llvm_state &s, const binary_op &, const U &n, const variable &var,
-                                               std::uint32_t n_uvars, std::uint32_t batch_size)
+                                               std::uint32_t n_uvars, std::uint32_t batch_size,
+                                               std::uint32_t vector_size)
 {
     auto &module = s.module();
     auto &builder = s.builder();
     auto &context = s.context();
 
-    // Fetch the floating-point type.
-    auto val_t = to_llvm_vector_type<T>(context, batch_size);
+    // Fetch the return type, sized by vector_size when it is > 1 and by batch_size otherwise.
+    auto val_t = to_llvm_vector_type<T>(context, vector_size > 1u ? vector_size : batch_size);
 
     // Fetch the function name and arguments.
-    const auto na_pair = taylor_c_diff_func_name_args<T>(context, "div", n_uvars, batch_size, {n, var});
+    const auto na_pair = taylor_c_diff_vfunc_name_args<T>(context, "div", n_uvars, batch_size, vector_size, {n, var});
     const auto &fname = na_pair.first;
     const auto &fargs = na_pair.second;
 
@@ -1367,14 +1386,14 @@ llvm::Function *bo_taylor_c_diff_func_div_impl(llvm_state &s, const binary_op &,
             s, builder.CreateICmpEQ(ord, builder.getInt32(0)),
             [&]() {
                 // For order zero, run the codegen.
-                auto num_vec = taylor_c_diff_numparam_codegen(s, n, num, par_ptr, batch_size);
+                auto num_vec = taylor_c_diff_numparam_codegen(s, n, num, par_ptr, batch_size, vector_size);
                 auto ret = taylor_c_load_diff(s, diff_ptr, n_uvars, builder.getInt32(0), var_idx);
 
                 builder.CreateStore(builder.CreateFDiv(num_vec, ret), retval);
             },
             [&]() {
                 // Init the accumulator.
-                builder.CreateStore(vector_splat(builder, codegen<T>(s, number{0.}), batch_size), acc);
+                builder.CreateStore(llvm::ConstantFP::get(val_t, 0.), acc);
 
                 // Run the loop.
                 llvm_loop_u32(s, builder.getInt32(1), builder.CreateAdd(ord, builder.getInt32(1)), [&](llvm::Value *j) {
@@ -1418,17 +1437,19 @@ llvm::Function *bo_taylor_c_diff_func_div_impl(llvm_state &s, const binary_op &,
 // Derivative of var / var.
 template <typename T>
 llvm::Function *bo_taylor_c_diff_func_div_impl(llvm_state &s, const binary_op &, const variable &var0,
-                                               const variable &var1, std::uint32_t n_uvars, std::uint32_t batch_size)
+                                               const variable &var1, std::uint32_t n_uvars, std::uint32_t batch_size,
+                                               std::uint32_t vector_size)
 {
     auto &module = s.module();
     auto &builder = s.builder();
     auto &context = s.context();
 
-    // Fetch the floating-point type.
-    auto val_t = to_llvm_vector_type<T>(context, batch_size);
+    // Fetch the return type, sized by vector_size when it is > 1 and by batch_size otherwise.
+    auto val_t = to_llvm_vector_type<T>(context, vector_size > 1u ? vector_size : batch_size);
 
     // Fetch the function name and arguments.
-    const auto na_pair = taylor_c_diff_func_name_args<T>(context, "div", n_uvars, batch_size, {var0, var1});
+    const auto na_pair
+        = taylor_c_diff_vfunc_name_args<T>(context, "div", n_uvars, batch_size, vector_size, {var0, var1});
     const auto &fname = na_pair.first;
     const auto &fargs = na_pair.second;
 
@@ -1459,7 +1480,7 @@ llvm::Function *bo_taylor_c_diff_func_div_impl(llvm_state &s, const binary_op &,
 
         // Create the accumulator.
         auto acc = builder.CreateAlloca(val_t);
-        builder.CreateStore(vector_splat(builder, codegen<T>(s, number{0.}), batch_size), acc);
+        builder.CreateStore(llvm::ConstantFP::get(val_t, 0.), acc);
 
         // Run the loop.
         llvm_loop_u32(s, builder.getInt32(1), builder.CreateAdd(ord, builder.getInt32(1)), [&](llvm::Value *j) {
@@ -1499,7 +1520,7 @@ llvm::Function *bo_taylor_c_diff_func_div_impl(llvm_state &s, const binary_op &,
 template <typename, typename V1, typename V2,
           std::enable_if_t<!std::conjunction_v<is_num_param<V1>, is_num_param<V2>>, int> = 0>
 llvm::Function *bo_taylor_c_diff_func_div_impl(llvm_state &, const binary_op &, const V1 &, const V2 &, std::uint32_t,
-                                               std::uint32_t)
+                                               std::uint32_t, std::uint32_t)
 {
     throw std::invalid_argument("An invalid argument type was encountered while trying to build the Taylor derivative "
                                 "of div() in compact mode");
@@ -1508,51 +1529,51 @@ llvm::Function *bo_taylor_c_diff_func_div_impl(llvm_state &, const binary_op &,
 
 template <typename T>
 llvm::Function *bo_taylor_c_diff_func_div(llvm_state &s, const binary_op &bo, std::uint32_t n_uvars,
-                                          std::uint32_t batch_size)
+                                          std::uint32_t batch_size, std::uint32_t vector_size)
 {
     return std::visit(
         [&](const auto &v1, const auto &v2) {
-            return bo_taylor_c_diff_func_div_impl<T>(s, bo, v1, v2, n_uvars, batch_size);
+            return bo_taylor_c_diff_func_div_impl<T>(s, bo, v1, v2, n_uvars, batch_size, vector_size);
         },
         bo.lhs().value(), bo.rhs().value());
 }
 
 template <typename T>
 llvm::Function *taylor_c_diff_func_bo_impl(llvm_state &s, const binary_op &bo, std::uint32_t n_uvars,
-                                           std::uint32_t batch_size)
+                                           std::uint32_t batch_size, std::uint32_t vector_size)
 {
     switch (bo.op()) {
         case binary_op::type::add:
-            return bo_taylor_c_diff_func_add<T>(s, bo, n_uvars, batch_size);
+            return bo_taylor_c_diff_func_add<T>(s, bo, n_uvars, batch_size, vector_size);
         case binary_op::type::sub:
-            return bo_taylor_c_diff_func_sub<T>(s, bo, n_uvars, batch_size);
+            return bo_taylor_c_diff_func_sub<T>(s, bo, n_uvars, batch_size, vector_size);
         case binary_op::type::mul:
-            return bo_taylor_c_diff_func_mul<T>(s, bo, n_uvars, batch_size);
+            return bo_taylor_c_diff_func_mul<T>(s, bo, n_uvars, batch_size, vector_size);
         default:
-            return bo_taylor_c_diff_func_div<T>(s, bo, n_uvars, batch_size);
+            return bo_taylor_c_diff_func_div<T>(s, bo, n_uvars, batch_size, vector_size);
     }
 }
 
 } // namespace
 
-llvm::Function *binary_op::taylor_c_diff_func_dbl(llvm_state &s, std::uint32_t n_uvars, std::uint32_t batch_size,
-                                                  bool) const
+llvm::Function *binary_op::taylor_c_diff_func_dbl(llvm_state &s, std::uint32_t n_uvars, std::uint32_t batch_size, bool,
+                                                  std::uint32_t vector_size) const
 {
-    return taylor_c_diff_func_bo_impl<double>(s, *this, n_uvars, batch_size);
+    return taylor_c_diff_func_bo_impl<double>(s, *this, n_uvars, batch_size, vector_size);
 }
 
-llvm::Function *binary_op::taylor_c_diff_func_ldbl(llvm_state &s, std::uint32_t n_uvars, std::uint32_t batch_size,
-                                                   bool) const
+llvm::Function *binary_op::taylor_c_diff_func_ldbl(llvm_state &s, std::uint32_t n_uvars, std::uint32_t batch_size, bool,
+                                                   std::uint32_t vector_size) const
 {
-    return taylor_c_diff_func_bo_impl<long double>(s, *this, n_uvars, batch_size);
+    return taylor_c_diff_func_bo_impl<long double>(s, *this, n_uvars, batch_size, vector_size);
 }
 
 #if defined(HEYOKA_HAVE_REAL128)
 
-llvm::Function *binary_op::taylor_c_diff_func_f128(llvm_state &s, std::uint32_t n_uvars, std::uint32_t batch_size,
-                                                   bool) const
+llvm::Function *binary_op::taylor_c_diff_func_f128(llvm_state &s, std::uint32_t n_uvars, std::uint32_t batch_size, bool,
+                                                   std::uint32_t vector_size) const
 {
-    return taylor_c_diff_func_bo_impl<mppp::real128>(s, *this, n_uvars, batch_size);
+    return taylor_c_diff_func_bo_impl<mppp::real128>(s, *this, n_uvars, batch_size, vector_size);
 }
 
 #endif
diff --git a/src/math/pow.cpp b/src/math/pow.cpp
index 16defb95b..fafac0f89 100644
--- a/src/math/pow.cpp
+++ b/src/math/pow.cpp
@@ -26,6 +26,7 @@
 
 #include <llvm/IR/Attributes.h>
 #include <llvm/IR/BasicBlock.h>
+#include <llvm/IR/Constants.h>
 #include <llvm/IR/DerivedTypes.h>
 #include <llvm/IR/Function.h>
 #include <llvm/IR/IRBuilder.h>
@@ -368,16 +369,16 @@ namespace
 template <typename T, typename U, typename V,
           std::enable_if_t<std::conjunction_v<is_num_param<U>, is_num_param<V>>, int> = 0>
 llvm::Function *taylor_c_diff_func_pow_impl(llvm_state &s, const pow_impl &fn, const U &n0, const V &n1,
-                                            std::uint32_t n_uvars, std::uint32_t batch_size)
+                                            std::uint32_t n_uvars, std::uint32_t batch_size, std::uint32_t vector_size)
 {
     auto &module = s.module();
     auto &builder = s.builder();
     auto &context = s.context();
 
-    // Fetch the floating-point type.
-    auto val_t = to_llvm_vector_type<T>(context, batch_size);
+    // Fetch the return type, sized by vector_size when it is > 1 and by batch_size otherwise.
+    auto val_t = to_llvm_vector_type<T>(context, vector_size > 1u ? vector_size : batch_size);
 
-    const auto na_pair = taylor_c_diff_func_name_args<T>(context, "pow", n_uvars, batch_size, {n0, n1});
+    const auto na_pair = taylor_c_diff_vfunc_name_args<T>(context, "pow", n_uvars, batch_size, vector_size, {n0, n1});
     const auto &fname = na_pair.first;
     const auto &fargs = na_pair.second;
 
@@ -413,14 +414,15 @@ llvm::Function *taylor_c_diff_func_pow_impl(llvm_state &s, const pow_impl &fn, c
             [&]() {
                 // If the order is zero, run the codegen.
                 builder.CreateStore(
-                    codegen_from_values<T>(s, fn,
-                                           {taylor_c_diff_numparam_codegen(s, n0, num_base, par_ptr, batch_size),
-                                            taylor_c_diff_numparam_codegen(s, n1, num_exp, par_ptr, batch_size)}),
+                    codegen_from_values<T>(
+                        s, fn,
+                        {taylor_c_diff_numparam_codegen(s, n0, num_base, par_ptr, batch_size, vector_size),
+                         taylor_c_diff_numparam_codegen(s, n1, num_exp, par_ptr, batch_size, vector_size)}),
                     retval);
             },
             [&]() {
                 // Otherwise, return zero.
-                builder.CreateStore(vector_splat(builder, codegen<T>(s, number{0.}), batch_size), retval);
+                builder.CreateStore(llvm::ConstantFP::get(val_t, 0.), retval);
             });
 
         // Return the result.
@@ -448,16 +450,17 @@ llvm::Function *taylor_c_diff_func_pow_impl(llvm_state &s, const pow_impl &fn, c
 // Derivative of pow(variable, number).
 template <typename T, typename U, std::enable_if_t<is_num_param_v<U>, int> = 0>
 llvm::Function *taylor_c_diff_func_pow_impl(llvm_state &s, const pow_impl &fn, const variable &var, const U &n,
-                                            std::uint32_t n_uvars, std::uint32_t batch_size)
+                                            std::uint32_t n_uvars, std::uint32_t batch_size, std::uint32_t vector_size)
 {
     auto &module = s.module();
     auto &builder = s.builder();
     auto &context = s.context();
 
-    // Fetch the floating-point type.
-    auto val_t = to_llvm_vector_type<T>(context, batch_size);
+    // Fetch the return type and its width (vector_size when it is > 1, batch_size otherwise).
+    const auto val_t_width = vector_size > 1u ? vector_size : batch_size;
+    auto val_t = to_llvm_vector_type<T>(context, val_t_width);
 
-    const auto na_pair = taylor_c_diff_func_name_args<T>(context, "pow", n_uvars, batch_size, {var, n});
+    const auto na_pair = taylor_c_diff_vfunc_name_args<T>(context, "pow", n_uvars, batch_size, vector_size, {var, n});
     const auto &fname = na_pair.first;
     const auto &fargs = na_pair.second;
 
@@ -498,18 +501,19 @@ llvm::Function *taylor_c_diff_func_pow_impl(llvm_state &s, const pow_impl &fn, c
             [&]() {
                 // For order 0, invoke the function on the order 0 of var_idx.
                 builder.CreateStore(
-                    codegen_from_values<T>(s, fn,
-                                           {taylor_c_load_diff(s, diff_ptr, n_uvars, builder.getInt32(0), var_idx),
-                                            taylor_c_diff_numparam_codegen(s, n, exponent, par_ptr, batch_size)}),
+                    codegen_from_values<T>(
+                        s, fn,
+                        {taylor_c_load_diff(s, diff_ptr, n_uvars, builder.getInt32(0), var_idx),
+                         taylor_c_diff_numparam_codegen(s, n, exponent, par_ptr, batch_size, vector_size)}),
                     retval);
             },
             [&]() {
                 // Create FP vector versions of exponent and order.
-                auto alpha_v = taylor_c_diff_numparam_codegen(s, n, exponent, par_ptr, batch_size);
-                auto ord_v = vector_splat(builder, builder.CreateUIToFP(ord, to_llvm_type<T>(context)), batch_size);
+                auto alpha_v = taylor_c_diff_numparam_codegen(s, n, exponent, par_ptr, batch_size, vector_size);
+                auto ord_v = vector_splat(builder, builder.CreateUIToFP(ord, to_llvm_type<T>(context)), val_t_width);
 
                 // Init the accumulator.
-                builder.CreateStore(vector_splat(builder, codegen<T>(s, number{0.}), batch_size), acc);
+                builder.CreateStore(llvm::ConstantFP::get(val_t, 0.), acc);
 
                 // Run the loop.
                 llvm_loop_u32(s, builder.getInt32(0), ord, [&](llvm::Value *j) {
@@ -517,12 +521,10 @@ llvm::Function *taylor_c_diff_func_pow_impl(llvm_state &s, const pow_impl &fn, c
                     auto aj = taylor_c_load_diff(s, diff_ptr, n_uvars, j, u_idx);
 
                     // Compute the factor n*alpha-j*(alpha+1).
-                    auto j_v = vector_splat(builder, builder.CreateUIToFP(j, to_llvm_type<T>(context)), batch_size);
+                    auto j_v = vector_splat(builder, builder.CreateUIToFP(j, to_llvm_type<T>(context)), val_t_width);
                     auto fac = builder.CreateFSub(
                         builder.CreateFMul(ord_v, alpha_v),
-                        builder.CreateFMul(
-                            j_v,
-                            builder.CreateFAdd(alpha_v, vector_splat(builder, codegen<T>(s, number{1.}), batch_size))));
+                        builder.CreateFMul(j_v, builder.CreateFAdd(alpha_v, llvm::ConstantFP::get(val_t, 1.))));
 
                     builder.CreateStore(builder.CreateFAdd(builder.CreateLoad(val_t, acc),
                                                            builder.CreateFMul(fac, builder.CreateFMul(b_nj, aj))),
@@ -563,7 +565,7 @@ llvm::Function *taylor_c_diff_func_pow_impl(llvm_state &s, const pow_impl &fn, c
 template <typename T, typename U1, typename U2,
           std::enable_if_t<!std::conjunction_v<is_num_param<U1>, is_num_param<U2>>, int> = 0>
 llvm::Function *taylor_c_diff_func_pow_impl(llvm_state &, const pow_impl &, const U1 &, const U2 &, std::uint32_t,
-                                            std::uint32_t)
+                                            std::uint32_t, std::uint32_t)
 {
     throw std::invalid_argument("An invalid argument type was encountered while trying to build the Taylor derivative "
                                 "of a pow() in compact mode");
@@ -571,37 +573,37 @@ llvm::Function *taylor_c_diff_func_pow_impl(llvm_state &, const pow_impl &, cons
 
 template <typename T>
 llvm::Function *taylor_c_diff_func_pow(llvm_state &s, const pow_impl &fn, std::uint32_t n_uvars,
-                                       std::uint32_t batch_size)
+                                       std::uint32_t batch_size, std::uint32_t vector_size)
 {
     assert(fn.args().size() == 2u);
 
     return std::visit(
         [&](const auto &v1, const auto &v2) {
-            return taylor_c_diff_func_pow_impl<T>(s, fn, v1, v2, n_uvars, batch_size);
+            return taylor_c_diff_func_pow_impl<T>(s, fn, v1, v2, n_uvars, batch_size, vector_size);
         },
         fn.args()[0].value(), fn.args()[1].value());
 }
 
 } // namespace
 
-llvm::Function *pow_impl::taylor_c_diff_func_dbl(llvm_state &s, std::uint32_t n_uvars, std::uint32_t batch_size,
-                                                 bool) const
+llvm::Function *pow_impl::taylor_c_diff_func_dbl(llvm_state &s, std::uint32_t n_uvars, std::uint32_t batch_size, bool,
+                                                 std::uint32_t vector_size) const
 {
-    return taylor_c_diff_func_pow<double>(s, *this, n_uvars, batch_size);
+    return taylor_c_diff_func_pow<double>(s, *this, n_uvars, batch_size, vector_size);
 }
 
-llvm::Function *pow_impl::taylor_c_diff_func_ldbl(llvm_state &s, std::uint32_t n_uvars, std::uint32_t batch_size,
-                                                  bool) const
+llvm::Function *pow_impl::taylor_c_diff_func_ldbl(llvm_state &s, std::uint32_t n_uvars, std::uint32_t batch_size, bool,
+                                                  std::uint32_t vector_size) const
 {
-    return taylor_c_diff_func_pow<long double>(s, *this, n_uvars, batch_size);
+    return taylor_c_diff_func_pow<long double>(s, *this, n_uvars, batch_size, vector_size);
 }
 
 #if defined(HEYOKA_HAVE_REAL128)
 
-llvm::Function *pow_impl::taylor_c_diff_func_f128(llvm_state &s, std::uint32_t n_uvars, std::uint32_t batch_size,
-                                                  bool) const
+llvm::Function *pow_impl::taylor_c_diff_func_f128(llvm_state &s, std::uint32_t n_uvars, std::uint32_t batch_size, bool,
+                                                  std::uint32_t vector_size) const
 {
-    return taylor_c_diff_func_pow<mppp::real128>(s, *this, n_uvars, batch_size);
+    return taylor_c_diff_func_pow<mppp::real128>(s, *this, n_uvars, batch_size, vector_size);
 }
 
 #endif
diff --git a/src/math/sum.cpp b/src/math/sum.cpp
index 979c406be..545d4a781 100644
--- a/src/math/sum.cpp
+++ b/src/math/sum.cpp
@@ -23,6 +23,7 @@
 
 #include <llvm/IR/Attributes.h>
 #include <llvm/IR/BasicBlock.h>
+#include <llvm/IR/Constants.h>
 #include <llvm/IR/DerivedTypes.h>
 #include <llvm/IR/Function.h>
 #include <llvm/IR/IRBuilder.h>
@@ -186,7 +187,7 @@ namespace
 
 template <typename T>
 llvm::Function *sum_taylor_c_diff_func_impl(llvm_state &s, const sum_impl &sf, std::uint32_t n_uvars,
-                                            std::uint32_t batch_size)
+                                            std::uint32_t batch_size, std::uint32_t vector_size)
 {
     // NOTE: this is prevented in the implementation
     // of the sum() function.
@@ -196,8 +197,8 @@ llvm::Function *sum_taylor_c_diff_func_impl(llvm_state &s, const sum_impl &sf, s
     auto &builder = s.builder();
     auto &context = s.context();
 
-    // Fetch the floating-point type.
-    auto val_t = to_llvm_vector_type<T>(context, batch_size);
+    // Fetch the return type: vector_size lanes in vector mode, batch_size lanes otherwise.
+    auto val_t = to_llvm_vector_type<T>(context, vector_size > 1u ? vector_size : batch_size);
 
     // Build the vector of arguments needed to determine the function name.
     std::vector<std::variant<variable, number, param>> nm_args;
@@ -220,7 +221,7 @@ llvm::Function *sum_taylor_c_diff_func_impl(llvm_state &s, const sum_impl &sf, s
     }
 
     // Fetch the function name and arguments.
-    const auto na_pair = taylor_c_diff_func_name_args<T>(context, "sum", n_uvars, batch_size, nm_args);
+    const auto na_pair = taylor_c_diff_vfunc_name_args<T>(context, "sum", n_uvars, batch_size, vector_size, nm_args);
     const auto &fname = na_pair.first;
     const auto &fargs = na_pair.second;
 
@@ -270,12 +271,12 @@ llvm::Function *sum_taylor_c_diff_func_impl(llvm_state &s, const sum_impl &sf, s
                             [&]() {
                                 // If the order is zero, run the codegen.
                                 builder.CreateStore(
-                                    taylor_c_diff_numparam_codegen(s, v, terms + i, par_ptr, batch_size), retval);
+                                    taylor_c_diff_numparam_codegen(s, v, terms + i, par_ptr, batch_size, vector_size),
+                                    retval);
                             },
                             [&]() {
                                 // Otherwise, return zero.
-                                builder.CreateStore(vector_splat(builder, codegen<T>(s, number{0.}), batch_size),
-                                                    retval);
+                                builder.CreateStore(llvm::ConstantFP::get(val_t, 0.), retval);
                             });
 
                         return builder.CreateLoad(val_t, retval);
@@ -315,24 +316,24 @@ llvm::Function *sum_taylor_c_diff_func_impl(llvm_state &s, const sum_impl &sf, s
 
 } // namespace
 
-llvm::Function *sum_impl::taylor_c_diff_func_dbl(llvm_state &s, std::uint32_t n_uvars, std::uint32_t batch_size,
-                                                 bool) const
+llvm::Function *sum_impl::taylor_c_diff_func_dbl(llvm_state &s, std::uint32_t n_uvars, std::uint32_t batch_size, bool,
+                                                 std::uint32_t vector_size) const
 {
-    return sum_taylor_c_diff_func_impl<double>(s, *this, n_uvars, batch_size);
+    return sum_taylor_c_diff_func_impl<double>(s, *this, n_uvars, batch_size, vector_size);
 }
 
-llvm::Function *sum_impl::taylor_c_diff_func_ldbl(llvm_state &s, std::uint32_t n_uvars, std::uint32_t batch_size,
-                                                  bool) const
+llvm::Function *sum_impl::taylor_c_diff_func_ldbl(llvm_state &s, std::uint32_t n_uvars, std::uint32_t batch_size, bool,
+                                                  std::uint32_t vector_size) const
 {
-    return sum_taylor_c_diff_func_impl<long double>(s, *this, n_uvars, batch_size);
+    return sum_taylor_c_diff_func_impl<long double>(s, *this, n_uvars, batch_size, vector_size);
 }
 
 #if defined(HEYOKA_HAVE_REAL128)
 
-llvm::Function *sum_impl::taylor_c_diff_func_f128(llvm_state &s, std::uint32_t n_uvars, std::uint32_t batch_size,
-                                                  bool) const
+llvm::Function *sum_impl::taylor_c_diff_func_f128(llvm_state &s, std::uint32_t n_uvars, std::uint32_t batch_size, bool,
+                                                  std::uint32_t vector_size) const
 {
-    return sum_taylor_c_diff_func_impl<mppp::real128>(s, *this, n_uvars, batch_size);
+    return sum_taylor_c_diff_func_impl<mppp::real128>(s, *this, n_uvars, batch_size, vector_size);
 }
 
 #endif
diff --git a/src/math/sum_sq.cpp b/src/math/sum_sq.cpp
index 7f3b51d3b..725ee3ec7 100644
--- a/src/math/sum_sq.cpp
+++ b/src/math/sum_sq.cpp
@@ -26,6 +26,7 @@
 
 #include <llvm/IR/Attributes.h>
 #include <llvm/IR/BasicBlock.h>
+#include <llvm/IR/Constants.h>
 #include <llvm/IR/DerivedTypes.h>
 #include <llvm/IR/Function.h>
 #include <llvm/IR/IRBuilder.h>
@@ -290,7 +291,7 @@ namespace
 
 template <typename T>
 llvm::Function *sum_sq_taylor_c_diff_func_impl(llvm_state &s, const sum_sq_impl &sf, std::uint32_t n_uvars,
-                                               std::uint32_t batch_size)
+                                               std::uint32_t batch_size, std::uint32_t vector_size)
 {
     // NOTE: this is prevented in the implementation
     // of the sum() function.
@@ -300,8 +301,8 @@ llvm::Function *sum_sq_taylor_c_diff_func_impl(llvm_state &s, const sum_sq_impl
     auto &builder = s.builder();
     auto &context = s.context();
 
-    // Fetch the floating-point type.
-    auto val_t = to_llvm_vector_type<T>(context, batch_size);
+    // Fetch the return type: vector_size lanes in vector mode, batch_size lanes otherwise.
+    auto val_t = to_llvm_vector_type<T>(context, vector_size > 1u ? vector_size : batch_size);
 
     // Build the vector of arguments needed to determine the function name.
     std::vector<std::variant<variable, number, param>> nm_args;
@@ -324,7 +325,7 @@ llvm::Function *sum_sq_taylor_c_diff_func_impl(llvm_state &s, const sum_sq_impl
     }
 
     // Fetch the function name and arguments.
-    const auto na_pair = taylor_c_diff_func_name_args<T>(context, "sum_sq", n_uvars, batch_size, nm_args);
+    const auto na_pair = taylor_c_diff_vfunc_name_args<T>(context, "sum_sq", n_uvars, batch_size, vector_size, nm_args);
     const auto &fname = na_pair.first;
     const auto &fargs = na_pair.second;
 
@@ -359,7 +360,7 @@ llvm::Function *sum_sq_taylor_c_diff_func_impl(llvm_state &s, const sum_sq_impl
         v_accs.resize(boost::numeric_cast<decltype(v_accs.size())>(sf.args().size()));
         for (auto &acc : v_accs) {
             acc = builder.CreateAlloca(val_t);
-            builder.CreateStore(vector_splat(builder, codegen<T>(s, number{0.}), batch_size), acc);
+            builder.CreateStore(llvm::ConstantFP::get(val_t, 0.), acc);
         }
 
         // Create the return value.
@@ -465,13 +466,13 @@ llvm::Function *sum_sq_taylor_c_diff_func_impl(llvm_state &s, const sum_sq_impl
                                     s, builder.CreateICmpEQ(order, builder.getInt32(0)),
                                     [&]() {
                                         // Order 0, store the num/param.
-                                        builder.CreateStore(
-                                            taylor_c_diff_numparam_codegen(s, v, terms + k, par_ptr, batch_size), ret);
+                                        builder.CreateStore(taylor_c_diff_numparam_codegen(s, v, terms + k, par_ptr,
+                                                                                           batch_size, vector_size),
+                                                            ret);
                                     },
                                     [&]() {
                                         // Order 2 or higher, store zero.
-                                        builder.CreateStore(
-                                            vector_splat(builder, codegen<T>(s, number{0.}), batch_size), ret);
+                                        builder.CreateStore(llvm::ConstantFP::get(val_t, 0.), ret);
                                     });
 
                                 auto val = builder.CreateLoad(val_t, ret);
@@ -522,23 +523,23 @@ llvm::Function *sum_sq_taylor_c_diff_func_impl(llvm_state &s, const sum_sq_impl
 } // namespace
 
 llvm::Function *sum_sq_impl::taylor_c_diff_func_dbl(llvm_state &s, std::uint32_t n_uvars, std::uint32_t batch_size,
-                                                    bool) const
+                                                    bool, std::uint32_t vector_size) const
 {
-    return sum_sq_taylor_c_diff_func_impl<double>(s, *this, n_uvars, batch_size);
+    return sum_sq_taylor_c_diff_func_impl<double>(s, *this, n_uvars, batch_size, vector_size);
 }
 
 llvm::Function *sum_sq_impl::taylor_c_diff_func_ldbl(llvm_state &s, std::uint32_t n_uvars, std::uint32_t batch_size,
-                                                     bool) const
+                                                     bool, std::uint32_t vector_size) const
 {
-    return sum_sq_taylor_c_diff_func_impl<long double>(s, *this, n_uvars, batch_size);
+    return sum_sq_taylor_c_diff_func_impl<long double>(s, *this, n_uvars, batch_size, vector_size);
 }
 
 #if defined(HEYOKA_HAVE_REAL128)
 
 llvm::Function *sum_sq_impl::taylor_c_diff_func_f128(llvm_state &s, std::uint32_t n_uvars, std::uint32_t batch_size,
-                                                     bool) const
+                                                     bool, std::uint32_t vector_size) const
 {
-    return sum_sq_taylor_c_diff_func_impl<mppp::real128>(s, *this, n_uvars, batch_size);
+    return sum_sq_taylor_c_diff_func_impl<mppp::real128>(s, *this, n_uvars, batch_size, vector_size);
 }
 
 #endif
diff --git a/src/taylor_01.cpp b/src/taylor_01.cpp
index 0dfff03a7..019a0888b 100644
--- a/src/taylor_01.cpp
+++ b/src/taylor_01.cpp
@@ -67,6 +67,7 @@
 #endif
 
 #include <heyoka/detail/llvm_helpers.hpp>
+#include <heyoka/detail/llvm_vector_type.hpp>
 #include <heyoka/detail/logging_impl.hpp>
 #include <heyoka/detail/string_conv.hpp>
 #include <heyoka/detail/type_traits.hpp>
@@ -198,6 +199,118 @@ taylor_c_diff_func_name_args_impl(llvm::LLVMContext &context, const std::string
     return std::make_pair(std::move(fname), std::move(fargs));
 }
 
+// TODO: remove the non-vector version (taylor_c_diff_func_name_args()), then rename this one?
+template <typename T>
+std::pair<std::string, std::vector<llvm::Type *>>
+taylor_c_diff_vfunc_name_args(llvm::LLVMContext &context, const std::string &name, std::uint32_t n_uvars,
+                              std::uint32_t batch_size, std::uint32_t vector_size,
+                              const std::vector<std::variant<variable, number, param>> &args,
+                              std::uint32_t n_hidden_deps)
+{
+    // LCOV_EXCL_START
+    assert(n_uvars > 0u);
+    assert(vector_size == 1u || batch_size == 1u);
+    // LCOV_EXCL_STOP
+
+    // Fetch the scalar floating-point type corresponding to T.
+    auto scal_fp_t = to_llvm_type<T>(context);
+
+    // Fetch the floating-point type stored in diff_arr.
+    auto diff_val_t = make_vector_type(scal_fp_t, batch_size);
+
+    // Init the name. NOTE: vector_size is mangled right after the common prefix.
+    auto fname = fmt::format("heyoka.taylor_c_diff.{}.{}.", vector_size, name);
+
+    // Init the vector of arguments:
+    // - diff order,
+    // - idx/indices of the u variable(s) whose diff is being computed,
+    // - diff array (pointer to diff_val_t),
+    // - par ptr (pointer to scalar),
+    // - time ptr (pointer to scalar).
+    std::vector<llvm::Type *> fargs{llvm::Type::getInt32Ty(context),
+                                    make_vector_type(llvm::Type::getInt32Ty(context), vector_size),
+                                    llvm::PointerType::getUnqual(diff_val_t), llvm::PointerType::getUnqual(scal_fp_t),
+                                    llvm::PointerType::getUnqual(scal_fp_t)};
+
+    // Add the mangling and LLVM arg types for the argument types. Also, detect if
+    // we have variables in the arguments.
+    bool with_var = false;
+    for (decltype(args.size()) i = 0; i < args.size(); ++i) {
+        // Detect variable.
+        if (std::holds_alternative<variable>(args[i])) {
+            with_var = true;
+        }
+
+        // Name mangling.
+        fname += std::visit([](const auto &v) { return taylor_c_diff_mangle(v); }, args[i]);
+
+        // Add the arguments separator, if we are not at the
+        // last argument.
+        if (i != args.size() - 1u) {
+            fname += '_';
+        }
+
+        // Add the LLVM function argument type.
+        fargs.push_back(std::visit(
+            [&](const auto &v) -> llvm::Type * {
+                using type = detail::uncvref_t<decltype(v)>;
+
+                if constexpr (std::is_same_v<type, number>) {
+                    // For numbers, the argument is passed as a scalar
+                    // floating-point value, or a vector of floating-point
+                    // values in vector mode.
+                    return make_vector_type(scal_fp_t, vector_size);
+                } else {
+                    // For vars and params, the argument is an index
+                    // in an array, or a vector of indices in vector mode.
+                    return make_vector_type(llvm::Type::getInt32Ty(context), vector_size);
+                }
+            },
+            args[i]));
+    }
+
+    // Close the argument list with a ".".
+    // NOTE: this will result in a ".." in the name
+    // if the function has zero arguments.
+    fname += '.';
+
+    // If we have variables in the arguments, add mangling
+    // for n_uvars. This is needed because the function logic
+    // for accessing the derivatives depends on n_uvars.
+    if (with_var) {
+        fname += fmt::format("n_uvars_{}.", n_uvars);
+    }
+
+    // Finally, add the mangling for diff_val_t.
+    fname += llvm_mangle_type(diff_val_t);
+
+    // Fill in the hidden dependency arguments. These are all indices or vectors
+    // of indices.
+    fargs.insert(fargs.end(), boost::numeric_cast<decltype(fargs.size())>(n_hidden_deps),
+                 make_vector_type(llvm::Type::getInt32Ty(context), vector_size));
+
+    return std::make_pair(std::move(fname), std::move(fargs));
+}
+
+template HEYOKA_DLL_PUBLIC std::pair<std::string, std::vector<llvm::Type *>>
+taylor_c_diff_vfunc_name_args<double>(llvm::LLVMContext &, const std::string &, std::uint32_t, std::uint32_t,
+                                      std::uint32_t, const std::vector<std::variant<variable, number, param>> &,
+                                      std::uint32_t);
+
+template HEYOKA_DLL_PUBLIC std::pair<std::string, std::vector<llvm::Type *>>
+taylor_c_diff_vfunc_name_args<long double>(llvm::LLVMContext &, const std::string &, std::uint32_t, std::uint32_t,
+                                           std::uint32_t, const std::vector<std::variant<variable, number, param>> &,
+                                           std::uint32_t);
+
+#if defined(HEYOKA_HAVE_REAL128)
+
+template HEYOKA_DLL_PUBLIC std::pair<std::string, std::vector<llvm::Type *>>
+taylor_c_diff_vfunc_name_args<mppp::real128>(llvm::LLVMContext &, const std::string &, std::uint32_t, std::uint32_t,
+                                             std::uint32_t, const std::vector<std::variant<variable, number, param>> &,
+                                             std::uint32_t);
+
+#endif
+
 namespace
 {
 
@@ -280,13 +393,33 @@ llvm::Value *taylor_c_diff_numparam_codegen(llvm_state &s, const number &, llvm:
     return vector_splat(s.builder(), n, batch_size);
 }
 
+llvm::Value *taylor_c_diff_numparam_codegen(llvm_state &s, const number &, llvm::Value *n, llvm::Value *,
+                                            std::uint32_t batch_size, std::uint32_t vector_size)
+{
+    // LCOV_EXCL_START
+#if !defined(NDEBUG)
+    assert(batch_size > 0u);
+    assert(vector_size > 0u);
+
+    if (vector_size == 1u) {
+        assert(!llvm::isa<llvm_vector_type>(n->getType()));
+    } else {
+        assert(batch_size == 1u);
+        assert(llvm::isa<llvm_vector_type>(n->getType()));
+        assert(llvm::cast<llvm_vector_type>(n->getType())->getNumElements() == vector_size);
+    }
+#endif
+    // LCOV_EXCL_STOP
+
+    return vector_size == 1u ? vector_splat(s.builder(), n, batch_size) : n;
+}
+
 llvm::Value *taylor_c_diff_numparam_codegen(llvm_state &s, const param &, llvm::Value *p, llvm::Value *par_ptr,
                                             std::uint32_t batch_size)
 {
     // LCOV_EXCL_START
     assert(batch_size > 0u);
     assert(llvm::isa<llvm::PointerType>(par_ptr->getType()));
-    assert(!llvm::cast<llvm::PointerType>(par_ptr->getType())->isVectorTy());
     // LCOV_EXCL_STOP
 
     auto &builder = s.builder();
@@ -299,6 +432,46 @@ llvm::Value *taylor_c_diff_numparam_codegen(llvm_state &s, const param &, llvm::
     return load_vector_from_memory(builder, ptr, batch_size);
 }
 
+llvm::Value *taylor_c_diff_numparam_codegen(llvm_state &s, const param &, llvm::Value *p, llvm::Value *par_ptr,
+                                            std::uint32_t batch_size, std::uint32_t vector_size)
+{
+    // LCOV_EXCL_START
+#if !defined(NDEBUG)
+    assert(batch_size > 0u);
+    assert(vector_size > 0u);
+
+    assert(llvm::isa<llvm::PointerType>(par_ptr->getType()));
+    assert(!llvm::isa<llvm_vector_type>(llvm::cast<llvm::PointerType>(par_ptr->getType())->getPointerElementType()));
+
+    if (vector_size == 1u) {
+        assert(!llvm::isa<llvm_vector_type>(p->getType()));
+    } else {
+        assert(batch_size == 1u);
+        assert(llvm::isa<llvm_vector_type>(p->getType()));
+        assert(llvm::cast<llvm_vector_type>(p->getType())->getNumElements() == vector_size);
+    }
+#endif
+    // LCOV_EXCL_STOP
+
+    auto &builder = s.builder();
+
+    // Fetch the scalar floating-point type of the parameters.
+    auto *scal_fp_t = llvm::cast<llvm::PointerType>(par_ptr->getType())->getPointerElementType();
+
+    // NOTE: overflow checks are done in taylor_compute_jet().
+    if (vector_size == 1u) {
+        auto *ptr = builder.CreateInBoundsGEP(scal_fp_t, par_ptr, builder.CreateMul(p, builder.getInt32(batch_size)));
+
+        return load_vector_from_memory(builder, ptr, batch_size);
+    } else {
+        // p is a vector of indices here, so this yields a vector of pointers into par_ptr.
+        auto *ptrs = builder.CreateInBoundsGEP(scal_fp_t, par_ptr, p);
+
+        // Gather the parameter values through the vector of pointers.
+        return gather_vector_from_memory(builder, make_vector_type(scal_fp_t, vector_size), ptrs);
+    }
+}
+
 // Helper to fetch the derivative of order 'order' of the u variable at index u_idx from the
 // derivative array 'arr'. The total number of u variables is n_uvars.
 llvm::Value *taylor_fetch_diff(const std::vector<llvm::Value *> &arr, std::uint32_t u_idx, std::uint32_t order,
@@ -314,38 +487,90 @@ llvm::Value *taylor_fetch_diff(const std::vector<llvm::Value *> &arr, std::uint3
     return arr[idx];
 }
 
-// Load the derivative of order 'order' of the u variable u_idx from the array of Taylor derivatives diff_arr.
+// Load the derivative of order 'order' of the u variable(s) u_idx from the array of Taylor derivatives diff_arr.
 // n_uvars is the total number of u variables.
 llvm::Value *taylor_c_load_diff(llvm_state &s, llvm::Value *diff_arr, std::uint32_t n_uvars, llvm::Value *order,
                                 llvm::Value *u_idx)
 {
-    auto &builder = s.builder();
-
     // NOTE: overflow check has already been done to ensure that the
     // total size of diff_arr fits in a 32-bit unsigned integer.
-    assert(llvm_depr_GEP_type_check(diff_arr, pointee_type(diff_arr))); // LCOV_EXCL_LINE
-    auto *ptr
-        = builder.CreateInBoundsGEP(pointee_type(diff_arr), diff_arr,
-                                    builder.CreateAdd(builder.CreateMul(order, builder.getInt32(n_uvars)), u_idx));
+    auto &builder = s.builder();
+
+    // Fetch the floating-point type in diff_arr.
+    auto *diff_val_t = pointee_type(diff_arr);
+
+    if (auto *vec_idx_t = llvm::dyn_cast<llvm_vector_type>(u_idx->getType())) {
+        // Vector mode: u_idx is a vector of indices.
 
-    return builder.CreateLoad(pointee_type(diff_arr), ptr);
+        // NOTE: vector mode and batch mode are mutually exclusive.
+        assert(!llvm::isa<llvm_vector_type>(diff_val_t)); // LCOV_EXCL_LINE
+
+        // Fetch the vector floating-point type.
+        const auto vector_size = boost::numeric_cast<std::uint32_t>(vec_idx_t->getNumElements());
+        // NOTE: the expectation here is that vectors of size 1 never show up, as they are always
+        // turned into scalars by helpers such as make_vector_type() & co.
+        assert(vector_size > 1u); // LCOV_EXCL_LINE
+        auto *vec_fp_t = make_vector_type(diff_val_t, vector_size);
+
+        // Compute the pointers.
+        auto *ptrs = builder.CreateInBoundsGEP(
+            diff_val_t, diff_arr,
+            builder.CreateAdd(vector_splat(builder, builder.CreateMul(order, builder.getInt32(n_uvars)), vector_size),
+                              u_idx));
+
+        // Gather the derivatives through the vector of pointers.
+        return gather_vector_from_memory(builder, vec_fp_t, ptrs);
+    } else {
+        // Scalar mode: u_idx is a single index.
+        auto *ptr = builder.CreateInBoundsGEP(
+            diff_val_t, diff_arr, builder.CreateAdd(builder.CreateMul(order, builder.getInt32(n_uvars)), u_idx));
+
+        return builder.CreateLoad(diff_val_t, ptr);
+    }
 }
 
-// Store the value val as the derivative of order 'order' of the u variable u_idx
+// Store the value x as the derivative of order 'order' of the u variable(s) u_idx
 // into the array of Taylor derivatives diff_arr. n_uvars is the total number of u variables.
 void taylor_c_store_diff(llvm_state &s, llvm::Value *diff_arr, std::uint32_t n_uvars, llvm::Value *order,
-                         llvm::Value *u_idx, llvm::Value *val)
+                         llvm::Value *u_idx, llvm::Value *x)
 {
-    auto &builder = s.builder();
-
     // NOTE: overflow check has already been done to ensure that the
     // total size of diff_arr fits in a 32-bit unsigned integer.
-    assert(llvm_depr_GEP_type_check(diff_arr, pointee_type(diff_arr))); // LCOV_EXCL_LINE
-    auto *ptr
-        = builder.CreateInBoundsGEP(pointee_type(diff_arr), diff_arr,
-                                    builder.CreateAdd(builder.CreateMul(order, builder.getInt32(n_uvars)), u_idx));
+    auto &builder = s.builder();
+
+    // Fetch the floating-point type in diff_arr.
+    auto *diff_val_t = pointee_type(diff_arr);
 
-    builder.CreateStore(val, ptr);
+    if (auto *vec_idx_t = llvm::dyn_cast<llvm_vector_type>(u_idx->getType())) {
+        // Vector mode.
+
+        // LCOV_EXCL_START
+        // NOTE: vector mode and batch mode are mutually exclusive.
+        assert(!llvm::isa<llvm_vector_type>(diff_val_t));
+        assert(llvm::isa<llvm_vector_type>(x->getType()));
+        assert(llvm::cast<llvm_vector_type>(x->getType())->getNumElements() == vec_idx_t->getNumElements());
+        // LCOV_EXCL_STOP
+
+        const auto vector_size = boost::numeric_cast<std::uint32_t>(vec_idx_t->getNumElements());
+        // NOTE: the expectation here is that vectors of size 1 never show up, as they are always
+        // turned into scalars by helpers such as make_vector_type() & co.
+        assert(vector_size > 1u); // LCOV_EXCL_LINE
+
+        // Compute the pointers.
+        auto *ptrs = builder.CreateInBoundsGEP(
+            diff_val_t, diff_arr,
+            builder.CreateAdd(vector_splat(builder, builder.CreateMul(order, builder.getInt32(n_uvars)), vector_size),
+                              u_idx));
+
+        // Scatter x through the vector of pointers.
+        scatter_vector_to_memory(builder, x, ptrs);
+    } else {
+        // Scalar mode.
+        auto *ptr = builder.CreateInBoundsGEP(
+            diff_val_t, diff_arr, builder.CreateAdd(builder.CreateMul(order, builder.getInt32(n_uvars)), u_idx));
+
+        builder.CreateStore(x, ptr);
+    }
 }
 
 namespace
diff --git a/src/taylor_02.cpp b/src/taylor_02.cpp
index 167e35b8e..374543603 100644
--- a/src/taylor_02.cpp
+++ b/src/taylor_02.cpp
@@ -19,6 +19,7 @@
 #include <map>
 #include <stdexcept>
 #include <string>
+#include <tuple>
 #include <type_traits>
 #include <unordered_map>
 #include <utility>
@@ -1052,12 +1053,16 @@ struct llvm_func_name_compare {
 };
 
 // For each segment in s_dc, this function will return a dict mapping an LLVM function
-// f for the computation of a Taylor derivative to a size and a vector of std::functions. For example, one entry
-// in the return value will read something like:
-// {f : (2, [g_0, g_1, g_2])}
+// f for the computation of a Taylor derivative to:
+// - an integer,
+// - a vector of std::functions,
+// - an expression.
+// For example, one entry in the return value will read something like:
+// {f : (2, [g_0, g_1, g_2], func(u_0, u_1, u_2))}
 // The meaning in this example is that the arity of f is 3 and it will be called with 2 different
 // sets of arguments. The g_i functions are expected to be called with input argument j in [0, 1]
 // to yield the value of the i-th function argument for f at the j-th invocation.
+// func(u_0, u_1, u_2) is the heyoka expression from which f was generated.
 template <typename T>
 auto taylor_build_function_maps(llvm_state &s, const std::vector<taylor_dc_t> &s_dc, std::uint32_t n_eq,
                                 std::uint32_t n_uvars, std::uint32_t batch_size, bool high_accuracy)
@@ -1070,62 +1075,70 @@ auto taylor_build_function_maps(llvm_state &s, const std::vector<taylor_dc_t> &s
     // functions are invoked in taylor_compute_jet_compact_mode() is always the same. If we used directly pointer
     // comparisons instead, the order could vary across different executions and different platforms. The name
     // mangling we do when creating the function names should ensure that there are no possible name collisions.
-    std::vector<
-        std::map<llvm::Function *, std::pair<std::uint32_t, std::vector<std::function<llvm::Value *(llvm::Value *)>>>,
-                 llvm_func_name_compare>>
+    using gen_vec_t = std::vector<std::function<llvm::Value *(llvm::Value *)>>;
+    std::vector<std::map<llvm::Function *, std::tuple<std::uint32_t, gen_vec_t, expression>, llvm_func_name_compare>>
         retval;
 
-    // Variable to keep track of the u variable
+    // Counter to keep track of the index of the u variable
     // on whose definition we are operating.
     auto cur_u_idx = n_eq;
     for (const auto &seg : s_dc) {
-        // This structure maps an LLVM function to sets of arguments
-        // with which the function is to be called. For instance, if function
-        // f(x, y, z) is to be called as f(a, b, c) and f(d, e, f), then tmp_map
-        // will contain {f : [[a, b, c], [d, e, f]]}.
+        // This structure maps an LLVM function to:
+        // - the sets of arguments with which the function is to be called,
+        // - the expression that was used for the generation of the LLVM function.
+        // For instance, if the LLVM function f generated from the heyoka expression
+        // func(x, y, z) is to be called as f(a, b, c) and f(d, e, f), then tmp_map
+        // will contain {f : ([[a, b, c], [d, e, f]], func(x, y, z))}.
         // After construction, we have verified that for each function
         // in the map the sets of arguments have all the same size.
-        std::unordered_map<llvm::Function *, std::vector<std::vector<std::variant<std::uint32_t, number>>>> tmp_map;
+        using v_args_t = std::vector<std::vector<std::variant<std::uint32_t, number>>>;
+        std::unordered_map<llvm::Function *, std::tuple<v_args_t, expression>> tmp_map;
 
         for (const auto &ex : seg) {
-            // Get the function for the computation of the derivative.
-            auto func = taylor_c_diff_func<T>(s, ex.first, n_uvars, batch_size, high_accuracy);
+            // Generate or get the function for the computation of the derivative.
+            auto func = taylor_c_diff_func<T>(s, ex.first, n_uvars, batch_size, high_accuracy, 1);
 
             // Insert the function into tmp_map.
-            const auto [it, is_new_func] = tmp_map.try_emplace(func);
+            const auto [it, is_new_func] = tmp_map.try_emplace(func, v_args_t{}, ex.first);
 
-            assert(is_new_func || !it->second.empty()); // LCOV_EXCL_LINE
+            assert(is_new_func || !std::get<0>(it->second).empty()); // LCOV_EXCL_LINE
 
             // Convert the variables/constants in the current dc
             // element into a set of indices/constants.
             const auto cdiff_args = taylor_udef_to_variants(ex.first, ex.second);
 
-            if (!is_new_func && it->second.back().size() - 1u != cdiff_args.size()) {
+            if (!is_new_func && std::get<0>(it->second).back().size() - 1u != cdiff_args.size()) {
                 throw std::invalid_argument(
                     "Inconsistent arity detected in a Taylor derivative function in compact "
                     "mode: the same function is being called with both {} and {} arguments"_format(
-                        it->second.back().size() - 1u, cdiff_args.size()));
+                        std::get<0>(it->second).back().size() - 1u, cdiff_args.size()));
             }
 
             // Add the new set of arguments.
-            it->second.emplace_back();
+            std::get<0>(it->second).emplace_back();
             // Add the idx of the u variable.
-            it->second.back().emplace_back(cur_u_idx);
+            std::get<0>(it->second).back().emplace_back(cur_u_idx);
             // Add the actual function arguments.
-            it->second.back().insert(it->second.back().end(), cdiff_args.begin(), cdiff_args.end());
+            std::get<0>(it->second)
+                .back()
+                .insert(std::get<0>(it->second).back().end(), cdiff_args.begin(), cdiff_args.end());
 
             ++cur_u_idx;
         }
 
-        // Now we build the transposition of tmp_map: from {f : [[a, b, c], [d, e, f]]}
-        // to {f : [[a, d], [b, e], [c, f]]}.
-        std::unordered_map<llvm::Function *, std::vector<std::variant<std::vector<std::uint32_t>, std::vector<number>>>>
-            tmp_map_transpose;
-        for (const auto &[func, vv] : tmp_map) {
-            assert(!vv.empty()); // LCOV_EXCL_LINE
+        // Now we build the arguments transposition of tmp_map: from {f : ([[a, b, c], [d, e, f]], func(x, y, z))}
+        // to {f : ([[a, d], [b, e], [c, f]], func(x, y, z))}.
+        using v_args_t_t = std::vector<std::variant<std::vector<std::uint32_t>, std::vector<number>>>;
+        std::unordered_map<llvm::Function *, std::tuple<v_args_t_t, expression>> tmp_map_transpose;
+        for (auto &[func, tup] : tmp_map) {
+            const auto &vv = std::get<0>(tup);
+
+            assert(!vv.empty());                                                    // LCOV_EXCL_LINE
+            assert(std::holds_alternative<heyoka::func>(std::get<1>(tup).value())); // LCOV_EXCL_LINE
 
             // Add the function.
-            const auto [it, ins_status] = tmp_map_transpose.try_emplace(func);
+            const auto [it, ins_status]
+                = tmp_map_transpose.try_emplace(func, v_args_t_t{}, std::move(std::get<1>(tup)));
             assert(ins_status); // LCOV_EXCL_LINE
 
             const auto n_calls = vv.size();
@@ -1145,7 +1158,7 @@ auto taylor_build_function_maps(llvm_state &s, const std::vector<taylor_dc_t> &s
 
                 // Turn tmp_c_vec (a vector of variants) into a variant
                 // of vectors, and insert the result.
-                it->second.push_back(taylor_c_vv_transpose(tmp_c_vec));
+                std::get<0>(it->second).push_back(taylor_c_vv_transpose(tmp_c_vec));
             }
         }
 
@@ -1153,31 +1166,34 @@ auto taylor_build_function_maps(llvm_state &s, const std::vector<taylor_dc_t> &s
         retval.emplace_back();
         auto &a_map = retval.back();
 
-        for (const auto &[func, vv] : tmp_map_transpose) {
+        for (auto &[func, tup] : tmp_map_transpose) {
+            const auto &vv = std::get<0>(tup);
+
             assert(!vv.empty()); // LCOV_EXCL_LINE
 
+            // Compute the number of calls for this function.
+            const auto ncalls
+                = std::visit([](const auto &x) { return boost::numeric_cast<std::uint32_t>(x.size()); }, vv[0]);
+            assert(ncalls > 0u); // LCOV_EXCL_LINE
+
             // Add the function.
-            const auto [it, ins_status] = a_map.try_emplace(func);
+            const auto [it, ins_status] = a_map.try_emplace(func, ncalls, gen_vec_t{}, std::move(std::get<1>(tup)));
             assert(ins_status); // LCOV_EXCL_LINE
 
-            // Set the number of calls for this function.
-            it->second.first
-                = std::visit([](const auto &x) { return boost::numeric_cast<std::uint32_t>(x.size()); }, vv[0]);
-            assert(it->second.first > 0u); // LCOV_EXCL_LINE
-
             // Create the g functions for each argument.
             for (const auto &v : vv) {
-                it->second.second.push_back(std::visit(
-                    [&s](const auto &x) {
-                        using type = detail::uncvref_t<decltype(x)>;
+                std::get<1>(it->second)
+                    .push_back(std::visit(
+                        [&s](const auto &x) {
+                            using type = detail::uncvref_t<decltype(x)>;
 
-                        if constexpr (std::is_same_v<type, std::vector<std::uint32_t>>) {
-                            return taylor_c_make_arg_gen_vidx(s, x);
-                        } else {
-                            return taylor_c_make_arg_gen_vc<T>(s, x);
-                        }
-                    },
-                    v));
+                            if constexpr (std::is_same_v<type, std::vector<std::uint32_t>>) {
+                                return taylor_c_make_arg_gen_vidx(s, x);
+                            } else {
+                                return taylor_c_make_arg_gen_vc<T>(s, x);
+                            }
+                        },
+                        v));
             }
         }
     }
@@ -1193,7 +1209,7 @@ auto taylor_build_function_maps(llvm_state &s, const std::vector<taylor_dc_t> &s
             fm_bd.emplace_back();
 
             for (const auto &p : m) {
-                fm_bd.back().push_back(p.second.first);
+                fm_bd.back().push_back(std::get<0>(p.second));
             }
         }
 
@@ -1341,10 +1357,10 @@ llvm::Value *taylor_compute_jet_compact_mode(llvm_state &s, llvm::Value *order0,
                 const auto &func = p.first;
 
                 // The number of func calls.
-                const auto ncalls = p.second.first;
+                const auto ncalls = std::get<0>(p.second);
 
                 // The generators for the arguments of func.
-                const auto &gens = p.second.second;
+                const auto &gens = std::get<1>(p.second);
 
                 // Fetch the current insertion block.
                 auto *orig_bb = builder.GetInsertBlock();
@@ -1434,35 +1450,167 @@ llvm::Value *taylor_compute_jet_compact_mode(llvm_state &s, llvm::Value *order0,
     // func is the LLVM function for the computation of the Taylor derivative in the block,
     // ncalls the number of times it must be called, gens the generators for the
     // function arguments and cur_order the order of the derivative.
-    auto block_diff = [&](const auto &func, const auto &ncalls, const auto &gens, llvm::Value *cur_order) {
+    // ex is the expression used to create the vector diff function when batch_size is 1.
+    auto block_diff = [&](const auto &func, const auto &ncalls, const auto &gens, const expression &ex,
+                          llvm::Value *cur_order) {
         // LCOV_EXCL_START
         assert(ncalls > 0u);
         assert(!gens.empty());
         assert(std::all_of(gens.begin(), gens.end(), [](const auto &f) { return static_cast<bool>(f); }));
         // LCOV_EXCL_STOP
 
-        // Loop over the number of calls.
-        llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(ncalls), [&](llvm::Value *cur_call_idx) {
-            // Create the u variable index from the first generator.
-            auto u_idx = gens[0](cur_call_idx);
+        if (batch_size == 1u) {
+            // The batch size is 1: we can implement the vectorized codepath.
+
+            const auto barfo_size = 4u;
+
+            const auto nregs = ncalls / barfo_size, rem = ncalls % barfo_size;
+
+            llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(nregs), [&](llvm::Value *idx) {
+                // Turn the sets of arguments returned by the generators into a single set of vector arguments.
+                std::vector<llvm::Value *> gen_vec_args, tmp;
+
+                for (const auto &gen : gens) {
+                    // Generate the arguments into tmp.
+                    tmp.clear();
+
+                    for (std::uint32_t i = 0; i < barfo_size; ++i) {
+                        tmp.push_back(gen(builder.CreateAdd(builder.CreateMul(idx, builder.getInt32(barfo_size)),
+                                                            builder.getInt32(i))));
+                    }
+
+                    // Transform tmp into a vector and add it
+                    // to gen_vec_args.
+                    // NOTE: within this loop tmp always contains
+                    // exactly barfo_size elements, one for each of
+                    // the calls grouped together in this iteration.
+                    gen_vec_args.push_back(scalars_to_vector(builder, tmp));
+                }
+
+                // Create the vector diff function.
+                auto *vfunc = taylor_c_diff_func<T>(s, ex, n_uvars, 1, high_accuracy, barfo_size);
+
+                // Initialise the arguments with which vfunc must be called. The following
+                // initial arguments are always present:
+                // - current Taylor order,
+                // - u indices of the variables,
+                // - array of derivatives,
+                // - pointer to the param values,
+                // - pointer to the time value(s).
+                std::vector<llvm::Value *> args{cur_order, gen_vec_args[0], diff_arr, par_ptr, time_ptr};
+
+                // Append the other arguments.
+                for (decltype(gens.size()) i = 1; i < gens.size(); ++i) {
+                    args.push_back(gen_vec_args[i]);
+                }
+
+                // Calculate the derivative and store the result.
+                taylor_c_store_diff(s, diff_arr, n_uvars, cur_order, gen_vec_args[0], builder.CreateCall(vfunc, args));
+            });
 
-            // Initialise the vector of arguments with which func must be called. The following
+            if (rem != 0u) {
+                std::vector<llvm::Value *> gen_vec_args, tmp;
+
+                for (const auto &gen : gens) {
+                    // Generate the arguments into tmp.
+                    tmp.clear();
+
+                    for (std::uint32_t i = 0; i < rem; ++i) {
+                        tmp.push_back(gen(builder.getInt32(nregs * barfo_size + i)));
+                    }
+
+                    // Transform tmp into a vector and add it
+                    // to gen_vec_args.
+                    // NOTE: here tmp contains rem elements. If rem is 1,
+                    // then scalars_to_vector() will just return the
+                    // first element of tmp.
+                    gen_vec_args.push_back(scalars_to_vector(builder, tmp));
+                }
+
+                // Create the vector diff function.
+                auto *vfunc = taylor_c_diff_func<T>(s, ex, n_uvars, 1, high_accuracy, rem);
+
+                // Initialise the arguments with which vfunc must be called. The following
+                // initial arguments are always present:
+                // - current Taylor order,
+                // - u indices of the variables,
+                // - array of derivatives,
+                // - pointer to the param values,
+                // - pointer to the time value(s).
+                std::vector<llvm::Value *> args{cur_order, gen_vec_args[0], diff_arr, par_ptr, time_ptr};
+
+                // Append the other arguments.
+                for (decltype(gens.size()) i = 1; i < gens.size(); ++i) {
+                    args.push_back(gen_vec_args[i]);
+                }
+
+                // Calculate the derivative and store the result.
+                taylor_c_store_diff(s, diff_arr, n_uvars, cur_order, gen_vec_args[0], builder.CreateCall(vfunc, args));
+            }
+
+#if 0
+            // Turn the sets of arguments returned by the generators into a single set of vector arguments.
+            std::vector<llvm::Value *> gen_vec_args, tmp;
+
+            for (const auto &gen : gens) {
+                // Generate the arguments into tmp.
+                tmp.clear();
+
+                for (std::uint32_t i = 0; i < ncalls; ++i) {
+                    tmp.push_back(gen(builder.getInt32(i)));
+                }
+
+                // Transform tmp into a vector and add it
+                // to gen_vec_args.
+                // NOTE: if ncalls is 1, then scalars_to_vector()
+                // will just return the first element of tmp.
+                gen_vec_args.push_back(scalars_to_vector(builder, tmp));
+            }
+
+            // Create the vector diff function.
+            auto *vfunc = taylor_c_diff_func<T>(s, ex, n_uvars, 1, high_accuracy, ncalls);
+
+            // Initialise the arguments with which vfunc must be called. The following
             // initial arguments are always present:
             // - current Taylor order,
-            // - u index of the variable,
+            // - u indices of the variables,
             // - array of derivatives,
             // - pointer to the param values,
             // - pointer to the time value(s).
-            std::vector<llvm::Value *> args{cur_order, u_idx, diff_arr, par_ptr, time_ptr};
+            std::vector<llvm::Value *> args{cur_order, gen_vec_args[0], diff_arr, par_ptr, time_ptr};
 
-            // Create the other arguments via the generators.
+            // Append the other arguments.
             for (decltype(gens.size()) i = 1; i < gens.size(); ++i) {
-                args.push_back(gens[i](cur_call_idx));
+                args.push_back(gen_vec_args[i]);
             }
 
             // Calculate the derivative and store the result.
-            taylor_c_store_diff(s, diff_arr, n_uvars, cur_order, u_idx, builder.CreateCall(func, args));
-        });
+            taylor_c_store_diff(s, diff_arr, n_uvars, cur_order, gen_vec_args[0], builder.CreateCall(vfunc, args));
+#endif
+        } else {
+            // Loop over the number of calls.
+            llvm_loop_u32(s, builder.getInt32(0), builder.getInt32(ncalls), [&](llvm::Value *cur_call_idx) {
+                // Create the u variable index from the first generator.
+                auto u_idx = gens[0](cur_call_idx);
+
+                // Initialise the arguments with which func must be called. The following
+                // initial arguments are always present:
+                // - current Taylor order,
+                // - u index of the variable,
+                // - array of derivatives,
+                // - pointer to the param values,
+                // - pointer to the time value(s).
+                std::vector<llvm::Value *> args{cur_order, u_idx, diff_arr, par_ptr, time_ptr};
+
+                // Create the other arguments via the generators.
+                for (decltype(gens.size()) i = 1; i < gens.size(); ++i) {
+                    args.push_back(gens[i](cur_call_idx));
+                }
+
+                // Calculate the derivative and store the result.
+                taylor_c_store_diff(s, diff_arr, n_uvars, cur_order, u_idx, builder.CreateCall(func, args));
+            });
+        }
     };
 
     // Helper to compute concurrently all the derivatives
@@ -1521,7 +1669,7 @@ llvm::Value *taylor_compute_jet_compact_mode(llvm_state &s, llvm::Value *order0,
             // of order cur_order serially.
             for (const auto &map : f_maps) {
                 for (const auto &p : map) {
-                    block_diff(p.first, p.second.first, p.second.second, cur_order);
+                    block_diff(p.first, std::get<0>(p.second), std::get<1>(p.second), std::get<2>(p.second), cur_order);
                 }
             }
         }
@@ -1571,7 +1719,7 @@ llvm::Value *taylor_compute_jet_compact_mode(llvm_state &s, llvm::Value *order0,
                 // that each block in a segment processes the derivatives
                 // of exactly ncalls u variables.
                 for (const auto &p : f_maps[i]) {
-                    const auto ncalls = p.second.first;
+                    const auto ncalls = std::get<0>(p.second);
                     cur_start_u_idx += ncalls;
                 }
             }
@@ -1584,9 +1732,9 @@ llvm::Value *taylor_compute_jet_compact_mode(llvm_state &s, llvm::Value *order0,
 
                 // Compute the derivatives of all the blocks in the segment.
                 for (const auto &p : map) {
-                    const auto ncalls = p.second.first;
+                    const auto ncalls = std::get<0>(p.second);
 
-                    block_diff(p.first, ncalls, p.second.second, builder.getInt32(order));
+                    block_diff(p.first, ncalls, std::get<1>(p.second), std::get<2>(p.second), builder.getInt32(order));
 
                     // Update cur_start_u_idx taking advantage of the fact
                     // that each block in a segment processes the derivatives
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 4b1e2e3db..70e836ee3 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -126,3 +126,4 @@ ADD_HEYOKA_TESTCASE(c_output)
 ADD_HEYOKA_TESTCASE(ensemble_propagate)
 ADD_HEYOKA_TESTCASE(parallel_mode)
 ADD_HEYOKA_TESTCASE(opt_checks)
+ADD_HEYOKA_TESTCASE(tmp_vec_mode)
diff --git a/test/tmp_vec_mode.cpp b/test/tmp_vec_mode.cpp
new file mode 100644
index 000000000..c70529bb0
--- /dev/null
+++ b/test/tmp_vec_mode.cpp
@@ -0,0 +1,35 @@
+// Copyright 2020, 2021, 2022 Francesco Biscani (bluescarni@gmail.com), Dario Izzo (dario.izzo@gmail.com)
+//
+// This file is part of the heyoka library.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <heyoka/expression.hpp>
+#include <heyoka/taylor.hpp>
+
+#include "catch.hpp"
+#include "test_utils.hpp"
+
+using namespace heyoka;
+
+TEST_CASE("foo")
+{
+    auto [x, y, z, t] = make_vars("x", "y", "z", "t");
+
+    taylor_adaptive<double> ta{{prime(x) = x + y, prime(y) = y + z, prime(z) = z + t, prime(t) = t + x},
+                               {1., 2., 3., 4.},
+                               kw::compact_mode = true,
+                               kw::opt_level = 3u};
+
+    std::cout << ta.get_llvm_state().get_ir() << '\n';
+
+    for (const auto &[ex, _] : ta.get_decomposition()) {
+        std::cout << ex << '\n';
+    }
+
+    ta.propagate_until(5.);
+
+    std::cout << ta << '\n';
+}