Avoid ci errors #2

Closed · wants to merge 4 commits
2 changes: 1 addition & 1 deletion dfdx-core/Cargo.toml

@@ -35,7 +35,7 @@ num-traits = { workspace = true }
 safetensors = { workspace = true, optional = true }
 memmap2 = { workspace = true, optional = true }
 half = { version = "2.3.1", optional = true, features = ["num-traits", "rand_distr"] }
-gemm = { version = "0.16.14", default-features = false, optional = true, features = ["rayon"] }
+gemm = { version = "0.17.1", default-features = false, optional = true, features = ["rayon"] }
 rayon = { version = "1.7.0", optional = true }
 libm = { workspace = true }
 wgpu = { version = "0.18.0", features = ["glsl", "spirv"], optional = true }
2 changes: 1 addition & 1 deletion dfdx-core/src/tensor/gradients.rs

@@ -153,7 +153,7 @@ impl<E, D: Storage<E>> Gradients<E, D> {
     #[inline]
     pub(crate) fn many_and_ref<L: Shape, R: Shape>(
         &mut self,
-        ls: &Vec<impl Tensorlike<L, E, D>>,
+        ls: &[impl Tensorlike<L, E, D>],
         r: &impl Tensorlike<R, E, D>,
     ) -> (Vec<&mut D::Vec>, &D::Vec) {
         for i in 0..ls.len() {
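Changing `ls` from `&Vec<impl Tensorlike<L, E, D>>` to `&[impl Tensorlike<L, E, D>]` follows Clippy's `ptr_arg` guidance: a slice parameter drops a needless level of indirection and accepts any contiguous sequence, while existing call sites that pass `&vec` keep compiling via deref coercion. A minimal sketch of the same refactor with hypothetical names (`Item`, `total`), not dfdx code:

    // Before: only a `&Vec<Item>` could be passed.
    // fn total(items: &Vec<Item>) -> usize { ... }

    struct Item(usize);

    // After: any contiguous sequence works.
    fn total(items: &[Item]) -> usize {
        items.iter().map(|item| item.0).sum()
    }

    fn main() {
        let v = vec![Item(1), Item(2)];
        assert_eq!(total(&v), 3);         // &Vec<Item> coerces to &[Item]
        assert_eq!(total(&v[..1]), 1);    // sub-slices now work too
        assert_eq!(total(&[Item(4)]), 4); // plain arrays as well
    }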
50 changes: 39 additions & 11 deletions dfdx-core/src/tensor_ops/utilities/device.rs

@@ -114,33 +114,61 @@ pub trait Device<E: Dtype>:
     + crate::tensor_ops::axpy::AxpyKernel<E>

-    // conv1d
-    + super::super::conv1d::Conv1DKernel<E>
+    + NonCudnnCuda<E>
 {
 }

+#[cfg(feature = "cudnn")]
+pub trait NonCudnnCuda<E: Dtype> {}
+
+#[cfg(not(feature = "cudnn"))]
+pub trait NonCudnnCuda<E: Dtype>:
+    // conv1d
+    super::super::conv1d::Conv1DKernel<E>
+{
+}
+
 #[cfg(feature = "f16")]
-impl Device<f16> for crate::tensor::Cpu {}
-#[cfg(feature = "f16")]
-impl Device<AMP<f16>> for crate::tensor::Cpu {}
+mod f16_ {
+    use super::*;
+    impl Device<f16> for crate::tensor::Cpu {}
+    impl NonCudnnCuda<f16> for crate::tensor::Cpu {}
+    impl Device<AMP<f16>> for crate::tensor::Cpu {}
+    impl NonCudnnCuda<AMP<f16>> for crate::tensor::Cpu {}
+}
 impl Device<f32> for crate::tensor::Cpu {}
+impl NonCudnnCuda<f32> for crate::tensor::Cpu {}
 impl Device<f64> for crate::tensor::Cpu {}
+impl NonCudnnCuda<f64> for crate::tensor::Cpu {}

 #[cfg(all(feature = "cuda", feature = "f16"))]
-impl Device<f16> for crate::tensor::Cuda {}
-#[cfg(all(feature = "cuda", feature = "f16"))]
-impl Device<AMP<f16>> for crate::tensor::Cuda {}
-#[cfg(feature = "cuda")]
-impl Device<f32> for crate::tensor::Cuda {}
+mod cuda_f16 {
+    use super::*;
+    impl Device<f16> for crate::tensor::Cuda {}
+    impl NonCudnnCuda<f16> for crate::tensor::Cuda {}
+    impl Device<AMP<f16>> for crate::tensor::Cuda {}
+    impl NonCudnnCuda<AMP<f16>> for crate::tensor::Cuda {}
+}
 #[cfg(feature = "cuda")]
-impl Device<f64> for crate::tensor::Cuda {}
+mod cuda {
+    use super::*;
+    impl Device<f32> for crate::tensor::Cuda {}
+    impl NonCudnnCuda<f32> for crate::tensor::Cuda {}
+    impl Device<f64> for crate::tensor::Cuda {}
+    impl NonCudnnCuda<f64> for crate::tensor::Cuda {}
+}

 // TODO: How can we implement this for f16 when WGSL doesn't support f16 yet?
 // #[cfg(all(feature = "webgpu", feature = "f16"))]
 // impl Device<f16> for crate::tensor::Webgpu {}
 // #[cfg(all(feature = "webgpu", feature = "f16"))]
 // impl Device<AMP<f16>> for crate::tensor::Webgpu {}
 #[cfg(feature = "webgpu")]
-impl Device<f32> for crate::tensor::Webgpu {}
+mod webgpu {
+    use super::*;
+    impl Device<f32> for crate::tensor::Webgpu {}
+    impl NonCudnnCuda<f32> for crate::tensor::Webgpu {}
+}

 // TODO: How can we implement this for f64 when WGSL doesn't support f64 yet?
 // #[cfg(feature = "webgpu")]
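The device.rs change does two things: it moves the `Conv1DKernel` bound out of `Device` into a helper trait, `NonCudnnCuda`, which is empty when the `cudnn` feature is enabled and carries the conv1d bound otherwise; and it groups the per-dtype impls into small `mod` blocks so a single `#[cfg]` attribute gates each group instead of being repeated on every impl. The PR doesn't state the motivation beyond avoiding CI errors, but the structure implies the conv1d bound must not be required on cudnn builds. A self-contained sketch of both patterns, using stand-in names (`Kernel`, `Backend`, `MaybeKernel`, `Cpu`, and the feature name "special"), not the dfdx traits themselves:

    // Stand-in traits; every name here is illustrative, not a dfdx item.
    pub trait Kernel {}

    // The main trait always names the helper trait as a supertrait...
    pub trait Backend: MaybeKernel {}

    // ...but the helper only demands `Kernel` when the feature is off,
    // so enabling "special" drops the requirement without touching `Backend`.
    #[cfg(feature = "special")]
    pub trait MaybeKernel {}

    #[cfg(not(feature = "special"))]
    pub trait MaybeKernel: Kernel {}

    pub struct Cpu;
    impl Kernel for Cpu {}

    // Grouping impls in a module mirrors the `mod f16_` / `mod cuda_f16`
    // blocks in the diff: one #[cfg] on the module gates every impl inside,
    // rather than one attribute per impl.
    mod cpu_impls {
        use super::*;
        impl MaybeKernel for Cpu {}
        impl Backend for Cpu {}
    }

    fn main() {
        fn takes_backend<B: Backend>(_: &B) {}
        takes_backend(&Cpu); // compiles with or without the feature enabled
    }

Either way the feature flag resolves, downstream code keeps a single `Device` (here `Backend`) definition; only the helper trait's bounds change between builds.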