dotnet · tannergooding · Aug 13, 2024 · Aug 8, 2024 · Aug 9, 2024 · Aug 9, 2024
diff --git a/...nsors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs b/...nsors/src/System/Numerics/Tensors/netcore/Common/TensorPrimitives.IAggregationOperator.cs
@@ -141,9 +141,12 @@ static T Vectorized128(ref T xRef, nuint remainder)
 
                         // We need to the ensure the underlying data can be aligned and only align
                         // it if it can. It is possible we have an unaligned ref, in which case we
-                        // can never achieve the required SIMD alignment.
+                        // can never achieve the required SIMD alignment. This cannot be done for
+                        // float or double since that changes how results compound together.
 
-                        bool canAlign = ((nuint)xPtr % (nuint)sizeof(T)) == 0;
+                        bool canAlign = (typeof(T) != typeof(float)) &&
+                                        (typeof(T) != typeof(double)) &&
+                                        ((nuint)xPtr % (nuint)sizeof(T)) == 0;
 
                         if (canAlign)
                         {
@@ -156,11 +159,20 @@ static T Vectorized128(ref T xRef, nuint remainder)
                             misalignment = ((uint)sizeof(Vector128<T>) - ((nuint)xPtr % (uint)sizeof(Vector128<T>))) / (uint)sizeof(T);
 
                             xPtr += misalignment;
-
                             Debug.Assert(((nuint)xPtr % (uint)sizeof(Vector128<T>)) == 0);
 
                             remainder -= misalignment;
                         }
+                        else
+                        {
+                            // We can't align, but this also means the full vector of data from beg is
+                            // processed, so skip past it to ensure those elements are not double
+                            // processed and included in the aggregate twice.
+
+                            misalignment = (uint)Vector128<T>.Count;
+                            xPtr += misalignment;
+                            remainder -= misalignment;
+                        }
 
                         Vector128<T> vector1;
                         Vector128<T> vector2;
@@ -310,9 +322,12 @@ static T Vectorized256(ref T xRef, nuint remainder)
 
                         // We need to the ensure the underlying data can be aligned and only align
                         // it if it can. It is possible we have an unaligned ref, in which case we
-                        // can never achieve the required SIMD alignment.
+                        // can never achieve the required SIMD alignment. This cannot be done for
+                        // float or double since that changes how results compound together.
 
-                        bool canAlign = ((nuint)xPtr % (nuint)sizeof(T)) == 0;
+                        bool canAlign = (typeof(T) != typeof(float)) &&
+                                        (typeof(T) != typeof(double)) &&
+                                        ((nuint)xPtr % (nuint)sizeof(T)) == 0;
 
                         if (canAlign)
                         {
@@ -330,6 +345,16 @@ static T Vectorized256(ref T xRef, nuint remainder)
 
                             remainder -= misalignment;
                         }
+                        else
+                        {
+                            // We can't align, but this also means the full vector of data from beg is
+                            // processed, so skip past it to ensure those elements are not double
+                            // processed and included in the aggregate twice.
+
+                            misalignment = (uint)Vector256<T>.Count;
+                            xPtr += misalignment;
+                            remainder -= misalignment;
+                        }
 
                         Vector256<T> vector1;
                         Vector256<T> vector2;
@@ -479,9 +504,12 @@ static T Vectorized512(ref T xRef, nuint remainder)
 
                         // We need to the ensure the underlying data can be aligned and only align
                         // it if it can. It is possible we have an unaligned ref, in which case we
-                        // can never achieve the required SIMD alignment.
+                        // can never achieve the required SIMD alignment. This cannot be done for
+                        // float or double since that changes how results compound together.
 
-                        bool canAlign = ((nuint)xPtr % (nuint)sizeof(T)) == 0;
+                        bool canAlign = (typeof(T) != typeof(float)) &&
+                                        (typeof(T) != typeof(double)) &&
+                                        ((nuint)xPtr % (nuint)sizeof(T)) == 0;
 
                         if (canAlign)
                         {
@@ -499,6 +527,16 @@ static T Vectorized512(ref T xRef, nuint remainder)
 
                             remainder -= misalignment;
                         }
+                        else
+                        {
+                            // We can't align, but this also means the full vector of data from beg is
+                            // processed, so skip past it to ensure those elements are not double
+                            // processed and included in the aggregate twice.
+
+                            misalignment = (uint)Vector512<T>.Count;
+                            xPtr += misalignment;
+                            remainder -= misalignment;
+                        }
 
                         Vector512<T> vector1;
                         Vector512<T> vector2;
@@ -1227,9 +1265,12 @@ static T Vectorized128(ref T xRef, ref T yRef, nuint remainder)
 
                         // We need to the ensure the underlying data can be aligned and only align
                         // it if it can. It is possible we have an unaligned ref, in which case we
-                        // can never achieve the required SIMD alignment.
+                        // can never achieve the required SIMD alignment. This cannot be done for
+                        // float or double since that changes how results compound together.
 
-                        bool canAlign = ((nuint)xPtr % (nuint)sizeof(T)) == 0;
+                        bool canAlign = (typeof(T) != typeof(float)) &&
+                                        (typeof(T) != typeof(double)) &&
+                                        ((nuint)xPtr % (nuint)sizeof(T)) == 0;
 
                         if (canAlign)
                         {
@@ -1248,6 +1289,19 @@ static T Vectorized128(ref T xRef, ref T yRef, nuint remainder)
 
                             remainder -= misalignment;
                         }
+                        else
+                        {
+                            // We can't align, but this also means the full vector of data from beg is
+                            // processed, so skip past it to ensure those elements are not double
+                            // processed and included in the aggregate twice.
+
+                            misalignment = (uint)Vector128<T>.Count;
+
+                            xPtr += misalignment;
+                            yPtr += misalignment;
+
+                            remainder -= misalignment;
+                        }
 
                         Vector128<T> vector1;
                         Vector128<T> vector2;
@@ -1418,9 +1472,12 @@ static T Vectorized256(ref T xRef, ref T yRef, nuint remainder)
 
                         // We need to the ensure the underlying data can be aligned and only align
                         // it if it can. It is possible we have an unaligned ref, in which case we
-                        // can never achieve the required SIMD alignment.
+                        // can never achieve the required SIMD alignment. This cannot be done for
+                        // float or double since that changes how results compound together.
 
-                        bool canAlign = ((nuint)xPtr % (nuint)sizeof(T)) == 0;
+                        bool canAlign = (typeof(T) != typeof(float)) &&
+                                        (typeof(T) != typeof(double)) &&
+                                        ((nuint)xPtr % (nuint)sizeof(T)) == 0;
 
                         if (canAlign)
                         {
@@ -1439,6 +1496,19 @@ static T Vectorized256(ref T xRef, ref T yRef, nuint remainder)
 
                             remainder -= misalignment;
                         }
+                        else
+                        {
+                            // We can't align, but this also means the full vector of data from beg is
+                            // processed, so skip past it to ensure those elements are not double
+                            // processed and included in the aggregate twice.
+
+                            misalignment = (uint)Vector256<T>.Count;
+
+                            xPtr += misalignment;
+                            yPtr += misalignment;
+
+                            remainder -= misalignment;
+                        }
 
                         Vector256<T> vector1;
                         Vector256<T> vector2;
@@ -1609,9 +1679,12 @@ static T Vectorized512(ref T xRef, ref T yRef, nuint remainder)
 
                         // We need to the ensure the underlying data can be aligned and only align
                         // it if it can. It is possible we have an unaligned ref, in which case we
-                        // can never achieve the required SIMD alignment.
+                        // can never achieve the required SIMD alignment. This cannot be done for
+                        // float or double since that changes how results compound together.
 
-                        bool canAlign = ((nuint)xPtr % (nuint)sizeof(T)) == 0;
+                        bool canAlign = (typeof(T) != typeof(float)) &&
+                                        (typeof(T) != typeof(double)) &&
+                                        ((nuint)xPtr % (nuint)sizeof(T)) == 0;
 
                         if (canAlign)
                         {
@@ -1630,6 +1703,19 @@ static T Vectorized512(ref T xRef, ref T yRef, nuint remainder)
 
                             remainder -= misalignment;
                         }
+                        else
+                        {
+                            // We can't align, but this also means the full vector of data from beg is
+                            // processed, so skip past it to ensure those elements are not double
+                            // processed and included in the aggregate twice.
+
+                            misalignment = (uint)Vector512<T>.Count;
+
+                            xPtr += misalignment;
+                            yPtr += misalignment;
+
+                            remainder -= misalignment;
+                        }
 
                         Vector512<T> vector1;
                         Vector512<T> vector2;

diff --git a/...cs.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs b/...cs.Tensors/src/System/Numerics/Tensors/netstandard/TensorPrimitives.Single.netstandard.cs
@@ -175,28 +175,15 @@ static float Vectorized(ref float xRef, nuint remainder, TTransformOperator tran
                     {
                         float* xPtr = px;
 
-                        // We need to the ensure the underlying data can be aligned and only align
-                        // it if it can. It is possible we have an unaligned ref, in which case we
-                        // can never achieve the required SIMD alignment.
+                        // Unlike many other vectorization algorithms, we cannot align for aggregation
+                        // because that changes how results compound together and can cause a significant
+                        // difference in the output. This also means the full vector of data from beg is
+                        // processed, so skip past it to ensure those elements are not double processed
+                        // and included in the aggregate twice.
 
-                        bool canAlign = ((nuint)(xPtr) % sizeof(float)) == 0;
-
-                        if (canAlign)
-                        {
-                            // Compute by how many elements we're misaligned and adjust the pointers accordingly
-                            //
-                            // Noting that we are only actually aligning dPtr. This is because unaligned stores
-                            // are more expensive than unaligned loads and aligning both is significantly more
-                            // complex.
-
-                            misalignment = ((uint)(sizeof(Vector<float>)) - ((nuint)(xPtr) % (uint)(sizeof(Vector<float>)))) / sizeof(float);
-
-                            xPtr += misalignment;
-
-                            Debug.Assert(((nuint)(xPtr) % (uint)(sizeof(Vector<float>))) == 0);
-
-                            remainder -= misalignment;
-                        }
+                        misalignment = (uint)Vector<float>.Count;
+                        xPtr += misalignment;
+                        remainder -= misalignment;
 
                         Vector<float> vector1;
                         Vector<float> vector2;
@@ -480,29 +467,18 @@ static float Vectorized(ref float xRef, ref float yRef, nuint remainder, TBinary
                         float* xPtr = px;
                         float* yPtr = py;
 
-                        // We need to the ensure the underlying data can be aligned and only align
-                        // it if it can. It is possible we have an unaligned ref, in which case we
-                        // can never achieve the required SIMD alignment.
-
-                        bool canAlign = ((nuint)(xPtr) % sizeof(float)) == 0;
-
-                        if (canAlign)
-                        {
-                            // Compute by how many elements we're misaligned and adjust the pointers accordingly
-                            //
-                            // Noting that we are only actually aligning dPtr. This is because unaligned stores
-                            // are more expensive than unaligned loads and aligning both is significantly more
-                            // complex.
-
-                            misalignment = ((uint)(sizeof(Vector<float>)) - ((nuint)(xPtr) % (uint)(sizeof(Vector<float>)))) / sizeof(float);
+                        // Unlike many other vectorization algorithms, we cannot align for aggregation
+                        // because that changes how results compound together and can cause a significant
+                        // difference in the output. This also means the full vector of data from beg is
+                        // processed, so skip past it to ensure those elements are not double processed
+                        // and included in the aggregate twice.
 
-                            xPtr += misalignment;
-                            yPtr += misalignment;
+                        misalignment = (uint)Vector<float>.Count;
 
-                            Debug.Assert(((nuint)(xPtr) % (uint)(sizeof(Vector<float>))) == 0);
+                        xPtr += misalignment;
+                        yPtr += misalignment;
 
-                            remainder -= misalignment;
-                        }
+                        remainder -= misalignment;
 
                         Vector<float> vector1;
                         Vector<float> vector2;