neon base64: vectorize with vector factor 16

The performance with this change is slightly worse than VF8: the code generated by LLVM contains too many mov's instead of byte vzip and vuzp. GCC is also generating too many movs and dups which make the code slower than when compiled with LLVM. Experiments from an A72 firefly cpu freq set to 1.2GHz: $ sudo cat /sys/devices/system/cpu/cpu4/cpufreq/cpuinfo_cur_freq 1200000 Before the patch with trunk LLVM as of today: ------------------------------------------------------------------------- Benchmark Time CPU Iterations ------------------------------------------------------------------------- neon_base64_decode_lena 214964 ns 214955 ns 3256 neon_base64_decode_peppers 19452 ns 19452 ns 35989 neon_base64_decode_mandril 502020 ns 502002 ns 1394 neon_base64_decode_moby_dick 2290 ns 2290 ns 305775 neon_base64_decode_googlelogo 4820 ns 4820 ns 145098 neon_base64_decode_bingsocialicon 2778 ns 2778 ns 251984 neon_base64_decode_all 748928 ns 748916 ns 934 with the patch: ------------------------------------------------------------------------- Benchmark Time CPU Iterations ------------------------------------------------------------------------- neon_base64_decode_lena 316154 ns 316148 ns 2214 neon_base64_decode_peppers 28442 ns 28442 ns 24608 neon_base64_decode_mandril 738890 ns 738872 ns 947 neon_base64_decode_moby_dick 3362 ns 3362 ns 208250 neon_base64_decode_googlelogo 7056 ns 7056 ns 99171 neon_base64_decode_bingsocialicon 4087 ns 4087 ns 171265 neon_base64_decode_all 1097039 ns 1097017 ns 638
lemire · Feb 20, 2018 · fc7c6b5 · fc7c6b5
1 parent cd73549
commit fc7c6b5
Showing 1 changed file with 46 additions and 36 deletions.
diff --git a/src/neonbase64.cc b/src/neonbase64.cc
@@ -1,6 +1,5 @@
 #include <arm_neon.h>
-#include <stdlib.h>
-#include <string.h>
+#include <cstddef>
 #define MODP_B64_ERROR ((size_t)-1)
 
 size_t chromium_base64_decode(char* dest, const char* src, size_t len);
@@ -19,21 +18,19 @@ size_t chromium_base64_decode(char* dest, const char* src, size_t len);
 
 int neon_base64_decode(char *out, const char *src, size_t srclen) {
   char *out_orig = out;
-  const uint8x8_t zero8 = vdup_n_u8(0);
-  const uint16x8_t zero16 = vdupq_n_u16(0);
-  const uint32x4_t zero32 = vdupq_n_u32(0);
   const uint8x8x2_t lut_lo = {0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
                               0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A};
   const uint8x8x2_t lut_hi = {0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
                               0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10};
   const uint8x8x2_t lut_roll = {0, 16, 19, 4, 191, 191, 185, 185,
                                 0, 0,  0,  0, 0,   0,   0,   0};
-  const uint8x8_t cst = {0x40, 0x1, 0x40, 0x1, 0x40, 0x1, 0x40, 0x1};
-  const uint16x4_t cst1 = {0x1000, 0x1, 0x1000, 0x1};
-  const uint8x8_t k2f = vdup_n_u8(0x2f);
-  const uint8x8_t kf = vdup_n_u8(0xf);
+  const uint8x8_t zero8 = vdup_n_u8(0);
+  const uint16x4_t zero16 = vdup_n_u16(0);
+  const uint8x16_t k2f = vdupq_n_u8(0x2f);
+  const uint8x16_t kf = vdupq_n_u8(0xf);
+  const uint8x8_t cst = {0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40};
+  const uint16x4_t cst1 = {0x1000, 0x1000, 0x1000, 0x1000};
 
-  const uint8x8_t shuf = {2, 1, 0, 6, 5, 4, 255, 255};
   const uint8x8_t shuf0 = {2, 1, 0, 6, 5, 4, 2 + 8, 1 + 8};
   const uint8x8_t shuf1 = {0 + 8,  6 + 8,  5 + 8,  4 + 8,
                            2 + 16, 1 + 16, 0 + 16, 6 + 16};
@@ -42,50 +39,63 @@ int neon_base64_decode(char *out, const char *src, size_t srclen) {
 
   uint8x8x4_t pack;
   uint8x8_t res[3];
-  uint8x8_t str[4];
+  uint8x16_t str[2];
 
-  while (srclen >= 8*4) {
-    __builtin_memcpy(str, src, 8*4);
-    for (int i = 0; i < 4; i++) {
-      uint8x8_t in = str[i];
-      uint8x8_t lo_nibbles = vand_u8(in, kf);
-      uint8x8_t lo = vtbl2_u8(lut_lo, lo_nibbles);
+  while (srclen >= 8 * 4) {
+    __builtin_memcpy(str, src, 8 * 4);
+    for (int i = 0; i < 2; i++) {
+      uint8x16_t in = str[i];
+      uint8x16_t lo_nibbles = vandq_u8(in, kf);
+      uint8x16_t hi_nibbles = vshrq_n_u8(in, 4);
 
-      uint8x8_t hi_nibbles = vsra_n_u8(zero8, in, 4);
-      uint8x8_t hi = vtbl2_u8(lut_hi, hi_nibbles);
+      uint8x8_t lo1 = vtbl2_u8(lut_lo, vget_low_u8(lo_nibbles));
+      uint8x8_t hi1 = vtbl2_u8(lut_hi, vget_low_u8(hi_nibbles));
+      if ((uint64_t)vtst_u8(lo1, hi1))
+        goto break_end;
 
-      if ((uint64_t)vtst_u8(lo, hi))
+      uint8x8_t lo2 = vtbl2_u8(lut_lo, vget_high_u8(lo_nibbles));
+      uint8x8_t hi2 = vtbl2_u8(lut_hi, vget_high_u8(hi_nibbles));
+      if ((uint64_t)vtst_u8(lo2, hi2))
         goto break_end;
 
-      uint8x8_t eq_2F = vceq_u8(in, k2f);
-      uint8x8_t roll = vtbl2_u8(lut_roll, vadd_u8(eq_2F, hi_nibbles));
-      uint8x8_t rolled = vadd_u8(in, roll);
+      uint8x16_t eq_2F = vceqq_u8(in, k2f);
+      uint8x16_t add = vaddq_u8(eq_2F, hi_nibbles);
+      uint8x8_t roll1 = vtbl2_u8(lut_roll, vget_low_u8(add));
+      uint8x8_t roll2 = vtbl2_u8(lut_roll, vget_high_u8(add));
+      uint8x16_t roll12 = vcombine_u8(roll1, roll2);
+      uint8x16_t rolled = vaddq_u8(in, roll12);
 
       // Step 1: swap and merge adjacent 6-bit fields.
-      uint16x8_t mul = vmlal_u8(zero16, rolled, cst);
-      uint32x4_t t = vpaddlq_u16(mul);
-      uint8x8_t merge = vuzp_u16(vget_low_u32(t), vget_high_u32(t)).val[0];
+      uint8x8x2_t unzip8 = vuzp_u8(vget_low_u8(rolled), vget_high_u8(rolled));
+      uint8x8x2_t zip8 = vzip_u8(unzip8.val[1], zero8);
+      uint16x8_t zip1 =
+          vreinterpretq_u16_u8(vcombine_u8(zip8.val[0], zip8.val[1]));
+      uint8x16_t mul = vreinterpretq_u8_u16(vmlal_u8(zip1, unzip8.val[0], cst));
 
       // Step 2: swap and merge 12-bit words into a 24-bit word.
-      uint32x4_t v = vpaddlq_u32(vmlal_u16(zero32, merge, cst1));
-      uint8x8_t merge1 = vuzp_u32(vget_low_u32(v), vget_high_u32(v)).val[0];
-
-      pack.val[i] = merge1;
+      uint16x4x2_t unzip16 = vuzp_u16(vreinterpret_u16_u8(vget_low_u8(mul)),
+                                      vreinterpret_u16_u8(vget_high_u8(mul)));
+      uint16x4x2_t zip16 = vzip_u16(unzip16.val[1], zero16);
+      uint32x4_t zip2 =
+          vreinterpretq_u32_u16(vcombine_u16(zip16.val[0], zip16.val[1]));
+      uint32x4_t merge = vmlal_u16(zip2, unzip16.val[0], cst1);
+      pack.val[2 * i] = vget_low_u8(vreinterpretq_u8_u32(merge));
+      pack.val[2 * i + 1] = vget_high_u8(vreinterpretq_u8_u32(merge));
     }
 
     res[0] = vtbl4_u8(pack, shuf0);
     res[1] = vtbl4_u8(pack, shuf1);
     res[2] = vtbl4_u8(pack, shuf2);
-    __builtin_memcpy(out, res, 6*4);
+    __builtin_memcpy(out, res, 6 * 4);
 
-    out += 6*4;
-    srclen -= 8*4;
-    src += 8*4;
+    out += 6 * 4;
+    srclen -= 8 * 4;
+    src += 8 * 4;
   }
 
-  break_end:
+break_end:
   size_t scalarret = chromium_base64_decode(out, src, srclen);
   if (scalarret == MODP_B64_ERROR)
-    return (int) MODP_B64_ERROR;
+    return (int)MODP_B64_ERROR;
   return (out - out_orig) + scalarret;
 }