Skip to content

Commit

Permalink
neon base64: vectorize with vector factor 16
Browse files Browse the repository at this point in the history
The performance with this change is slightly worse than VF8: the code generated
by LLVM contains too many mov's instead of byte vzip and vuzp. GCC is also
generating too many movs and dups which make the code slower than when compiled
with LLVM.

Experiments from an A72 firefly cpu freq set to 1.2GHz:
$ sudo cat /sys/devices/system/cpu/cpu4/cpufreq/cpuinfo_cur_freq
1200000

Before the patch with trunk LLVM as of today:
-------------------------------------------------------------------------
Benchmark                                  Time           CPU Iterations
-------------------------------------------------------------------------
neon_base64_decode_lena               214964 ns     214955 ns       3256
neon_base64_decode_peppers             19452 ns      19452 ns      35989
neon_base64_decode_mandril            502020 ns     502002 ns       1394
neon_base64_decode_moby_dick            2290 ns       2290 ns     305775
neon_base64_decode_googlelogo           4820 ns       4820 ns     145098
neon_base64_decode_bingsocialicon       2778 ns       2778 ns     251984
neon_base64_decode_all                748928 ns     748916 ns        934

with the patch:
-------------------------------------------------------------------------
Benchmark                                  Time           CPU Iterations
-------------------------------------------------------------------------
neon_base64_decode_lena               316154 ns     316148 ns       2214
neon_base64_decode_peppers             28442 ns      28442 ns      24608
neon_base64_decode_mandril            738890 ns     738872 ns        947
neon_base64_decode_moby_dick            3362 ns       3362 ns     208250
neon_base64_decode_googlelogo           7056 ns       7056 ns      99171
neon_base64_decode_bingsocialicon       4087 ns       4087 ns     171265
neon_base64_decode_all               1097039 ns    1097017 ns        638
  • Loading branch information
Sebastian Pop committed Feb 20, 2018
1 parent cd73549 commit fc7c6b5
Showing 1 changed file with 46 additions and 36 deletions.
82 changes: 46 additions & 36 deletions src/neonbase64.cc
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#include <arm_neon.h>
#include <stdlib.h>
#include <string.h>
#include <cstddef>
#define MODP_B64_ERROR ((size_t)-1)

size_t chromium_base64_decode(char* dest, const char* src, size_t len);
Expand All @@ -19,21 +18,19 @@ size_t chromium_base64_decode(char* dest, const char* src, size_t len);

int neon_base64_decode(char *out, const char *src, size_t srclen) {
char *out_orig = out;
const uint8x8_t zero8 = vdup_n_u8(0);
const uint16x8_t zero16 = vdupq_n_u16(0);
const uint32x4_t zero32 = vdupq_n_u32(0);
const uint8x8x2_t lut_lo = {0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A};
const uint8x8x2_t lut_hi = {0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10};
const uint8x8x2_t lut_roll = {0, 16, 19, 4, 191, 191, 185, 185,
0, 0, 0, 0, 0, 0, 0, 0};
const uint8x8_t cst = {0x40, 0x1, 0x40, 0x1, 0x40, 0x1, 0x40, 0x1};
const uint16x4_t cst1 = {0x1000, 0x1, 0x1000, 0x1};
const uint8x8_t k2f = vdup_n_u8(0x2f);
const uint8x8_t kf = vdup_n_u8(0xf);
const uint8x8_t zero8 = vdup_n_u8(0);
const uint16x4_t zero16 = vdup_n_u16(0);
const uint8x16_t k2f = vdupq_n_u8(0x2f);
const uint8x16_t kf = vdupq_n_u8(0xf);
const uint8x8_t cst = {0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40};
const uint16x4_t cst1 = {0x1000, 0x1000, 0x1000, 0x1000};

const uint8x8_t shuf = {2, 1, 0, 6, 5, 4, 255, 255};
const uint8x8_t shuf0 = {2, 1, 0, 6, 5, 4, 2 + 8, 1 + 8};
const uint8x8_t shuf1 = {0 + 8, 6 + 8, 5 + 8, 4 + 8,
2 + 16, 1 + 16, 0 + 16, 6 + 16};
Expand All @@ -42,50 +39,63 @@ int neon_base64_decode(char *out, const char *src, size_t srclen) {

uint8x8x4_t pack;
uint8x8_t res[3];
uint8x8_t str[4];
uint8x16_t str[2];

while (srclen >= 8*4) {
__builtin_memcpy(str, src, 8*4);
for (int i = 0; i < 4; i++) {
uint8x8_t in = str[i];
uint8x8_t lo_nibbles = vand_u8(in, kf);
uint8x8_t lo = vtbl2_u8(lut_lo, lo_nibbles);
while (srclen >= 8 * 4) {
__builtin_memcpy(str, src, 8 * 4);
for (int i = 0; i < 2; i++) {
uint8x16_t in = str[i];
uint8x16_t lo_nibbles = vandq_u8(in, kf);
uint8x16_t hi_nibbles = vshrq_n_u8(in, 4);

uint8x8_t hi_nibbles = vsra_n_u8(zero8, in, 4);
uint8x8_t hi = vtbl2_u8(lut_hi, hi_nibbles);
uint8x8_t lo1 = vtbl2_u8(lut_lo, vget_low_u8(lo_nibbles));
uint8x8_t hi1 = vtbl2_u8(lut_hi, vget_low_u8(hi_nibbles));
if ((uint64_t)vtst_u8(lo1, hi1))
goto break_end;

if ((uint64_t)vtst_u8(lo, hi))
uint8x8_t lo2 = vtbl2_u8(lut_lo, vget_high_u8(lo_nibbles));
uint8x8_t hi2 = vtbl2_u8(lut_hi, vget_high_u8(hi_nibbles));
if ((uint64_t)vtst_u8(lo2, hi2))
goto break_end;

uint8x8_t eq_2F = vceq_u8(in, k2f);
uint8x8_t roll = vtbl2_u8(lut_roll, vadd_u8(eq_2F, hi_nibbles));
uint8x8_t rolled = vadd_u8(in, roll);
uint8x16_t eq_2F = vceqq_u8(in, k2f);
uint8x16_t add = vaddq_u8(eq_2F, hi_nibbles);
uint8x8_t roll1 = vtbl2_u8(lut_roll, vget_low_u8(add));
uint8x8_t roll2 = vtbl2_u8(lut_roll, vget_high_u8(add));
uint8x16_t roll12 = vcombine_u8(roll1, roll2);
uint8x16_t rolled = vaddq_u8(in, roll12);

// Step 1: swap and merge adjacent 6-bit fields.
uint16x8_t mul = vmlal_u8(zero16, rolled, cst);
uint32x4_t t = vpaddlq_u16(mul);
uint8x8_t merge = vuzp_u16(vget_low_u32(t), vget_high_u32(t)).val[0];
uint8x8x2_t unzip8 = vuzp_u8(vget_low_u8(rolled), vget_high_u8(rolled));
uint8x8x2_t zip8 = vzip_u8(unzip8.val[1], zero8);
uint16x8_t zip1 =
vreinterpretq_u16_u8(vcombine_u8(zip8.val[0], zip8.val[1]));
uint8x16_t mul = vreinterpretq_u8_u16(vmlal_u8(zip1, unzip8.val[0], cst));

// Step 2: swap and merge 12-bit words into a 24-bit word.
uint32x4_t v = vpaddlq_u32(vmlal_u16(zero32, merge, cst1));
uint8x8_t merge1 = vuzp_u32(vget_low_u32(v), vget_high_u32(v)).val[0];

pack.val[i] = merge1;
uint16x4x2_t unzip16 = vuzp_u16(vreinterpret_u16_u8(vget_low_u8(mul)),
vreinterpret_u16_u8(vget_high_u8(mul)));
uint16x4x2_t zip16 = vzip_u16(unzip16.val[1], zero16);
uint32x4_t zip2 =
vreinterpretq_u32_u16(vcombine_u16(zip16.val[0], zip16.val[1]));
uint32x4_t merge = vmlal_u16(zip2, unzip16.val[0], cst1);
pack.val[2 * i] = vget_low_u8(vreinterpretq_u8_u32(merge));
pack.val[2 * i + 1] = vget_high_u8(vreinterpretq_u8_u32(merge));
}

res[0] = vtbl4_u8(pack, shuf0);
res[1] = vtbl4_u8(pack, shuf1);
res[2] = vtbl4_u8(pack, shuf2);
__builtin_memcpy(out, res, 6*4);
__builtin_memcpy(out, res, 6 * 4);

out += 6*4;
srclen -= 8*4;
src += 8*4;
out += 6 * 4;
srclen -= 8 * 4;
src += 8 * 4;
}

break_end:
break_end:
size_t scalarret = chromium_base64_decode(out, src, srclen);
if (scalarret == MODP_B64_ERROR)
return (int) MODP_B64_ERROR;
return (int)MODP_B64_ERROR;
return (out - out_orig) + scalarret;
}

0 comments on commit fc7c6b5

Please sign in to comment.