neon base64: vectorize with vector factor 16 #6

Merged · merged 1 commit into from Mar 1, 2018
107 changes: 62 additions & 45 deletions src/neonbase64.cc
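This PR rewrites the main loop of neon_base64_decode to operate on two 16-byte uint8x16_t registers per iteration, using the AArch64 full-width table lookup vqtbl1q_u8, instead of four 8-byte uint8x8_t registers with vtbl2_u8; each iteration still consumes 32 input bytes and emits 24 output bytes. The lookup scheme itself is unchanged. As a reference, here is a scalar sketch of that scheme (decode_one is an illustrative helper, not part of the patch): lut_lo and lut_hi map a byte's low and high nibble to bit masks that intersect only for invalid base64 characters, and lut_roll gives the offset that turns an ASCII code into its 6-bit value.

#include <cstdint>

static const uint8_t lut_lo[16] = {0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
                                   0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A};
static const uint8_t lut_hi[16] = {0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
                                   0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10};
static const uint8_t lut_roll[16] = {0, 16, 19, 4, 191, 191, 185, 185,
                                     0, 0, 0, 0, 0, 0, 0, 0};

// Scalar model of the SIMD classification: returns the 6-bit value of a
// base64 character, or -1 if the byte is not a valid base64 character.
static int decode_one(uint8_t c) {
  uint8_t lo = c & 0xf, hi = c >> 4;
  if (lut_lo[lo] & lut_hi[hi])
    return -1;                        // masks intersect only for invalid bytes
  // '/' (0x2f) shares high nibble 2 with '+', so it gets its own roll entry;
  // in the SIMD code the all-ones vceq mask added to hi acts as this -1.
  uint8_t idx = (c == 0x2f) ? 1 : hi;
  return (uint8_t)(c + lut_roll[idx]); // e.g. 'A' (0x41) + 191 wraps to 0
}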
@@ -1,6 +1,5 @@
#include <arm_neon.h>
#include <stdlib.h>
#include <string.h>
#include <cstddef>
#define MODP_B64_ERROR ((size_t)-1)

size_t chromium_base64_decode(char* dest, const char* src, size_t len);
@@ -19,21 +18,19 @@ size_t chromium_base64_decode(char* dest, const char* src, size_t len);

int neon_base64_decode(char *out, const char *src, size_t srclen) {
char *out_orig = out;
const uint8x8_t zero8 = vdup_n_u8(0);
const uint8x16_t lut_lo = {0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A};
const uint8x16_t lut_hi = {0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10};
const uint8x16_t lut_roll = {0, 16, 19, 4, 191, 191, 185, 185,
0, 0, 0, 0, 0, 0, 0, 0};
const uint8x16_t zero8 = vdupq_n_u8(0);
const uint16x8_t zero16 = vdupq_n_u16(0);
const uint32x4_t zero32 = vdupq_n_u32(0);
const uint8x8x2_t lut_lo = {0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A};
const uint8x8x2_t lut_hi = {0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10};
const uint8x8x2_t lut_roll = {0, 16, 19, 4, 191, 191, 185, 185,
0, 0, 0, 0, 0, 0, 0, 0};
const uint8x8_t cst = {0x40, 0x1, 0x40, 0x1, 0x40, 0x1, 0x40, 0x1};
const uint16x4_t cst1 = {0x1000, 0x1, 0x1000, 0x1};
const uint8x8_t k2f = vdup_n_u8(0x2f);
const uint8x8_t kf = vdup_n_u8(0xf);
const uint8x16_t k2f = vdupq_n_u8(0x2f);
const uint8x16_t kf = vdupq_n_u8(0xf);
const uint8x8_t cst = {0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40};
const uint16x4_t cst1 = {0x1000, 0x1000, 0x1000, 0x1000};

const uint8x8_t shuf = {2, 1, 0, 6, 5, 4, 255, 255};
const uint8x8_t shuf0 = {2, 1, 0, 6, 5, 4, 2 + 8, 1 + 8};
const uint8x8_t shuf1 = {0 + 8, 6 + 8, 5 + 8, 4 + 8,
2 + 16, 1 + 16, 0 + 16, 6 + 16};
@@ -42,50 +39,70 @@ int neon_base64_decode(char *out, const char *src, size_t srclen) {

uint8x8x4_t pack;
uint8x8_t res[3];
uint8x8_t str[4];
uint8x16_t str[2];

while (srclen >= 8*4) {
__builtin_memcpy(str, src, 8*4);
for (int i = 0; i < 4; i++) {
uint8x8_t in = str[i];
uint8x8_t lo_nibbles = vand_u8(in, kf);
uint8x8_t lo = vtbl2_u8(lut_lo, lo_nibbles);
while (srclen >= 8 * 4) {
__builtin_memcpy(str, src, 8 * 4);

uint8x8_t hi_nibbles = vsra_n_u8(zero8, in, 4);
uint8x8_t hi = vtbl2_u8(lut_hi, hi_nibbles);
uint8x16_t in0 = str[0];
uint8x16_t in1 = str[1];
uint8x16_t lo_nibbles0 = vandq_u8(in0, kf);
uint8x16_t lo_nibbles1 = vandq_u8(in1, kf);
uint8x16_t hi_nibbles0 = vshrq_n_u8(in0, 4);
uint8x16_t hi_nibbles1 = vshrq_n_u8(in1, 4);

if ((uint64_t)vtst_u8(lo, hi))
goto break_end;
uint8x16_t lo0 = vqtbl1q_u8(lut_lo, lo_nibbles0);
uint8x16_t lo1 = vqtbl1q_u8(lut_lo, lo_nibbles1);
uint8x16_t hi0 = vqtbl1q_u8(lut_hi, hi_nibbles0);
uint8x16_t hi1 = vqtbl1q_u8(lut_hi, hi_nibbles1);
uint8x16_t test0 = vtstq_u8(lo0, hi0);
uint8x16_t test1 = vtstq_u8(lo1, hi1);
uint8x16_t orr0 = vorrq_u8(test0, test1);
uint8x8_t orr1 = vorr_u8(vget_low_u8(orr0), vget_high_u8(orr0));
if ((uint64_t)orr1)
break;

uint8x8_t eq_2F = vceq_u8(in, k2f);
uint8x8_t roll = vtbl2_u8(lut_roll, vadd_u8(eq_2F, hi_nibbles));
uint8x8_t rolled = vadd_u8(in, roll);
uint8x16_t eq_2F0 = vceqq_u8(in0, k2f);
uint8x16_t eq_2F1 = vceqq_u8(in1, k2f);
uint8x16_t add0 = vaddq_u8(eq_2F0, hi_nibbles0);
uint8x16_t add1 = vaddq_u8(eq_2F1, hi_nibbles1);
uint8x16_t roll0 = vqtbl1q_u8(lut_roll, add0);
uint8x16_t roll1 = vqtbl1q_u8(lut_roll, add1);
uint8x16_t rolled0 = vaddq_u8(in0, roll0);
uint8x16_t rolled1 = vaddq_u8(in1, roll1);

// Step 1: swap and merge adjacent 6-bit fields.
uint16x8_t mul = vmlal_u8(zero16, rolled, cst);
uint32x4_t t = vpaddlq_u16(mul);
uint8x8_t merge = vuzp_u16(vget_low_u32(t), vget_high_u32(t)).val[0];
// Step 1: swap and merge adjacent 6-bit fields.
uint8x16x2_t unzip8 = vuzpq_u8(rolled0, rolled1);
uint8x16x2_t zip8 = vzipq_u8(unzip8.val[1], zero8);
uint16x8_t mul0 = vmlal_u8(vreinterpretq_u16_u8(zip8.val[0]),
vget_low_u8(unzip8.val[0]), cst);
uint16x8_t mul1 = vmlal_u8(vreinterpretq_u16_u8(zip8.val[1]),
vget_high_u8(unzip8.val[0]), cst);
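// After this step each u16 lane holds c0*64 + c1: the uzp/zip pair
// splits even and odd bytes so one widening vmlal_u8 can pack two
// adjacent 6-bit fields into a 12-bit value.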

// Step 2: swap and merge 12-bit words into a 24-bit word.
uint32x4_t v = vpaddlq_u32(vmlal_u16(zero32, merge, cst1));
uint8x8_t merge1 = vuzp_u32(vget_low_u32(v), vget_high_u32(v)).val[0];

pack.val[i] = merge1;
}
// Step 2: swap and merge 12-bit words into a 24-bit word.
uint16x8x2_t unzip16 = vuzpq_u16(mul0, mul1);
uint16x8x2_t zip16 = vzipq_u16(unzip16.val[1], zero16);
uint32x4_t merge0 = vmlal_u16(vreinterpretq_u32_u16(zip16.val[0]),
vget_low_u16(unzip16.val[0]), cst1);
uint32x4_t merge1 = vmlal_u16(vreinterpretq_u32_u16(zip16.val[1]),
vget_high_u16(unzip16.val[0]), cst1);
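// After this step each u32 lane holds a 24-bit word: the 12-bit pairs
// combine as pair0*0x1000 + pair1, i.e. c0<<18 | c1<<12 | c2<<6 | c3.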
pack.val[0] = vget_low_u8(vreinterpretq_u8_u32(merge0));
pack.val[1] = vget_high_u8(vreinterpretq_u8_u32(merge0));
pack.val[2] = vget_low_u8(vreinterpretq_u8_u32(merge1));
pack.val[3] = vget_high_u8(vreinterpretq_u8_u32(merge1));

res[0] = vtbl4_u8(pack, shuf0);
res[1] = vtbl4_u8(pack, shuf1);
res[2] = vtbl4_u8(pack, shuf2);
__builtin_memcpy(out, res, 6*4);
__builtin_memcpy(out, res, 6 * 4);

out += 6*4;
srclen -= 8*4;
src += 8*4;
out += 6 * 4;
srclen -= 8 * 4;
src += 8 * 4;
}

break_end:
size_t scalarret = chromium_base64_decode(out, src, srclen);
if (scalarret == MODP_B64_ERROR)
return (int) MODP_B64_ERROR;
return (int)MODP_B64_ERROR;
return (out - out_orig) + scalarret;
}
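For context, a hypothetical caller might look like this (assuming the scalar fallback chromium_base64_decode is linked in; the buffer sizing and main function are illustrative, not part of the patch):

#include <cstddef>
#include <cstdio>
#include <cstring>
#include <vector>

int neon_base64_decode(char *out, const char *src, size_t srclen);

int main() {
  const char *encoded = "aGVsbG8gd29ybGQ=";  // "hello world"
  size_t len = strlen(encoded);
  std::vector<char> decoded(len);            // 3/4 of len suffices; len is a safe bound
  int n = neon_base64_decode(decoded.data(), encoded, len);
  if (n < 0)                                 // MODP_B64_ERROR comes back as -1
    return 1;
  printf("%.*s\n", n, decoded.data());
  return 0;
}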