Sped up UMxx temporal decode with AVX2, AVX1, and SSE4.1.
umezawatakeshi committed May 26, 2021
1 parent cd54d94 commit fe89fac
Showing 2 changed files with 41 additions and 47 deletions.
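
Both diffs below rework how the Unpack8SymWithDiff8 kernels turn the unpacked per-byte deltas back into pixel values, i.e. a running (prefix) sum across each 16- or 32-byte block with a carry from the previous block. As a rough, standalone illustration of that in-register prefix-sum pattern — a sketch, not code from this commit: prefix_sum_u8 and its simple broadcast carry are illustrative assumptions, and the m0-controlled blending of temporal/spatial predictors in the real kernels is omitted — an SSSE3 version for 16 bytes could look like this:

// Illustration only (not from this commit). Compile with -mssse3.
#include <tmmintrin.h>   // SSSE3: _mm_shuffle_epi8 (pulls in the SSE2 intrinsics as well)
#include <cstdint>
#include <cstdio>

static __m128i prefix_sum_u8(__m128i v, uint8_t carry)
{
	// Running sums inside each 64-bit lane: add the vector shifted by 1, 2, then 4 bytes.
	v = _mm_add_epi8(v, _mm_slli_epi64(v, 8));
	v = _mm_add_epi8(v, _mm_slli_epi64(v, 16));
	v = _mm_add_epi8(v, _mm_slli_epi64(v, 32));
	// Propagate the low lane's last sum (byte 7) into every byte of the high lane.
	v = _mm_add_epi8(v, _mm_shuffle_epi8(v, _mm_set_epi8(
		7, 7, 7, 7, 7, 7, 7, 7, -1, -1, -1, -1, -1, -1, -1, -1)));
	// Fold in the carry from the previous 16-byte block.
	return _mm_add_epi8(v, _mm_set1_epi8((char)carry));
}

int main()
{
	uint8_t delta[16] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
	uint8_t out[16];
	_mm_storeu_si128((__m128i *)out, prefix_sum_u8(_mm_loadu_si128((const __m128i *)delta), 0x80));
	for (int i = 0; i < 16; i++)
		printf("%d ", out[i]);   // prints 129 130 ... 144 (0x80 plus the running sums)
	printf("\n");
}

The 32-byte AVX2 kernel needs one extra propagation step because _mm256_shuffle_epi8 cannot move bytes across 128-bit lanes; a sketch of that step follows the second file's diff.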
33 changes: 14 additions & 19 deletions utv_core/SymPack_x86x64_xmm.cpp
@@ -384,23 +384,21 @@ void tuned_Unpack8SymWithDiff8(uint8_t *pDstBegin, uint8_t *pDstEnd, const uint8
 	auto r = pControl;

 	{
-		__m128i prev = _mm_set1_epi8((char)0x80);
+		__m128i prev = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, (char)0x80, (char)0x80);

 		auto t = pPrevBegin;
 		for (auto p = pDstBegin; p != pDstBegin + cbStride; p += 16, t += 16)
 		{
 			auto [s0, m0] = UnpackForDelta<F>(q, r, shift);

 			auto t0 = _mm_add_epi8(s0, _mm_loadu_si128((const __m128i*)t));
-			auto a = _mm_alignr_epi8(_mm_and_si128(t0, m0), prev, 15);
-			s0 = _mm_andnot_si128(m0, _mm_add_epi8(s0, a));
-			s0 = _mm_add_epi8(s0, _mm_slli_si128(s0, 1));
-			s0 = _mm_add_epi8(s0, _mm_slli_si128(s0, 2));
-			s0 = _mm_add_epi8(s0, _mm_slli_si128(s0, 4));
-			s0 = _mm_add_epi8(s0, _mm_andnot_si128(m0, _mm_slli_si128(s0, 8)));
+			s0 = _mm_add_epi8(_mm_add_epi8(s0, prev), _mm_slli_epi64(s0, 8));
+			s0 = _mm_add_epi8(s0, _mm_slli_epi64(s0, 16));
+			s0 = _mm_add_epi8(s0, _mm_slli_epi64(s0, 32));
 			s0 = _mm_blendv_epi8(s0, t0, m0);
-			_mm_storeu_si128((__m128i *)p, s0);
-			prev = s0;
+			s0 = _mm_add_epi8(s0, _mm_shuffle_epi8(s0, _mm_or_si128(_mm_set_epi8(7, 7, 7, 7, 7, 7, 7, 7, -1, -1, -1, -1, -1, -1, -1, -1), m0)));
+			_mm_storeu_si128((__m128i*)p, s0);
+			prev = _mm_shuffle_epi8(s0, _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 15));
 		}
 	}

@@ -415,17 +413,14 @@ void tuned_Unpack8SymWithDiff8(uint8_t *pDstBegin, uint8_t *pDstEnd, const uint8
 			auto [s0, m0] = UnpackForDelta<F>(q, r, shift);

 			__m128i top = _mm_loadu_si128((const __m128i*)(p - cbStride));
-			auto t0 = _mm_add_epi8(s0, _mm_loadu_si128((const __m128i*)t));
-			auto a = _mm_alignr_epi8(_mm_and_si128(_mm_sub_epi8(t0, top), m0), prev, 15);
-			s0 = _mm_andnot_si128(m0, _mm_add_epi8(s0, a));
-			s0 = _mm_add_epi8(s0, _mm_slli_si128(s0, 1));
-			s0 = _mm_add_epi8(s0, _mm_slli_si128(s0, 2));
-			s0 = _mm_add_epi8(s0, _mm_slli_si128(s0, 4));
-			s0 = _mm_add_epi8(s0, _mm_andnot_si128(m0, _mm_slli_si128(s0, 8)));
-			s0 = _mm_add_epi8(s0, top);
+			auto t0 = _mm_sub_epi8(_mm_add_epi8(s0, _mm_loadu_si128((const __m128i*)t)), top);
+			s0 = _mm_add_epi8(_mm_add_epi8(s0, prev), _mm_slli_epi64(s0, 8));
+			s0 = _mm_add_epi8(s0, _mm_slli_epi64(s0, 16));
+			s0 = _mm_add_epi8(s0, _mm_slli_epi64(s0, 32));
 			s0 = _mm_blendv_epi8(s0, t0, m0);
-			_mm_storeu_si128((__m128i *)p, s0);
-			prev = _mm_sub_epi8(s0, top);
+			s0 = _mm_add_epi8(s0, _mm_shuffle_epi8(s0, _mm_or_si128(_mm_set_epi8(7, 7, 7, 7, 7, 7, 7, 7, -1, -1, -1, -1, -1, -1, -1, -1), m0)));
+			_mm_storeu_si128((__m128i*)p, _mm_add_epi8(s0, top));
+			prev = _mm_shuffle_epi8(s0, _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 15));
 		}
 	}
 }
55 changes: 27 additions & 28 deletions utv_core/SymPack_x86x64_ymm.cpp
@@ -388,30 +388,32 @@ void tuned_Unpack8SymWithDiff8<CODEFEATURE_AVX2>(uint8_t *pDstBegin, uint8_t *pD
 	auto r = pControl;

 	{
-		__m256i prev = _mm256_set1_epi8((char)0x80);
+		__m256i prev = _mm256_set_epi8(
+			0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+			0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, (char)0x80, (char)0x80
+		);

 		auto t = pPrevBegin;
 		for (auto p = pDstBegin; p != pDstBegin + cbStride; p += 32, t += 32)
 		{
 			auto [s0, m0] = UnpackForDelta<CODEFEATURE_AVX2>(q, r, shift);

 			__m256i t0 = _mm256_add_epi8(s0, _mm256_loadu_si256((const __m256i*)t));
-			__m256i t0masked = _mm256_and_si256(t0, m0);
-			__m256i atmp = _mm256_permute2x128_si256(t0masked, prev, 0x03);
-			__m256i a0 = _mm256_alignr_epi8(t0masked, atmp, 15);
-			s0 = _mm256_andnot_si256(m0, _mm256_add_epi8(s0, a0));
-			s0 = _mm256_add_epi8(s0, _mm256_slli_epi64(s0, 8));
+			s0 = _mm256_add_epi8(_mm256_add_epi8(s0, prev), _mm256_slli_epi64(s0, 8));
 			s0 = _mm256_add_epi8(s0, _mm256_slli_epi64(s0, 16));
 			s0 = _mm256_add_epi8(s0, _mm256_slli_epi64(s0, 32));
-			s0 = _mm256_add_epi8(s0, _mm256_shuffle_epi8(_mm256_andnot_si256(m0, s0), _mm256_set16_epi8(7, 7, 7, 7, 7, 7, 7, 7, -1, -1, -1, -1, -1, -1, -1, -1)));
-			__m256i mask16 = _mm256_or_si256(m0, _mm256_permute4x64_epi64(m0, _MM_SHUFFLE(3, 3, 2, 1)));
-			s0 = _mm256_add_epi8(s0, _mm256_shuffle_epi8(_mm256_broadcastsi128_si256(_mm256_castsi256_si128(_mm256_andnot_si256(mask16, s0))), _mm256_set_epi8(
+			s0 = _mm256_blendv_epi8(s0, t0, m0);
+			s0 = _mm256_add_epi8(s0, _mm256_shuffle_epi8(s0, _mm256_or_si256(_mm256_set16_epi8(7, 7, 7, 7, 7, 7, 7, 7, -1, -1, -1, -1, -1, -1, -1, -1), m0)));
+			__m256i mask16 = _mm256_or_si256(m0, _mm256_slli_si256(m0, 8));
+			s0 = _mm256_add_epi8(s0, _mm256_andnot_si256(mask16, _mm256_shuffle_epi8(_mm256_broadcastsi128_si256(_mm256_castsi256_si128(s0)), _mm256_set_epi8(
 				15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
 				-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
-			)));
-			s0 = _mm256_blendv_epi8(s0, t0, m0);
-			_mm256_storeu_si256((__m256i *)p, s0);
-			prev = s0;
+			))));
+			_mm256_storeu_si256((__m256i*)p, s0);
+			prev = _mm256_shuffle_epi8(_mm256_zextsi128_si256(_mm256_extracti128_si256(s0, 1)), _mm256_set_epi8(
+				-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+				-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 15
+			));
 		}
 	}

@@ -426,25 +428,22 @@ void tuned_Unpack8SymWithDiff8<CODEFEATURE_AVX2>(uint8_t *pDstBegin, uint8_t *pD
 			auto [s0, m0] = UnpackForDelta<CODEFEATURE_AVX2>(q, r, shift);

 			__m256i top = _mm256_loadu_si256((const __m256i*)(p - cbStride));
-			__m256i t0 = _mm256_add_epi8(s0, _mm256_loadu_si256((const __m256i*)t));
-			__m256i t0masked = _mm256_and_si256(_mm256_sub_epi8(t0, top), m0);
-			__m256i atmp = _mm256_permute2x128_si256(t0masked, prev, 0x03);
-			__m256i a0 = _mm256_alignr_epi8(t0masked, atmp, 15);
-			s0 = _mm256_andnot_si256(m0, _mm256_add_epi8(s0, a0));
-			s0 = _mm256_add_epi8(s0, _mm256_slli_epi64(s0, 8));
+			__m256i t0 = _mm256_sub_epi8(_mm256_add_epi8(s0, _mm256_loadu_si256((const __m256i*)t)), top);
+			s0 = _mm256_add_epi8(_mm256_add_epi8(s0, prev), _mm256_slli_epi64(s0, 8));
 			s0 = _mm256_add_epi8(s0, _mm256_slli_epi64(s0, 16));
 			s0 = _mm256_add_epi8(s0, _mm256_slli_epi64(s0, 32));
-			s0 = _mm256_add_epi8(s0, _mm256_shuffle_epi8(_mm256_andnot_si256(m0, s0), _mm256_set16_epi8(7, 7, 7, 7, 7, 7, 7, 7, -1, -1, -1, -1, -1, -1, -1, -1)));
-			__m256i mask16 = _mm256_or_si256(m0, _mm256_permute4x64_epi64(m0, _MM_SHUFFLE(3, 3, 2, 1)));
-			s0 = _mm256_add_epi8(s0, _mm256_shuffle_epi8(_mm256_broadcastsi128_si256(_mm256_castsi256_si128(_mm256_andnot_si256(mask16, s0))), _mm256_set_epi8(
+			s0 = _mm256_blendv_epi8(s0, t0, m0);
+			s0 = _mm256_add_epi8(s0, _mm256_shuffle_epi8(s0, _mm256_or_si256(_mm256_set16_epi8(7, 7, 7, 7, 7, 7, 7, 7, -1, -1, -1, -1, -1, -1, -1, -1), m0)));
+			__m256i mask16 = _mm256_or_si256(m0, _mm256_slli_si256(m0, 8));
+			s0 = _mm256_add_epi8(s0, _mm256_andnot_si256(mask16, _mm256_shuffle_epi8(_mm256_broadcastsi128_si256(_mm256_castsi256_si128(s0)), _mm256_set_epi8(
 				15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
 				-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
-			)));
-			s0 = _mm256_add_epi8(s0, top);
-			s0 = _mm256_blendv_epi8(s0, t0, m0);
-			_mm256_storeu_si256((__m256i *)p, s0);
-			prev = _mm256_sub_epi8(s0, top);
+			))));
+			_mm256_storeu_si256((__m256i*)p, _mm256_add_epi8(s0, top));
+			prev = _mm256_shuffle_epi8(_mm256_zextsi128_si256(_mm256_extracti128_si256(s0, 1)), _mm256_set_epi8(
+				-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+				-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 15
+			));
 		}
 	}

 }
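
In the AVX2 kernel above, the _mm256_broadcastsi128_si256 / _mm256_shuffle_epi8 pair exists because _mm256_shuffle_epi8 only shuffles within each 128-bit lane, so the low lane's final running sum has to be broadcast before it can be added into the high lane. A minimal sketch of just that carry step (illustrative only: add_low_lane_carry is not a name from this codebase, and the mask16/_mm256_andnot_si256 masking used in the real code is left out):

// Illustration only (not from this commit). Compile with -mavx2.
#include <immintrin.h>

// Add byte 15 of the low 128-bit lane of s0 to every byte of the high lane,
// leaving the low lane unchanged.
static __m256i add_low_lane_carry(__m256i s0)
{
	// _mm256_shuffle_epi8 works per 128-bit lane, so first put a copy of the
	// low lane into both lanes.
	__m256i lo = _mm256_broadcastsi128_si256(_mm256_castsi256_si128(s0));
	// Selectors: -1 (zero) for the low lane, 15 (that lane's last byte) for the high lane.
	const __m256i sel = _mm256_set_epi8(
		15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
		-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
	return _mm256_add_epi8(s0, _mm256_shuffle_epi8(lo, sel));
}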
