Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AVX2 vectorization for very large bitsets #4422

Merged
merged 6 commits into from
Feb 29, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions benchmarks/src/bitset_to_string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,11 @@ namespace {

BENCHMARK(BM_bitset_to_string<15, char>);
BENCHMARK(BM_bitset_to_string<64, char>);
BENCHMARK(BM_bitset_to_string<512, char>);
BENCHMARK(BM_bitset_to_string_large_single<char>);
BENCHMARK(BM_bitset_to_string<7, wchar_t>);
BENCHMARK(BM_bitset_to_string<64, wchar_t>);
BENCHMARK(BM_bitset_to_string<512, wchar_t>);
BENCHMARK(BM_bitset_to_string_large_single<wchar_t>);

BENCHMARK_MAIN();
92 changes: 92 additions & 0 deletions stl/src/vector_algorithms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2166,6 +2166,17 @@ __declspec(noalias) size_t

#ifndef _M_ARM64EC
namespace {
__m256i __forceinline _Bitset_to_string_1_step_avx(const uint32_t _Val, const __m256i _Px0, const __m256i _Px1) {
const __m128i _Vx0 = _mm_cvtsi32_si128(_Val);
const __m128i _Vx1 = _mm_shuffle_epi8(_Vx0, _mm_set_epi32(0x00000000, 0x01010101, 0x02020202, 0x03030303));
const __m256i _Vx2 = _mm256_castsi128_si256(_Vx1);
const __m256i _Vx3 = _mm256_permutevar8x32_epi32(_Vx2, _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0));
const __m256i _Msk = _mm256_and_si256(_Vx3, _mm256_set1_epi64x(0x0102040810204080));
const __m256i _Ex0 = _mm256_cmpeq_epi8(_Msk, _mm256_setzero_si256());
const __m256i _Ex1 = _mm256_blendv_epi8(_Px1, _Px0, _Ex0);
return _Ex1;
}

__m128i __forceinline _Bitset_to_string_1_step(const uint16_t _Val, const __m128i _Px0, const __m128i _Px1) {
const __m128i _Vx0 = _mm_cvtsi32_si128(_Val);
const __m128i _Vx1 = _mm_unpacklo_epi8(_Vx0, _Vx0);
Expand All @@ -2177,6 +2188,18 @@ namespace {
return _Ex1;
}

__m256i __forceinline _Bitset_to_string_2_step_avx(const uint16_t _Val, const __m256i _Px0, const __m256i _Px1) {
const __m128i _Vx0 = _mm_cvtsi32_si128(_Val);
const __m128i _Vx1 = _mm_shuffle_epi8(_Vx0, _mm_set_epi32(0x00000000, 0x00000000, 0x01010101, 0x01010101));
const __m256i _Vx2 = _mm256_castsi128_si256(_Vx1);
const __m256i _Vx3 = _mm256_permute4x64_epi64(_Vx2, _MM_SHUFFLE(1, 1, 0, 0));
const __m256i _Msk = _mm256_and_si256(
_Vx3, _mm256_set_epi64x(0x0001000200040008, 0x0010002000400080, 0x0001000200040008, 0x0010002000400080));
const __m256i _Ex0 = _mm256_cmpeq_epi16(_Msk, _mm256_setzero_si256());
const __m256i _Ex1 = _mm256_blendv_epi8(_Px1, _Px0, _Ex0);
return _Ex1;
}

__m128i __forceinline _Bitset_to_string_2_step(const uint8_t _Val, const __m128i _Px0, const __m128i _Px1) {
const __m128i _Vx = _mm_set1_epi16(_Val);
const __m128i _Msk = _mm_and_si128(_Vx, _mm_set_epi64x(0x0001000200040008, 0x0010002000400080));
Expand All @@ -2192,6 +2215,38 @@ extern "C" {
__declspec(noalias) void __stdcall __std_bitset_to_string_1(
char* const _Dest, const void* _Src, size_t _Size_bits, const char _Elem0, const char _Elem1) noexcept {
#ifndef _M_ARM64EC
if (_Use_avx2() && _Size_bits >= 256) {
StephanTLavavej marked this conversation as resolved.
Show resolved Hide resolved
const __m256i _Px0 = _mm256_broadcastb_epi8(_mm_cvtsi32_si128(_Elem0));
const __m256i _Px1 = _mm256_broadcastb_epi8(_mm_cvtsi32_si128(_Elem1));
if (_Size_bits >= 32) {
char* _Pos = _Dest + _Size_bits;
_Size_bits &= 0x1F;
char* const _Stop_at = _Dest + _Size_bits;
do {
uint32_t _Val;
memcpy(&_Val, _Src, 4);
const __m256i _Elems = _Bitset_to_string_1_step_avx(_Val, _Px0, _Px1);
_Pos -= 32;
_mm256_storeu_si256(reinterpret_cast<__m256i*>(_Pos), _Elems);
_Advance_bytes(_Src, 4);
} while (_Pos != _Stop_at);
}

if (_Size_bits > 0) {
__assume(_Size_bits < 32);
uint32_t _Val;
memcpy(&_Val, _Src, _Size_bits / 4);
StephanTLavavej marked this conversation as resolved.
Show resolved Hide resolved
StephanTLavavej marked this conversation as resolved.
Show resolved Hide resolved
const __m256i _Elems = _Bitset_to_string_1_step_avx(_Val, _Px0, _Px1);
char _Tmp[32];
_mm256_storeu_si256(reinterpret_cast<__m256i*>(_Tmp), _Elems);
const char* const _Tmpd = _Tmp + (32 - _Size_bits);
memcpy(_Dest, _Tmpd, _Size_bits);
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could take advantage of at least remaining of 32-bit unit available due to using array of units in bitset.
Here we can use AVX2 masked store _mm256_maskstore_epi32, and and 4-byte memcpy above.
I'm not sure if the gain worth doing this, as the improvement only applies to the tail part.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Correction: this applies to the above memcpy only, To write more here we should over-reserve string, which though seems also feasible.

}

_mm256_zeroupper(); // TRANSITION, DevCom-10331414
return;
}

if (_Use_sse2()) {
const __m128i _Px0 = _mm_set1_epi8(_Elem0 ^ _Elem1);
const __m128i _Px1 = _mm_set1_epi8(_Elem1);
Expand Down Expand Up @@ -2238,6 +2293,43 @@ __declspec(noalias) void __stdcall __std_bitset_to_string_1(
__declspec(noalias) void __stdcall __std_bitset_to_string_2(
wchar_t* const _Dest, const void* _Src, size_t _Size_bits, const wchar_t _Elem0, const wchar_t _Elem1) noexcept {
#ifndef _M_ARM64EC
if (_Use_avx2() && _Size_bits >= 256) {
const __m256i _Px0 = _mm256_broadcastw_epi16(_mm_cvtsi32_si128(_Elem0));
const __m256i _Px1 = _mm256_broadcastw_epi16(_mm_cvtsi32_si128(_Elem1));

if (_Size_bits >= 16) {
wchar_t* _Pos = _Dest + _Size_bits;
_Size_bits &= 0xF;
wchar_t* const _Stop_at = _Dest + _Size_bits;
do {
uint16_t _Val;
memcpy(&_Val, _Src, 2);
const __m256i _Elems = _Bitset_to_string_2_step_avx(_Val, _Px0, _Px1);
_Pos -= 16;
_mm256_storeu_si256(reinterpret_cast<__m256i*>(_Pos), _Elems);
_Advance_bytes(_Src, 2);
} while (_Pos != _Stop_at);
}

if (_Size_bits > 0) {
__assume(_Size_bits < 16);
uint16_t _Val;
if (_Size_bits > 8) {
memcpy(&_Val, _Src, 2);
} else {
_Val = *reinterpret_cast<const uint8_t*>(_Src);
}
const __m256i _Elems = _Bitset_to_string_2_step_avx(_Val, _Px0, _Px1);
wchar_t _Tmp[16];
_mm256_storeu_si256(reinterpret_cast<__m256i*>(_Tmp), _Elems);
const wchar_t* const _Tmpd = _Tmp + (16 - _Size_bits);
memcpy(_Dest, _Tmpd, _Size_bits * 2);
}

_mm256_zeroupper(); // TRANSITION, DevCom-10331414
return;
}

if (_Use_sse2()) {
const __m128i _Px0 = _mm_set1_epi16(_Elem0 ^ _Elem1);
const __m128i _Px1 = _mm_set1_epi16(_Elem1);
Expand Down