Skip to content

Commit

Permalink
mask like floats
Browse files Browse the repository at this point in the history
  • Loading branch information
AlexGuteniev committed Oct 4, 2024
1 parent 4a7d60b commit 39974d1
Showing 1 changed file with 12 additions and 24 deletions.
36 changes: 12 additions & 24 deletions stl/src/vector_algorithms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3741,22 +3741,16 @@ void* __stdcall __std_remove_4(void* _First, void* const _Last, const uint32_t _
void* _Out = _First;

if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_avx2() && _Size_bytes > 32) {
- const __m256i _Match = _mm256_set1_epi32(_Val);
- const __m256i _Dense_shuf = _mm256_set_epi8( //
- 12, 8, 4, 0, -1, -1, -1, -1, //
- -1, -1, -1, -1, -1, -1, -1, -1, //
- -1, -1, -1, -1, -1, -1, -1, -1, //
- -1, -1, -1, -1, 12, 8, 4, 0);
+ const __m256i _Match = _mm256_set1_epi32(_Val);

void* _Stop = _First;
_Advance_bytes(_Stop, _Size_bytes & ~size_t{0x1F});
do {
- const __m256i _Src = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(_First));
- const __m256i _Mask = _mm256_cmpeq_epi32(_Src, _Match);
- const unsigned _Bingo_d = _mm256_movemask_epi8(_mm256_shuffle_epi8(_Mask, _Dense_shuf));
- const unsigned _Bingo = _rotl8(static_cast<uint8_t>(_rotl(_Bingo_d, 4)), 4);
- const __m256i _Shuf = _mm256_cvtepi8_epi32(_mm_loadu_si64(_Remove_patterns_4._Data[_Bingo]));
- const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf);
+ const __m256i _Src = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(_First));
+ const __m256i _Mask = _mm256_cmpeq_epi32(_Src, _Match);
+ const unsigned _Bingo = _mm256_movemask_ps(_mm256_castsi256_ps(_Mask));
+ const __m256i _Shuf = _mm256_cvtepi8_epi32(_mm_loadu_si64(_Remove_patterns_4._Data[_Bingo]));
+ const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(_Out), _Dest);
_Advance_bytes(_Out, _Remove_patterns_4._Count[_Bingo]);
_Advance_bytes(_First, 32);
Expand All @@ -3773,22 +3767,16 @@ void* __stdcall __std_remove_8(void* _First, void* const _Last, const uint64_t _
void* _Out = _First;

if (const size_t _Size_bytes = _Byte_length(_First, _Last); _Use_avx2() && _Size_bytes > 32) {
- const __m256i _Match = _mm256_set1_epi64x(_Val);
- const __m256i _Dense_shuf = _mm256_set_epi8( //
- 8, 0, -1, -1, -1, -1, -1, -1, //
- -1, -1, -1, -1, -1, -1, -1, -1, //
- -1, -1, -1, -1, -1, -1, -1, -1, //
- -1, -1, -1, -1, -1, -1, 8, 0);
+ const __m256i _Match = _mm256_set1_epi64x(_Val);

void* _Stop = _First;
_Advance_bytes(_Stop, _Size_bytes & ~size_t{0x1F});
do {
- const __m256i _Src = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(_First));
- const __m256i _Mask = _mm256_cmpeq_epi64(_Src, _Match);
- const unsigned _Bingo_d = _mm256_movemask_epi8(_mm256_shuffle_epi8(_Mask, _Dense_shuf));
- const unsigned _Bingo = (_Bingo_d | (_Bingo_d >> 28)) & 0xF;
- const __m256i _Shuf = _mm256_cvtepi8_epi32(_mm_loadu_si64(_Remove_patterns_8._Data[_Bingo]));
- const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf);
+ const __m256i _Src = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(_First));
+ const __m256i _Mask = _mm256_cmpeq_epi64(_Src, _Match);
+ const unsigned _Bingo = _mm256_movemask_pd(_mm256_castsi256_pd(_Mask));
+ const __m256i _Shuf = _mm256_cvtepi8_epi32(_mm_loadu_si64(_Remove_patterns_8._Data[_Bingo]));
+ const __m256i _Dest = _mm256_permutevar8x32_epi32(_Src, _Shuf);
_mm256_storeu_si256(reinterpret_cast<__m256i*>(_Out), _Dest);
_Advance_bytes(_Out, _Remove_patterns_8._Count[_Bingo]);
_Advance_bytes(_First, 32);
Expand Down

0 comments on commit 39974d1

Please sign in to comment.