AVXShift.hpp
// Date : 10.02.2022
// Author : (c) Daniel Infuehr
// Content: AVX2 Branchless Fill Algorithm
// This header must be included unchanged if you use parts of this code
#pragma once
#include <stdint.h>
#include <algorithm>
#ifdef __AVX2__
# include <immintrin.h>
#endif
namespace Chess_Lookup::AVXShift
{
static constexpr uint64_t BB_AF = 0x0101010101010101;
static constexpr uint64_t BB_HF = 0x8080808080808080;
static constexpr uint64_t BB_R1 = 0x00000000000000FF;
static constexpr uint64_t BB_R8 = 0xFF00000000000000;
static constexpr auto Size = 0; //No lookup table is used; the constants above get inlined into the code
#ifdef __AVX2__
struct Vec4I {
__m256i ymm;
Vec4I()
{ } //Default constructor; leaves ymm uninitialized
Vec4I(__m256i&& value) : ymm(value)
{
}
Vec4I(uint64_t value) : ymm(_mm256_set1_epi64x(value))
{
}
Vec4I(uint64_t a, uint64_t b, uint64_t c, uint64_t d) : ymm(_mm256_set_epi64x(a,b,c,d))
{
}
inline Vec4I operator|(const Vec4I& rhs) const {
return Vec4I(_mm256_or_si256(ymm, rhs.ymm));
}
inline Vec4I& operator|=(const Vec4I& rhs) {
ymm = _mm256_or_si256(ymm, rhs.ymm);
return *this;
}
inline Vec4I& operator&=(const Vec4I& rhs) {
ymm = _mm256_and_si256(ymm, rhs.ymm);
return *this;
}
inline Vec4I& operator<<=(const Vec4I& rhs) {
ymm = _mm256_sllv_epi64(ymm, rhs.ymm);
return *this;
}
inline Vec4I& operator>>=(const Vec4I& rhs) {
ymm = _mm256_srlv_epi64(ymm, rhs.ymm);
return *this;
}
inline Vec4I& andNot(const Vec4I& rhs) {
ymm = _mm256_andnot_si256(rhs.ymm, ymm); //The intrinsic computes ~a & b, but this andNot computes a & ~b, hence the swapped operands
return *this;
}
inline void Store(void* ptr) {
//_mm256_storeu_epi64 requires AVX-512VL; use the plain AVX2 unaligned store
_mm256_storeu_si256(static_cast<__m256i*>(ptr), ymm);
}
inline void Set(const Vec4I& rhs)
{
ymm = rhs.ymm;
}
inline void Set(uint64_t a, uint64_t b, uint64_t c, uint64_t d)
{
ymm = _mm256_set_epi64x(a, b, c, d);
}
inline void Set(uint64_t a)
{
ymm = _mm256_set1_epi64x(a);
}
inline void Zero() {
ymm = _mm256_setzero_si256();
}
inline bool IsZero() const {
return _mm256_testz_si256(ymm, ymm);
}
inline uint64_t horizontal_or() const {
//Replacing both _mm256_or_si256 calls with another operator gives other 64-bit horizontal reductions (see the horizontal_and sketch below)
__m256i x1 = _mm256_or_si256(ymm, _mm256_permute2x128_si256(ymm, ymm, 1));
__m256i x2 = _mm256_shuffle_epi32(x1, 0b01001110);
__m256i x3 = _mm256_or_si256(x1, x2);
return _mm256_extract_epi64(x3, 0); //_mm_cvtsi128_si64x(_mm256_castsi256_si128(x3));
// Same performance - scalar OR
//const __m128i temp = _mm256_extracti128_si256(ymm, 1);
//return (uint64_t&)ymm
// | (uint64_t)(_mm_extract_epi64(_mm256_castsi256_si128(ymm), 1))
// | (uint64_t&)(temp)
// | (uint64_t)(_mm_extract_epi64(temp, 1));
}
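//Illustrative sketch, not part of the original interface: the same
//permute/shuffle reduction with _mm256_and_si256 gives a horizontal AND
//of the four 64-bit lanes.
inline uint64_t horizontal_and() const {
__m256i x1 = _mm256_and_si256(ymm, _mm256_permute2x128_si256(ymm, ymm, 1));
__m256i x2 = _mm256_shuffle_epi32(x1, 0b01001110);
__m256i x3 = _mm256_and_si256(x1, x2);
return _mm256_extract_epi64(x3, 0);
}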
};
#endif
static constexpr uint64_t Bishop(int s, uint64_t o)
{
uint64_t tmp, att = 0;
o &= ~(1ull << s);
tmp = 1ull << s; while ((tmp & (o | BB_HF | BB_R8)) == 0) { att |= tmp <<= 9; }
tmp = 1ull << s; while ((tmp & (o | BB_AF | BB_R8)) == 0) { att |= tmp <<= 7; }
tmp = 1ull << s; while ((tmp & (o | BB_AF | BB_R1)) == 0) { att |= tmp >>= 9; }
tmp = 1ull << s; while ((tmp & (o | BB_HF | BB_R1)) == 0) { att |= tmp >>= 7; }
return att;
}
static constexpr uint64_t Rook(int s, uint64_t o)
{
uint64_t tmp, att = 0;
o &= ~(1ull << s);
tmp = 1ull << s; while ((tmp & (o | BB_HF)) == 0) { att |= tmp <<= 1; }
tmp = 1ull << s; while ((tmp & (o | BB_AF)) == 0) { att |= tmp >>= 1; }
tmp = 1ull << s; while ((tmp & (o | BB_R8)) == 0) { att |= tmp <<= 8; }
tmp = 1ull << s; while ((tmp & (o | BB_R1)) == 0) { att |= tmp >>= 8; }
return att;
}
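//Illustrative compile-time check (assuming square index 0 = a1): Bishop and Rook
//are constexpr, so results can be verified or tabulated at compile time. A rook
//on a1 on an empty board attacks the rest of the A-file and of rank 1:
static_assert(Rook(0, 0) == ((BB_AF | BB_R1) & ~1ull), "rook fill sanity check");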
static inline uint64_t Queen(const int s, uint64_t o)
{
#ifndef __AVX2__
return Rook(s, o) | Bishop(s, o);
#else
//This is a branchless improvement of the 'NO HEADACHES' algorithm: there is no stop condition, each ray is simply expanded 7 times.
//Trying to stop early with !_mm256_testz_si256(tmp.ymm, tmp.ymm) halves performance!
//That being said, this is generally a bad idea since other AVX algorithms like dumb7fill can calculate 4 sliders in parallel,
//whereas this routine still calculates only one slider.
//4x faster AVX version:
const Vec4I shift = Vec4I(9, 7, 1, 8);
const Vec4I Occ = Vec4I(o).andNot(1ull << s);
Vec4I A, tmp, att;
att.Zero();
//The same step is repeated 7 times per group of 4 rays; each line below expands to:
//att.ymm = _mm256_or_si256(att.ymm, tmp.ymm = _mm256_sllv_epi64(_mm256_andnot_si256(A.ymm, tmp.ymm), shift.ymm));
A = (Occ | Vec4I(BB_HF | BB_R8, BB_AF | BB_R8, BB_HF, BB_R8));
tmp = (1ull << s);
tmp.andNot(A); att |= tmp <<= shift;
tmp.andNot(A); att |= tmp <<= shift;
tmp.andNot(A); att |= tmp <<= shift;
tmp.andNot(A); att |= tmp <<= shift;
tmp.andNot(A); att |= tmp <<= shift;
tmp.andNot(A); att |= tmp <<= shift;
tmp.andNot(A); att |= tmp <<= shift;
A = (Occ | Vec4I(BB_AF | BB_R1, BB_HF | BB_R1, BB_AF, BB_R1));
tmp = (1ull << s);
tmp.andNot(A); att |= tmp >>= shift;
tmp.andNot(A); att |= tmp >>= shift;
tmp.andNot(A); att |= tmp >>= shift;
tmp.andNot(A); att |= tmp >>= shift;
tmp.andNot(A); att |= tmp >>= shift;
tmp.andNot(A); att |= tmp >>= shift;
tmp.andNot(A); att |= tmp >>= shift;
return att.horizontal_or();
#endif
}
}
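//Usage sketch (illustrative; assumes square index 0 = a1 .. 63 = h8 and the
//occupancy passed as a plain bitboard):
//
//  #include "AVXShift.hpp"
//  #include <cstdio>
//
//  int main() {
//      uint64_t occupancy = (1ull << 35) | (1ull << 13); //example blockers on d5 and f2
//      int square = 27;                                  //queen on d4
//      uint64_t attacks = Chess_Lookup::AVXShift::Queen(square, occupancy);
//      std::printf("queen attacks from d4: 0x%016llx\n",
//                  static_cast<unsigned long long>(attacks));
//      return 0;
//  }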