diff --git a/.gitignore b/.gitignore
index 8dda599..82d42fa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,16 +3,18 @@ avisynth/src/Debug/*
avisynth/src/x64/*
avisynth/src/.vs/*
avisynth/archive/*
+vapoursynth/*
*.opensdf
*.opendb
*.psess
*.sdf
*.suo
*.sln
-*.vcxproj.filters
-*.vcxproj.user
+*.filters
+*.user
*.vspx
*.vsp
*.dll
*.avs
*.exe
+*.db
diff --git a/avisynth/LISENCE.GPLv2 b/avisynth/LICENSE.GPLv2
similarity index 100%
rename from avisynth/LISENCE.GPLv2
rename to avisynth/LICENSE.GPLv2
diff --git a/avisynth/src/TCannyMod.vcxproj b/avisynth/src/TCannyMod.vcxproj
index 3353dd6..567db3d 100644
--- a/avisynth/src/TCannyMod.vcxproj
+++ b/avisynth/src/TCannyMod.vcxproj
@@ -71,7 +71,7 @@
true
- true
+ false
false
@@ -103,12 +103,14 @@
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
true
<EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
- <FloatingPointModel>Precise</FloatingPointModel>
+ <FloatingPointModel>Fast</FloatingPointModel>
<InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
+ true
+ <Optimization>MaxSpeed</Optimization>
<TargetMachine>MachineX86</TargetMachine>
- <GenerateDebugInformation>Debug</GenerateDebugInformation>
+ <GenerateDebugInformation>No</GenerateDebugInformation>
<SubSystem>Windows</SubSystem>
true
true
@@ -120,14 +122,14 @@
<InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
true
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
- <EnableEnhancedInstructionSet>AdvancedVectorExtensions</EnableEnhancedInstructionSet>
+ <EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
true
<FloatingPointModel>Fast</FloatingPointModel>
- <Optimization>Full</Optimization>
+ <Optimization>MaxSpeed</Optimization>
true
- <GenerateDebugInformation>Debug</GenerateDebugInformation>
+ <GenerateDebugInformation>No</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
diff --git a/avisynth/src/cpu_check.cpp b/avisynth/src/cpu_check.cpp
index f52d43e..e8855f1 100644
--- a/avisynth/src/cpu_check.cpp
+++ b/avisynth/src/cpu_check.cpp
@@ -71,12 +71,12 @@ static inline void get_cpuid2(int *array, int info_type, int ecx)
#endif
}
-static inline int is_bit_set(int bitfield, int bit)
+static inline int is_bit_set(int bitfield, int bit) noexcept
{
return bitfield & (1 << bit);
}
-static uint32_t get_simd_support_info(void)
+static uint32_t get_simd_support_info(void) noexcept
{
uint32_t ret = 0;
int regs[4] = {0};
@@ -157,27 +157,27 @@ static uint32_t get_simd_support_info(void)
return ret;
}
-int has_sse2()
+bool has_sse2() noexcept
{
- return !!(get_simd_support_info() & CPU_SSE2_SUPPORT);
+ return (get_simd_support_info() & CPU_SSE2_SUPPORT) != 0;
}
-int has_ssse3()
+bool has_ssse3() noexcept
{
- return !!(get_simd_support_info() & CPU_SSSE3_SUPPORT);
+ return (get_simd_support_info() & CPU_SSSE3_SUPPORT) != 0;
}
-int has_sse41()
+bool has_sse41() noexcept
{
- return !!(get_simd_support_info() & CPU_SSE4_1_SUPPORT);
+ return (get_simd_support_info() & CPU_SSE4_1_SUPPORT) != 0;
}
-int has_avx()
+bool has_avx() noexcept
{
- return !!(get_simd_support_info() & CPU_AVX_SUPPORT);
+ return (get_simd_support_info() & CPU_AVX_SUPPORT) != 0;
}
-int has_avx2()
+bool has_avx2() noexcept
{
- return !!(get_simd_support_info() & CPU_AVX2_SUPPORT);
+ return (get_simd_support_info() & CPU_AVX2_SUPPORT) != 0;
}
diff --git a/avisynth/src/edge_detection.h b/avisynth/src/edge_detection.h
index eccb577..69b7bb2 100644
--- a/avisynth/src/edge_detection.h
+++ b/avisynth/src/edge_detection.h
@@ -31,7 +31,7 @@
#include "simd.h"
-static const float* get_tangent(int idx)
+static const float* get_tangent(int idx) noexcept
{
alignas(32) static const float tangent[32] = {
0.414213538169860839843750f, 0.414213538169860839843750f, // tan(pi/8)
@@ -60,7 +60,7 @@ template
static void __stdcall
standard(float* blurp, const size_t blur_pitch, float* emaskp,
const size_t emask_pitch, int32_t* dirp, const size_t dir_pitch,
- const size_t width, const size_t height)
+ const size_t width, const size_t height) noexcept
{
constexpr size_t step = sizeof(Vf) / sizeof(float);
@@ -110,13 +110,13 @@ standard(float* blurp, const size_t blur_pitch, float* emaskp,
Vi d3 = castps_si(and_ps(cmpge_ps(tan, t1125), cmplt_ps(tan, t1575)));
d3 = srli_i32(d3, 24);
d0 = or_si(or_si(d0, d1), or_si(d2, d3));
- stream<Vi>(dirp + x, d0);
+ stream(dirp + x, d0);
}
Vf magnitude = mul(gx, gx);
magnitude = madd(gy, gy, magnitude);
magnitude = sqrt(magnitude);
- stream<Vf>(emaskp + x, magnitude);
+ stream(emaskp + x, magnitude);
}
emaskp += emask_pitch;
dirp += dir_pitch;
@@ -138,7 +138,7 @@ template
static void __stdcall
sobel(float* blurp, const size_t blur_pitch, float* emaskp,
const size_t emask_pitch, int32_t* dirp, const size_t dir_pitch,
- const size_t width, const size_t height)
+ const size_t width, const size_t height) noexcept
{
constexpr size_t step = sizeof(Vf) / sizeof(float);
@@ -197,13 +197,13 @@ sobel(float* blurp, const size_t blur_pitch, float* emaskp,
Vi d3 = castps_si(and_ps(cmpge_ps(tan, t1125), cmplt_ps(tan, t1575)));
d3 = srli_i32(d3, 24);
d0 = or_si(or_si(d0, d1), or_si(d2, d3));
- stream<Vi>(dirp + x, d0);
+ stream(dirp + x, d0);
}
Vf magnitude = mul(gx, gx);
magnitude = madd(gy, gy, magnitude);
magnitude = sqrt(magnitude);
- stream<Vf>(emaskp + x, magnitude);
+ stream(emaskp + x, magnitude);
}
emaskp += emask_pitch;
dirp += dir_pitch;
@@ -218,7 +218,8 @@ template
static void __stdcall
non_max_suppress(const float* emaskp, const size_t em_pitch,
const int32_t* dirp, const size_t dir_pitch, float* blurp,
- const size_t blur_pitch, const size_t width, const size_t height)
+ const size_t blur_pitch, const size_t width,
+ const size_t height) noexcept
{
constexpr size_t step = sizeof(Vf) / sizeof(float);
@@ -275,7 +276,7 @@ non_max_suppress(const float* emaskp, const size_t em_pitch,
void __stdcall
hysteresis(uint8_t* hystp, const size_t hpitch, float* blurp,
const size_t bpitch, const int width, const int height,
- const float tmin, const float tmax);
+ const float tmin, const float tmax) noexcept;
#endif
diff --git a/avisynth/src/gaussian_blur.h b/avisynth/src/gaussian_blur.h
index 46e04a8..3504933 100644
--- a/avisynth/src/gaussian_blur.h
+++ b/avisynth/src/gaussian_blur.h
@@ -32,14 +32,14 @@
template <typename Vf, arch_t ARCH>
static void __stdcall
convert_to_float(const size_t width, const size_t height, const uint8_t* srcp,
- const int src_pitch, float* blurp, const size_t blur_pitch)
+ const int src_pitch, float* blurp, const size_t blur_pitch) noexcept
{
constexpr size_t step = sizeof(Vf) / sizeof(float);
for (size_t y = 0; y < height; y++) {
for (size_t x = 0; x < width; x += step) {
Vf val = cvtu8_ps<Vf, ARCH>(srcp + x);
- stream<Vf>(blurp + x, val);
+ stream(blurp + x, val);
}
srcp += src_pitch;
blurp += blur_pitch;
@@ -50,7 +50,7 @@ convert_to_float(const size_t width, const size_t height, const uint8_t* srcp,
template <typename Vf>
static void
horizontal_blur(const float* hkernel, float* buffp, const int radius,
- const size_t width, float* blurp)
+ const size_t width, float* blurp) noexcept
{
constexpr size_t step = sizeof(Vf) / sizeof(float);
const int length = radius * 2 + 1;
@@ -67,7 +67,7 @@ horizontal_blur(const float* hkernel, float* buffp, const int radius,
Vf val = loadu<Vf>(buffp + x + i);
sum = madd(k, val, sum);
}
- stream<Vf>(blurp + x, sum);
+ stream(blurp + x, sum);
}
}
@@ -77,7 +77,7 @@ static void __stdcall
gaussian_blur(const int radius, const float* kernel, const float* hkernel,
float* buffp, float* blurp, const size_t blur_pitch,
const uint8_t* srcp, const size_t src_pitch, const size_t width,
- const size_t height)
+ const size_t height) noexcept
{
if (radius == 0) {
convert_to_float<Vf, ARCH>(
@@ -106,7 +106,7 @@ gaussian_blur(const int radius, const float* kernel, const float* hkernel,
sum = madd(k, input, sum);
}
- store<Vf>(buffp + x, sum);
+ store(buffp + x, sum);
}
horizontal_blur<Vf>(hkernel, buffp, radius, width, blurp);
blurp += blur_pitch;
diff --git a/avisynth/src/hysteresis.cpp b/avisynth/src/hysteresis.cpp
index 52c37ad..ca5ae2d 100644
--- a/avisynth/src/hysteresis.cpp
+++ b/avisynth/src/hysteresis.cpp
@@ -41,7 +41,7 @@ struct Pos {
static __forceinline void
hystfunc(const int32_t x, const int32_t y, float* edge, uint8_t* hyst,
const size_t epitch, const size_t hpitch, const float th,
- std::vector<Pos>& stack)
+ std::vector<Pos>& stack) noexcept
{
if (!hyst[x + y * hpitch] && edge[x + y * epitch] > th) {
edge[x + y * epitch] = FLT_MAX;
@@ -54,10 +54,11 @@ hystfunc(const int32_t x, const int32_t y, float* edge, uint8_t* hyst,
void __stdcall
hysteresis(uint8_t* hystp, const size_t hpitch, float* blurp,
const size_t bpitch, const int width, const int height,
- const float tmin, const float tmax)
+ const float tmin, const float tmax) noexcept
{
memset(hystp, 0, hpitch * height);
std::vector<Pos> stack;
+ stack.reserve(512);
for (int32_t y = 0; y < height; ++y) {
for (int32_t x = 0; x < width; ++x) {
diff --git a/avisynth/src/simd.h b/avisynth/src/simd.h
index 64352d3..4f5fc44 100644
--- a/avisynth/src/simd.h
+++ b/avisynth/src/simd.h
@@ -26,8 +26,13 @@
#ifndef TCANNY_MOD_SIMD_H
#define TCANNY_MOD_SIMD_H
+
#include <emmintrin.h>
+#if defined(__AVX2__)
#include <immintrin.h>
+#else
+#include <smmintrin.h>
+#endif
#define SFINLINE static __forceinline
@@ -42,6 +47,10 @@ enum arch_t {
template <typename T>
T zero();
+template <typename T>
+T set1_ps(const float& x);
+
+
template <>
SFINLINE __m128i zero<__m128i>()
{
@@ -54,33 +63,12 @@ SFINLINE __m128 zero<__m128>()
return _mm_setzero_ps();
}
-template <>
-SFINLINE __m256i zero<__m256i>()
-{
- return _mm256_setzero_si256();
-}
-
-template <>
-SFINLINE __m256 zero<__m256>()
-{
- return _mm256_setzero_ps();
-}
-
-template <typename T>
-T set1_ps(const float& x);
-
template <>
SFINLINE __m128 set1_ps<__m128>(const float& x)
{
return _mm_set_ps1(x);
}
-template <>
-SFINLINE __m256 set1_ps<__m256>(const float& x)
-{
- return _mm256_set1_ps(x);
-}
-
template <typename T>
T set1_i8(const int8_t&);
@@ -90,17 +78,19 @@ SFINLINE __m128i set1_i8<__m128i>(const int8_t& x)
return _mm_set1_epi8(x);
}
-template <>
-SFINLINE __m256i set1_i8<__m256i>(const int8_t& x)
-{
- return _mm256_set1_epi8(x);
-}
-
/*---------------load--------------------*/
template <typename T>
T load(const float* p);
+template <typename T>
+T load(const uint8_t*);
+
+template <typename T>
+T load(const int32_t*);
+
+
+
template <>
SFINLINE __m128 load<__m128>(const float* p)
{
@@ -108,357 +98,418 @@ SFINLINE __m128 load<__m128>(const float* p)
}
template <>
-SFINLINE __m256 load<__m256>(const float* p)
+SFINLINE __m128i load(const uint8_t* p)
{
- return _mm256_load_ps(p);
+ return _mm_load_si128(reinterpret_cast<const __m128i*>(p));
}
-template <typename T>
-T load(const uint8_t*);
-
template <>
-SFINLINE __m128i load(const uint8_t* p)
+SFINLINE __m128i load(const int32_t* p)
{
return _mm_load_si128(reinterpret_cast<const __m128i*>(p));
}
+
+
+template <typename T>
+T loadu(const float* p);
+
+template <typename T>
+T loadu(const uint8_t* p);
+
+template <typename T>
+T loadu(const int32_t* p);
+
+
+
template <>
-SFINLINE __m256i load(const uint8_t* p)
+SFINLINE __m128 loadu(const float* p)
{
- return _mm256_load_si256(reinterpret_cast<const __m256i*>(p));
+ return _mm_loadu_ps(p);
}
-template <typename T>
-T load(const int32_t*);
-
template <>
-SFINLINE __m128i load(const int32_t* p)
+SFINLINE __m128i loadu(const uint8_t* p)
{
- return _mm_load_si128(reinterpret_cast<const __m128i*>(p));
+ return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
}
template <>
-SFINLINE __m256i load(const int32_t* p)
+SFINLINE __m128i loadu(const int32_t* p)
{
- return _mm256_load_si256(reinterpret_cast<const __m256i*>(p));
+ return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
}
-template <typename T>
-T loadu(const float* p);
-template <>
-SFINLINE __m128 loadu<__m128>(const float* p)
+/*-------------store---------------------*/
+
+SFINLINE void store(float* p, const __m128& x)
{
- return _mm_loadu_ps(p);
+ _mm_store_ps(p, x);
}
-template <>
-SFINLINE __m256 loadu<__m256>(const float* p)
+SFINLINE void storeu(float* p, const __m128& x)
{
- return _mm256_loadu_ps(p);
+ _mm_storeu_ps(p, x);
}
-template <typename T>
-T loadu(const uint8_t* p);
+SFINLINE void stream(float* p, const __m128& x)
+{
+ _mm_stream_ps(p, x);
+}
-template <>
-SFINLINE __m128i loadu<__m128i>(const uint8_t* p)
+SFINLINE void stream(uint8_t* p, const __m128i& x)
{
- return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
+ return _mm_stream_si128(reinterpret_cast<__m128i*>(p), x);
}
-template <>
-SFINLINE __m256i loadu<__m256i>(const uint8_t* p)
+SFINLINE void stream(int32_t* p, const __m128i& x)
{
- return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
+ return _mm_stream_si128(reinterpret_cast<__m128i*>(p), x);
}
-template <typename T>
-T loadu(const int32_t* p);
-template <>
-SFINLINE __m128i loadu<__m128i>(const int32_t* p)
+
+/*-----------cast--------------------------*/
+SFINLINE __m128i castps_si(const __m128& x)
{
- return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
+ return _mm_castps_si128(x);
}
-template <>
-SFINLINE __m256i loadu<__m256i>(const int32_t* p)
+SFINLINE __m128 castsi_ps(const __m128i& x)
{
- return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
+ return _mm_castsi128_ps(x);
}
-/*-------------store---------------------*/
-template <typename T>
-void store(float* p, const T& x) {}
-template <>
-SFINLINE void store<__m128>(float* p, const __m128& x)
+/*-------------------logical-------------------------------*/
+SFINLINE __m128 and_ps(const __m128& x, const __m128& y)
{
- _mm_store_ps(p, x);
+ return _mm_and_ps(x, y);
}
-template <>
-SFINLINE void store<__m256>(float* p, const __m256& x)
+SFINLINE __m128i and_si(const __m128i& x, const __m128i& y)
{
- _mm256_store_ps(p, x);
+ return _mm_and_si128(x, y);
}
-template <typename T>
-void storeu(float* p, const T& x) {}
+SFINLINE __m128 or_ps(const __m128& x, const __m128& y)
+{
+ return _mm_or_ps(x, y);
+}
-template <>
-SFINLINE void storeu<__m128>(float* p, const __m128& x)
+SFINLINE __m128i or_si(const __m128i& x, const __m128i& y)
{
- _mm_storeu_ps(p, x);
+ return _mm_or_si128(x, y);
}
-template <>
-SFINLINE void storeu<__m256>(float* p, const __m256& x)
+SFINLINE __m128 andnot_ps(const __m128& x, const __m128& y)
{
- _mm256_storeu_ps(p, x);
+ return _mm_andnot_ps(x, y);
}
-template <typename T>
-void stream(float* p, const T& x) {}
+SFINLINE __m128i andnot_si(const __m128i& x, const __m128i& y)
+{
+ return _mm_andnot_si128(x, y);
+}
-template <>
-SFINLINE void stream<__m128>(float* p, const __m128& x)
+SFINLINE __m128 xor_ps(const __m128& x, const __m128& y)
{
- _mm_stream_ps(p, x);
+ return _mm_xor_ps(x, y);
}
-template <>
-SFINLINE void stream<__m256>(float* p, const __m256& x)
+SFINLINE __m128i xor_si(const __m128i& x, const __m128i& y)
{
- _mm256_stream_ps(p, x);
+ return _mm_xor_si128(x, y);
}
-template <typename T>
-void stream(uint8_t* p, const T& x) {}
-template <>
-SFINLINE void stream<__m128i>(uint8_t* p, const __m128i& x)
+/*-----------------shift-----------------------*/
+SFINLINE __m128i srli_i32(const __m128i& x, int n)
{
- return _mm_stream_si128(reinterpret_cast<__m128i*>(p), x);
+ return _mm_srli_epi32(x, n);
}
-template <>
-SFINLINE void stream<__m256i>(uint8_t* p, const __m256i& x)
+
+/*------------------arithmetic--------------------*/
+SFINLINE __m128 add(const __m128& x, const __m128& y)
{
- return _mm256_stream_si256(reinterpret_cast<__m256i*>(p), x);
+ return _mm_add_ps(x, y);
}
-template <typename T>
-void stream(int32_t* p, const T& x) {}
+SFINLINE __m128 sub(const __m128& x, const __m128& y)
+{
+ return _mm_sub_ps(x, y);
+}
-template <>
-SFINLINE void stream<__m128i>(int32_t* p, const __m128i& x)
+SFINLINE __m128 mul(const __m128& x, const __m128& y)
{
- return _mm_stream_si128(reinterpret_cast<__m128i*>(p), x);
+ return _mm_mul_ps(x, y);
}
-template <>
-SFINLINE void stream<__m256i>(int32_t* p, const __m256i& x)
+SFINLINE __m128 madd(const __m128& x, const __m128& y, const __m128& z)
{
- return _mm256_stream_si256(reinterpret_cast<__m256i*>(p), x);
+ return add(mul(x, y), z);
}
-/*-----------cast--------------------------*/
-SFINLINE __m128i castps_si(const __m128& x)
+/*--------------convert-----------------------*/
+SFINLINE __m128i cvtps_i32(const __m128& x)
{
- return _mm_castps_si128(x);
+ return _mm_cvtps_epi32(x);
}
-SFINLINE __m256i castps_si(const __m256& x)
+template <typename T, arch_t ARCH>
+T cvtu8_ps(const uint8_t* ptr);
+
+template <>
+SFINLINE __m128 cvtu8_ps<__m128, HAS_SSE2>(const uint8_t* ptr)
{
- return _mm256_castps_si256(x);
+ const int32_t* p32 = reinterpret_cast<const int32_t*>(ptr);
+ __m128i t = _mm_cvtsi32_si128(p32[0]);
+ __m128i z = zero<__m128i>();
+ t = _mm_unpacklo_epi8(t, z);
+ t = _mm_unpacklo_epi16(t, z);
+ return _mm_cvtepi32_ps(t);
}
-SFINLINE __m128 castsi_ps(const __m128i& x)
+template <>
+SFINLINE __m128 cvtu8_ps<__m128, HAS_SSE41>(const uint8_t* ptr)
{
- return _mm_castsi128_ps(x);
+ const int32_t* p32 = reinterpret_cast<const int32_t*>(ptr);
+ __m128i t = _mm_cvtsi32_si128(p32[0]);
+ t = _mm_cvtepu8_epi32(t);
+ return _mm_cvtepi32_ps(t);
}
-SFINLINE __m256 castsi_ps(const __m256i& x)
+SFINLINE __m128i
+cvti32_u8(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
- return _mm256_castsi256_ps(x);
+ __m128i x = _mm_packs_epi32(a, b);
+ __m128i y = _mm_packs_epi32(c, d);
+ return _mm_packus_epi16(x, y);
}
+/*-----------------math-----------------------*/
+SFINLINE __m128 max(const __m128& x, const __m128& y)
+{
+ return _mm_max_ps(x, y);
+}
-/*-------------------logical-------------------------------*/
-SFINLINE __m128 and_ps(const __m128& x, const __m128& y)
+SFINLINE __m128 rcp_ps(const __m128& x)
{
- return _mm_and_ps(x, y);
+ return _mm_rcp_ps(x);
}
-SFINLINE __m256 and_ps(const __m256& x, const __m256& y)
+SFINLINE __m128 sqrt(const __m128& x)
{
- return _mm256_and_ps(x, y);
+ return _mm_sqrt_ps(x);
}
-SFINLINE __m128i and_si(const __m128i& x, const __m128i& y)
+
+/*-----------compare-------------------------------*/
+
+SFINLINE __m128i cmpeq_i32(const __m128i& x, const __m128i& y)
{
- return _mm_and_si128(x, y);
+ return _mm_cmpeq_epi32(x, y);
}
-SFINLINE __m256i and_si(const __m256i& x, const __m256i& y)
+SFINLINE __m128 cmplt_ps(const __m128& x, const __m128& y)
{
- return _mm256_and_si256(x, y);
+ return _mm_cmplt_ps(x, y);
}
-SFINLINE __m128 or_ps(const __m128& x, const __m128& y)
+SFINLINE __m128 cmpge_ps(const __m128& x, const __m128& y)
{
- return _mm_or_ps(x, y);
+ return _mm_cmpge_ps(x, y);
}
-SFINLINE __m256 or_ps(const __m256& x, const __m256& y)
+SFINLINE __m128 cmpord_ps(const __m128& x, const __m128& y)
{
- return _mm256_or_ps(x, y);
+ return _mm_cmpord_ps(x, y);
}
-SFINLINE __m128i or_si(const __m128i& x, const __m128i& y)
+
+
+
+/*----------------misc-----------------------------*/
+SFINLINE __m128 blendv(const __m128& x, const __m128& y, const __m128& mask)
{
- return _mm_or_si128(x, y);
+ return or_ps(and_ps(mask, y), andnot_ps(mask, x));
}
-SFINLINE __m256i or_si(const __m256i& x, const __m256i& y)
+
+
+
+
+#if defined(__AVX2__)
+
+template <>
+SFINLINE __m256i zero<__m256i>()
{
- return _mm256_or_si256(x, y);
+ return _mm256_setzero_si256();
}
-SFINLINE __m128 andnot_ps(const __m128& x, const __m128& y)
+template <>
+SFINLINE __m256 zero<__m256>()
{
- return _mm_andnot_ps(x, y);
+ return _mm256_setzero_ps();
}
-SFINLINE __m256 andnot_ps(const __m256& x, const __m256& y)
+template <>
+SFINLINE __m256 set1_ps<__m256>(const float& x)
{
- return _mm256_andnot_ps(x, y);
+ return _mm256_set1_ps(x);
}
-SFINLINE __m128i andnot_si(const __m128i& x, const __m128i& y)
+template <>
+SFINLINE __m256i set1_i8<__m256i>(const int8_t& x)
{
- return _mm_andnot_si128(x, y);
+ return _mm256_set1_epi8(x);
}
-SFINLINE __m256i andnot_si(const __m256i& x, const __m256i& y)
+template <>
+SFINLINE __m256 load(const float* p)
{
- return _mm256_andnot_si256(x, y);
+ return _mm256_load_ps(p);
}
-SFINLINE __m128 xor_ps(const __m128& x, const __m128& y)
+template <>
+SFINLINE __m256i load(const uint8_t* p)
{
- return _mm_xor_ps(x, y);
+ return _mm256_load_si256(reinterpret_cast<const __m256i*>(p));
}
-SFINLINE __m256 xor_ps(const __m256& x, const __m256& y)
+template <>
+SFINLINE __m256i load(const int32_t* p)
{
- return _mm256_xor_ps(x, y);
+ return _mm256_load_si256(reinterpret_cast<const __m256i*>(p));
}
-SFINLINE __m128i xor_si(const __m128i& x, const __m128i& y)
+template <>
+SFINLINE __m256 loadu(const float* p)
{
- return _mm_xor_si128(x, y);
+ return _mm256_loadu_ps(p);
}
-SFINLINE __m256i xor_si(const __m256i& x, const __m256i& y)
+template <>
+SFINLINE __m256i loadu(const uint8_t* p)
{
- return _mm256_xor_si256(x, y);
+ return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
}
+template <>
+SFINLINE __m256i loadu(const int32_t* p)
+{
+ return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
+}
-/*-----------------shift-----------------------*/
-SFINLINE __m128i srli_i32(const __m128i& x, int n)
+SFINLINE void store(float* p, const __m256& x)
{
- return _mm_srli_epi32(x, n);
+ _mm256_store_ps(p, x);
}
-SFINLINE __m256i srli_i32(const __m256i& x, int n)
+SFINLINE void storeu(float* p, const __m256& x)
{
- return _mm256_srli_epi32(x, n);
+ _mm256_storeu_ps(p, x);
}
+SFINLINE void stream(float* p, const __m256& x)
+{
+ _mm256_stream_ps(p, x);
+}
-/*------------------arithmetic--------------------*/
-SFINLINE __m128 add(const __m128& x, const __m128& y)
+SFINLINE void stream(uint8_t* p, const __m256i& x)
{
- return _mm_add_ps(x, y);
+ return _mm256_stream_si256(reinterpret_cast<__m256i*>(p), x);
}
-SFINLINE __m256 add(const __m256& x, const __m256& y)
+SFINLINE void stream(int32_t* p, const __m256i& x)
{
- return _mm256_add_ps(x, y);
+ return _mm256_stream_si256(reinterpret_cast<__m256i*>(p), x);
}
-SFINLINE __m128 sub(const __m128& x, const __m128& y)
+SFINLINE __m256i castps_si(const __m256& x)
{
- return _mm_sub_ps(x, y);
+ return _mm256_castps_si256(x);
}
-SFINLINE __m256 sub(const __m256& x, const __m256& y)
+SFINLINE __m256 castsi_ps(const __m256i& x)
{
- return _mm256_sub_ps(x, y);
+ return _mm256_castsi256_ps(x);
}
-SFINLINE __m128 mul(const __m128& x, const __m128& y)
+SFINLINE __m256 and_ps(const __m256& x, const __m256& y)
{
- return _mm_mul_ps(x, y);
+ return _mm256_and_ps(x, y);
}
-SFINLINE __m256 mul(const __m256& x, const __m256& y)
+SFINLINE __m256i and_si(const __m256i& x, const __m256i& y)
{
- return _mm256_mul_ps(x, y);
+ return _mm256_and_si256(x, y);
}
-SFINLINE __m128 madd(const __m128& x, const __m128& y, const __m128& z)
+SFINLINE __m256 or_ps(const __m256& x, const __m256& y)
{
- return add(mul(x, y), z);
+ return _mm256_or_ps(x, y);
}
-SFINLINE __m256 madd(const __m256& x, const __m256& y, const __m256& z)
+SFINLINE __m256i or_si(const __m256i& x, const __m256i& y)
{
- return _mm256_fmadd_ps(x, y, z);
+ return _mm256_or_si256(x, y);
}
+SFINLINE __m256 andnot_ps(const __m256& x, const __m256& y)
+{
+ return _mm256_andnot_ps(x, y);
+}
+SFINLINE __m256i andnot_si(const __m256i& x, const __m256i& y)
+{
+ return _mm256_andnot_si256(x, y);
+}
-/*--------------convert-----------------------*/
-SFINLINE __m128i cvtps_i32(const __m128& x)
+SFINLINE __m256 xor_ps(const __m256& x, const __m256& y)
{
- return _mm_cvtps_epi32(x);
+ return _mm256_xor_ps(x, y);
}
-SFINLINE __m256i cvtps_i32(const __m256& x)
+SFINLINE __m256i xor_si(const __m256i& x, const __m256i& y)
{
- return _mm256_cvtps_epi32(x);
+ return _mm256_xor_si256(x, y);
}
-template <typename T, arch_t ARCH>
-T cvtu8_ps(const uint8_t* ptr);
+SFINLINE __m256i srli_i32(const __m256i& x, int n)
+{
+ return _mm256_srli_epi32(x, n);
+}
-template <>
-SFINLINE __m128 cvtu8_ps<__m128, HAS_SSE2>(const uint8_t* ptr)
+SFINLINE __m256 add(const __m256& x, const __m256& y)
{
- const int32_t* p32 = reinterpret_cast<const int32_t*>(ptr);
- __m128i t = _mm_cvtsi32_si128(p32[0]);
- __m128i z = zero<__m128i>();
- t = _mm_unpacklo_epi8(t, z);
- t = _mm_unpacklo_epi16(t, z);
- return _mm_cvtepi32_ps(t);
+ return _mm256_add_ps(x, y);
}
-template <>
-SFINLINE __m128 cvtu8_ps<__m128, HAS_SSE41>(const uint8_t* ptr)
+SFINLINE __m256 sub(const __m256& x, const __m256& y)
{
- const int32_t* p32 = reinterpret_cast<const int32_t*>(ptr);
- __m128i t = _mm_cvtsi32_si128(p32[0]);
- t = _mm_cvtepu8_epi32(t);
- return _mm_cvtepi32_ps(t);
+ return _mm256_sub_ps(x, y);
+}
+
+SFINLINE __m256 mul(const __m256& x, const __m256& y)
+{
+ return _mm256_mul_ps(x, y);
+}
+
+SFINLINE __m256 madd(const __m256& x, const __m256& y, const __m256& z)
+{
+ return _mm256_fmadd_ps(x, y, z);
+}
+
+SFINLINE __m256i cvtps_i32(const __m256& x)
+{
+ return _mm256_cvtps_epi32(x);
}
template <>
@@ -469,14 +520,6 @@ SFINLINE __m256 cvtu8_ps<__m256, HAS_AVX2>(const uint8_t* ptr)
return _mm256_cvtepi32_ps(t1);
}
-SFINLINE __m128i
-cvti32_u8(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
-{
- __m128i x = _mm_packs_epi32(a, b);
- __m128i y = _mm_packs_epi32(c, d);
- return _mm_packus_epi16(x, y);
-}
-
SFINLINE __m256i
cvti32_u8(const __m256i& a, const __m256i& b, const __m256i& c, const __m256i& d)
{
@@ -487,109 +530,64 @@ cvti32_u8(const __m256i& a, const __m256i& b, const __m256i& c, const __m256i& d
return _mm256_packus_epi16(t0, t1);
}
-
-/*-----------------math-----------------------*/
-SFINLINE __m128 max(const __m128& x, const __m128& y)
-{
- return _mm_max_ps(x, y);
-}
-
SFINLINE __m256 max(const __m256& x, const __m256& y)
{
return _mm256_max_ps(x, y);
}
-template <typename T>
-SFINLINE T abs(const T& val)
-{
- return max(val, sub(zero<T>(), val));
-}
-
-SFINLINE __m128 rcp_ps(const __m128& x)
-{
- return _mm_rcp_ps(x);
-}
-
SFINLINE __m256 rcp_ps(const __m256& x)
{
return _mm256_rcp_ps(x);
}
-template <typename T>
-SFINLINE T rcp_hq(const T& x)
-{
- T rcp = rcp_ps(x);
- T t = mul(mul(x, rcp), rcp);
- rcp = add(rcp, rcp);
- return sub(rcp, t);
-}
-
-SFINLINE __m128 sqrt(const __m128& x)
-{
- return _mm_sqrt_ps(x);
-}
-
SFINLINE __m256 sqrt(const __m256& x)
{
return _mm256_sqrt_ps(x);
}
-
-/*-----------compare-------------------------------*/
-
-SFINLINE __m128i cmpeq_i32(const __m128i& x, const __m128i& y)
-{
- return _mm_cmpeq_epi32(x, y);
-}
-
SFINLINE __m256i cmpeq_i32(const __m256i& x, const __m256i& y)
{
return _mm256_cmpeq_epi32(x, y);
}
-SFINLINE __m128 cmplt_ps(const __m128& x, const __m128& y)
-{
- return _mm_cmplt_ps(x, y);
-}
-
SFINLINE __m256 cmplt_ps(const __m256& x, const __m256& y)
{
return _mm256_cmp_ps(x, y, _CMP_LT_OQ);
}
-SFINLINE __m128 cmpge_ps(const __m128& x, const __m128& y)
-{
- return _mm_cmpge_ps(x, y);
-}
-
SFINLINE __m256 cmpge_ps(const __m256& x, const __m256& y)
{
return _mm256_cmp_ps(x, y, _CMP_GE_OQ);
}
-SFINLINE __m128 cmpord_ps(const __m128& x, const __m128& y)
+SFINLINE __m256 cmpord_ps(const __m256& x, const __m256& y)
{
- return _mm_cmpord_ps(x, y);
+ return _mm256_cmp_ps(x, y, _CMP_ORD_Q);
}
-SFINLINE __m256 cmpord_ps(const __m256& x, const __m256& y)
+SFINLINE __m256 blendv(const __m256&x, const __m256& y, const __m256& mask)
{
- return _mm256_cmp_ps(x, y, _CMP_ORD_Q);
+ return _mm256_blendv_ps(x, y, mask);
}
+#endif // __AVX2__
-/*----------------misc-----------------------------*/
-SFINLINE __m128 blendv(const __m128& x, const __m128& y, const __m128& mask)
+
+template <typename T>
+SFINLINE T abs(const T& val)
{
- return or_ps(and_ps(mask, y), andnot_ps(mask, x));
+ return max(val, sub(zero<T>(), val));
}
-SFINLINE __m256 blendv(const __m256&x, const __m256& y, const __m256& mask)
+template <typename T>
+SFINLINE T rcp_hq(const T& x)
{
- return _mm256_blendv_ps(x, y, mask);
+ T rcp = rcp_ps(x);
+ T t = mul(mul(x, rcp), rcp);
+ rcp = add(rcp, rcp);
+ return sub(rcp, t);
}
-
#endif
diff --git a/avisynth/src/tcannymod.cpp b/avisynth/src/tcannymod.cpp
index 3e1a8f1..d3afd6a 100644
--- a/avisynth/src/tcannymod.cpp
+++ b/avisynth/src/tcannymod.cpp
@@ -28,39 +28,29 @@
#include
#include