diff --git a/.gitignore b/.gitignore index 8dda599..82d42fa 100644 --- a/.gitignore +++ b/.gitignore @@ -3,16 +3,18 @@ avisynth/src/Debug/* avisynth/src/x64/* avisynth/src/.vs/* avisynth/archive/* +vapoursynth/* *.opensdf *.opendb *.psess *.sdf *.suo *.sln -*.vcxproj.filters -*.vcxproj.user +*.filters +*.user *.vspx *.vsp *.dll *.avs *.exe +*.db diff --git a/avisynth/LISENCE.GPLv2 b/avisynth/LICENSE.GPLv2 similarity index 100% rename from avisynth/LISENCE.GPLv2 rename to avisynth/LICENSE.GPLv2 diff --git a/avisynth/src/TCannyMod.vcxproj b/avisynth/src/TCannyMod.vcxproj index 3353dd6..567db3d 100644 --- a/avisynth/src/TCannyMod.vcxproj +++ b/avisynth/src/TCannyMod.vcxproj @@ -71,7 +71,7 @@ true - true + false false @@ -103,12 +103,14 @@ Speed true StreamingSIMDExtensions2 - Precise + Fast AnySuitable + true + MaxSpeed MachineX86 - Debug + No Windows true true @@ -120,14 +122,14 @@ AnySuitable true Speed - AdvancedVectorExtensions + NotSet true Fast - Full + MaxSpeed true - Debug + No UseLinkTimeCodeGeneration diff --git a/avisynth/src/cpu_check.cpp b/avisynth/src/cpu_check.cpp index f52d43e..e8855f1 100644 --- a/avisynth/src/cpu_check.cpp +++ b/avisynth/src/cpu_check.cpp @@ -71,12 +71,12 @@ static inline void get_cpuid2(int *array, int info_type, int ecx) #endif } -static inline int is_bit_set(int bitfield, int bit) +static inline int is_bit_set(int bitfield, int bit) noexcept { return bitfield & (1 << bit); } -static uint32_t get_simd_support_info(void) +static uint32_t get_simd_support_info(void) noexcept { uint32_t ret = 0; int regs[4] = {0}; @@ -157,27 +157,27 @@ static uint32_t get_simd_support_info(void) return ret; } -int has_sse2() +bool has_sse2() noexcept { - return !!(get_simd_support_info() & CPU_SSE2_SUPPORT); + return (get_simd_support_info() & CPU_SSE2_SUPPORT) != 0; } -int has_ssse3() +bool has_ssse3() noexcept { - return !!(get_simd_support_info() & CPU_SSSE3_SUPPORT); + return (get_simd_support_info() & CPU_SSSE3_SUPPORT) != 0; } -int has_sse41() +bool has_sse41() noexcept { - return !!(get_simd_support_info() & CPU_SSE4_1_SUPPORT); + return (get_simd_support_info() & CPU_SSE4_1_SUPPORT) != 0; } -int has_avx() +bool has_avx() noexcept { - return !!(get_simd_support_info() & CPU_AVX_SUPPORT); + return (get_simd_support_info() & CPU_AVX_SUPPORT) != 0; } -int has_avx2() +bool has_avx2() noexcept { - return !!(get_simd_support_info() & CPU_AVX2_SUPPORT); + return (get_simd_support_info() & CPU_AVX2_SUPPORT) != 0; } diff --git a/avisynth/src/edge_detection.h b/avisynth/src/edge_detection.h index eccb577..69b7bb2 100644 --- a/avisynth/src/edge_detection.h +++ b/avisynth/src/edge_detection.h @@ -31,7 +31,7 @@ #include "simd.h" -static const float* get_tangent(int idx) +static const float* get_tangent(int idx) noexcept { alignas(32) static const float tangent[32] = { 0.414213538169860839843750f, 0.414213538169860839843750f, // tan(pi/8) @@ -60,7 +60,7 @@ template static void __stdcall standard(float* blurp, const size_t blur_pitch, float* emaskp, const size_t emask_pitch, int32_t* dirp, const size_t dir_pitch, - const size_t width, const size_t height) + const size_t width, const size_t height) noexcept { constexpr size_t step = sizeof(Vf) / sizeof(float); @@ -110,13 +110,13 @@ standard(float* blurp, const size_t blur_pitch, float* emaskp, Vi d3 = castps_si(and_ps(cmpge_ps(tan, t1125), cmplt_ps(tan, t1575))); d3 = srli_i32(d3, 24); d0 = or_si(or_si(d0, d1), or_si(d2, d3)); - stream(dirp + x, d0); + stream(dirp + x, d0); } Vf magnitude = mul(gx, gx); magnitude = madd(gy, gy, magnitude); magnitude = sqrt(magnitude); - stream(emaskp + x, magnitude); + stream(emaskp + x, magnitude); } emaskp += emask_pitch; dirp += dir_pitch; @@ -138,7 +138,7 @@ template static void __stdcall sobel(float* blurp, const size_t blur_pitch, float* emaskp, const size_t emask_pitch, int32_t* dirp, const size_t dir_pitch, - const size_t width, const size_t height) + const size_t width, const size_t height) noexcept { constexpr size_t step = sizeof(Vf) / sizeof(float); @@ -197,13 +197,13 @@ sobel(float* blurp, const size_t blur_pitch, float* emaskp, Vi d3 = castps_si(and_ps(cmpge_ps(tan, t1125), cmplt_ps(tan, t1575))); d3 = srli_i32(d3, 24); d0 = or_si(or_si(d0, d1), or_si(d2, d3)); - stream(dirp + x, d0); + stream(dirp + x, d0); } Vf magnitude = mul(gx, gx); magnitude = madd(gy, gy, magnitude); magnitude = sqrt(magnitude); - stream(emaskp + x, magnitude); + stream(emaskp + x, magnitude); } emaskp += emask_pitch; dirp += dir_pitch; @@ -218,7 +218,8 @@ template static void __stdcall non_max_suppress(const float* emaskp, const size_t em_pitch, const int32_t* dirp, const size_t dir_pitch, float* blurp, - const size_t blur_pitch, const size_t width, const size_t height) + const size_t blur_pitch, const size_t width, + const size_t height) noexcept { constexpr size_t step = sizeof(Vf) / sizeof(float); @@ -275,7 +276,7 @@ non_max_suppress(const float* emaskp, const size_t em_pitch, void __stdcall hysteresis(uint8_t* hystp, const size_t hpitch, float* blurp, const size_t bpitch, const int width, const int height, - const float tmin, const float tmax); + const float tmin, const float tmax) noexcept; #endif diff --git a/avisynth/src/gaussian_blur.h b/avisynth/src/gaussian_blur.h index 46e04a8..3504933 100644 --- a/avisynth/src/gaussian_blur.h +++ b/avisynth/src/gaussian_blur.h @@ -32,14 +32,14 @@ template static void __stdcall convert_to_float(const size_t width, const size_t height, const uint8_t* srcp, - const int src_pitch, float* blurp, const size_t blur_pitch) + const int src_pitch, float* blurp, const size_t blur_pitch) noexcept { constexpr size_t step = sizeof(Vf) / sizeof(float); for (size_t y = 0; y < height; y++) { for (size_t x = 0; x < width; x += step) { Vf val = cvtu8_ps(srcp + x); - stream(blurp + x, val); + stream(blurp + x, val); } srcp += src_pitch; blurp += blur_pitch; @@ -50,7 +50,7 @@ convert_to_float(const size_t width, const size_t height, const uint8_t* srcp, template static void horizontal_blur(const float* hkernel, float* buffp, const int radius, - const size_t width, float* blurp) + const size_t width, float* blurp) noexcept { constexpr size_t step = sizeof(Vf) / sizeof(float); const int length = radius * 2 + 1; @@ -67,7 +67,7 @@ horizontal_blur(const float* hkernel, float* buffp, const int radius, Vf val = loadu(buffp + x + i); sum = madd(k, val, sum); } - stream(blurp + x, sum); + stream(blurp + x, sum); } } @@ -77,7 +77,7 @@ static void __stdcall gaussian_blur(const int radius, const float* kernel, const float* hkernel, float* buffp, float* blurp, const size_t blur_pitch, const uint8_t* srcp, const size_t src_pitch, const size_t width, - const size_t height) + const size_t height) noexcept { if (radius == 0) { convert_to_float( @@ -106,7 +106,7 @@ gaussian_blur(const int radius, const float* kernel, const float* hkernel, sum = madd(k, input, sum); } - store(buffp + x, sum); + store(buffp + x, sum); } horizontal_blur(hkernel, buffp, radius, width, blurp); blurp += blur_pitch; diff --git a/avisynth/src/hysteresis.cpp b/avisynth/src/hysteresis.cpp index 52c37ad..ca5ae2d 100644 --- a/avisynth/src/hysteresis.cpp +++ b/avisynth/src/hysteresis.cpp @@ -41,7 +41,7 @@ struct Pos { static __forceinline void hystfunc(const int32_t x, const int32_t y, float* edge, uint8_t* hyst, const size_t epitch, const size_t hpitch, const float th, - std::vector& stack) + std::vector& stack) noexcept { if (!hyst[x + y * hpitch] && edge[x + y * epitch] > th) { edge[x + y * epitch] = FLT_MAX; @@ -54,10 +54,11 @@ hystfunc(const int32_t x, const int32_t y, float* edge, uint8_t* hyst, void __stdcall hysteresis(uint8_t* hystp, const size_t hpitch, float* blurp, const size_t bpitch, const int width, const int height, - const float tmin, const float tmax) + const float tmin, const float tmax) noexcept { memset(hystp, 0, hpitch * height); std::vector stack; + stack.reserve(512); for (int32_t y = 0; y < height; ++y) { for (int32_t x = 0; x < width; ++x) { diff --git a/avisynth/src/simd.h b/avisynth/src/simd.h index 64352d3..4f5fc44 100644 --- a/avisynth/src/simd.h +++ b/avisynth/src/simd.h @@ -26,8 +26,13 @@ #ifndef TCANNY_MOD_SIMD_H #define TCANNY_MOD_SIMD_H + #include +#if defined(__AVX2__) #include +#else +#include +#endif #define SFINLINE static __forceinline @@ -42,6 +47,10 @@ enum arch_t { template T zero(); +template +T set1_ps(const float& x); + + template <> SFINLINE __m128i zero<__m128i>() { @@ -54,33 +63,12 @@ SFINLINE __m128 zero<__m128>() return _mm_setzero_ps(); } -template <> -SFINLINE __m256i zero<__m256i>() -{ - return _mm256_setzero_si256(); -} - -template <> -SFINLINE __m256 zero<__m256>() -{ - return _mm256_setzero_ps(); -} - -template -T set1_ps(const float& x); - template <> SFINLINE __m128 set1_ps<__m128>(const float& x) { return _mm_set_ps1(x); } -template <> -SFINLINE __m256 set1_ps<__m256>(const float& x) -{ - return _mm256_set1_ps(x); -} - template T set1_i8(const int8_t&); @@ -90,17 +78,19 @@ SFINLINE __m128i set1_i8<__m128i>(const int8_t& x) return _mm_set1_epi8(x); } -template <> -SFINLINE __m256i set1_i8<__m256i>(const int8_t& x) -{ - return _mm256_set1_epi8(x); -} - /*---------------load--------------------*/ template T load(const float* p); +template +T load(const uint8_t*); + +template +T load(const int32_t*); + + + template <> SFINLINE __m128 load<__m128>(const float* p) { @@ -108,357 +98,418 @@ SFINLINE __m128 load<__m128>(const float* p) } template <> -SFINLINE __m256 load<__m256>(const float* p) +SFINLINE __m128i load(const uint8_t* p) { - return _mm256_load_ps(p); + return _mm_load_si128(reinterpret_cast(p)); } -template -T load(const uint8_t*); - template <> -SFINLINE __m128i load(const uint8_t* p) +SFINLINE __m128i load(const int32_t* p) { return _mm_load_si128(reinterpret_cast(p)); } + + +template +T loadu(const float* p); + +template +T loadu(const uint8_t* p); + +template +T loadu(const int32_t* p); + + + template <> -SFINLINE __m256i load(const uint8_t* p) +SFINLINE __m128 loadu(const float* p) { - return _mm256_load_si256(reinterpret_cast(p)); + return _mm_loadu_ps(p); } -template -T load(const int32_t*); - template <> -SFINLINE __m128i load(const int32_t* p) +SFINLINE __m128i loadu(const uint8_t* p) { - return _mm_load_si128(reinterpret_cast(p)); + return _mm_loadu_si128(reinterpret_cast(p)); } template <> -SFINLINE __m256i load(const int32_t* p) +SFINLINE __m128i loadu(const int32_t* p) { - return _mm256_load_si256(reinterpret_cast(p)); + return _mm_loadu_si128(reinterpret_cast(p)); } -template -T loadu(const float* p); -template <> -SFINLINE __m128 loadu<__m128>(const float* p) +/*-------------store---------------------*/ + +SFINLINE void store(float* p, const __m128& x) { - return _mm_loadu_ps(p); + _mm_store_ps(p, x); } -template <> -SFINLINE __m256 loadu<__m256>(const float* p) +SFINLINE void storeu(float* p, const __m128& x) { - return _mm256_loadu_ps(p); + _mm_storeu_ps(p, x); } -template -T loadu(const uint8_t* p); +SFINLINE void stream(float* p, const __m128& x) +{ + _mm_stream_ps(p, x); +} -template <> -SFINLINE __m128i loadu<__m128i>(const uint8_t* p) +SFINLINE void stream(uint8_t* p, const __m128i& x) { - return _mm_loadu_si128(reinterpret_cast(p)); + return _mm_stream_si128(reinterpret_cast<__m128i*>(p), x); } -template <> -SFINLINE __m256i loadu<__m256i>(const uint8_t* p) +SFINLINE void stream(int32_t* p, const __m128i& x) { - return _mm256_loadu_si256(reinterpret_cast(p)); + return _mm_stream_si128(reinterpret_cast<__m128i*>(p), x); } -template -T loadu(const int32_t* p); -template <> -SFINLINE __m128i loadu<__m128i>(const int32_t* p) + +/*-----------cast--------------------------*/ +SFINLINE __m128i castps_si(const __m128& x) { - return _mm_loadu_si128(reinterpret_cast(p)); + return _mm_castps_si128(x); } -template <> -SFINLINE __m256i loadu<__m256i>(const int32_t* p) +SFINLINE __m128 castsi_ps(const __m128i& x) { - return _mm256_loadu_si256(reinterpret_cast(p)); + return _mm_castsi128_ps(x); } -/*-------------store---------------------*/ -template -void store(float* p, const T& x) {} -template <> -SFINLINE void store<__m128>(float* p, const __m128& x) +/*-------------------logical-------------------------------*/ +SFINLINE __m128 and_ps(const __m128& x, const __m128& y) { - _mm_store_ps(p, x); + return _mm_and_ps(x, y); } -template <> -SFINLINE void store<__m256>(float* p, const __m256& x) +SFINLINE __m128i and_si(const __m128i& x, const __m128i& y) { - _mm256_store_ps(p, x); + return _mm_and_si128(x, y); } -template -void storeu(float* p, const T& x) {} +SFINLINE __m128 or_ps(const __m128& x, const __m128& y) +{ + return _mm_or_ps(x, y); +} -template <> -SFINLINE void storeu<__m128>(float* p, const __m128& x) +SFINLINE __m128i or_si(const __m128i& x, const __m128i& y) { - _mm_storeu_ps(p, x); + return _mm_or_si128(x, y); } -template <> -SFINLINE void storeu<__m256>(float* p, const __m256& x) +SFINLINE __m128 andnot_ps(const __m128& x, const __m128& y) { - _mm256_storeu_ps(p, x); + return _mm_andnot_ps(x, y); } -template -void stream(float* p, const T& x) {} +SFINLINE __m128i andnot_si(const __m128i& x, const __m128i& y) +{ + return _mm_andnot_si128(x, y); +} -template <> -SFINLINE void stream<__m128>(float* p, const __m128& x) +SFINLINE __m128 xor_ps(const __m128& x, const __m128& y) { - _mm_stream_ps(p, x); + return _mm_xor_ps(x, y); } -template <> -SFINLINE void stream<__m256>(float* p, const __m256& x) +SFINLINE __m128i xor_si(const __m128i& x, const __m128i& y) { - _mm256_stream_ps(p, x); + return _mm_xor_si128(x, y); } -template -void stream(uint8_t* p, const T& x) {} -template <> -SFINLINE void stream<__m128i>(uint8_t* p, const __m128i& x) +/*-----------------shift-----------------------*/ +SFINLINE __m128i srli_i32(const __m128i& x, int n) { - return _mm_stream_si128(reinterpret_cast<__m128i*>(p), x); + return _mm_srli_epi32(x, n); } -template <> -SFINLINE void stream<__m256i>(uint8_t* p, const __m256i& x) + +/*------------------arithmetic--------------------*/ +SFINLINE __m128 add(const __m128& x, const __m128& y) { - return _mm256_stream_si256(reinterpret_cast<__m256i*>(p), x); + return _mm_add_ps(x, y); } -template -void stream(int32_t* p, const T& x) {} +SFINLINE __m128 sub(const __m128& x, const __m128& y) +{ + return _mm_sub_ps(x, y); +} -template <> -SFINLINE void stream<__m128i>(int32_t* p, const __m128i& x) +SFINLINE __m128 mul(const __m128& x, const __m128& y) { - return _mm_stream_si128(reinterpret_cast<__m128i*>(p), x); + return _mm_mul_ps(x, y); } -template <> -SFINLINE void stream<__m256i>(int32_t* p, const __m256i& x) +SFINLINE __m128 madd(const __m128& x, const __m128& y, const __m128& z) { - return _mm256_stream_si256(reinterpret_cast<__m256i*>(p), x); + return add(mul(x, y), z); } -/*-----------cast--------------------------*/ -SFINLINE __m128i castps_si(const __m128& x) +/*--------------convert-----------------------*/ +SFINLINE __m128i cvtps_i32(const __m128& x) { - return _mm_castps_si128(x); + return _mm_cvtps_epi32(x); } -SFINLINE __m256i castps_si(const __m256& x) +template +T cvtu8_ps(const uint8_t* ptr); + +template <> +SFINLINE __m128 cvtu8_ps<__m128, HAS_SSE2>(const uint8_t* ptr) { - return _mm256_castps_si256(x); + const int32_t* p32 = reinterpret_cast(ptr); + __m128i t = _mm_cvtsi32_si128(p32[0]); + __m128i z = zero<__m128i>(); + t = _mm_unpacklo_epi8(t, z); + t = _mm_unpacklo_epi16(t, z); + return _mm_cvtepi32_ps(t); } -SFINLINE __m128 castsi_ps(const __m128i& x) +template <> +SFINLINE __m128 cvtu8_ps<__m128, HAS_SSE41>(const uint8_t* ptr) { - return _mm_castsi128_ps(x); + const int32_t* p32 = reinterpret_cast(ptr); + __m128i t = _mm_cvtsi32_si128(p32[0]); + t = _mm_cvtepu8_epi32(t); + return _mm_cvtepi32_ps(t); } -SFINLINE __m256 castsi_ps(const __m256i& x) +SFINLINE __m128i +cvti32_u8(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d) { - return _mm256_castsi256_ps(x); + __m128i x = _mm_packs_epi32(a, b); + __m128i y = _mm_packs_epi32(c, d); + return _mm_packus_epi16(x, y); } +/*-----------------math-----------------------*/ +SFINLINE __m128 max(const __m128& x, const __m128& y) +{ + return _mm_max_ps(x, y); +} -/*-------------------logical-------------------------------*/ -SFINLINE __m128 and_ps(const __m128& x, const __m128& y) +SFINLINE __m128 rcp_ps(const __m128& x) { - return _mm_and_ps(x, y); + return _mm_rcp_ps(x); } -SFINLINE __m256 and_ps(const __m256& x, const __m256& y) +SFINLINE __m128 sqrt(const __m128& x) { - return _mm256_and_ps(x, y); + return _mm_sqrt_ps(x); } -SFINLINE __m128i and_si(const __m128i& x, const __m128i& y) + +/*-----------compare-------------------------------*/ + +SFINLINE __m128i cmpeq_i32(const __m128i& x, const __m128i& y) { - return _mm_and_si128(x, y); + return _mm_cmpeq_epi32(x, y); } -SFINLINE __m256i and_si(const __m256i& x, const __m256i& y) +SFINLINE __m128 cmplt_ps(const __m128& x, const __m128& y) { - return _mm256_and_si256(x, y); + return _mm_cmplt_ps(x, y); } -SFINLINE __m128 or_ps(const __m128& x, const __m128& y) +SFINLINE __m128 cmpge_ps(const __m128& x, const __m128& y) { - return _mm_or_ps(x, y); + return _mm_cmpge_ps(x, y); } -SFINLINE __m256 or_ps(const __m256& x, const __m256& y) +SFINLINE __m128 cmpord_ps(const __m128& x, const __m128& y) { - return _mm256_or_ps(x, y); + return _mm_cmpord_ps(x, y); } -SFINLINE __m128i or_si(const __m128i& x, const __m128i& y) + + + +/*----------------misc-----------------------------*/ +SFINLINE __m128 blendv(const __m128& x, const __m128& y, const __m128& mask) { - return _mm_or_si128(x, y); + return or_ps(and_ps(mask, y), andnot_ps(mask, x)); } -SFINLINE __m256i or_si(const __m256i& x, const __m256i& y) + + + + +#if defined(__AVX2__) + +template <> +SFINLINE __m256i zero<__m256i>() { - return _mm256_or_si256(x, y); + return _mm256_setzero_si256(); } -SFINLINE __m128 andnot_ps(const __m128& x, const __m128& y) +template <> +SFINLINE __m256 zero<__m256>() { - return _mm_andnot_ps(x, y); + return _mm256_setzero_ps(); } -SFINLINE __m256 andnot_ps(const __m256& x, const __m256& y) +template <> +SFINLINE __m256 set1_ps<__m256>(const float& x) { - return _mm256_andnot_ps(x, y); + return _mm256_set1_ps(x); } -SFINLINE __m128i andnot_si(const __m128i& x, const __m128i& y) +template <> +SFINLINE __m256i set1_i8<__m256i>(const int8_t& x) { - return _mm_andnot_si128(x, y); + return _mm256_set1_epi8(x); } -SFINLINE __m256i andnot_si(const __m256i& x, const __m256i& y) +template <> +SFINLINE __m256 load(const float* p) { - return _mm256_andnot_si256(x, y); + return _mm256_load_ps(p); } -SFINLINE __m128 xor_ps(const __m128& x, const __m128& y) +template <> +SFINLINE __m256i load(const uint8_t* p) { - return _mm_xor_ps(x, y); + return _mm256_load_si256(reinterpret_cast(p)); } -SFINLINE __m256 xor_ps(const __m256& x, const __m256& y) +template <> +SFINLINE __m256i load(const int32_t* p) { - return _mm256_xor_ps(x, y); + return _mm256_load_si256(reinterpret_cast(p)); } -SFINLINE __m128i xor_si(const __m128i& x, const __m128i& y) +template <> +SFINLINE __m256 loadu(const float* p) { - return _mm_xor_si128(x, y); + return _mm256_loadu_ps(p); } -SFINLINE __m256i xor_si(const __m256i& x, const __m256i& y) +template <> +SFINLINE __m256i loadu(const uint8_t* p) { - return _mm256_xor_si256(x, y); + return _mm256_loadu_si256(reinterpret_cast(p)); } +template <> +SFINLINE __m256i loadu(const int32_t* p) +{ + return _mm256_loadu_si256(reinterpret_cast(p)); +} -/*-----------------shift-----------------------*/ -SFINLINE __m128i srli_i32(const __m128i& x, int n) +SFINLINE void store(float* p, const __m256& x) { - return _mm_srli_epi32(x, n); + _mm256_store_ps(p, x); } -SFINLINE __m256i srli_i32(const __m256i& x, int n) +SFINLINE void storeu(float* p, const __m256& x) { - return _mm256_srli_epi32(x, n); + _mm256_storeu_ps(p, x); } +SFINLINE void stream(float* p, const __m256& x) +{ + _mm256_stream_ps(p, x); +} -/*------------------arithmetic--------------------*/ -SFINLINE __m128 add(const __m128& x, const __m128& y) +SFINLINE void stream(uint8_t* p, const __m256i& x) { - return _mm_add_ps(x, y); + return _mm256_stream_si256(reinterpret_cast<__m256i*>(p), x); } -SFINLINE __m256 add(const __m256& x, const __m256& y) +SFINLINE void stream(int32_t* p, const __m256i& x) { - return _mm256_add_ps(x, y); + return _mm256_stream_si256(reinterpret_cast<__m256i*>(p), x); } -SFINLINE __m128 sub(const __m128& x, const __m128& y) +SFINLINE __m256i castps_si(const __m256& x) { - return _mm_sub_ps(x, y); + return _mm256_castps_si256(x); } -SFINLINE __m256 sub(const __m256& x, const __m256& y) +SFINLINE __m256 castsi_ps(const __m256i& x) { - return _mm256_sub_ps(x, y); + return _mm256_castsi256_ps(x); } -SFINLINE __m128 mul(const __m128& x, const __m128& y) +SFINLINE __m256 and_ps(const __m256& x, const __m256& y) { - return _mm_mul_ps(x, y); + return _mm256_and_ps(x, y); } -SFINLINE __m256 mul(const __m256& x, const __m256& y) +SFINLINE __m256i and_si(const __m256i& x, const __m256i& y) { - return _mm256_mul_ps(x, y); + return _mm256_and_si256(x, y); } -SFINLINE __m128 madd(const __m128& x, const __m128& y, const __m128& z) +SFINLINE __m256 or_ps(const __m256& x, const __m256& y) { - return add(mul(x, y), z); + return _mm256_or_ps(x, y); } -SFINLINE __m256 madd(const __m256& x, const __m256& y, const __m256& z) +SFINLINE __m256i or_si(const __m256i& x, const __m256i& y) { - return _mm256_fmadd_ps(x, y, z); + return _mm256_or_si256(x, y); } +SFINLINE __m256 andnot_ps(const __m256& x, const __m256& y) +{ + return _mm256_andnot_ps(x, y); +} +SFINLINE __m256i andnot_si(const __m256i& x, const __m256i& y) +{ + return _mm256_andnot_si256(x, y); +} -/*--------------convert-----------------------*/ -SFINLINE __m128i cvtps_i32(const __m128& x) +SFINLINE __m256 xor_ps(const __m256& x, const __m256& y) { - return _mm_cvtps_epi32(x); + return _mm256_xor_ps(x, y); } -SFINLINE __m256i cvtps_i32(const __m256& x) +SFINLINE __m256i xor_si(const __m256i& x, const __m256i& y) { - return _mm256_cvtps_epi32(x); + return _mm256_xor_si256(x, y); } -template -T cvtu8_ps(const uint8_t* ptr); +SFINLINE __m256i srli_i32(const __m256i& x, int n) +{ + return _mm256_srli_epi32(x, n); +} -template <> -SFINLINE __m128 cvtu8_ps<__m128, HAS_SSE2>(const uint8_t* ptr) +SFINLINE __m256 add(const __m256& x, const __m256& y) { - const int32_t* p32 = reinterpret_cast(ptr); - __m128i t = _mm_cvtsi32_si128(p32[0]); - __m128i z = zero<__m128i>(); - t = _mm_unpacklo_epi8(t, z); - t = _mm_unpacklo_epi16(t, z); - return _mm_cvtepi32_ps(t); + return _mm256_add_ps(x, y); } -template <> -SFINLINE __m128 cvtu8_ps<__m128, HAS_SSE41>(const uint8_t* ptr) +SFINLINE __m256 sub(const __m256& x, const __m256& y) { - const int32_t* p32 = reinterpret_cast(ptr); - __m128i t = _mm_cvtsi32_si128(p32[0]); - t = _mm_cvtepu8_epi32(t); - return _mm_cvtepi32_ps(t); + return _mm256_sub_ps(x, y); +} + +SFINLINE __m256 mul(const __m256& x, const __m256& y) +{ + return _mm256_mul_ps(x, y); +} + +SFINLINE __m256 madd(const __m256& x, const __m256& y, const __m256& z) +{ + return _mm256_fmadd_ps(x, y, z); +} + +SFINLINE __m256i cvtps_i32(const __m256& x) +{ + return _mm256_cvtps_epi32(x); } template <> @@ -469,14 +520,6 @@ SFINLINE __m256 cvtu8_ps<__m256, HAS_AVX2>(const uint8_t* ptr) return _mm256_cvtepi32_ps(t1); } -SFINLINE __m128i -cvti32_u8(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d) -{ - __m128i x = _mm_packs_epi32(a, b); - __m128i y = _mm_packs_epi32(c, d); - return _mm_packus_epi16(x, y); -} - SFINLINE __m256i cvti32_u8(const __m256i& a, const __m256i& b, const __m256i& c, const __m256i& d) { @@ -487,109 +530,64 @@ cvti32_u8(const __m256i& a, const __m256i& b, const __m256i& c, const __m256i& d return _mm256_packus_epi16(t0, t1); } - -/*-----------------math-----------------------*/ -SFINLINE __m128 max(const __m128& x, const __m128& y) -{ - return _mm_max_ps(x, y); -} - SFINLINE __m256 max(const __m256& x, const __m256& y) { return _mm256_max_ps(x, y); } -template -SFINLINE T abs(const T& val) -{ - return max(val, sub(zero(), val)); -} - -SFINLINE __m128 rcp_ps(const __m128& x) -{ - return _mm_rcp_ps(x); -} - SFINLINE __m256 rcp_ps(const __m256& x) { return _mm256_rcp_ps(x); } -template -SFINLINE T rcp_hq(const T& x) -{ - T rcp = rcp_ps(x); - T t = mul(mul(x, rcp), rcp); - rcp = add(rcp, rcp); - return sub(rcp, t); -} - -SFINLINE __m128 sqrt(const __m128& x) -{ - return _mm_sqrt_ps(x); -} - SFINLINE __m256 sqrt(const __m256& x) { return _mm256_sqrt_ps(x); } - -/*-----------compare-------------------------------*/ - -SFINLINE __m128i cmpeq_i32(const __m128i& x, const __m128i& y) -{ - return _mm_cmpeq_epi32(x, y); -} - SFINLINE __m256i cmpeq_i32(const __m256i& x, const __m256i& y) { return _mm256_cmpeq_epi32(x, y); } -SFINLINE __m128 cmplt_ps(const __m128& x, const __m128& y) -{ - return _mm_cmplt_ps(x, y); -} - SFINLINE __m256 cmplt_ps(const __m256& x, const __m256& y) { return _mm256_cmp_ps(x, y, _CMP_LT_OQ); } -SFINLINE __m128 cmpge_ps(const __m128& x, const __m128& y) -{ - return _mm_cmpge_ps(x, y); -} - SFINLINE __m256 cmpge_ps(const __m256& x, const __m256& y) { return _mm256_cmp_ps(x, y, _CMP_GE_OQ); } -SFINLINE __m128 cmpord_ps(const __m128& x, const __m128& y) +SFINLINE __m256 cmpord_ps(const __m256& x, const __m256& y) { - return _mm_cmpord_ps(x, y); + return _mm256_cmp_ps(x, y, _CMP_ORD_Q); } -SFINLINE __m256 cmpord_ps(const __m256& x, const __m256& y) +SFINLINE __m256 blendv(const __m256&x, const __m256& y, const __m256& mask) { - return _mm256_cmp_ps(x, y, _CMP_ORD_Q); + return _mm256_blendv_ps(x, y, mask); } +#endif // __AVX2__ -/*----------------misc-----------------------------*/ -SFINLINE __m128 blendv(const __m128& x, const __m128& y, const __m128& mask) + +template +SFINLINE T abs(const T& val) { - return or_ps(and_ps(mask, y), andnot_ps(mask, x)); + return max(val, sub(zero(), val)); } -SFINLINE __m256 blendv(const __m256&x, const __m256& y, const __m256& mask) +template +SFINLINE T rcp_hq(const T& x) { - return _mm256_blendv_ps(x, y, mask); + T rcp = rcp_ps(x); + T t = mul(mul(x, rcp), rcp); + rcp = add(rcp, rcp); + return sub(rcp, t); } - #endif diff --git a/avisynth/src/tcannymod.cpp b/avisynth/src/tcannymod.cpp index 3e1a8f1..d3afd6a 100644 --- a/avisynth/src/tcannymod.cpp +++ b/avisynth/src/tcannymod.cpp @@ -28,39 +28,29 @@ #include #include #include +#include #include "tcannymod.h" #include "gaussian_blur.h" #include "edge_detection.h" #include "write_frame.h" -static gaussian_blur_t get_gaussian_blur(arch_t arch) -{ - if (arch == HAS_SSE2) { - return gaussian_blur<__m128, GB_MAX_LENGTH, HAS_SSE2>; - } - if (arch == HAS_SSE41) { - return gaussian_blur<__m128, GB_MAX_LENGTH, HAS_SSE41>; - } - return gaussian_blur<__m256, GB_MAX_LENGTH, HAS_AVX2>; -} - - static edge_detection_t -get_edge_detection(bool use_sobel, bool calc_dir, arch_t arch) +get_edge_detection(bool use_sobel, bool calc_dir, arch_t arch) noexcept { using std::make_tuple; std::map, edge_detection_t> func; func[make_tuple(false, false, HAS_SSE2)] = standard<__m128, __m128i, false>; - func[make_tuple(false, false, HAS_AVX2)] = standard<__m256, __m256i, false>; func[make_tuple(false, true, HAS_SSE2)] = standard<__m128, __m128i, true>; - func[make_tuple(false, true, HAS_AVX2)] = standard<__m256, __m256i, true>; - func[make_tuple(true, false, HAS_SSE2)] = sobel<__m128, __m128i, false>; - func[make_tuple(true, false, HAS_AVX2)] = sobel<__m256, __m256i, false>; func[make_tuple(true, true, HAS_SSE2)] = sobel<__m128, __m128i, true>; +#if defined(__AVX2__) + func[make_tuple(false, false, HAS_AVX2)] = standard<__m256, __m256i, false>; + func[make_tuple(false, true, HAS_AVX2)] = standard<__m256, __m256i, true>; + func[make_tuple(true, false, HAS_AVX2)] = sobel<__m256, __m256i, false>; func[make_tuple(true, true, HAS_AVX2)] = sobel<__m256, __m256i, true>; +#endif arch_t a = arch == HAS_SSE41 ? HAS_SSE2 : arch; @@ -68,42 +58,52 @@ get_edge_detection(bool use_sobel, bool calc_dir, arch_t arch) } -static non_max_suppress_t -get_non_max_suppress(arch_t arch) + +static write_gradient_mask_t +get_write_gradient_mask(bool scale, arch_t arch) noexcept { - if (arch < HAS_AVX2) { - return non_max_suppress<__m128, __m128i>; +#if defined(__AVX2__) + if (arch == HAS_AVX2) { + return scale ? write_gradient_mask<__m256, __m256i, true> + : write_gradient_mask<__m256, __m256i, false>; } - return non_max_suppress<__m256, __m256i>; +#endif + return scale ? write_gradient_mask<__m128, __m128i, true> + : write_gradient_mask<__m128, __m128i, false>; + } -static write_gradient_mask_t get_write_gradient_mask(bool scale, arch_t arch) +static inline void validate(bool cond, const char* msg) { - if (arch < HAS_AVX2) { - return scale ? write_gradient_mask<__m128, __m128i, true> - : write_gradient_mask<__m128, __m128i, false>; - } - return scale ? write_gradient_mask<__m256, __m256i, true> - : write_gradient_mask<__m256, __m256i, false>; + if (cond) + throw std::runtime_error(msg); } -static write_gradient_direction_t get_write_gradient_direction(arch_t arch) +template +static inline T +my_malloc(size_t size, size_t align, bool is_plus, AvsAllocType at, + ise_t* env) noexcept { - if (arch < HAS_AVX2) { - return write_gradient_direction<__m128i>; + void* p; + if (is_plus) { + p = static_cast(env)->Allocate(size, align, at); + } else { + p = _aligned_malloc(size, align); } - return write_gradient_direction<__m256i>; + return reinterpret_cast(p); } -static write_edge_direction_t get_write_edge_direction(arch_t arch) +static inline void my_free(void* p, bool is_plus, ise_t* env) noexcept { - if (arch < HAS_AVX2) { - return write_edge_direction<__m128i>; + if (is_plus) { + static_cast(env)->Free(p); + } else { + _aligned_free(p); } - return write_edge_direction<__m256i>; + p = nullptr; } @@ -112,10 +112,7 @@ set_gb_kernel(float sigma, int& radius, float* kernel) { radius = std::max(static_cast(sigma * 3.0f + 0.5f), 1); int length = radius * 2 + 1; - if (length > GB_MAX_LENGTH) { - radius = 0; - return; - } + validate(length > GB_MAX_LENGTH, "sigma is too large."); float sum = 0.0f; for (int i = -radius; i <= radius; i++) { @@ -127,44 +124,42 @@ set_gb_kernel(float sigma, int& radius, float* kernel) } -static arch_t get_arch(int opt) +static arch_t get_arch(int opt, bool is_plus) noexcept { if (opt == 0 || !has_sse41()) { return HAS_SSE2; } +#if !defined(__AVX2__) + return HAS_SSE41; +#else if (opt == 1 || !has_avx2()) { return HAS_SSE41; } return HAS_AVX2; +#endif } -static inline void validate(bool cond, const char* msg) -{ - if (cond) - throw msg; -} - TCannyM::TCannyM(PClip ch, int m, float sigma, float tmin, float tmax, int c, - bool sobel, float s, int opt, const char* n) : + bool sobel, float s, int opt, const char* n, bool is_plus) : GenericVideoFilter(ch), mode(m), gbRadius(0), th_min(tmin), th_max(tmax), - chroma(c), name(n), scale(s) + chroma(c), name(n), scale(s), isPlus(is_plus) { validate(!vi.IsPlanar(), "Planar format only."); numPlanes = (vi.IsY8() || chroma == 0) ? 1 : 3; - arch_t arch = get_arch(opt); + arch_t arch = get_arch(opt, isPlus); + align = (arch < HAS_AVX2) ? 16 : 32; if (sigma > 0.0f) { set_gb_kernel(sigma, gbRadius, gbKernel); - validate(gbRadius == 0, "sigma is too large."); size_t length = (gbRadius * 2 + 1); - horizontalKernel = static_cast( - _aligned_malloc(length * align, align)); - validate(!horizontalKernel, "failed to prepare kernel."); + horizontalKernel = my_malloc( + length * align, align, false, AVS_NORMAL_ALLOC, nullptr); + validate(!horizontalKernel, "failed to allocate memory."); size_t step = align / sizeof(float); for (size_t i = 0; i < length; ++i) { @@ -172,7 +167,6 @@ TCannyM::TCannyM(PClip ch, int m, float sigma, float tmin, float tmax, int c, horizontalKernel[i * step + j] = gbKernel[i]; } } - } blurPitch = ((align + (vi.width + 1) * sizeof(float)) + align - 1) & ~(align - 1); @@ -190,43 +184,60 @@ TCannyM::TCannyM(PClip ch, int m, float sigma, float tmin, float tmax, int c, emaskPitch /= sizeof(float); dirPitch /= sizeof(int32_t); - gaussianBlur = get_gaussian_blur(arch); + switch (arch) { +#if defined(__AVX2__) + case HAS_AVX2: + gaussianBlur = gaussian_blur<__m256, GB_MAX_LENGTH, HAS_AVX2>; + nonMaximumSuppression = non_max_suppress<__m256, __m256i>; + writeGradientDirection = write_gradient_direction<__m256i>; + writeEdgeDirection = write_edge_direction<__m256i>; + break; +#endif + case HAS_SSE41: + gaussianBlur = gaussian_blur<__m128, GB_MAX_LENGTH, HAS_SSE41>; + nonMaximumSuppression = non_max_suppress<__m128, __m128i>; + writeGradientDirection = write_gradient_direction<__m128i>; + writeEdgeDirection = write_edge_direction<__m128i>; + break; + default: + gaussianBlur = gaussian_blur<__m128, GB_MAX_LENGTH, HAS_SSE2>; + nonMaximumSuppression = non_max_suppress<__m128, __m128i>; + writeGradientDirection = write_gradient_direction<__m128i>; + writeEdgeDirection = write_edge_direction<__m128i>; + } edgeDetection = get_edge_detection(sobel, (mode != 1 && mode != 4), arch); - nonMaximumSuppression = get_non_max_suppress(arch); - writeBluredFrame = get_write_gradient_mask(false, arch); writeGradientMask = get_write_gradient_mask(scale != 1.0f, arch); - - writeGradientDirection = get_write_gradient_direction(arch); - - writeEdgeDirection = get_write_edge_direction(arch); } TCannyM::~TCannyM() { - _aligned_free(horizontalKernel); - horizontalKernel = nullptr; + my_free(horizontalKernel, false, nullptr); } class Buffers { - uint8_t* orig; + ise_t* env; + bool isPlus; public: + uint8_t* orig; float* buffp; float* blurp; float* emaskp; int32_t* dirp; uint8_t* hystp; Buffers(size_t bufsize, size_t blsize, size_t emsize, size_t dirsize, - size_t hystsize, size_t align) + size_t hystsize, size_t align, bool ip, ise_t* e) : + env(e), isPlus(ip) { size_t total_size = bufsize + blsize + emsize + dirsize + hystsize; - orig = static_cast(_aligned_malloc(total_size, align)); - validate(!orig, "failed to allocate buffers."); + orig = my_malloc( + total_size, align, isPlus, AVS_POOLED_ALLOC, env); + buffp = reinterpret_cast(orig) + 8; blurp = reinterpret_cast(orig + bufsize + align); emaskp = reinterpret_cast(orig + bufsize + blsize); @@ -235,8 +246,7 @@ class Buffers { }; ~Buffers() { - _aligned_free(orig); - orig = nullptr; + my_free(orig, isPlus, env); }; }; @@ -246,8 +256,11 @@ PVideoFrame __stdcall TCannyM::GetFrame(int n, ise_t* env) PVideoFrame src = child->GetFrame(n, env); PVideoFrame dst = env->NewVideoFrame(vi, align); -try { - auto b = Buffers(buffSize, blurSize, emaskSize, dirSize, hystSize, align); + auto b = Buffers(buffSize, blurSize, emaskSize, dirSize, hystSize, align, + isPlus, env); + if (b.orig == nullptr) { + env->ThrowError("%s: failed to allocate buffer.", name); + } const int planes[] = { PLANAR_Y, PLANAR_U, PLANAR_V }; @@ -270,12 +283,6 @@ try { continue; } - if ((reinterpret_cast(srcp) & (align - 1)) || - (src_pitch | dst_pitch) & (align - 1)) { - b.~Buffers(); - throw "Invalid memory alignment."; - } - gaussianBlur(gbRadius, gbKernel, horizontalKernel, b.buffp, b.blurp, blurPitch, srcp, src_pitch, width, height); if (mode == 4) { @@ -312,9 +319,6 @@ try { env->BitBlt(dstp, dst_pitch, b.hystp, hystPitch, width, height); } -} catch (const char* e) { - env->ThrowError("%s: %s", name, e); -} return dst; } @@ -330,82 +334,87 @@ static float calc_scale(double gmmax) static AVSValue __cdecl create_tcannymod(AVSValue args, void* user_data, ise_t* env) { - TCannyM* f; -try { - validate(!has_sse2(), "This filter requires SSE2."); - - int mode = args[1].AsInt(0); - validate(mode < 0 || mode > 4, "mode must be between 0 and 4."); - - float sigma = static_cast(args[2].AsFloat(1.5f)); - validate(sigma < 0.0f, "sigma must be greater than zero."); - - float tmin = static_cast(args[4].AsFloat(0.1f)); - validate(tmin < 0.0f, "t_l must be greater than zero."); - - float tmax = static_cast(args[3].AsFloat(8.0f)); - validate(tmax < tmin, "t_h must be greater than t_l."); - - int chroma = args[6].AsInt(0); - validate(chroma < 0 || chroma > 4, "chroma must be set to 0, 1, 2, 3 or 4."); - - float scale = calc_scale(args[7].AsFloat(255.0)); - - f = new TCannyM(args[0].AsClip(), mode, sigma, tmin, tmax, chroma, - args[5].AsBool(false), scale, args[8].AsInt(HAS_AVX2), - "TCannyMod"); -} catch (const char* e) { - env->ThrowError("TCannyMod: %s", e); -} - return f; + try { + validate(!has_sse2(), "This filter requires SSE2."); + + int mode = args[1].AsInt(0); + validate(mode < 0 || mode > 4, "mode must be between 0 and 4."); + + float sigma = static_cast(args[2].AsFloat(1.5f)); + validate(sigma < 0.0f, "sigma must be greater than zero."); + + float tmin = static_cast(args[4].AsFloat(0.1f)); + validate(tmin < 0.0f, "t_l must be greater than zero."); + + float tmax = static_cast(args[3].AsFloat(8.0f)); + validate(tmax < tmin, "t_h must be greater than t_l."); + + int chroma = args[6].AsInt(0); + validate(chroma < 0 || chroma > 4, + "chroma must be set to 0, 1, 2, 3 or 4."); + + float scale = calc_scale(args[7].AsFloat(255.0)); + + bool is_plus = user_data != nullptr; + + return new TCannyM(args[0].AsClip(), mode, sigma, tmin, tmax, chroma, + args[5].AsBool(false), scale, args[8].AsInt(HAS_AVX2), + "TCannyMod", is_plus); + } catch (std::runtime_error& e) { + env->ThrowError("TCannyMod: %s", e.what()); + } + return 0; } static AVSValue __cdecl create_gblur(AVSValue args, void* user_data, ise_t* env) { - TCannyM* f; -try { - validate(!has_sse2(), "This filter requires SSE2."); - - float sigma = (float)args[1].AsFloat(0.5); - validate(sigma < 0.0f, "sigma must be greater than zero."); - - int chroma = args[2].AsInt(1); - validate(chroma < 0 || chroma > 4, "chroma must be set to 0, 1, 2, 3 or 4."); - - f = new TCannyM(args[0].AsClip(), 4, sigma, 1.0f, 1.0f, chroma, false, - 1.0f, args[3].AsInt(HAS_AVX2), "GBlur"); -} catch (const char* e) { - env->ThrowError("GBlur: %s", e); -} - return f; + try { + validate(!has_sse2(), "This filter requires SSE2."); + + float sigma = (float)args[1].AsFloat(0.5); + validate(sigma < 0.0f, "sigma must be greater than zero."); + + int chroma = args[2].AsInt(1); + validate(chroma < 0 || chroma > 4, + "chroma must be set to 0, 1, 2, 3 or 4."); + + bool is_plus = user_data != nullptr; + + return new TCannyM(args[0].AsClip(), 4, sigma, 1.0f, 1.0f, chroma, false, + 1.0f, args[3].AsInt(HAS_AVX2), "GBlur", is_plus); + } catch (std::runtime_error& e) { + env->ThrowError("GBlur: %s", e.what()); + } + return 0; } static AVSValue __cdecl create_emask(AVSValue args, void* user_data, ise_t* env) { - TCannyM* f; -try { - validate(!has_sse2(), "This filter requires SSE2."); - - float sigma = (float)args[1].AsFloat(1.5); - validate(sigma < 0.0f, "sigma must be greater than zero."); - - int chroma = args[2].AsInt(0); - validate(chroma < 0 || chroma > 4, - "chroma must be set to 0, 1, 2, 3 or 4."); - - float scale = calc_scale(args[2].AsFloat(50.0)); - - f = new TCannyM(args[0].AsClip(), 1, sigma, 1.0f, 1.0f, chroma, - args[5].AsBool(false), scale, args[3].AsInt(HAS_AVX2), - "EMask"); -} catch (const char* e) { - env->ThrowError("EMask: %s", e); -} - return f; + try { + validate(!has_sse2(), "This filter requires SSE2."); + + float sigma = (float)args[1].AsFloat(1.5); + validate(sigma < 0.0f, "sigma must be greater than zero."); + + int chroma = args[2].AsInt(0); + validate(chroma < 0 || chroma > 4, + "chroma must be set to 0, 1, 2, 3 or 4."); + + float scale = calc_scale(args[2].AsFloat(50.0)); + + bool is_plus = user_data != nullptr; + + return new TCannyM(args[0].AsClip(), 1, sigma, 1.0f, 1.0f, chroma, + args[5].AsBool(false), scale, args[3].AsInt(HAS_AVX2), + "EMask", is_plus); + } catch (std::runtime_error& e) { + env->ThrowError("EMask: %s", e.what()); + } + return 0; } @@ -416,6 +425,9 @@ extern "C" __declspec(dllexport) const char * __stdcall AvisynthPluginInit3(ise_t* env, const AVS_Linkage* const vectors) { AVS_linkage = vectors; + + void* is_plus = env->FunctionExists("SetFilterMTMode") ? "true" : nullptr; + env->AddFunction("TCannyMod", /*0*/ "c" /*1*/ "[mode]i" @@ -425,10 +437,20 @@ AvisynthPluginInit3(ise_t* env, const AVS_Linkage* const vectors) /*5*/ "[sobel]b" /*6*/ "[chroma]i" /*7*/ "[gmmax]f" - /*8*/ "[opt]i", create_tcannymod, nullptr); + /*8*/ "[opt]i", create_tcannymod, is_plus); + env->AddFunction("GBlur", "c[sigma]f[chroma]i[opt]i", - create_gblur, nullptr); + create_gblur, is_plus); env->AddFunction("EMask", "c[sigma]f[gmmax]f[chroma]i[sobel]b[opt]i", - create_emask, nullptr); - return "Canny edge detection filter for Avisynth2.6 ver." TCANNY_M_VERSION; + create_emask, is_plus); + + if (is_plus != nullptr) { + auto env2 = static_cast(env); + env2->SetFilterMTMode("TCannyMod", MT_NICE_FILTER, true); + env2->SetFilterMTMode("GBlur", MT_NICE_FILTER, true); + env2->SetFilterMTMode("EMask", MT_NICE_FILTER, true); + } + + return "Canny edge detection filter for Avisynth2.6/Avisynth+ ver." + TCANNY_M_VERSION; } diff --git a/avisynth/src/tcannymod.h b/avisynth/src/tcannymod.h index 01dfebb..8e57de1 100644 --- a/avisynth/src/tcannymod.h +++ b/avisynth/src/tcannymod.h @@ -34,7 +34,7 @@ #include #include -#define TCANNY_M_VERSION "1.1.1" +#define TCANNY_M_VERSION "1.2.0" constexpr size_t GB_MAX_LENGTH = 17; @@ -80,6 +80,7 @@ class TCannyM : public GenericVideoFilter { const char* name; int numPlanes; size_t align; + bool isPlus; int mode; int chroma; float th_min; @@ -109,13 +110,14 @@ class TCannyM : public GenericVideoFilter { public: TCannyM(PClip child, int mode, float sigma, float th_min, float th_max, - int chroma, bool sobel, float scale, int opt, const char* name); + int chroma, bool sobel, float scale, int opt, const char* name, + bool is_plus); ~TCannyM(); PVideoFrame __stdcall GetFrame(int n, ise_t* env); }; -extern int has_sse2(); -extern int has_sse41(); -extern int has_avx(); -extern int has_avx2(); +extern bool has_sse2(); +extern bool has_sse41(); +extern bool has_avx(); +extern bool has_avx2(); #endif diff --git a/avisynth/src/write_frame.h b/avisynth/src/write_frame.h index 74fd995..30f01b1 100644 --- a/avisynth/src/write_frame.h +++ b/avisynth/src/write_frame.h @@ -34,7 +34,7 @@ template static void __stdcall write_gradient_mask(const float* srcp, uint8_t* dstp, const size_t width, const size_t height, const size_t dst_pitch, - const size_t src_pitch, const float scale) + const size_t src_pitch, const float scale) noexcept { constexpr size_t align = sizeof(Vi); constexpr size_t step = align / sizeof(float); @@ -59,7 +59,7 @@ write_gradient_mask(const float* srcp, uint8_t* dstp, const size_t width, Vi x3 = cvtps_i32(f3); Vi ret = cvti32_u8(x0, x1, x2, x3); - stream(dstp + x, ret); + stream(dstp + x, ret); } srcp += src_pitch; dstp += dst_pitch; @@ -71,7 +71,7 @@ template static void __stdcall write_gradient_direction(const int32_t* dirp, uint8_t* dstp, const size_t dir_pitch, const size_t dst_pitch, - const size_t width, const size_t height) + const size_t width, const size_t height) noexcept { constexpr size_t align = sizeof(Vi); constexpr size_t step = align / sizeof(int32_t); @@ -83,7 +83,7 @@ write_gradient_direction(const int32_t* dirp, uint8_t* dstp, Vi x2 = load(dirp + x + step * 2); Vi x3 = load(dirp + x + step * 3); Vi dst = cvti32_u8(x0, x1, x2, x3); - stream(dstp + x, dst); + stream(dstp + x, dst); } dirp += dir_pitch; dstp += dst_pitch; @@ -96,7 +96,7 @@ void __stdcall write_edge_direction(const int32_t* dirp, const uint8_t* hystp, uint8_t* dstp, const size_t dir_pitch, const size_t hyst_pitch, const size_t dst_pitch, const size_t width, - const size_t height) + const size_t height) noexcept { constexpr size_t align = sizeof(Vi); constexpr size_t step = align / sizeof(int32_t); @@ -110,7 +110,7 @@ write_edge_direction(const int32_t* dirp, const uint8_t* hystp, uint8_t* dstp, const Vi dir = cvti32_u8(x0, x1, x2, x3); const Vi hyst = load(hystp + x); const Vi dst = and_si(dir, hyst); - stream(dstp + x, dst); + stream(dstp + x, dst); } dirp += dir_pitch; hystp += hyst_pitch;