diff --git a/.gitignore b/.gitignore
index 8dda599..82d42fa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,16 +3,18 @@ avisynth/src/Debug/*
avisynth/src/x64/*
avisynth/src/.vs/*
avisynth/archive/*
+vapoursynth/*
*.opensdf
*.opendb
*.psess
*.sdf
*.suo
*.sln
-*.vcxproj.filters
-*.vcxproj.user
+*.filters
+*.user
*.vspx
*.vsp
*.dll
*.avs
*.exe
+*.db
diff --git a/avisynth/LISENCE.GPLv2 b/avisynth/LICENSE.GPLv2
similarity index 100%
rename from avisynth/LISENCE.GPLv2
rename to avisynth/LICENSE.GPLv2
diff --git a/avisynth/src/TCannyMod.vcxproj b/avisynth/src/TCannyMod.vcxproj
index 3353dd6..567db3d 100644
--- a/avisynth/src/TCannyMod.vcxproj
+++ b/avisynth/src/TCannyMod.vcxproj
@@ -71,7 +71,7 @@
true
- true
+ false
false
@@ -103,12 +103,14 @@
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
true
<EnableEnhancedInstructionSet>StreamingSIMDExtensions2</EnableEnhancedInstructionSet>
- <FloatingPointModel>Precise</FloatingPointModel>
+ <FloatingPointModel>Fast</FloatingPointModel>
<InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
+ true
+ <Optimization>MaxSpeed</Optimization>
<TargetMachine>MachineX86</TargetMachine>
- <GenerateDebugInformation>Debug</GenerateDebugInformation>
+ <GenerateDebugInformation>No</GenerateDebugInformation>
<SubSystem>Windows</SubSystem>
true
true
@@ -120,14 +122,14 @@
<InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
true
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
- <EnableEnhancedInstructionSet>AdvancedVectorExtensions</EnableEnhancedInstructionSet>
+ <EnableEnhancedInstructionSet>NotSet</EnableEnhancedInstructionSet>
true
<FloatingPointModel>Fast</FloatingPointModel>
- <Optimization>Full</Optimization>
+ <Optimization>MaxSpeed</Optimization>
true
- <GenerateDebugInformation>Debug</GenerateDebugInformation>
+ <GenerateDebugInformation>No</GenerateDebugInformation>
<LinkTimeCodeGeneration>UseLinkTimeCodeGeneration</LinkTimeCodeGeneration>
diff --git a/avisynth/src/cpu_check.cpp b/avisynth/src/cpu_check.cpp
index f52d43e..e8855f1 100644
--- a/avisynth/src/cpu_check.cpp
+++ b/avisynth/src/cpu_check.cpp
@@ -71,12 +71,12 @@ static inline void get_cpuid2(int *array, int info_type, int ecx)
#endif
}
-static inline int is_bit_set(int bitfield, int bit)
+static inline int is_bit_set(int bitfield, int bit) noexcept
{
return bitfield & (1 << bit);
}
-static uint32_t get_simd_support_info(void)
+static uint32_t get_simd_support_info(void) noexcept
{
uint32_t ret = 0;
int regs[4] = {0};
@@ -157,27 +157,27 @@ static uint32_t get_simd_support_info(void)
return ret;
}
-int has_sse2()
+bool has_sse2() noexcept
{
- return !!(get_simd_support_info() & CPU_SSE2_SUPPORT);
+ return (get_simd_support_info() & CPU_SSE2_SUPPORT) != 0;
}
-int has_ssse3()
+bool has_ssse3() noexcept
{
- return !!(get_simd_support_info() & CPU_SSSE3_SUPPORT);
+ return (get_simd_support_info() & CPU_SSSE3_SUPPORT) != 0;
}
-int has_sse41()
+bool has_sse41() noexcept
{
- return !!(get_simd_support_info() & CPU_SSE4_1_SUPPORT);
+ return (get_simd_support_info() & CPU_SSE4_1_SUPPORT) != 0;
}
-int has_avx()
+bool has_avx() noexcept
{
- return !!(get_simd_support_info() & CPU_AVX_SUPPORT);
+ return (get_simd_support_info() & CPU_AVX_SUPPORT) != 0;
}
-int has_avx2()
+bool has_avx2() noexcept
{
- return !!(get_simd_support_info() & CPU_AVX2_SUPPORT);
+ return (get_simd_support_info() & CPU_AVX2_SUPPORT) != 0;
}
diff --git a/avisynth/src/edge_detection.h b/avisynth/src/edge_detection.h
index eccb577..69b7bb2 100644
--- a/avisynth/src/edge_detection.h
+++ b/avisynth/src/edge_detection.h
@@ -31,7 +31,7 @@
#include "simd.h"
-static const float* get_tangent(int idx)
+static const float* get_tangent(int idx) noexcept
{
alignas(32) static const float tangent[32] = {
0.414213538169860839843750f, 0.414213538169860839843750f, // tan(pi/8)
@@ -60,7 +60,7 @@ template
static void __stdcall
standard(float* blurp, const size_t blur_pitch, float* emaskp,
const size_t emask_pitch, int32_t* dirp, const size_t dir_pitch,
- const size_t width, const size_t height)
+ const size_t width, const size_t height) noexcept
{
constexpr size_t step = sizeof(Vf) / sizeof(float);
@@ -110,13 +110,13 @@ standard(float* blurp, const size_t blur_pitch, float* emaskp,
Vi d3 = castps_si(and_ps(cmpge_ps(tan, t1125), cmplt_ps(tan, t1575)));
d3 = srli_i32(d3, 24);
d0 = or_si(or_si(d0, d1), or_si(d2, d3));
- stream<Vi>(dirp + x, d0);
+ stream(dirp + x, d0);
}
Vf magnitude = mul(gx, gx);
magnitude = madd(gy, gy, magnitude);
magnitude = sqrt(magnitude);
- stream<Vf>(emaskp + x, magnitude);
+ stream(emaskp + x, magnitude);
}
emaskp += emask_pitch;
dirp += dir_pitch;
@@ -138,7 +138,7 @@ template
static void __stdcall
sobel(float* blurp, const size_t blur_pitch, float* emaskp,
const size_t emask_pitch, int32_t* dirp, const size_t dir_pitch,
- const size_t width, const size_t height)
+ const size_t width, const size_t height) noexcept
{
constexpr size_t step = sizeof(Vf) / sizeof(float);
@@ -197,13 +197,13 @@ sobel(float* blurp, const size_t blur_pitch, float* emaskp,
Vi d3 = castps_si(and_ps(cmpge_ps(tan, t1125), cmplt_ps(tan, t1575)));
d3 = srli_i32(d3, 24);
d0 = or_si(or_si(d0, d1), or_si(d2, d3));
- stream<Vi>(dirp + x, d0);
+ stream(dirp + x, d0);
}
Vf magnitude = mul(gx, gx);
magnitude = madd(gy, gy, magnitude);
magnitude = sqrt(magnitude);
- stream<Vf>(emaskp + x, magnitude);
+ stream(emaskp + x, magnitude);
}
emaskp += emask_pitch;
dirp += dir_pitch;
@@ -218,7 +218,8 @@ template
static void __stdcall
non_max_suppress(const float* emaskp, const size_t em_pitch,
const int32_t* dirp, const size_t dir_pitch, float* blurp,
- const size_t blur_pitch, const size_t width, const size_t height)
+ const size_t blur_pitch, const size_t width,
+ const size_t height) noexcept
{
constexpr size_t step = sizeof(Vf) / sizeof(float);
@@ -275,7 +276,7 @@ non_max_suppress(const float* emaskp, const size_t em_pitch,
void __stdcall
hysteresis(uint8_t* hystp, const size_t hpitch, float* blurp,
const size_t bpitch, const int width, const int height,
- const float tmin, const float tmax);
+ const float tmin, const float tmax) noexcept;
#endif
diff --git a/avisynth/src/gaussian_blur.h b/avisynth/src/gaussian_blur.h
index 46e04a8..3504933 100644
--- a/avisynth/src/gaussian_blur.h
+++ b/avisynth/src/gaussian_blur.h
@@ -32,14 +32,14 @@
template <typename Vf, arch_t ARCH>
static void __stdcall
convert_to_float(const size_t width, const size_t height, const uint8_t* srcp,
- const int src_pitch, float* blurp, const size_t blur_pitch)
+ const int src_pitch, float* blurp, const size_t blur_pitch) noexcept
{
constexpr size_t step = sizeof(Vf) / sizeof(float);
for (size_t y = 0; y < height; y++) {
for (size_t x = 0; x < width; x += step) {
Vf val = cvtu8_ps<Vf, ARCH>(srcp + x);
- stream<Vf>(blurp + x, val);
+ stream(blurp + x, val);
}
srcp += src_pitch;
blurp += blur_pitch;
@@ -50,7 +50,7 @@ convert_to_float(const size_t width, const size_t height, const uint8_t* srcp,
template <typename Vf>
static void
horizontal_blur(const float* hkernel, float* buffp, const int radius,
- const size_t width, float* blurp)
+ const size_t width, float* blurp) noexcept
{
constexpr size_t step = sizeof(Vf) / sizeof(float);
const int length = radius * 2 + 1;
@@ -67,7 +67,7 @@ horizontal_blur(const float* hkernel, float* buffp, const int radius,
Vf val = loadu<Vf>(buffp + x + i);
sum = madd(k, val, sum);
}
- stream<Vf>(blurp + x, sum);
+ stream(blurp + x, sum);
}
}
@@ -77,7 +77,7 @@ static void __stdcall
gaussian_blur(const int radius, const float* kernel, const float* hkernel,
float* buffp, float* blurp, const size_t blur_pitch,
const uint8_t* srcp, const size_t src_pitch, const size_t width,
- const size_t height)
+ const size_t height) noexcept
{
if (radius == 0) {
convert_to_float<Vf, ARCH>(
@@ -106,7 +106,7 @@ gaussian_blur(const int radius, const float* kernel, const float* hkernel,
sum = madd(k, input, sum);
}
- store<Vf>(buffp + x, sum);
+ store(buffp + x, sum);
}
horizontal_blur<Vf>(hkernel, buffp, radius, width, blurp);
blurp += blur_pitch;
diff --git a/avisynth/src/hysteresis.cpp b/avisynth/src/hysteresis.cpp
index 52c37ad..ca5ae2d 100644
--- a/avisynth/src/hysteresis.cpp
+++ b/avisynth/src/hysteresis.cpp
@@ -41,7 +41,7 @@ struct Pos {
static __forceinline void
hystfunc(const int32_t x, const int32_t y, float* edge, uint8_t* hyst,
const size_t epitch, const size_t hpitch, const float th,
- std::vector<Pos>& stack)
+ std::vector<Pos>& stack) noexcept
{
if (!hyst[x + y * hpitch] && edge[x + y * epitch] > th) {
edge[x + y * epitch] = FLT_MAX;
@@ -54,10 +54,11 @@ hystfunc(const int32_t x, const int32_t y, float* edge, uint8_t* hyst,
void __stdcall
hysteresis(uint8_t* hystp, const size_t hpitch, float* blurp,
const size_t bpitch, const int width, const int height,
- const float tmin, const float tmax)
+ const float tmin, const float tmax) noexcept
{
memset(hystp, 0, hpitch * height);
std::vector<Pos> stack;
+ stack.reserve(512);
for (int32_t y = 0; y < height; ++y) {
for (int32_t x = 0; x < width; ++x) {
diff --git a/avisynth/src/simd.h b/avisynth/src/simd.h
index 64352d3..4f5fc44 100644
--- a/avisynth/src/simd.h
+++ b/avisynth/src/simd.h
@@ -26,8 +26,13 @@
#ifndef TCANNY_MOD_SIMD_H
#define TCANNY_MOD_SIMD_H
+
#include <emmintrin.h>
+#if defined(__AVX2__)
#include <immintrin.h>
+#else
+#include <smmintrin.h>
+#endif
#define SFINLINE static __forceinline
@@ -42,6 +47,10 @@ enum arch_t {
template <typename T>
T zero();
+template <typename T>
+T set1_ps(const float& x);
+
+
template <>
SFINLINE __m128i zero<__m128i>()
{
@@ -54,33 +63,12 @@ SFINLINE __m128 zero<__m128>()
return _mm_setzero_ps();
}
-template <>
-SFINLINE __m256i zero<__m256i>()
-{
- return _mm256_setzero_si256();
-}
-
-template <>
-SFINLINE __m256 zero<__m256>()
-{
- return _mm256_setzero_ps();
-}
-
-template <typename T>
-T set1_ps(const float& x);
-
template <>
SFINLINE __m128 set1_ps<__m128>(const float& x)
{
return _mm_set_ps1(x);
}
-template <>
-SFINLINE __m256 set1_ps<__m256>(const float& x)
-{
- return _mm256_set1_ps(x);
-}
-
template <typename T>
T set1_i8(const int8_t&);
@@ -90,17 +78,19 @@ SFINLINE __m128i set1_i8<__m128i>(const int8_t& x)
return _mm_set1_epi8(x);
}
-template <>
-SFINLINE __m256i set1_i8<__m256i>(const int8_t& x)
-{
- return _mm256_set1_epi8(x);
-}
-
/*---------------load--------------------*/
template <typename T>
T load(const float* p);
+template <typename T>
+T load(const uint8_t*);
+
+template <typename T>
+T load(const int32_t*);
+
+
+
template <>
SFINLINE __m128 load<__m128>(const float* p)
{
@@ -108,357 +98,418 @@ SFINLINE __m128 load<__m128>(const float* p)
}
template <>
-SFINLINE __m256 load<__m256>(const float* p)
+SFINLINE __m128i load(const uint8_t* p)
{
- return _mm256_load_ps(p);
+ return _mm_load_si128(reinterpret_cast<const __m128i*>(p));
}
-template <typename T>
-T load(const uint8_t*);
-
template <>
-SFINLINE __m128i load(const uint8_t* p)
+SFINLINE __m128i load(const int32_t* p)
{
return _mm_load_si128(reinterpret_cast<const __m128i*>(p));
}
+
+
+template <typename T>
+T loadu(const float* p);
+
+template <typename T>
+T loadu(const uint8_t* p);
+
+template <typename T>
+T loadu(const int32_t* p);
+
+
+
template <>
-SFINLINE __m256i load(const uint8_t* p)
+SFINLINE __m128 loadu(const float* p)
{
- return _mm256_load_si256(reinterpret_cast<const __m256i*>(p));
+ return _mm_loadu_ps(p);
}
-template <typename T>
-T load(const int32_t*);
-
template <>
-SFINLINE __m128i load(const int32_t* p)
+SFINLINE __m128i loadu(const uint8_t* p)
{
- return _mm_load_si128(reinterpret_cast<const __m128i*>(p));
+ return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
}
template <>
-SFINLINE __m256i load(const int32_t* p)
+SFINLINE __m128i loadu(const int32_t* p)
{
- return _mm256_load_si256(reinterpret_cast<const __m256i*>(p));
+ return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
}
-template <typename T>
-T loadu(const float* p);
-template <>
-SFINLINE __m128 loadu<__m128>(const float* p)
+/*-------------store---------------------*/
+
+SFINLINE void store(float* p, const __m128& x)
{
- return _mm_loadu_ps(p);
+ _mm_store_ps(p, x);
}
-template <>
-SFINLINE __m256 loadu<__m256>(const float* p)
+SFINLINE void storeu(float* p, const __m128& x)
{
- return _mm256_loadu_ps(p);
+ _mm_storeu_ps(p, x);
}
-template <typename T>
-T loadu(const uint8_t* p);
+SFINLINE void stream(float* p, const __m128& x)
+{
+ _mm_stream_ps(p, x);
+}
-template <>
-SFINLINE __m128i loadu<__m128i>(const uint8_t* p)
+SFINLINE void stream(uint8_t* p, const __m128i& x)
{
- return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
+ return _mm_stream_si128(reinterpret_cast<__m128i*>(p), x);
}
-template <>
-SFINLINE __m256i loadu<__m256i>(const uint8_t* p)
+SFINLINE void stream(int32_t* p, const __m128i& x)
{
- return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
+ return _mm_stream_si128(reinterpret_cast<__m128i*>(p), x);
}
-template <typename T>
-T loadu(const int32_t* p);
-template <>
-SFINLINE __m128i loadu<__m128i>(const int32_t* p)
+
+/*-----------cast--------------------------*/
+SFINLINE __m128i castps_si(const __m128& x)
{
- return _mm_loadu_si128(reinterpret_cast<const __m128i*>(p));
+ return _mm_castps_si128(x);
}
-template <>
-SFINLINE __m256i loadu<__m256i>(const int32_t* p)
+SFINLINE __m128 castsi_ps(const __m128i& x)
{
- return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
+ return _mm_castsi128_ps(x);
}
-/*-------------store---------------------*/
-template <typename T>
-void store(float* p, const T& x) {}
-template <>
-SFINLINE void store<__m128>(float* p, const __m128& x)
+/*-------------------logical-------------------------------*/
+SFINLINE __m128 and_ps(const __m128& x, const __m128& y)
{
- _mm_store_ps(p, x);
+ return _mm_and_ps(x, y);
}
-template <>
-SFINLINE void store<__m256>(float* p, const __m256& x)
+SFINLINE __m128i and_si(const __m128i& x, const __m128i& y)
{
- _mm256_store_ps(p, x);
+ return _mm_and_si128(x, y);
}
-template <typename T>
-void storeu(float* p, const T& x) {}
+SFINLINE __m128 or_ps(const __m128& x, const __m128& y)
+{
+ return _mm_or_ps(x, y);
+}
-template <>
-SFINLINE void storeu<__m128>(float* p, const __m128& x)
+SFINLINE __m128i or_si(const __m128i& x, const __m128i& y)
{
- _mm_storeu_ps(p, x);
+ return _mm_or_si128(x, y);
}
-template <>
-SFINLINE void storeu<__m256>(float* p, const __m256& x)
+SFINLINE __m128 andnot_ps(const __m128& x, const __m128& y)
{
- _mm256_storeu_ps(p, x);
+ return _mm_andnot_ps(x, y);
}
-template <typename T>
-void stream(float* p, const T& x) {}
+SFINLINE __m128i andnot_si(const __m128i& x, const __m128i& y)
+{
+ return _mm_andnot_si128(x, y);
+}
-template <>
-SFINLINE void stream<__m128>(float* p, const __m128& x)
+SFINLINE __m128 xor_ps(const __m128& x, const __m128& y)
{
- _mm_stream_ps(p, x);
+ return _mm_xor_ps(x, y);
}
-template <>
-SFINLINE void stream<__m256>(float* p, const __m256& x)
+SFINLINE __m128i xor_si(const __m128i& x, const __m128i& y)
{
- _mm256_stream_ps(p, x);
+ return _mm_xor_si128(x, y);
}
-template <typename T>
-void stream(uint8_t* p, const T& x) {}
-template <>
-SFINLINE void stream<__m128i>(uint8_t* p, const __m128i& x)
+/*-----------------shift-----------------------*/
+SFINLINE __m128i srli_i32(const __m128i& x, int n)
{
- return _mm_stream_si128(reinterpret_cast<__m128i*>(p), x);
+ return _mm_srli_epi32(x, n);
}
-template <>
-SFINLINE void stream<__m256i>(uint8_t* p, const __m256i& x)
+
+/*------------------arithmetic--------------------*/
+SFINLINE __m128 add(const __m128& x, const __m128& y)
{
- return _mm256_stream_si256(reinterpret_cast<__m256i*>(p), x);
+ return _mm_add_ps(x, y);
}
-template <typename T>
-void stream(int32_t* p, const T& x) {}
+SFINLINE __m128 sub(const __m128& x, const __m128& y)
+{
+ return _mm_sub_ps(x, y);
+}
-template <>
-SFINLINE void stream<__m128i>(int32_t* p, const __m128i& x)
+SFINLINE __m128 mul(const __m128& x, const __m128& y)
{
- return _mm_stream_si128(reinterpret_cast<__m128i*>(p), x);
+ return _mm_mul_ps(x, y);
}
-template <>
-SFINLINE void stream<__m256i>(int32_t* p, const __m256i& x)
+SFINLINE __m128 madd(const __m128& x, const __m128& y, const __m128& z)
{
- return _mm256_stream_si256(reinterpret_cast<__m256i*>(p), x);
+ return add(mul(x, y), z);
}
-/*-----------cast--------------------------*/
-SFINLINE __m128i castps_si(const __m128& x)
+/*--------------convert-----------------------*/
+SFINLINE __m128i cvtps_i32(const __m128& x)
{
- return _mm_castps_si128(x);
+ return _mm_cvtps_epi32(x);
}
-SFINLINE __m256i castps_si(const __m256& x)
+template <typename T, arch_t ARCH>
+T cvtu8_ps(const uint8_t* ptr);
+
+template <>
+SFINLINE __m128 cvtu8_ps<__m128, HAS_SSE2>(const uint8_t* ptr)
{
- return _mm256_castps_si256(x);
+ const int32_t* p32 = reinterpret_cast<const int32_t*>(ptr);
+ __m128i t = _mm_cvtsi32_si128(p32[0]);
+ __m128i z = zero<__m128i>();
+ t = _mm_unpacklo_epi8(t, z);
+ t = _mm_unpacklo_epi16(t, z);
+ return _mm_cvtepi32_ps(t);
}
-SFINLINE __m128 castsi_ps(const __m128i& x)
+template <>
+SFINLINE __m128 cvtu8_ps<__m128, HAS_SSE41>(const uint8_t* ptr)
{
- return _mm_castsi128_ps(x);
+ const int32_t* p32 = reinterpret_cast<const int32_t*>(ptr);
+ __m128i t = _mm_cvtsi32_si128(p32[0]);
+ t = _mm_cvtepu8_epi32(t);
+ return _mm_cvtepi32_ps(t);
}
-SFINLINE __m256 castsi_ps(const __m256i& x)
+SFINLINE __m128i
+cvti32_u8(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
{
- return _mm256_castsi256_ps(x);
+ __m128i x = _mm_packs_epi32(a, b);
+ __m128i y = _mm_packs_epi32(c, d);
+ return _mm_packus_epi16(x, y);
}
+/*-----------------math-----------------------*/
+SFINLINE __m128 max(const __m128& x, const __m128& y)
+{
+ return _mm_max_ps(x, y);
+}
-/*-------------------logical-------------------------------*/
-SFINLINE __m128 and_ps(const __m128& x, const __m128& y)
+SFINLINE __m128 rcp_ps(const __m128& x)
{
- return _mm_and_ps(x, y);
+ return _mm_rcp_ps(x);
}
-SFINLINE __m256 and_ps(const __m256& x, const __m256& y)
+SFINLINE __m128 sqrt(const __m128& x)
{
- return _mm256_and_ps(x, y);
+ return _mm_sqrt_ps(x);
}
-SFINLINE __m128i and_si(const __m128i& x, const __m128i& y)
+
+/*-----------compare-------------------------------*/
+
+SFINLINE __m128i cmpeq_i32(const __m128i& x, const __m128i& y)
{
- return _mm_and_si128(x, y);
+ return _mm_cmpeq_epi32(x, y);
}
-SFINLINE __m256i and_si(const __m256i& x, const __m256i& y)
+SFINLINE __m128 cmplt_ps(const __m128& x, const __m128& y)
{
- return _mm256_and_si256(x, y);
+ return _mm_cmplt_ps(x, y);
}
-SFINLINE __m128 or_ps(const __m128& x, const __m128& y)
+SFINLINE __m128 cmpge_ps(const __m128& x, const __m128& y)
{
- return _mm_or_ps(x, y);
+ return _mm_cmpge_ps(x, y);
}
-SFINLINE __m256 or_ps(const __m256& x, const __m256& y)
+SFINLINE __m128 cmpord_ps(const __m128& x, const __m128& y)
{
- return _mm256_or_ps(x, y);
+ return _mm_cmpord_ps(x, y);
}
-SFINLINE __m128i or_si(const __m128i& x, const __m128i& y)
+
+
+
+/*----------------misc-----------------------------*/
+SFINLINE __m128 blendv(const __m128& x, const __m128& y, const __m128& mask)
{
- return _mm_or_si128(x, y);
+ return or_ps(and_ps(mask, y), andnot_ps(mask, x));
}
-SFINLINE __m256i or_si(const __m256i& x, const __m256i& y)
+
+
+
+
+#if defined(__AVX2__)
+
+template <>
+SFINLINE __m256i zero<__m256i>()
{
- return _mm256_or_si256(x, y);
+ return _mm256_setzero_si256();
}
-SFINLINE __m128 andnot_ps(const __m128& x, const __m128& y)
+template <>
+SFINLINE __m256 zero<__m256>()
{
- return _mm_andnot_ps(x, y);
+ return _mm256_setzero_ps();
}
-SFINLINE __m256 andnot_ps(const __m256& x, const __m256& y)
+template <>
+SFINLINE __m256 set1_ps<__m256>(const float& x)
{
- return _mm256_andnot_ps(x, y);
+ return _mm256_set1_ps(x);
}
-SFINLINE __m128i andnot_si(const __m128i& x, const __m128i& y)
+template <>
+SFINLINE __m256i set1_i8<__m256i>(const int8_t& x)
{
- return _mm_andnot_si128(x, y);
+ return _mm256_set1_epi8(x);
}
-SFINLINE __m256i andnot_si(const __m256i& x, const __m256i& y)
+template <>
+SFINLINE __m256 load(const float* p)
{
- return _mm256_andnot_si256(x, y);
+ return _mm256_load_ps(p);
}
-SFINLINE __m128 xor_ps(const __m128& x, const __m128& y)
+template <>
+SFINLINE __m256i load(const uint8_t* p)
{
- return _mm_xor_ps(x, y);
+ return _mm256_load_si256(reinterpret_cast<const __m256i*>(p));
}
-SFINLINE __m256 xor_ps(const __m256& x, const __m256& y)
+template <>
+SFINLINE __m256i load(const int32_t* p)
{
- return _mm256_xor_ps(x, y);
+ return _mm256_load_si256(reinterpret_cast<const __m256i*>(p));
}
-SFINLINE __m128i xor_si(const __m128i& x, const __m128i& y)
+template <>
+SFINLINE __m256 loadu(const float* p)
{
- return _mm_xor_si128(x, y);
+ return _mm256_loadu_ps(p);
}
-SFINLINE __m256i xor_si(const __m256i& x, const __m256i& y)
+template <>
+SFINLINE __m256i loadu(const uint8_t* p)
{
- return _mm256_xor_si256(x, y);
+ return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
}
+template <>
+SFINLINE __m256i loadu(const int32_t* p)
+{
+ return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(p));
+}
-/*-----------------shift-----------------------*/
-SFINLINE __m128i srli_i32(const __m128i& x, int n)
+SFINLINE void store(float* p, const __m256& x)
{
- return _mm_srli_epi32(x, n);
+ _mm256_store_ps(p, x);
}
-SFINLINE __m256i srli_i32(const __m256i& x, int n)
+SFINLINE void storeu(float* p, const __m256& x)
{
- return _mm256_srli_epi32(x, n);
+ _mm256_storeu_ps(p, x);
}
+SFINLINE void stream(float* p, const __m256& x)
+{
+ _mm256_stream_ps(p, x);
+}
-/*------------------arithmetic--------------------*/
-SFINLINE __m128 add(const __m128& x, const __m128& y)
+SFINLINE void stream(uint8_t* p, const __m256i& x)
{
- return _mm_add_ps(x, y);
+ return _mm256_stream_si256(reinterpret_cast<__m256i*>(p), x);
}
-SFINLINE __m256 add(const __m256& x, const __m256& y)
+SFINLINE void stream(int32_t* p, const __m256i& x)
{
- return _mm256_add_ps(x, y);
+ return _mm256_stream_si256(reinterpret_cast<__m256i*>(p), x);
}
-SFINLINE __m128 sub(const __m128& x, const __m128& y)
+SFINLINE __m256i castps_si(const __m256& x)
{
- return _mm_sub_ps(x, y);
+ return _mm256_castps_si256(x);
}
-SFINLINE __m256 sub(const __m256& x, const __m256& y)
+SFINLINE __m256 castsi_ps(const __m256i& x)
{
- return _mm256_sub_ps(x, y);
+ return _mm256_castsi256_ps(x);
}
-SFINLINE __m128 mul(const __m128& x, const __m128& y)
+SFINLINE __m256 and_ps(const __m256& x, const __m256& y)
{
- return _mm_mul_ps(x, y);
+ return _mm256_and_ps(x, y);
}
-SFINLINE __m256 mul(const __m256& x, const __m256& y)
+SFINLINE __m256i and_si(const __m256i& x, const __m256i& y)
{
- return _mm256_mul_ps(x, y);
+ return _mm256_and_si256(x, y);
}
-SFINLINE __m128 madd(const __m128& x, const __m128& y, const __m128& z)
+SFINLINE __m256 or_ps(const __m256& x, const __m256& y)
{
- return add(mul(x, y), z);
+ return _mm256_or_ps(x, y);
}
-SFINLINE __m256 madd(const __m256& x, const __m256& y, const __m256& z)
+SFINLINE __m256i or_si(const __m256i& x, const __m256i& y)
{
- return _mm256_fmadd_ps(x, y, z);
+ return _mm256_or_si256(x, y);
}
+SFINLINE __m256 andnot_ps(const __m256& x, const __m256& y)
+{
+ return _mm256_andnot_ps(x, y);
+}
+SFINLINE __m256i andnot_si(const __m256i& x, const __m256i& y)
+{
+ return _mm256_andnot_si256(x, y);
+}
-/*--------------convert-----------------------*/
-SFINLINE __m128i cvtps_i32(const __m128& x)
+SFINLINE __m256 xor_ps(const __m256& x, const __m256& y)
{
- return _mm_cvtps_epi32(x);
+ return _mm256_xor_ps(x, y);
}
-SFINLINE __m256i cvtps_i32(const __m256& x)
+SFINLINE __m256i xor_si(const __m256i& x, const __m256i& y)
{
- return _mm256_cvtps_epi32(x);
+ return _mm256_xor_si256(x, y);
}
-template <typename T, arch_t ARCH>
-T cvtu8_ps(const uint8_t* ptr);
+SFINLINE __m256i srli_i32(const __m256i& x, int n)
+{
+ return _mm256_srli_epi32(x, n);
+}
-template <>
-SFINLINE __m128 cvtu8_ps<__m128, HAS_SSE2>(const uint8_t* ptr)
+SFINLINE __m256 add(const __m256& x, const __m256& y)
{
- const int32_t* p32 = reinterpret_cast<const int32_t*>(ptr);
- __m128i t = _mm_cvtsi32_si128(p32[0]);
- __m128i z = zero<__m128i>();
- t = _mm_unpacklo_epi8(t, z);
- t = _mm_unpacklo_epi16(t, z);
- return _mm_cvtepi32_ps(t);
+ return _mm256_add_ps(x, y);
}
-template <>
-SFINLINE __m128 cvtu8_ps<__m128, HAS_SSE41>(const uint8_t* ptr)
+SFINLINE __m256 sub(const __m256& x, const __m256& y)
{
- const int32_t* p32 = reinterpret_cast<const int32_t*>(ptr);
- __m128i t = _mm_cvtsi32_si128(p32[0]);
- t = _mm_cvtepu8_epi32(t);
- return _mm_cvtepi32_ps(t);
+ return _mm256_sub_ps(x, y);
+}
+
+SFINLINE __m256 mul(const __m256& x, const __m256& y)
+{
+ return _mm256_mul_ps(x, y);
+}
+
+SFINLINE __m256 madd(const __m256& x, const __m256& y, const __m256& z)
+{
+ return _mm256_fmadd_ps(x, y, z);
+}
+
+SFINLINE __m256i cvtps_i32(const __m256& x)
+{
+ return _mm256_cvtps_epi32(x);
}
template <>
@@ -469,14 +520,6 @@ SFINLINE __m256 cvtu8_ps<__m256, HAS_AVX2>(const uint8_t* ptr)
return _mm256_cvtepi32_ps(t1);
}
-SFINLINE __m128i
-cvti32_u8(const __m128i& a, const __m128i& b, const __m128i& c, const __m128i& d)
-{
- __m128i x = _mm_packs_epi32(a, b);
- __m128i y = _mm_packs_epi32(c, d);
- return _mm_packus_epi16(x, y);
-}
-
SFINLINE __m256i
cvti32_u8(const __m256i& a, const __m256i& b, const __m256i& c, const __m256i& d)
{
@@ -487,109 +530,64 @@ cvti32_u8(const __m256i& a, const __m256i& b, const __m256i& c, const __m256i& d
return _mm256_packus_epi16(t0, t1);
}
-
-/*-----------------math-----------------------*/
-SFINLINE __m128 max(const __m128& x, const __m128& y)
-{
- return _mm_max_ps(x, y);
-}
-
SFINLINE __m256 max(const __m256& x, const __m256& y)
{
return _mm256_max_ps(x, y);
}
-template <typename T>
-SFINLINE T abs(const T& val)
-{
- return max(val, sub(zero<T>(), val));
-}
-
-SFINLINE __m128 rcp_ps(const __m128& x)
-{
- return _mm_rcp_ps(x);
-}
-
SFINLINE __m256 rcp_ps(const __m256& x)
{
return _mm256_rcp_ps(x);
}
-template <typename T>
-SFINLINE T rcp_hq(const T& x)
-{
- T rcp = rcp_ps(x);
- T t = mul(mul(x, rcp), rcp);
- rcp = add(rcp, rcp);
- return sub(rcp, t);
-}
-
-SFINLINE __m128 sqrt(const __m128& x)
-{
- return _mm_sqrt_ps(x);
-}
-
SFINLINE __m256 sqrt(const __m256& x)
{
return _mm256_sqrt_ps(x);
}
-
-/*-----------compare-------------------------------*/
-
-SFINLINE __m128i cmpeq_i32(const __m128i& x, const __m128i& y)
-{
- return _mm_cmpeq_epi32(x, y);
-}
-
SFINLINE __m256i cmpeq_i32(const __m256i& x, const __m256i& y)
{
return _mm256_cmpeq_epi32(x, y);
}
-SFINLINE __m128 cmplt_ps(const __m128& x, const __m128& y)
-{
- return _mm_cmplt_ps(x, y);
-}
-
SFINLINE __m256 cmplt_ps(const __m256& x, const __m256& y)
{
return _mm256_cmp_ps(x, y, _CMP_LT_OQ);
}
-SFINLINE __m128 cmpge_ps(const __m128& x, const __m128& y)
-{
- return _mm_cmpge_ps(x, y);
-}
-
SFINLINE __m256 cmpge_ps(const __m256& x, const __m256& y)
{
return _mm256_cmp_ps(x, y, _CMP_GE_OQ);
}
-SFINLINE __m128 cmpord_ps(const __m128& x, const __m128& y)
+SFINLINE __m256 cmpord_ps(const __m256& x, const __m256& y)
{
- return _mm_cmpord_ps(x, y);
+ return _mm256_cmp_ps(x, y, _CMP_ORD_Q);
}
-SFINLINE __m256 cmpord_ps(const __m256& x, const __m256& y)
+SFINLINE __m256 blendv(const __m256&x, const __m256& y, const __m256& mask)
{
- return _mm256_cmp_ps(x, y, _CMP_ORD_Q);
+ return _mm256_blendv_ps(x, y, mask);
}
+#endif // __AVX2__
-/*----------------misc-----------------------------*/
-SFINLINE __m128 blendv(const __m128& x, const __m128& y, const __m128& mask)
+
+template <typename T>
+SFINLINE T abs(const T& val)
{
- return or_ps(and_ps(mask, y), andnot_ps(mask, x));
+ return max(val, sub(zero<T>(), val));
}
-SFINLINE __m256 blendv(const __m256&x, const __m256& y, const __m256& mask)
+template <typename T>
+SFINLINE T rcp_hq(const T& x)
{
- return _mm256_blendv_ps(x, y, mask);
+ T rcp = rcp_ps(x);
+ T t = mul(mul(x, rcp), rcp);
+ rcp = add(rcp, rcp);
+ return sub(rcp, t);
}
-
#endif
diff --git a/avisynth/src/tcannymod.cpp b/avisynth/src/tcannymod.cpp
index 3e1a8f1..d3afd6a 100644
--- a/avisynth/src/tcannymod.cpp
+++ b/avisynth/src/tcannymod.cpp
@@ -28,39 +28,29 @@
#include
#include