diff --git a/src/mono/mono/arch/arm64/arm64-codegen.h b/src/mono/mono/arch/arm64/arm64-codegen.h index 67c0864ed91bd..49dc27294832a 100644 --- a/src/mono/mono/arch/arm64/arm64-codegen.h +++ b/src/mono/mono/arch/arm64/arm64-codegen.h @@ -1111,8 +1111,8 @@ arm_encode_arith_imm (int imm, guint32 *shift) /* NEON :: extract */ #define arm_neon_extr_opcode(p, q, op2, imm4, rd, rn, rm) arm_neon_opcode_3reg ((p), (q), 0b00101110000000000000000000000000 | (op2) << 22 | (imm4) << 11, (rd), (rn), (rm)) -#define arm_neon_ext_8b(p, rd, rn, rm, index) arm_neon_extr_opcode ((p), VREG_LOW, 0b00, (index), (rd), (rn), (rd)) -#define arm_neon_ext_16b(p, rd, rn, rm, index) arm_neon_extr_opcode ((p), VREG_FULL, 0b00, (index), (rd), (rn), (rd)) +#define arm_neon_ext_8b(p, rd, rn, rm, index) arm_neon_extr_opcode ((p), VREG_LOW, 0b00, (index), (rd), (rn), (rm)) +#define arm_neon_ext_16b(p, rd, rn, rm, index) arm_neon_extr_opcode ((p), VREG_FULL, 0b00, (index), (rd), (rn), (rm)) /* NEON :: copy */ #define arm_neon_cpy_opcode(p, q, op, imm5, imm4, rd, rn) arm_neon_opcode_2reg ((p), (q), 0b00001110000000000000010000000000 | (op) << 29 | (imm5) << 16 | (imm4) << 11, (rd), (rn)) diff --git a/src/mono/mono/mini/cpu-arm64.mdesc b/src/mono/mono/mini/cpu-arm64.mdesc index 0e27f23291ad3..8da7174c0fd58 100644 --- a/src/mono/mono/mini/cpu-arm64.mdesc +++ b/src/mono/mono/mini/cpu-arm64.mdesc @@ -521,6 +521,16 @@ expand_i4: dest:x src1:i len:4 expand_i8: dest:x src1:i len:4 expand_r4: dest:x src1:f len:4 expand_r8: dest:x src1:f len:4 +insert_i1: dest:x src1:i len:4 +insert_i2: dest:x src1:i len:4 +insert_i4: dest:x src1:i len:4 +insert_i8: dest:x src1:i len:4 +insert_r4: dest:x src1:f len:4 +insert_r8: dest:x src1:f len:4 +create_scalar_int: dest:x src1:i len:8 +create_scalar_float: dest:x src1:f len:12 +create_scalar_unsafe_int: dest:x src1:i len:4 +create_scalar_unsafe_float: dest:x src1:f len:4 generic_class_init: src1:a len:44 clob:c gc_safe_point: src1:i len:12 clob:c diff --git a/src/mono/mono/mini/mini-arm64.c b/src/mono/mono/mini/mini-arm64.c index e9e9ff6b0edcb..3d23bc27da4ba 100644 --- a/src/mono/mono/mini/mini-arm64.c +++ b/src/mono/mono/mini/mini-arm64.c @@ -3717,6 +3717,48 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) } break; } + /* SIMD that is not table-generated */ + /* TODO: once https://github.com/dotnet/runtime/issues/83252 is done, + * move the following two to the codegen table in simd-arm64.h + */ + case OP_ONES_COMPLEMENT: + arm_neon_not (code, get_vector_size_macro (ins), dreg, sreg1); + break; + case OP_NEGATION: + if (is_type_float_macro (ins->inst_c1)) { + arm_neon_fneg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1); + } else { + arm_neon_neg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1); + } + break; + case OP_XBINOP: + switch (ins->inst_c0) { + case OP_IMAX: + code = emit_smax_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2); + break; + case OP_IMAX_UN: + code = emit_umax_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2); + break; + case OP_IMIN: + code = emit_smin_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2); + break; + case OP_IMIN_UN: + code = emit_umin_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2); + break; + default: + g_assert_not_reached (); + } + break; + case OP_XZERO: + arm_neon_eor_16b (code, dreg, dreg, dreg); + break; + case OP_XONES: + arm_neon_eor_16b (code, dreg, dreg, dreg); + arm_neon_not_16b (code, dreg, dreg); + break; + case OP_XEXTRACT: + code = emit_xextract (code, VREG_FULL, ins->inst_c0, dreg, sreg1); + break; case OP_STOREX_MEMBASE: code = emit_strfpq (code, sreg1, dreg, ins->inst_offset); break; @@ -3730,10 +3772,10 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) if (cfg->compile_aot && cfg->code_exec_only) { mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128_GOT, ins->inst_p0); arm_ldrx_lit (code, ARMREG_IP0, 0); - arm_ldrfpq (code, ins->dreg, ARMREG_IP0, 0); + arm_ldrfpq (code, dreg, ARMREG_IP0, 0); } else { mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128, ins->inst_p0); - arm_neon_ldrq_lit (code, ins->dreg, 0); + arm_neon_ldrq_lit (code, dreg, 0); } break; } @@ -3744,13 +3786,13 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) case OP_EXPAND_I4: case OP_EXPAND_I8: { const int t = get_type_size_macro (ins->inst_c1); - arm_neon_dup_g (code, VREG_FULL, t, ins->dreg, ins->sreg1); + arm_neon_dup_g (code, VREG_FULL, t, dreg, sreg1); break; } case OP_EXPAND_R4: case OP_EXPAND_R8: { const int t = get_type_size_macro (ins->inst_c1); - arm_neon_fdup_e (code, VREG_FULL, t, ins->dreg, ins->sreg1, 0); + arm_neon_fdup_e (code, VREG_FULL, t, dreg, sreg1, 0); break; } case OP_EXTRACT_I1: @@ -3760,9 +3802,9 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) const int t = get_type_size_macro (ins->inst_c1); // smov is not defined for i64 if (is_type_unsigned_macro (ins->inst_c1) || t == TYPE_I64) { - arm_neon_umov (code, t, ins->dreg, ins->sreg1, ins->inst_c0); + arm_neon_umov (code, t, dreg, sreg1, ins->inst_c0); } else { - arm_neon_smov (code, t, ins->dreg, ins->sreg1, ins->inst_c0); + arm_neon_smov (code, t, dreg, sreg1, ins->inst_c0); } break; } @@ -3773,17 +3815,39 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) // Technically, this broadcasts element #inst_c0 to all dest XREG elements; whereas it should // set the FREG to the said element. Since FREG and XREG pool is the same on arm64 and the rest // of the F/XREG is ignored in FREG mode, this operation remains valid. - arm_neon_fdup_e (code, VREG_FULL, t, ins->dreg, ins->sreg1, ins->inst_c0); + arm_neon_fdup_e (code, VREG_FULL, t, dreg, sreg1, ins->inst_c0); } break; + case OP_INSERT_I1: + case OP_INSERT_I2: + case OP_INSERT_I4: + case OP_INSERT_I8: { + const int t = get_type_size_macro (ins->inst_c1); + arm_neon_ins_g(code, t, dreg, sreg1, ins->inst_c0); + break; + } + case OP_INSERT_R4: + case OP_INSERT_R8: { + int t = 0; + switch (ins->inst_c1) { + case MONO_TYPE_R4: + t = SIZE_4; + break; + case MONO_TYPE_R8: + t = SIZE_8; + break; + } + arm_neon_ins_e(code, t, dreg, sreg1, ins->inst_c0, 0); + break; + } case OP_ARM64_XADDV: { switch (ins->inst_c0) { case INTRINS_AARCH64_ADV_SIMD_FADDV: if (ins->inst_c1 == MONO_TYPE_R8) { - arm_neon_faddp (code, VREG_FULL, TYPE_F64, ins->dreg, ins->sreg1, ins->sreg1); + arm_neon_faddp (code, VREG_FULL, TYPE_F64, dreg, sreg1, sreg1); } else if (ins->inst_c1 == MONO_TYPE_R4) { - arm_neon_faddp (code, VREG_FULL, TYPE_F32, ins->dreg, ins->sreg1, ins->sreg1); - arm_neon_faddp (code, VREG_FULL, TYPE_F32, ins->dreg, ins->dreg, ins->dreg); + arm_neon_faddp (code, VREG_FULL, TYPE_F32, dreg, sreg1, sreg1); + arm_neon_faddp (code, VREG_FULL, TYPE_F32, dreg, dreg, dreg); } else { g_assert_not_reached (); } @@ -3792,7 +3856,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) case INTRINS_AARCH64_ADV_SIMD_UADDV: case INTRINS_AARCH64_ADV_SIMD_SADDV: if (get_type_size_macro (ins->inst_c1) == TYPE_I64) - arm_neon_addp (code, VREG_FULL, TYPE_I64, ins->dreg, ins->sreg1, ins->sreg1); + arm_neon_addp (code, VREG_FULL, TYPE_I64, dreg, sreg1, sreg1); else g_assert_not_reached (); // remaining int types are handled through the codegen table break; @@ -3802,6 +3866,52 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) } break; } + case OP_CREATE_SCALAR_INT: { + const int t = get_type_size_macro (ins->inst_c1); + arm_neon_eor_16b (code, dreg, dreg, dreg); + arm_neon_ins_g(code, t, dreg, sreg1, 0); + break; + } + case OP_CREATE_SCALAR_FLOAT: { + int t = 0; + switch (ins->inst_c1) { + case MONO_TYPE_R4: + t = SIZE_4; + break; + case MONO_TYPE_R8: + t = SIZE_8; + break; + } + // Use a temp register for zero op, as sreg1 and dreg share the same register here + arm_neon_eor_16b (code, NEON_TMP_REG, NEON_TMP_REG, NEON_TMP_REG); + arm_neon_ins_e(code, t, NEON_TMP_REG, sreg1, 0, 0); + arm_neon_mov (code, dreg, NEON_TMP_REG); + break; + } + case OP_CREATE_SCALAR_UNSAFE_INT: { + const int t = get_type_size_macro (ins->inst_c1); + arm_neon_ins_g(code, t, dreg, sreg1, 0); + break; + } + case OP_CREATE_SCALAR_UNSAFE_FLOAT: { + if (dreg != sreg1) { + int t = 0; + switch (ins->inst_c1) { + case MONO_TYPE_R4: + t = SIZE_4; + break; + case MONO_TYPE_R8: + t = SIZE_8; + break; + } + arm_neon_ins_e(code, t, dreg, sreg1, 0, 0); + } + break; + } + // Enable this when adding support for Narrow and enable support for Create at the same time + // case OP_XCONCAT: + // arm_neon_ext_16b(code, dreg, sreg1, sreg2, 8); + // break; /* BRANCH */ case OP_BR: @@ -3875,49 +3985,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb) arm_cbnzx (code, sreg1, 0); break; - /* SIMD that is not table-generated */ - /* TODO: once https://github.com/dotnet/runtime/issues/83252 is done, - * move the following two to the codegen table in simd-arm64.h - */ - case OP_ONES_COMPLEMENT: - arm_neon_not (code, get_vector_size_macro (ins), dreg, sreg1); - break; - case OP_NEGATION: - if (is_type_float_macro (ins->inst_c1)) { - arm_neon_fneg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1); - } else { - arm_neon_neg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1); - } - break; - case OP_XBINOP: - switch (ins->inst_c0) { - case OP_IMAX: - code = emit_smax_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2); - break; - case OP_IMAX_UN: - code = emit_umax_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2); - break; - case OP_IMIN: - code = emit_smin_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2); - break; - case OP_IMIN_UN: - code = emit_umin_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2); - break; - default: - g_assert_not_reached (); - } - break; - case OP_XZERO: - arm_neon_eor_16b (code, dreg, dreg, dreg); - break; - case OP_XONES: - arm_neon_eor_16b (code, dreg, dreg, dreg); - arm_neon_not_16b (code, dreg, dreg); - break; - case OP_XEXTRACT: - code = emit_xextract (code, VREG_FULL, ins->inst_c0, dreg, sreg1); - break; - /* ALU */ case OP_IADD: arm_addw (code, dreg, sreg1, sreg2); diff --git a/src/mono/mono/mini/mini-ops.h b/src/mono/mono/mini/mini-ops.h index e8f5188a528bc..c6b49fd723330 100644 --- a/src/mono/mono/mini/mini-ops.h +++ b/src/mono/mono/mini/mini-ops.h @@ -1168,6 +1168,11 @@ MINI_OP3(OP_MULX_HL64, "mulxhl64", LREG, LREG, LREG, LREG) MINI_OP(OP_CREATE_SCALAR_UNSAFE, "create_scalar_unsafe", XREG, XREG, NONE) MINI_OP(OP_CREATE_SCALAR, "create_scalar", XREG, XREG, NONE) +MINI_OP(OP_CREATE_SCALAR_UNSAFE_INT, "create_scalar_unsafe_int", XREG, IREG, NONE) +MINI_OP(OP_CREATE_SCALAR_UNSAFE_FLOAT, "create_scalar_unsafe_float", XREG, FREG, NONE) +MINI_OP(OP_CREATE_SCALAR_INT, "create_scalar_int", XREG, IREG, NONE) +MINI_OP(OP_CREATE_SCALAR_FLOAT, "create_scalar_float", XREG, FREG, NONE) + MINI_OP(OP_XMOVE, "xmove", XREG, XREG, NONE) MINI_OP(OP_XZERO, "xzero", XREG, NONE, NONE) MINI_OP(OP_XONES, "xones", XREG, NONE, NONE) diff --git a/src/mono/mono/mini/simd-intrinsics.c b/src/mono/mono/mini/simd-intrinsics.c index 8f8d158f52e83..7e89aba8d6981 100644 --- a/src/mono/mono/mini/simd-intrinsics.c +++ b/src/mono/mono/mini/simd-intrinsics.c @@ -843,9 +843,9 @@ type_to_expand_op (MonoTypeEnum type) } static int -type_to_insert_op (MonoType *type) +type_to_insert_op (MonoTypeEnum type) { - switch (type->type) { + switch (type) { case MONO_TYPE_I1: case MONO_TYPE_U1: return OP_INSERT_I1; @@ -992,14 +992,15 @@ emit_hardware_intrinsics ( static MonoInst * emit_vector_create_elementwise ( MonoCompile *cfg, MonoMethodSignature *fsig, MonoType *vtype, - MonoType *etype, MonoInst **args) + MonoTypeEnum type, MonoInst **args) { - int op = type_to_insert_op (etype); + int op = type_to_insert_op (type); MonoClass *vklass = mono_class_from_mono_type_internal (vtype); MonoInst *ins = emit_xzero (cfg, vklass); for (int i = 0; i < fsig->param_count; ++i) { ins = emit_simd_ins (cfg, vklass, op, ins->dreg, args [i]->dreg); ins->inst_c0 = i; + ins->inst_c1 = type; } return ins; } @@ -1097,11 +1098,6 @@ static guint16 sri_vector_methods [] = { SN_AsUInt16, SN_AsUInt32, SN_AsUInt64, - SN_AsVector128, - SN_AsVector2, - SN_AsVector256, - SN_AsVector3, - SN_AsVector4, SN_BitwiseAnd, SN_BitwiseOr, SN_Ceiling, @@ -1150,8 +1146,6 @@ static guint16 sri_vector_methods [] = { SN_ToScalar, SN_ToVector128, SN_ToVector128Unsafe, - SN_ToVector256, - SN_ToVector256Unsafe, SN_WidenLower, SN_WidenUpper, SN_WithElement, @@ -1216,11 +1210,6 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi if (!COMPILE_LLVM (cfg)) return NULL; #endif -// FIXME: This limitation could be removed once everything here are supported by mini JIT on arm64 -#ifdef TARGET_ARM64 - if (!(cfg->compile_aot && cfg->full_aot && !cfg->interp)) - return NULL; -#endif int id = lookup_intrins (sri_vector_methods, sizeof (sri_vector_methods), cmethod); if (id == -1) { @@ -1228,64 +1217,40 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi return NULL; } - if (!strcmp (m_class_get_name (cfg->method->klass), "Vector256") || !strcmp (m_class_get_name (cfg->method->klass), "Vector512")) + if (!strcmp (m_class_get_name (cmethod->klass), "Vector256") || !strcmp (m_class_get_name (cmethod->klass), "Vector512")) return NULL; // FIXME: This limitation could be removed once everything here are supported by mini JIT on arm64 #ifdef TARGET_ARM64 if (!COMPILE_LLVM (cfg)) { + if (!(!strcmp (m_class_get_name (cmethod->klass), "Vector128") || !strcmp (m_class_get_name (cmethod->klass), "Vector"))) + return NULL; switch (id) { - case SN_Add: - case SN_Equals: - case SN_GreaterThan: - case SN_GreaterThanOrEqual: - case SN_LessThan: - case SN_LessThanOrEqual: - case SN_Negate: - case SN_OnesComplement: - case SN_EqualsAny: - case SN_GreaterThanAny: - case SN_GreaterThanOrEqualAny: - case SN_LessThanAny: - case SN_LessThanOrEqualAny: - case SN_EqualsAll: - case SN_GreaterThanAll: - case SN_GreaterThanOrEqualAll: - case SN_LessThanAll: - case SN_LessThanOrEqualAll: - case SN_Subtract: - case SN_BitwiseAnd: - case SN_BitwiseOr: - case SN_Xor: - case SN_As: - case SN_AsByte: - case SN_AsDouble: - case SN_AsInt16: - case SN_AsInt32: - case SN_AsInt64: - case SN_AsSByte: - case SN_AsSingle: - case SN_AsUInt16: - case SN_AsUInt32: - case SN_AsUInt64: - case SN_Max: - case SN_Min: - case SN_Sum: - case SN_ToScalar: - case SN_Floor: - case SN_Ceiling: - case SN_Divide: - case SN_Multiply: - case SN_Sqrt: - case SN_Abs: - break; - default: + case SN_AndNot: + case SN_ConditionalSelect: + case SN_ConvertToDouble: + case SN_ConvertToInt32: + case SN_ConvertToInt64: + case SN_ConvertToSingle: + case SN_ConvertToUInt32: + case SN_ConvertToUInt64: + case SN_Create: + case SN_Dot: + case SN_ExtractMostSignificantBits: + case SN_GetElement: + case SN_GetLower: + case SN_GetUpper: + case SN_Narrow: + case SN_Shuffle: + case SN_ToVector128: + case SN_ToVector128Unsafe: + case SN_WidenLower: + case SN_WidenUpper: + case SN_WithElement: return NULL; + default: + break; } - MonoClass *arg0_class = mono_class_from_mono_type_internal (fsig->params [0]); - int class_size = mono_class_value_size (arg0_class, NULL); - if (class_size != 16) - return NULL; } #endif @@ -1462,25 +1427,44 @@ emit_sri_vector (MonoCompile *cfg, MonoMethod *cmethod, MonoMethodSignature *fsi MonoType *etype = get_vector_t_elem_type (fsig->ret); if (!MONO_TYPE_IS_VECTOR_PRIMITIVE (etype)) return NULL; - if (fsig->param_count == 1 && mono_metadata_type_equal (fsig->params [0], etype)) - return emit_simd_ins (cfg, klass, type_to_expand_op (etype->type), args [0]->dreg, -1); - else if (is_create_from_half_vectors_overload (fsig)) + if (fsig->param_count == 1 && mono_metadata_type_equal (fsig->params [0], etype)) { + MonoInst* ins = emit_simd_ins (cfg, klass, type_to_expand_op (etype->type), args [0]->dreg, -1); + ins->inst_c1 = arg0_type; + return ins; + } else if (is_create_from_half_vectors_overload (fsig)) return emit_simd_ins (cfg, klass, OP_XCONCAT, args [0]->dreg, args [1]->dreg); else if (is_elementwise_create_overload (fsig, etype)) - return emit_vector_create_elementwise (cfg, fsig, fsig->ret, etype, args); + return emit_vector_create_elementwise (cfg, fsig, fsig->ret, arg0_type, args); break; } case SN_CreateScalar: { MonoType *etype = get_vector_t_elem_type (fsig->ret); if (!MONO_TYPE_IS_VECTOR_PRIMITIVE (etype)) return NULL; - return emit_simd_ins_for_sig (cfg, klass, OP_CREATE_SCALAR, -1, arg0_type, fsig, args); + if (COMPILE_LLVM (cfg)) + return emit_simd_ins_for_sig (cfg, klass, OP_CREATE_SCALAR, -1, arg0_type, fsig, args); + else { + if (type_enum_is_float (arg0_type)) { + return emit_simd_ins_for_sig (cfg, klass, OP_CREATE_SCALAR_FLOAT, -1, arg0_type, fsig, args); + } else { + return emit_simd_ins_for_sig (cfg, klass, OP_CREATE_SCALAR_INT, -1, arg0_type, fsig, args); + } + } + } case SN_CreateScalarUnsafe: { MonoType *etype = get_vector_t_elem_type (fsig->ret); if (!MONO_TYPE_IS_VECTOR_PRIMITIVE (etype)) return NULL; - return emit_simd_ins_for_sig (cfg, klass, OP_CREATE_SCALAR_UNSAFE, -1, arg0_type, fsig, args); + if (COMPILE_LLVM (cfg)) + return emit_simd_ins_for_sig (cfg, klass, OP_CREATE_SCALAR_UNSAFE, -1, arg0_type, fsig, args); + else { + if (type_enum_is_float (arg0_type)) { + return emit_simd_ins_for_sig (cfg, klass, OP_CREATE_SCALAR_UNSAFE_FLOAT, -1, arg0_type, fsig, args); + } else { + return emit_simd_ins_for_sig (cfg, klass, OP_CREATE_SCALAR_UNSAFE_INT, -1, arg0_type, fsig, args); + } + } } case SN_Dot: { if (!is_element_type_primitive (fsig->params [0]))