Skip to content

Commit

Permalink
[Mono] Enable the supported V128 SIMD intrinsics on Arm64 across all …
Browse files Browse the repository at this point in the history
…codegen engines (#84289)

* Enable the supported ones

* Add supporte for Create* and fix a bug

* Fix CreateScalar for floating types

* Fix create*

* Address review feedback
  • Loading branch information
fanyang-mono authored Apr 12, 2023
1 parent e3c3700 commit 1d3c822
Show file tree
Hide file tree
Showing 5 changed files with 194 additions and 128 deletions.
4 changes: 2 additions & 2 deletions src/mono/mono/arch/arm64/arm64-codegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -1111,8 +1111,8 @@ arm_encode_arith_imm (int imm, guint32 *shift)
/* NEON :: extract */
#define arm_neon_extr_opcode(p, q, op2, imm4, rd, rn, rm) arm_neon_opcode_3reg ((p), (q), 0b00101110000000000000000000000000 | (op2) << 22 | (imm4) << 11, (rd), (rn), (rm))

#define arm_neon_ext_8b(p, rd, rn, rm, index) arm_neon_extr_opcode ((p), VREG_LOW, 0b00, (index), (rd), (rn), (rd))
#define arm_neon_ext_16b(p, rd, rn, rm, index) arm_neon_extr_opcode ((p), VREG_FULL, 0b00, (index), (rd), (rn), (rd))
#define arm_neon_ext_8b(p, rd, rn, rm, index) arm_neon_extr_opcode ((p), VREG_LOW, 0b00, (index), (rd), (rn), (rm))
#define arm_neon_ext_16b(p, rd, rn, rm, index) arm_neon_extr_opcode ((p), VREG_FULL, 0b00, (index), (rd), (rn), (rm))

/* NEON :: copy */
#define arm_neon_cpy_opcode(p, q, op, imm5, imm4, rd, rn) arm_neon_opcode_2reg ((p), (q), 0b00001110000000000000010000000000 | (op) << 29 | (imm5) << 16 | (imm4) << 11, (rd), (rn))
Expand Down
10 changes: 10 additions & 0 deletions src/mono/mono/mini/cpu-arm64.mdesc
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,16 @@ expand_i4: dest:x src1:i len:4
expand_i8: dest:x src1:i len:4
expand_r4: dest:x src1:f len:4
expand_r8: dest:x src1:f len:4
insert_i1: dest:x src1:i len:4
insert_i2: dest:x src1:i len:4
insert_i4: dest:x src1:i len:4
insert_i8: dest:x src1:i len:4
insert_r4: dest:x src1:f len:4
insert_r8: dest:x src1:f len:4
create_scalar_int: dest:x src1:i len:8
create_scalar_float: dest:x src1:f len:12
create_scalar_unsafe_int: dest:x src1:i len:4
create_scalar_unsafe_float: dest:x src1:f len:4

generic_class_init: src1:a len:44 clob:c
gc_safe_point: src1:i len:12 clob:c
Expand Down
175 changes: 121 additions & 54 deletions src/mono/mono/mini/mini-arm64.c
Original file line number Diff line number Diff line change
Expand Up @@ -3717,6 +3717,48 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
}
break;
}
/* SIMD that is not table-generated */
/* TODO: once https://github.com/dotnet/runtime/issues/83252 is done,
* move the following two to the codegen table in simd-arm64.h
*/
case OP_ONES_COMPLEMENT:
arm_neon_not (code, get_vector_size_macro (ins), dreg, sreg1);
break;
case OP_NEGATION:
if (is_type_float_macro (ins->inst_c1)) {
arm_neon_fneg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1);
} else {
arm_neon_neg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1);
}
break;
case OP_XBINOP:
switch (ins->inst_c0) {
case OP_IMAX:
code = emit_smax_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
case OP_IMAX_UN:
code = emit_umax_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
case OP_IMIN:
code = emit_smin_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
case OP_IMIN_UN:
code = emit_umin_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
default:
g_assert_not_reached ();
}
break;
case OP_XZERO:
arm_neon_eor_16b (code, dreg, dreg, dreg);
break;
case OP_XONES:
arm_neon_eor_16b (code, dreg, dreg, dreg);
arm_neon_not_16b (code, dreg, dreg);
break;
case OP_XEXTRACT:
code = emit_xextract (code, VREG_FULL, ins->inst_c0, dreg, sreg1);
break;
case OP_STOREX_MEMBASE:
code = emit_strfpq (code, sreg1, dreg, ins->inst_offset);
break;
Expand All @@ -3730,10 +3772,10 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
if (cfg->compile_aot && cfg->code_exec_only) {
mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128_GOT, ins->inst_p0);
arm_ldrx_lit (code, ARMREG_IP0, 0);
arm_ldrfpq (code, ins->dreg, ARMREG_IP0, 0);
arm_ldrfpq (code, dreg, ARMREG_IP0, 0);
} else {
mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128, ins->inst_p0);
arm_neon_ldrq_lit (code, ins->dreg, 0);
arm_neon_ldrq_lit (code, dreg, 0);
}
break;
}
Expand All @@ -3744,13 +3786,13 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
case OP_EXPAND_I4:
case OP_EXPAND_I8: {
const int t = get_type_size_macro (ins->inst_c1);
arm_neon_dup_g (code, VREG_FULL, t, ins->dreg, ins->sreg1);
arm_neon_dup_g (code, VREG_FULL, t, dreg, sreg1);
break;
}
case OP_EXPAND_R4:
case OP_EXPAND_R8: {
const int t = get_type_size_macro (ins->inst_c1);
arm_neon_fdup_e (code, VREG_FULL, t, ins->dreg, ins->sreg1, 0);
arm_neon_fdup_e (code, VREG_FULL, t, dreg, sreg1, 0);
break;
}
case OP_EXTRACT_I1:
Expand All @@ -3760,9 +3802,9 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
const int t = get_type_size_macro (ins->inst_c1);
// smov is not defined for i64
if (is_type_unsigned_macro (ins->inst_c1) || t == TYPE_I64) {
arm_neon_umov (code, t, ins->dreg, ins->sreg1, ins->inst_c0);
arm_neon_umov (code, t, dreg, sreg1, ins->inst_c0);
} else {
arm_neon_smov (code, t, ins->dreg, ins->sreg1, ins->inst_c0);
arm_neon_smov (code, t, dreg, sreg1, ins->inst_c0);
}
break;
}
Expand All @@ -3773,17 +3815,39 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
// Technically, this broadcasts element #inst_c0 to all dest XREG elements; whereas it should
// set the FREG to the said element. Since FREG and XREG pool is the same on arm64 and the rest
// of the F/XREG is ignored in FREG mode, this operation remains valid.
arm_neon_fdup_e (code, VREG_FULL, t, ins->dreg, ins->sreg1, ins->inst_c0);
arm_neon_fdup_e (code, VREG_FULL, t, dreg, sreg1, ins->inst_c0);
}
break;
case OP_INSERT_I1:
case OP_INSERT_I2:
case OP_INSERT_I4:
case OP_INSERT_I8: {
const int t = get_type_size_macro (ins->inst_c1);
arm_neon_ins_g(code, t, dreg, sreg1, ins->inst_c0);
break;
}
case OP_INSERT_R4:
case OP_INSERT_R8: {
int t = 0;
switch (ins->inst_c1) {
case MONO_TYPE_R4:
t = SIZE_4;
break;
case MONO_TYPE_R8:
t = SIZE_8;
break;
}
arm_neon_ins_e(code, t, dreg, sreg1, ins->inst_c0, 0);
break;
}
case OP_ARM64_XADDV: {
switch (ins->inst_c0) {
case INTRINS_AARCH64_ADV_SIMD_FADDV:
if (ins->inst_c1 == MONO_TYPE_R8) {
arm_neon_faddp (code, VREG_FULL, TYPE_F64, ins->dreg, ins->sreg1, ins->sreg1);
arm_neon_faddp (code, VREG_FULL, TYPE_F64, dreg, sreg1, sreg1);
} else if (ins->inst_c1 == MONO_TYPE_R4) {
arm_neon_faddp (code, VREG_FULL, TYPE_F32, ins->dreg, ins->sreg1, ins->sreg1);
arm_neon_faddp (code, VREG_FULL, TYPE_F32, ins->dreg, ins->dreg, ins->dreg);
arm_neon_faddp (code, VREG_FULL, TYPE_F32, dreg, sreg1, sreg1);
arm_neon_faddp (code, VREG_FULL, TYPE_F32, dreg, dreg, dreg);
} else {
g_assert_not_reached ();
}
Expand All @@ -3792,7 +3856,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
case INTRINS_AARCH64_ADV_SIMD_UADDV:
case INTRINS_AARCH64_ADV_SIMD_SADDV:
if (get_type_size_macro (ins->inst_c1) == TYPE_I64)
arm_neon_addp (code, VREG_FULL, TYPE_I64, ins->dreg, ins->sreg1, ins->sreg1);
arm_neon_addp (code, VREG_FULL, TYPE_I64, dreg, sreg1, sreg1);
else
g_assert_not_reached (); // remaining int types are handled through the codegen table
break;
Expand All @@ -3802,6 +3866,52 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
}
break;
}
case OP_CREATE_SCALAR_INT: {
const int t = get_type_size_macro (ins->inst_c1);
arm_neon_eor_16b (code, dreg, dreg, dreg);
arm_neon_ins_g(code, t, dreg, sreg1, 0);
break;
}
case OP_CREATE_SCALAR_FLOAT: {
int t = 0;
switch (ins->inst_c1) {
case MONO_TYPE_R4:
t = SIZE_4;
break;
case MONO_TYPE_R8:
t = SIZE_8;
break;
}
// Use a temp register for zero op, as sreg1 and dreg share the same register here
arm_neon_eor_16b (code, NEON_TMP_REG, NEON_TMP_REG, NEON_TMP_REG);
arm_neon_ins_e(code, t, NEON_TMP_REG, sreg1, 0, 0);
arm_neon_mov (code, dreg, NEON_TMP_REG);
break;
}
case OP_CREATE_SCALAR_UNSAFE_INT: {
const int t = get_type_size_macro (ins->inst_c1);
arm_neon_ins_g(code, t, dreg, sreg1, 0);
break;
}
case OP_CREATE_SCALAR_UNSAFE_FLOAT: {
if (dreg != sreg1) {
int t = 0;
switch (ins->inst_c1) {
case MONO_TYPE_R4:
t = SIZE_4;
break;
case MONO_TYPE_R8:
t = SIZE_8;
break;
}
arm_neon_ins_e(code, t, dreg, sreg1, 0, 0);
}
break;
}
// Enable this when adding support for Narrow and enable support for Create at the same time
// case OP_XCONCAT:
// arm_neon_ext_16b(code, dreg, sreg1, sreg2, 8);
// break;

/* BRANCH */
case OP_BR:
Expand Down Expand Up @@ -3875,49 +3985,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
arm_cbnzx (code, sreg1, 0);
break;

/* SIMD that is not table-generated */
/* TODO: once https://github.com/dotnet/runtime/issues/83252 is done,
* move the following two to the codegen table in simd-arm64.h
*/
case OP_ONES_COMPLEMENT:
arm_neon_not (code, get_vector_size_macro (ins), dreg, sreg1);
break;
case OP_NEGATION:
if (is_type_float_macro (ins->inst_c1)) {
arm_neon_fneg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1);
} else {
arm_neon_neg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1);
}
break;
case OP_XBINOP:
switch (ins->inst_c0) {
case OP_IMAX:
code = emit_smax_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
case OP_IMAX_UN:
code = emit_umax_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
case OP_IMIN:
code = emit_smin_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
case OP_IMIN_UN:
code = emit_umin_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
default:
g_assert_not_reached ();
}
break;
case OP_XZERO:
arm_neon_eor_16b (code, dreg, dreg, dreg);
break;
case OP_XONES:
arm_neon_eor_16b (code, dreg, dreg, dreg);
arm_neon_not_16b (code, dreg, dreg);
break;
case OP_XEXTRACT:
code = emit_xextract (code, VREG_FULL, ins->inst_c0, dreg, sreg1);
break;

/* ALU */
case OP_IADD:
arm_addw (code, dreg, sreg1, sreg2);
Expand Down
5 changes: 5 additions & 0 deletions src/mono/mono/mini/mini-ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -1168,6 +1168,11 @@ MINI_OP3(OP_MULX_HL64, "mulxhl64", LREG, LREG, LREG, LREG)
MINI_OP(OP_CREATE_SCALAR_UNSAFE, "create_scalar_unsafe", XREG, XREG, NONE)
MINI_OP(OP_CREATE_SCALAR, "create_scalar", XREG, XREG, NONE)

MINI_OP(OP_CREATE_SCALAR_UNSAFE_INT, "create_scalar_unsafe_int", XREG, IREG, NONE)
MINI_OP(OP_CREATE_SCALAR_UNSAFE_FLOAT, "create_scalar_unsafe_float", XREG, FREG, NONE)
MINI_OP(OP_CREATE_SCALAR_INT, "create_scalar_int", XREG, IREG, NONE)
MINI_OP(OP_CREATE_SCALAR_FLOAT, "create_scalar_float", XREG, FREG, NONE)

MINI_OP(OP_XMOVE, "xmove", XREG, XREG, NONE)
MINI_OP(OP_XZERO, "xzero", XREG, NONE, NONE)
MINI_OP(OP_XONES, "xones", XREG, NONE, NONE)
Expand Down
Loading

0 comments on commit 1d3c822

Please sign in to comment.