Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Mono] Enable the supported V128 SIMD intrinsics on Arm64 across all codegen engines #84289

Merged
merged 6 commits into from
Apr 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/mono/mono/arch/arm64/arm64-codegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -1111,8 +1111,8 @@ arm_encode_arith_imm (int imm, guint32 *shift)
/* NEON :: extract */
/*
 * arm_neon_extr_opcode: forwards to arm_neon_opcode_3reg, passing (p) and (q)
 * through, a fixed base opcode word OR-ed with (op2) << 22 and (imm4) << 11,
 * and then the (rd), (rn), (rm) register operands in that order.
 */
#define arm_neon_extr_opcode(p, q, op2, imm4, rd, rn, rm) arm_neon_opcode_3reg ((p), (q), 0b00101110000000000000000000000000 | (op2) << 22 | (imm4) << 11, (rd), (rn), (rm))

/*
 * arm_neon_ext_8b / arm_neon_ext_16b: wrappers around arm_neon_extr_opcode
 * that pass VREG_LOW / VREG_FULL as q, 0b00 as op2 and (index) as imm4.
 *
 * NOTE(review): each wrapper is listed twice in this span. The first pair
 * passes (rd) as the last register operand and never uses its rm parameter;
 * the second pair passes (rm) there. This looks like the removed and added
 * lines of a diff hunk (the file header reports 2 additions & 2 deletions),
 * with the second pair being the corrected form -- confirm against the real
 * header before relying on either pair.
 */
#define arm_neon_ext_8b(p, rd, rn, rm, index) arm_neon_extr_opcode ((p), VREG_LOW, 0b00, (index), (rd), (rn), (rd))
#define arm_neon_ext_16b(p, rd, rn, rm, index) arm_neon_extr_opcode ((p), VREG_FULL, 0b00, (index), (rd), (rn), (rd))
#define arm_neon_ext_8b(p, rd, rn, rm, index) arm_neon_extr_opcode ((p), VREG_LOW, 0b00, (index), (rd), (rn), (rm))
#define arm_neon_ext_16b(p, rd, rn, rm, index) arm_neon_extr_opcode ((p), VREG_FULL, 0b00, (index), (rd), (rn), (rm))

/* NEON :: copy */
/*
 * arm_neon_cpy_opcode: forwards to arm_neon_opcode_2reg, passing (p) and (q)
 * through, a fixed base opcode word OR-ed with (op) << 29, (imm5) << 16 and
 * (imm4) << 11, and then the (rd) and (rn) register operands.
 */
#define arm_neon_cpy_opcode(p, q, op, imm5, imm4, rd, rn) arm_neon_opcode_2reg ((p), (q), 0b00001110000000000000010000000000 | (op) << 29 | (imm5) << 16 | (imm4) << 11, (rd), (rn))
Expand Down
10 changes: 10 additions & 0 deletions src/mono/mono/mini/cpu-arm64.mdesc
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,16 @@ expand_i4: dest:x src1:i len:4
expand_i8: dest:x src1:i len:4
expand_r4: dest:x src1:f len:4
expand_r8: dest:x src1:f len:4
insert_i1: dest:x src1:i len:4
insert_i2: dest:x src1:i len:4
insert_i4: dest:x src1:i len:4
insert_i8: dest:x src1:i len:4
insert_r4: dest:x src1:f len:4
insert_r8: dest:x src1:f len:4
create_scalar_int: dest:x src1:i len:8
create_scalar_float: dest:x src1:f len:12
create_scalar_unsafe_int: dest:x src1:i len:4
create_scalar_unsafe_float: dest:x src1:f len:4

generic_class_init: src1:a len:44 clob:c
gc_safe_point: src1:i len:12 clob:c
Expand Down
175 changes: 121 additions & 54 deletions src/mono/mono/mini/mini-arm64.c
Original file line number Diff line number Diff line change
Expand Up @@ -3717,6 +3717,48 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
}
break;
}
/* SIMD that is not table-generated */
/* TODO: once https://github.com/dotnet/runtime/issues/83252 is done,
* move the following two to the codegen table in simd-arm64.h
*/
case OP_ONES_COMPLEMENT:
arm_neon_not (code, get_vector_size_macro (ins), dreg, sreg1);
break;
case OP_NEGATION:
if (is_type_float_macro (ins->inst_c1)) {
arm_neon_fneg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1);
} else {
arm_neon_neg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1);
}
break;
case OP_XBINOP:
switch (ins->inst_c0) {
case OP_IMAX:
code = emit_smax_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
case OP_IMAX_UN:
code = emit_umax_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
case OP_IMIN:
code = emit_smin_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
case OP_IMIN_UN:
code = emit_umin_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
default:
g_assert_not_reached ();
}
break;
case OP_XZERO:
arm_neon_eor_16b (code, dreg, dreg, dreg);
break;
case OP_XONES:
arm_neon_eor_16b (code, dreg, dreg, dreg);
arm_neon_not_16b (code, dreg, dreg);
break;
case OP_XEXTRACT:
code = emit_xextract (code, VREG_FULL, ins->inst_c0, dreg, sreg1);
break;
case OP_STOREX_MEMBASE:
code = emit_strfpq (code, sreg1, dreg, ins->inst_offset);
break;
Expand All @@ -3730,10 +3772,10 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
if (cfg->compile_aot && cfg->code_exec_only) {
mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128_GOT, ins->inst_p0);
arm_ldrx_lit (code, ARMREG_IP0, 0);
arm_ldrfpq (code, ins->dreg, ARMREG_IP0, 0);
arm_ldrfpq (code, dreg, ARMREG_IP0, 0);
} else {
mono_add_patch_info (cfg, offset, MONO_PATCH_INFO_X128, ins->inst_p0);
arm_neon_ldrq_lit (code, ins->dreg, 0);
arm_neon_ldrq_lit (code, dreg, 0);
}
break;
}
Expand All @@ -3744,13 +3786,13 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
case OP_EXPAND_I4:
case OP_EXPAND_I8: {
const int t = get_type_size_macro (ins->inst_c1);
arm_neon_dup_g (code, VREG_FULL, t, ins->dreg, ins->sreg1);
arm_neon_dup_g (code, VREG_FULL, t, dreg, sreg1);
break;
}
case OP_EXPAND_R4:
case OP_EXPAND_R8: {
const int t = get_type_size_macro (ins->inst_c1);
arm_neon_fdup_e (code, VREG_FULL, t, ins->dreg, ins->sreg1, 0);
arm_neon_fdup_e (code, VREG_FULL, t, dreg, sreg1, 0);
break;
}
case OP_EXTRACT_I1:
Expand All @@ -3760,9 +3802,9 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
const int t = get_type_size_macro (ins->inst_c1);
// smov is not defined for i64
if (is_type_unsigned_macro (ins->inst_c1) || t == TYPE_I64) {
arm_neon_umov (code, t, ins->dreg, ins->sreg1, ins->inst_c0);
arm_neon_umov (code, t, dreg, sreg1, ins->inst_c0);
} else {
arm_neon_smov (code, t, ins->dreg, ins->sreg1, ins->inst_c0);
arm_neon_smov (code, t, dreg, sreg1, ins->inst_c0);
}
break;
}
Expand All @@ -3773,17 +3815,39 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
// Technically, this broadcasts element #inst_c0 to all dest XREG elements; whereas it should
// set the FREG to the said element. Since FREG and XREG pool is the same on arm64 and the rest
// of the F/XREG is ignored in FREG mode, this operation remains valid.
arm_neon_fdup_e (code, VREG_FULL, t, ins->dreg, ins->sreg1, ins->inst_c0);
arm_neon_fdup_e (code, VREG_FULL, t, dreg, sreg1, ins->inst_c0);
}
break;
case OP_INSERT_I1:
case OP_INSERT_I2:
case OP_INSERT_I4:
case OP_INSERT_I8: {
const int t = get_type_size_macro (ins->inst_c1);
arm_neon_ins_g(code, t, dreg, sreg1, ins->inst_c0);
break;
}
case OP_INSERT_R4:
case OP_INSERT_R8: {
int t = 0;
switch (ins->inst_c1) {
case MONO_TYPE_R4:
t = SIZE_4;
break;
case MONO_TYPE_R8:
t = SIZE_8;
break;
}
arm_neon_ins_e(code, t, dreg, sreg1, ins->inst_c0, 0);
break;
}
case OP_ARM64_XADDV: {
switch (ins->inst_c0) {
case INTRINS_AARCH64_ADV_SIMD_FADDV:
if (ins->inst_c1 == MONO_TYPE_R8) {
arm_neon_faddp (code, VREG_FULL, TYPE_F64, ins->dreg, ins->sreg1, ins->sreg1);
arm_neon_faddp (code, VREG_FULL, TYPE_F64, dreg, sreg1, sreg1);
} else if (ins->inst_c1 == MONO_TYPE_R4) {
arm_neon_faddp (code, VREG_FULL, TYPE_F32, ins->dreg, ins->sreg1, ins->sreg1);
arm_neon_faddp (code, VREG_FULL, TYPE_F32, ins->dreg, ins->dreg, ins->dreg);
arm_neon_faddp (code, VREG_FULL, TYPE_F32, dreg, sreg1, sreg1);
arm_neon_faddp (code, VREG_FULL, TYPE_F32, dreg, dreg, dreg);
} else {
g_assert_not_reached ();
}
Expand All @@ -3792,7 +3856,7 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
case INTRINS_AARCH64_ADV_SIMD_UADDV:
case INTRINS_AARCH64_ADV_SIMD_SADDV:
if (get_type_size_macro (ins->inst_c1) == TYPE_I64)
arm_neon_addp (code, VREG_FULL, TYPE_I64, ins->dreg, ins->sreg1, ins->sreg1);
arm_neon_addp (code, VREG_FULL, TYPE_I64, dreg, sreg1, sreg1);
else
g_assert_not_reached (); // remaining int types are handled through the codegen table
break;
Expand All @@ -3802,6 +3866,52 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
}
break;
}
case OP_CREATE_SCALAR_INT: {
const int t = get_type_size_macro (ins->inst_c1);
arm_neon_eor_16b (code, dreg, dreg, dreg);
arm_neon_ins_g(code, t, dreg, sreg1, 0);
break;
}
case OP_CREATE_SCALAR_FLOAT: {
int t = 0;
switch (ins->inst_c1) {
case MONO_TYPE_R4:
t = SIZE_4;
break;
case MONO_TYPE_R8:
t = SIZE_8;
break;
}
// Use a temp register for zero op, as sreg1 and dreg share the same register here
arm_neon_eor_16b (code, NEON_TMP_REG, NEON_TMP_REG, NEON_TMP_REG);
fanyang-mono marked this conversation as resolved.
Show resolved Hide resolved
arm_neon_ins_e(code, t, NEON_TMP_REG, sreg1, 0, 0);
arm_neon_mov (code, dreg, NEON_TMP_REG);
break;
}
case OP_CREATE_SCALAR_UNSAFE_INT: {
const int t = get_type_size_macro (ins->inst_c1);
arm_neon_ins_g(code, t, dreg, sreg1, 0);
break;
}
case OP_CREATE_SCALAR_UNSAFE_FLOAT: {
if (dreg != sreg1) {
int t = 0;
switch (ins->inst_c1) {
fanyang-mono marked this conversation as resolved.
Show resolved Hide resolved
case MONO_TYPE_R4:
t = SIZE_4;
break;
case MONO_TYPE_R8:
t = SIZE_8;
break;
}
arm_neon_ins_e(code, t, dreg, sreg1, 0, 0);
}
break;
}
// Enable this when adding support for Narrow and enable support for Create at the same time
// case OP_XCONCAT:
// arm_neon_ext_16b(code, dreg, sreg1, sreg2, 8);
// break;

/* BRANCH */
case OP_BR:
Expand Down Expand Up @@ -3875,49 +3985,6 @@ mono_arch_output_basic_block (MonoCompile *cfg, MonoBasicBlock *bb)
arm_cbnzx (code, sreg1, 0);
break;

/* SIMD that is not table-generated */
/* TODO: once https://github.com/dotnet/runtime/issues/83252 is done,
* move the following two to the codegen table in simd-arm64.h
*/
case OP_ONES_COMPLEMENT:
arm_neon_not (code, get_vector_size_macro (ins), dreg, sreg1);
break;
case OP_NEGATION:
if (is_type_float_macro (ins->inst_c1)) {
arm_neon_fneg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1);
} else {
arm_neon_neg (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1);
}
break;
case OP_XBINOP:
switch (ins->inst_c0) {
case OP_IMAX:
code = emit_smax_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
case OP_IMAX_UN:
code = emit_umax_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
case OP_IMIN:
code = emit_smin_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
case OP_IMIN_UN:
code = emit_umin_i8 (code, get_vector_size_macro (ins), get_type_size_macro (ins->inst_c1), dreg, sreg1, sreg2);
break;
default:
g_assert_not_reached ();
}
break;
case OP_XZERO:
arm_neon_eor_16b (code, dreg, dreg, dreg);
break;
case OP_XONES:
arm_neon_eor_16b (code, dreg, dreg, dreg);
arm_neon_not_16b (code, dreg, dreg);
break;
case OP_XEXTRACT:
code = emit_xextract (code, VREG_FULL, ins->inst_c0, dreg, sreg1);
break;

/* ALU */
case OP_IADD:
arm_addw (code, dreg, sreg1, sreg2);
Expand Down
5 changes: 5 additions & 0 deletions src/mono/mono/mini/mini-ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -1168,6 +1168,11 @@ MINI_OP3(OP_MULX_HL64, "mulxhl64", LREG, LREG, LREG, LREG)
MINI_OP(OP_CREATE_SCALAR_UNSAFE, "create_scalar_unsafe", XREG, XREG, NONE)
MINI_OP(OP_CREATE_SCALAR, "create_scalar", XREG, XREG, NONE)

/*
 * Scalar-to-vector creation opcodes, split by the register class of the
 * scalar operand: the *_INT entries list IREG and the *_FLOAT entries list
 * FREG in the slot after XREG.
 * NOTE(review): the three trailing MINI_OP arguments are presumably the
 * dest / src1 / src2 register classes -- confirm against the MINI_OP
 * definition used by the includer.
 */
MINI_OP(OP_CREATE_SCALAR_UNSAFE_INT, "create_scalar_unsafe_int", XREG, IREG, NONE)
MINI_OP(OP_CREATE_SCALAR_UNSAFE_FLOAT, "create_scalar_unsafe_float", XREG, FREG, NONE)
MINI_OP(OP_CREATE_SCALAR_INT, "create_scalar_int", XREG, IREG, NONE)
MINI_OP(OP_CREATE_SCALAR_FLOAT, "create_scalar_float", XREG, FREG, NONE)

MINI_OP(OP_XMOVE, "xmove", XREG, XREG, NONE)
MINI_OP(OP_XZERO, "xzero", XREG, NONE, NONE)
MINI_OP(OP_XONES, "xones", XREG, NONE, NONE)
Expand Down
Loading