Commit 2261e053 authored by Ilja Iskovs, committed by V8 LUCI CQ

[arm64][wasm] Use NEON S/Usra for Wasm SIMD add(shr(x, imm), y)

A single AArch64 SIMD signed/unsigned Shift Right and Accumulate can be
used to implement Wasm SIMD add(shr(x, imm), y). This gives a 1-1.5%
improvement on some compute intensive Wasm benchmarks on Neoverse-N1.

Mla and Adalp optimisations were refactored to match the style of the
added code.

Change-Id: Id5959a31ca267e02b7d60e7ff6f942adb029b41e
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3089157
Reviewed-by: Zhi An Ng <zhin@chromium.org>
Commit-Queue: Martyn Capewell <martyn.capewell@arm.com>
Cr-Commit-Position: refs/heads/master@{#76280}
parent b3b9466a
......@@ -2644,6 +2644,24 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
SIMD_DESTRUCTIVE_BINOP_CASE(kArm64S128Select, Bsl, 16B);
SIMD_BINOP_CASE(kArm64S128AndNot, Bic, 16B);
case kArm64Ssra: {
  // Signed shift right and accumulate: input 1 is shifted right by the
  // immediate in input 2 and accumulated into the output register.
  const int8_t lane_bits = LaneSizeField::decode(opcode);
  const VectorFormat format = VectorFormatFillQ(lane_bits);
  // The immediate is masked with (lane size - 1) before it is encoded.
  const int8_t shift_mask = lane_bits - 1;
  const VRegister acc = i.OutputSimd128Register().Format(format);
  const VRegister shifted = i.InputSimd128Register(1).Format(format);
  // The accumulator operand (input 0) must be the same register as the output.
  DCHECK_EQ(acc, i.InputSimd128Register(0).Format(format));
  __ Ssra(acc, shifted, i.InputInt8(2) & shift_mask);
  break;
}
case kArm64Usra: {
  // Unsigned shift right and accumulate: input 1 is shifted right by the
  // immediate in input 2 and accumulated into the output register.
  const int8_t lane_bits = LaneSizeField::decode(opcode);
  const VectorFormat format = VectorFormatFillQ(lane_bits);
  // The immediate is masked with (lane size - 1) before it is encoded.
  const int8_t shift_mask = lane_bits - 1;
  const VRegister acc = i.OutputSimd128Register().Format(format);
  const VRegister shifted = i.InputSimd128Register(1).Format(format);
  // The accumulator operand (input 0) must be the same register as the output.
  DCHECK_EQ(acc, i.InputSimd128Register(0).Format(format));
  __ Usra(acc, shifted, i.InputUint8(2) & shift_mask);
  break;
}
case kArm64S32x4Shuffle: {
Simd128Register dst = i.OutputSimd128Register().V4S(),
src0 = i.InputSimd128Register(0).V4S(),
......
......@@ -351,6 +351,8 @@ namespace compiler {
V(Arm64S128Not) \
V(Arm64S128Select) \
V(Arm64S128AndNot) \
V(Arm64Ssra) \
V(Arm64Usra) \
V(Arm64S32x4ZipLeft) \
V(Arm64S32x4ZipRight) \
V(Arm64S32x4UnzipLeft) \
......
......@@ -321,6 +321,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArm64S128Not:
case kArm64S128Select:
case kArm64S128AndNot:
case kArm64Ssra:
case kArm64Usra:
case kArm64S32x4ZipLeft:
case kArm64S32x4ZipRight:
case kArm64S32x4UnzipLeft:
......
......@@ -430,6 +430,50 @@ WASM_SIMD_TEST(F32x4Le) {
RunF32x4CompareOpTest(execution_tier, kExprF32x4Le, LessEqual);
}
// Builds a Wasm function taking two scalars (a, b) that splats each into an
// S128 local and computes the shift-then-add pattern twice: once with the
// shift as the first operand of the add (stored to global 0) and once with it
// as the second operand (stored to global 1). Every lane of both globals is
// then checked against AddWithWraparound(a, shift_fn(b, imm)) for all pairs of
// values returned by ValueHelper::GetVector<ScalarType>().
template <typename ScalarType>
void RunShiftAddTestSequence(TestExecutionTier execution_tier,
                             WasmOpcode shiftr_opcode, WasmOpcode add_opcode,
                             WasmOpcode splat_opcode, int32_t imm,
                             ScalarType (*shift_fn)(ScalarType, int32_t)) {
  WasmRunner<int32_t, ScalarType, ScalarType> r(execution_tier);
  // Global 0: add(shr(b, imm), a). Global 1: add(a, shr(b, imm)).
  ScalarType* shift_lhs_lanes =
      r.builder().template AddGlobal<ScalarType>(kWasmS128);
  ScalarType* shift_rhs_lanes =
      r.builder().template AddGlobal<ScalarType>(kWasmS128);
  // Indices of the two scalar parameters of the generated function.
  byte addend_param = 0;
  byte shifted_param = 1;
  // S128 locals holding the splatted parameters.
  byte addend_vec = r.AllocateLocal(kWasmS128);
  byte shifted_vec = r.AllocateLocal(kWasmS128);
  // Scalar reference for a single lane.
  const auto expected_sum = [shift_fn](ScalarType addend, ScalarType shifted,
                                       uint32_t shift) {
    return base::AddWithWraparound(addend, shift_fn(shifted, shift));
  };
  BUILD(r,
        WASM_LOCAL_SET(addend_vec, WASM_SIMD_OPN(splat_opcode,
                                                 WASM_LOCAL_GET(addend_param))),
        WASM_LOCAL_SET(
            shifted_vec,
            WASM_SIMD_OPN(splat_opcode, WASM_LOCAL_GET(shifted_param))),
        WASM_GLOBAL_SET(
            0, WASM_SIMD_BINOP(add_opcode,
                               WASM_SIMD_BINOP(shiftr_opcode,
                                               WASM_LOCAL_GET(shifted_vec),
                                               WASM_I32V(imm)),
                               WASM_LOCAL_GET(addend_vec))),
        WASM_GLOBAL_SET(
            1, WASM_SIMD_BINOP(add_opcode, WASM_LOCAL_GET(addend_vec),
                               WASM_SIMD_BINOP(shiftr_opcode,
                                               WASM_LOCAL_GET(shifted_vec),
                                               WASM_I32V(imm)))),
        WASM_ONE);

  const size_t lane_count = kSimd128Size / sizeof(ScalarType);
  for (ScalarType a : compiler::ValueHelper::GetVector<ScalarType>()) {
    for (ScalarType b : compiler::ValueHelper::GetVector<ScalarType>()) {
      r.Call(a, b);
      ScalarType expected = expected_sum(a, b, imm);
      // Both operand orders must produce the same value in every lane.
      for (size_t lane = 0; lane < lane_count; lane++) {
        CHECK_EQ(expected, LANE(shift_lhs_lanes, lane));
        CHECK_EQ(expected, LANE(shift_rhs_lanes, lane));
      }
    }
  }
}
WASM_SIMD_TEST(I64x2Splat) {
WasmRunner<int32_t, int64_t> r(execution_tier);
// Set up a global to hold output vector.
......@@ -500,6 +544,17 @@ WASM_SIMD_TEST(I64x2ShrU) {
RunI64x2ShiftOpTest(execution_tier, kExprI64x2ShrU, LogicalShiftRight);
}
// Runs the shift-then-add sequence on I64x2 for every immediate from 0 up to
// and including 64, first with the unsigned and then with the signed shift.
WASM_SIMD_TEST(I64x2ShiftAdd) {
  constexpr int kMaxShift = 64;
  for (int shift = 0; shift <= kMaxShift; ++shift) {
    RunShiftAddTestSequence<int64_t>(execution_tier, kExprI64x2ShrU,
                                     kExprI64x2Add, kExprI64x2Splat, shift,
                                     LogicalShiftRight);
    RunShiftAddTestSequence<int64_t>(execution_tier, kExprI64x2ShrS,
                                     kExprI64x2Add, kExprI64x2Splat, shift,
                                     ArithmeticShiftRight);
  }
}
// Runs the I64x2 binary-op test helper for kExprI64x2Add, passing
// base::AddWithWraparound as the scalar function.
WASM_SIMD_TEST(I64x2Add) {
RunI64x2BinOpTest(execution_tier, kExprI64x2Add, base::AddWithWraparound);
}
......@@ -1350,6 +1405,17 @@ WASM_SIMD_TEST(I32x4ShrU) {
RunI32x4ShiftOpTest(execution_tier, kExprI32x4ShrU, LogicalShiftRight);
}
// Runs the shift-then-add sequence on I32x4 for every immediate from 0 up to
// and including 32, first with the unsigned and then with the signed shift.
WASM_SIMD_TEST(I32x4ShiftAdd) {
  constexpr int kMaxShift = 32;
  for (int shift = 0; shift <= kMaxShift; ++shift) {
    RunShiftAddTestSequence<int32_t>(execution_tier, kExprI32x4ShrU,
                                     kExprI32x4Add, kExprI32x4Splat, shift,
                                     LogicalShiftRight);
    RunShiftAddTestSequence<int32_t>(execution_tier, kExprI32x4ShrS,
                                     kExprI32x4Add, kExprI32x4Splat, shift,
                                     ArithmeticShiftRight);
  }
}
// Tests both signed and unsigned conversion from I8x16 (unpacking).
WASM_SIMD_TEST(I16x8ConvertI8x16) {
WasmRunner<int32_t, int32_t> r(execution_tier);
......@@ -1660,6 +1726,17 @@ WASM_SIMD_TEST(I16x8ShrU) {
RunI16x8ShiftOpTest(execution_tier, kExprI16x8ShrU, LogicalShiftRight);
}
// Runs the shift-then-add sequence on I16x8 for every immediate from 0 up to
// and including 16, first with the unsigned and then with the signed shift.
WASM_SIMD_TEST(I16x8ShiftAdd) {
  constexpr int kMaxShift = 16;
  for (int shift = 0; shift <= kMaxShift; ++shift) {
    RunShiftAddTestSequence<int16_t>(execution_tier, kExprI16x8ShrU,
                                     kExprI16x8Add, kExprI16x8Splat, shift,
                                     LogicalShiftRight);
    RunShiftAddTestSequence<int16_t>(execution_tier, kExprI16x8ShrS,
                                     kExprI16x8Add, kExprI16x8Splat, shift,
                                     ArithmeticShiftRight);
  }
}
// Runs the I8x16 unary-op test helper for kExprI8x16Neg, passing
// base::NegateWithWraparound as the scalar function.
WASM_SIMD_TEST(I8x16Neg) {
RunI8x16UnOpTest(execution_tier, kExprI8x16Neg, base::NegateWithWraparound);
}
......@@ -1817,6 +1894,17 @@ WASM_SIMD_TEST(I8x16ShrU) {
RunI8x16ShiftOpTest(execution_tier, kExprI8x16ShrU, LogicalShiftRight);
}
// Runs the shift-then-add sequence on I8x16 for every immediate from 0 up to
// and including 8, first with the unsigned and then with the signed shift.
WASM_SIMD_TEST(I8x16ShiftAdd) {
  constexpr int kMaxShift = 8;
  for (int shift = 0; shift <= kMaxShift; ++shift) {
    RunShiftAddTestSequence<int8_t>(execution_tier, kExprI8x16ShrU,
                                    kExprI8x16Add, kExprI8x16Splat, shift,
                                    LogicalShiftRight);
    RunShiftAddTestSequence<int8_t>(execution_tier, kExprI8x16ShrS,
                                    kExprI8x16Add, kExprI8x16Splat, shift,
                                    ArithmeticShiftRight);
  }
}
// Test Select by making a mask where the 0th and 3rd lanes are true and the
// rest false, and comparing for non-equality with zero to convert to a boolean
// vector.
......
......@@ -894,6 +894,7 @@ inline WasmOpcode LoadStoreOpcodeOf(MachineType type, bool store) {
// Casts a value to a single byte.
#define TO_BYTE(val) static_cast<byte>(val)
// Encode all simd ops as a 2-byte LEB.
#define WASM_SIMD_OP(op) kSimdPrefix, U32V_2(op & 0xff)
// Emits all given operand expressions, in order, followed by the encoded SIMD
// opcode; usable for opcodes with any number of operands.
#define WASM_SIMD_OPN(op, ...) __VA_ARGS__, WASM_SIMD_OP(op)
// Emits the operand expression(s) followed by the kExpr<Type>Splat opcode.
#define WASM_SIMD_SPLAT(Type, ...) __VA_ARGS__, WASM_SIMD_OP(kExpr##Type##Splat)
// Emits one operand followed by the encoded SIMD opcode.
#define WASM_SIMD_UNOP(op, x) x, WASM_SIMD_OP(op)
// Emits two operands, x then y, followed by the encoded SIMD opcode.
#define WASM_SIMD_BINOP(op, x, y) x, y, WASM_SIMD_OP(op)
......
......@@ -2231,6 +2231,101 @@ INSTANTIATE_TEST_SUITE_P(InstructionSelectorTest,
InstructionSelectorSIMDDPWithSIMDMulTest,
::testing::ValuesIn(kSIMDMulDPInstructions));
namespace {
struct SIMDShrAddInst {
const char* shradd_constructor_name;
const Operator* (MachineOperatorBuilder::*shr_s_operator)();
const Operator* (MachineOperatorBuilder::*shr_u_operator)();
const Operator* (MachineOperatorBuilder::*add_operator)();
const int laneSize;
};
std::ostream& operator<<(std::ostream& os, const SIMDShrAddInst& inst) {
return os << inst.shradd_constructor_name;
}
} // namespace
// One entry per SIMD lane shape: display name, signed shift-right operator,
// unsigned shift-right operator, add operator, and the lane size expected in
// the selected instruction's LaneSizeField.
static const SIMDShrAddInst kSIMDShrAddInstructions[] = {
{"I64x2ShrAdd", &MachineOperatorBuilder::I64x2ShrS,
&MachineOperatorBuilder::I64x2ShrU, &MachineOperatorBuilder::I64x2Add, 64},
{"I32x4ShrAdd", &MachineOperatorBuilder::I32x4ShrS,
&MachineOperatorBuilder::I32x4ShrU, &MachineOperatorBuilder::I32x4Add, 32},
{"I16x8ShrAdd", &MachineOperatorBuilder::I16x8ShrS,
&MachineOperatorBuilder::I16x8ShrU, &MachineOperatorBuilder::I16x8Add, 16},
{"I8x16ShrAdd", &MachineOperatorBuilder::I8x16ShrS,
&MachineOperatorBuilder::I8x16ShrU, &MachineOperatorBuilder::I8x16Add, 8}};
// Fixture for the tests parameterized over SIMDShrAddInst.
using InstructionSelectorSIMDShrAddTest =
InstructionSelectorTestWithParam<SIMDShrAddInst>;
// Checks that an add whose operand is a signed shift right by the constant 1
// is selected as a single kArm64Ssra instruction, regardless of which side of
// the add the shift is on.
TEST_P(InstructionSelectorSIMDShrAddTest, ShrAddS) {
  const SIMDShrAddInst param = GetParam();
  const MachineType type = MachineType::Simd128();
  {
    // add(p0, shr_s(p1, 1)): shift on the right-hand side.
    StreamBuilder m(this, type, type, type);
    Node* const shifted = m.AddNode((m.machine()->*param.shr_s_operator)(),
                                    m.Parameter(1), m.Int32Constant(1));
    Node* const sum = m.AddNode((m.machine()->*param.add_operator)(),
                                m.Parameter(0), shifted);
    m.Return(sum);
    Stream stream = m.Build();
    ASSERT_EQ(1U, stream.size());
    EXPECT_EQ(kArm64Ssra, stream[0]->arch_opcode());
    EXPECT_EQ(3U, stream[0]->InputCount());
    EXPECT_EQ(param.laneSize, LaneSizeField::decode(stream[0]->opcode()));
    EXPECT_EQ(1U, stream[0]->OutputCount());
  }
  {
    // add(shr_s(p0, 1), p1): shift on the left-hand side.
    StreamBuilder m(this, type, type, type);
    Node* const shifted = m.AddNode((m.machine()->*param.shr_s_operator)(),
                                    m.Parameter(0), m.Int32Constant(1));
    Node* const sum = m.AddNode((m.machine()->*param.add_operator)(), shifted,
                                m.Parameter(1));
    m.Return(sum);
    Stream stream = m.Build();
    ASSERT_EQ(1U, stream.size());
    EXPECT_EQ(kArm64Ssra, stream[0]->arch_opcode());
    EXPECT_EQ(3U, stream[0]->InputCount());
    EXPECT_EQ(param.laneSize, LaneSizeField::decode(stream[0]->opcode()));
    EXPECT_EQ(1U, stream[0]->OutputCount());
  }
}
// Checks that an add whose operand is an unsigned shift right by the constant
// 1 is selected as a single kArm64Usra instruction, regardless of which side
// of the add the shift is on.
TEST_P(InstructionSelectorSIMDShrAddTest, ShrAddU) {
  const SIMDShrAddInst param = GetParam();
  const MachineType type = MachineType::Simd128();
  {
    // add(p0, shr_u(p1, 1)): shift on the right-hand side.
    StreamBuilder m(this, type, type, type);
    Node* const shifted = m.AddNode((m.machine()->*param.shr_u_operator)(),
                                    m.Parameter(1), m.Int32Constant(1));
    Node* const sum = m.AddNode((m.machine()->*param.add_operator)(),
                                m.Parameter(0), shifted);
    m.Return(sum);
    Stream stream = m.Build();
    ASSERT_EQ(1U, stream.size());
    EXPECT_EQ(kArm64Usra, stream[0]->arch_opcode());
    EXPECT_EQ(3U, stream[0]->InputCount());
    EXPECT_EQ(param.laneSize, LaneSizeField::decode(stream[0]->opcode()));
    EXPECT_EQ(1U, stream[0]->OutputCount());
  }
  {
    // add(shr_u(p0, 1), p1): shift on the left-hand side.
    StreamBuilder m(this, type, type, type);
    Node* const shifted = m.AddNode((m.machine()->*param.shr_u_operator)(),
                                    m.Parameter(0), m.Int32Constant(1));
    Node* const sum = m.AddNode((m.machine()->*param.add_operator)(), shifted,
                                m.Parameter(1));
    m.Return(sum);
    Stream stream = m.Build();
    ASSERT_EQ(1U, stream.size());
    EXPECT_EQ(kArm64Usra, stream[0]->arch_opcode());
    EXPECT_EQ(3U, stream[0]->InputCount());
    EXPECT_EQ(param.laneSize, LaneSizeField::decode(stream[0]->opcode()));
    EXPECT_EQ(1U, stream[0]->OutputCount());
  }
}
// Instantiates the ShrAdd tests once for each entry of
// kSIMDShrAddInstructions.
INSTANTIATE_TEST_SUITE_P(InstructionSelectorTest,
InstructionSelectorSIMDShrAddTest,
::testing::ValuesIn(kSIMDShrAddInstructions));
struct SIMDMulDupInst {
const uint8_t shuffle[16];
int32_t lane;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment