Commit 2261e053 authored by Ilja Iskovs, committed by V8 LUCI CQ

[arm64][wasm] Use NEON S/Usra for Wasm SIMD add(shr(x, imm), y)

A single AArch64 SIMD signed/unsigned Shift Right and Accumulate can be
used to implement Wasm SIMD add(shr(x, imm), y). This gives a 1-1.5%
improvement on some compute intensive Wasm benchmarks on Neoverse-N1.

Mla and Adalp optimisations were refactored to match the style of the
added code.

Change-Id: Id5959a31ca267e02b7d60e7ff6f942adb029b41e
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3089157
Reviewed-by: Zhi An Ng <zhin@chromium.org>
Commit-Queue: Martyn Capewell <martyn.capewell@arm.com>
Cr-Commit-Position: refs/heads/master@{#76280}
parent b3b9466a
......@@ -2644,6 +2644,24 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
SIMD_DESTRUCTIVE_BINOP_CASE(kArm64S128Select, Bsl, 16B);
SIMD_BINOP_CASE(kArm64S128AndNot, Bic, 16B);
case kArm64Ssra: {
  // Signed shift right and accumulate. The output register must be the same
  // register as input 0; input 1 is the value shifted by the immediate in
  // input 2.
  const int8_t lane_bits = LaneSizeField::decode(opcode);
  const VectorFormat format = VectorFormatFillQ(lane_bits);
  VRegister acc = i.OutputSimd128Register().Format(format);
  DCHECK_EQ(acc, i.InputSimd128Register(0).Format(format));
  // The immediate is masked with (lane_bits - 1) before being encoded.
  const int8_t shift_mask = lane_bits - 1;
  __ Ssra(acc, i.InputSimd128Register(1).Format(format),
          i.InputInt8(2) & shift_mask);
  break;
}
case kArm64Usra: {
  // Unsigned shift right and accumulate. The output register must be the same
  // register as input 0; input 1 is the value shifted by the immediate in
  // input 2.
  const int8_t lane_bits = LaneSizeField::decode(opcode);
  const VectorFormat format = VectorFormatFillQ(lane_bits);
  VRegister acc = i.OutputSimd128Register().Format(format);
  DCHECK_EQ(acc, i.InputSimd128Register(0).Format(format));
  // The immediate is masked with (lane_bits - 1) before being encoded.
  const int8_t shift_mask = lane_bits - 1;
  __ Usra(acc, i.InputSimd128Register(1).Format(format),
          i.InputUint8(2) & shift_mask);
  break;
}
case kArm64S32x4Shuffle: {
Simd128Register dst = i.OutputSimd128Register().V4S(),
src0 = i.InputSimd128Register(0).V4S(),
......
......@@ -351,6 +351,8 @@ namespace compiler {
V(Arm64S128Not) \
V(Arm64S128Select) \
V(Arm64S128AndNot) \
V(Arm64Ssra) \
V(Arm64Usra) \
V(Arm64S32x4ZipLeft) \
V(Arm64S32x4ZipRight) \
V(Arm64S32x4UnzipLeft) \
......
......@@ -321,6 +321,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArm64S128Not:
case kArm64S128Select:
case kArm64S128AndNot:
case kArm64Ssra:
case kArm64Usra:
case kArm64S32x4ZipLeft:
case kArm64S32x4ZipRight:
case kArm64S32x4UnzipLeft:
......
......@@ -3483,7 +3483,6 @@ void InstructionSelector::VisitInt64AbsWithOverflow(Node* node) {
V(F32x4Ne, kArm64F32x4Ne) \
V(F32x4Lt, kArm64F32x4Lt) \
V(F32x4Le, kArm64F32x4Le) \
V(I64x2Add, kArm64I64x2Add) \
V(I64x2Sub, kArm64I64x2Sub) \
V(I64x2Eq, kArm64I64x2Eq) \
V(I64x2Ne, kArm64I64x2Ne) \
......@@ -3520,7 +3519,6 @@ void InstructionSelector::VisitInt64AbsWithOverflow(Node* node) {
V(I16x8GeU, kArm64I16x8GeU) \
V(I16x8RoundingAverageU, kArm64I16x8RoundingAverageU) \
V(I16x8Q15MulRSatS, kArm64I16x8Q15MulRSatS) \
V(I8x16Add, kArm64I8x16Add) \
V(I8x16Sub, kArm64I8x16Sub) \
V(I8x16SConvertI16x8, kArm64I8x16SConvertI16x8) \
V(I8x16AddSatS, kArm64I8x16AddSatS) \
......@@ -3707,58 +3705,110 @@ void InstructionSelector::VisitI64x2Mul(Node* node) {
arraysize(temps), temps);
}
namespace {
// Tries to select code for Add(x, Shr(y, imm)) where the shift has opcode
// |shift_op| and a constant shift amount.
//
// Returns false, emitting nothing, when neither add input is a |shift_op|
// node, when the chosen shift cannot be covered by |node|, or when its shift
// amount is not an integer constant. Otherwise emits one instruction and
// returns true:
//   - |add_code| on (y, x) when the amount is a multiple of |lane_size|;
//   - |shra_code| tagged with |lane_size| otherwise, with x as the first
//     (same-as-output) input, then y and the immediate.
// If both add inputs are |shift_op| nodes, only the right-hand one is tried.
bool ShraHelper(InstructionSelector* selector, Node* node, int lane_size,
                InstructionCode shra_code, InstructionCode add_code,
                IrOpcode::Value shift_op) {
  Arm64OperandGenerator g(selector);
  Node* const in0 = node->InputAt(0);
  Node* const in1 = node->InputAt(1);

  // Pick which input plays the role of the shift and which is the addend.
  Node* shift = nullptr;
  Node* addend = nullptr;
  if (in1->opcode() == shift_op) {
    shift = in1;
    addend = in0;
  } else if (in0->opcode() == shift_op) {
    shift = in0;
    addend = in1;
  } else {
    return false;
  }

  if (!selector->CanCover(node, shift)) return false;
  if (!g.IsIntegerConstant(shift->InputAt(1))) return false;

  if (g.GetIntegerConstantValue(shift->InputAt(1)) % lane_size != 0) {
    selector->Emit(shra_code | LaneSizeField::encode(lane_size),
                   g.DefineSameAsFirst(node), g.UseRegister(addend),
                   g.UseRegister(shift->InputAt(0)),
                   g.UseImmediate(shift->InputAt(1)));
  } else {
    // Effectively a shift by zero: a plain addition is enough.
    selector->Emit(add_code, g.DefineAsRegister(node),
                   g.UseRegister(shift->InputAt(0)), g.UseRegister(addend));
  }
  return true;
}
// Tries to select |adalp_code| for Add(x, ExtAddPairwise(y)) where the
// pairwise node has opcode |ext_op|.
//
// Returns true after emitting |adalp_code| tagged with |lane_size|, with x as
// the first (same-as-output) input and y as the second. Returns false,
// emitting nothing, when neither add input is an |ext_op| node or the chosen
// one cannot be covered by |node|. If both add inputs are |ext_op| nodes, only
// the right-hand one is tried.
bool AdalpHelper(InstructionSelector* selector, Node* node, int lane_size,
                 InstructionCode adalp_code, IrOpcode::Value ext_op) {
  Arm64OperandGenerator g(selector);
  Node* const in0 = node->InputAt(0);
  Node* const in1 = node->InputAt(1);

  // Pick which input is the pairwise extension and which is the accumulator.
  Node* ext = nullptr;
  Node* acc = nullptr;
  if (in1->opcode() == ext_op) {
    ext = in1;
    acc = in0;
  } else if (in0->opcode() == ext_op) {
    ext = in0;
    acc = in1;
  } else {
    return false;
  }

  if (!selector->CanCover(node, ext)) return false;

  selector->Emit(adalp_code | LaneSizeField::encode(lane_size),
                 g.DefineSameAsFirst(node), g.UseRegister(acc),
                 g.UseRegister(ext->InputAt(0)));
  return true;
}
// Tries to select |mla_code| for Add(z, Mul(x, y)) where the multiply has
// opcode |mul_op|.
//
// Returns true after emitting |mla_code| with z as the first (same-as-output)
// input followed by the two multiply inputs. Returns false, emitting nothing,
// when neither add input is a |mul_op| node or the chosen one cannot be
// covered by |node|. If both add inputs are |mul_op| nodes, only the
// right-hand one is tried.
bool MlaHelper(InstructionSelector* selector, Node* node,
               InstructionCode mla_code, IrOpcode::Value mul_op) {
  Arm64OperandGenerator g(selector);
  Node* const in0 = node->InputAt(0);
  Node* const in1 = node->InputAt(1);

  // Pick which input is the multiply and which is the accumulator.
  Node* mul = nullptr;
  Node* acc = nullptr;
  if (in1->opcode() == mul_op) {
    mul = in1;
    acc = in0;
  } else if (in0->opcode() == mul_op) {
    mul = in0;
    acc = in1;
  } else {
    return false;
  }

  if (!selector->CanCover(node, mul)) return false;

  selector->Emit(mla_code, g.DefineSameAsFirst(node), g.UseRegister(acc),
                 g.UseRegister(mul->InputAt(0)),
                 g.UseRegister(mul->InputAt(1)));
  return true;
}
} // namespace
// Selects code for I64x2Add: first tries to fold a signed, then an unsigned,
// right-shift operand via ShraHelper; falls back to a plain three-register
// add when neither helper handles the node.
void InstructionSelector::VisitI64x2Add(Node* node) {
  if (ShraHelper(this, node, 64, kArm64Ssra, kArm64I64x2Add,
                 IrOpcode::kI64x2ShrS)) {
    return;
  }
  if (ShraHelper(this, node, 64, kArm64Usra, kArm64I64x2Add,
                 IrOpcode::kI64x2ShrU)) {
    return;
  }
  VisitRRR(this, kArm64I64x2Add, node);
}
// Selects code for I8x16Add: first tries to fold a signed, then an unsigned,
// right-shift operand via ShraHelper; falls back to a plain three-register
// add when neither helper handles the node.
void InstructionSelector::VisitI8x16Add(Node* node) {
  if (ShraHelper(this, node, 8, kArm64Ssra, kArm64I8x16Add,
                 IrOpcode::kI8x16ShrS)) {
    return;
  }
  if (ShraHelper(this, node, 8, kArm64Usra, kArm64I8x16Add,
                 IrOpcode::kI8x16ShrU)) {
    return;
  }
  VisitRRR(this, kArm64I8x16Add, node);
}
#define VISIT_SIMD_ADD(Type, PairwiseType, LaneSize) \
void InstructionSelector::Visit##Type##Add(Node* node) { \
Arm64OperandGenerator g(this); \
Node* left = node->InputAt(0); \
Node* right = node->InputAt(1); \
/* Select Mla(z, x, y) for Add(Mul(x, y), z). */ \
if (left->opcode() == IrOpcode::k##Type##Mul && CanCover(node, left)) { \
Emit(kArm64##Type##Mla, g.DefineSameAsFirst(node), g.UseRegister(right), \
g.UseRegister(left->InputAt(0)), g.UseRegister(left->InputAt(1))); \
return; \
} \
/* Select Mla(z, x, y) for Add(z, Mul(x, y)). */ \
if (right->opcode() == IrOpcode::k##Type##Mul && CanCover(node, right)) { \
Emit(kArm64##Type##Mla, g.DefineSameAsFirst(node), g.UseRegister(left), \
g.UseRegister(right->InputAt(0)), \
g.UseRegister(right->InputAt(1))); \
return; \
} \
/* Select Sadalp(x, y) for Add(x, ExtAddPairwiseS(y)). */ \
if (right->opcode() == \
IrOpcode::k##Type##ExtAddPairwise##PairwiseType##S && \
CanCover(node, right)) { \
Emit(kArm64Sadalp | LaneSizeField::encode(LaneSize), \
g.DefineSameAsFirst(node), g.UseRegister(left), \
g.UseRegister(right->InputAt(0))); \
return; \
} \
/* Select Sadalp(y, x) for Add(ExtAddPairwiseS(x), y). */ \
if (left->opcode() == \
IrOpcode::k##Type##ExtAddPairwise##PairwiseType##S && \
CanCover(node, left)) { \
Emit(kArm64Sadalp | LaneSizeField::encode(LaneSize), \
g.DefineSameAsFirst(node), g.UseRegister(right), \
g.UseRegister(left->InputAt(0))); \
/* Select Mla(z, x, y) for Add(x, Mul(y, z)). */ \
if (MlaHelper(this, node, kArm64##Type##Mla, IrOpcode::k##Type##Mul)) { \
return; \
} \
/* Select Uadalp(x, y) for Add(x, ExtAddPairwiseU(y)). */ \
if (right->opcode() == \
IrOpcode::k##Type##ExtAddPairwise##PairwiseType##U && \
CanCover(node, right)) { \
Emit(kArm64Uadalp | LaneSizeField::encode(LaneSize), \
g.DefineSameAsFirst(node), g.UseRegister(left), \
g.UseRegister(right->InputAt(0))); \
/* Select S/Uadalp(x, y) for Add(x, ExtAddPairwise(y)). */ \
if (AdalpHelper(this, node, LaneSize, kArm64Sadalp, \
IrOpcode::k##Type##ExtAddPairwise##PairwiseType##S) || \
AdalpHelper(this, node, LaneSize, kArm64Uadalp, \
IrOpcode::k##Type##ExtAddPairwise##PairwiseType##U)) { \
return; \
} \
/* Select Uadalp(y, x) for Add(ExtAddPairwiseU(x), y). */ \
if (left->opcode() == \
IrOpcode::k##Type##ExtAddPairwise##PairwiseType##U && \
CanCover(node, left)) { \
Emit(kArm64Uadalp | LaneSizeField::encode(LaneSize), \
g.DefineSameAsFirst(node), g.UseRegister(right), \
g.UseRegister(left->InputAt(0))); \
/* Select S/Usra(x, y) for Add(x, ShiftRight(y, imm)). */ \
if (ShraHelper(this, node, LaneSize, kArm64Ssra, kArm64##Type##Add, \
IrOpcode::k##Type##ShrS) || \
ShraHelper(this, node, LaneSize, kArm64Usra, kArm64##Type##Add, \
IrOpcode::k##Type##ShrU)) { \
return; \
} \
VisitRRR(this, kArm64##Type##Add, node); \
......
......@@ -430,6 +430,50 @@ WASM_SIMD_TEST(F32x4Le) {
RunF32x4CompareOpTest(execution_tier, kExprF32x4Le, LessEqual);
}
// Checks add(x, shr(y, imm)) in both operand orders against a scalar
// reference.
//
// The generated function takes two scalars, splats each into an S128 local
// using |splat_opcode|, and stores
//   global 0 = add(shr(splat(y), imm), splat(x))
//   global 1 = add(splat(x), shr(splat(y), imm))
// For every (x, y) pair drawn from ValueHelper::GetVector<ScalarType>(), each
// lane of both globals must equal AddWithWraparound(x, shift_fn(y, imm)).
template <typename ScalarType>
void RunShiftAddTestSequence(TestExecutionTier execution_tier,
                             WasmOpcode shiftr_opcode, WasmOpcode add_opcode,
                             WasmOpcode splat_opcode, int32_t imm,
                             ScalarType (*shift_fn)(ScalarType, int32_t)) {
  WasmRunner<int32_t, ScalarType, ScalarType> r(execution_tier);
  // Global 0 holds the result with the shift as the left add operand,
  // global 1 the result with the shift as the right add operand.
  ScalarType* shift_on_left =
      r.builder().template AddGlobal<ScalarType>(kWasmS128);
  ScalarType* shift_on_right =
      r.builder().template AddGlobal<ScalarType>(kWasmS128);
  // Indices of the two scalar parameters, then the two S128 locals.
  const byte param_x = 0;
  const byte param_y = 1;
  const byte splat_x = r.AllocateLocal(kWasmS128);
  const byte splat_y = r.AllocateLocal(kWasmS128);

  // Scalar reference implementation.
  auto reference = [shift_fn](ScalarType addend, ScalarType shifted,
                              uint32_t amount) {
    return base::AddWithWraparound(addend, shift_fn(shifted, amount));
  };

  BUILD(
      r,
      WASM_LOCAL_SET(splat_x,
                     WASM_SIMD_OPN(splat_opcode, WASM_LOCAL_GET(param_x))),
      WASM_LOCAL_SET(splat_y,
                     WASM_SIMD_OPN(splat_opcode, WASM_LOCAL_GET(param_y))),
      WASM_GLOBAL_SET(0, WASM_SIMD_BINOP(add_opcode,
                                         WASM_SIMD_BINOP(shiftr_opcode,
                                                         WASM_LOCAL_GET(splat_y),
                                                         WASM_I32V(imm)),
                                         WASM_LOCAL_GET(splat_x))),
      WASM_GLOBAL_SET(1, WASM_SIMD_BINOP(add_opcode, WASM_LOCAL_GET(splat_x),
                                         WASM_SIMD_BINOP(shiftr_opcode,
                                                         WASM_LOCAL_GET(splat_y),
                                                         WASM_I32V(imm)))),
      WASM_ONE);

  const size_t lane_count = kSimd128Size / sizeof(ScalarType);
  for (ScalarType x : compiler::ValueHelper::GetVector<ScalarType>()) {
    for (ScalarType y : compiler::ValueHelper::GetVector<ScalarType>()) {
      r.Call(x, y);
      ScalarType expected = reference(x, y, imm);
      for (size_t lane = 0; lane < lane_count; lane++) {
        CHECK_EQ(expected, LANE(shift_on_left, lane));
        CHECK_EQ(expected, LANE(shift_on_right, lane));
      }
    }
  }
}
WASM_SIMD_TEST(I64x2Splat) {
WasmRunner<int32_t, int64_t> r(execution_tier);
// Set up a global to hold output vector.
......@@ -500,6 +544,17 @@ WASM_SIMD_TEST(I64x2ShrU) {
RunI64x2ShiftOpTest(execution_tier, kExprI64x2ShrU, LogicalShiftRight);
}
// Runs the shift-then-add sequence for I64x2 with every shift amount from 0
// up to and including the lane width, for logical and arithmetic shifts.
WASM_SIMD_TEST(I64x2ShiftAdd) {
  constexpr int kLaneBits = 64;
  for (int shift = 0; shift <= kLaneBits; ++shift) {
    RunShiftAddTestSequence<int64_t>(execution_tier, kExprI64x2ShrU,
                                     kExprI64x2Add, kExprI64x2Splat, shift,
                                     LogicalShiftRight);
    RunShiftAddTestSequence<int64_t>(execution_tier, kExprI64x2ShrS,
                                     kExprI64x2Add, kExprI64x2Splat, shift,
                                     ArithmeticShiftRight);
  }
}
// Runs the I64x2 binary-op test driver for kExprI64x2Add, passing
// base::AddWithWraparound (presumably used as the scalar reference; the
// driver is defined outside this excerpt).
WASM_SIMD_TEST(I64x2Add) {
RunI64x2BinOpTest(execution_tier, kExprI64x2Add, base::AddWithWraparound);
}
......@@ -1350,6 +1405,17 @@ WASM_SIMD_TEST(I32x4ShrU) {
RunI32x4ShiftOpTest(execution_tier, kExprI32x4ShrU, LogicalShiftRight);
}
// Runs the shift-then-add sequence for I32x4 with every shift amount from 0
// up to and including the lane width, for logical and arithmetic shifts.
WASM_SIMD_TEST(I32x4ShiftAdd) {
  constexpr int kLaneBits = 32;
  for (int shift = 0; shift <= kLaneBits; ++shift) {
    RunShiftAddTestSequence<int32_t>(execution_tier, kExprI32x4ShrU,
                                     kExprI32x4Add, kExprI32x4Splat, shift,
                                     LogicalShiftRight);
    RunShiftAddTestSequence<int32_t>(execution_tier, kExprI32x4ShrS,
                                     kExprI32x4Add, kExprI32x4Splat, shift,
                                     ArithmeticShiftRight);
  }
}
// Tests both signed and unsigned conversion from I8x16 (unpacking).
WASM_SIMD_TEST(I16x8ConvertI8x16) {
WasmRunner<int32_t, int32_t> r(execution_tier);
......@@ -1660,6 +1726,17 @@ WASM_SIMD_TEST(I16x8ShrU) {
RunI16x8ShiftOpTest(execution_tier, kExprI16x8ShrU, LogicalShiftRight);
}
// Runs the shift-then-add sequence for I16x8 with every shift amount from 0
// up to and including the lane width, for logical and arithmetic shifts.
WASM_SIMD_TEST(I16x8ShiftAdd) {
  constexpr int kLaneBits = 16;
  for (int shift = 0; shift <= kLaneBits; ++shift) {
    RunShiftAddTestSequence<int16_t>(execution_tier, kExprI16x8ShrU,
                                     kExprI16x8Add, kExprI16x8Splat, shift,
                                     LogicalShiftRight);
    RunShiftAddTestSequence<int16_t>(execution_tier, kExprI16x8ShrS,
                                     kExprI16x8Add, kExprI16x8Splat, shift,
                                     ArithmeticShiftRight);
  }
}
// Runs the I8x16 unary-op test driver for kExprI8x16Neg, passing
// base::NegateWithWraparound (presumably used as the scalar reference; the
// driver is defined outside this excerpt).
WASM_SIMD_TEST(I8x16Neg) {
RunI8x16UnOpTest(execution_tier, kExprI8x16Neg, base::NegateWithWraparound);
}
......@@ -1817,6 +1894,17 @@ WASM_SIMD_TEST(I8x16ShrU) {
RunI8x16ShiftOpTest(execution_tier, kExprI8x16ShrU, LogicalShiftRight);
}
// Runs the shift-then-add sequence for I8x16 with every shift amount from 0
// up to and including the lane width, for logical and arithmetic shifts.
WASM_SIMD_TEST(I8x16ShiftAdd) {
  constexpr int kLaneBits = 8;
  for (int shift = 0; shift <= kLaneBits; ++shift) {
    RunShiftAddTestSequence<int8_t>(execution_tier, kExprI8x16ShrU,
                                    kExprI8x16Add, kExprI8x16Splat, shift,
                                    LogicalShiftRight);
    RunShiftAddTestSequence<int8_t>(execution_tier, kExprI8x16ShrS,
                                    kExprI8x16Add, kExprI8x16Splat, shift,
                                    ArithmeticShiftRight);
  }
}
// Test Select by making a mask where the 0th and 3rd lanes are true and the
// rest false, and comparing for non-equality with zero to convert to a boolean
// vector.
......
......@@ -894,6 +894,7 @@ inline WasmOpcode LoadStoreOpcodeOf(MachineType type, bool store) {
// Casts |val| to a single byte.
#define TO_BYTE(val) static_cast<byte>(val)
// Encode all simd ops as a 2-byte LEB.
// The opcode argument is parenthesized before masking so that an expression
// argument (for example `a | b` or a conditional) is masked as a whole rather
// than having `& 0xff` bind to its last operand only.
#define WASM_SIMD_OP(op) kSimdPrefix, U32V_2((op) & 0xff)
// Emits the operand sequence given in __VA_ARGS__ followed by SIMD opcode |op|.
#define WASM_SIMD_OPN(op, ...) __VA_ARGS__, WASM_SIMD_OP(op)
// Emits the operand sequence followed by the splat opcode kExpr<Type>Splat.
#define WASM_SIMD_SPLAT(Type, ...) __VA_ARGS__, WASM_SIMD_OP(kExpr##Type##Splat)
// Emits operand |x| followed by unary SIMD opcode |op|.
#define WASM_SIMD_UNOP(op, x) x, WASM_SIMD_OP(op)
// Emits operands |x| and |y| followed by binary SIMD opcode |op|.
#define WASM_SIMD_BINOP(op, x, y) x, y, WASM_SIMD_OP(op)
......
......@@ -2231,6 +2231,101 @@ INSTANTIATE_TEST_SUITE_P(InstructionSelectorTest,
InstructionSelectorSIMDDPWithSIMDMulTest,
::testing::ValuesIn(kSIMDMulDPInstructions));
namespace {
// Test parameter describing one SIMD lane shape for the shift-right-and-add
// instruction selection tests. Instances are aggregate-initialized by
// position, so the member order matters.
struct SIMDShrAddInst {
// Name written to the stream by operator<<.
const char* shradd_constructor_name;
// MachineOperatorBuilder getter for the signed shift-right operator.
const Operator* (MachineOperatorBuilder::*shr_s_operator)();
// MachineOperatorBuilder getter for the unsigned shift-right operator.
const Operator* (MachineOperatorBuilder::*shr_u_operator)();
// MachineOperatorBuilder getter for the add operator.
const Operator* (MachineOperatorBuilder::*add_operator)();
// Value the tests compare against LaneSizeField::decode of the selected
// instruction's opcode.
const int laneSize;
};
// Writes the parameter's name to |os| and returns |os|.
std::ostream& operator<<(std::ostream& os, const SIMDShrAddInst& inst) {
  os << inst.shradd_constructor_name;
  return os;
}
} // namespace
// One entry per SIMD lane shape: name, signed shift-right getter, unsigned
// shift-right getter, add getter, and lane size.
static const SIMDShrAddInst kSIMDShrAddInstructions[] = {
{"I64x2ShrAdd", &MachineOperatorBuilder::I64x2ShrS,
&MachineOperatorBuilder::I64x2ShrU, &MachineOperatorBuilder::I64x2Add, 64},
{"I32x4ShrAdd", &MachineOperatorBuilder::I32x4ShrS,
&MachineOperatorBuilder::I32x4ShrU, &MachineOperatorBuilder::I32x4Add, 32},
{"I16x8ShrAdd", &MachineOperatorBuilder::I16x8ShrS,
&MachineOperatorBuilder::I16x8ShrU, &MachineOperatorBuilder::I16x8Add, 16},
{"I8x16ShrAdd", &MachineOperatorBuilder::I8x16ShrS,
&MachineOperatorBuilder::I8x16ShrU, &MachineOperatorBuilder::I8x16Add, 8}};
// Fixture for the tests parameterized over SIMDShrAddInst.
using InstructionSelectorSIMDShrAddTest =
InstructionSelectorTestWithParam<SIMDShrAddInst>;
// Add combined with a signed shift right by the constant 1 must select a
// single kArm64Ssra carrying the parameter's lane size, whichever side of the
// add the shift is on.
TEST_P(InstructionSelectorSIMDShrAddTest, ShrAddS) {
  const SIMDShrAddInst inst = GetParam();
  const MachineType simd128 = MachineType::Simd128();
  // Add(p0, ShrS(p1, 1)): shift is the right-hand add operand.
  {
    StreamBuilder m(this, simd128, simd128, simd128);
    Node* shift = m.AddNode((m.machine()->*inst.shr_s_operator)(),
                            m.Parameter(1), m.Int32Constant(1));
    m.Return(m.AddNode((m.machine()->*inst.add_operator)(), m.Parameter(0),
                       shift));
    Stream stream = m.Build();
    ASSERT_EQ(1U, stream.size());
    EXPECT_EQ(kArm64Ssra, stream[0]->arch_opcode());
    EXPECT_EQ(3U, stream[0]->InputCount());
    EXPECT_EQ(inst.laneSize, LaneSizeField::decode(stream[0]->opcode()));
    EXPECT_EQ(1U, stream[0]->OutputCount());
  }
  // Add(ShrS(p0, 1), p1): shift is the left-hand add operand.
  {
    StreamBuilder m(this, simd128, simd128, simd128);
    Node* shift = m.AddNode((m.machine()->*inst.shr_s_operator)(),
                            m.Parameter(0), m.Int32Constant(1));
    m.Return(m.AddNode((m.machine()->*inst.add_operator)(), shift,
                       m.Parameter(1)));
    Stream stream = m.Build();
    ASSERT_EQ(1U, stream.size());
    EXPECT_EQ(kArm64Ssra, stream[0]->arch_opcode());
    EXPECT_EQ(3U, stream[0]->InputCount());
    EXPECT_EQ(inst.laneSize, LaneSizeField::decode(stream[0]->opcode()));
    EXPECT_EQ(1U, stream[0]->OutputCount());
  }
}
// Add combined with an unsigned shift right by the constant 1 must select a
// single kArm64Usra carrying the parameter's lane size, whichever side of the
// add the shift is on.
TEST_P(InstructionSelectorSIMDShrAddTest, ShrAddU) {
  const SIMDShrAddInst inst = GetParam();
  const MachineType simd128 = MachineType::Simd128();
  // Add(p0, ShrU(p1, 1)): shift is the right-hand add operand.
  {
    StreamBuilder m(this, simd128, simd128, simd128);
    Node* shift = m.AddNode((m.machine()->*inst.shr_u_operator)(),
                            m.Parameter(1), m.Int32Constant(1));
    m.Return(m.AddNode((m.machine()->*inst.add_operator)(), m.Parameter(0),
                       shift));
    Stream stream = m.Build();
    ASSERT_EQ(1U, stream.size());
    EXPECT_EQ(kArm64Usra, stream[0]->arch_opcode());
    EXPECT_EQ(3U, stream[0]->InputCount());
    EXPECT_EQ(inst.laneSize, LaneSizeField::decode(stream[0]->opcode()));
    EXPECT_EQ(1U, stream[0]->OutputCount());
  }
  // Add(ShrU(p0, 1), p1): shift is the left-hand add operand.
  {
    StreamBuilder m(this, simd128, simd128, simd128);
    Node* shift = m.AddNode((m.machine()->*inst.shr_u_operator)(),
                            m.Parameter(0), m.Int32Constant(1));
    m.Return(m.AddNode((m.machine()->*inst.add_operator)(), shift,
                       m.Parameter(1)));
    Stream stream = m.Build();
    ASSERT_EQ(1U, stream.size());
    EXPECT_EQ(kArm64Usra, stream[0]->arch_opcode());
    EXPECT_EQ(3U, stream[0]->InputCount());
    EXPECT_EQ(inst.laneSize, LaneSizeField::decode(stream[0]->opcode()));
    EXPECT_EQ(1U, stream[0]->OutputCount());
  }
}
// Instantiates the ShrAdd tests once per entry of kSIMDShrAddInstructions.
INSTANTIATE_TEST_SUITE_P(InstructionSelectorTest,
InstructionSelectorSIMDShrAddTest,
::testing::ValuesIn(kSIMDShrAddInstructions));
struct SIMDMulDupInst {
const uint8_t shuffle[16];
int32_t lane;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment