Commit 9e55269b, authored by Ilja Iskovs and committed by V8 LUCI CQ

[wasm-simd][arm64] Use Bic(x, imm) for And(x, [Not](imm)) when possible

Immediate version of the Bitclear instruction can be used for logical
And with some immediates. It can also be used to implement
And(x, Not(imm)) in a single instruction. This patch gives ~0.5% runtime
improvement in one benchmark on Neoverse N1.

Change-Id: Ia926c6746f0c252f81626c6fca21c4dfb41679d9
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3160667
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Martyn Capewell <martyn.capewell@arm.com>
Cr-Commit-Position: refs/heads/main@{#80015}
parent dec4bb06
......@@ -2657,7 +2657,18 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
SIMD_DESTRUCTIVE_BINOP_CASE(kArm64S128Select, Bsl, 16B);
SIMD_BINOP_CASE(kArm64S128AndNot, Bic, 16B);
// S128 and-not. Emits either the immediate form of Bic or the
// three-register form, depending on whether input 1 is an immediate.
case kArm64S128AndNot:
if (instr->InputAt(1)->IsImmediate()) {
// Immediate form: the vector arrangement is derived from the lane size
// that was encoded into the opcode.
VectorFormat f = VectorFormatFillQ(LaneSizeField::decode(opcode));
VRegister dst = i.OutputSimd128Register().Format(f);
// This form passes no separate source register to Bic, so the output
// register must be the same register as input 0.
DCHECK_EQ(dst, i.InputSimd128Register(0).Format(f));
// Input 1 is the immediate value, input 2 is forwarded as the third Bic
// argument.
__ Bic(dst, i.InputInt32(1), i.InputInt8(2));
} else {
// Register form, operating on the full vector as 16 bytes.
__ Bic(i.OutputSimd128Register().V16B(),
i.InputSimd128Register(0).V16B(),
i.InputSimd128Register(1).V16B());
}
break;
case kArm64Ssra: {
int8_t laneSize = LaneSizeField::decode(opcode);
VectorFormat f = VectorFormatFillQ(laneSize);
......
......@@ -3530,19 +3530,17 @@ void InstructionSelector::VisitInt64AbsWithOverflow(Node* node) {
V(I8x16ShrS, 8) \
V(I8x16ShrU, 8)
// X-macro list of (SIMD operation name, Arm64 arch opcode) pairs; each entry
// is expanded through the caller-supplied V.
// NOTE(review): this diff view contains two consecutive definitions of
// SIMD_BINOP_LIST. This copy still lists S128And and S128AndNot - presumably
// it is the pre-change side of the hunk; confirm before relying on it.
#define SIMD_BINOP_LIST(V) \
V(I32x4Mul, kArm64I32x4Mul) \
V(I32x4DotI16x8S, kArm64I32x4DotI16x8S) \
V(I16x8SConvertI32x4, kArm64I16x8SConvertI32x4) \
V(I16x8Mul, kArm64I16x8Mul) \
V(I16x8UConvertI32x4, kArm64I16x8UConvertI32x4) \
V(I16x8Q15MulRSatS, kArm64I16x8Q15MulRSatS) \
V(I8x16SConvertI16x8, kArm64I8x16SConvertI16x8) \
V(I8x16UConvertI16x8, kArm64I8x16UConvertI16x8) \
V(S128And, kArm64S128And) \
V(S128Or, kArm64S128Or) \
V(S128Xor, kArm64S128Xor) \
V(S128AndNot, kArm64S128AndNot)
// X-macro list of (SIMD operation name, Arm64 arch opcode) pairs; each entry
// is expanded through the caller-supplied V.
// S128And and S128AndNot are intentionally absent from this copy of the list:
// they need hand-written visitors so that a constant operand can be matched.
#define SIMD_BINOP_LIST(V) \
V(I32x4Mul, kArm64I32x4Mul) \
V(I32x4DotI16x8S, kArm64I32x4DotI16x8S) \
V(I16x8SConvertI32x4, kArm64I16x8SConvertI32x4) \
V(I16x8Mul, kArm64I16x8Mul) \
V(I16x8UConvertI32x4, kArm64I16x8UConvertI32x4) \
V(I16x8Q15MulRSatS, kArm64I16x8Q15MulRSatS) \
V(I8x16SConvertI16x8, kArm64I8x16SConvertI16x8) \
V(I8x16UConvertI16x8, kArm64I8x16UConvertI16x8) \
V(S128Or, kArm64S128Or) \
V(S128Xor, kArm64S128Xor)
#define SIMD_BINOP_LANE_SIZE_LIST(V) \
V(F64x2Min, kArm64FMin, 64) \
......@@ -3608,6 +3606,108 @@ void InstructionSelector::VisitS128Const(Node* node) {
}
}
namespace {
// Operands describing one immediate-form Bic: an 8-bit immediate, the lane
// size it is applied to, and the left shift applied to the immediate.
struct BicImmParam {
  // |imm_value| is accepted as 32 bits for caller convenience; only its low
  // 8 bits are kept.
  BicImmParam(uint32_t imm_value, uint8_t lane_bits, uint8_t shift_bits)
      : imm(static_cast<uint8_t>(imm_value)),
        lane_size(lane_bits),
        shift_amount(shift_bits) {}

  uint8_t imm;           // 8-bit immediate payload.
  uint8_t lane_size;     // Lane size the immediate is replicated over.
  uint8_t shift_amount;  // Left shift applied to |imm| within a lane.
};
// Outcome of matching a binary node against the Bic-immediate pattern: the
// encoding (empty when the constant cannot be encoded), the constant operand
// and the remaining, non-constant operand.
struct BicImmResult {
  BicImmResult(base::Optional<BicImmParam> encoding, Node* constant,
               Node* other)
      : param(encoding), const_node(constant), other_node(other) {}

  base::Optional<BicImmParam> param;  // Empty if no encoding exists.
  Node* const_node;                   // The S128Const operand.
  Node* other_node;                   // The operand that is not the constant.
};
// Tries to encode a 16-bit pattern as an 8-bit immediate plus a shift of 0 or
// 8 on 16-bit lanes. This is only possible when at least one of the two bytes
// is zero; the low byte is tested first, so 0x0000 yields (0, 16, 8).
base::Optional<BicImmParam> BicImm16bitHelper(uint16_t val) {
  const uint8_t low_byte = val & 0xFF;
  const uint8_t high_byte = val >> 8;
  if (low_byte == 0x00) {
    // Only the high byte may carry bits: shift the immediate by 8.
    return BicImmParam(high_byte, 16, 8);
  }
  if (high_byte == 0x00) {
    // Only the low byte carries bits: no shift needed.
    return BicImmParam(low_byte, 16, 0);
  }
  // Both bytes are non-zero, a single 8-bit immediate cannot express this.
  return base::nullopt;
}
// Tries to encode a 32-bit pattern for the immediate form of Bic.
//
// Two shapes are accepted:
//  * at most one of the four bytes is non-zero: encoded on 32-bit lanes with
//    a shift of 0, 8, 16 or 24 (the lowest matching byte position wins, so 0
//    yields (0, 32, 0));
//  * the upper and lower halves are equal: delegated to the 16-bit helper.
// Returns base::nullopt when neither shape applies.
base::Optional<BicImmParam> BicImm32bitHelper(uint32_t val) {
  for (int i = 0; i < 4; i++) {
    const int shift = 8 * i;
    // Build the mask in an unsigned type: shifting the int literal 0xFF left
    // by 24 would shift into the sign bit.
    const uint32_t byte_mask = uint32_t{0xFF} << shift;
    // All bytes are 0 but (at most) the one selected by the mask.
    if ((val & byte_mask) == val) {
      return BicImmParam(static_cast<uint8_t>(val >> shift), 32, shift);
    }
  }
  // Low and high 2 bytes are equal: try the 16-bit lane encodings.
  if ((val >> 16) == (0xFFFF & val)) {
    return BicImm16bitHelper(0xFFFF & val);
  }
  return base::nullopt;
}
// Reads the 128-bit immediate of |const_node| as four 32-bit words and, if
// all four are identical, tries to encode that word (bitwise inverted when
// |not_imm| is set) for the immediate form of Bic.
base::Optional<BicImmParam> BicImmConstHelper(Node* const_node, bool not_imm) {
  const int kUint32Immediates = 4;
  uint32_t words[kUint32Immediates];
  STATIC_ASSERT(sizeof(words) == kSimd128Size);
  memcpy(words, S128ImmediateParameterOf(const_node->op()).data(),
         kSimd128Size);
  // The immediate is replicated per lane, so every 32-bit word of the
  // constant has to match the first one.
  for (int i = 1; i < kUint32Immediates; i++) {
    if (words[i] != words[0]) return base::nullopt;
  }
  const uint32_t pattern = not_imm ? ~words[0] : words[0];
  return BicImm32bitHelper(pattern);
}
// Looks for an S128Const input of a binary S128 node and tries to turn it
// into a Bic-immediate encoding.
//
// |or_node| is the node being selected; despite the parameter name, the
// callers in this file pass S128And and S128AndNot nodes.
// |not_imm| selects which operation is being matched:
//  * true:  And(x, imm), implemented as Bic(x, ~imm). And is commutative, so
//           the constant may be either input.
//  * false: AndNot(x, imm) == x & ~imm, implemented as Bic(x, imm). AndNot is
//           NOT commutative: AndNot(imm, x) is imm & ~x, which Bic(x, imm)
//           does not compute. The constant is therefore only accepted as the
//           right (negated) input.
//
// Returns base::nullopt when no usable constant input exists. Otherwise
// returns a BicImmResult whose |param| is empty if the constant has no
// Bic-immediate encoding.
base::Optional<BicImmResult> BicImmHelper(Node* or_node, bool not_imm) {
  Node* left = or_node->InputAt(0);
  Node* right = or_node->InputAt(1);
  // A left-hand constant is only valid for the commutative And case.
  if (not_imm && left->opcode() == IrOpcode::kS128Const) {
    return BicImmResult(BicImmConstHelper(left, not_imm), left, right);
  }
  if (right->opcode() == IrOpcode::kS128Const) {
    return BicImmResult(BicImmConstHelper(right, not_imm), right, left);
  }
  return base::nullopt;
}
// Attempts to select |node| as a single kArm64S128AndNot instruction in its
// immediate form. |not_imm| is forwarded to the matcher and tells it whether
// the constant has to be inverted before encoding.
//
// Returns true if an instruction was emitted; false if the caller must fall
// back to the register form.
bool TryEmitS128AndNotImm(InstructionSelector* selector, Node* node,
                          bool not_imm) {
  Arm64OperandGenerator g(selector);
  base::Optional<BicImmResult> match = BicImmHelper(node, not_imm);
  // No constant operand at all.
  if (!match.has_value()) return false;

  base::Optional<BicImmParam> encoding = match->param;
  // Constant found, but it has no Bic-immediate encoding.
  if (!encoding.has_value()) return false;

  // The non-constant operand must be coverable by this node.
  if (!selector->CanCover(node, match->other_node)) return false;

  // The lane size travels in the opcode; immediate and shift are inputs 1
  // and 2. The output is constrained to the register of input 0.
  selector->Emit(
      kArm64S128AndNot | LaneSizeField::encode(encoding->lane_size),
      g.DefineSameAsFirst(node), g.UseRegister(match->other_node),
      g.UseImmediate(encoding->imm), g.UseImmediate(encoding->shift_amount));
  return true;
}
} // namespace
// Selects S128AndNot: prefers the immediate form (constant used as-is) and
// otherwise emits the three-register kArm64S128AndNot.
void InstructionSelector::VisitS128AndNot(Node* node) {
  if (TryEmitS128AndNotImm(this, node, false)) return;
  VisitRRR(this, kArm64S128AndNot, node);
}
// Selects S128And: prefers the immediate and-not form with the inverted
// constant and otherwise emits the three-register kArm64S128And.
void InstructionSelector::VisitS128And(Node* node) {
  if (TryEmitS128AndNotImm(this, node, true)) return;
  VisitRRR(this, kArm64S128And, node);
}
void InstructionSelector::VisitS128Zero(Node* node) {
Arm64OperandGenerator g(this);
Emit(kArm64S128Zero, g.DefineAsRegister(node));
......
......@@ -1545,6 +1545,56 @@ WASM_SIMD_TEST(S128And) {
[](int32_t x, int32_t y) { return x & y; });
}
// Pointer to a scalar reference implementation of a binary operation, used to
// compute the expected per-lane result.
template <typename ScalarType>
using BinOp = ScalarType (*)(ScalarType, ScalarType);
// Checks a SIMD binary operation where one operand is an S128 constant.
//
// For every test value x, a function is built that splats its argument y into
// a local and stores op(splat(y), const(x)) in global 0 and
// op(const(x), splat(y)) in global 1. Every lane of both globals is then
// compared against |expected_op| for every test value y.
template <typename ScalarType>
void RunS128ConstBinOpTest(TestExecutionTier execution_tier,
                           WasmOpcode binop_opcode, WasmOpcode splat_opcode,
                           BinOp<ScalarType> expected_op) {
  const size_t lane_count = kSimd128Size / sizeof(ScalarType);
  for (ScalarType x : compiler::ValueHelper::GetVector<ScalarType>()) {
    WasmRunner<int32_t, ScalarType> r(execution_tier);
    // Two output globals: constant as second operand, then as first operand.
    ScalarType* const_rhs_out =
        r.builder().template AddGlobal<ScalarType>(kWasmS128);
    ScalarType* const_lhs_out =
        r.builder().template AddGlobal<ScalarType>(kWasmS128);
    // Index 0 is the function argument; the splatted vector lives in a fresh
    // S128 local.
    byte value = 0;
    byte splat_local = r.AllocateLocal(kWasmS128);
    // Fill every lane of the constant with x.
    uint8_t const_bytes[16];
    for (size_t lane = 0; lane < lane_count; lane++) {
      WriteLittleEndianValue<ScalarType>(
          bit_cast<ScalarType*>(&const_bytes[0]) + lane, x);
    }
    BUILD(r,
          WASM_LOCAL_SET(splat_local,
                         WASM_SIMD_OPN(splat_opcode, WASM_LOCAL_GET(value))),
          WASM_GLOBAL_SET(
              0, WASM_SIMD_BINOP(binop_opcode, WASM_LOCAL_GET(splat_local),
                                 WASM_SIMD_CONSTANT(const_bytes))),
          WASM_GLOBAL_SET(
              1, WASM_SIMD_BINOP(binop_opcode, WASM_SIMD_CONSTANT(const_bytes),
                                 WASM_LOCAL_GET(splat_local))),
          WASM_ONE);
    for (ScalarType y : compiler::ValueHelper::GetVector<ScalarType>()) {
      r.Call(y);
      ScalarType expected_const_rhs = expected_op(y, x);
      ScalarType expected_const_lhs = expected_op(x, y);
      for (size_t lane = 0; lane < lane_count; lane++) {
        CHECK_EQ(expected_const_rhs, LANE(const_rhs_out, lane));
        CHECK_EQ(expected_const_lhs, LANE(const_lhs_out, lane));
      }
    }
  }
}
// S128And with a constant operand, using 32-bit and 16-bit splatted lanes.
WASM_SIMD_TEST(S128AndImm) {
  auto and_i32 = [](int32_t x, int32_t y) { return x & y; };
  // The int result of x & y is converted back explicitly for 16-bit lanes.
  auto and_i16 = [](int16_t x, int16_t y) {
    return static_cast<int16_t>(x & y);
  };
  RunS128ConstBinOpTest<int32_t>(execution_tier, kExprS128And, kExprI32x4Splat,
                                 and_i32);
  RunS128ConstBinOpTest<int16_t>(execution_tier, kExprS128And, kExprI16x8Splat,
                                 and_i16);
}
WASM_SIMD_TEST(S128Or) {
RunI32x4BinOpTest(execution_tier, kExprS128Or,
[](int32_t x, int32_t y) { return x | y; });
......@@ -1561,6 +1611,15 @@ WASM_SIMD_TEST(S128AndNot) {
[](int32_t x, int32_t y) { return x & ~y; });
}
// S128AndNot (x & ~y) with a constant operand, using 32-bit and 16-bit
// splatted lanes.
WASM_SIMD_TEST(S128AndNotImm) {
  auto andnot_i32 = [](int32_t x, int32_t y) { return x & ~y; };
  // The int result of x & ~y is converted back explicitly for 16-bit lanes.
  auto andnot_i16 = [](int16_t x, int16_t y) {
    return static_cast<int16_t>(x & ~y);
  };
  RunS128ConstBinOpTest<int32_t>(execution_tier, kExprS128AndNot,
                                 kExprI32x4Splat, andnot_i32);
  RunS128ConstBinOpTest<int16_t>(execution_tier, kExprS128AndNot,
                                 kExprI16x8Splat, andnot_i16);
}
// Runs the generic I32x4 binary-op test for kExprI32x4Eq with Equal as the
// reference implementation.
WASM_SIMD_TEST(I32x4Eq) {
RunI32x4BinOpTest(execution_tier, kExprI32x4Eq, Equal);
}
......
......@@ -5571,6 +5571,191 @@ INSTANTIATE_TEST_SUITE_P(InstructionSelectorTest,
InstructionSelectorSIMDConstZeroCmTest,
::testing::ValuesIn(SIMDConstZeroCmTests));
// One parameterized case: a 128-bit constant combined with a parameter through
// |simd_op|, together with the instruction selection that is expected.
struct SIMDConstAndTest {
// Raw bytes of the S128 constant operand.
const uint8_t data[16];
// Machine operator under test (S128And or S128AndNot in the table below).
const Operator* (MachineOperatorBuilder::*simd_op)();
// Arch opcode of the instruction that performs the operation.
const ArchOpcode expected_op;
// Expected LaneSizeField value; only checked when |size| is 1.
const uint8_t lane_size;
// Expected value of input 2 (the shift); only checked when |size| is 1.
const uint8_t shift_amount;
// Expected value of input 1 (the immediate); only checked when |size| is 1.
const int32_t expected_imm;
// Expected number of emitted instructions: 1 for the immediate form, 2 for
// S128Const followed by the register form.
const size_t size;
};
// Table of cases. Field order per entry: constant bytes, operator, expected
// arch opcode, lane size, shift amount, immediate, instruction count.
static const SIMDConstAndTest SIMDConstAndTests[] = {
// S128And: 16-bit lanes, shift 8.
{{0xFF, 0xFE, 0xFF, 0xFE, 0xFF, 0xFE, 0xFF, 0xFE, 0xFF, 0xFE, 0xFF, 0xFE,
0xFF, 0xFE, 0xFF, 0xFE},
&MachineOperatorBuilder::S128And,
kArm64S128AndNot,
16,
8,
0x01,
1},
// S128And: 16-bit lanes, shift 0.
{{0xFE, 0xFF, 0xFE, 0xFF, 0xFE, 0xFF, 0xFE, 0xFF, 0xFE, 0xFF, 0xFE, 0xFF,
0xFE, 0xFF, 0xFE, 0xFF},
&MachineOperatorBuilder::S128And,
kArm64S128AndNot,
16,
0,
0x01,
1},
// S128And: 32-bit lanes, shift 24.
{{0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFE,
0xFF, 0xFF, 0xFF, 0xFE},
&MachineOperatorBuilder::S128And,
kArm64S128AndNot,
32,
24,
0x01,
1},
// S128And: 32-bit lanes, shift 16.
{{0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF,
0xFF, 0xFF, 0xFE, 0xFF},
&MachineOperatorBuilder::S128And,
kArm64S128AndNot,
32,
16,
0x01,
1},
// S128And: 32-bit lanes, shift 8.
{{0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF,
0xFF, 0xFE, 0xFF, 0xFF},
&MachineOperatorBuilder::S128And,
kArm64S128AndNot,
32,
8,
0x01,
1},
// S128And: 32-bit lanes, shift 0.
{{0xFE, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF,
0xFE, 0xFF, 0xFF, 0xFF},
&MachineOperatorBuilder::S128And,
kArm64S128AndNot,
32,
0,
0x01,
1},
// S128And: constant expected not to be encodable; two instructions
// (S128Const + kArm64S128And).
{{0xEE, 0xEE, 0xEE, 0xEE, 0xEE, 0xEE, 0xEE, 0xEE, 0xEE, 0xEE, 0xEE, 0xEE,
0xEE, 0xEE, 0xEE, 0xEE},
&MachineOperatorBuilder::S128And,
kArm64S128And,
0,
0,
0x00,
2},
// S128AndNot: 16-bit lanes, shift 8.
{{0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
0x00, 0x01, 0x00, 0x01},
&MachineOperatorBuilder::S128AndNot,
kArm64S128AndNot,
16,
8,
0x01,
1},
// S128AndNot: 16-bit lanes, shift 0.
{{0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00,
0x01, 0x00, 0x01, 0x00},
&MachineOperatorBuilder::S128AndNot,
kArm64S128AndNot,
16,
0,
0x01,
1},
// S128AndNot: 32-bit lanes, shift 24.
{{0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01,
0x00, 0x00, 0x00, 0x01},
&MachineOperatorBuilder::S128AndNot,
kArm64S128AndNot,
32,
24,
0x01,
1},
// S128AndNot: 32-bit lanes, shift 16.
{{0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00,
0x00, 0x00, 0x01, 0x00},
&MachineOperatorBuilder::S128AndNot,
kArm64S128AndNot,
32,
16,
0x01,
1},
// S128AndNot: 32-bit lanes, shift 8.
{{0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00,
0x00, 0x01, 0x00, 0x00},
&MachineOperatorBuilder::S128AndNot,
kArm64S128AndNot,
32,
8,
0x01,
1},
// S128AndNot: 32-bit lanes, shift 0.
{{0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00,
0x01, 0x00, 0x00, 0x00},
&MachineOperatorBuilder::S128AndNot,
kArm64S128AndNot,
32,
0,
0x01,
1},
// S128AndNot: constant expected not to be encodable; two instructions
// (S128Const + register-form kArm64S128AndNot).
{{0xEE, 0xEE, 0xEE, 0xEE, 0xEE, 0xEE, 0xEE, 0xEE, 0xEE, 0xEE, 0xEE, 0xEE,
0xEE, 0xEE, 0xEE, 0xEE},
&MachineOperatorBuilder::S128AndNot,
kArm64S128AndNot,
0,
0,
0x00,
2},
};
// Fixture for the parameterized tests that take a SIMDConstAndTest as their
// parameter.
using InstructionSelectorSIMDConstAndTest =
InstructionSelectorTestWithParam<SIMDConstAndTest>;
// Verifies instruction selection for S128And / S128AndNot when one input is
// an S128 constant, with the constant placed as either input.
//
// S128And is commutative, so both placements must select the same code.
// S128AndNot computes x & ~y and is not commutative: the immediate form
// Bic(x, imm) equals AndNot(x, imm) but not AndNot(imm, x). With the constant
// as the first input the selector must therefore materialize the constant and
// use the register form (two instructions), whatever the table entry expects
// for the constant-on-the-right placement.
TEST_P(InstructionSelectorSIMDConstAndTest, ConstAnd) {
  const SIMDConstAndTest param = GetParam();
  const Operator* (MachineOperatorBuilder::*and_op)() =
      &MachineOperatorBuilder::S128And;
  const bool commutative = param.simd_op == and_op;
  // Const node on the left
  {
    // Only a commutative op may fold a left-hand constant into an immediate.
    const size_t expected_size = commutative ? param.size : 2U;
    StreamBuilder m(this, MachineType::Simd128(), MachineType::Simd128());
    Node* cnst = m.S128Const(param.data);
    Node* op = m.AddNode((m.machine()->*param.simd_op)(), cnst, m.Parameter(0));
    m.Return(op);
    Stream s = m.Build();
    ASSERT_EQ(expected_size, s.size());
    if (expected_size == 1) {
      EXPECT_EQ(param.expected_op, s[0]->arch_opcode());
      EXPECT_EQ(3U, s[0]->InputCount());
      EXPECT_EQ(1U, s[0]->OutputCount());
      EXPECT_EQ(param.lane_size, LaneSizeField::decode(s[0]->opcode()));
      EXPECT_EQ(param.shift_amount, s.ToInt32(s[0]->InputAt(2)));
      EXPECT_EQ(param.expected_imm, s.ToInt32(s[0]->InputAt(1)));
    } else {
      // Constant materialized first, then the two-input register form.
      EXPECT_EQ(kArm64S128Const, s[0]->arch_opcode());
      EXPECT_EQ(param.expected_op, s[1]->arch_opcode());
      EXPECT_EQ(2U, s[1]->InputCount());
      EXPECT_EQ(1U, s[1]->OutputCount());
    }
  }
  // Const node on the right
  {
    StreamBuilder m(this, MachineType::Simd128(), MachineType::Simd128());
    Node* cnst = m.S128Const(param.data);
    Node* op = m.AddNode((m.machine()->*param.simd_op)(), m.Parameter(0), cnst);
    m.Return(op);
    Stream s = m.Build();
    ASSERT_EQ(param.size, s.size());
    if (param.size == 1) {
      EXPECT_EQ(param.expected_op, s[0]->arch_opcode());
      EXPECT_EQ(3U, s[0]->InputCount());
      EXPECT_EQ(1U, s[0]->OutputCount());
      EXPECT_EQ(param.lane_size, LaneSizeField::decode(s[0]->opcode()));
      EXPECT_EQ(param.shift_amount, s.ToInt32(s[0]->InputAt(2)));
      EXPECT_EQ(param.expected_imm, s.ToInt32(s[0]->InputAt(1)));
    } else {
      EXPECT_EQ(kArm64S128Const, s[0]->arch_opcode());
      EXPECT_EQ(param.expected_op, s[1]->arch_opcode());
      EXPECT_EQ(2U, s[1]->InputCount());
      EXPECT_EQ(1U, s[1]->OutputCount());
    }
  }
}
// Instantiates the parameterized test once per entry of SIMDConstAndTests.
INSTANTIATE_TEST_SUITE_P(InstructionSelectorTest,
InstructionSelectorSIMDConstAndTest,
::testing::ValuesIn(SIMDConstAndTests));
} // namespace
} // namespace compiler
} // namespace internal
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment