Commit 7d7b25d9 authored by Zhi An Ng, committed by Commit Bot

[wasm-simd][x64] Optimize integer splats of constant 0

Integer splats (especially for sizes < 32 bits) do not directly
translate to a single instruction on x64. We can do better for special
values, like 0, which can be lowered to `xor dst, dst`. We do this check
in the instruction selector and emit a special opcode, kX64S128Zero.

Also change the xor operation for kX64S128Zero from xorps to pxor. This
can help reduce any potential data bypass delay (see Agner Fog's
microarchitecture manual for details). Since integer splats are likely
to be followed by integer ops, we should remain in the integer domain,
and thus use pxor.
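
For illustration only (hand-written, not generated by this CL), here is
the kind of sequence the domain argument is about; whether the bypass
delay actually occurs depends on the microarchitecture:

  vpxor  xmm0,xmm0,xmm0   ; integer-domain zero (this CL)
  vpaddd xmm0,xmm0,xmm1   ; integer consumer, no domain crossing

  vxorps xmm0,xmm0,xmm0   ; float-domain zero (xorps form used before)
  vpaddd xmm0,xmm0,xmm1   ; integer consumer, may pay a bypass delay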

For i64x2.splat the codegen goes from:

  xorl rdi,rdi
  vmovq xmm0,rdi
  vmovddup xmm0,xmm0

to:

  vpxor xmm0,xmm0,xmm0

Also add a unit test to verify this optimization, and the necessary
raw-assembler methods for the test.

Bug: v8:11093
Change-Id: I26b092032b6e672f1d5d26e35d79578ebe591cfe
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2516299
Reviewed-by: Tobias Tebbi <tebbi@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#70977}
parent 2ccd4dc5
@@ -3014,7 +3014,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     }
     case kX64S128Zero: {
       XMMRegister dst = i.OutputSimd128Register();
-      __ Xorps(dst, dst);
+      __ Pxor(dst, dst);
       break;
     }
     case kX64S128AllOnes: {
...
@@ -2966,22 +2966,31 @@ void InstructionSelector::VisitS128Zero(Node* node) {
 }
 
 #define SIMD_TYPES_FOR_SPLAT(V) \
-  V(F64x2)                      \
   V(I64x2)                      \
   V(I32x4)                      \
   V(I16x8)                      \
   V(I8x16)
 
-#define VISIT_SIMD_SPLAT(Type)                               \
-  void InstructionSelector::Visit##Type##Splat(Node* node) { \
-    X64OperandGenerator g(this);                             \
-    Emit(kX64##Type##Splat, g.DefineAsRegister(node),        \
-         g.Use(node->InputAt(0)));                           \
-  }
+// Splat with an optimization for const 0.
+#define VISIT_SIMD_SPLAT(Type)                                               \
+  void InstructionSelector::Visit##Type##Splat(Node* node) {                 \
+    X64OperandGenerator g(this);                                             \
+    Node* input = node->InputAt(0);                                          \
+    if (g.CanBeImmediate(input) && g.GetImmediateIntegerValue(input) == 0) { \
+      Emit(kX64S128Zero, g.DefineAsRegister(node));                          \
+    } else {                                                                 \
+      Emit(kX64##Type##Splat, g.DefineAsRegister(node), g.Use(input));       \
+    }                                                                        \
+  }
 SIMD_TYPES_FOR_SPLAT(VISIT_SIMD_SPLAT)
 #undef VISIT_SIMD_SPLAT
 #undef SIMD_TYPES_FOR_SPLAT
 
+void InstructionSelector::VisitF64x2Splat(Node* node) {
+  X64OperandGenerator g(this);
+  Emit(kX64F64x2Splat, g.DefineAsRegister(node), g.Use(node->InputAt(0)));
+}
+
 void InstructionSelector::VisitF32x4Splat(Node* node) {
   X64OperandGenerator g(this);
   InstructionOperand dst =
...
@@ -835,6 +835,12 @@ class V8_EXPORT_PRIVATE RawMachineAssembler {
     return AddNode(machine()->Float64SilenceNaN(), a);
   }
 
+  // SIMD operations.
+  Node* I64x2Splat(Node* a) { return AddNode(machine()->I64x2Splat(), a); }
+  Node* I32x4Splat(Node* a) { return AddNode(machine()->I32x4Splat(), a); }
+  Node* I16x8Splat(Node* a) { return AddNode(machine()->I16x8Splat(), a); }
+  Node* I8x16Splat(Node* a) { return AddNode(machine()->I8x16Splat(), a); }
+
   // Stack operations.
   Node* LoadFramePointer() { return AddNode(machine()->LoadFramePointer()); }
   Node* LoadParentFramePointer() {
...
@@ -1888,6 +1888,53 @@ TEST_F(InstructionSelectorTest, LoadAndWord64ShiftRight32) {
   }
 }
 
+TEST_F(InstructionSelectorTest, SIMDSplatZero) {
+  // Test optimization for splat of constant 0.
+  // {i8x16,i16x8,i32x4,i64x2}.splat(const(0)) -> v128.zero().
+  // Optimizations for f32x4.splat and f64x2.splat are not implemented since
+  // they do not improve the codegen as much (same number of instructions).
+  {
+    StreamBuilder m(this, MachineType::Simd128());
+    Node* const splat = m.I64x2Splat(m.Int64Constant(0));
+    m.Return(splat);
+    Stream s = m.Build();
+    ASSERT_EQ(1U, s.size());
+    EXPECT_EQ(kX64S128Zero, s[0]->arch_opcode());
+    ASSERT_EQ(0U, s[0]->InputCount());
+    EXPECT_EQ(1U, s[0]->OutputCount());
+  }
+  {
+    StreamBuilder m(this, MachineType::Simd128());
+    Node* const splat = m.I32x4Splat(m.Int32Constant(0));
+    m.Return(splat);
+    Stream s = m.Build();
+    ASSERT_EQ(1U, s.size());
+    EXPECT_EQ(kX64S128Zero, s[0]->arch_opcode());
+    ASSERT_EQ(0U, s[0]->InputCount());
+    EXPECT_EQ(1U, s[0]->OutputCount());
+  }
+  {
+    StreamBuilder m(this, MachineType::Simd128());
+    Node* const splat = m.I16x8Splat(m.Int32Constant(0));
+    m.Return(splat);
+    Stream s = m.Build();
+    ASSERT_EQ(1U, s.size());
+    EXPECT_EQ(kX64S128Zero, s[0]->arch_opcode());
+    ASSERT_EQ(0U, s[0]->InputCount());
+    EXPECT_EQ(1U, s[0]->OutputCount());
+  }
+  {
+    StreamBuilder m(this, MachineType::Simd128());
+    Node* const splat = m.I8x16Splat(m.Int32Constant(0));
+    m.Return(splat);
+    Stream s = m.Build();
+    ASSERT_EQ(1U, s.size());
+    EXPECT_EQ(kX64S128Zero, s[0]->arch_opcode());
+    ASSERT_EQ(0U, s[0]->InputCount());
+    EXPECT_EQ(1U, s[0]->OutputCount());
+  }
+}
+
 }  // namespace compiler
 }  // namespace internal
 }  // namespace v8