Commit 7d7b25d9 authored by Zhi An Ng, committed by Commit Bot

[wasm-simd][x64] Optimize integer splats of constant 0

Integer splats (especially for sizes < 32 bits) do not directly
translate to a single instruction on x64. We can do better for special
values, like 0, which can be lowered to `xor dst, dst`. We do this
check in the instruction selector and emit the special opcode
kX64S128Zero.

Also change the xor operation for kX64S128Zero from xorps to pxor. This
can help reduce any potential data bypass delay (see Agner Fog's
microarchitecture manual for details). Since integer splats are likely
to be followed by integer ops, we should stay in the integer domain,
hence pxor.
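
For illustration only (not part of this CL): a hypothetical follow-up
sequence; whether a bypass penalty actually occurs depends on the
microarchitecture, and many CPUs eliminate zeroing idioms at register
rename:

  vpxor xmm0,xmm0,xmm0    ; integer-domain zero
  vpaddq xmm1,xmm0,xmm1   ; integer consumer stays in the same domain

  vxorps xmm0,xmm0,xmm0   ; float-domain zero
  vpaddq xmm1,xmm0,xmm1   ; integer consumer may pay a domain-crossing delay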

For i64x2.splat the codegen goes from:

  xorl rdi,rdi
  vmovq xmm0,rdi
  vmovddup xmm0,xmm0

to:

  vpxor xmm0,xmm0,xmm0

Also add a unit test to verify this optimization, plus the
raw-assembler methods needed by the test.

Bug: v8:11093
Change-Id: I26b092032b6e672f1d5d26e35d79578ebe591cfe
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2516299
Reviewed-by: Tobias Tebbi <tebbi@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#70977}
parent 2ccd4dc5
@@ -3014,7 +3014,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
    }
    case kX64S128Zero: {
      XMMRegister dst = i.OutputSimd128Register();
      __ Xorps(dst, dst);
      __ Pxor(dst, dst);
      break;
    }
    case kX64S128AllOnes: {
......
@@ -2966,22 +2966,31 @@ void InstructionSelector::VisitS128Zero(Node* node) {
}

#define SIMD_TYPES_FOR_SPLAT(V) \
  V(F64x2)                      \
  V(I64x2)                      \
  V(I32x4)                      \
  V(I16x8)                      \
  V(I8x16)

#define VISIT_SIMD_SPLAT(Type)                               \
  void InstructionSelector::Visit##Type##Splat(Node* node) { \
    X64OperandGenerator g(this);                             \
    Emit(kX64##Type##Splat, g.DefineAsRegister(node),        \
         g.Use(node->InputAt(0)));                           \

// Splat with an optimization for const 0.
#define VISIT_SIMD_SPLAT(Type)                                               \
  void InstructionSelector::Visit##Type##Splat(Node* node) {                 \
    X64OperandGenerator g(this);                                             \
    Node* input = node->InputAt(0);                                          \
    if (g.CanBeImmediate(input) && g.GetImmediateIntegerValue(input) == 0) { \
      Emit(kX64S128Zero, g.DefineAsRegister(node));                          \
    } else {                                                                 \
      Emit(kX64##Type##Splat, g.DefineAsRegister(node), g.Use(input));       \
    }                                                                        \
  }
SIMD_TYPES_FOR_SPLAT(VISIT_SIMD_SPLAT)
#undef VISIT_SIMD_SPLAT
#undef SIMD_TYPES_FOR_SPLAT

void InstructionSelector::VisitF64x2Splat(Node* node) {
  X64OperandGenerator g(this);
  Emit(kX64F64x2Splat, g.DefineAsRegister(node), g.Use(node->InputAt(0)));
}

void InstructionSelector::VisitF32x4Splat(Node* node) {
  X64OperandGenerator g(this);
  InstructionOperand dst =
......
@@ -835,6 +835,12 @@ class V8_EXPORT_PRIVATE RawMachineAssembler {
    return AddNode(machine()->Float64SilenceNaN(), a);
  }

  // SIMD operations.
  Node* I64x2Splat(Node* a) { return AddNode(machine()->I64x2Splat(), a); }
  Node* I32x4Splat(Node* a) { return AddNode(machine()->I32x4Splat(), a); }
  Node* I16x8Splat(Node* a) { return AddNode(machine()->I16x8Splat(), a); }
  Node* I8x16Splat(Node* a) { return AddNode(machine()->I8x16Splat(), a); }

  // Stack operations.
  Node* LoadFramePointer() { return AddNode(machine()->LoadFramePointer()); }
  Node* LoadParentFramePointer() {
......
@@ -1888,6 +1888,53 @@ TEST_F(InstructionSelectorTest, LoadAndWord64ShiftRight32) {
  }
}

TEST_F(InstructionSelectorTest, SIMDSplatZero) {
  // Test optimization for splat of constant 0:
  // {i8x16,i16x8,i32x4,i64x2}.splat(const(0)) -> v128.zero().
  // Optimizations for f32x4.splat and f64x2.splat are not implemented since
  // they do not improve the codegen as much (same number of instructions).
  {
    StreamBuilder m(this, MachineType::Simd128());
    Node* const splat = m.I64x2Splat(m.Int64Constant(0));
    m.Return(splat);
    Stream s = m.Build();
    ASSERT_EQ(1U, s.size());
    EXPECT_EQ(kX64S128Zero, s[0]->arch_opcode());
    ASSERT_EQ(0U, s[0]->InputCount());
    EXPECT_EQ(1U, s[0]->OutputCount());
  }
  {
    StreamBuilder m(this, MachineType::Simd128());
    Node* const splat = m.I32x4Splat(m.Int32Constant(0));
    m.Return(splat);
    Stream s = m.Build();
    ASSERT_EQ(1U, s.size());
    EXPECT_EQ(kX64S128Zero, s[0]->arch_opcode());
    ASSERT_EQ(0U, s[0]->InputCount());
    EXPECT_EQ(1U, s[0]->OutputCount());
  }
  {
    StreamBuilder m(this, MachineType::Simd128());
    Node* const splat = m.I16x8Splat(m.Int32Constant(0));
    m.Return(splat);
    Stream s = m.Build();
    ASSERT_EQ(1U, s.size());
    EXPECT_EQ(kX64S128Zero, s[0]->arch_opcode());
    ASSERT_EQ(0U, s[0]->InputCount());
    EXPECT_EQ(1U, s[0]->OutputCount());
  }
  {
    StreamBuilder m(this, MachineType::Simd128());
    Node* const splat = m.I8x16Splat(m.Int32Constant(0));
    m.Return(splat);
    Stream s = m.Build();
    ASSERT_EQ(1U, s.size());
    EXPECT_EQ(kX64S128Zero, s[0]->arch_opcode());
    ASSERT_EQ(0U, s[0]->InputCount());
    EXPECT_EQ(1U, s[0]->OutputCount());
  }
}

}  // namespace compiler
}  // namespace internal
}  // namespace v8