Commit 8e9ad4f8, authored by Zhi An Ng, committed by Commit Bot

[x64][ia32][wasm-simd] Optimize v128.bitselect

Couple of optimizations for v128.bitselect on both ia32 and x64.

1. Remove an extra movaps when AVX is supported, since we have 3-operand
instructions
2. Tweak the algorithm from:
     xor(and(xor(src1, src2), mask), src2)

   To:
     or(and(src1, mask), andnot(src2, mask))
   It is easier to read and understand, and also eliminates a dependency
   chain (on kScratchDoubleReg) in the older algorithm.
3. On AVX, use the integer forms of the logical ops. Older processors have
higher throughput on these, compared to the floating point ops. However, the
integer forms are 1 byte longer on SSE, so there we stick to the floating
point ops.

For AVX, this reduces instruction count from 9948 to 9868.

Change-Id: Idd5d26b99a76255dbfa63e2c304e6af3760c4ec6
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2591859
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71845}
parent 3e6f5fe1
...@@ -19,6 +19,7 @@ ...@@ -19,6 +19,7 @@
V(paddusb, 66, 0F, DC) \ V(paddusb, 66, 0F, DC) \
V(paddusw, 66, 0F, DD) \ V(paddusw, 66, 0F, DD) \
V(pand, 66, 0F, DB) \ V(pand, 66, 0F, DB) \
V(pandn, 66, 0F, DF) \
V(pcmpeqb, 66, 0F, 74) \ V(pcmpeqb, 66, 0F, 74) \
V(pcmpeqw, 66, 0F, 75) \ V(pcmpeqw, 66, 0F, 75) \
V(pcmpeqd, 66, 0F, 76) \ V(pcmpeqd, 66, 0F, 76) \
......
...@@ -76,6 +76,7 @@ ...@@ -76,6 +76,7 @@
V(paddusb, 66, 0F, DC) \ V(paddusb, 66, 0F, DC) \
V(paddusw, 66, 0F, DD) \ V(paddusw, 66, 0F, DD) \
V(pmaxub, 66, 0F, DE) \ V(pmaxub, 66, 0F, DE) \
V(pandn, 66, 0F, DF) \
V(pavgb, 66, 0F, E0) \ V(pavgb, 66, 0F, E0) \
V(psraw, 66, 0F, E1) \ V(psraw, 66, 0F, E1) \
V(psrad, 66, 0F, E2) \ V(psrad, 66, 0F, E2) \
......
...@@ -3883,19 +3883,20 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -3883,19 +3883,20 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0)); DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
// Mask used here is stored in dst. // Mask used here is stored in dst.
XMMRegister dst = i.OutputSimd128Register(); XMMRegister dst = i.OutputSimd128Register();
__ movaps(kScratchDoubleReg, i.InputSimd128Register(1)); // Use float ops as they are 1 byte shorter than int ops.
__ xorps(kScratchDoubleReg, i.InputSimd128Register(2)); __ movaps(kScratchDoubleReg, i.InputSimd128Register(0));
__ andps(dst, kScratchDoubleReg); __ andnps(kScratchDoubleReg, i.InputSimd128Register(2));
__ xorps(dst, i.InputSimd128Register(2)); __ andps(dst, i.InputSimd128Register(1));
__ orps(dst, kScratchDoubleReg);
break; break;
} }
case kAVXS128Select: { case kAVXS128Select: {
CpuFeatureScope avx_scope(tasm(), AVX); CpuFeatureScope avx_scope(tasm(), AVX);
XMMRegister dst = i.OutputSimd128Register(); XMMRegister dst = i.OutputSimd128Register();
__ vxorps(kScratchDoubleReg, i.InputSimd128Register(2), XMMRegister mask = i.InputSimd128Register(0);
i.InputOperand(1)); __ vpandn(kScratchDoubleReg, mask, i.InputSimd128Register(2));
__ vandps(kScratchDoubleReg, kScratchDoubleReg, i.InputOperand(0)); __ vpand(dst, i.InputSimd128Register(1), mask);
__ vxorps(dst, kScratchDoubleReg, i.InputSimd128Register(2)); __ vpor(dst, dst, kScratchDoubleReg);
break; break;
} }
case kIA32S128AndNot: { case kIA32S128AndNot: {
......
...@@ -3754,12 +3754,25 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -3754,12 +3754,25 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break; break;
} }
case kX64S128Select: { case kX64S128Select: {
// Mask used here is stored in dst. // v128.select = v128.or(v128.and(v1, c), v128.andnot(v2, c)).
// pandn(x, y) = !x & y, so we have to flip the mask and input.
XMMRegister dst = i.OutputSimd128Register(); XMMRegister dst = i.OutputSimd128Register();
__ Movaps(kScratchDoubleReg, i.InputSimd128Register(1)); XMMRegister mask = i.InputSimd128Register(0);
__ Xorps(kScratchDoubleReg, i.InputSimd128Register(2)); XMMRegister src1 = i.InputSimd128Register(1);
__ Andps(dst, kScratchDoubleReg); XMMRegister src2 = i.InputSimd128Register(2);
__ Xorps(dst, i.InputSimd128Register(2)); if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vpandn(kScratchDoubleReg, mask, src2);
__ vpand(dst, src1, mask);
__ vpor(dst, dst, kScratchDoubleReg);
} else {
DCHECK_EQ(dst, mask);
// Use float ops as they are 1 byte shorter than int ops.
__ movaps(kScratchDoubleReg, mask);
__ andnps(kScratchDoubleReg, src2);
__ andps(dst, src1);
__ orps(dst, kScratchDoubleReg);
}
break; break;
} }
case kX64S128AndNot: { case kX64S128AndNot: {
......
...@@ -3161,9 +3161,10 @@ SIMD_ALLTRUE_LIST(VISIT_SIMD_ALLTRUE) ...@@ -3161,9 +3161,10 @@ SIMD_ALLTRUE_LIST(VISIT_SIMD_ALLTRUE)
void InstructionSelector::VisitS128Select(Node* node) { void InstructionSelector::VisitS128Select(Node* node) {
X64OperandGenerator g(this); X64OperandGenerator g(this);
Emit(kX64S128Select, g.DefineSameAsFirst(node), InstructionOperand dst =
g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)), IsSupported(AVX) ? g.DefineAsRegister(node) : g.DefineSameAsFirst(node);
g.UseRegister(node->InputAt(2))); Emit(kX64S128Select, dst, g.UseRegister(node->InputAt(0)),
g.UseRegister(node->InputAt(1)), g.UseRegister(node->InputAt(2)));
} }
namespace { namespace {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment