Commit 0d886c56 authored by Ng Zhi An, committed by Commit Bot

[wasm-simd][x64] Optimize codegen when shift is constant

This optimizes i8x16 shifts when the shift value is constant. It brings
generated instruction counts down from 10 to 6 (unsigned), and 9 to 5
(signed).

For shl and shru, we use a word (16-bit) shift, then mask away the high
(shru) or low (shl) bits to achieve a byte shift. Most of the instructions
are dedicated to building the mask.

Bug: v8:10115
Change-Id: I1d5c0e0fb779eeb7e0185d3cb7fd595837fd8daf
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2106293
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#66779}
parent aff14d7e
...@@ -189,7 +189,9 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase { ...@@ -189,7 +189,9 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_OP(Pavgb, pavgb) AVX_OP(Pavgb, pavgb)
AVX_OP(Pavgw, pavgw) AVX_OP(Pavgw, pavgw)
AVX_OP(Psrad, psrad) AVX_OP(Psrad, psrad)
AVX_OP(Psllw, psllw)
AVX_OP(Psllq, psllq) AVX_OP(Psllq, psllq)
AVX_OP(Psrlw, psrlw)
AVX_OP(Psrld, psrld) AVX_OP(Psrld, psrld)
AVX_OP(Psrlq, psrlq) AVX_OP(Psrlq, psrlq)
AVX_OP(Paddd, paddd) AVX_OP(Paddd, paddd)
......
...@@ -3345,42 +3345,64 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -3345,42 +3345,64 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
case kX64I8x16Shl: { case kX64I8x16Shl: {
XMMRegister dst = i.OutputSimd128Register(); XMMRegister dst = i.OutputSimd128Register();
DCHECK_EQ(dst, i.InputSimd128Register(0)); DCHECK_EQ(dst, i.InputSimd128Register(0));
// Temp registers for shift mask andadditional moves to XMM registers. // Temp registers for shift mask and additional moves to XMM registers.
Register tmp = i.ToRegister(instr->TempAt(0)); Register tmp = i.ToRegister(instr->TempAt(0));
XMMRegister tmp_simd = i.TempSimd128Register(1); XMMRegister tmp_simd = i.TempSimd128Register(1);
Register shift = i.InputRegister(1); if (HasImmediateInput(instr, 1)) {
// Mask off the unwanted bits before word-shifting. // Perform 16-bit shift, then mask away low bits.
__ pcmpeqw(kScratchDoubleReg, kScratchDoubleReg); uint8_t shift = i.InputInt3(1);
// Take shift value modulo 8. __ Psllw(dst, static_cast<byte>(shift));
__ andq(shift, Immediate(7));
__ movq(tmp, shift); uint8_t bmask = static_cast<uint8_t>(0xff << shift);
__ addq(tmp, Immediate(8)); uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
__ movq(tmp_simd, tmp); __ movl(tmp, Immediate(mask));
__ psrlw(kScratchDoubleReg, tmp_simd); __ Movd(tmp_simd, tmp);
__ packuswb(kScratchDoubleReg, kScratchDoubleReg); __ Pshufd(tmp_simd, tmp_simd, static_cast<uint8_t>(0));
__ pand(dst, kScratchDoubleReg); __ Pand(dst, tmp_simd);
__ movq(tmp_simd, shift); } else {
__ psllw(dst, tmp_simd); Register shift = i.InputRegister(1);
// Mask off the unwanted bits before word-shifting.
__ pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
// Take shift value modulo 8.
__ andq(shift, Immediate(7));
__ movq(tmp, shift);
__ addq(tmp, Immediate(8));
__ movq(tmp_simd, tmp);
__ psrlw(kScratchDoubleReg, tmp_simd);
__ packuswb(kScratchDoubleReg, kScratchDoubleReg);
__ pand(dst, kScratchDoubleReg);
__ movq(tmp_simd, shift);
__ psllw(dst, tmp_simd);
}
break; break;
} }
case kX64I8x16ShrS: { case kX64I8x16ShrS: {
XMMRegister dst = i.OutputSimd128Register(); XMMRegister dst = i.OutputSimd128Register();
DCHECK_EQ(dst, i.InputSimd128Register(0)); DCHECK_EQ(dst, i.InputSimd128Register(0));
// Temp registers for shift mask andadditional moves to XMM registers. if (HasImmediateInput(instr, 1)) {
Register tmp = i.ToRegister(instr->TempAt(0)); __ punpckhbw(kScratchDoubleReg, dst);
XMMRegister tmp_simd = i.TempSimd128Register(1); __ punpcklbw(dst, dst);
// Unpack the bytes into words, do arithmetic shifts, and repack. uint8_t shift = i.InputInt3(1) + 8;
__ punpckhbw(kScratchDoubleReg, dst); __ psraw(kScratchDoubleReg, shift);
__ punpcklbw(dst, dst); __ psraw(dst, shift);
// Prepare shift value __ packsswb(dst, kScratchDoubleReg);
__ movq(tmp, i.InputRegister(1)); } else {
// Take shift value modulo 8. // Temp registers for shift mask and additional moves to XMM registers.
__ andq(tmp, Immediate(7)); Register tmp = i.ToRegister(instr->TempAt(0));
__ addq(tmp, Immediate(8)); XMMRegister tmp_simd = i.TempSimd128Register(1);
__ movq(tmp_simd, tmp); // Unpack the bytes into words, do arithmetic shifts, and repack.
__ psraw(kScratchDoubleReg, tmp_simd); __ punpckhbw(kScratchDoubleReg, dst);
__ psraw(dst, tmp_simd); __ punpcklbw(dst, dst);
__ packsswb(dst, kScratchDoubleReg); // Prepare shift value
__ movq(tmp, i.InputRegister(1));
// Take shift value modulo 8.
__ andq(tmp, Immediate(7));
__ addq(tmp, Immediate(8));
__ movq(tmp_simd, tmp);
__ psraw(kScratchDoubleReg, tmp_simd);
__ psraw(dst, tmp_simd);
__ packsswb(dst, kScratchDoubleReg);
}
break; break;
} }
case kX64I8x16Add: { case kX64I8x16Add: {
...@@ -3478,17 +3500,30 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -3478,17 +3500,30 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
// Temp registers for shift mask and additional moves to XMM registers. // Temp registers for shift mask and additional moves to XMM registers.
Register tmp = i.ToRegister(instr->TempAt(0)); Register tmp = i.ToRegister(instr->TempAt(0));
XMMRegister tmp_simd = i.TempSimd128Register(1); XMMRegister tmp_simd = i.TempSimd128Register(1);
__ punpckhbw(kScratchDoubleReg, dst); if (HasImmediateInput(instr, 1)) {
__ punpcklbw(dst, dst); // Perform 16-bit shift, then mask away high bits.
// Prepare shift value uint8_t shift = i.InputInt3(1);
__ movq(tmp, i.InputRegister(1)); __ Psrlw(dst, static_cast<byte>(shift));
// Take shift value modulo 8.
__ andq(tmp, Immediate(7)); uint8_t bmask = 0xff >> shift;
__ addq(tmp, Immediate(8)); uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
__ movq(tmp_simd, tmp); __ movl(tmp, Immediate(mask));
__ psrlw(kScratchDoubleReg, tmp_simd); __ Movd(tmp_simd, tmp);
__ psrlw(dst, tmp_simd); __ Pshufd(tmp_simd, tmp_simd, static_cast<byte>(0));
__ packuswb(dst, kScratchDoubleReg); __ Pand(dst, tmp_simd);
} else {
__ punpckhbw(kScratchDoubleReg, dst);
__ punpcklbw(dst, dst);
// Prepare shift value
__ movq(tmp, i.InputRegister(1));
// Take shift value modulo 8.
__ andq(tmp, Immediate(7));
__ addq(tmp, Immediate(8));
__ movq(tmp_simd, tmp);
__ psrlw(kScratchDoubleReg, tmp_simd);
__ psrlw(dst, tmp_simd);
__ packuswb(dst, kScratchDoubleReg);
}
break; break;
} }
case kX64I8x16AddSaturateU: { case kX64I8x16AddSaturateU: {
......
...@@ -2742,7 +2742,6 @@ VISIT_ATOMIC_BINOP(Xor) ...@@ -2742,7 +2742,6 @@ VISIT_ATOMIC_BINOP(Xor)
#define SIMD_NARROW_SHIFT_OPCODES(V) \ #define SIMD_NARROW_SHIFT_OPCODES(V) \
V(I8x16Shl) \ V(I8x16Shl) \
V(I8x16ShrS) \
V(I8x16ShrU) V(I8x16ShrU)
#define SIMD_ANYTRUE_LIST(V) \ #define SIMD_ANYTRUE_LIST(V) \
...@@ -2820,9 +2819,15 @@ SIMD_SHIFT_OPCODES(VISIT_SIMD_SHIFT) ...@@ -2820,9 +2819,15 @@ SIMD_SHIFT_OPCODES(VISIT_SIMD_SHIFT)
void InstructionSelector::Visit##Opcode(Node* node) { \ void InstructionSelector::Visit##Opcode(Node* node) { \
X64OperandGenerator g(this); \ X64OperandGenerator g(this); \
InstructionOperand temps[] = {g.TempRegister(), g.TempSimd128Register()}; \ InstructionOperand temps[] = {g.TempRegister(), g.TempSimd128Register()}; \
Emit(kX64##Opcode, g.DefineSameAsFirst(node), \ if (g.CanBeImmediate(node->InputAt(1))) { \
g.UseUniqueRegister(node->InputAt(0)), \ Emit(kX64##Opcode, g.DefineSameAsFirst(node), \
g.UseUniqueRegister(node->InputAt(1)), arraysize(temps), temps); \ g.UseRegister(node->InputAt(0)), g.UseImmediate(node->InputAt(1)), \
arraysize(temps), temps); \
} else { \
Emit(kX64##Opcode, g.DefineSameAsFirst(node), \
g.UseUniqueRegister(node->InputAt(0)), \
g.UseUniqueRegister(node->InputAt(1)), arraysize(temps), temps); \
} \
} }
SIMD_NARROW_SHIFT_OPCODES(VISIT_SIMD_NARROW_SHIFT) SIMD_NARROW_SHIFT_OPCODES(VISIT_SIMD_NARROW_SHIFT)
#undef VISIT_SIMD_NARROW_SHIFT #undef VISIT_SIMD_NARROW_SHIFT
...@@ -3032,6 +3037,19 @@ void InstructionSelector::VisitI8x16Mul(Node* node) { ...@@ -3032,6 +3037,19 @@ void InstructionSelector::VisitI8x16Mul(Node* node) {
g.UseUniqueRegister(node->InputAt(1)), arraysize(temps), temps); g.UseUniqueRegister(node->InputAt(1)), arraysize(temps), temps);
} }
// Instruction selection for the signed i8x16 right shift: emits
// kX64I8x16ShrS with the output defined same-as-first-input. Input 0 is the
// vector operand and input 1 is the shift amount.
void InstructionSelector::VisitI8x16ShrS(Node* node) {
X64OperandGenerator g(this);
if (g.CanBeImmediate(node->InputAt(1))) {
// Shift amount can be encoded as an immediate: pass it as an immediate
// operand and request no temp registers.
Emit(kX64I8x16ShrS, g.DefineSameAsFirst(node),
g.UseRegister(node->InputAt(0)), g.UseImmediate(node->InputAt(1)));
} else {
// Shift amount is only known at run time: reserve one general-purpose temp
// and one SIMD temp, and place both inputs in unique registers.
InstructionOperand temps[] = {g.TempRegister(), g.TempSimd128Register()};
Emit(kX64I8x16ShrS, g.DefineSameAsFirst(node),
g.UseUniqueRegister(node->InputAt(0)),
g.UseUniqueRegister(node->InputAt(1)), arraysize(temps), temps);
}
}
void InstructionSelector::VisitInt32AbsWithOverflow(Node* node) { void InstructionSelector::VisitInt32AbsWithOverflow(Node* node) {
UNREACHABLE(); UNREACHABLE();
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment