Commit 0d886c56 authored by Ng Zhi An, committed by Commit Bot

[wasm-simd][x64] Optimize codegen when shift is constant

This optimizes i8x16 shifts when the shift value is constant. It brings
generated instruction counts down from 10 to 6 (unsigned), and 9 to 5
(signed).

For Shl and ShrU, we use a word (16-bit) shift, then mask away the high
(shru) or low (shl) bits to achieve a byte shift. Most of the
instructions are dedicated to building the mask.

Bug: v8:10115
Change-Id: I1d5c0e0fb779eeb7e0185d3cb7fd595837fd8daf
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2106293
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#66779}
parent aff14d7e
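The word-shift-plus-mask identity the message describes can be checked exhaustively in scalar code: a 16-bit shift only corrupts the bits that cross a byte boundary, and the byte mask (0xff << shift for shl, 0xff >> shift for shr_u) clears exactly those bits. A minimal standalone C++ sketch of the identity (illustration only, not part of this commit):

#include <cassert>
#include <cstdint>

int main() {
  // For every shift amount and every pair of adjacent bytes packed into a
  // 16-bit word, a word shift followed by a per-byte mask must agree with
  // a true per-byte shift.
  for (int shift = 0; shift < 8; ++shift) {
    const uint8_t shl_mask = static_cast<uint8_t>(0xff << shift);  // clears low bits
    const uint8_t shr_mask = static_cast<uint8_t>(0xff >> shift);  // clears high bits
    for (int hi = 0; hi < 256; ++hi) {
      for (int lo = 0; lo < 256; ++lo) {
        const uint16_t word = static_cast<uint16_t>((hi << 8) | lo);
        // i8x16.shl: psllw spills the low byte's top bits into the high
        // byte; ANDing with 0xff << shift clears them again.
        const uint16_t shl = static_cast<uint16_t>(word << shift);
        assert((static_cast<uint8_t>(shl >> 8) & shl_mask) ==
               static_cast<uint8_t>(hi << shift));
        assert((static_cast<uint8_t>(shl) & shl_mask) ==
               static_cast<uint8_t>(lo << shift));
        // i8x16.shr_u: psrlw spills the high byte's low bits into the low
        // byte; ANDing with 0xff >> shift clears them again.
        const uint16_t shr = static_cast<uint16_t>(word >> shift);
        assert((static_cast<uint8_t>(shr >> 8) & shr_mask) ==
               static_cast<uint8_t>(hi >> shift));
        assert((static_cast<uint8_t>(shr) & shr_mask) ==
               static_cast<uint8_t>(lo >> shift));
      }
    }
  }
  return 0;
}

No assert fires for any shift/byte combination, which is why the immediate paths in the diff below can replace the runtime mask computation with a precomputed constant.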
@@ -189,7 +189,9 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
   AVX_OP(Pavgb, pavgb)
   AVX_OP(Pavgw, pavgw)
   AVX_OP(Psrad, psrad)
+  AVX_OP(Psllw, psllw)
   AVX_OP(Psllq, psllq)
+  AVX_OP(Psrlw, psrlw)
   AVX_OP(Psrld, psrld)
   AVX_OP(Psrlq, psrlq)
   AVX_OP(Paddd, paddd)
@@ -3345,42 +3345,64 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     case kX64I8x16Shl: {
       XMMRegister dst = i.OutputSimd128Register();
       DCHECK_EQ(dst, i.InputSimd128Register(0));
-      // Temp registers for shift mask andadditional moves to XMM registers.
+      // Temp registers for shift mask and additional moves to XMM registers.
       Register tmp = i.ToRegister(instr->TempAt(0));
       XMMRegister tmp_simd = i.TempSimd128Register(1);
-      Register shift = i.InputRegister(1);
-      // Mask off the unwanted bits before word-shifting.
-      __ pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
-      // Take shift value modulo 8.
-      __ andq(shift, Immediate(7));
-      __ movq(tmp, shift);
-      __ addq(tmp, Immediate(8));
-      __ movq(tmp_simd, tmp);
-      __ psrlw(kScratchDoubleReg, tmp_simd);
-      __ packuswb(kScratchDoubleReg, kScratchDoubleReg);
-      __ pand(dst, kScratchDoubleReg);
-      __ movq(tmp_simd, shift);
-      __ psllw(dst, tmp_simd);
+      if (HasImmediateInput(instr, 1)) {
+        // Perform 16-bit shift, then mask away low bits.
+        uint8_t shift = i.InputInt3(1);
+        __ Psllw(dst, static_cast<byte>(shift));
+        uint8_t bmask = static_cast<uint8_t>(0xff << shift);
+        uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
+        __ movl(tmp, Immediate(mask));
+        __ Movd(tmp_simd, tmp);
+        __ Pshufd(tmp_simd, tmp_simd, static_cast<uint8_t>(0));
+        __ Pand(dst, tmp_simd);
+      } else {
+        Register shift = i.InputRegister(1);
+        // Mask off the unwanted bits before word-shifting.
+        __ pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
+        // Take shift value modulo 8.
+        __ andq(shift, Immediate(7));
+        __ movq(tmp, shift);
+        __ addq(tmp, Immediate(8));
+        __ movq(tmp_simd, tmp);
+        __ psrlw(kScratchDoubleReg, tmp_simd);
+        __ packuswb(kScratchDoubleReg, kScratchDoubleReg);
+        __ pand(dst, kScratchDoubleReg);
+        __ movq(tmp_simd, shift);
+        __ psllw(dst, tmp_simd);
+      }
       break;
     }
     case kX64I8x16ShrS: {
       XMMRegister dst = i.OutputSimd128Register();
       DCHECK_EQ(dst, i.InputSimd128Register(0));
-      // Temp registers for shift mask and additional moves to XMM registers.
-      Register tmp = i.ToRegister(instr->TempAt(0));
-      XMMRegister tmp_simd = i.TempSimd128Register(1);
-      // Unpack the bytes into words, do arithmetic shifts, and repack.
-      __ punpckhbw(kScratchDoubleReg, dst);
-      __ punpcklbw(dst, dst);
-      // Prepare shift value
-      __ movq(tmp, i.InputRegister(1));
-      // Take shift value modulo 8.
-      __ andq(tmp, Immediate(7));
-      __ addq(tmp, Immediate(8));
-      __ movq(tmp_simd, tmp);
-      __ psraw(kScratchDoubleReg, tmp_simd);
-      __ psraw(dst, tmp_simd);
-      __ packsswb(dst, kScratchDoubleReg);
+      if (HasImmediateInput(instr, 1)) {
+        __ punpckhbw(kScratchDoubleReg, dst);
+        __ punpcklbw(dst, dst);
+        uint8_t shift = i.InputInt3(1) + 8;
+        __ psraw(kScratchDoubleReg, shift);
+        __ psraw(dst, shift);
+        __ packsswb(dst, kScratchDoubleReg);
+      } else {
+        // Temp registers for shift mask and additional moves to XMM registers.
+        Register tmp = i.ToRegister(instr->TempAt(0));
+        XMMRegister tmp_simd = i.TempSimd128Register(1);
+        // Unpack the bytes into words, do arithmetic shifts, and repack.
+        __ punpckhbw(kScratchDoubleReg, dst);
+        __ punpcklbw(dst, dst);
+        // Prepare shift value
+        __ movq(tmp, i.InputRegister(1));
+        // Take shift value modulo 8.
+        __ andq(tmp, Immediate(7));
+        __ addq(tmp, Immediate(8));
+        __ movq(tmp_simd, tmp);
+        __ psraw(kScratchDoubleReg, tmp_simd);
+        __ psraw(dst, tmp_simd);
+        __ packsswb(dst, kScratchDoubleReg);
+      }
       break;
     }
     case kX64I8x16Add: {
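The immediate path for kX64I8x16ShrS above leans on punpcklbw(dst, dst) pairing each byte with itself, so every 16-bit lane holds (b << 8) | b; an arithmetic word shift by shift + 8 then leaves exactly the sign-extended byte result, and packsswb repacks it without saturating. A scalar C++ sketch of that lane identity (illustration only, not part of this commit):

#include <cassert>
#include <cstdint>

int main() {
  for (int shift = 0; shift < 8; ++shift) {
    for (int b = 0; b < 256; ++b) {
      // punpcklbw(dst, dst) turns byte b into the 16-bit lane (b << 8) | b
      // (the conversion wraps to two's complement on mainstream compilers).
      const int16_t lane = static_cast<int16_t>((b << 8) | b);
      // psraw by shift + 8 (at most 15, a legal word-shift count) yields the
      // arithmetically shifted byte, already sign-extended; >> on a negative
      // value is an arithmetic shift on all mainstream compilers.
      const int shifted = lane >> (shift + 8);
      assert(shifted == static_cast<int8_t>(b) >> shift);
      // The result stays in int8 range, so the final packsswb is lossless.
      assert(shifted >= -128 && shifted <= 127);
    }
  }
  return 0;
}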
@@ -3478,17 +3500,30 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       // Temp registers for shift mask and additional moves to XMM registers.
       Register tmp = i.ToRegister(instr->TempAt(0));
       XMMRegister tmp_simd = i.TempSimd128Register(1);
-      __ punpckhbw(kScratchDoubleReg, dst);
-      __ punpcklbw(dst, dst);
-      // Prepare shift value
-      __ movq(tmp, i.InputRegister(1));
-      // Take shift value modulo 8.
-      __ andq(tmp, Immediate(7));
-      __ addq(tmp, Immediate(8));
-      __ movq(tmp_simd, tmp);
-      __ psrlw(kScratchDoubleReg, tmp_simd);
-      __ psrlw(dst, tmp_simd);
-      __ packuswb(dst, kScratchDoubleReg);
+      if (HasImmediateInput(instr, 1)) {
+        // Perform 16-bit shift, then mask away high bits.
+        uint8_t shift = i.InputInt3(1);
+        __ Psrlw(dst, static_cast<byte>(shift));
+        uint8_t bmask = 0xff >> shift;
+        uint32_t mask = bmask << 24 | bmask << 16 | bmask << 8 | bmask;
+        __ movl(tmp, Immediate(mask));
+        __ Movd(tmp_simd, tmp);
+        __ Pshufd(tmp_simd, tmp_simd, static_cast<byte>(0));
+        __ Pand(dst, tmp_simd);
+      } else {
+        __ punpckhbw(kScratchDoubleReg, dst);
+        __ punpcklbw(dst, dst);
+        // Prepare shift value
+        __ movq(tmp, i.InputRegister(1));
+        // Take shift value modulo 8.
+        __ andq(tmp, Immediate(7));
+        __ addq(tmp, Immediate(8));
+        __ movq(tmp_simd, tmp);
+        __ psrlw(kScratchDoubleReg, tmp_simd);
+        __ psrlw(dst, tmp_simd);
+        __ packuswb(dst, kScratchDoubleReg);
+      }
       break;
     }
     case kX64I8x16AddSaturateU: {
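For readers without a V8 checkout, the constant-shift shr_u sequence above maps directly onto SSE2 intrinsics; _mm_set1_epi8 plays the role of the movl/Movd/Pshufd mask broadcast. A sketch, not V8 code, and the helper name I8x16ShrU is made up:

#include <emmintrin.h>  // SSE2

#include <cassert>
#include <cstdint>

// i8x16.shr_u with a constant shift: one 16-bit shift plus one AND,
// mirroring the kX64I8x16ShrU immediate path above.
template <int kShift>
__m128i I8x16ShrU(__m128i v) {
  const __m128i mask = _mm_set1_epi8(static_cast<char>(0xff >> kShift));
  return _mm_and_si128(_mm_srli_epi16(v, kShift), mask);
}

int main() {
  uint8_t in[16], out[16];
  for (int i = 0; i < 16; ++i) in[i] = static_cast<uint8_t>(17 * i + 3);
  const __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(in));
  _mm_storeu_si128(reinterpret_cast<__m128i*>(out), I8x16ShrU<3>(v));
  for (int i = 0; i < 16; ++i) assert(out[i] == in[i] >> 3);
  return 0;
}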
@@ -2742,7 +2742,6 @@ VISIT_ATOMIC_BINOP(Xor)
 #define SIMD_NARROW_SHIFT_OPCODES(V) \
   V(I8x16Shl)                        \
-  V(I8x16ShrS)                       \
   V(I8x16ShrU)
 #define SIMD_ANYTRUE_LIST(V) \
@@ -2820,9 +2819,15 @@ SIMD_SHIFT_OPCODES(VISIT_SIMD_SHIFT)
 #define VISIT_SIMD_NARROW_SHIFT(Opcode)                                       \
   void InstructionSelector::Visit##Opcode(Node* node) {                       \
     X64OperandGenerator g(this);                                              \
     InstructionOperand temps[] = {g.TempRegister(), g.TempSimd128Register()}; \
-    Emit(kX64##Opcode, g.DefineSameAsFirst(node),                             \
-         g.UseUniqueRegister(node->InputAt(0)),                               \
-         g.UseUniqueRegister(node->InputAt(1)), arraysize(temps), temps);     \
+    if (g.CanBeImmediate(node->InputAt(1))) {                                 \
+      Emit(kX64##Opcode, g.DefineSameAsFirst(node),                           \
+           g.UseRegister(node->InputAt(0)), g.UseImmediate(node->InputAt(1)), \
+           arraysize(temps), temps);                                          \
+    } else {                                                                  \
+      Emit(kX64##Opcode, g.DefineSameAsFirst(node),                           \
+           g.UseUniqueRegister(node->InputAt(0)),                             \
+           g.UseUniqueRegister(node->InputAt(1)), arraysize(temps), temps);   \
+    }                                                                         \
   }
 SIMD_NARROW_SHIFT_OPCODES(VISIT_SIMD_NARROW_SHIFT)
 #undef VISIT_SIMD_NARROW_SHIFT
@@ -3032,6 +3037,19 @@ void InstructionSelector::VisitI8x16Mul(Node* node) {
        g.UseUniqueRegister(node->InputAt(1)), arraysize(temps), temps);
 }
 
+void InstructionSelector::VisitI8x16ShrS(Node* node) {
+  X64OperandGenerator g(this);
+  if (g.CanBeImmediate(node->InputAt(1))) {
+    Emit(kX64I8x16ShrS, g.DefineSameAsFirst(node),
+         g.UseRegister(node->InputAt(0)), g.UseImmediate(node->InputAt(1)));
+  } else {
+    InstructionOperand temps[] = {g.TempRegister(), g.TempSimd128Register()};
+    Emit(kX64I8x16ShrS, g.DefineSameAsFirst(node),
+         g.UseUniqueRegister(node->InputAt(0)),
+         g.UseUniqueRegister(node->InputAt(1)), arraysize(temps), temps);
+  }
+}
+
 void InstructionSelector::VisitInt32AbsWithOverflow(Node* node) {
   UNREACHABLE();
 }