Commit 80508f14 authored by Bill Budge, committed by Commit Bot

[wasm simd] Improve code generation for I8x16 Multiply on ia32

- Uses a different technique to do the multiplies, saving one
  instruction and one temporary register on SSE and AVX.

Bug: v8:6020
Change-Id: I4f3ff6186dae5eb10d90cda31c7d16b651a00d7e
Reviewed-on: https://chromium-review.googlesource.com/1132522
Reviewed-by: Jing Bao <jing.bao@intel.com>
Commit-Queue: Bill Budge <bbudge@chromium.org>
Cr-Commit-Position: refs/heads/master@{#54370}
parent 2114c6ee
......@@ -2851,43 +2851,42 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kSSEI8x16Mul: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
XMMRegister left = i.InputSimd128Register(0);
XMMRegister dst = i.OutputSimd128Register();
DCHECK_EQ(dst, i.InputSimd128Register(0));
XMMRegister right = i.InputSimd128Register(1);
XMMRegister t0 = i.ToSimd128Register(instr->TempAt(0));
XMMRegister t1 = i.ToSimd128Register(instr->TempAt(1));
XMMRegister tmp = i.ToSimd128Register(instr->TempAt(0));
// I16x8 view of I8x16
// left = AAaa AAaa ... AAaa AAaa
// right= BBbb BBbb ... BBbb BBbb
// t0 = 00AA 00AA ... 00AA 00AA
// t1 = 00BB 00BB ... 00BB 00BB
__ movaps(t0, left);
__ movaps(t1, right);
__ pcmpeqb(kScratchDoubleReg, kScratchDoubleReg);
__ psrlw(t0, 8);
__ psrlw(t1, 8);
// t = 00AA 00AA ... 00AA 00AA
// s = 00BB 00BB ... 00BB 00BB
__ movaps(tmp, dst);
__ movaps(kScratchDoubleReg, right);
__ psrlw(tmp, 8);
__ psrlw(kScratchDoubleReg, 8);
// dst = left * 256
__ psllw(dst, 8);
// left = I16x8Mul(left, right)
// => __pp __pp ... __pp __pp
// t0 = I16x8Mul(t0, t1)
// t = I16x8Mul(t, s)
// => __PP __PP ... __PP __PP
__ psrlw(kScratchDoubleReg, 8);
__ pmullw(t0, t1);
__ pmullw(left, right);
__ pmullw(tmp, kScratchDoubleReg);
// dst = I16x8Mul(left * 256, right)
// => pp__ pp__ ... pp__ pp__
__ pmullw(dst, right);
// t0 = I16x8Shl(t0, 8)
// t = I16x8Shl(t, 8)
// => PP00 PP00 ... PP00 PP00
__ psllw(t0, 8);
__ psllw(tmp, 8);
// left = I16x8And(left, 0x00ff)
// dst = I16x8Shr(dst, 8)
// => 00pp 00pp ... 00pp 00pp
__ pand(left, kScratchDoubleReg);
__ psrlw(dst, 8);
// left = I16x8Or(left, t0)
// dst = I16x8Or(dst, t)
// => PPpp PPpp ... PPpp PPpp
__ por(left, t0);
__ por(dst, tmp);
break;
}
case kAVXI8x16Mul: {
......@@ -2895,39 +2894,39 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
XMMRegister dst = i.OutputSimd128Register();
XMMRegister left = i.InputSimd128Register(0);
XMMRegister right = i.InputSimd128Register(1);
XMMRegister t0 = i.ToSimd128Register(instr->TempAt(0));
XMMRegister t1 = i.ToSimd128Register(instr->TempAt(1));
XMMRegister tmp = i.ToSimd128Register(instr->TempAt(0));
// I16x8 view of I8x16
// left = AAaa AAaa ... AAaa AAaa
// right= BBbb BBbb ... BBbb BBbb
// t0 = 00AA 00AA ... 00AA 00AA
// t1 = 00BB 00BB ... 00BB 00BB
__ vpcmpeqb(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
__ vpsrlw(t0, left, 8);
__ vpsrlw(t1, right, 8);
// t = 00AA 00AA ... 00AA 00AA
// s = 00BB 00BB ... 00BB 00BB
__ vpsrlw(tmp, left, 8);
__ vpsrlw(kScratchDoubleReg, right, 8);
// dst = I16x8Mul(left, right)
// => __pp __pp ... __pp __pp
__ vpsrlw(kScratchDoubleReg, kScratchDoubleReg, 8);
__ vpmullw(dst, left, right);
// t0 = I16x8Mul(t0, t1)
// t = I16x8Mul(t, s)
// => __PP __PP ... __PP __PP
__ vpmullw(t0, t0, t1);
__ vpmullw(tmp, tmp, kScratchDoubleReg);
// t0 = I16x8Shl(t0, 8)
// => PP00 PP00 ... PP00 PP00
__ vpsllw(t0, t0, 8);
// s = left * 256
__ vpsllw(kScratchDoubleReg, left, 8);
// dst = I16x8And(dst, 0x00ff)
// dst = I16x8Mul(left * 256, right)
// => pp__ pp__ ... pp__ pp__
__ vpmullw(dst, kScratchDoubleReg, right);
// dst = I16x8Shr(dst, 8)
// => 00pp 00pp ... 00pp 00pp
__ vpand(dst, dst, kScratchDoubleReg);
__ vpsrlw(dst, dst, 8);
// t = I16x8Shl(t, 8)
// => PP00 PP00 ... PP00 PP00
__ vpsllw(tmp, tmp, 8);
// dst = I16x8Or(dst, t0)
// dst = I16x8Or(dst, t)
// => PPpp PPpp ... PPpp PPpp
__ vpor(dst, dst, t0);
__ vpor(dst, dst, tmp);
break;
}
case kSSEI8x16MinS: {
......
......@@ -1893,8 +1893,7 @@ void InstructionSelector::VisitI8x16Mul(Node* node) {
IA32OperandGenerator g(this);
InstructionOperand operand0 = g.UseUniqueRegister(node->InputAt(0));
InstructionOperand operand1 = g.UseUniqueRegister(node->InputAt(1));
InstructionOperand temps[] = {g.TempSimd128Register(),
g.TempSimd128Register()};
InstructionOperand temps[] = {g.TempSimd128Register()};
if (IsSupported(AVX)) {
Emit(kAVXI8x16Mul, g.DefineAsRegister(node), operand0, operand1,
arraysize(temps), temps);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment