Commit 80508f14 authored by Bill Budge, committed by Commit Bot

[wasm simd] Improve code generation for I8x16 Multiply on ia32

- Uses a different technique to do the multiplies, saving one
  instruction and one temporary register on SSE and AVX.

Bug: v8:6020
Change-Id: I4f3ff6186dae5eb10d90cda31c7d16b651a00d7e
Reviewed-on: https://chromium-review.googlesource.com/1132522
Reviewed-by: Jing Bao <jing.bao@intel.com>
Commit-Queue: Bill Budge <bbudge@chromium.org>
Cr-Commit-Position: refs/heads/master@{#54370}
parent 2114c6ee
...@@ -2851,43 +2851,42 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -2851,43 +2851,42 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break; break;
} }
case kSSEI8x16Mul: { case kSSEI8x16Mul: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0)); XMMRegister dst = i.OutputSimd128Register();
XMMRegister left = i.InputSimd128Register(0); DCHECK_EQ(dst, i.InputSimd128Register(0));
XMMRegister right = i.InputSimd128Register(1); XMMRegister right = i.InputSimd128Register(1);
XMMRegister t0 = i.ToSimd128Register(instr->TempAt(0)); XMMRegister tmp = i.ToSimd128Register(instr->TempAt(0));
XMMRegister t1 = i.ToSimd128Register(instr->TempAt(1));
// I16x8 view of I8x16 // I16x8 view of I8x16
// left = AAaa AAaa ... AAaa AAaa // left = AAaa AAaa ... AAaa AAaa
// right= BBbb BBbb ... BBbb BBbb // right= BBbb BBbb ... BBbb BBbb
// t0 = 00AA 00AA ... 00AA 00AA // t = 00AA 00AA ... 00AA 00AA
// t1 = 00BB 00BB ... 00BB 00BB // s = 00BB 00BB ... 00BB 00BB
__ movaps(t0, left); __ movaps(tmp, dst);
__ movaps(t1, right); __ movaps(kScratchDoubleReg, right);
__ pcmpeqb(kScratchDoubleReg, kScratchDoubleReg); __ psrlw(tmp, 8);
__ psrlw(t0, 8); __ psrlw(kScratchDoubleReg, 8);
__ psrlw(t1, 8); // dst = left * 256
__ psllw(dst, 8);
// left = I16x8Mul(left, right) // t = I16x8Mul(t, s)
// => __pp __pp ... __pp __pp
// t0 = I16x8Mul(t0, t1)
// => __PP __PP ... __PP __PP // => __PP __PP ... __PP __PP
__ psrlw(kScratchDoubleReg, 8); __ pmullw(tmp, kScratchDoubleReg);
__ pmullw(t0, t1); // dst = I16x8Mul(left * 256, right)
__ pmullw(left, right); // => pp__ pp__ ... pp__ pp__
__ pmullw(dst, right);
// t0 = I16x8Shl(t0, 8) // t = I16x8Shl(t, 8)
// => PP00 PP00 ... PP00 PP00 // => PP00 PP00 ... PP00 PP00
__ psllw(t0, 8); __ psllw(tmp, 8);
// left = I16x8And(left, 0x00ff) // dst = I16x8Shr(dst, 8)
// => 00pp 00pp ... 00pp 00pp // => 00pp 00pp ... 00pp 00pp
__ pand(left, kScratchDoubleReg); __ psrlw(dst, 8);
// left = I16x8Or(left, t0) // dst = I16x8Or(dst, t)
// => PPpp PPpp ... PPpp PPpp // => PPpp PPpp ... PPpp PPpp
__ por(left, t0); __ por(dst, tmp);
break; break;
} }
case kAVXI8x16Mul: { case kAVXI8x16Mul: {
...@@ -2895,39 +2894,39 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -2895,39 +2894,39 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
XMMRegister dst = i.OutputSimd128Register(); XMMRegister dst = i.OutputSimd128Register();
XMMRegister left = i.InputSimd128Register(0); XMMRegister left = i.InputSimd128Register(0);
XMMRegister right = i.InputSimd128Register(1); XMMRegister right = i.InputSimd128Register(1);
XMMRegister t0 = i.ToSimd128Register(instr->TempAt(0)); XMMRegister tmp = i.ToSimd128Register(instr->TempAt(0));
XMMRegister t1 = i.ToSimd128Register(instr->TempAt(1));
// I16x8 view of I8x16 // I16x8 view of I8x16
// left = AAaa AAaa ... AAaa AAaa // left = AAaa AAaa ... AAaa AAaa
// right= BBbb BBbb ... BBbb BBbb // right= BBbb BBbb ... BBbb BBbb
// t0 = 00AA 00AA ... 00AA 00AA // t = 00AA 00AA ... 00AA 00AA
// t1 = 00BB 00BB ... 00BB 00BB // s = 00BB 00BB ... 00BB 00BB
__ vpcmpeqb(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg); __ vpsrlw(tmp, left, 8);
__ vpsrlw(t0, left, 8); __ vpsrlw(kScratchDoubleReg, right, 8);
__ vpsrlw(t1, right, 8);
// dst = I16x8Mul(left, right) // t = I16x8Mul(t, s)
// => __pp __pp ... __pp __pp
__ vpsrlw(kScratchDoubleReg, kScratchDoubleReg, 8);
__ vpmullw(dst, left, right);
// t0 = I16x8Mul(t0, t1)
// => __PP __PP ... __PP __PP // => __PP __PP ... __PP __PP
__ vpmullw(t0, t0, t1); __ vpmullw(tmp, tmp, kScratchDoubleReg);
// t0 = I16x8Shl(t0, 8) // s = left * 256
// => PP00 PP00 ... PP00 PP00 __ vpsllw(kScratchDoubleReg, left, 8);
__ vpsllw(t0, t0, 8);
// dst = I16x8And(dst, 0x00ff) // dst = I16x8Mul(left * 256, right)
// => pp__ pp__ ... pp__ pp__
__ vpmullw(dst, kScratchDoubleReg, right);
// dst = I16x8Shr(dst, 8)
// => 00pp 00pp ... 00pp 00pp // => 00pp 00pp ... 00pp 00pp
__ vpand(dst, dst, kScratchDoubleReg); __ vpsrlw(dst, dst, 8);
// t = I16x8Shl(t, 8)
// => PP00 PP00 ... PP00 PP00
__ vpsllw(tmp, tmp, 8);
// dst = I16x8Or(dst, t0) // dst = I16x8Or(dst, t)
// => PPpp PPpp ... PPpp PPpp // => PPpp PPpp ... PPpp PPpp
__ vpor(dst, dst, t0); __ vpor(dst, dst, tmp);
break; break;
} }
case kSSEI8x16MinS: { case kSSEI8x16MinS: {
......
...@@ -1893,8 +1893,7 @@ void InstructionSelector::VisitI8x16Mul(Node* node) { ...@@ -1893,8 +1893,7 @@ void InstructionSelector::VisitI8x16Mul(Node* node) {
IA32OperandGenerator g(this); IA32OperandGenerator g(this);
InstructionOperand operand0 = g.UseUniqueRegister(node->InputAt(0)); InstructionOperand operand0 = g.UseUniqueRegister(node->InputAt(0));
InstructionOperand operand1 = g.UseUniqueRegister(node->InputAt(1)); InstructionOperand operand1 = g.UseUniqueRegister(node->InputAt(1));
InstructionOperand temps[] = {g.TempSimd128Register(), InstructionOperand temps[] = {g.TempSimd128Register()};
g.TempSimd128Register()};
if (IsSupported(AVX)) { if (IsSupported(AVX)) {
Emit(kAVXI8x16Mul, g.DefineAsRegister(node), operand0, operand1, Emit(kAVXI8x16Mul, g.DefineAsRegister(node), operand0, operand1,
arraysize(temps), temps); arraysize(temps), temps);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment