Commit a8b789fc, authored by Ng Zhi An, committed by Commit Bot

[wasm-simd] Improve codegen for S8x16Shuffle

Instead of putting the 16 immediate bytes on the stack,
we move them into a temporary register.

The instruction-selector then has to change, to ensure that
the operands are distinct from the temporary.

Tested on the two workloads given in
https://github.com/zeux/wasm-simd/issues/2#issuecomment-614399004

For the slow workload, the row "filter:" oct12 goes from ~50ms to ~27ms;
the rest of the figures look about the same or slightly faster.
For the optimal workload, the same figure goes from ~25ms to ~24ms;
the rest of the figures look slightly faster.

Raw outputs are uploaded to the bug.

Bug: v8:10117
Change-Id: I7f77a3066b5e24584f1c01574aa9311f56bd7fb4
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2152853
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Cr-Commit-Position: refs/heads/master@{#67190}
parent fc29c453
......@@ -670,13 +670,13 @@ void AdjustStackPointerForTailCall(TurboAssembler* assembler,
}
}
void SetupShuffleMaskOnStack(TurboAssembler* assembler, uint32_t* mask) {
int64_t shuffle_mask = (mask[2]) | (static_cast<uint64_t>(mask[3]) << 32);
void SetupShuffleMaskInTempRegister(TurboAssembler* assembler, uint32_t* mask,
XMMRegister tmp) {
uint64_t shuffle_mask = (mask[0]) | (static_cast<uint64_t>(mask[1]) << 32);
assembler->Move(tmp, shuffle_mask);
shuffle_mask = (mask[2]) | (static_cast<uint64_t>(mask[3]) << 32);
assembler->movq(kScratchRegister, shuffle_mask);
assembler->Push(kScratchRegister);
shuffle_mask = (mask[0]) | (static_cast<uint64_t>(mask[1]) << 32);
assembler->movq(kScratchRegister, shuffle_mask);
assembler->Push(kScratchRegister);
assembler->Pinsrq(tmp, kScratchRegister, int8_t{1});
}
} // namespace
......@@ -3595,10 +3595,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kX64S8x16Shuffle: {
XMMRegister dst = i.OutputSimd128Register();
Register tmp = i.TempRegister(0);
// Prepare 16 byte aligned buffer for shuffle control mask
__ movq(tmp, rsp);
__ andq(rsp, Immediate(-16));
XMMRegister tmp_simd = i.TempSimd128Register(0);
if (instr->InputCount() == 5) { // only one input operand
uint32_t mask[4] = {};
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
......@@ -3606,22 +3603,22 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
mask[j - 1] = i.InputUint32(j);
}
SetupShuffleMaskOnStack(tasm(), mask);
__ Pshufb(dst, Operand(rsp, 0));
SetupShuffleMaskInTempRegister(tasm(), mask, tmp_simd);
__ Pshufb(dst, tmp_simd);
} else { // two input operands
DCHECK_EQ(6, instr->InputCount());
ASSEMBLE_SIMD_INSTR(Movups, kScratchDoubleReg, 0);
uint32_t mask[4] = {};
uint32_t mask1[4] = {};
for (int j = 5; j > 1; j--) {
uint32_t lanes = i.InputUint32(j);
for (int k = 0; k < 32; k += 8) {
uint8_t lane = lanes >> k;
mask[j - 2] |= (lane < kSimd128Size ? lane : 0x80) << k;
mask1[j - 2] |= (lane < kSimd128Size ? lane : 0x80) << k;
}
}
SetupShuffleMaskOnStack(tasm(), mask);
__ Pshufb(kScratchDoubleReg, Operand(rsp, 0));
uint32_t mask1[4] = {};
SetupShuffleMaskInTempRegister(tasm(), mask1, tmp_simd);
__ Pshufb(kScratchDoubleReg, tmp_simd);
uint32_t mask2[4] = {};
if (instr->InputAt(1)->IsSimd128Register()) {
XMMRegister src1 = i.InputSimd128Register(1);
if (src1 != dst) __ movups(dst, src1);
......@@ -3632,14 +3629,13 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
uint32_t lanes = i.InputUint32(j);
for (int k = 0; k < 32; k += 8) {
uint8_t lane = lanes >> k;
mask1[j - 2] |= (lane >= kSimd128Size ? (lane & 0x0F) : 0x80) << k;
mask2[j - 2] |= (lane >= kSimd128Size ? (lane & 0x0F) : 0x80) << k;
}
}
SetupShuffleMaskOnStack(tasm(), mask1);
__ Pshufb(dst, Operand(rsp, 0));
SetupShuffleMaskInTempRegister(tasm(), mask2, tmp_simd);
__ Pshufb(dst, tmp_simd);
__ Por(dst, kScratchDoubleReg);
}
__ movq(rsp, tmp);
break;
}
case kX64S8x16LoadSplat: {
......
......@@ -3338,7 +3338,7 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) {
imms[imm_count++] = Pack4Lanes(shuffle + 4);
imms[imm_count++] = Pack4Lanes(shuffle + 8);
imms[imm_count++] = Pack4Lanes(shuffle + 12);
temps[temp_count++] = g.TempRegister();
temps[temp_count++] = g.TempSimd128Register();
}
// Use DefineAsRegister(node) and Use(src0) if we can without forcing an extra
......@@ -3347,7 +3347,7 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) {
InstructionOperand dst =
no_same_as_first ? g.DefineAsRegister(node) : g.DefineSameAsFirst(node);
InstructionOperand src0 =
src0_needs_reg ? g.UseRegister(input0) : g.Use(input0);
src0_needs_reg ? g.UseUniqueRegister(input0) : g.UseUnique(input0);
int input_count = 0;
InstructionOperand inputs[2 + kMaxImms + kMaxTemps];
......@@ -3355,7 +3355,7 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) {
if (!is_swizzle) {
Node* input1 = node->InputAt(1);
inputs[input_count++] =
src1_needs_reg ? g.UseRegister(input1) : g.Use(input1);
src1_needs_reg ? g.UseUniqueRegister(input1) : g.UseUnique(input1);
}
for (int i = 0; i < imm_count; ++i) {
inputs[input_count++] = g.UseImmediate(imms[i]);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment