Commit c296436e authored by Ng Zhi An, committed by Commit Bot

[wasm-simd][ia32] Optimize i8x16.swizzle with external refs

This is similar to the x64 optimization for i8x16.swizzle.

Use external refs to load the masks needed for i8x16.swizzle. Before it
would need 3 instructions (2 moves + 1 pshufd), now it requires 2 moves.
Also on AVX we can relax the dst == src requirement, which can
potentially save a move too.

Extract the code sequence into a macro-assembler function for sharing
between Liftoff and TurboFan.

Bug: v8:11346
Change-Id: Id0ec5e891595f0b0fc2922e932fc6c501eca8dc1
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2727150
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#73171}
parent e0bbda57
......@@ -1220,6 +1220,28 @@ void TurboAssembler::I32x4ExtAddPairwiseI16x8U(XMMRegister dst, XMMRegister src,
}
}
// Emits the code sequence for i8x16.swizzle: the bytes of |src| are shuffled
// with pshufb/vpshufb using the per-lane indices in |mask|, after the indices
// have been biased so that out-of-range ones select zero.
//
//   dst     - receives the swizzled vector. It may alias |src| or |mask|.
//   src     - the vector whose bytes are selected.
//   mask    - the per-lane byte indices; not modified unless it aliases |dst|.
//   scratch - clobbered. It is written before |src| and |mask| have been fully
//             consumed, so it must not alias either of them.
//   tmp     - handed to ExternalReferenceAsOperand when forming the operand
//             for the swizzle-mask constant (presumably as an address scratch
//             register -- confirm against that helper).
void TurboAssembler::I8x16Swizzle(XMMRegister dst, XMMRegister src,
                                  XMMRegister mask, XMMRegister scratch,
                                  Register tmp) {
  // Out-of-range indices should return 0, add 112 so that any value > 15
  // saturates to 128 (top bit set), so pshufb will zero that lane.
  Operand op = ExternalReferenceAsOperand(
      ExternalReference::address_of_wasm_i8x16_swizzle_mask(), tmp);
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // Three-operand forms: |mask| and |src| are only read, so |dst| is free to
    // be any register.
    vpaddusb(scratch, mask, op);
    vpshufb(dst, src, scratch);
  } else {
    CpuFeatureScope sse_scope(this, SSSE3);
    // Build the biased indices in |scratch| *before* touching |dst|. If |dst|
    // aliases |mask|, copying |src| into |dst| first would overwrite the
    // indices before paddusb gets to read them.
    movaps(scratch, op);
    paddusb(scratch, mask);
    if (dst != src) {
      movaps(dst, src);
    }
    // pshufb is destructive: |dst| must already hold the bytes to shuffle.
    pshufb(dst, scratch);
  }
}
void TurboAssembler::ShlPair(Register high, Register low, uint8_t shift) {
DCHECK_GE(63, shift);
if (shift >= 32) {
......
......@@ -704,6 +704,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
Register scratch);
void I32x4ExtAddPairwiseI16x8U(XMMRegister dst, XMMRegister src,
XMMRegister tmp);
// Emits i8x16.swizzle: shuffles the bytes of |src| by the indices in |mask|
// and writes the result to |dst|. |scratch| is clobbered; |tmp| is used when
// forming the operand for the external swizzle-mask constant.
void I8x16Swizzle(XMMRegister dst, XMMRegister src, XMMRegister mask,
XMMRegister scratch, Register tmp);
// Push overloads for a register and a memory operand; each forwards directly
// to the corresponding push().
void Push(Register src) { push(src); }
void Push(Operand src) { push(src); }
......
......@@ -3631,16 +3631,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kIA32I8x16Swizzle: {
// NOTE(review): this hunk is shown without its +/- markers. Judging by the
// hunk header (16 old lines -> 9 new lines), the statements from DCHECK_EQ
// down to `__ Pshufb(dst, mask);` are the inline sequence being removed.
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
XMMRegister dst = i.OutputSimd128Register();
XMMRegister mask = i.TempSimd128Register(0);
// Out-of-range indices should return 0, add 112 so that any value > 15
// saturates to 128 (top bit set), so pshufb will zero that lane.
__ Move(mask, uint32_t{0x70707070});
__ Pshufd(mask, mask, 0x0);
__ Paddusb(mask, i.InputSimd128Register(1));
__ Pshufb(dst, mask);
// Replacement (added lines): delegate to the shared macro-assembler helper.
// Input 0 is the vector to shuffle, input 1 holds the indices,
// kScratchDoubleReg is the SIMD scratch, and the temp is now a
// general-purpose register instead of a SIMD one.
__ I8x16Swizzle(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), kScratchDoubleReg,
i.TempRegister(0));
break;
}
case kIA32I8x16Shuffle: {
......
......@@ -3026,9 +3026,10 @@ void InstructionSelector::VisitI8x16Shuffle(Node* node) {
// Selects the ia32 instruction for an i8x16.swizzle node.
// NOTE(review): this hunk is shown without its +/- markers; per the hunk
// header (9 old lines -> 10 new lines) the next three statement lines are the
// removed version and the four after them are the added version.
void InstructionSelector::VisitI8x16Swizzle(Node* node) {
IA32OperandGenerator g(this);
// Old (removed): SIMD temp, output always tied to the first input, and the
// index input requested as a unique register.
InstructionOperand temps[] = {g.TempSimd128Register()};
Emit(kIA32I8x16Swizzle, g.DefineSameAsFirst(node),
g.UseRegister(node->InputAt(0)), g.UseUniqueRegister(node->InputAt(1)),
// New (added): a general-purpose temp is requested instead. With AVX the
// output may be any register; without it the output stays tied to the first
// input. The index input no longer asks for a unique register.
InstructionOperand temps[] = {g.TempRegister()};
Emit(kIA32I8x16Swizzle,
IsSupported(AVX) ? g.DefineAsRegister(node) : g.DefineSameAsFirst(node),
g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)),
arraysize(temps), temps);
}
......
......@@ -2881,13 +2881,9 @@ void LiftoffAssembler::emit_i8x16_shuffle(LiftoffRegister dst,
// Liftoff code generation for i8x16.swizzle: |lhs| is the vector to shuffle,
// |rhs| holds the byte indices, and the result goes to |dst|.
// NOTE(review): this hunk is shown without its +/- markers; per the hunk
// header (13 old lines -> 9 new lines) the statements from
// `XMMRegister mask = ...` down to `Pshufb(...)` are the removed inline
// sequence, and the last two statements are the added replacement.
void LiftoffAssembler::emit_i8x16_swizzle(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
XMMRegister mask = liftoff::kScratchDoubleReg;
// Out-of-range indices should return 0, add 112 (0x70) so that any value > 15
// saturates to 128 (top bit set), so pshufb will zero that lane.
TurboAssembler::Move(mask, uint32_t{0x70707070});
Pshufd(mask, mask, uint8_t{0x0});
Paddusb(mask, rhs.fp());
Pshufb(dst.fp(), lhs.fp(), mask);
// New (added): obtain an unused general-purpose register to pass as the
// helper's |tmp| argument, then delegate to the shared helper.
Register scratch = GetUnusedRegister(RegClass::kGpReg, {}).gp();
I8x16Swizzle(dst.fp(), lhs.fp(), rhs.fp(), liftoff::kScratchDoubleReg,
scratch);
}
void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment