Commit c4915092 authored by Ng Zhi An, committed by Commit Bot

[wasm-simd][x64] Optimize i8x16.swizzle with external refs

Use external refs to load the masks needed for i8x16.swizzle. Before, it
would need 3 instructions (2 moves + 1 pshufd); now it requires 2 moves.
Also on AVX we can relax the dst == src requirement, which can
potentially save a move too.

Bug: v8:11346
Change-Id: If350529de7272a7b178e12778a5e02813b34631c
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2713168
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#72989}
parent a0cdf76c
......@@ -75,6 +75,12 @@ constexpr struct alignas(16) {
} double_negate_constant = {uint64_t{0x8000000000000000},
uint64_t{0x8000000000000000}};
// 128-bit constant with every byte equal to 0x70 (112). I8x16Swizzle adds it
// to the index vector with unsigned saturation so that out-of-range indices
// end up with their top bit set. Aligned to 16 bytes so it can be used as an
// aligned SSE memory operand (e.g. movdqa).
constexpr struct alignas(16) {
  uint64_t a;
  uint64_t b;
} wasm_i8x16_swizzle_mask = {uint64_t{0x7070707070707070},
                             uint64_t{0x7070707070707070}};
constexpr struct alignas(16) {
uint64_t a;
uint64_t b;
......@@ -545,6 +551,10 @@ ExternalReference ExternalReference::address_of_double_neg_constant() {
return ExternalReference(reinterpret_cast<Address>(&double_negate_constant));
}
// Returns an external reference wrapping the address of the 16-byte
// wasm_i8x16_swizzle_mask constant.
ExternalReference ExternalReference::address_of_wasm_i8x16_swizzle_mask() {
  const Address mask_address =
      reinterpret_cast<Address>(&wasm_i8x16_swizzle_mask);
  return ExternalReference(mask_address);
}
// Returns an external reference wrapping the address of the
// wasm_i8x16_popcnt_mask constant.
ExternalReference ExternalReference::address_of_wasm_i8x16_popcnt_mask() {
  const Address mask_address =
      reinterpret_cast<Address>(&wasm_i8x16_popcnt_mask);
  return ExternalReference(mask_address);
}
......
......@@ -117,6 +117,7 @@ class StatsCounter;
V(address_of_runtime_stats_flag, "TracingFlags::runtime_stats") \
V(address_of_the_hole_nan, "the_hole_nan") \
V(address_of_uint32_bias, "uint32_bias") \
V(address_of_wasm_i8x16_swizzle_mask, "wasm_i8x16_swizzle_mask") \
V(address_of_wasm_i8x16_popcnt_mask, "wasm_i8x16_popcnt_mask") \
V(address_of_wasm_i8x16_splat_0x01, "wasm_i8x16_splat_0x01") \
V(address_of_wasm_i8x16_splat_0x0f, "wasm_i8x16_splat_0x0f") \
......
......@@ -2598,6 +2598,27 @@ void TurboAssembler::I32x4ExtAddPairwiseI16x8U(XMMRegister dst,
Paddd(dst, kScratchDoubleReg);
}
// Emits code for wasm i8x16.swizzle: each byte of |dst| is the byte of |src|
// selected by the corresponding index byte in |mask|, or 0 if that index is
// out of range (> 15).
//
// |dst| may alias |src| and/or |mask|. kScratchDoubleReg is clobbered.
void TurboAssembler::I8x16Swizzle(XMMRegister dst, XMMRegister src,
                                  XMMRegister mask) {
  // pshufb zeroes a lane only when the top bit of its index byte is set, but
  // wasm requires every index > 15 to yield 0. Adding 112 (0x70) with unsigned
  // saturation keeps indices 0..15 below 0x80 with their low nibble intact,
  // and moves every larger index to >= 0x80 (top bit set), so pshufb zeroes
  // that lane.
  Operand op = ExternalReferenceAsOperand(
      ExternalReference::address_of_wasm_i8x16_swizzle_mask());
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // Three-operand forms: |mask| is fully consumed into the scratch register
    // before |dst| is written, so no aliasing restrictions apply.
    vpaddusb(kScratchDoubleReg, mask, op);
    vpshufb(dst, src, kScratchDoubleReg);
  } else {
    CpuFeatureScope sse_scope(this, SSSE3);
    movdqa(kScratchDoubleReg, op);
    // Read |mask| before |dst| is overwritten below: if dst == mask (and
    // dst != src), copying |src| into |dst| first would destroy the indices.
    paddusb(kScratchDoubleReg, mask);
    if (dst != src) {
      movaps(dst, src);
    }
    pshufb(dst, kScratchDoubleReg);
  }
}
void TurboAssembler::Abspd(XMMRegister dst) {
Andps(dst, ExternalReferenceAsOperand(
ExternalReference::address_of_double_abs_constant()));
......
......@@ -635,6 +635,8 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
void I16x8ExtAddPairwiseI8x16S(XMMRegister dst, XMMRegister src);
void I32x4ExtAddPairwiseI16x8U(XMMRegister dst, XMMRegister src);
// Emits wasm i8x16.swizzle: shuffles the bytes of |src| into |dst| using the
// byte indices held in |mask|; out-of-range indices (> 15) produce 0.
// Uses kScratchDoubleReg as a temporary.
void I8x16Swizzle(XMMRegister dst, XMMRegister src, XMMRegister mask);
void Abspd(XMMRegister dst);
void Negpd(XMMRegister dst);
......
......@@ -3751,16 +3751,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kX64I8x16Swizzle: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
XMMRegister dst = i.OutputSimd128Register();
XMMRegister mask = i.TempSimd128Register(0);
// Out-of-range indices should return 0, add 112 so that any value > 15
// saturates to 128 (top bit set), so pshufb will zero that lane.
__ Move(mask, uint32_t{0x70707070});
__ Pshufd(mask, mask, uint8_t{0x0});
__ Paddusb(mask, i.InputSimd128Register(1));
__ Pshufb(dst, mask);
__ I8x16Swizzle(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1));
break;
}
case kX64I8x16Shuffle: {
......
......@@ -3652,10 +3652,9 @@ void InstructionSelector::VisitI8x16Shuffle(Node* node) {
void InstructionSelector::VisitI8x16Swizzle(Node* node) {
X64OperandGenerator g(this);
InstructionOperand temps[] = {g.TempSimd128Register()};
Emit(kX64I8x16Swizzle, g.DefineSameAsFirst(node),
g.UseRegister(node->InputAt(0)), g.UseUniqueRegister(node->InputAt(1)),
arraysize(temps), temps);
Emit(kX64I8x16Swizzle,
IsSupported(AVX) ? g.DefineAsRegister(node) : g.DefineSameAsFirst(node),
g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)));
}
namespace {
......
......@@ -2483,13 +2483,7 @@ void LiftoffAssembler::emit_i8x16_shuffle(LiftoffRegister dst,
void LiftoffAssembler::emit_i8x16_swizzle(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs) {
XMMRegister mask = kScratchDoubleReg;
// Out-of-range indices should return 0, add 112 (0x70) so that any value > 15
// saturates to 128 (top bit set), so pshufb will zero that lane.
TurboAssembler::Move(mask, uint32_t{0x70707070});
Pshufd(mask, mask, uint8_t{0x0});
Paddusb(mask, rhs.fp());
Pshufb(dst.fp(), lhs.fp(), mask);
I8x16Swizzle(dst.fp(), lhs.fp(), rhs.fp());
}
void LiftoffAssembler::emit_i8x16_popcnt(LiftoffRegister dst,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment