Commit 732dba60, authored by Ng Zhi An, committed by Commit Bot

[wasm-simd][liftoff][x64] Optimize shuffles (swizzles)

Swizzles are shuffles that only use values from 1 operand, e.g.
v8x16.shuffle 0 1 2 3 0 0 0 0 4 5 6 7 0 0 0 0 (all the values are < 16).

Match such patterns and emit optimized code that uses fewer
registers and instructions. Only implemented for x64 for now; the other
backends will come in follow-up patches.

Bug: v8:10696
Change-Id: Iffa694b04c97313eab7d138e4bdad7c0c85cda89
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2335419
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Reviewed-by: Jakob Kummerow <jkummerow@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#69231}
parent 47794594
......@@ -1353,6 +1353,12 @@ void TurboAssembler::Move(XMMRegister dst, uint64_t src) {
}
}
// Materializes a 128-bit immediate, given as two 64-bit halves, in |dst|.
//   dst  - XMM register that receives the value.
//   high - value inserted at quadword index 1 of |dst|.
//   low  - value loaded through the Move(XMMRegister, uint64_t) overload.
// Overwrites kScratchRegister, which is used to stage |high|.
void TurboAssembler::Move(XMMRegister dst, uint64_t high, uint64_t low) {
// Load |low| first; the insert below only replaces quadword 1.
// NOTE(review): presumably this leaves |low| in quadword 0 of |dst| — the
// 64-bit overload's body is not visible here, confirm.
Move(dst, low);
// Stage |high| in the general-purpose scratch register.
movq(kScratchRegister, high);
// Insert the staged value at quadword index 1 of |dst|.
Pinsrq(dst, kScratchRegister, int8_t{1});
}
// ----------------------------------------------------------------------------
void MacroAssembler::Absps(XMMRegister dst) {
......
......@@ -432,6 +432,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
void Move(XMMRegister dst, uint64_t src);
void Move(XMMRegister dst, float src) { Move(dst, bit_cast<uint32_t>(src)); }
void Move(XMMRegister dst, double src) { Move(dst, bit_cast<uint64_t>(src)); }
void Move(XMMRegister dst, uint64_t high, uint64_t low);
// Move if the registers are not identical.
void Move(Register target, Register source);
......
......@@ -759,9 +759,8 @@ void AdjustStackPointerForTailCall(TurboAssembler* assembler,
// Loads the 128-bit SIMD immediate described by imms[0..3] into |reg|.
//   assembler - assembler used to emit the load.
//   imms      - four 32-bit words; imms[1]/imms[0] are combined into the low
//               quadword argument and imms[3]/imms[2] into the high one.
//   reg       - XMM register that receives the immediate.
// The three-argument TurboAssembler::Move already performs the low-quadword
// move and the kScratchRegister + Pinsrq insertion of the high quadword, so no
// separate Move/movq/Pinsrq sequence is emitted here.
void SetupSimdImmediateInRegister(TurboAssembler* assembler, uint32_t* imms,
                                  XMMRegister reg) {
  assembler->Move(reg, make_uint64(imms[3], imms[2]),
                  make_uint64(imms[1], imms[0]));
}
} // namespace
......
......@@ -2905,7 +2905,8 @@ void LiftoffAssembler::emit_i16x8_replace_lane(LiftoffRegister dst,
void LiftoffAssembler::emit_s8x16_shuffle(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs,
const uint8_t shuffle[16]) {
const uint8_t shuffle[16],
bool is_swizzle) {
Simd128Register dest = liftoff::GetSimd128Register(dst);
Simd128Register src1 = liftoff::GetSimd128Register(lhs);
Simd128Register src2 = liftoff::GetSimd128Register(rhs);
......
......@@ -2029,7 +2029,8 @@ void LiftoffAssembler::emit_i16x8_max_u(LiftoffRegister dst,
void LiftoffAssembler::emit_s8x16_shuffle(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs,
const uint8_t shuffle[16]) {
const uint8_t shuffle[16],
bool is_swizzle) {
VRegister src1 = lhs.fp();
VRegister src2 = rhs.fp();
VRegister temp = dst.fp();
......
......@@ -2638,7 +2638,8 @@ void LiftoffAssembler::LoadTransform(LiftoffRegister dst, Register src_addr,
void LiftoffAssembler::emit_s8x16_shuffle(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs,
const uint8_t shuffle[16]) {
const uint8_t shuffle[16],
bool is_swizzle) {
LiftoffRegister tmp = GetUnusedRegister(kGpReg, {});
// Prepare 16 byte aligned buffer for shuffle control mask.
mov(tmp.gp(), esp);
......
......@@ -756,8 +756,8 @@ class LiftoffAssembler : public TurboAssembler {
LoadType type, LoadTransformationKind transform,
uint32_t* protected_load_pc);
inline void emit_s8x16_shuffle(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs,
const uint8_t shuffle[16]);
LiftoffRegister rhs, const uint8_t shuffle[16],
bool is_swizzle);
inline void emit_s8x16_swizzle(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs);
inline void emit_i8x16_splat(LiftoffRegister dst, LiftoffRegister src);
......
......@@ -23,6 +23,7 @@
#include "src/wasm/function-compiler.h"
#include "src/wasm/memory-tracing.h"
#include "src/wasm/object-access.h"
#include "src/wasm/simd-shuffle.h"
#include "src/wasm/wasm-debug.h"
#include "src/wasm/wasm-engine.h"
#include "src/wasm/wasm-linkage.h"
......@@ -2843,7 +2844,16 @@ class LiftoffCompiler {
LiftoffRegister lhs = __ PopToRegister(LiftoffRegList::ForRegs(rhs));
LiftoffRegister dst = __ GetUnusedRegister(result_rc, {lhs, rhs}, {});
__ LiftoffAssembler::emit_s8x16_shuffle(dst, lhs, rhs, imm.value);
uint8_t shuffle[kSimd128Size];
memcpy(shuffle, imm.value, sizeof(shuffle));
bool is_swizzle;
bool needs_swap;
wasm::SimdShuffle::CanonicalizeShuffle(lhs == rhs, shuffle, &needs_swap,
&is_swizzle);
if (needs_swap) {
std::swap(lhs, rhs);
}
__ LiftoffAssembler::emit_s8x16_shuffle(dst, lhs, rhs, shuffle, is_swizzle);
__ PushRegister(kWasmS128, dst);
}
......
......@@ -1610,7 +1610,8 @@ void LiftoffAssembler::LoadTransform(LiftoffRegister dst, Register src_addr,
void LiftoffAssembler::emit_s8x16_shuffle(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs,
const uint8_t shuffle[16]) {
const uint8_t shuffle[16],
bool is_swizzle) {
bailout(kSimd, "emit_s8x16_shuffle");
}
......
......@@ -1495,7 +1495,8 @@ void LiftoffAssembler::LoadTransform(LiftoffRegister dst, Register src_addr,
void LiftoffAssembler::emit_s8x16_shuffle(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs,
const uint8_t shuffle[16]) {
const uint8_t shuffle[16],
bool is_swizzle) {
MSARegister dst_msa = dst.fp().toW();
MSARegister lhs_msa = lhs.fp().toW();
MSARegister rhs_msa = rhs.fp().toW();
......
......@@ -1017,7 +1017,8 @@ void LiftoffAssembler::emit_i16x8_extract_lane_s(LiftoffRegister dst,
void LiftoffAssembler::emit_s8x16_shuffle(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs,
const uint8_t shuffle[16]) {
const uint8_t shuffle[16],
bool is_swizzle) {
bailout(kSimd, "s8x16_shuffle");
}
......
......@@ -1021,7 +1021,8 @@ void LiftoffAssembler::emit_i16x8_extract_lane_s(LiftoffRegister dst,
void LiftoffAssembler::emit_s8x16_shuffle(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs,
const uint8_t shuffle[16]) {
const uint8_t shuffle[16],
bool is_swizzle) {
bailout(kSimd, "s8x16_shuffle");
}
......
......@@ -5,9 +5,9 @@
#ifndef V8_WASM_BASELINE_X64_LIFTOFF_ASSEMBLER_X64_H_
#define V8_WASM_BASELINE_X64_LIFTOFF_ASSEMBLER_X64_H_
#include "src/wasm/baseline/liftoff-assembler.h"
#include "src/codegen/assembler.h"
#include "src/wasm/baseline/liftoff-assembler.h"
#include "src/wasm/simd-shuffle.h"
namespace v8 {
namespace internal {
......@@ -2274,7 +2274,26 @@ void LiftoffAssembler::LoadTransform(LiftoffRegister dst, Register src_addr,
void LiftoffAssembler::emit_s8x16_shuffle(LiftoffRegister dst,
LiftoffRegister lhs,
LiftoffRegister rhs,
const uint8_t shuffle[16]) {
const uint8_t shuffle[16],
bool is_swizzle) {
if (is_swizzle) {
uint32_t imms[4];
// Shuffles that use just 1 operand are called swizzles, rhs can be ignored.
wasm::SimdShuffle::Pack16Lanes(imms, shuffle);
TurboAssembler::Move(kScratchDoubleReg, make_uint64(imms[3], imms[2]),
make_uint64(imms[1], imms[0]));
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpshufb(dst.fp(), lhs.fp(), kScratchDoubleReg);
} else {
if (dst != lhs) {
movups(dst.fp(), lhs.fp());
}
pshufb(dst.fp(), kScratchDoubleReg);
}
return;
}
LiftoffRegister tmp_simd =
GetUnusedRegister(kFpReg, LiftoffRegList::ForRegs(dst, lhs, rhs));
Movups(kScratchDoubleReg, lhs.fp());
......
......@@ -136,6 +136,12 @@ int32_t SimdShuffle::Pack4Lanes(const uint8_t* shuffle) {
return result;
}
// Packs a 16-byte shuffle into four 32-bit words: dst[k] receives the result
// of Pack4Lanes applied to shuffle bytes 4k .. 4k+3, for k = 0..3.
// |dst| must have room for four words and |shuffle| must hold 16 bytes.
void SimdShuffle::Pack16Lanes(uint32_t* dst, const uint8_t* shuffle) {
  const uint8_t* group = shuffle;
  uint32_t* const end = dst + 4;
  for (uint32_t* out = dst; out != end; ++out, group += 4) {
    *out = wasm::SimdShuffle::Pack4Lanes(group);
  }
}
} // namespace wasm
} // namespace internal
} // namespace v8
......@@ -82,6 +82,8 @@ class V8_EXPORT_PRIVATE SimdShuffle {
static uint8_t PackBlend4(const uint8_t* shuffle32x4);
// Packs 4 bytes of shuffle into a 32 bit immediate.
static int32_t Pack4Lanes(const uint8_t* shuffle);
// Packs 16 bytes of shuffle into an array of 4 uint32_t.
static void Pack16Lanes(uint32_t* dst, const uint8_t* shuffle);
};
} // namespace wasm
} // namespace internal
......
......@@ -5,6 +5,9 @@
#include "src/wasm/simd-shuffle.h"
#include "test/unittests/test-utils.h"
#include "testing/gmock-support.h"
using ::testing::ElementsAre;
namespace v8 {
namespace internal {
......@@ -272,6 +275,14 @@ TEST(SimdShufflePackTest, Pack4Lanes) {
EXPECT_EQ(0x7ca00801, SimdShuffle::Pack4Lanes(arr));
}
// Packing the identity shuffle 0..15 must yield four words, each holding four
// consecutive lane indices with the lowest index in the least significant
// byte.
TEST(SimdShufflePackTest, Pack16Lanes) {
  const uint8_t identity[16] = {0, 1, 2,  3,  4,  5,  6,  7,
                                8, 9, 10, 11, 12, 13, 14, 15};
  uint32_t packed[4] = {0, 0, 0, 0};
  SimdShuffle::Pack16Lanes(packed, identity);
  EXPECT_THAT(packed,
              ElementsAre(0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c));
}
} // namespace wasm
} // namespace internal
} // namespace v8
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment