Commit 732dba60, authored by Ng Zhi An, committed by Commit Bot

[wasm-simd][liftoff][x64] Optimize shuffles (swizzles)

Swizzles are shuffles that only use values from 1 operand, e.g.
v8x16.shuffle 0 1 2 3 0 0 0 0 4 5 6 7 0 0 0 0 (all the values are < 16).

Match such patterns and emit optimized codegen that uses fewer
registers and instructions. Only implemented for x64 for now; the other
backends will come in follow-up patches.

Bug: v8:10696
Change-Id: Iffa694b04c97313eab7d138e4bdad7c0c85cda89
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2335419
Reviewed-by: Clemens Backes <clemensb@chromium.org>
Reviewed-by: Jakob Kummerow <jkummerow@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#69231}
parent 47794594
...@@ -1353,6 +1353,12 @@ void TurboAssembler::Move(XMMRegister dst, uint64_t src) { ...@@ -1353,6 +1353,12 @@ void TurboAssembler::Move(XMMRegister dst, uint64_t src) {
} }
} }
// Loads a 128-bit constant, given as two 64-bit halves, into |dst|.
// |low| is loaded through the Move(XMMRegister, uint64_t) overload; |high| is
// then staged in kScratchRegister and inserted with Pinsrq at index 1
// (presumably the upper 64-bit lane of |dst| -- confirm against Pinsrq).
// Note: this overwrites kScratchRegister.
void TurboAssembler::Move(XMMRegister dst, uint64_t high, uint64_t low) {
// Load the low half first; the insert below must come after it.
Move(dst, low);
movq(kScratchRegister, high);
Pinsrq(dst, kScratchRegister, int8_t{1});
}
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
void MacroAssembler::Absps(XMMRegister dst) { void MacroAssembler::Absps(XMMRegister dst) {
......
...@@ -432,6 +432,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase { ...@@ -432,6 +432,7 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
void Move(XMMRegister dst, uint64_t src); void Move(XMMRegister dst, uint64_t src);
void Move(XMMRegister dst, float src) { Move(dst, bit_cast<uint32_t>(src)); } void Move(XMMRegister dst, float src) { Move(dst, bit_cast<uint32_t>(src)); }
void Move(XMMRegister dst, double src) { Move(dst, bit_cast<uint64_t>(src)); } void Move(XMMRegister dst, double src) { Move(dst, bit_cast<uint64_t>(src)); }
void Move(XMMRegister dst, uint64_t high, uint64_t low);
// Move if the registers are not identical. // Move if the registers are not identical.
void Move(Register target, Register source); void Move(Register target, Register source);
......
...@@ -759,9 +759,8 @@ void AdjustStackPointerForTailCall(TurboAssembler* assembler, ...@@ -759,9 +759,8 @@ void AdjustStackPointerForTailCall(TurboAssembler* assembler,
void SetupSimdImmediateInRegister(TurboAssembler* assembler, uint32_t* imms, void SetupSimdImmediateInRegister(TurboAssembler* assembler, uint32_t* imms,
XMMRegister reg) { XMMRegister reg) {
assembler->Move(reg, make_uint64(imms[1], imms[0])); assembler->Move(reg, make_uint64(imms[3], imms[2]),
assembler->movq(kScratchRegister, make_uint64(imms[3], imms[2])); make_uint64(imms[1], imms[0]));
assembler->Pinsrq(reg, kScratchRegister, int8_t{1});
} }
} // namespace } // namespace
......
...@@ -2905,7 +2905,8 @@ void LiftoffAssembler::emit_i16x8_replace_lane(LiftoffRegister dst, ...@@ -2905,7 +2905,8 @@ void LiftoffAssembler::emit_i16x8_replace_lane(LiftoffRegister dst,
void LiftoffAssembler::emit_s8x16_shuffle(LiftoffRegister dst, void LiftoffAssembler::emit_s8x16_shuffle(LiftoffRegister dst,
LiftoffRegister lhs, LiftoffRegister lhs,
LiftoffRegister rhs, LiftoffRegister rhs,
const uint8_t shuffle[16]) { const uint8_t shuffle[16],
bool is_swizzle) {
Simd128Register dest = liftoff::GetSimd128Register(dst); Simd128Register dest = liftoff::GetSimd128Register(dst);
Simd128Register src1 = liftoff::GetSimd128Register(lhs); Simd128Register src1 = liftoff::GetSimd128Register(lhs);
Simd128Register src2 = liftoff::GetSimd128Register(rhs); Simd128Register src2 = liftoff::GetSimd128Register(rhs);
......
...@@ -2029,7 +2029,8 @@ void LiftoffAssembler::emit_i16x8_max_u(LiftoffRegister dst, ...@@ -2029,7 +2029,8 @@ void LiftoffAssembler::emit_i16x8_max_u(LiftoffRegister dst,
void LiftoffAssembler::emit_s8x16_shuffle(LiftoffRegister dst, void LiftoffAssembler::emit_s8x16_shuffle(LiftoffRegister dst,
LiftoffRegister lhs, LiftoffRegister lhs,
LiftoffRegister rhs, LiftoffRegister rhs,
const uint8_t shuffle[16]) { const uint8_t shuffle[16],
bool is_swizzle) {
VRegister src1 = lhs.fp(); VRegister src1 = lhs.fp();
VRegister src2 = rhs.fp(); VRegister src2 = rhs.fp();
VRegister temp = dst.fp(); VRegister temp = dst.fp();
......
...@@ -2638,7 +2638,8 @@ void LiftoffAssembler::LoadTransform(LiftoffRegister dst, Register src_addr, ...@@ -2638,7 +2638,8 @@ void LiftoffAssembler::LoadTransform(LiftoffRegister dst, Register src_addr,
void LiftoffAssembler::emit_s8x16_shuffle(LiftoffRegister dst, void LiftoffAssembler::emit_s8x16_shuffle(LiftoffRegister dst,
LiftoffRegister lhs, LiftoffRegister lhs,
LiftoffRegister rhs, LiftoffRegister rhs,
const uint8_t shuffle[16]) { const uint8_t shuffle[16],
bool is_swizzle) {
LiftoffRegister tmp = GetUnusedRegister(kGpReg, {}); LiftoffRegister tmp = GetUnusedRegister(kGpReg, {});
// Prepare 16 byte aligned buffer for shuffle control mask. // Prepare 16 byte aligned buffer for shuffle control mask.
mov(tmp.gp(), esp); mov(tmp.gp(), esp);
......
...@@ -756,8 +756,8 @@ class LiftoffAssembler : public TurboAssembler { ...@@ -756,8 +756,8 @@ class LiftoffAssembler : public TurboAssembler {
LoadType type, LoadTransformationKind transform, LoadType type, LoadTransformationKind transform,
uint32_t* protected_load_pc); uint32_t* protected_load_pc);
inline void emit_s8x16_shuffle(LiftoffRegister dst, LiftoffRegister lhs, inline void emit_s8x16_shuffle(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs, LiftoffRegister rhs, const uint8_t shuffle[16],
const uint8_t shuffle[16]); bool is_swizzle);
inline void emit_s8x16_swizzle(LiftoffRegister dst, LiftoffRegister lhs, inline void emit_s8x16_swizzle(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegister rhs); LiftoffRegister rhs);
inline void emit_i8x16_splat(LiftoffRegister dst, LiftoffRegister src); inline void emit_i8x16_splat(LiftoffRegister dst, LiftoffRegister src);
......
...@@ -23,6 +23,7 @@ ...@@ -23,6 +23,7 @@
#include "src/wasm/function-compiler.h" #include "src/wasm/function-compiler.h"
#include "src/wasm/memory-tracing.h" #include "src/wasm/memory-tracing.h"
#include "src/wasm/object-access.h" #include "src/wasm/object-access.h"
#include "src/wasm/simd-shuffle.h"
#include "src/wasm/wasm-debug.h" #include "src/wasm/wasm-debug.h"
#include "src/wasm/wasm-engine.h" #include "src/wasm/wasm-engine.h"
#include "src/wasm/wasm-linkage.h" #include "src/wasm/wasm-linkage.h"
...@@ -2843,7 +2844,16 @@ class LiftoffCompiler { ...@@ -2843,7 +2844,16 @@ class LiftoffCompiler {
LiftoffRegister lhs = __ PopToRegister(LiftoffRegList::ForRegs(rhs)); LiftoffRegister lhs = __ PopToRegister(LiftoffRegList::ForRegs(rhs));
LiftoffRegister dst = __ GetUnusedRegister(result_rc, {lhs, rhs}, {}); LiftoffRegister dst = __ GetUnusedRegister(result_rc, {lhs, rhs}, {});
__ LiftoffAssembler::emit_s8x16_shuffle(dst, lhs, rhs, imm.value); uint8_t shuffle[kSimd128Size];
memcpy(shuffle, imm.value, sizeof(shuffle));
bool is_swizzle;
bool needs_swap;
wasm::SimdShuffle::CanonicalizeShuffle(lhs == rhs, shuffle, &needs_swap,
&is_swizzle);
if (needs_swap) {
std::swap(lhs, rhs);
}
__ LiftoffAssembler::emit_s8x16_shuffle(dst, lhs, rhs, shuffle, is_swizzle);
__ PushRegister(kWasmS128, dst); __ PushRegister(kWasmS128, dst);
} }
......
...@@ -1610,7 +1610,8 @@ void LiftoffAssembler::LoadTransform(LiftoffRegister dst, Register src_addr, ...@@ -1610,7 +1610,8 @@ void LiftoffAssembler::LoadTransform(LiftoffRegister dst, Register src_addr,
void LiftoffAssembler::emit_s8x16_shuffle(LiftoffRegister dst, void LiftoffAssembler::emit_s8x16_shuffle(LiftoffRegister dst,
LiftoffRegister lhs, LiftoffRegister lhs,
LiftoffRegister rhs, LiftoffRegister rhs,
const uint8_t shuffle[16]) { const uint8_t shuffle[16],
bool is_swizzle) {
bailout(kSimd, "emit_s8x16_shuffle"); bailout(kSimd, "emit_s8x16_shuffle");
} }
......
...@@ -1495,7 +1495,8 @@ void LiftoffAssembler::LoadTransform(LiftoffRegister dst, Register src_addr, ...@@ -1495,7 +1495,8 @@ void LiftoffAssembler::LoadTransform(LiftoffRegister dst, Register src_addr,
void LiftoffAssembler::emit_s8x16_shuffle(LiftoffRegister dst, void LiftoffAssembler::emit_s8x16_shuffle(LiftoffRegister dst,
LiftoffRegister lhs, LiftoffRegister lhs,
LiftoffRegister rhs, LiftoffRegister rhs,
const uint8_t shuffle[16]) { const uint8_t shuffle[16],
bool is_swizzle) {
MSARegister dst_msa = dst.fp().toW(); MSARegister dst_msa = dst.fp().toW();
MSARegister lhs_msa = lhs.fp().toW(); MSARegister lhs_msa = lhs.fp().toW();
MSARegister rhs_msa = rhs.fp().toW(); MSARegister rhs_msa = rhs.fp().toW();
......
...@@ -1017,7 +1017,8 @@ void LiftoffAssembler::emit_i16x8_extract_lane_s(LiftoffRegister dst, ...@@ -1017,7 +1017,8 @@ void LiftoffAssembler::emit_i16x8_extract_lane_s(LiftoffRegister dst,
void LiftoffAssembler::emit_s8x16_shuffle(LiftoffRegister dst, void LiftoffAssembler::emit_s8x16_shuffle(LiftoffRegister dst,
LiftoffRegister lhs, LiftoffRegister lhs,
LiftoffRegister rhs, LiftoffRegister rhs,
const uint8_t shuffle[16]) { const uint8_t shuffle[16],
bool is_swizzle) {
bailout(kSimd, "s8x16_shuffle"); bailout(kSimd, "s8x16_shuffle");
} }
......
...@@ -1021,7 +1021,8 @@ void LiftoffAssembler::emit_i16x8_extract_lane_s(LiftoffRegister dst, ...@@ -1021,7 +1021,8 @@ void LiftoffAssembler::emit_i16x8_extract_lane_s(LiftoffRegister dst,
void LiftoffAssembler::emit_s8x16_shuffle(LiftoffRegister dst, void LiftoffAssembler::emit_s8x16_shuffle(LiftoffRegister dst,
LiftoffRegister lhs, LiftoffRegister lhs,
LiftoffRegister rhs, LiftoffRegister rhs,
const uint8_t shuffle[16]) { const uint8_t shuffle[16],
bool is_swizzle) {
bailout(kSimd, "s8x16_shuffle"); bailout(kSimd, "s8x16_shuffle");
} }
......
...@@ -5,9 +5,9 @@ ...@@ -5,9 +5,9 @@
#ifndef V8_WASM_BASELINE_X64_LIFTOFF_ASSEMBLER_X64_H_ #ifndef V8_WASM_BASELINE_X64_LIFTOFF_ASSEMBLER_X64_H_
#define V8_WASM_BASELINE_X64_LIFTOFF_ASSEMBLER_X64_H_ #define V8_WASM_BASELINE_X64_LIFTOFF_ASSEMBLER_X64_H_
#include "src/wasm/baseline/liftoff-assembler.h"
#include "src/codegen/assembler.h" #include "src/codegen/assembler.h"
#include "src/wasm/baseline/liftoff-assembler.h"
#include "src/wasm/simd-shuffle.h"
namespace v8 { namespace v8 {
namespace internal { namespace internal {
...@@ -2274,7 +2274,26 @@ void LiftoffAssembler::LoadTransform(LiftoffRegister dst, Register src_addr, ...@@ -2274,7 +2274,26 @@ void LiftoffAssembler::LoadTransform(LiftoffRegister dst, Register src_addr,
void LiftoffAssembler::emit_s8x16_shuffle(LiftoffRegister dst, void LiftoffAssembler::emit_s8x16_shuffle(LiftoffRegister dst,
LiftoffRegister lhs, LiftoffRegister lhs,
LiftoffRegister rhs, LiftoffRegister rhs,
const uint8_t shuffle[16]) { const uint8_t shuffle[16],
bool is_swizzle) {
if (is_swizzle) {
uint32_t imms[4];
// Shuffles that use just 1 operand are called swizzles, rhs can be ignored.
wasm::SimdShuffle::Pack16Lanes(imms, shuffle);
TurboAssembler::Move(kScratchDoubleReg, make_uint64(imms[3], imms[2]),
make_uint64(imms[1], imms[0]));
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpshufb(dst.fp(), lhs.fp(), kScratchDoubleReg);
} else {
if (dst != lhs) {
movups(dst.fp(), lhs.fp());
}
pshufb(dst.fp(), kScratchDoubleReg);
}
return;
}
LiftoffRegister tmp_simd = LiftoffRegister tmp_simd =
GetUnusedRegister(kFpReg, LiftoffRegList::ForRegs(dst, lhs, rhs)); GetUnusedRegister(kFpReg, LiftoffRegList::ForRegs(dst, lhs, rhs));
Movups(kScratchDoubleReg, lhs.fp()); Movups(kScratchDoubleReg, lhs.fp());
......
...@@ -136,6 +136,12 @@ int32_t SimdShuffle::Pack4Lanes(const uint8_t* shuffle) { ...@@ -136,6 +136,12 @@ int32_t SimdShuffle::Pack4Lanes(const uint8_t* shuffle) {
return result; return result;
} }
// Packs a 16-byte shuffle into four 32-bit words written to |dst|. Each word
// is produced by Pack4Lanes from four consecutive bytes of |shuffle|, so
// dst[0] covers bytes 0..3, dst[1] bytes 4..7, and so on.
void SimdShuffle::Pack16Lanes(uint32_t* dst, const uint8_t* shuffle) {
  constexpr int kWordCount = 4;
  constexpr int kBytesPerWord = 4;
  int word = 0;
  while (word < kWordCount) {
    const uint8_t* group = shuffle + (word * kBytesPerWord);
    dst[word] = wasm::SimdShuffle::Pack4Lanes(group);
    ++word;
  }
}
} // namespace wasm } // namespace wasm
} // namespace internal } // namespace internal
} // namespace v8 } // namespace v8
...@@ -82,6 +82,8 @@ class V8_EXPORT_PRIVATE SimdShuffle { ...@@ -82,6 +82,8 @@ class V8_EXPORT_PRIVATE SimdShuffle {
static uint8_t PackBlend4(const uint8_t* shuffle32x4); static uint8_t PackBlend4(const uint8_t* shuffle32x4);
// Packs 4 bytes of shuffle into a 32 bit immediate. // Packs 4 bytes of shuffle into a 32 bit immediate.
static int32_t Pack4Lanes(const uint8_t* shuffle); static int32_t Pack4Lanes(const uint8_t* shuffle);
// Packs 16 bytes of shuffle into an array of 4 uint32_t.
static void Pack16Lanes(uint32_t* dst, const uint8_t* shuffle);
}; };
} // namespace wasm } // namespace wasm
} // namespace internal } // namespace internal
......
...@@ -5,6 +5,9 @@ ...@@ -5,6 +5,9 @@
#include "src/wasm/simd-shuffle.h" #include "src/wasm/simd-shuffle.h"
#include "test/unittests/test-utils.h" #include "test/unittests/test-utils.h"
#include "testing/gmock-support.h"
using ::testing::ElementsAre;
namespace v8 { namespace v8 {
namespace internal { namespace internal {
...@@ -272,6 +275,14 @@ TEST(SimdShufflePackTest, Pack4Lanes) { ...@@ -272,6 +275,14 @@ TEST(SimdShufflePackTest, Pack4Lanes) {
EXPECT_EQ(0x7ca00801, SimdShuffle::Pack4Lanes(arr)); EXPECT_EQ(0x7ca00801, SimdShuffle::Pack4Lanes(arr));
} }
// Verifies that Pack16Lanes turns the identity shuffle (bytes 0..15) into four
// 32-bit words, each holding four consecutive shuffle bytes with the first
// byte of the group in the least-significant position.
TEST(SimdShufflePackTest, Pack16Lanes) {
uint8_t arr[16]{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
// Zero-initialized so the expectation cannot pass on stale data.
uint32_t imms[4]{0};
SimdShuffle::Pack16Lanes(imms, arr);
EXPECT_THAT(imms,
ElementsAre(0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c));
}
} // namespace wasm } // namespace wasm
} // namespace internal } // namespace internal
} // namespace v8 } // namespace v8
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment