Commit e344bf94 authored by Fanchen Kong, committed by V8 LUCI CQ

[x64][wasm-simd] Pattern match on packed byte to dword zero extend like shuffle

When an 8x16 shuffle satisfies both of the following conditions,
1. input1 is S128Zero after canonicalization,
2. the indices at positions {0,4,8,12} are consecutive values in the range
[0-15] and all other indices are in the range [16-31],
the shuffle can be matched to a packed byte to dword zero extend. These
shuffles are commonly used in image processing.

Change-Id: I14d1e35401dbc5ecd91f67c46ea9762628835d01
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3547667
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Fanchen Kong <fanchen.kong@intel.com>
Cr-Commit-Position: refs/heads/main@{#80953}
parent 86ca80bb
......@@ -117,6 +117,7 @@
V(pmovsxwd, 66, 0F, 38, 23) \
V(pmovsxdq, 66, 0F, 38, 25) \
V(pmovzxbw, 66, 0F, 38, 30) \
V(pmovzxbd, 66, 0F, 38, 31) \
V(pmovzxwd, 66, 0F, 38, 33) \
V(pmovzxdq, 66, 0F, 38, 35) \
V(ptest, 66, 0F, 38, 17)
......
......@@ -384,6 +384,7 @@ class V8_EXPORT_PRIVATE SharedTurboAssembler : public TurboAssemblerBase {
AVX_OP_SSE4_1(Pmovsxdq, pmovsxdq)
AVX_OP_SSE4_1(Pmovsxwd, pmovsxwd)
AVX_OP_SSE4_1(Pmovzxbw, pmovzxbw)
AVX_OP_SSE4_1(Pmovzxbd, pmovzxbd)
AVX_OP_SSE4_1(Pmovzxdq, pmovzxdq)
AVX_OP_SSE4_1(Pmovzxwd, pmovzxwd)
AVX_OP_SSE4_1(Pmulld, pmulld)
......
......@@ -187,6 +187,7 @@
V(pmovsxwd, 66, 0F, 38, 23) \
V(pmovsxdq, 66, 0F, 38, 25) \
V(pmovzxbw, 66, 0F, 38, 30) \
V(pmovzxbd, 66, 0F, 38, 31) \
V(pmovzxwd, 66, 0F, 38, 33) \
V(pmovzxdq, 66, 0F, 38, 35)
......
......@@ -3319,6 +3319,18 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
kScratchDoubleReg);
break;
}
// Emits a packed byte-to-dword zero extend of four consecutive source bytes.
// Input 0 is the source vector; input 1 is an 8-bit immediate giving the byte
// offset of the first byte to extend (the selector stores shuffle[0] there).
case kX64I32X4ShiftZeroExtendI8x16: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
uint8_t shift = i.InputUint8(1);
if (shift != 0) {
// Move the wanted bytes down to the low end of dst first, then zero-extend
// dst in place.
// NOTE(review): this presumes shift <= 12 so that the four low bytes after
// Palignr all originate from src; confirm against the selector's matcher.
__ Palignr(dst, src, shift);
__ Pmovzxbd(dst, dst);
} else {
// The wanted bytes are already the low four bytes of src, so no byte shift
// is needed.
__ Pmovzxbd(dst, src);
}
break;
}
case kX64S128Const: {
// Emit code for generic constants as all zeros, or ones cases will be
// handled separately by the selector.
......
......@@ -286,6 +286,7 @@ namespace compiler {
V(X64I32x4ExtAddPairwiseI16x8U) \
V(X64I32x4TruncSatF64x2SZero) \
V(X64I32x4TruncSatF64x2UZero) \
V(X64I32X4ShiftZeroExtendI8x16) \
V(X64I16x8Splat) \
V(X64I16x8ExtractLaneS) \
V(X64I16x8SConvertI8x16Low) \
......
......@@ -237,6 +237,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kX64I32x4ExtAddPairwiseI16x8U:
case kX64I32x4TruncSatF64x2SZero:
case kX64I32x4TruncSatF64x2UZero:
case kX64I32X4ShiftZeroExtendI8x16:
case kX64I16x8Splat:
case kX64I16x8ExtractLaneS:
case kX64I16x8SConvertI8x16Low:
......
......@@ -3784,12 +3784,20 @@ void InstructionSelector::VisitI8x16Shuffle(Node* node) {
shuffle[i] ^= kSimd128Size;
}
}
// If the most significant bit (bit 7) of each byte of the shuffle control
// mask is set, then constant zero is written in the result byte. Input1 is
// zeros now, we can avoid using input1 by setting bit 7 of shuffle[i] to 1.
for (int i = 0; i < kSimd128Size; ++i) {
if (shuffle[i] >= kSimd128Size) {
shuffle[i] = 0x80;
if (wasm::SimdShuffle::TryMatchByteToDwordZeroExtend(shuffle)) {
opcode = kX64I32X4ShiftZeroExtendI8x16;
no_same_as_first = true;
src0_needs_reg = true;
imms[imm_count++] = shuffle[0];
} else {
// If the most significant bit (bit 7) of each byte of the shuffle control
// mask is set, then constant zero is written in the result byte. Input1
// is zeros now, we can avoid using input1 by setting bit 7 of shuffle[i]
// to 1.
for (int i = 0; i < kSimd128Size; ++i) {
if (shuffle[i] >= kSimd128Size) {
shuffle[i] = 0x80;
}
}
}
}
......
......@@ -127,6 +127,15 @@ bool SimdShuffle::TryMatchBlend(const uint8_t* shuffle) {
return true;
}
// Returns true when the 16 byte indices in |shuffle| have the shape of a
// packed byte to dword zero extend: within every group of four indices the
// first one is below 16 and the four group leaders form an ascending run
// (shuffle[0], shuffle[0] + 1, shuffle[0] + 2, shuffle[0] + 3), while the
// remaining three indices of every group are 16 or above. Callers are
// expected to pass a canonicalized shuffle whose second input is zero.
bool SimdShuffle::TryMatchByteToDwordZeroExtend(const uint8_t* shuffle) {
  constexpr int kBytesPerDword = 4;
  constexpr int kDwordCount = 4;
  // Indices below this value select a byte of the first shuffle input.
  constexpr int kFirstInputLimit = 16;

  const int run_start = shuffle[0];
  for (int dword = 0; dword < kDwordCount; ++dword) {
    const uint8_t* group = shuffle + dword * kBytesPerDword;

    // Group leader: taken from the first input and continuing the run.
    if (group[0] >= kFirstInputLimit) return false;
    if (group[0] != run_start + dword) return false;

    // The other three bytes of the dword must not select the first input.
    for (int byte = 1; byte < kBytesPerDword; ++byte) {
      if (group[byte] < kFirstInputLimit) return false;
    }
  }
  return true;
}
uint8_t SimdShuffle::PackShuffle4(uint8_t* shuffle) {
return (shuffle[0] & 3) | ((shuffle[1] & 3) << 2) | ((shuffle[2] & 3) << 4) |
((shuffle[3] & 3) << 6);
......
......@@ -83,6 +83,12 @@ class V8_EXPORT_PRIVATE SimdShuffle {
// shuffle should be canonicalized.
static bool TryMatchBlend(const uint8_t* shuffle);
// Tries to match a byte shuffle to a packed byte to dword zero extend
// operation. E.g. [8 x x x 9 x x x 10 x x x 11 x x x] (x is an arbitrary
// value larger than 15). The shuffle should be canonicalized. Its second
// input should be zero.
static bool TryMatchByteToDwordZeroExtend(const uint8_t* shuffle);
// Packs a 4 lane shuffle into a single imm8 suitable for use by pshufd,
// pshuflw, and pshufhw.
static uint8_t PackShuffle4(uint8_t* shuffle);
......
......@@ -2660,12 +2660,10 @@ WASM_SIMD_TEST(I8x16ShuffleWithZeroInput) {
WasmRunner<int32_t> r(execution_tier);
static const int kElems = kSimd128Size / sizeof(uint8_t);
uint8_t* dst = r.builder().AddGlobal<uint8_t>(kWasmS128);
uint8_t* src0 = r.builder().AddGlobal<uint8_t>(kWasmS128);
uint8_t* src1 = r.builder().AddGlobal<uint8_t>(kWasmS128);
// src0 is zero, it's used to zero extend src1
for (int i = 0; i < kElems; i++) {
LANE(src0, i) = 0;
LANE(src1, i) = i;
}
......@@ -2674,11 +2672,12 @@ WASM_SIMD_TEST(I8x16ShuffleWithZeroInput) {
18, 9, 10, 11, 19, 13, 14, 15};
constexpr std::array<int8_t, 16> expected = {0, 0, 0, 0, 1, 0, 0, 0,
2, 0, 0, 0, 3, 0, 0, 0};
constexpr std::array<int8_t, 16> zeros = {0};
BUILD(r,
WASM_GLOBAL_SET(0, WASM_SIMD_I8x16_SHUFFLE_OP(
kExprI8x16Shuffle, shuffle, WASM_GLOBAL_GET(1),
WASM_GLOBAL_GET(2))),
kExprI8x16Shuffle, shuffle,
WASM_SIMD_CONSTANT(zeros), WASM_GLOBAL_GET(1))),
WASM_ONE);
CHECK_EQ(1, r.Call());
for (int i = 0; i < kElems; i++) {
......
......@@ -2369,6 +2369,54 @@ TEST_P(InstructionSelectorSIMDArchShuffleTest, SIMDArchShuffle) {
INSTANTIATE_TEST_SUITE_P(InstructionSelectorTest,
InstructionSelectorSIMDArchShuffleTest,
::testing::ValuesIn(kArchShuffles));
// One parameter set for the shuffle-with-zero-input instruction selection
// test: a shuffle mask together with the instruction the selector is expected
// to produce for it. Entries are written as positional aggregates, so the
// field order below must not change.
struct ShuffleWithZeroInput {
// Byte indices passed to the I8x16Shuffle operator.
uint8_t shuffle_mask[kSimd128Size];
// Architecture opcode the selected instruction is expected to have.
ArchOpcode arch_opcode;
// Expected number of inputs on the selected instruction.
size_t input_count;
};
// Parameter table for InstructionSelectorSIMDShuffleWithZeroInputTest. The
// test supplies an all-zero constant as the first shuffle operand and a
// parameter as the second, so indices 16-31 pick parameter bytes and indices
// 0-15 pick zero bytes.
static constexpr ShuffleWithZeroInput kShuffleWithZeroInput[] = {
// Matched by TryMatchByteToDwordZeroExtend: every fourth index (16, 17, 18,
// 19) picks consecutive parameter bytes and all other indices pick zeros.
{
{16, 1, 2, 3, 17, 4, 5, 6, 18, 7, 8, 9, 19, 10, 11, 12},
kX64I32X4ShiftZeroExtendI8x16,
2,
},
// Generic shuffle that uses one zero input: the last three indices (20, 21,
// 22) pick parameter bytes instead of zeros, so the zero-extend pattern does
// not apply.
{
{16, 1, 2, 3, 17, 4, 5, 6, 18, 7, 8, 9, 19, 20, 21, 22},
kX64I8x16Shuffle,
5,
},
};
using InstructionSelectorSIMDShuffleWithZeroInputTest =
    InstructionSelectorTestWithParam<ShuffleWithZeroInput>;

// Builds a graph that shuffles an all-zero S128 constant (first operand) with
// a Simd128 parameter (second operand) using the mask from the current test
// parameter, then checks that instruction selection yields exactly one
// instruction with the expected opcode, input count and a single output.
TEST_P(InstructionSelectorSIMDShuffleWithZeroInputTest,
       SIMDShuffleWithZeroInputTest) {
  const MachineType simd128 = MachineType::Simd128();
  uint8_t zero_bytes[kSimd128Size] = {0};

  StreamBuilder m(this, simd128, simd128);
  const auto test_case = GetParam();
  const Operator* const shuffle_op =
      m.machine()->I8x16Shuffle(test_case.shuffle_mask);
  Node* const zero_vector = m.S128Const(zero_bytes);
  Node* const shuffle_node =
      m.AddNode(shuffle_op, zero_vector, m.Parameter(0));
  m.Return(shuffle_node);

  Stream s = m.Build();
  ASSERT_EQ(1U, s.size());
  EXPECT_EQ(test_case.arch_opcode, s[0]->arch_opcode());
  ASSERT_EQ(test_case.input_count, s[0]->InputCount());
  EXPECT_EQ(1U, s[0]->OutputCount());
}
// Instantiates the parameterized test above with every entry of
// kShuffleWithZeroInput as its parameter.
INSTANTIATE_TEST_SUITE_P(InstructionSelectorTest,
InstructionSelectorSIMDShuffleWithZeroInputTest,
::testing::ValuesIn(kShuffleWithZeroInput));
#endif // V8_ENABLE_WEBASSEMBLY
struct SwizzleConstants {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment