Commit 7c98abdb authored by Zhi An Ng's avatar Zhi An Ng Committed by Commit Bot

[x64][wasm-simd] Pattern match 32x4 rotate

Code like:

  x = wasm_v32x4_shuffle(x, x, 1, 2, 3, 0);

is currently matched by S8x16Concat, which lowers to two instructions:

  movapd xmm_dst, xmm_src
  palignr xmm_dst, xmm_src, 0x4

There is a special case after a S8x16Concat is matched:

- is_swizzle, the inputs are the same
- it is a 32x4 shuffle (offset % 4 == 0)

Which can have a better codegen:

- (dst == src) shufps dst, src, 0b00111001
- (dst != src) pshufd dst, src, 0b00111001

Add a new simd shuffle matcher which will match 32x4 rotate, and
construct the appropriate indices referring to the 32x4 elements.

pshufd for the given example. However, this matching happens after
S8x16Concat, so we get the palignr first. We could move the pattern
matching cases around, but it will lead to some cases
where it would have matched a S8x16Concat, but now matches a
S32x4shuffle instead, leading to worse codegen.

Note: we also pattern match on 32x4Swizzle, which correctly generates pshufd.
Change-Id: Ie3aca53bbc06826be2cf49632de4c24ec73d0a9a
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2589062
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71754}
parent e327fe69
......@@ -3906,6 +3906,18 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
break;
}
    // S32x4 rotate: a single-input 32x4 shuffle like [1, 2, 3, 0], encoded as
    // a pshufd/shufps immediate mask (input 1).
    case kX64S32x4Rotate: {
      XMMRegister dst = i.OutputSimd128Register();
      XMMRegister src = i.InputSimd128Register(0);
      uint8_t mask = i.InputUint8(1);
      if (dst == src) {
        // shufps with identical operands performs the same lane permutation
        // as pshufd here, with a 1-byte shorter encoding than pshufd.
        __ Shufps(dst, src, mask);
      } else {
        // Distinct registers: pshufd permutes src directly into dst, avoiding
        // the extra move that shufps (dst-as-first-operand) would require.
        __ Pshufd(dst, src, mask);
      }
      break;
    }
case kX64S32x4Swizzle: {
DCHECK_EQ(2, instr->InputCount());
ASSEMBLE_SIMD_IMM_INSTR(Pshufd, i.OutputSimd128Register(), 0,
......
......@@ -350,6 +350,7 @@ namespace compiler {
V(X64S128Load32x2U) \
V(X64S128Store32Lane) \
V(X64S128Store64Lane) \
V(X64S32x4Rotate) \
V(X64S32x4Swizzle) \
V(X64S32x4Shuffle) \
V(X64S16x8Blend) \
......
......@@ -312,6 +312,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kX64V16x8AllTrue:
case kX64I8x16Swizzle:
case kX64I8x16Shuffle:
case kX64S32x4Rotate:
case kX64S32x4Swizzle:
case kX64S32x4Shuffle:
case kX64S16x8Blend:
......
......@@ -3465,15 +3465,22 @@ void InstructionSelector::VisitI8x16Shuffle(Node* node) {
int index;
const ShuffleEntry* arch_shuffle;
if (wasm::SimdShuffle::TryMatchConcat(shuffle, &offset)) {
// Swap inputs from the normal order for (v)palignr.
SwapShuffleInputs(node);
is_swizzle = false; // It's simpler to just handle the general case.
no_same_as_first = false; // SSE requires same-as-first.
// TODO(v8:9608): also see v8:9083
src1_needs_reg = true;
opcode = kX64S8x16Alignr;
// palignr takes a single imm8 offset.
imms[imm_count++] = offset;
if (wasm::SimdShuffle::TryMatch32x4Rotate(shuffle, shuffle32x4,
is_swizzle)) {
uint8_t shuffle_mask = wasm::SimdShuffle::PackShuffle4(shuffle32x4);
opcode = kX64S32x4Rotate;
imms[imm_count++] = shuffle_mask;
} else {
// Swap inputs from the normal order for (v)palignr.
SwapShuffleInputs(node);
is_swizzle = false; // It's simpler to just handle the general case.
no_same_as_first = false; // SSE requires same-as-first.
// TODO(v8:9608): also see v8:9083
src1_needs_reg = true;
opcode = kX64S8x16Alignr;
// palignr takes a single imm8 offset.
imms[imm_count++] = offset;
}
} else if (TryMatchArchShuffle(shuffle, arch_shuffles,
arraysize(arch_shuffles), is_swizzle,
&arch_shuffle)) {
......
......@@ -58,6 +58,25 @@ bool SimdShuffle::TryMatchIdentity(const uint8_t* shuffle) {
return true;
}
// Tries to match an 8x16 byte shuffle that rotates the 32-bit lanes of a
// single input (e.g. [1, 2, 3, 0] in 32x4 terms). On success, writes the
// equivalent 32x4 lane indices to |shuffle32x4| and returns true.
// |is_swizzle| must be true (both shuffle inputs are the same node) for the
// match to make sense; otherwise this returns false.
bool SimdShuffle::TryMatch32x4Rotate(const uint8_t* shuffle,
                                     uint8_t* shuffle32x4, bool is_swizzle) {
  // Initialize so |offset| is never read uninitialized below: TryMatchConcat
  // only writes it on a successful match.
  uint8_t offset = 0;
  bool is_concat = TryMatchConcat(shuffle, &offset);
  // A rotate is a concat of an input with itself whose byte offset lands on
  // the low byte of a 32x4 element: the indices go
  // [offset, ..., 15, 0, ...], so it suffices to check offset % 4 == 0.
  if (!is_concat || !is_swizzle || offset % 4 != 0) {
    return false;
  }
  // Only check the offset once we know TryMatchConcat wrote it; 0 would be
  // the identity shuffle, which TryMatchConcat never matches.
  DCHECK_NE(offset, 0);
  uint8_t offset_32 = offset / 4;
  for (int i = 0; i < 4; i++) {
    shuffle32x4[i] = (offset_32 + i) % 4;
  }
  return true;
}
bool SimdShuffle::TryMatch32x4Shuffle(const uint8_t* shuffle,
uint8_t* shuffle32x4) {
for (int i = 0; i < 4; ++i) {
......
......@@ -51,6 +51,12 @@ class V8_EXPORT_PRIVATE SimdShuffle {
return true;
}
  // Tries to match a 32x4 rotate, i.e. a shuffle like [1, 2, 3, 0] in 32x4
  // terms; only makes sense if the two shuffle inputs are equal (is_swizzle).
  // Such a shuffle would also match a Concat, but this form can have better
  // codegen (a single pshufd/shufps instead of a move + palignr). On success,
  // writes the 32x4 lane indices to shuffle32x4 and returns true.
  static bool TryMatch32x4Rotate(const uint8_t* shuffle, uint8_t* shuffle32x4,
                                 bool is_swizzle);
// Tries to match an 8x16 byte shuffle to an equivalent 32x4 shuffle. If
// successful, it writes the 32x4 shuffle word indices. E.g.
// [0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15] == [0 2 1 3]
......
......@@ -2951,6 +2951,7 @@ void RunShuffleOpTest(TestExecutionTier execution_tier, LowerSimd lower_simd,
V(S32x4TransposeRight) \
V(S32x2Reverse) \
V(S32x4Irregular) \
V(S32x4Rotate) \
V(S16x8Dup) \
V(S16x8ZipLeft) \
V(S16x8ZipRight) \
......@@ -3003,6 +3004,7 @@ ShuffleMap test_shuffles = {
{{4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11}}},
{kS32x4Irregular,
{{0, 1, 2, 3, 16, 17, 18, 19, 16, 17, 18, 19, 20, 21, 22, 23}}},
{kS32x4Rotate, {{4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3}}},
{kS16x8Dup,
{{18, 19, 18, 19, 18, 19, 18, 19, 18, 19, 18, 19, 18, 19, 18, 19}}},
{kS16x8ZipLeft, {{0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23}}},
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment