Commit 3bb0f51a authored by Zhi An Ng, committed by Commit Bot

[wasm-simd][x64] Pattern match on shufps-style shuffles

When an 8x16 shuffle matches a 32x4 shuffle (every group of 4 indices is
consecutive), the first 2 indices are in the range [0-3], and the
other 2 indices are in the range [4-7], we can match it to a
shufps, e.g. [0,2,4,6] or [1,3,5,7]. These shuffles are commonly used to
extract odd/even floats.

Change-Id: I031fe44f71a13bbc72115c22b02a5eaaf29d3794
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2596579
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71860}
parent 1215f2a8
...@@ -3936,6 +3936,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -3936,6 +3936,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
} }
break; break;
} }
// Emit Shufps for the kX64Shufps opcode: the destination is the instruction's
// SIMD output register, inputs 0 and 1 are the two SIMD source registers, and
// input 2 supplies the immediate (read via InputUint8) passed as the final
// argument.
case kX64Shufps: {
__ Shufps(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1), i.InputUint8(2));
break;
}
case kX64S32x4Rotate: { case kX64S32x4Rotate: {
XMMRegister dst = i.OutputSimd128Register(); XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0); XMMRegister src = i.InputSimd128Register(0);
......
...@@ -350,6 +350,7 @@ namespace compiler { ...@@ -350,6 +350,7 @@ namespace compiler {
V(X64S128Load32x2U) \ V(X64S128Load32x2U) \
V(X64S128Store32Lane) \ V(X64S128Store32Lane) \
V(X64S128Store64Lane) \ V(X64S128Store64Lane) \
V(X64Shufps) \
V(X64S32x4Rotate) \ V(X64S32x4Rotate) \
V(X64S32x4Swizzle) \ V(X64S32x4Swizzle) \
V(X64S32x4Shuffle) \ V(X64S32x4Shuffle) \
......
...@@ -312,6 +312,7 @@ int InstructionScheduler::GetTargetInstructionFlags( ...@@ -312,6 +312,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kX64V16x8AllTrue: case kX64V16x8AllTrue:
case kX64I8x16Swizzle: case kX64I8x16Swizzle:
case kX64I8x16Shuffle: case kX64I8x16Shuffle:
case kX64Shufps:
case kX64S32x4Rotate: case kX64S32x4Rotate:
case kX64S32x4Swizzle: case kX64S32x4Swizzle:
case kX64S32x4Shuffle: case kX64S32x4Shuffle:
......
...@@ -3447,6 +3447,15 @@ bool TryMatchArchShuffle(const uint8_t* shuffle, const ShuffleEntry* table, ...@@ -3447,6 +3447,15 @@ bool TryMatchArchShuffle(const uint8_t* shuffle, const ShuffleEntry* table,
return false; return false;
} }
// Returns true when the four-entry 32x4 shuffle can be expressed as a shufps:
// entries 0 and 1 must pick lanes of the first input (indices 0-3) and
// entries 2 and 3 must pick lanes of the second input (indices 4-7).
bool TryMatchShufps(const uint8_t* shuffle32x4) {
  // Debug-only sanity check that the upper two entries are below 8.
  DCHECK_GT(8, shuffle32x4[2]);
  DCHECK_GT(8, shuffle32x4[3]);

  const bool low_half_from_first_input =
      shuffle32x4[0] <= 3 && shuffle32x4[1] <= 3;
  const bool high_half_from_second_input =
      shuffle32x4[2] >= 4 && shuffle32x4[3] >= 4;
  return low_half_from_first_input && high_half_from_second_input;
}
} // namespace } // namespace
void InstructionSelector::VisitI8x16Shuffle(Node* node) { void InstructionSelector::VisitI8x16Shuffle(Node* node) {
...@@ -3529,6 +3538,12 @@ void InstructionSelector::VisitI8x16Shuffle(Node* node) { ...@@ -3529,6 +3538,12 @@ void InstructionSelector::VisitI8x16Shuffle(Node* node) {
uint8_t blend_mask = wasm::SimdShuffle::PackBlend4(shuffle32x4); uint8_t blend_mask = wasm::SimdShuffle::PackBlend4(shuffle32x4);
imms[imm_count++] = blend_mask; imms[imm_count++] = blend_mask;
no_same_as_first = CpuFeatures::IsSupported(AVX); no_same_as_first = CpuFeatures::IsSupported(AVX);
} else if (TryMatchShufps(shuffle32x4)) {
opcode = kX64Shufps;
uint8_t mask = wasm::SimdShuffle::PackShuffle4(shuffle32x4);
imms[imm_count++] = mask;
src1_needs_reg = true;
no_same_as_first = IsSupported(AVX);
} else { } else {
opcode = kX64S32x4Shuffle; opcode = kX64S32x4Shuffle;
no_same_as_first = true; no_same_as_first = true;
......
...@@ -3082,6 +3082,28 @@ WASM_SIMD_TEST(S8x16Concat) { ...@@ -3082,6 +3082,28 @@ WASM_SIMD_TEST(S8x16Concat) {
} }
} }
WASM_SIMD_TEST(ShuffleShufps) {
  // For every possible shufps immediate, derive the equivalent 8x16 byte
  // shuffle and run it through the shuffle test helper.
  std::array<int8_t, kSimd128Size> expected;
  for (int mask = 0; mask < 256; mask++) {
    // Each 2-bit field of the immediate names a 32-bit lane; multiplying by 4
    // gives the index of that lane's first byte. The two low fields read from
    // the first input, the two high fields read from the second input, whose
    // bytes start at index 16.
    const uint8_t first_byte[4] = {
        static_cast<uint8_t>((mask & 3) * 4),
        static_cast<uint8_t>(((mask >> 2) & 3) * 4),
        static_cast<uint8_t>(((mask >> 4) & 3) * 4 + 16),
        static_cast<uint8_t>(((mask >> 6) & 3) * 4 + 16)};
    // Expand each selected lane into its four consecutive byte indices.
    for (int lane = 0; lane < 4; lane++) {
      for (int byte = 0; byte < 4; byte++) {
        expected[lane * 4 + byte] = first_byte[lane] + byte;
      }
    }
    RunShuffleOpTest(execution_tier, lower_simd, kExprI8x16Shuffle, expected);
  }
}
struct SwizzleTestArgs { struct SwizzleTestArgs {
const Shuffle input; const Shuffle input;
const Shuffle indices; const Shuffle indices;
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment