Commit 17568b91 authored by Bill Budge, committed by Commit Bot

[wasm simd] Generate better code for Dup shuffles

- Dup shuffles broadcast a single lane from a source
  operand to all lanes of the destination. Conceptually
  similar to a splat, they require special handling since
  the splatted value must be extracted from a source. The
  32x4 case is already well handled (pshufd) but 16x8 and
  8x16 currently generate the general shuffle code sequence.
- Adds IA32S16x8Dup, IA32S8x16Dup opcodes.

Bug: v8:6020
Change-Id: Ia4f044aa7e25cae30e8f9007c2488db738ca6cfc
Reviewed-on: https://chromium-review.googlesource.com/1128513
Reviewed-by: Jing Bao <jing.bao@intel.com>
Commit-Queue: Bill Budge <bbudge@chromium.org>
Cr-Commit-Position: refs/heads/master@{#54362}
parent f8bda2d3
......@@ -3281,6 +3281,52 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
case kIA32S8x16Alignr:
ASSEMBLE_SIMD_IMM_SHUFFLE(palignr, SSSE3, i.InputInt8(2));
break;
// S16x8Dup: broadcast a single 16-bit lane of the source to all 8 lanes of
// the destination, using two shuffles instead of the general shuffle path.
case kIA32S16x8Dup: {
XMMRegister dst = i.OutputSimd128Register();
Operand src = i.InputOperand(0);
// Lane index arrives as the second immediate; mask to the valid range 0..7.
int8_t lane = i.InputInt8(1) & 0x7;
// pshuflw/pshufhw can only permute within one 64-bit half, so reduce the
// lane index to a position within that half (0..3) ...
int8_t lane4 = lane & 0x3;
// ... and build an imm8 that selects that word for all four 2-bit fields,
// duplicating the lane across the half.
int8_t half_dup = lane4 | (lane4 << 2) | (lane4 << 4) | (lane4 << 6);
if (lane < 4) {
// Lane is in the low half: dup it across the low 4 words, then broadcast
// dword 0 (imm 0b00000000) to all four dwords.
__ Pshuflw(dst, src, half_dup);
__ Pshufd(dst, dst, 0);
} else {
// Lane is in the high half: dup it across the high 4 words, then
// broadcast dword 2 (imm 0b10101010) to all four dwords.
__ Pshufhw(dst, src, half_dup);
__ Pshufd(dst, dst, 0xaa);
}
break;
}
// S8x16Dup: broadcast a single 8-bit lane of the source to all 16 lanes of
// the destination. First widen the chosen byte into a 16-bit lane by
// interleaving the source with itself, then reuse the 16x8 dup sequence.
case kIA32S8x16Dup: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src = i.InputSimd128Register(0);
// Byte lane index arrives as the second immediate; mask to 0..15.
int8_t lane = i.InputInt8(1) & 0xf;
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope avx_scope(tasm(), AVX);
// Three-operand AVX forms: dst need not alias src (the selector sets
// no_same_as_first for this opcode when AVX is available).
if (lane < 8) {
// Interleave the low 8 bytes with themselves: byte i -> word lane i.
__ vpunpcklbw(dst, src, src);
} else {
// Interleave the high 8 bytes with themselves: byte i -> word lane i-8.
__ vpunpckhbw(dst, src, src);
}
} else {
// SSE forms are destructive; the register allocator must have placed
// the input in the output register.
DCHECK_EQ(dst, src);
if (lane < 8) {
__ punpcklbw(dst, dst);
} else {
__ punpckhbw(dst, dst);
}
}
// After the interleave, the chosen byte occupies 16-bit lane (lane % 8);
// from here on this is the same word-broadcast sequence as kIA32S16x8Dup.
lane &= 0x7;
int8_t lane4 = lane & 0x3;
// imm8 that replicates the half-local word index into all four 2-bit fields.
int8_t half_dup = lane4 | (lane4 << 2) | (lane4 << 4) | (lane4 << 6);
if (lane < 4) {
// Dup across the low 4 words, then broadcast dword 0 everywhere.
__ Pshuflw(dst, dst, half_dup);
__ Pshufd(dst, dst, 0);
} else {
// Dup across the high 4 words, then broadcast dword 2 (0b10101010).
__ Pshufhw(dst, dst, half_dup);
__ Pshufd(dst, dst, 0xaa);
}
break;
}
case kIA32S64x2UnpackHigh:
ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckhqdq);
break;
......
......@@ -317,6 +317,8 @@ namespace compiler {
V(IA32S16x8HalfShuffle1) \
V(IA32S16x8HalfShuffle2) \
V(IA32S8x16Alignr) \
V(IA32S16x8Dup) \
V(IA32S8x16Dup) \
V(SSES16x8UnzipHigh) \
V(AVXS16x8UnzipHigh) \
V(SSES16x8UnzipLow) \
......
......@@ -299,6 +299,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kIA32S16x8HalfShuffle1:
case kIA32S16x8HalfShuffle2:
case kIA32S8x16Alignr:
case kIA32S16x8Dup:
case kIA32S8x16Dup:
case kSSES16x8UnzipHigh:
case kAVXS16x8UnzipHigh:
case kSSES16x8UnzipLow:
......
......@@ -2238,6 +2238,7 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) {
uint8_t offset;
uint8_t shuffle32x4[4];
uint8_t shuffle16x8[8];
int index;
const ShuffleEntry* arch_shuffle;
if (TryMatchConcat(shuffle, &offset)) {
// Swap inputs from the normal order for (v)palignr.
......@@ -2286,6 +2287,10 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) {
opcode = kIA32S16x8Blend;
blend_mask = PackBlend8(shuffle16x8);
imms[imm_count++] = blend_mask;
} else if (TryMatchDup<8>(shuffle, &index)) {
opcode = kIA32S16x8Dup;
src0_needs_reg = false;
imms[imm_count++] = index;
} else if (TryMatch16x8HalfShuffle(shuffle16x8, &blend_mask)) {
opcode = is_swizzle ? kIA32S16x8HalfShuffle1 : kIA32S16x8HalfShuffle2;
// Half-shuffles don't need DefineSameAsFirst or UseRegister(src0).
......@@ -2297,6 +2302,11 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) {
imms[imm_count++] = mask_hi;
if (!is_swizzle) imms[imm_count++] = blend_mask;
}
} else if (TryMatchDup<16>(shuffle, &index)) {
opcode = kIA32S8x16Dup;
no_same_as_first = use_avx;
src0_needs_reg = true;
imms[imm_count++] = index;
}
if (opcode == kIA32S8x16Shuffle) {
// Use same-as-first for general swizzle, but not shuffle.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment