Commit 51ded9a7, authored by Bill Budge, committed by Commit Bot

[wasm simd] Handle more shuffles

- Handles zip, unzip, and transpose shuffles/swizzles.
- Adds punpck* instructions to assembler.

Bug: v8:6020
Change-Id: If124b7a7462ffd0470347b54ce4a93c01667e384
Reviewed-on: https://chromium-review.googlesource.com/1084069
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Bill Budge <bbudge@chromium.org>
Cr-Commit-Position: refs/heads/master@{#53947}
parent edfcba04
...@@ -420,6 +420,30 @@ void EmitWordLoadPoisoningIfNeeded(CodeGenerator* codegen, ...@@ -420,6 +420,30 @@ void EmitWordLoadPoisoningIfNeeded(CodeGenerator* codegen,
} \ } \
} while (0) } while (0)
// Emits the punpck* instruction named by |opcode| for the current instruction.
// With AVX the three-operand v-prefixed form is used; otherwise the
// two-operand SSE form is emitted, which overwrites its first operand and
// therefore requires the output register to be the same as input 0.
// When the instruction carries only one input (a swizzle), input 0 is reused
// as the second operand.
#define ASSEMBLE_SIMD_PUNPCK_SHUFFLE(opcode)                    \
  do {                                                          \
    XMMRegister lhs = i.InputSimd128Register(0);                \
    const bool has_two_inputs = instr->InputCount() == 2;       \
    Operand rhs = i.InputOperand(has_two_inputs ? 1 : 0);       \
    XMMRegister dst = i.OutputSimd128Register();                \
    if (!CpuFeatures::IsSupported(AVX)) {                       \
      DCHECK_EQ(dst, lhs);                                      \
      __ opcode(dst, rhs);                                      \
    } else {                                                    \
      CpuFeatureScope avx_scope(tasm(), AVX);                   \
      __ v##opcode(dst, lhs, rhs);                              \
    }                                                           \
  } while (false)
// Emits a two-source shuffle instruction that takes an 8-bit immediate
// (e.g. pblendw, palignr). With AVX the three-operand v-prefixed form is used;
// otherwise the SSE form is emitted under the given |SSELevel| feature scope,
// which overwrites its first operand and therefore requires the output
// register to be the same as input 0.
// The expansion is wrapped in do { } while (false) so that it behaves as a
// single statement: callers write `ASSEMBLE_SIMD_IMM_SHUFFLE(...);` and the
// macro is safe inside an unbraced if/else.
#define ASSEMBLE_SIMD_IMM_SHUFFLE(opcode, SSELevel, imm)                 \
  do {                                                                   \
    if (CpuFeatures::IsSupported(AVX)) {                                 \
      CpuFeatureScope avx_scope(tasm(), AVX);                            \
      __ v##opcode(i.OutputSimd128Register(), i.InputSimd128Register(0), \
                   i.InputOperand(1), imm);                              \
    } else {                                                             \
      CpuFeatureScope sse_scope(tasm(), SSELevel);                       \
      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));   \
      __ opcode(i.OutputSimd128Register(), i.InputOperand(1), imm);      \
    }                                                                    \
  } while (false)
void CodeGenerator::AssembleDeconstructFrame() { void CodeGenerator::AssembleDeconstructFrame() {
__ mov(esp, ebp); __ mov(esp, ebp);
__ pop(ebp); __ pop(ebp);
...@@ -3195,23 +3219,16 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -3195,23 +3219,16 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
} }
case kIA32S32x4Shuffle: { case kIA32S32x4Shuffle: {
DCHECK_EQ(4, instr->InputCount()); // Swizzles should be handled above. DCHECK_EQ(4, instr->InputCount()); // Swizzles should be handled above.
__ Pshufd(kScratchDoubleReg, i.InputOperand(1), i.InputInt8(2)); int8_t shuffle = i.InputInt8(2);
__ Pshufd(i.OutputSimd128Register(), i.InputOperand(0), i.InputInt8(2)); DCHECK_NE(0xe4, shuffle); // A simple blend should be handled below.
__ Pshufd(kScratchDoubleReg, i.InputOperand(1), shuffle);
__ Pshufd(i.OutputSimd128Register(), i.InputOperand(0), shuffle);
__ Pblendw(i.OutputSimd128Register(), kScratchDoubleReg, i.InputInt8(3)); __ Pblendw(i.OutputSimd128Register(), kScratchDoubleReg, i.InputInt8(3));
break; break;
} }
case kSSES16x8Blend: { case kIA32S16x8Blend:
CpuFeatureScope sse_scope(tasm(), SSE4_1); ASSEMBLE_SIMD_IMM_SHUFFLE(pblendw, SSE4_1, i.InputInt8(2));
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ pblendw(i.OutputSimd128Register(), i.InputOperand(1), i.InputInt8(2));
break;
}
case kAVXS16x8Blend: {
CpuFeatureScope sse_scope(tasm(), AVX);
__ vpblendw(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1), i.InputInt8(2));
break; break;
}
case kIA32S16x8HalfShuffle1: { case kIA32S16x8HalfShuffle1: {
XMMRegister dst = i.OutputSimd128Register(); XMMRegister dst = i.OutputSimd128Register();
__ Pshuflw(dst, i.InputOperand(0), i.InputInt8(1)); __ Pshuflw(dst, i.InputOperand(0), i.InputInt8(1));
...@@ -3227,18 +3244,202 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -3227,18 +3244,202 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ Pblendw(dst, kScratchDoubleReg, i.InputInt8(4)); __ Pblendw(dst, kScratchDoubleReg, i.InputInt8(4));
break; break;
} }
case kSSES8x16Alignr: { case kIA32S8x16Alignr:
CpuFeatureScope sse_scope(tasm(), SSSE3); ASSEMBLE_SIMD_IMM_SHUFFLE(palignr, SSSE3, i.InputInt8(2));
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0)); break;
__ palignr(i.OutputSimd128Register(), i.InputOperand(1), i.InputInt8(2)); case kIA32S64x2UnpackHigh:
ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckhqdq);
break;
case kIA32S32x4UnpackHigh:
ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckhdq);
break;
case kIA32S16x8UnpackHigh:
ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckhwd);
break;
case kIA32S8x16UnpackHigh:
ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckhbw);
break;
case kIA32S64x2UnpackLow:
ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpcklqdq);
break;
case kIA32S32x4UnpackLow:
ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpckldq);
break;
case kIA32S16x8UnpackLow:
ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpcklwd);
break;
case kIA32S8x16UnpackLow:
ASSEMBLE_SIMD_PUNPCK_SHUFFLE(punpcklbw);
break;
case kSSES16x8UnzipHigh: {
CpuFeatureScope sse_scope(tasm(), SSE4_1);
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src2 = dst;
DCHECK_EQ(dst, i.InputSimd128Register(0));
if (instr->InputCount() == 2) {
__ movups(kScratchDoubleReg, i.InputOperand(1));
__ psrld(kScratchDoubleReg, 16);
src2 = kScratchDoubleReg;
}
__ psrld(dst, 16);
__ packusdw(dst, src2);
break;
}
case kAVXS16x8UnzipHigh: {
CpuFeatureScope avx_scope(tasm(), AVX);
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src2 = dst;
if (instr->InputCount() == 2) {
__ vpsrld(kScratchDoubleReg, i.InputSimd128Register(1), 16);
src2 = kScratchDoubleReg;
}
__ vpsrld(dst, i.InputSimd128Register(0), 16);
__ vpackusdw(dst, dst, src2);
break;
}
case kSSES16x8UnzipLow: {
CpuFeatureScope sse_scope(tasm(), SSE4_1);
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src2 = dst;
DCHECK_EQ(dst, i.InputSimd128Register(0));
__ pxor(kScratchDoubleReg, kScratchDoubleReg);
if (instr->InputCount() == 2) {
__ pblendw(kScratchDoubleReg, i.InputOperand(1), 0x55);
src2 = kScratchDoubleReg;
}
__ pblendw(dst, kScratchDoubleReg, 0xaa);
__ packusdw(dst, src2);
break;
}
case kAVXS16x8UnzipLow: {
CpuFeatureScope avx_scope(tasm(), AVX);
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src2 = dst;
__ vpxor(kScratchDoubleReg, kScratchDoubleReg, kScratchDoubleReg);
if (instr->InputCount() == 2) {
__ vpblendw(kScratchDoubleReg, kScratchDoubleReg, i.InputOperand(1),
0x55);
src2 = kScratchDoubleReg;
}
__ vpblendw(dst, kScratchDoubleReg, i.InputSimd128Register(0), 0x55);
__ vpackusdw(dst, dst, src2);
break;
}
case kSSES8x16UnzipHigh: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src2 = dst;
DCHECK_EQ(dst, i.InputSimd128Register(0));
if (instr->InputCount() == 2) {
__ movups(kScratchDoubleReg, i.InputOperand(1));
__ psrlw(kScratchDoubleReg, 8);
src2 = kScratchDoubleReg;
}
__ psrlw(dst, 8);
__ packuswb(dst, src2);
break;
}
case kAVXS8x16UnzipHigh: {
CpuFeatureScope avx_scope(tasm(), AVX);
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src2 = dst;
if (instr->InputCount() == 2) {
__ vpsrlw(kScratchDoubleReg, i.InputSimd128Register(1), 8);
src2 = kScratchDoubleReg;
}
__ vpsrlw(dst, i.InputSimd128Register(0), 8);
__ vpackuswb(dst, dst, src2);
break;
}
case kSSES8x16UnzipLow: {
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src2 = dst;
DCHECK_EQ(dst, i.InputSimd128Register(0));
if (instr->InputCount() == 2) {
__ movups(kScratchDoubleReg, i.InputOperand(1));
__ psllw(kScratchDoubleReg, 8);
__ psrlw(kScratchDoubleReg, 8);
src2 = kScratchDoubleReg;
}
__ psllw(dst, 8);
__ psrlw(dst, 8);
__ packuswb(dst, src2);
break;
}
case kAVXS8x16UnzipLow: {
CpuFeatureScope avx_scope(tasm(), AVX);
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src2 = dst;
if (instr->InputCount() == 2) {
__ vpsllw(kScratchDoubleReg, i.InputSimd128Register(1), 8);
__ vpsrlw(kScratchDoubleReg, kScratchDoubleReg, 8);
src2 = kScratchDoubleReg;
}
__ vpsllw(dst, i.InputSimd128Register(0), 8);
__ vpsrlw(dst, dst, 8);
__ vpackuswb(dst, dst, src2);
break;
}
case kSSES8x16TransposeLow: {
XMMRegister dst = i.OutputSimd128Register();
DCHECK_EQ(dst, i.InputSimd128Register(0));
__ psllw(dst, 8);
if (instr->InputCount() == 1) {
__ movups(kScratchDoubleReg, dst);
} else {
DCHECK_EQ(2, instr->InputCount());
__ movups(kScratchDoubleReg, i.InputOperand(1));
__ psllw(kScratchDoubleReg, 8);
}
__ psrlw(dst, 8);
__ por(dst, kScratchDoubleReg);
break;
}
case kAVXS8x16TransposeLow: {
CpuFeatureScope avx_scope(tasm(), AVX);
XMMRegister dst = i.OutputSimd128Register();
if (instr->InputCount() == 1) {
__ vpsllw(kScratchDoubleReg, i.InputSimd128Register(0), 8);
__ vpsrlw(dst, kScratchDoubleReg, 8);
} else {
DCHECK_EQ(2, instr->InputCount());
__ vpsllw(kScratchDoubleReg, i.InputSimd128Register(1), 8);
__ vpsllw(dst, i.InputSimd128Register(0), 8);
__ vpsrlw(dst, dst, 8);
}
__ vpor(dst, dst, kScratchDoubleReg);
break;
}
case kSSES8x16TransposeHigh: {
XMMRegister dst = i.OutputSimd128Register();
DCHECK_EQ(dst, i.InputSimd128Register(0));
__ psrlw(dst, 8);
if (instr->InputCount() == 1) {
__ movups(kScratchDoubleReg, dst);
} else {
DCHECK_EQ(2, instr->InputCount());
__ movups(kScratchDoubleReg, i.InputOperand(1));
__ psrlw(kScratchDoubleReg, 8);
}
__ psllw(kScratchDoubleReg, 8);
__ por(dst, kScratchDoubleReg);
break; break;
} }
case kAVXS8x16Alignr: { case kAVXS8x16TransposeHigh: {
CpuFeatureScope avx_scope(tasm(), AVX); CpuFeatureScope avx_scope(tasm(), AVX);
__ vpalignr(i.OutputSimd128Register(), i.InputSimd128Register(0), XMMRegister dst = i.OutputSimd128Register();
i.InputOperand(1), i.InputInt8(2)); if (instr->InputCount() == 1) {
__ vpsrlw(dst, i.InputSimd128Register(0), 8);
__ vpsllw(kScratchDoubleReg, dst, 8);
} else {
DCHECK_EQ(2, instr->InputCount());
__ vpsrlw(kScratchDoubleReg, i.InputSimd128Register(1), 8);
__ vpsrlw(dst, i.InputSimd128Register(0), 8);
__ vpsllw(kScratchDoubleReg, kScratchDoubleReg, 8);
}
__ vpor(dst, dst, kScratchDoubleReg);
break; break;
} }
case kIA32S1x4AnyTrue: case kIA32S1x4AnyTrue:
case kIA32S1x8AnyTrue: case kIA32S1x8AnyTrue:
case kIA32S1x16AnyTrue: { case kIA32S1x16AnyTrue: {
...@@ -4136,6 +4337,8 @@ void CodeGenerator::AssembleJumpTable(Label** targets, size_t target_count) { ...@@ -4136,6 +4337,8 @@ void CodeGenerator::AssembleJumpTable(Label** targets, size_t target_count) {
#undef ASSEMBLE_BINOP #undef ASSEMBLE_BINOP
#undef ASSEMBLE_ATOMIC_BINOP #undef ASSEMBLE_ATOMIC_BINOP
#undef ASSEMBLE_MOVX #undef ASSEMBLE_MOVX
#undef ASSEMBLE_SIMD_PUNPCK_SHUFFLE
#undef ASSEMBLE_SIMD_IMM_SHUFFLE
} // namespace compiler } // namespace compiler
} // namespace internal } // namespace internal
......
...@@ -303,12 +303,30 @@ namespace compiler { ...@@ -303,12 +303,30 @@ namespace compiler {
V(IA32S8x16Shuffle) \ V(IA32S8x16Shuffle) \
V(IA32S32x4Swizzle) \ V(IA32S32x4Swizzle) \
V(IA32S32x4Shuffle) \ V(IA32S32x4Shuffle) \
V(SSES16x8Blend) \ V(IA32S16x8Blend) \
V(AVXS16x8Blend) \
V(IA32S16x8HalfShuffle1) \ V(IA32S16x8HalfShuffle1) \
V(IA32S16x8HalfShuffle2) \ V(IA32S16x8HalfShuffle2) \
V(SSES8x16Alignr) \ V(IA32S8x16Alignr) \
V(AVXS8x16Alignr) \ V(SSES16x8UnzipHigh) \
V(AVXS16x8UnzipHigh) \
V(SSES16x8UnzipLow) \
V(AVXS16x8UnzipLow) \
V(SSES8x16UnzipHigh) \
V(AVXS8x16UnzipHigh) \
V(SSES8x16UnzipLow) \
V(AVXS8x16UnzipLow) \
V(IA32S64x2UnpackHigh) \
V(IA32S32x4UnpackHigh) \
V(IA32S16x8UnpackHigh) \
V(IA32S8x16UnpackHigh) \
V(IA32S64x2UnpackLow) \
V(IA32S32x4UnpackLow) \
V(IA32S16x8UnpackLow) \
V(IA32S8x16UnpackLow) \
V(SSES8x16TransposeLow) \
V(AVXS8x16TransposeLow) \
V(SSES8x16TransposeHigh) \
V(AVXS8x16TransposeHigh) \
V(IA32S1x4AnyTrue) \ V(IA32S1x4AnyTrue) \
V(IA32S1x4AllTrue) \ V(IA32S1x4AllTrue) \
V(IA32S1x8AnyTrue) \ V(IA32S1x8AnyTrue) \
......
...@@ -285,12 +285,30 @@ int InstructionScheduler::GetTargetInstructionFlags( ...@@ -285,12 +285,30 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kIA32S8x16Shuffle: case kIA32S8x16Shuffle:
case kIA32S32x4Swizzle: case kIA32S32x4Swizzle:
case kIA32S32x4Shuffle: case kIA32S32x4Shuffle:
case kSSES16x8Blend: case kIA32S16x8Blend:
case kAVXS16x8Blend:
case kIA32S16x8HalfShuffle1: case kIA32S16x8HalfShuffle1:
case kIA32S16x8HalfShuffle2: case kIA32S16x8HalfShuffle2:
case kSSES8x16Alignr: case kIA32S8x16Alignr:
case kAVXS8x16Alignr: case kSSES16x8UnzipHigh:
case kAVXS16x8UnzipHigh:
case kSSES16x8UnzipLow:
case kAVXS16x8UnzipLow:
case kSSES8x16UnzipHigh:
case kAVXS8x16UnzipHigh:
case kSSES8x16UnzipLow:
case kAVXS8x16UnzipLow:
case kIA32S64x2UnpackHigh:
case kIA32S32x4UnpackHigh:
case kIA32S16x8UnpackHigh:
case kIA32S8x16UnpackHigh:
case kIA32S64x2UnpackLow:
case kIA32S32x4UnpackLow:
case kIA32S16x8UnpackLow:
case kIA32S8x16UnpackLow:
case kSSES8x16TransposeLow:
case kAVXS8x16TransposeLow:
case kSSES8x16TransposeHigh:
case kAVXS8x16TransposeHigh:
case kIA32S1x4AnyTrue: case kIA32S1x4AnyTrue:
case kIA32S1x4AllTrue: case kIA32S1x4AllTrue:
case kIA32S1x8AnyTrue: case kIA32S1x8AnyTrue:
......
...@@ -2059,16 +2059,123 @@ uint8_t PackBlend4(const uint8_t* shuffle32x4) { ...@@ -2059,16 +2059,123 @@ uint8_t PackBlend4(const uint8_t* shuffle32x4) {
return result; return result;
} }
// Returns true if shuffle can be decomposed into two 16x4 half shuffles
// followed by a 16x8 blend, i.e. no output lane takes its value from the
// opposite half (low four vs. high four lanes) of its source.
// E.g. [3 2 1 0 15 14 13 12].
//
// |shuffle16x8| holds eight 16-bit lane indices and is only read.
// |blend_mask| is reset to 0 on entry; on success bit i is set when output
// lane i uses a lane index greater than 7. If the function returns false the
// mask may be only partially filled in and should be ignored.
bool TryMatch16x8HalfShuffle(const uint8_t* shuffle16x8, uint8_t* blend_mask) {
  *blend_mask = 0;
  for (int i = 0; i < 8; i++) {
    // Bit 2 of a lane index says which half it lives in; it has to agree with
    // the half that output lane i belongs to.
    if ((shuffle16x8[i] & 0x4) != (i & 0x4)) return false;
    *blend_mask |= (shuffle16x8[i] > 7 ? 1 : 0) << i;
  }
  return true;
}
// One row of a table mapping a fixed byte-shuffle pattern to the opcodes that
// implement it, together with operand constraints for the instruction selector.
struct ShuffleEntry {
// Byte-lane pattern compared (under a mask) against the incoming shuffle by
// TryMatchArchShuffle.
uint8_t shuffle[kSimd128Size];
// Opcode selected when AVX is not in use.
ArchOpcode opcode;
// Opcode selected when AVX is in use; may be the same as |opcode|.
ArchOpcode avx_opcode;
// Whether input 0 must be passed with UseRegister rather than Use.
bool src0_needs_reg;
// Whether input 1 must be passed with UseRegister rather than Use. The
// selector only honors this when AVX is in use.
bool src1_needs_reg;
};
// Shuffles that map to architecture-specific instruction sequences. These are
// matched very early, so we shouldn't include shuffles that match better in
// later tests, like 32x4 and 16x8 shuffles. In general, these patterns should
// map to either a single instruction, or be finer grained, such as zip/unzip or
// transpose patterns.
//
// Each row is {pattern, opcode, avx_opcode, src0_needs_reg, src1_needs_reg}.
// Pattern values 0-15 appear to index bytes of the first input and 16-31 bytes
// of the second; for swizzles the matcher ignores the source-select bit.
static const ShuffleEntry arch_shuffles[] = {
// Unpack (interleave) patterns: one opcode serves both the SSE and AVX paths.
{{0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23},
kIA32S64x2UnpackLow,
kIA32S64x2UnpackLow,
true,
false},
{{8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31},
kIA32S64x2UnpackHigh,
kIA32S64x2UnpackHigh,
true,
false},
{{0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23},
kIA32S32x4UnpackLow,
kIA32S32x4UnpackLow,
true,
false},
{{8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31},
kIA32S32x4UnpackHigh,
kIA32S32x4UnpackHigh,
true,
false},
{{0, 1, 16, 17, 2, 3, 18, 19, 4, 5, 20, 21, 6, 7, 22, 23},
kIA32S16x8UnpackLow,
kIA32S16x8UnpackLow,
true,
false},
{{8, 9, 24, 25, 10, 11, 26, 27, 12, 13, 28, 29, 14, 15, 30, 31},
kIA32S16x8UnpackHigh,
kIA32S16x8UnpackHigh,
true,
false},
{{0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23},
kIA32S8x16UnpackLow,
kIA32S8x16UnpackLow,
true,
false},
{{8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31},
kIA32S8x16UnpackHigh,
kIA32S8x16UnpackHigh,
true,
false},
// Unzip patterns (every other 16-bit or 8-bit lane of both inputs): separate
// SSE and AVX opcodes.
{{0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29},
kSSES16x8UnzipLow,
kAVXS16x8UnzipLow,
true,
false},
{{2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31},
kSSES16x8UnzipHigh,
kAVXS16x8UnzipHigh,
true,
true},
{{0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30},
kSSES8x16UnzipLow,
kAVXS8x16UnzipLow,
true,
true},
{{1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31},
kSSES8x16UnzipHigh,
kAVXS8x16UnzipHigh,
true,
true},
// 8x16 transpose patterns (even or odd bytes of the two inputs, interleaved).
{{0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30},
kSSES8x16TransposeLow,
kAVXS8x16TransposeLow,
true,
true},
{{1, 17, 3, 19, 5, 21, 7, 23, 9, 25, 11, 27, 13, 29, 15, 31},
kSSES8x16TransposeHigh,
kAVXS8x16TransposeHigh,
true,
true}};
bool TryMatchArchShuffle(const uint8_t* shuffle, const ShuffleEntry* table,
size_t num_entries, bool is_swizzle,
const ShuffleEntry** arch_shuffle) {
uint8_t mask = is_swizzle ? kSimd128Size - 1 : 2 * kSimd128Size - 1;
for (size_t i = 0; i < num_entries; ++i) {
const ShuffleEntry& entry = table[i];
int j = 0;
for (; j < kSimd128Size; ++j) {
if ((entry.shuffle[j] & mask) != (shuffle[j] & mask)) {
break;
}
} }
if (j == kSimd128Size) {
*arch_shuffle = &entry;
return true; return true;
}
}
return false;
} }
} // namespace } // namespace
...@@ -2090,40 +2197,51 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) { ...@@ -2090,40 +2197,51 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) {
bool use_avx = CpuFeatures::IsSupported(AVX); bool use_avx = CpuFeatures::IsSupported(AVX);
// AVX and swizzles don't generally need DefineSameAsFirst to avoid a move. // AVX and swizzles don't generally need DefineSameAsFirst to avoid a move.
bool no_same_as_first = use_avx || is_swizzle; bool no_same_as_first = use_avx || is_swizzle;
// We generally need UseRegister for the first source. // We generally need UseRegister for input0, Use for input1.
bool no_use_register = false; bool src0_needs_reg = true;
bool src1_needs_reg = false;
ArchOpcode opcode = kIA32S8x16Shuffle; // general shuffle is the default ArchOpcode opcode = kIA32S8x16Shuffle; // general shuffle is the default
uint8_t offset; uint8_t offset;
uint8_t shuffle32x4[4]; uint8_t shuffle32x4[4];
uint8_t shuffle16x8[8]; uint8_t shuffle16x8[8];
const ShuffleEntry* arch_shuffle;
if (TryMatchConcat(shuffle, &offset)) { if (TryMatchConcat(shuffle, &offset)) {
// Swap inputs from the normal order for (v)palignr. // Swap inputs from the normal order for (v)palignr.
SwapShuffleInputs(node); SwapShuffleInputs(node);
is_swizzle = false; // It's simpler to just handle the general case. is_swizzle = false; // It's simpler to just handle the general case.
no_same_as_first = use_avx; // SSE requires same-as-first. no_same_as_first = use_avx; // SSE requires same-as-first.
opcode = use_avx ? kAVXS8x16Alignr : kSSES8x16Alignr; opcode = kIA32S8x16Alignr;
// palignr takes a single imm8 offset. // palignr takes a single imm8 offset.
imms[imm_count++] = offset; imms[imm_count++] = offset;
} else if (TryMatchArchShuffle(shuffle, arch_shuffles,
arraysize(arch_shuffles), is_swizzle,
&arch_shuffle)) {
opcode = use_avx ? arch_shuffle->avx_opcode : arch_shuffle->opcode;
src0_needs_reg = arch_shuffle->src0_needs_reg;
// SSE can't take advantage of both operands in registers and needs
// same-as-first.
src1_needs_reg = use_avx && arch_shuffle->src1_needs_reg;
no_same_as_first = use_avx;
} else if (TryMatch32x4Shuffle(shuffle, shuffle32x4)) { } else if (TryMatch32x4Shuffle(shuffle, shuffle32x4)) {
uint8_t shuffle_mask = PackShuffle4(shuffle32x4); uint8_t shuffle_mask = PackShuffle4(shuffle32x4);
if (is_swizzle) { if (is_swizzle) {
// pshufd takes a single imm8 shuffle mask. // pshufd takes a single imm8 shuffle mask.
opcode = kIA32S32x4Swizzle; opcode = kIA32S32x4Swizzle;
no_same_as_first = true; no_same_as_first = true;
no_use_register = true; src0_needs_reg = false;
imms[imm_count++] = shuffle_mask; imms[imm_count++] = shuffle_mask;
} else { } else {
// 2 operand shuffle // 2 operand shuffle
// A blend is more efficient than a general 32x4 shuffle; try it first. // A blend is more efficient than a general 32x4 shuffle; try it first.
if (TryMatchBlend(shuffle)) { if (TryMatchBlend(shuffle)) {
opcode = use_avx ? kAVXS16x8Blend : kSSES16x8Blend; opcode = kIA32S16x8Blend;
uint8_t blend_mask = PackBlend4(shuffle32x4); uint8_t blend_mask = PackBlend4(shuffle32x4);
imms[imm_count++] = blend_mask; imms[imm_count++] = blend_mask;
} else { } else {
opcode = kIA32S32x4Shuffle; opcode = kIA32S32x4Shuffle;
no_same_as_first = true; no_same_as_first = true;
no_use_register = true; src0_needs_reg = false;
imms[imm_count++] = shuffle_mask; imms[imm_count++] = shuffle_mask;
int8_t blend_mask = PackBlend4(shuffle32x4); int8_t blend_mask = PackBlend4(shuffle32x4);
imms[imm_count++] = blend_mask; imms[imm_count++] = blend_mask;
...@@ -2132,14 +2250,14 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) { ...@@ -2132,14 +2250,14 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) {
} else if (TryMatch16x8Shuffle(shuffle, shuffle16x8)) { } else if (TryMatch16x8Shuffle(shuffle, shuffle16x8)) {
uint8_t blend_mask; uint8_t blend_mask;
if (TryMatchBlend(shuffle)) { if (TryMatchBlend(shuffle)) {
opcode = use_avx ? kAVXS16x8Blend : kSSES16x8Blend; opcode = kIA32S16x8Blend;
blend_mask = PackBlend8(shuffle16x8); blend_mask = PackBlend8(shuffle16x8);
imms[imm_count++] = blend_mask; imms[imm_count++] = blend_mask;
} else if (Is16x8BlendedShuffle(shuffle16x8, &blend_mask)) { } else if (TryMatch16x8HalfShuffle(shuffle16x8, &blend_mask)) {
opcode = is_swizzle ? kIA32S16x8HalfShuffle1 : kIA32S16x8HalfShuffle2; opcode = is_swizzle ? kIA32S16x8HalfShuffle1 : kIA32S16x8HalfShuffle2;
// Half-shuffles don't need DefineSameAsFirst or UseRegister(src0). // Half-shuffles don't need DefineSameAsFirst or UseRegister(src0).
no_same_as_first = true; no_same_as_first = true;
no_use_register = true; src0_needs_reg = false;
uint8_t mask_lo = PackShuffle4(shuffle16x8); uint8_t mask_lo = PackShuffle4(shuffle16x8);
uint8_t mask_hi = PackShuffle4(shuffle16x8 + 4); uint8_t mask_hi = PackShuffle4(shuffle16x8 + 4);
imms[imm_count++] = mask_lo; imms[imm_count++] = mask_lo;
...@@ -2150,7 +2268,7 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) { ...@@ -2150,7 +2268,7 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) {
if (opcode == kIA32S8x16Shuffle) { if (opcode == kIA32S8x16Shuffle) {
// Use same-as-first for general swizzle, but not shuffle. // Use same-as-first for general swizzle, but not shuffle.
no_same_as_first = !is_swizzle; no_same_as_first = !is_swizzle;
no_use_register = no_same_as_first; src0_needs_reg = !no_same_as_first;
imms[imm_count++] = Pack4Lanes(shuffle); imms[imm_count++] = Pack4Lanes(shuffle);
imms[imm_count++] = Pack4Lanes(shuffle + 4); imms[imm_count++] = Pack4Lanes(shuffle + 4);
imms[imm_count++] = Pack4Lanes(shuffle + 8); imms[imm_count++] = Pack4Lanes(shuffle + 8);
...@@ -2164,13 +2282,15 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) { ...@@ -2164,13 +2282,15 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) {
InstructionOperand dst = InstructionOperand dst =
no_same_as_first ? g.DefineAsRegister(node) : g.DefineSameAsFirst(node); no_same_as_first ? g.DefineAsRegister(node) : g.DefineSameAsFirst(node);
InstructionOperand src0 = InstructionOperand src0 =
no_use_register ? g.Use(input0) : g.UseRegister(input0); src0_needs_reg ? g.UseRegister(input0) : g.Use(input0);
int input_count = 0; int input_count = 0;
InstructionOperand inputs[2 + kMaxImms + kMaxTemps]; InstructionOperand inputs[2 + kMaxImms + kMaxTemps];
inputs[input_count++] = src0; inputs[input_count++] = src0;
if (!is_swizzle) { if (!is_swizzle) {
inputs[input_count++] = g.Use(node->InputAt(1)); Node* input1 = node->InputAt(1);
inputs[input_count++] =
src1_needs_reg ? g.UseRegister(input1) : g.Use(input1);
} }
for (int i = 0; i < imm_count; ++i) { for (int i = 0; i < imm_count; ++i) {
inputs[input_count++] = g.UseImmediate(imms[i]); inputs[input_count++] = g.UseImmediate(imms[i]);
......
...@@ -42,8 +42,14 @@ ...@@ -42,8 +42,14 @@
V(psubsw, 66, 0F, E9) \ V(psubsw, 66, 0F, E9) \
V(psubusb, 66, 0F, D8) \ V(psubusb, 66, 0F, D8) \
V(psubusw, 66, 0F, D9) \ V(psubusw, 66, 0F, D9) \
V(punpckhdq, 66, 0F, 6A) \ V(punpcklbw, 66, 0F, 60) \
V(punpcklwd, 66, 0F, 61) \
V(punpckldq, 66, 0F, 62) \ V(punpckldq, 66, 0F, 62) \
V(punpcklqdq, 66, 0F, 6C) \
V(punpckhbw, 66, 0F, 68) \
V(punpckhwd, 66, 0F, 69) \
V(punpckhdq, 66, 0F, 6A) \
V(punpckhqdq, 66, 0F, 6D) \
V(pxor, 66, 0F, EF) V(pxor, 66, 0F, EF)
#define SSSE3_INSTRUCTION_LIST(V) \ #define SSSE3_INSTRUCTION_LIST(V) \
......
...@@ -1874,6 +1874,7 @@ WASM_SIMD_COMPILED_AND_LOWERED_TEST(S16x8TransposeRight) { ...@@ -1874,6 +1874,7 @@ WASM_SIMD_COMPILED_AND_LOWERED_TEST(S16x8TransposeRight) {
{{2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31}}); {{2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31}});
} }
// TODO(simd) 'Reverse' tests should be 2-operand shuffles, not swizzles.
WASM_SIMD_COMPILED_AND_LOWERED_TEST(S16x4Reverse) { WASM_SIMD_COMPILED_AND_LOWERED_TEST(S16x4Reverse) {
RunShuffleOpTest<int8_t>( RunShuffleOpTest<int8_t>(
execution_mode, lower_simd, kExprS8x16Shuffle, execution_mode, lower_simd, kExprS8x16Shuffle,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment