Commit 1ae84e18 authored by Bill Budge, committed by Commit Bot

[wasm] Handle special case 32x4 and 16x8 shuffles on ia32

- Adds v/pshufhw, v/palignr instructions to assembler and
  macro-assembler.
- Uses better instruction sequences for 32x4 shuffles using half-
  shuffles and blends.
- Uses better instruction sequences for 16x8 shuffles.
- Uses better instruction sequences for concatenating 8x16 shuffles.
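
The two-operand 32x4 case now amounts to a pair of half-shuffles plus a word
blend; roughly (a sketch of what the kIA32S32x4Shuffle case below emits, not
literal output):

    __ Pshufd(dst, src0, shuffle_mask);                // lanes taken from src0
    __ Pshufd(kScratchDoubleReg, src1, shuffle_mask);  // lanes taken from src1
    __ Pblendw(dst, kScratchDoubleReg, blend_mask);    // merge per 16-bit lane

The 16x8 case is analogous with pshuflw/pshufhw, and concatenating shuffles
map to a single (v)palignr.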

Change-Id: I3e6dca2937a2b167c754c47331c3a2f8ab9786db
Reviewed-on: https://chromium-review.googlesource.com/1066986
Commit-Queue: Bill Budge <bbudge@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Cr-Commit-Position: refs/heads/master@{#53460}
parent d920bf37
......@@ -3132,10 +3132,13 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kIA32S8x16Shuffle: {
XMMRegister dst = i.OutputSimd128Register();
Operand src0 = i.InputOperand(0);
Register tmp = i.TempRegister(0);
if (!src0.is_reg(dst)) {
__ movups(dst, src0);
}
// Prepare a 16-byte aligned buffer for the shuffle control mask.
__ mov(tmp, esp);
__ movups(dst, i.InputOperand(0));
__ and_(esp, -16);
if (instr->InputCount() == 5) { // only one input operand
for (int j = 4; j > 0; j--) {
......@@ -3172,7 +3175,73 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kIA32S32x4Swizzle: {
__ Pshufd(i.OutputSimd128Register(), i.InputOperand(0), i.InputInt8(1));
__ Pshufd(i.OutputSimd128Register(), i.InputOperand(0), i.InputInt8(2));
break;
}
case kIA32S32x4Shuffle: {
DCHECK_EQ(4, instr->InputCount()); // Swizzles should be handled above.
__ Pshufd(i.OutputSimd128Register(), i.InputOperand(0), i.InputInt8(2));
__ Pshufd(kScratchDoubleReg, i.InputOperand(1), i.InputInt8(2));
__ Pblendw(i.OutputSimd128Register(), kScratchDoubleReg, i.InputInt8(3));
break;
}
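// Illustration, not part of the patch: for the 32x4 shuffle [1 4 2 7] the
// selector passes shuffle_mask 0xE1 and blend_mask 0xCC, so the sequence
// above yields [src0[1], src1[0], src0[2], src1[3]].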
case kSSES16x8Blend: {
CpuFeatureScope sse_scope(tasm(), SSE4_1);
if (instr->InputCount() == 2) {
// swizzle
__ pblendw(i.OutputSimd128Register(), i.InputOperand(0),
i.InputInt8(1));
} else {
// shuffle
DCHECK_EQ(3, instr->InputCount());
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ pblendw(i.OutputSimd128Register(), i.InputOperand(1),
i.InputInt8(2));
}
break;
}
case kAVXS16x8Blend: {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vpblendw(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1), i.InputInt8(2));
break;
}
case kIA32S16x8ShuffleBlend: {
XMMRegister dst = i.OutputSimd128Register();
if (instr->InputCount() == 3) {
// swizzle
__ Pshuflw(dst, i.InputOperand(0), i.InputInt8(1));
__ Pshufhw(dst, dst, i.InputInt8(2));
} else {
// shuffle
DCHECK_EQ(5, instr->InputCount());
__ Pshuflw(dst, i.InputOperand(0), i.InputInt8(2));
__ Pshufhw(dst, dst, i.InputInt8(3));
__ Pshuflw(kScratchDoubleReg, i.InputOperand(1), i.InputInt8(2));
__ Pshufhw(kScratchDoubleReg, kScratchDoubleReg, i.InputInt8(3));
__ Pblendw(dst, kScratchDoubleReg, i.InputInt8(4));
}
break;
}
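// Illustration, not part of the patch: for the 16x8 shuffle
// [3 2 1 0 15 14 13 12] the selector passes pshuflw/pshufhw masks 0x1B and
// blend mask 0xF0, so dst ends up with src0 words 3..0 in its low half and
// src1 words 7..4 in its high half.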
case kSSES8x16Alignr: {
CpuFeatureScope sse_scope(tasm(), SSSE3);
if (instr->InputCount() == 2) {
// swizzle
__ palignr(i.OutputSimd128Register(), i.InputOperand(0),
i.InputInt8(1));
} else {
// shuffle
DCHECK_EQ(3, instr->InputCount());
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ palignr(i.OutputSimd128Register(), i.InputOperand(1),
i.InputInt8(2));
}
break;
}
case kAVXS8x16Alignr: {
CpuFeatureScope avx_scope(tasm(), AVX);
__ vpalignr(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1), i.InputInt8(2));
break;
}
case kIA32S1x4AnyTrue:
......
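A note on the instructions used above: palignr dst, src, imm (SSE operand
order) concatenates dst:src with dst as the upper half, shifts the 32-byte
value right by imm bytes, and keeps the low 16 bytes, i.e. result[i] =
src[i + imm] for i + imm < 16 and dst[i + imm - 16] otherwise, which is
exactly a 16-byte window into the concatenation of two vectors. pshuflw and
pshufhw reorder only the four low or high 16-bit words (two imm8 bits per
word) and copy the other half through, which is why a pshuflw + pshufhw pair
can realize any 16x8 shuffle whose lanes stay within their own half.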
......@@ -302,6 +302,12 @@ namespace compiler {
V(AVXS128Select) \
V(IA32S8x16Shuffle) \
V(IA32S32x4Swizzle) \
V(IA32S32x4Shuffle) \
V(SSES16x8Blend) \
V(AVXS16x8Blend) \
V(IA32S16x8ShuffleBlend) \
V(SSES8x16Alignr) \
V(AVXS8x16Alignr) \
V(IA32S1x4AnyTrue) \
V(IA32S1x4AllTrue) \
V(IA32S1x8AnyTrue) \
......
......@@ -284,6 +284,12 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kAVXS128Select:
case kIA32S8x16Shuffle:
case kIA32S32x4Swizzle:
case kIA32S32x4Shuffle:
case kSSES16x8Blend:
case kAVXS16x8Blend:
case kIA32S16x8ShuffleBlend:
case kSSES8x16Alignr:
case kAVXS8x16Alignr:
case kIA32S1x4AnyTrue:
case kIA32S1x4AllTrue:
case kIA32S1x8AnyTrue:
......
......@@ -2019,41 +2019,144 @@ void InstructionSelector::VisitInt64AbsWithOverflow(Node* node) {
UNREACHABLE();
}
namespace {
// Packs a 4 lane shuffle into a single imm8 suitable for use by pshufd,
// pshuflw, and pshufhw.
uint8_t PackShuffle4(uint8_t* shuffle) {
return (shuffle[0] & 3) | ((shuffle[1] & 3) << 2) | ((shuffle[2] & 3) << 4) |
((shuffle[3] & 3) << 6);
}
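// Example (illustrative): PackShuffle4({0, 2, 1, 3}) == 0xD8, the imm8 that
// makes pshufd produce [src[0], src[2], src[1], src[3]].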
// Gets an 8 bit lane mask suitable for 16x8 pblendw.
uint8_t PackBlend8(const uint8_t* shuffle16x8) {
int8_t result = 0;
for (int i = 0; i < 8; ++i) {
result |= (shuffle16x8[i] >= 8 ? 1 : 0) << i;
}
return result;
}
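// Example (illustrative): for shuffle16x8 = [0 9 2 11 4 13 6 15] (even words
// from the first source, odd words from the second), PackBlend8 returns 0xAA.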
// Gets an 8 bit lane mask suitable for 32x4 pblendw.
uint8_t PackBlend4(const uint8_t* shuffle32x4) {
int8_t result = 0;
for (int i = 0; i < 4; ++i) {
result |= (shuffle32x4[i] >= 4 ? 0x3 : 0) << (i * 2);
}
return result;
}
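// Example (illustrative): for shuffle32x4 = [0 5 2 7], PackBlend4 returns
// 0xCC, selecting 16-bit lanes 2-3 and 6-7 (dwords 1 and 3) from the second
// source.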
// Returns true if the shuffle can be separated into two half shuffles (i.e.
// lanes don't move from the low 4 lanes to the high 4 lanes or vice versa)
// and a blend.
// E.g. [3 2 1 0 15 14 13 12].
bool Is16x8BlendedShuffle(uint8_t* shuffle16x8, uint8_t* blend_mask) {
*blend_mask = 0;
for (int i = 0; i < 8; i++) {
*blend_mask |= (shuffle16x8[i] > 7 ? 1 : 0) << i;
if ((shuffle16x8[i] & 0x4) != (i & 0x4)) return false;
}
return true;
}
void SwapShuffleInputs(Node* node) {
Node* input0 = node->InputAt(0);
Node* input1 = node->InputAt(1);
node->ReplaceInput(0, input1);
node->ReplaceInput(1, input0);
}
} // namespace
void InstructionSelector::VisitS8x16Shuffle(Node* node) {
static const int kMaxSwizzleIndex = 15;
static const int kMaxShuffleIndex = 31;
const uint8_t* shuffle = OpParameter<uint8_t*>(node->op());
uint8_t mask = CanonicalizeShuffle(node);
uint8_t shuffle32x4[4];
bool is_swizzle = (mask == kMaxSwizzleIndex);
DCHECK_IMPLIES(!is_swizzle, mask == kMaxShuffleIndex);
USE(kMaxShuffleIndex);
int imm_count = 0;
static const int kMaxImms = 6;
uint32_t imms[kMaxImms];
int temp_count = 0;
static const int kMaxTemps = 2;
InstructionOperand temps[kMaxTemps];
IA32OperandGenerator g(this);
InstructionOperand output = g.DefineAsRegister(node);
InstructionOperand inputs[6];
InstructionOperand temps[1];
size_t input_count = 0;
Node* input0 = node->InputAt(0);
Node* input1 = node->InputAt(1);
if (mask == kMaxSwizzleIndex) {
if (TryMatch32x4Shuffle(shuffle, shuffle32x4)) {
Emit(kIA32S32x4Swizzle, output, g.Use(input0),
g.UseImmediate((shuffle32x4[0] & 3) | ((shuffle32x4[1] & 3) << 2) |
((shuffle32x4[2] & 3) << 4) |
((shuffle32x4[3] & 3) << 6)));
return;
bool use_avx = CpuFeatures::IsSupported(AVX);
ArchOpcode opcode = kIA32S8x16Shuffle; // general shuffle is the default
uint8_t offset;
uint8_t shuffle32x4[4];
uint8_t shuffle16x8[8];
if (TryMatchConcat(shuffle, mask, &offset)) {
// Swap inputs for (v)palignr.
// TODO(bbudge) Handle concatenations where the sources are reversed.
SwapShuffleInputs(node);
// palignr takes a single imm8 offset.
opcode = use_avx ? kAVXS8x16Alignr : kSSES8x16Alignr;
imms[imm_count++] = offset;
} else if (TryMatch32x4Shuffle(shuffle, shuffle32x4)) {
uint8_t shuffle_mask = PackShuffle4(shuffle32x4);
if (is_swizzle) {
// pshufd takes a single imm8 shuffle mask.
opcode = kIA32S32x4Swizzle;
imms[imm_count++] = shuffle_mask;
} else {
// 2 operand shuffle
// A blend is more efficient than a general 32x4 shuffle; try it first.
if (TryMatchBlend(shuffle)) {
opcode = use_avx ? kAVXS16x8Blend : kSSES16x8Blend;
uint8_t blend_mask = PackBlend4(shuffle32x4);
imms[imm_count++] = blend_mask;
} else {
opcode = kIA32S32x4Shuffle;
imms[imm_count++] = shuffle_mask;
int8_t blend_mask = PackBlend4(shuffle32x4);
imms[imm_count++] = blend_mask;
}
}
// TODO(ia32): handle non 32x4 swizzles here
inputs[input_count++] = g.Use(input0);
} else {
DCHECK_EQ(kMaxShuffleIndex, mask);
USE(kMaxShuffleIndex);
inputs[input_count++] = g.Use(input0);
inputs[input_count++] = g.Use(input1);
}
inputs[input_count++] = g.UseImmediate(Pack4Lanes(shuffle, mask));
inputs[input_count++] = g.UseImmediate(Pack4Lanes(shuffle + 4, mask));
inputs[input_count++] = g.UseImmediate(Pack4Lanes(shuffle + 8, mask));
inputs[input_count++] = g.UseImmediate(Pack4Lanes(shuffle + 12, mask));
temps[0] = g.TempRegister();
Emit(kIA32S8x16Shuffle, 1, &output, input_count, inputs, 1, temps);
} else if (TryMatch16x8Shuffle(shuffle, shuffle16x8)) {
uint8_t blend_mask;
if (TryMatchBlend(shuffle)) {
opcode = use_avx ? kAVXS16x8Blend : kSSES16x8Blend;
blend_mask = PackBlend8(shuffle16x8);
imms[imm_count++] = blend_mask;
} else if (Is16x8BlendedShuffle(shuffle16x8, &blend_mask)) {
opcode = kIA32S16x8ShuffleBlend;
uint8_t mask_lo = PackShuffle4(shuffle16x8);
uint8_t mask_hi = PackShuffle4(shuffle16x8 + 4);
imms[imm_count++] = mask_lo;
imms[imm_count++] = mask_hi;
// TODO(bbudge) eliminate the blend for swizzles.
imms[imm_count++] = blend_mask;
}
}
if (opcode == kIA32S8x16Shuffle) {
// General shuffle.
imms[imm_count++] = Pack4Lanes(shuffle, mask);
imms[imm_count++] = Pack4Lanes(shuffle + 4, mask);
imms[imm_count++] = Pack4Lanes(shuffle + 8, mask);
imms[imm_count++] = Pack4Lanes(shuffle + 12, mask);
temps[temp_count++] = g.TempRegister();
}
// Swizzles and AVX don't require input[0] == output.
InstructionOperand output = use_avx || is_swizzle ? g.DefineAsRegister(node)
: g.DefineSameAsFirst(node);
int input_count = 0;
InstructionOperand inputs[2 + kMaxImms + kMaxTemps];
InstructionOperand src0 = g.UseRegister(node->InputAt(0));
inputs[input_count++] = src0;
if (!is_swizzle || (use_avx && opcode != kIA32S8x16Shuffle)) {
inputs[input_count++] = g.Use(node->InputAt(1));
}
for (int i = 0; i < imm_count; ++i) {
inputs[input_count++] = g.UseImmediate(imms[i]);
}
Emit(opcode, 1, &output, input_count, inputs, temp_count, temps);
}
// static
......
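As a worked example of the dispatch above (editorial, not from the patch):
the concatenating shuffle [4 5 6 ... 19] hits TryMatchConcat with offset 4;
the inputs are swapped so that palignr's second operand supplies the low
bytes, and a single kSSES8x16Alignr (or kAVXS8x16Alignr) is emitted with
imm8 = 4, producing bytes 4..15 of the first wasm operand followed by bytes
0..3 of the second.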
......@@ -2872,9 +2872,22 @@ bool InstructionSelector::TryMatch32x4Shuffle(const uint8_t* shuffle,
return true;
}
// static
bool InstructionSelector::TryMatch16x8Shuffle(const uint8_t* shuffle,
uint8_t* shuffle16x8) {
for (int i = 0; i < 8; ++i) {
if (shuffle[i * 2] % 2 != 0) return false;
for (int j = 1; j < 2; ++j) {
if (shuffle[i * 2 + j] - shuffle[i * 2 + j - 1] != 1) return false;
}
shuffle16x8[i] = shuffle[i * 2] / 2;
}
return true;
}
// static
bool InstructionSelector::TryMatchConcat(const uint8_t* shuffle, uint8_t mask,
uint8_t* vext) {
uint8_t* offset) {
uint8_t start = shuffle[0];
int i = 1;
for (; i < 16 - start; ++i) {
......@@ -2884,12 +2897,18 @@ bool InstructionSelector::TryMatchConcat(const uint8_t* shuffle, uint8_t mask,
for (; i < 16; ++i, ++wrap) {
if ((shuffle[i] & mask) != (wrap & mask)) return false;
}
*vext = start;
*offset = start;
return true;
}
// static
bool InstructionSelector::TryMatchBlend(const uint8_t* shuffle) {
for (int i = 0; i < 16; ++i) {
if ((shuffle[i] & 0xF) != i) return false;
}
return true;
}
// Canonicalize shuffles to make pattern matching simpler. Returns a mask that
// will ignore the high bit of indices in some cases.
uint8_t InstructionSelector::CanonicalizeShuffle(Node* node) {
static const int kMaxLaneIndex = 15;
static const int kMaxShuffleIndex = 31;
......@@ -2906,7 +2925,7 @@ uint8_t InstructionSelector::CanonicalizeShuffle(Node* node) {
bool src0_is_used = false;
bool src1_is_used = false;
for (int i = 0; i < 16; ++i) {
if (shuffle[i] < 16) {
if (shuffle[i] <= kMaxLaneIndex) {
src0_is_used = true;
} else {
src1_is_used = true;
......
......@@ -621,21 +621,34 @@ class V8_EXPORT_PRIVATE InstructionSelector final {
return true;
}
// Tries to match 8x16 byte shuffle to an equivalent 32x4 word shuffle. If
// successful, it writes the 32x4 shuffle word indices.
// Tries to match an 8x16 byte shuffle to an equivalent 32x4 shuffle. If
// successful, it writes the 32x4 shuffle word indices. E.g.
// [0 1 2 3 8 9 10 11 4 5 6 7 12 13 14 15] == [0 2 1 3]
static bool TryMatch32x4Shuffle(const uint8_t* shuffle, uint8_t* shuffle32x4);
// Tries to match a byte shuffle to a concatenate operation. If successful,
// it writes the byte offset.
// Tries to match an 8x16 byte shuffle to an equivalent 16x8 shuffle. If
// successful, it writes the 16x8 shuffle word indices. E.g.
// [0 1 8 9 2 3 10 11 4 5 12 13 6 7 14 15] == [0 4 1 5 2 6 3 7]
static bool TryMatch16x8Shuffle(const uint8_t* shuffle, uint8_t* shuffle16x8);
// Tries to match a byte shuffle to a concatenate operation, formed by taking
// 16 bytes from the 32 byte concatenation of the inputs. If successful, it
// writes the byte offset. E.g. [4 5 6 7 .. 16 17 18 19] concatenates both
// source vectors with offset 4.
static bool TryMatchConcat(const uint8_t* shuffle, uint8_t mask,
uint8_t* offset);
// Tries to match a byte shuffle to a blend operation, which is a shuffle
// where no lanes change position. E.g. [0 17 2 19 .. 14 31] interleaves the
// even lanes of the first source with the odd lanes of the second.
static bool TryMatchBlend(const uint8_t* shuffle);
// Packs 4 bytes of shuffle into a 32 bit immediate, using a mask from
// CanonicalizeShuffle to convert unary shuffles.
static int32_t Pack4Lanes(const uint8_t* shuffle, uint8_t mask);
// Canonicalize shuffles to make pattern matching simpler. Returns a mask that
// will ignore the high bit of indices if shuffle is unary.
// will clear the high bit of indices if shuffle is unary (a swizzle).
uint8_t CanonicalizeShuffle(Node* node);
// ===========================================================================
......
......@@ -2706,6 +2706,15 @@ void Assembler::psrlq(XMMRegister dst, XMMRegister src) {
emit_sse_operand(dst, src);
}
void Assembler::pshufhw(XMMRegister dst, Operand src, uint8_t shuffle) {
EnsureSpace ensure_space(this);
EMIT(0xF3);
EMIT(0x0F);
EMIT(0x70);
emit_sse_operand(dst, src);
EMIT(shuffle);
}
void Assembler::pshuflw(XMMRegister dst, Operand src, uint8_t shuffle) {
EnsureSpace ensure_space(this);
EMIT(0xF2);
......@@ -2735,6 +2744,17 @@ void Assembler::pblendw(XMMRegister dst, Operand src, uint8_t mask) {
EMIT(mask);
}
void Assembler::palignr(XMMRegister dst, Operand src, uint8_t mask) {
DCHECK(IsEnabled(SSSE3));
EnsureSpace ensure_space(this);
EMIT(0x66);
EMIT(0x0F);
EMIT(0x3A);
EMIT(0x0F);
emit_sse_operand(dst, src);
EMIT(mask);
}
void Assembler::pextrb(Operand dst, XMMRegister src, int8_t offset) {
DCHECK(IsEnabled(SSE4_1));
EnsureSpace ensure_space(this);
......@@ -2959,6 +2979,11 @@ void Assembler::vpsrad(XMMRegister dst, XMMRegister src, int8_t imm8) {
EMIT(imm8);
}
void Assembler::vpshufhw(XMMRegister dst, Operand src, uint8_t shuffle) {
vinstr(0x70, dst, xmm0, src, kF3, k0F, kWIG);
EMIT(shuffle);
}
void Assembler::vpshuflw(XMMRegister dst, Operand src, uint8_t shuffle) {
vinstr(0x70, dst, xmm0, src, kF2, k0F, kWIG);
EMIT(shuffle);
......@@ -2975,6 +3000,12 @@ void Assembler::vpblendw(XMMRegister dst, XMMRegister src1, Operand src2,
EMIT(mask);
}
void Assembler::vpalignr(XMMRegister dst, XMMRegister src1, Operand src2,
uint8_t mask) {
vinstr(0x0F, dst, src1, src2, k66, k0F3A, kWIG);
EMIT(mask);
}
void Assembler::vpextrb(Operand dst, XMMRegister src, int8_t offset) {
vinstr(0x14, src, xmm0, dst, k66, k0F3A, kWIG);
EMIT(offset);
......
......@@ -1125,6 +1125,10 @@ class Assembler : public AssemblerBase {
void psrlq(XMMRegister reg, int8_t shift);
void psrlq(XMMRegister dst, XMMRegister src);
void pshufhw(XMMRegister dst, XMMRegister src, uint8_t shuffle) {
pshufhw(dst, Operand(src), shuffle);
}
void pshufhw(XMMRegister dst, Operand src, uint8_t shuffle);
void pshuflw(XMMRegister dst, XMMRegister src, uint8_t shuffle) {
pshuflw(dst, Operand(src), shuffle);
}
......@@ -1139,6 +1143,11 @@ class Assembler : public AssemblerBase {
}
void pblendw(XMMRegister dst, Operand src, uint8_t mask);
void palignr(XMMRegister dst, XMMRegister src, uint8_t mask) {
palignr(dst, Operand(src), mask);
}
void palignr(XMMRegister dst, Operand src, uint8_t mask);
void pextrb(Register dst, XMMRegister src, int8_t offset) {
pextrb(Operand(dst), src, offset);
}
......@@ -1442,6 +1451,10 @@ class Assembler : public AssemblerBase {
void vpsraw(XMMRegister dst, XMMRegister src, int8_t imm8);
void vpsrad(XMMRegister dst, XMMRegister src, int8_t imm8);
void vpshufhw(XMMRegister dst, XMMRegister src, uint8_t shuffle) {
vpshufhw(dst, Operand(src), shuffle);
}
void vpshufhw(XMMRegister dst, Operand src, uint8_t shuffle);
void vpshuflw(XMMRegister dst, XMMRegister src, uint8_t shuffle) {
vpshuflw(dst, Operand(src), shuffle);
}
......@@ -1457,6 +1470,12 @@ class Assembler : public AssemblerBase {
}
void vpblendw(XMMRegister dst, XMMRegister src1, Operand src2, uint8_t mask);
void vpalignr(XMMRegister dst, XMMRegister src1, XMMRegister src2,
uint8_t mask) {
vpalignr(dst, src1, Operand(src2), mask);
}
void vpalignr(XMMRegister dst, XMMRegister src1, Operand src2, uint8_t mask);
void vpextrb(Register dst, XMMRegister src, int8_t offset) {
vpextrb(Operand(dst), src, offset);
}
......
......@@ -830,6 +830,13 @@ int DisassemblerIA32::AVXInstruction(byte* data) {
AppendToBuffer(",%d", *reinterpret_cast<uint8_t*>(current));
current++;
break;
case 0x0F:
AppendToBuffer("vpalignr %s,%s,", NameOfXMMRegister(regop),
NameOfXMMRegister(vvvv));
current += PrintRightXMMOperand(current);
AppendToBuffer(",%d", *reinterpret_cast<uint8_t*>(current));
current++;
break;
case 0x14:
AppendToBuffer("vpextrb ");
current += PrintRightOperand(current);
......@@ -975,6 +982,12 @@ int DisassemblerIA32::AVXInstruction(byte* data) {
AppendToBuffer("vmovdqu %s,", NameOfXMMRegister(regop));
current += PrintRightOperand(current);
break;
case 0x70:
AppendToBuffer("vpshufhw %s,", NameOfXMMRegister(regop));
current += PrintRightXMMOperand(current);
AppendToBuffer(",%d", *reinterpret_cast<int8_t*>(current));
current++;
break;
case 0x7f:
AppendToBuffer("vmovdqu ");
current += PrintRightOperand(current);
......@@ -1979,6 +1992,14 @@ int DisassemblerIA32::InstructionDecode(v8::internal::Vector<char> out_buffer,
data += PrintRightXMMOperand(data);
AppendToBuffer(",%d", *reinterpret_cast<uint8_t*>(data));
data++;
} else if (*data == 0x0F) {
data++;
int mod, regop, rm;
get_modrm(*data, &mod, &regop, &rm);
AppendToBuffer("palignr %s,", NameOfXMMRegister(regop));
data += PrintRightXMMOperand(data);
AppendToBuffer(",%d", *reinterpret_cast<uint8_t*>(data));
data++;
} else if (*data == 0x14) {
data++;
int mod, regop, rm;
......@@ -2397,6 +2418,14 @@ int DisassemblerIA32::InstructionDecode(v8::internal::Vector<char> out_buffer,
get_modrm(*data, &mod, &regop, &rm);
AppendToBuffer("movdqu %s,", NameOfXMMRegister(regop));
data += PrintRightXMMOperand(data);
} else if (b2 == 0x70) {
data += 3;
int mod, regop, rm;
get_modrm(*data, &mod, &regop, &rm);
AppendToBuffer("pshufhw %s,", NameOfXMMRegister(regop));
data += PrintRightXMMOperand(data);
AppendToBuffer(",%d", *reinterpret_cast<int8_t*>(data));
data++;
} else if (b2 == 0x7F) {
AppendToBuffer("movdqu ");
data += 3;
......
......@@ -1269,6 +1269,15 @@ void TurboAssembler::Move(XMMRegister dst, uint64_t src) {
}
}
void TurboAssembler::Pshufhw(XMMRegister dst, Operand src, uint8_t shuffle) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpshufhw(dst, src, shuffle);
} else {
pshufhw(dst, src, shuffle);
}
}
void TurboAssembler::Pshuflw(XMMRegister dst, Operand src, uint8_t shuffle) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
......@@ -1357,6 +1366,20 @@ void TurboAssembler::Pshufb(XMMRegister dst, Operand src) {
UNREACHABLE();
}
void TurboAssembler::Pblendw(XMMRegister dst, Operand src, uint8_t imm8) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
vpblendw(dst, dst, src, imm8);
return;
}
if (CpuFeatures::IsSupported(SSE4_1)) {
CpuFeatureScope sse_scope(this, SSE4_1);
pblendw(dst, src, imm8);
return;
}
UNREACHABLE();
}
void TurboAssembler::Pextrb(Register dst, XMMRegister src, int8_t imm8) {
if (CpuFeatures::IsSupported(AVX)) {
CpuFeatureScope scope(this, AVX);
......
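The new Pshufhw and Pblendw wrappers follow the existing pattern of
preferring the AVX form when available; a usage sketch (hypothetical caller,
not from the patch):

    __ Pshufhw(xmm0, xmm1, 0x1B);  // reverse the four high words of xmm1
    __ Pblendw(xmm0, xmm2, 0x0F);  // then take the four low words from xmm2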
......@@ -205,6 +205,10 @@ class TurboAssembler : public Assembler {
// may be bigger than 2^16 - 1. Requires a scratch register.
void Ret(int bytes_dropped, Register scratch);
void Pshufhw(XMMRegister dst, XMMRegister src, uint8_t shuffle) {
Pshufhw(dst, Operand(src), shuffle);
}
void Pshufhw(XMMRegister dst, Operand src, uint8_t shuffle);
void Pshuflw(XMMRegister dst, XMMRegister src, uint8_t shuffle) {
Pshuflw(dst, Operand(src), shuffle);
}
......@@ -275,6 +279,10 @@ class TurboAssembler : public Assembler {
void Pshufb(XMMRegister dst, XMMRegister src) { Pshufb(dst, Operand(src)); }
void Pshufb(XMMRegister dst, Operand src);
void Pblendw(XMMRegister dst, XMMRegister src, uint8_t imm8) {
Pblendw(dst, Operand(src), imm8);
}
void Pblendw(XMMRegister dst, Operand src, uint8_t imm8);
void Psignb(XMMRegister dst, XMMRegister src) { Psignb(dst, Operand(src)); }
void Psignb(XMMRegister dst, Operand src);
......
......@@ -496,6 +496,8 @@ TEST(DisasmIa320) {
__ psrlq(xmm0, 17);
__ psrlq(xmm0, xmm1);
__ pshufhw(xmm5, xmm1, 5);
__ pshufhw(xmm5, Operand(edx, 4), 5);
__ pshuflw(xmm5, xmm1, 5);
__ pshuflw(xmm5, Operand(edx, 4), 5);
__ pshufd(xmm5, xmm1, 5);
......@@ -547,6 +549,8 @@ TEST(DisasmIa320) {
if (CpuFeatures::IsSupported(SSSE3)) {
CpuFeatureScope scope(&assm, SSSE3);
SSSE3_INSTRUCTION_LIST(EMIT_SSE34_INSTR)
__ palignr(xmm5, xmm1, 5);
__ palignr(xmm5, Operand(edx, 4), 5);
}
}
......@@ -672,12 +676,16 @@ TEST(DisasmIa320) {
__ vpsraw(xmm0, xmm7, 21);
__ vpsrad(xmm0, xmm7, 21);
__ vpshufhw(xmm5, xmm1, 5);
__ vpshufhw(xmm5, Operand(edx, 4), 5);
__ vpshuflw(xmm5, xmm1, 5);
__ vpshuflw(xmm5, Operand(edx, 4), 5);
__ vpshufd(xmm5, xmm1, 5);
__ vpshufd(xmm5, Operand(edx, 4), 5);
__ vpblendw(xmm5, xmm1, xmm0, 5);
__ vpblendw(xmm5, xmm1, Operand(edx, 4), 5);
__ vpalignr(xmm5, xmm1, xmm0, 5);
__ vpalignr(xmm5, xmm1, Operand(edx, 4), 5);
__ vpextrb(eax, xmm0, 1);
__ vpextrb(Operand(edx, 4), xmm0, 1);
__ vpextrw(eax, xmm0, 1);
......
......@@ -1936,8 +1936,19 @@ WASM_SIMD_COMPILED_AND_LOWERED_TEST(S8x16Irregular) {
{{0, 0, 0, 0, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7}});
}
// Test shuffles that concatenate the two vectors.
// Test shuffles that blend the two vectors (elements remain in their lanes).
WASM_SIMD_COMPILED_AND_LOWERED_TEST(S8x16Blend) {
static const int kLanes = 16;
std::array<uint8_t, kLanes> expected;
for (int bias = 1; bias < kLanes; bias++) {
for (int i = 0; i < bias; i++) expected[i] = i;
for (int i = bias; i < kLanes; i++) expected[i] = i + kLanes;
RunBinaryLaneOpTest(execution_mode, lower_simd, kExprS8x16Shuffle,
expected);
}
}
// Test shuffles that concatenate the two vectors.
WASM_SIMD_COMPILED_AND_LOWERED_TEST(S8x16Concat) {
static const int kLanes = 16;
std::array<uint8_t, kLanes> expected;
......
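For instance (editorial walk-through), at bias = 4 the expected shuffle is
[0 1 2 3 20 21 ... 31]; on ia32 this now matches TryMatch32x4Shuffle
([0 5 6 7]) and TryMatchBlend, so it compiles to a single pblendw/vpblendw
with mask 0xFC rather than taking the general S8x16 shuffle path.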