Commit bcb4fbd4 authored by Bill Budge, committed by Commit Bot

[wasm simd] Handle more shuffles

- Shuffle canonicalization improved to reverse operands to match
  more architectural shuffles.
- Handles shuffles where the order of operands is reversed.
- Adds tests for non-canonical shuffles, and for swizzles.
- Improves TryMatchConcat method.
- Substantially rewrites shuffles on ia32 to better handle swizzles
  and fix bugs on reversed shuffles where source registers are
  overwritten.
- Adds Palignr macro-assembler instructions for ia32.

Bug: v8:6020
Change-Id: I8e43a1e7650057c66690af1504b67509a1437d75
Reviewed-on: https://chromium-review.googlesource.com/1070934
Commit-Queue: Bill Budge <bbudge@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Reviewed-by: Martyn Capewell <martyn.capewell@arm.com>
Reviewed-by: Jaroslav Sevcik <jarin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#53648}
parent 43886bc3
......@@ -2445,7 +2445,9 @@ static const ShuffleEntry arch_shuffles[] = {
{{1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}, kArmS8x2Reverse}};
bool TryMatchArchShuffle(const uint8_t* shuffle, const ShuffleEntry* table,
size_t num_entries, uint8_t mask, ArchOpcode* opcode) {
size_t num_entries, bool is_swizzle,
ArchOpcode* opcode) {
uint8_t mask = is_swizzle ? kSimd128Size - 1 : 2 * kSimd128Size - 1;
for (size_t i = 0; i < num_entries; ++i) {
const ShuffleEntry& entry = table[i];
int j = 0;
......@@ -2477,48 +2479,48 @@ void ArrangeShuffleTable(ArmOperandGenerator* g, Node* input0, Node* input1,
} // namespace
void InstructionSelector::VisitS8x16Shuffle(Node* node) {
const uint8_t* shuffle = OpParameter<uint8_t*>(node->op());
uint8_t mask = CanonicalizeShuffle(node);
uint8_t shuffle[kSimd128Size];
bool is_swizzle;
CanonicalizeShuffle(node, shuffle, &is_swizzle);
Node* input0 = node->InputAt(0);
Node* input1 = node->InputAt(1);
uint8_t shuffle32x4[4];
ArmOperandGenerator g(this);
int index = 0;
if (TryMatch32x4Shuffle(shuffle, shuffle32x4)) {
if (TryMatchDup<4>(shuffle, &index)) {
InstructionOperand src = index < 4 ? g.UseRegister(node->InputAt(0))
: g.UseRegister(node->InputAt(1));
InstructionOperand src =
index < 4 ? g.UseRegister(input0) : g.UseRegister(input1);
Emit(kArmS128Dup, g.DefineAsRegister(node), src, g.UseImmediate(Neon32),
g.UseImmediate(index % 4));
} else {
Emit(kArmS32x4Shuffle, g.DefineAsRegister(node),
g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)),
g.UseImmediate(Pack4Lanes(shuffle32x4, mask)));
Emit(kArmS32x4Shuffle, g.DefineAsRegister(node), g.UseRegister(input0),
g.UseRegister(input1), g.UseImmediate(Pack4Lanes(shuffle32x4)));
}
return;
}
if (TryMatchDup<8>(shuffle, &index)) {
InstructionOperand src = index < 8 ? g.UseRegister(node->InputAt(0))
: g.UseRegister(node->InputAt(1));
InstructionOperand src =
index < 8 ? g.UseRegister(input0) : g.UseRegister(input1);
Emit(kArmS128Dup, g.DefineAsRegister(node), src, g.UseImmediate(Neon16),
g.UseImmediate(index % 8));
return;
}
if (TryMatchDup<16>(shuffle, &index)) {
InstructionOperand src = index < 16 ? g.UseRegister(node->InputAt(0))
: g.UseRegister(node->InputAt(1));
InstructionOperand src =
index < 16 ? g.UseRegister(input0) : g.UseRegister(input1);
Emit(kArmS128Dup, g.DefineAsRegister(node), src, g.UseImmediate(Neon8),
g.UseImmediate(index % 16));
return;
}
ArchOpcode opcode;
if (TryMatchArchShuffle(shuffle, arch_shuffles, arraysize(arch_shuffles),
mask, &opcode)) {
is_swizzle, &opcode)) {
VisitRRRShuffle(this, opcode, node);
return;
}
Node* input0 = node->InputAt(0);
Node* input1 = node->InputAt(1);
uint8_t offset;
if (TryMatchConcat(shuffle, mask, &offset)) {
if (TryMatchConcat(shuffle, &offset)) {
Emit(kArmS8x16Concat, g.DefineAsRegister(node), g.UseRegister(input0),
g.UseRegister(input1), g.UseImmediate(offset));
return;
......@@ -2527,10 +2529,10 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) {
InstructionOperand src0, src1;
ArrangeShuffleTable(&g, input0, input1, &src0, &src1);
Emit(kArmS8x16Shuffle, g.DefineAsRegister(node), src0, src1,
g.UseImmediate(Pack4Lanes(shuffle, mask)),
g.UseImmediate(Pack4Lanes(shuffle + 4, mask)),
g.UseImmediate(Pack4Lanes(shuffle + 8, mask)),
g.UseImmediate(Pack4Lanes(shuffle + 12, mask)));
g.UseImmediate(Pack4Lanes(shuffle)),
g.UseImmediate(Pack4Lanes(shuffle + 4)),
g.UseImmediate(Pack4Lanes(shuffle + 8)),
g.UseImmediate(Pack4Lanes(shuffle + 12)));
}
void InstructionSelector::VisitSignExtendWord8ToInt32(Node* node) {
......
......@@ -3088,7 +3088,9 @@ static const ShuffleEntry arch_shuffles[] = {
kArm64S8x2Reverse}};
bool TryMatchArchShuffle(const uint8_t* shuffle, const ShuffleEntry* table,
size_t num_entries, uint8_t mask, ArchOpcode* opcode) {
size_t num_entries, bool is_swizzle,
ArchOpcode* opcode) {
uint8_t mask = is_swizzle ? kSimd128Size - 1 : 2 * kSimd128Size - 1;
for (size_t i = 0; i < num_entries; i++) {
const ShuffleEntry& entry = table[i];
int j = 0;
......@@ -3120,48 +3122,48 @@ void ArrangeShuffleTable(Arm64OperandGenerator* g, Node* input0, Node* input1,
} // namespace
void InstructionSelector::VisitS8x16Shuffle(Node* node) {
const uint8_t* shuffle = OpParameter<uint8_t*>(node->op());
uint8_t mask = CanonicalizeShuffle(node);
uint8_t shuffle[kSimd128Size];
bool is_swizzle;
CanonicalizeShuffle(node, shuffle, &is_swizzle);
uint8_t shuffle32x4[4];
Arm64OperandGenerator g(this);
ArchOpcode opcode;
if (TryMatchArchShuffle(shuffle, arch_shuffles, arraysize(arch_shuffles),
mask, &opcode)) {
is_swizzle, &opcode)) {
VisitRRR(this, opcode, node);
return;
}
Node* input0 = node->InputAt(0);
Node* input1 = node->InputAt(1);
uint8_t bias;
if (TryMatchConcat(shuffle, mask, &bias)) {
uint8_t offset;
if (TryMatchConcat(shuffle, &offset)) {
Emit(kArm64S8x16Concat, g.DefineAsRegister(node), g.UseRegister(input0),
g.UseRegister(input1), g.UseImmediate(bias));
g.UseRegister(input1), g.UseImmediate(offset));
return;
}
int index = 0;
if (TryMatch32x4Shuffle(shuffle, shuffle32x4)) {
if (TryMatchDup<4>(shuffle, &index)) {
InstructionOperand src = index < 4 ? g.UseRegister(node->InputAt(0))
: g.UseRegister(node->InputAt(1));
InstructionOperand src =
index < 4 ? g.UseRegister(input0) : g.UseRegister(input1);
Emit(kArm64S128Dup, g.DefineAsRegister(node), src, g.UseImmediate(4),
g.UseImmediate(index % 4));
} else {
Emit(kArm64S32x4Shuffle, g.DefineAsRegister(node),
g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)),
g.UseImmediate(Pack4Lanes(shuffle32x4, mask)));
Emit(kArm64S32x4Shuffle, g.DefineAsRegister(node), g.UseRegister(input0),
g.UseRegister(input1), g.UseImmediate(Pack4Lanes(shuffle32x4)));
}
return;
}
if (TryMatchDup<8>(shuffle, &index)) {
InstructionOperand src = index < 8 ? g.UseRegister(node->InputAt(0))
: g.UseRegister(node->InputAt(1));
InstructionOperand src =
index < 8 ? g.UseRegister(input0) : g.UseRegister(input1);
Emit(kArm64S128Dup, g.DefineAsRegister(node), src, g.UseImmediate(8),
g.UseImmediate(index % 8));
return;
}
if (TryMatchDup<16>(shuffle, &index)) {
InstructionOperand src = index < 16 ? g.UseRegister(node->InputAt(0))
: g.UseRegister(node->InputAt(1));
InstructionOperand src =
index < 16 ? g.UseRegister(input0) : g.UseRegister(input1);
Emit(kArm64S128Dup, g.DefineAsRegister(node), src, g.UseImmediate(16),
g.UseImmediate(index % 16));
return;
......@@ -3170,10 +3172,10 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) {
InstructionOperand src0, src1;
ArrangeShuffleTable(&g, input0, input1, &src0, &src1);
Emit(kArm64S8x16Shuffle, g.DefineAsRegister(node), src0, src1,
g.UseImmediate(Pack4Lanes(shuffle, mask)),
g.UseImmediate(Pack4Lanes(shuffle + 4, mask)),
g.UseImmediate(Pack4Lanes(shuffle + 8, mask)),
g.UseImmediate(Pack4Lanes(shuffle + 12, mask)));
g.UseImmediate(Pack4Lanes(shuffle)),
g.UseImmediate(Pack4Lanes(shuffle + 4)),
g.UseImmediate(Pack4Lanes(shuffle + 8)),
g.UseImmediate(Pack4Lanes(shuffle + 12)));
}
void InstructionSelector::VisitSignExtendWord8ToInt32(Node* node) {
......
......@@ -3139,13 +3139,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
XMMRegister dst = i.OutputSimd128Register();
Operand src0 = i.InputOperand(0);
Register tmp = i.TempRegister(0);
if (!src0.is_reg(dst)) {
__ movups(dst, src0);
}
// Prepare 16-byte boundary buffer for shuffle control mask
// Prepare 16 byte aligned buffer for shuffle control mask
__ mov(tmp, esp);
__ and_(esp, -16);
if (instr->InputCount() == 5) { // only one input operand
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
for (int j = 4; j > 0; j--) {
uint32_t mask = i.InputUint32(j);
__ push(Immediate(mask));
......@@ -3153,6 +3151,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ Pshufb(dst, Operand(esp, 0));
} else { // two input operands
DCHECK_EQ(6, instr->InputCount());
__ movups(kScratchDoubleReg, src0);
for (int j = 5; j > 1; j--) {
uint32_t lanes = i.InputUint32(j);
uint32_t mask = 0;
......@@ -3162,8 +3161,9 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
__ push(Immediate(mask));
}
__ Pshufb(dst, Operand(esp, 0));
__ movups(kScratchDoubleReg, i.InputOperand(1));
__ Pshufb(kScratchDoubleReg, Operand(esp, 0));
Operand src1 = i.InputOperand(1);
if (!src1.is_reg(dst)) __ movups(dst, src1);
for (int j = 5; j > 1; j--) {
uint32_t lanes = i.InputUint32(j);
uint32_t mask = 0;
......@@ -3173,74 +3173,55 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
__ push(Immediate(mask));
}
__ Pshufb(kScratchDoubleReg, Operand(esp, 0));
__ Pshufb(dst, Operand(esp, 0));
__ por(dst, kScratchDoubleReg);
}
__ mov(esp, tmp);
break;
}
case kIA32S32x4Swizzle: {
__ Pshufd(i.OutputSimd128Register(), i.InputOperand(0), i.InputInt8(2));
DCHECK_EQ(2, instr->InputCount());
__ Pshufd(i.OutputSimd128Register(), i.InputOperand(0), i.InputInt8(1));
break;
}
case kIA32S32x4Shuffle: {
DCHECK_EQ(4, instr->InputCount()); // Swizzles should be handled above.
__ Pshufd(i.OutputSimd128Register(), i.InputOperand(0), i.InputInt8(2));
__ Pshufd(kScratchDoubleReg, i.InputOperand(1), i.InputInt8(2));
__ Pshufd(i.OutputSimd128Register(), i.InputOperand(0), i.InputInt8(2));
__ Pblendw(i.OutputSimd128Register(), kScratchDoubleReg, i.InputInt8(3));
break;
}
case kSSES16x8Blend: {
CpuFeatureScope sse_scope(tasm(), SSSE3);
if (instr->InputCount() == 2) {
// swizzle
__ pblendw(i.OutputSimd128Register(), i.InputOperand(0),
i.InputInt8(1));
} else {
// shuffle
DCHECK_EQ(3, instr->InputCount());
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ pblendw(i.OutputSimd128Register(), i.InputOperand(1),
i.InputInt8(2));
}
CpuFeatureScope sse_scope(tasm(), SSE4_1);
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ pblendw(i.OutputSimd128Register(), i.InputOperand(1), i.InputInt8(2));
break;
}
case kAVXS16x8Blend: {
CpuFeatureScope avx_scope(tasm(), AVX);
CpuFeatureScope sse_scope(tasm(), AVX);
__ vpblendw(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputOperand(1), i.InputInt8(2));
break;
}
case kIA32S16x8ShuffleBlend: {
case kIA32S16x8HalfShuffle1: {
XMMRegister dst = i.OutputSimd128Register();
if (instr->InputCount() == 3) {
// swizzle
__ Pshuflw(dst, i.InputOperand(0), i.InputInt8(1));
__ Pshufhw(dst, dst, i.InputInt8(2));
} else {
// shuffle
DCHECK_EQ(5, instr->InputCount());
__ Pshuflw(dst, i.InputOperand(0), i.InputInt8(2));
__ Pshufhw(dst, dst, i.InputInt8(3));
__ Pshuflw(kScratchDoubleReg, i.InputOperand(1), i.InputInt8(2));
__ Pshufhw(kScratchDoubleReg, kScratchDoubleReg, i.InputInt8(3));
__ Pblendw(dst, kScratchDoubleReg, i.InputInt8(4));
}
__ Pshuflw(dst, i.InputOperand(0), i.InputInt8(1));
__ Pshufhw(dst, dst, i.InputInt8(2));
break;
}
case kIA32S16x8HalfShuffle2: {
XMMRegister dst = i.OutputSimd128Register();
__ Pshuflw(kScratchDoubleReg, i.InputOperand(1), i.InputInt8(2));
__ Pshufhw(kScratchDoubleReg, kScratchDoubleReg, i.InputInt8(3));
__ Pshuflw(dst, i.InputOperand(0), i.InputInt8(2));
__ Pshufhw(dst, dst, i.InputInt8(3));
__ Pblendw(dst, kScratchDoubleReg, i.InputInt8(4));
break;
}
case kSSES8x16Alignr: {
CpuFeatureScope sse_scope(tasm(), SSSE3);
if (instr->InputCount() == 2) {
// swizzle
__ palignr(i.OutputSimd128Register(), i.InputOperand(0),
i.InputInt8(1));
} else {
// shuffle
DCHECK_EQ(3, instr->InputCount());
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ palignr(i.OutputSimd128Register(), i.InputOperand(1),
i.InputInt8(2));
}
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ palignr(i.OutputSimd128Register(), i.InputOperand(1), i.InputInt8(2));
break;
}
case kAVXS8x16Alignr: {
......
......@@ -305,7 +305,8 @@ namespace compiler {
V(IA32S32x4Shuffle) \
V(SSES16x8Blend) \
V(AVXS16x8Blend) \
V(IA32S16x8ShuffleBlend) \
V(IA32S16x8HalfShuffle1) \
V(IA32S16x8HalfShuffle2) \
V(SSES8x16Alignr) \
V(AVXS8x16Alignr) \
V(IA32S1x4AnyTrue) \
......
......@@ -287,7 +287,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kIA32S32x4Shuffle:
case kSSES16x8Blend:
case kAVXS16x8Blend:
case kIA32S16x8ShuffleBlend:
case kIA32S16x8HalfShuffle1:
case kIA32S16x8HalfShuffle2:
case kSSES8x16Alignr:
case kAVXS8x16Alignr:
case kIA32S1x4AnyTrue:
......
......@@ -2058,23 +2058,13 @@ bool Is16x8BlendedShuffle(uint8_t* shuffle16x8, uint8_t* blend_mask) {
return true;
}
// Swaps the first two input operands of a shuffle node in place.
// Used so that shuffles whose operands arrive in reversed order can be
// matched against the same architectural patterns as the normal order
// (e.g. before emitting (v)palignr on ia32 — see the caller in
// VisitS8x16Shuffle).
void SwapShuffleInputs(Node* node) {
Node* input0 = node->InputAt(0);
Node* input1 = node->InputAt(1);
node->ReplaceInput(0, input1);
node->ReplaceInput(1, input0);
}
} // namespace
// TODO(bbudge) Make sure identity shuffle emits no instructions.
void InstructionSelector::VisitS8x16Shuffle(Node* node) {
static const int kMaxSwizzleIndex = 15;
static const int kMaxShuffleIndex = 31;
const uint8_t* shuffle = OpParameter<uint8_t*>(node->op());
uint8_t mask = CanonicalizeShuffle(node);
bool is_swizzle = (mask == kMaxSwizzleIndex);
DCHECK_IMPLIES(!is_swizzle, mask == kMaxShuffleIndex);
USE(kMaxShuffleIndex);
uint8_t shuffle[kSimd128Size];
bool is_swizzle;
CanonicalizeShuffle(node, shuffle, &is_swizzle);
int imm_count = 0;
static const int kMaxImms = 6;
......@@ -2085,23 +2075,30 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) {
IA32OperandGenerator g(this);
bool use_avx = CpuFeatures::IsSupported(AVX);
// AVX and swizzles don't generally need DefineSameAsFirst to avoid a move.
bool no_same_as_first = use_avx || is_swizzle;
// We generally need UseRegister for the first source.
bool no_use_register = false;
ArchOpcode opcode = kIA32S8x16Shuffle; // general shuffle is the default
uint8_t offset;
uint8_t shuffle32x4[4];
uint8_t shuffle16x8[8];
if (TryMatchConcat(shuffle, mask, &offset)) {
// Swap inputs for (v)palignr.
// TODO(bbudge) Handle concatenations where the sources are reversed.
if (TryMatchConcat(shuffle, &offset)) {
// Swap inputs from the normal order for (v)palignr.
SwapShuffleInputs(node);
// palignr takes a single imm8 offset.
is_swizzle = false; // It's simpler to just handle the general case.
no_same_as_first = use_avx; // SSE requires same-as-first.
opcode = use_avx ? kAVXS8x16Alignr : kSSES8x16Alignr;
// palignr takes a single imm8 offset.
imms[imm_count++] = offset;
} else if (TryMatch32x4Shuffle(shuffle, shuffle32x4)) {
uint8_t shuffle_mask = PackShuffle4(shuffle32x4);
if (is_swizzle) {
// pshufd takes a single imm8 shuffle mask.
opcode = kIA32S32x4Swizzle;
no_same_as_first = true;
no_use_register = true;
imms[imm_count++] = shuffle_mask;
} else {
// 2 operand shuffle
......@@ -2112,6 +2109,8 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) {
imms[imm_count++] = blend_mask;
} else {
opcode = kIA32S32x4Shuffle;
no_same_as_first = true;
no_use_register = true;
imms[imm_count++] = shuffle_mask;
int8_t blend_mask = PackBlend4(shuffle32x4);
imms[imm_count++] = blend_mask;
......@@ -2124,39 +2123,46 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) {
blend_mask = PackBlend8(shuffle16x8);
imms[imm_count++] = blend_mask;
} else if (Is16x8BlendedShuffle(shuffle16x8, &blend_mask)) {
opcode = kIA32S16x8ShuffleBlend;
opcode = is_swizzle ? kIA32S16x8HalfShuffle1 : kIA32S16x8HalfShuffle2;
// Half-shuffles don't need DefineSameAsFirst or UseRegister(src0).
no_same_as_first = true;
no_use_register = true;
uint8_t mask_lo = PackShuffle4(shuffle16x8);
uint8_t mask_hi = PackShuffle4(shuffle16x8 + 4);
imms[imm_count++] = mask_lo;
imms[imm_count++] = mask_hi;
// TODO(bbudge) eliminate the blend for swizzles.
imms[imm_count++] = blend_mask;
if (!is_swizzle) imms[imm_count++] = blend_mask;
}
}
if (opcode == kIA32S8x16Shuffle) {
// General shuffle.
imms[imm_count++] = Pack4Lanes(shuffle, mask);
imms[imm_count++] = Pack4Lanes(shuffle + 4, mask);
imms[imm_count++] = Pack4Lanes(shuffle + 8, mask);
imms[imm_count++] = Pack4Lanes(shuffle + 12, mask);
// Use same-as-first for general swizzle, but not shuffle.
no_same_as_first = !is_swizzle;
no_use_register = no_same_as_first;
imms[imm_count++] = Pack4Lanes(shuffle);
imms[imm_count++] = Pack4Lanes(shuffle + 4);
imms[imm_count++] = Pack4Lanes(shuffle + 8);
imms[imm_count++] = Pack4Lanes(shuffle + 12);
temps[temp_count++] = g.TempRegister();
}
// Swizzles and AVX don't require input[0] == output.
InstructionOperand output = use_avx || is_swizzle ? g.DefineAsRegister(node)
: g.DefineSameAsFirst(node);
// Use DefineAsRegister(node) and Use(src0) if we can without forcing an extra
// move instruction in the CodeGenerator.
Node* input0 = node->InputAt(0);
InstructionOperand dst =
no_same_as_first ? g.DefineAsRegister(node) : g.DefineSameAsFirst(node);
InstructionOperand src0 =
no_use_register ? g.Use(input0) : g.UseRegister(input0);
int input_count = 0;
InstructionOperand inputs[2 + kMaxImms + kMaxTemps];
InstructionOperand src0 = g.UseRegister(node->InputAt(0));
inputs[input_count++] = src0;
if (!is_swizzle || (use_avx && opcode != kIA32S8x16Shuffle)) {
if (!is_swizzle) {
inputs[input_count++] = g.Use(node->InputAt(1));
}
for (int i = 0; i < imm_count; ++i) {
inputs[input_count++] = g.UseImmediate(imms[i]);
}
Emit(opcode, 1, &output, input_count, inputs, temp_count, temps);
Emit(opcode, 1, &dst, input_count, inputs, temp_count, temps);
}
// static
......
......@@ -2884,16 +2884,19 @@ bool InstructionSelector::TryMatch16x8Shuffle(const uint8_t* shuffle,
}
// static
bool InstructionSelector::TryMatchConcat(const uint8_t* shuffle, uint8_t mask,
bool InstructionSelector::TryMatchConcat(const uint8_t* shuffle,
uint8_t* offset) {
// Don't match the identity shuffle (e.g. [0 1 2 ... 15]).
uint8_t start = shuffle[0];
int i = 1;
for (; i < 16 - start; ++i) {
if ((shuffle[i] & mask) != ((shuffle[i - 1] + 1) & mask)) return false;
}
uint8_t wrap = 16;
for (; i < 16; ++i, ++wrap) {
if ((shuffle[i] & mask) != (wrap & mask)) return false;
if (start == 0) return false;
DCHECK_GT(kSimd128Size, start); // The shuffle should be canonicalized.
// A concatenation is a series of consecutive indices, with at most one jump
// in the middle from the last lane to the first.
for (int i = 1; i < kSimd128Size; ++i) {
if ((shuffle[i]) != ((shuffle[i - 1] + 1))) {
if (shuffle[i - 1] != 15) return false;
if (shuffle[i] % kSimd128Size != 0) return false;
}
}
*offset = start;
return true;
......@@ -2907,23 +2910,21 @@ bool InstructionSelector::TryMatchBlend(const uint8_t* shuffle) {
return true;
}
uint8_t InstructionSelector::CanonicalizeShuffle(Node* node) {
static const int kMaxLaneIndex = 15;
static const int kMaxShuffleIndex = 31;
void InstructionSelector::CanonicalizeShuffle(Node* node, uint8_t* shuffle,
bool* is_swizzle) {
// Get raw shuffle indices.
memcpy(shuffle, OpParameter<uint8_t*>(node->op()), kSimd128Size);
const uint8_t* shuffle = OpParameter<uint8_t*>(node->op());
uint8_t mask = kMaxShuffleIndex;
// If shuffle is unary, set 'mask' to ignore the high bit of the indices.
// Replace any unused source with the other.
// Detect shuffles that only operate on one input.
if (GetVirtualRegister(node->InputAt(0)) ==
GetVirtualRegister(node->InputAt(1))) {
// unary, src0 == src1.
mask = kMaxLaneIndex;
*is_swizzle = true;
} else {
// Inputs are distinct; check that both are required.
bool src0_is_used = false;
bool src1_is_used = false;
for (int i = 0; i < 16; ++i) {
if (shuffle[i] <= kMaxLaneIndex) {
for (int i = 0; i < kSimd128Size; ++i) {
if (shuffle[i] < kSimd128Size) {
src0_is_used = true;
} else {
src1_is_used = true;
......@@ -2931,25 +2932,47 @@ uint8_t InstructionSelector::CanonicalizeShuffle(Node* node) {
}
if (src0_is_used && !src1_is_used) {
node->ReplaceInput(1, node->InputAt(0));
mask = kMaxLaneIndex;
*is_swizzle = true;
} else if (src1_is_used && !src0_is_used) {
node->ReplaceInput(0, node->InputAt(1));
mask = kMaxLaneIndex;
*is_swizzle = true;
} else {
*is_swizzle = false;
// Canonicalize general 2 input shuffles so that the first input lanes are
// encountered first. This makes architectural shuffle pattern matching
// easier, since we only need to consider 1 input ordering instead of 2.
if (shuffle[0] >= kSimd128Size) {
// The second operand is used first. Swap inputs and adjust the shuffle.
SwapShuffleInputs(node);
for (int i = 0; i < kSimd128Size; ++i) {
shuffle[i] ^= kSimd128Size;
}
}
}
}
return mask;
if (*is_swizzle) {
for (int i = 0; i < kSimd128Size; ++i) shuffle[i] &= kSimd128Size - 1;
}
}
// static
int32_t InstructionSelector::Pack4Lanes(const uint8_t* shuffle, uint8_t mask) {
int32_t InstructionSelector::Pack4Lanes(const uint8_t* shuffle) {
int32_t result = 0;
for (int i = 3; i >= 0; --i) {
result <<= 8;
result |= shuffle[i] & mask;
result |= shuffle[i];
}
return result;
}
// static
// Swaps the two first input operands of the shuffle node. Mutates the node
// in place via ReplaceInput; callers that swap inputs must also adjust the
// shuffle indices accordingly (see CanonicalizeShuffle, which XORs each
// lane index with kSimd128Size after swapping).
void InstructionSelector::SwapShuffleInputs(Node* node) {
Node* input0 = node->InputAt(0);
Node* input1 = node->InputAt(1);
// ReplaceInput handles use-list bookkeeping, so the two calls together
// perform an atomic-from-the-graph's-view operand swap.
node->ReplaceInput(0, input1);
node->ReplaceInput(1, input0);
}
bool InstructionSelector::NeedsPoisoning(IsSafetyCheck safety_check) const {
switch (poisoning_level_) {
case PoisoningMitigationLevel::kDontPoison:
......
......@@ -633,22 +633,25 @@ class V8_EXPORT_PRIVATE InstructionSelector final {
// Tries to match a byte shuffle to a concatenate operation, formed by taking
// 16 bytes from the 32 byte concatenation of the inputs. If successful, it
// writes the byte offset. E.g. [4 5 6 7 .. 16 17 18 19] concatenates both
// source vectors with offset 4.
static bool TryMatchConcat(const uint8_t* shuffle, uint8_t mask,
uint8_t* offset);
// source vectors with offset 4. The shuffle should be canonicalized.
static bool TryMatchConcat(const uint8_t* shuffle, uint8_t* offset);
// Tries to match a byte shuffle to a blend operation, which is a shuffle
// where no lanes change position. E.g. [0 9 2 11 .. 14 31] interleaves the
// even lanes of the first source with the odd lanes of the second.
// even lanes of the first source with the odd lanes of the second. The
// shuffle should be canonicalized.
static bool TryMatchBlend(const uint8_t* shuffle);
// Packs 4 bytes of shuffle into a 32 bit immediate, using a mask from
// CanonicalizeShuffle to convert unary shuffles.
static int32_t Pack4Lanes(const uint8_t* shuffle, uint8_t mask);
// Packs 4 bytes of shuffle into a 32 bit immediate.
static int32_t Pack4Lanes(const uint8_t* shuffle);
// Canonicalize shuffles to make pattern matching simpler. Returns a mask that
// will clear the high bit of indices if shuffle is unary (a swizzle).
uint8_t CanonicalizeShuffle(Node* node);
// Canonicalize shuffles to make pattern matching simpler. Returns the shuffle
// indices, and a boolean indicating if the shuffle is a swizzle (one input).
void CanonicalizeShuffle(Node* node, uint8_t* shuffle, bool* is_swizzle);
// Swaps the two first input operands of the node, to help match shuffles
// to specific architectural instructions.
void SwapShuffleInputs(Node* node);
// ===========================================================================
......
......@@ -2133,7 +2133,9 @@ static const ShuffleEntry arch_shuffles[] = {
{{1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}, kMipsS8x2Reverse}};
bool TryMatchArchShuffle(const uint8_t* shuffle, const ShuffleEntry* table,
size_t num_entries, uint8_t mask, ArchOpcode* opcode) {
size_t num_entries, bool is_swizzle,
ArchOpcode* opcode) {
uint8_t mask = is_swizzle ? kSimd128Size - 1 : 2 * kSimd128Size - 1;
for (size_t i = 0; i < num_entries; ++i) {
const ShuffleEntry& entry = table[i];
int j = 0;
......@@ -2153,35 +2155,35 @@ bool TryMatchArchShuffle(const uint8_t* shuffle, const ShuffleEntry* table,
} // namespace
void InstructionSelector::VisitS8x16Shuffle(Node* node) {
const uint8_t* shuffle = OpParameter<uint8_t*>(node->op());
uint8_t mask = CanonicalizeShuffle(node);
uint8_t shuffle[kSimd128Size];
bool is_swizzle;
CanonicalizeShuffle(node, shuffle, &is_swizzle);
uint8_t shuffle32x4[4];
ArchOpcode opcode;
if (TryMatchArchShuffle(shuffle, arch_shuffles, arraysize(arch_shuffles),
mask, &opcode)) {
is_swizzle, &opcode)) {
VisitRRR(this, opcode, node);
return;
}
Node* input0 = node->InputAt(0);
Node* input1 = node->InputAt(1);
uint8_t offset;
MipsOperandGenerator g(this);
if (TryMatchConcat(shuffle, mask, &offset)) {
Emit(kMipsS8x16Concat, g.DefineSameAsFirst(node),
g.UseRegister(node->InputAt(1)), g.UseRegister(node->InputAt(0)),
g.UseImmediate(offset));
if (TryMatchConcat(shuffle, &offset)) {
Emit(kMipsS8x16Concat, g.DefineSameAsFirst(node), g.UseRegister(input0),
g.UseRegister(input1), g.UseImmediate(offset));
return;
}
if (TryMatch32x4Shuffle(shuffle, shuffle32x4)) {
Emit(kMipsS32x4Shuffle, g.DefineAsRegister(node),
g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)),
g.UseImmediate(Pack4Lanes(shuffle32x4, mask)));
Emit(kMipsS32x4Shuffle, g.DefineAsRegister(node), g.UseRegister(input0),
g.UseRegister(input1), g.UseImmediate(Pack4Lanes(shuffle32x4)));
return;
}
Emit(kMipsS8x16Shuffle, g.DefineAsRegister(node),
g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)),
g.UseImmediate(Pack4Lanes(shuffle, mask)),
g.UseImmediate(Pack4Lanes(shuffle + 4, mask)),
g.UseImmediate(Pack4Lanes(shuffle + 8, mask)),
g.UseImmediate(Pack4Lanes(shuffle + 12, mask)));
Emit(kMipsS8x16Shuffle, g.DefineAsRegister(node), g.UseRegister(input0),
g.UseRegister(input1), g.UseImmediate(Pack4Lanes(shuffle)),
g.UseImmediate(Pack4Lanes(shuffle + 4)),
g.UseImmediate(Pack4Lanes(shuffle + 8)),
g.UseImmediate(Pack4Lanes(shuffle + 12)));
}
void InstructionSelector::VisitSignExtendWord8ToInt32(Node* node) {
......
......@@ -2802,7 +2802,9 @@ static const ShuffleEntry arch_shuffles[] = {
kMips64S8x2Reverse}};
bool TryMatchArchShuffle(const uint8_t* shuffle, const ShuffleEntry* table,
size_t num_entries, uint8_t mask, ArchOpcode* opcode) {
size_t num_entries, bool is_swizzle,
ArchOpcode* opcode) {
uint8_t mask = is_swizzle ? kSimd128Size - 1 : 2 * kSimd128Size - 1;
for (size_t i = 0; i < num_entries; ++i) {
const ShuffleEntry& entry = table[i];
int j = 0;
......@@ -2822,35 +2824,35 @@ bool TryMatchArchShuffle(const uint8_t* shuffle, const ShuffleEntry* table,
} // namespace
void InstructionSelector::VisitS8x16Shuffle(Node* node) {
const uint8_t* shuffle = OpParameter<uint8_t*>(node->op());
uint8_t mask = CanonicalizeShuffle(node);
uint8_t shuffle[kSimd128Size];
bool is_swizzle;
CanonicalizeShuffle(node, shuffle, &is_swizzle);
uint8_t shuffle32x4[4];
ArchOpcode opcode;
if (TryMatchArchShuffle(shuffle, arch_shuffles, arraysize(arch_shuffles),
mask, &opcode)) {
is_swizzle, &opcode)) {
VisitRRR(this, opcode, node);
return;
}
Node* input0 = node->InputAt(0);
Node* input1 = node->InputAt(1);
uint8_t offset;
Mips64OperandGenerator g(this);
if (TryMatchConcat(shuffle, mask, &offset)) {
Emit(kMips64S8x16Concat, g.DefineSameAsFirst(node),
g.UseRegister(node->InputAt(1)), g.UseRegister(node->InputAt(0)),
g.UseImmediate(offset));
if (TryMatchConcat(shuffle, &offset)) {
Emit(kMips64S8x16Concat, g.DefineSameAsFirst(node), g.UseRegister(input0),
g.UseRegister(input1), g.UseImmediate(offset));
return;
}
if (TryMatch32x4Shuffle(shuffle, shuffle32x4)) {
Emit(kMips64S32x4Shuffle, g.DefineAsRegister(node),
g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)),
g.UseImmediate(Pack4Lanes(shuffle32x4, mask)));
Emit(kMips64S32x4Shuffle, g.DefineAsRegister(node), g.UseRegister(input0),
g.UseRegister(input1), g.UseImmediate(Pack4Lanes(shuffle32x4)));
return;
}
Emit(kMips64S8x16Shuffle, g.DefineAsRegister(node),
g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)),
g.UseImmediate(Pack4Lanes(shuffle, mask)),
g.UseImmediate(Pack4Lanes(shuffle + 4, mask)),
g.UseImmediate(Pack4Lanes(shuffle + 8, mask)),
g.UseImmediate(Pack4Lanes(shuffle + 12, mask)));
Emit(kMips64S8x16Shuffle, g.DefineAsRegister(node), g.UseRegister(input0),
g.UseRegister(input1), g.UseImmediate(Pack4Lanes(shuffle)),
g.UseImmediate(Pack4Lanes(shuffle + 4)),
g.UseImmediate(Pack4Lanes(shuffle + 8)),
g.UseImmediate(Pack4Lanes(shuffle + 12)));
}
void InstructionSelector::VisitSignExtendWord8ToInt32(Node* node) {
......
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment