Commit a459f188 authored by bbudge, committed by Commit bot

[ARM] Implement irregular vector shuffles for SIMD.

- S32x4Shuffle is implemented by decomposing into s-register moves if no pattern matches.
- S16x8Shuffle and S8x16Shuffle are implemented with vtbl if no pattern matches.
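
For reference, the generic operation all three lowerings must implement can be
modeled in scalar C++ (an illustrative sketch; the names are ours, not V8's):

    #include <array>
    #include <cstdint>

    // Scalar model of a binary S8x16 shuffle: each of the 16 lane indices
    // selects a byte from the 32-byte concatenation of src0 and src1.
    std::array<uint8_t, 16> GenericShuffle(const std::array<uint8_t, 16>& src0,
                                           const std::array<uint8_t, 16>& src1,
                                           const std::array<uint8_t, 16>& lanes) {
      std::array<uint8_t, 16> dst;
      for (int i = 0; i < 16; i++) {
        uint8_t lane = lanes[i] & 0x1F;  // lane indices are in [0, 31]
        dst[i] = lane < 16 ? src0[lane] : src1[lane - 16];
      }
      return dst;
    }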

LOG=N
BUG=v8:6020

Review-Url: https://codereview.chromium.org/2856363003
Cr-Commit-Position: refs/heads/master@{#45210}
parent e9a05114
@@ -347,6 +347,14 @@ Condition FlagsConditionToCondition(FlagsCondition condition) {
  return kNoCondition;
}

int GetVtblTableSize(const Simd128Register& src0, const Simd128Register& src1) {
  // If unary shuffle, table is src0 (2 d-registers).
  if (src0.is(src1)) return 2;
  // Binary shuffle, table is src0, src1. They must be consecutive.
  DCHECK_EQ(src0.code() + 1, src1.code());
  return 4;  // 4 d-registers.
}

}  // namespace

#define ASSEMBLE_CHECKED_LOAD_FP(Type) \
@@ -2186,6 +2194,36 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      __ vtrn(Neon32, dst, kScratchQuadReg);  // dst = [0, 4, 2, 6]
      break;
    }
    case kArmS32x4Shuffle: {
      Simd128Register dst = i.OutputSimd128Register(),
                      src0 = i.InputSimd128Register(0),
                      src1 = i.InputSimd128Register(1);
      // Check for in-place shuffles.
      // If dst == src0 == src1, then the shuffle is unary and we only use src0.
      if (dst.is(src0)) {
        __ vmov(kScratchQuadReg, src0);
        src0 = kScratchQuadReg;
      } else if (dst.is(src1)) {
        __ vmov(kScratchQuadReg, src1);
        src1 = kScratchQuadReg;
      }
      // Perform shuffle as a vmov per lane.
      int dst_code = dst.code() * 4;
      int src0_code = src0.code() * 4;
      int src1_code = src1.code() * 4;
      int32_t shuffle = i.InputInt32(2);
      for (int i = 0; i < 4; i++) {
        int lane = shuffle & 0x7;
        int src_code = src0_code;
        if (lane >= 4) {
          src_code = src1_code;
          lane &= 0x3;
        }
        __ VmovExtended(dst_code + i, src_code + lane, kScratchReg);
        shuffle >>= 8;
      }
      break;
    }
    case kArmS32x4TransposeRight: {
      Simd128Register dst = i.OutputSimd128Register(),
                      src1 = i.InputSimd128Register(1);
@@ -2249,6 +2287,39 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      __ vtrn(Neon16, kScratchQuadReg, dst);  // dst = [1, 9, 3, 11, ... 15]
      break;
    }
    case kArmS16x8Shuffle: {
      Simd128Register dst = i.OutputSimd128Register(),
                      src0 = i.InputSimd128Register(0),
                      src1 = i.InputSimd128Register(1);
      DwVfpRegister table_base = src0.low();
      int table_size = GetVtblTableSize(src0, src1);
      // Convert the shuffle lane masks to byte masks in kScratchQuadReg.
      int scratch_s_base = kScratchQuadReg.code() * 4;
      for (int j = 0; j < 2; j++) {
        int32_t four_lanes = i.InputInt32(2 + j);
        for (int k = 0; k < 2; k++) {
          uint8_t w0 = (four_lanes & 0xF) * kShortSize;
          four_lanes >>= 8;
          uint8_t w1 = (four_lanes & 0xF) * kShortSize;
          four_lanes >>= 8;
          int32_t mask = w0 | ((w0 + 1) << 8) | (w1 << 16) | ((w1 + 1) << 24);
          // Ensure byte indices are in [0, 31] so masks are never NaNs.
          four_lanes &= 0x1F1F1F1F;
          __ vmov(SwVfpRegister::from_code(scratch_s_base + 2 * j + k),
                  bit_cast<float>(mask));
        }
      }
      NeonListOperand table(table_base, table_size);
      if (!dst.is(src0) && !dst.is(src1)) {
        __ vtbl(dst.low(), table, kScratchQuadReg.low());
        __ vtbl(dst.high(), table, kScratchQuadReg.high());
      } else {
        __ vtbl(kScratchQuadReg.low(), table, kScratchQuadReg.low());
        __ vtbl(kScratchQuadReg.high(), table, kScratchQuadReg.high());
        __ vmov(dst, kScratchQuadReg);
      }
      break;
    }
    case kArmS8x16ZipLeft: {
      Simd128Register dst = i.OutputSimd128Register(),
                      src1 = i.InputSimd128Register(1);
@@ -2308,6 +2379,32 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
               i.InputSimd128Register(1), i.InputInt4(2));
      break;
    }
    case kArmS8x16Shuffle: {
      Simd128Register dst = i.OutputSimd128Register(),
                      src0 = i.InputSimd128Register(0),
                      src1 = i.InputSimd128Register(1);
      DwVfpRegister table_base = src0.low();
      int table_size = GetVtblTableSize(src0, src1);
      // The shuffle lane mask is a byte mask; materialize it in kScratchQuadReg.
      int scratch_s_base = kScratchQuadReg.code() * 4;
      for (int j = 0; j < 4; j++) {
        int32_t four_lanes = i.InputInt32(2 + j);
        // Ensure byte indices are in [0, 31] so masks are never NaNs.
        four_lanes &= 0x1F1F1F1F;
        __ vmov(SwVfpRegister::from_code(scratch_s_base + j),
                bit_cast<float>(four_lanes));
      }
      NeonListOperand table(table_base, table_size);
      if (!dst.is(src0) && !dst.is(src1)) {
        __ vtbl(dst.low(), table, kScratchQuadReg.low());
        __ vtbl(dst.high(), table, kScratchQuadReg.high());
      } else {
        __ vtbl(kScratchQuadReg.low(), table, kScratchQuadReg.low());
        __ vtbl(kScratchQuadReg.high(), table, kScratchQuadReg.high());
        __ vmov(dst, kScratchQuadReg);
      }
      break;
    }
    case kArmS32x2Reverse: {
      __ vrev64(Neon32, i.OutputSimd128Register(), i.InputSimd128Register(0));
      break;
...
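
The S16x8 case above widens each 16-bit lane index into a pair of adjacent
byte indices for vtbl (lane 9 covers bytes 18 and 19). A small standalone
sketch of that packing, mirroring the w0/w1 logic (our code, assuming
kShortSize == 2 as in V8):

  #include <cassert>
  #include <cstdint>

  // Expand the two low lane indices of four_lanes (one per byte,
  // little-endian) into the four vtbl byte indices packed into an int32.
  int32_t ExpandTwoLanesToByteMask(uint32_t four_lanes) {
    const int kShortSize = 2;
    uint8_t w0 = (four_lanes & 0xF) * kShortSize;         // first lane
    uint8_t w1 = ((four_lanes >> 8) & 0xF) * kShortSize;  // second lane
    return w0 | ((w0 + 1) << 8) | (w1 << 16) | ((w1 + 1) << 24);
  }

  int main() {
    // Lanes 9 and 2 map to byte indices 18, 19, 4, 5.
    assert(ExpandTwoLanesToByteMask(0x0209) == 0x05041312);
    return 0;
  }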
@@ -239,12 +239,14 @@ namespace compiler {
  V(ArmS32x4UnzipRight) \
  V(ArmS32x4TransposeLeft) \
  V(ArmS32x4TransposeRight) \
  V(ArmS32x4Shuffle) \
  V(ArmS16x8ZipLeft) \
  V(ArmS16x8ZipRight) \
  V(ArmS16x8UnzipLeft) \
  V(ArmS16x8UnzipRight) \
  V(ArmS16x8TransposeLeft) \
  V(ArmS16x8TransposeRight) \
  V(ArmS16x8Shuffle) \
  V(ArmS8x16ZipLeft) \
  V(ArmS8x16ZipRight) \
  V(ArmS8x16UnzipLeft) \
@@ -252,6 +254,7 @@ namespace compiler {
  V(ArmS8x16TransposeLeft) \
  V(ArmS8x16TransposeRight) \
  V(ArmS8x16Concat) \
  V(ArmS8x16Shuffle) \
  V(ArmS32x2Reverse) \
  V(ArmS16x4Reverse) \
  V(ArmS16x2Reverse) \
...
@@ -223,12 +223,14 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kArmS32x4UnzipRight:
    case kArmS32x4TransposeLeft:
    case kArmS32x4TransposeRight:
    case kArmS32x4Shuffle:
    case kArmS16x8ZipLeft:
    case kArmS16x8ZipRight:
    case kArmS16x8UnzipLeft:
    case kArmS16x8UnzipRight:
    case kArmS16x8TransposeLeft:
    case kArmS16x8TransposeRight:
    case kArmS16x8Shuffle:
    case kArmS8x16ZipLeft:
    case kArmS8x16ZipRight:
    case kArmS8x16UnzipLeft:
@@ -236,6 +238,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kArmS8x16TransposeLeft:
    case kArmS8x16TransposeRight:
    case kArmS8x16Concat:
    case kArmS8x16Shuffle:
    case kArmS32x2Reverse:
    case kArmS16x4Reverse:
    case kArmS16x2Reverse:
...
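
The vtbl lowerings above lean on NEON's table-lookup semantics: each index
byte selects a byte from a table of up to four consecutive d-registers, and
an out-of-range index produces zero, which is why the byte masks are clamped
to [0, 31]. A scalar model of one d-register of output (our sketch, not V8
code):

  #include <cstddef>
  #include <cstdint>

  // Model of vtbl: dst is one d-register (8 bytes); table_bytes is
  // 8 * GetVtblTableSize(...), i.e. 16 for unary and 32 for binary shuffles.
  void VtblModel(uint8_t dst[8], const uint8_t* table, size_t table_bytes,
                 const uint8_t idx[8]) {
    for (int i = 0; i < 8; i++) {
      dst[i] = idx[i] < table_bytes ? table[idx[i]] : 0;
    }
  }

The requirement that binary tables occupy consecutive registers is what
ArrangeShuffleTable below enforces by pinning the inputs to q0 and q1.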
@@ -2583,8 +2583,7 @@ static const ShuffleEntry<4> arch_s32x4_shuffles[] = {
    {{1, 3, 5, 7}, kArmS32x4UnzipRight},
    {{0, 4, 2, 6}, kArmS32x4TransposeLeft},
    {{1, 5, 3, 7}, kArmS32x4TransposeRight},
    {{1, 0, 3, 2}, kArmS32x2Reverse}};

static const ShuffleEntry<8> arch_s16x8_shuffles[] = {
    {{0, 8, 1, 9, 2, 10, 3, 11}, kArmS16x8ZipLeft},
@@ -2594,8 +2593,7 @@ static const ShuffleEntry<8> arch_s16x8_shuffles[] = {
    {{0, 8, 2, 10, 4, 12, 6, 14}, kArmS16x8TransposeLeft},
    {{1, 9, 3, 11, 5, 13, 7, 15}, kArmS16x8TransposeRight},
    {{3, 2, 1, 0, 7, 6, 5, 4}, kArmS16x4Reverse},
    {{1, 0, 3, 2, 5, 4, 7, 6}, kArmS16x2Reverse}};

static const ShuffleEntry<16> arch_s8x16_shuffles[] = {
    {{0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23},
@@ -2612,8 +2610,7 @@ static const ShuffleEntry<16> arch_s8x16_shuffles[] = {
     kArmS8x16TransposeRight},
    {{7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8}, kArmS8x8Reverse},
    {{3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12}, kArmS8x4Reverse},
    {{1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}, kArmS8x2Reverse}};

// Use a non-shuffle opcode to signal no match.
static const ArchOpcode kNoShuffle = kArmS128Not;
@@ -2683,6 +2680,27 @@ uint8_t CanonicalizeShuffle(InstructionSelector* selector, Node* node,
  return mask;
}

int32_t Pack4Lanes(const uint8_t* shuffle, uint8_t mask) {
  int32_t result = 0;
  for (int i = 3; i >= 0; i--) {
    result <<= 8;
    result |= shuffle[i] & mask;
  }
  return result;
}

void ArrangeShuffleTable(ArmOperandGenerator* g, Node* input0, Node* input1,
                         InstructionOperand* src0, InstructionOperand* src1) {
  if (input0 == input1) {
    // Unary, any q-register can be the table.
    *src0 = *src1 = g->UseRegister(input0);
  } else {
    // Binary, table registers must be consecutive.
    *src0 = g->UseFixed(input0, q0);
    *src1 = g->UseFixed(input1, q1);
  }
}

}  // namespace

void InstructionSelector::VisitS32x4Shuffle(Node* node) {
@@ -2702,7 +2720,9 @@ void InstructionSelector::VisitS32x4Shuffle(Node* node) {
         g.UseImmediate(lanes * 4));
    return;
  }
  Emit(kArmS32x4Shuffle, g.DefineAsRegister(node),
       g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)),
       g.UseImmediate(Pack4Lanes(shuffle, mask)));
}

void InstructionSelector::VisitS16x8Shuffle(Node* node) {
@@ -2715,13 +2735,20 @@ void InstructionSelector::VisitS16x8Shuffle(Node* node) {
    return;
  }
  ArmOperandGenerator g(this);
  Node* input0 = node->InputAt(0);
  Node* input1 = node->InputAt(1);
  uint8_t lanes = TryMatchConcat<8>(shuffle, mask);
  if (lanes != 0) {
    Emit(kArmS8x16Concat, g.DefineAsRegister(node), g.UseRegister(input0),
         g.UseRegister(input1), g.UseImmediate(lanes * 2));
    return;
  }
  // Code generator uses vtbl; arrange sources to form a valid lookup table.
  InstructionOperand src0, src1;
  ArrangeShuffleTable(&g, input0, input1, &src0, &src1);
  Emit(kArmS16x8Shuffle, g.DefineAsRegister(node), src0, src1,
       g.UseImmediate(Pack4Lanes(shuffle, mask)),
       g.UseImmediate(Pack4Lanes(shuffle + 4, mask)));
}

void InstructionSelector::VisitS8x16Shuffle(Node* node) {
@@ -2734,13 +2761,22 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) {
    return;
  }
  ArmOperandGenerator g(this);
  Node* input0 = node->InputAt(0);
  Node* input1 = node->InputAt(1);
  uint8_t lanes = TryMatchConcat<16>(shuffle, mask);
  if (lanes != 0) {
    Emit(kArmS8x16Concat, g.DefineAsRegister(node), g.UseRegister(input0),
         g.UseRegister(input1), g.UseImmediate(lanes));
    return;
  }
  // Code generator uses vtbl; arrange sources to form a valid lookup table.
  InstructionOperand src0, src1;
  ArrangeShuffleTable(&g, input0, input1, &src0, &src1);
  Emit(kArmS8x16Shuffle, g.DefineAsRegister(node), src0, src1,
       g.UseImmediate(Pack4Lanes(shuffle, mask)),
       g.UseImmediate(Pack4Lanes(shuffle + 4, mask)),
       g.UseImmediate(Pack4Lanes(shuffle + 8, mask)),
       g.UseImmediate(Pack4Lanes(shuffle + 12, mask)));
}

void InstructionSelector::VisitInt32AbsWithOverflow(Node* node) {
...
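
Pack4Lanes above packs four lane indices into one immediate, least
significant byte first, so byte n of the immediate holds lane n. A quick
standalone check of the encoding (our example):

  #include <cassert>
  #include <cstdint>

  int32_t Pack4Lanes(const uint8_t* shuffle, uint8_t mask) {
    int32_t result = 0;
    for (int i = 3; i >= 0; i--) {
      result <<= 8;
      result |= shuffle[i] & mask;
    }
    return result;
  }

  int main() {
    const uint8_t shuffle[] = {1, 0, 3, 2};  // the S32x2Reverse pattern
    assert(Pack4Lanes(shuffle, 0xFF) == 0x02030001);
    return 0;
  }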
@@ -1710,6 +1710,12 @@ WASM_EXEC_COMPILED_TEST(S32x2Reverse) {
  RunBinaryLaneOpTest<int32_t>(kExprS32x4Shuffle, {{1, 0, 3, 2}});
}

// Test irregular shuffle.
WASM_EXEC_COMPILED_TEST(S32x4Irregular) {
  RunBinaryLaneOpTest<int32_t>(kExprS32x4Shuffle, {{0, 4, 4, 5}});
  RunBinaryLaneOpTest<int32_t>(kExprS32x4Shuffle, {{0, 0, 0, 1}});
}

WASM_EXEC_COMPILED_TEST(S16x8ZipLeft) {
  RunBinaryLaneOpTest<int16_t>(kExprS16x8Shuffle, {{0, 8, 1, 9, 2, 10, 3, 11}});
  RunBinaryLaneOpTest<int16_t>(kExprS16x8Shuffle, {{0, 0, 1, 1, 2, 2, 3, 3}});
@@ -1753,6 +1759,11 @@ WASM_EXEC_COMPILED_TEST(S16x2Reverse) {
  RunBinaryLaneOpTest<int16_t>(kExprS16x8Shuffle, {{1, 0, 3, 2, 5, 4, 7, 6}});
}

WASM_EXEC_COMPILED_TEST(S16x8Irregular) {
  RunBinaryLaneOpTest<int16_t>(kExprS16x8Shuffle, {{0, 8, 8, 0, 2, 10, 3, 11}});
  RunBinaryLaneOpTest<int16_t>(kExprS16x8Shuffle, {{0, 0, 0, 0, 2, 2, 3, 3}});
}

WASM_EXEC_COMPILED_TEST(S8x16ZipLeft) {
  RunBinaryLaneOpTest<int8_t>(
      kExprS8x16Shuffle,
@@ -1817,6 +1828,14 @@ WASM_EXEC_COMPILED_TEST(S8x2Reverse) {
                               11, 10, 13, 12, 15, 14}});
}

WASM_EXEC_COMPILED_TEST(S8x16Irregular) {
  RunBinaryLaneOpTest<int8_t>(
      kExprS8x16Shuffle,
      {{0, 16, 0, 16, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23}});
  RunBinaryLaneOpTest<int8_t>(
      kExprS8x16Shuffle, {{0, 0, 0, 0, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7}});
}

// Test shuffles that concatenate the two vectors.
template <typename T>
void RunConcatOpTest(WasmOpcode simd_op) {
...
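
The irregular cases above use lane patterns that match none of the entries in
arch_s32x4_shuffles and friends, forcing the new fallback paths. Their
expected outputs follow from the scalar model sketched earlier; for example
(our check, assuming inputs whose lane values equal their overall lane
numbers, which is not necessarily the harness's actual test data):

  #include <array>
  #include <cassert>
  #include <cstdint>

  int main() {
    // First S32x4Irregular case: shuffle {0, 4, 4, 5}.
    std::array<int32_t, 4> src0 = {0, 1, 2, 3}, src1 = {4, 5, 6, 7};
    std::array<uint8_t, 4> lanes = {0, 4, 4, 5};
    std::array<int32_t, 4> dst;
    for (int i = 0; i < 4; i++) {
      dst[i] = lanes[i] < 4 ? src0[lanes[i]] : src1[lanes[i] - 4];
    }
    assert((dst == std::array<int32_t, 4>{0, 4, 4, 5}));
    return 0;
  }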