Commit 8ab48b6c authored by Bill Budge, committed by Commit Bot

[WASM] Use scalar duplicate for matching shuffles on arm and arm64.

- Adds an opcode for the 32/16/8-bit dup instruction.
- Matches shuffles that are equivalent to dups.
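For example, the byte shuffle {4,5,6,7, 4,5,6,7, 4,5,6,7, 4,5,6,7} reads
32-bit lane 1 of the first source into every lane of the result, so it can
be lowered to a single dup instead of a general byte-wise shuffle.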

Bug: v8:6020
Change-Id: I8848d974adf30127d1dc31c09a9517f8f9573ce9
Reviewed-on: https://chromium-review.googlesource.com/571448
Commit-Queue: Bill Budge <bbudge@chromium.org>
Reviewed-by: Martyn Capewell <martyn.capewell@arm.com>
Reviewed-by: Mircea Trofin <mtrofin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#46803}
parent 9d0438ba
@@ -1577,7 +1577,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
case kArmF32x4Splat: {
int src_code = i.InputFloatRegister(0).code();
__ vdup(Neon32, i.OutputSimd128Register(),
DwVfpRegister::from_code(src_code / 2), src_code & 0x1);
DwVfpRegister::from_code(src_code / 2), src_code % 2);
break;
}
case kArmF32x4ExtractLane: {
@@ -2088,6 +2088,18 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.OutputSimd128Register());
break;
}
case kArmS128Dup: {
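// NEON vdup duplicates a lane of a 64-bit D register, so map the lane
// index within the Q register to a D register (low or high half of the
// source) plus a lane index within that D register.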
NeonSize size = static_cast<NeonSize>(i.InputInt32(1));
int lanes = kSimd128Size >> size;
int index = i.InputInt32(2);
DCHECK(index < lanes);
int d_lanes = lanes / 2;
int src_d_index = index & (d_lanes - 1);
int src_d_code = i.InputSimd128Register(0).low().code() + index / d_lanes;
__ vdup(size, i.OutputSimd128Register(),
DwVfpRegister::from_code(src_d_code), src_d_index);
break;
}
case kArmS128And: {
__ vand(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1));
......
@@ -228,6 +228,7 @@ namespace compiler {
V(ArmI8x16GtU) \
V(ArmI8x16GeU) \
V(ArmS128Zero) \
V(ArmS128Dup) \
V(ArmS128And) \
V(ArmS128Or) \
V(ArmS128Xor) \
......
@@ -212,6 +212,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArmI8x16GtU:
case kArmI8x16GeU:
case kArmS128Zero:
case kArmS128Dup:
case kArmS128And:
case kArmS128Or:
case kArmS128Xor:
......
@@ -2538,35 +2538,6 @@ void InstructionSelector::VisitS128Select(Node* node) {
namespace {
// Tries to match 8x16 byte shuffle to equivalent 32x4 word shuffle.
bool TryMatch32x4Shuffle(const uint8_t* shuffle, uint8_t* shuffle32x4) {
static const int kLanes = 4;
static const int kLaneSize = 4;
for (int i = 0; i < kLanes; ++i) {
if (shuffle[i * kLaneSize] % kLaneSize != 0) return false;
for (int j = 1; j < kLaneSize; ++j) {
if (shuffle[i * kLaneSize + j] - shuffle[i * kLaneSize + j - 1] != 1)
return false;
}
shuffle32x4[i] = shuffle[i * kLaneSize] / kLaneSize;
}
return true;
}
// Tries to match byte shuffle to concatenate (vext) operation.
bool TryMatchConcat(const uint8_t* shuffle, uint8_t mask, uint8_t* offset) {
uint8_t start = shuffle[0];
for (int i = 1; i < kSimd128Size - start; ++i) {
if ((shuffle[i] & mask) != ((shuffle[i - 1] + 1) & mask)) return false;
}
uint8_t wrap = kSimd128Size;
for (int i = kSimd128Size - start; i < kSimd128Size; ++i, ++wrap) {
if ((shuffle[i] & mask) != (wrap & mask)) return false;
}
*offset = start;
return true;
}
struct ShuffleEntry {
uint8_t shuffle[kSimd128Size];
ArchOpcode opcode;
@@ -2636,48 +2607,6 @@ bool TryMatchArchShuffle(const uint8_t* shuffle, const ShuffleEntry* table,
return false;
}
// Canonicalize shuffles to make pattern matching simpler. Returns a mask that
// will ignore the high bit of indices in some cases.
uint8_t CanonicalizeShuffle(InstructionSelector* selector, Node* node) {
static const int kUnaryShuffleMask = kSimd128Size - 1;
const uint8_t* shuffle = OpParameter<uint8_t*>(node);
uint8_t mask = 0xff;
// If shuffle is unary, set 'mask' to ignore the high bit of the indices.
// Replace any unused source with the other.
if (selector->GetVirtualRegister(node->InputAt(0)) ==
selector->GetVirtualRegister(node->InputAt(1))) {
// unary, src0 == src1.
mask = kUnaryShuffleMask;
} else {
bool src0_is_used = false;
bool src1_is_used = false;
for (int i = 0; i < kSimd128Size; i++) {
if (shuffle[i] < kSimd128Size) {
src0_is_used = true;
} else {
src1_is_used = true;
}
}
if (src0_is_used && !src1_is_used) {
node->ReplaceInput(1, node->InputAt(0));
mask = kUnaryShuffleMask;
} else if (src1_is_used && !src0_is_used) {
node->ReplaceInput(0, node->InputAt(1));
mask = kUnaryShuffleMask;
}
}
return mask;
}
int32_t Pack4Lanes(const uint8_t* shuffle, uint8_t mask) {
int32_t result = 0;
for (int i = 3; i >= 0; --i) {
result <<= 8;
result |= shuffle[i] & mask;
}
return result;
}
void ArrangeShuffleTable(ArmOperandGenerator* g, Node* input0, Node* input1,
InstructionOperand* src0, InstructionOperand* src1) {
if (input0 == input1) {
@@ -2694,13 +2623,35 @@ void ArrangeShuffleTable(ArmOperandGenerator* g, Node* input0, Node* input1,
void InstructionSelector::VisitS8x16Shuffle(Node* node) {
const uint8_t* shuffle = OpParameter<uint8_t*>(node);
uint8_t mask = CanonicalizeShuffle(this, node);
uint8_t mask = CanonicalizeShuffle(node);
uint8_t shuffle32x4[4];
ArmOperandGenerator g(this);
int index = 0;
if (TryMatch32x4Shuffle(shuffle, shuffle32x4)) {
Emit(kArmS32x4Shuffle, g.DefineAsRegister(node),
g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)),
g.UseImmediate(Pack4Lanes(shuffle32x4, mask)));
if (TryMatchDup<4>(shuffle, &index)) {
InstructionOperand src = index < 4 ? g.UseRegister(node->InputAt(0))
: g.UseRegister(node->InputAt(1));
Emit(kArmS128Dup, g.DefineAsRegister(node), src, g.UseImmediate(Neon32),
g.UseImmediate(index % 4));
} else {
Emit(kArmS32x4Shuffle, g.DefineAsRegister(node),
g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)),
g.UseImmediate(Pack4Lanes(shuffle32x4, mask)));
}
return;
}
if (TryMatchDup<8>(shuffle, &index)) {
InstructionOperand src = index < 8 ? g.UseRegister(node->InputAt(0))
: g.UseRegister(node->InputAt(1));
Emit(kArmS128Dup, g.DefineAsRegister(node), src, g.UseImmediate(Neon16),
g.UseImmediate(index % 8));
return;
}
if (TryMatchDup<16>(shuffle, &index)) {
InstructionOperand src = index < 16 ? g.UseRegister(node->InputAt(0))
: g.UseRegister(node->InputAt(1));
Emit(kArmS128Dup, g.DefineAsRegister(node), src, g.UseImmediate(Neon8),
g.UseImmediate(index % 16));
return;
}
ArchOpcode opcode;
......
@@ -2053,6 +2053,27 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
SIMD_BINOP_CASE(kArm64S128Or, Orr, 16B);
SIMD_BINOP_CASE(kArm64S128Xor, Eor, 16B);
SIMD_UNOP_CASE(kArm64S128Not, Mvn, 16B);
case kArm64S128Dup: {
VRegister dst = i.OutputSimd128Register(),
src = i.InputSimd128Register(0);
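// Operand 1 is the lane count (4, 8, or 16), which selects the vector
// format; operand 2 is the index of the lane to duplicate.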
int lanes = i.InputInt32(1);
int index = i.InputInt32(2);
switch (lanes) {
case 4:
__ Dup(dst.V4S(), src.V4S(), index);
break;
case 8:
__ Dup(dst.V8H(), src.V8H(), index);
break;
case 16:
__ Dup(dst.V16B(), src.V16B(), index);
break;
default:
UNREACHABLE();
break;
}
break;
}
case kArm64S128Select: {
VRegister dst = i.OutputSimd128Register().V16B();
DCHECK(dst.is(i.InputSimd128Register(0).V16B()));
......
@@ -260,6 +260,7 @@ namespace compiler {
V(Arm64I8x16GtU) \
V(Arm64I8x16GeU) \
V(Arm64S128Zero) \
V(Arm64S128Dup) \
V(Arm64S128And) \
V(Arm64S128Or) \
V(Arm64S128Xor) \
......
@@ -236,6 +236,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArm64I8x16GtU:
case kArm64I8x16GeU:
case kArm64S128Zero:
case kArm64S128Dup:
case kArm64S128And:
case kArm64S128Or:
case kArm64S128Xor:
......
@@ -3031,43 +3031,10 @@ void InstructionSelector::VisitS128Select(Node* node) {
g.UseRegister(node->InputAt(2)));
}
// Tries to match 8x16 byte shuffle to equivalent 32x4 word shuffle. If
// successful, writes the 32x4 shuffle indices.
bool TryMatch32x4Shuffle(const uint8_t* shuffle, uint8_t* shuffle32x4) {
for (int i = 0; i < 4; i++) {
if (shuffle[i * 4] % 4 != 0) return false;
for (int j = 1; j < 4; j++) {
if (shuffle[i * 4 + j] - shuffle[i * 4 + j - 1] != 1) return false;
}
shuffle32x4[i] = shuffle[i * 4] / 4;
}
return true;
}
// Tries to match byte shuffle to concatenate (vext) operation. If successful,
// writes the vext immediate value.
bool TryMatchConcat(const uint8_t* shuffle, uint8_t mask, uint8_t* vext) {
uint8_t start = shuffle[0];
int i = 1;
for (; i < 16 - start; i++) {
if ((shuffle[i] & mask) != ((shuffle[i - 1] + 1) & mask)) return false;
}
uint8_t wrap = 16;
for (; i < 16; i++, wrap++) {
if ((shuffle[i] & mask) != (wrap & mask)) return false;
}
*vext = start;
return true;
}
namespace {
static const int kShuffleLanes = 16;
static const int kMaxLaneIndex = 15;
static const int kMaxShuffleIndex = 31;
struct ShuffleEntry {
uint8_t shuffle[kShuffleLanes];
uint8_t shuffle[kSimd128Size];
ArchOpcode opcode;
};
@@ -3126,12 +3093,12 @@ bool TryMatchArchShuffle(const uint8_t* shuffle, const ShuffleEntry* table,
for (size_t i = 0; i < num_entries; i++) {
const ShuffleEntry& entry = table[i];
int j = 0;
for (; j < kShuffleLanes; j++) {
for (; j < kSimd128Size; j++) {
if ((entry.shuffle[j] & mask) != (shuffle[j] & mask)) {
break;
}
}
if (j == kShuffleLanes) {
if (j == kSimd128Size) {
*opcode = entry.opcode;
return true;
}
@@ -3139,47 +3106,6 @@ bool TryMatchArchShuffle(const uint8_t* shuffle, const ShuffleEntry* table,
return false;
}
// Canonicalize shuffles to make pattern matching simpler. Returns a mask that
// will ignore the high bit of indices in some cases.
uint8_t CanonicalizeShuffle(InstructionSelector* selector, Node* node) {
const uint8_t* shuffle = OpParameter<uint8_t*>(node);
uint8_t mask = kMaxShuffleIndex;
// If shuffle is unary, set 'mask' to ignore the high bit of the indices.
// Replace any unused source with the other.
if (selector->GetVirtualRegister(node->InputAt(0)) ==
selector->GetVirtualRegister(node->InputAt(1))) {
// unary, src0 == src1.
mask = kMaxLaneIndex;
} else {
bool src0_is_used = false;
bool src1_is_used = false;
for (int i = 0; i < 16; i++) {
if (shuffle[i] < 16) {
src0_is_used = true;
} else {
src1_is_used = true;
}
}
if (src0_is_used && !src1_is_used) {
node->ReplaceInput(1, node->InputAt(0));
mask = kMaxLaneIndex;
} else if (src1_is_used && !src0_is_used) {
node->ReplaceInput(0, node->InputAt(1));
mask = kMaxLaneIndex;
}
}
return mask;
}
int32_t Pack4Lanes(const uint8_t* shuffle, uint8_t mask) {
int32_t result = 0;
for (int i = 3; i >= 0; i--) {
result <<= 8;
result |= shuffle[i] & mask;
}
return result;
}
void ArrangeShuffleTable(Arm64OperandGenerator* g, Node* input0, Node* input1,
InstructionOperand* src0, InstructionOperand* src1) {
if (input0 == input1) {
@@ -3196,7 +3122,7 @@ void ArrangeShuffleTable(Arm64OperandGenerator* g, Node* input0, Node* input1,
void InstructionSelector::VisitS8x16Shuffle(Node* node) {
const uint8_t* shuffle = OpParameter<uint8_t*>(node);
uint8_t mask = CanonicalizeShuffle(this, node);
uint8_t mask = CanonicalizeShuffle(node);
uint8_t shuffle32x4[4];
Arm64OperandGenerator g(this);
ArchOpcode opcode;
@@ -3213,10 +3139,32 @@ void InstructionSelector::VisitS8x16Shuffle(Node* node) {
g.UseRegister(input1), g.UseImmediate(bias));
return;
}
int index = 0;
if (TryMatch32x4Shuffle(shuffle, shuffle32x4)) {
Emit(kArm64S32x4Shuffle, g.DefineAsRegister(node),
g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)),
g.UseImmediate(Pack4Lanes(shuffle32x4, mask)));
if (TryMatchDup<4>(shuffle, &index)) {
InstructionOperand src = index < 4 ? g.UseRegister(node->InputAt(0))
: g.UseRegister(node->InputAt(1));
Emit(kArm64S128Dup, g.DefineAsRegister(node), src, g.UseImmediate(4),
g.UseImmediate(index % 4));
} else {
Emit(kArm64S32x4Shuffle, g.DefineAsRegister(node),
g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)),
g.UseImmediate(Pack4Lanes(shuffle32x4, mask)));
}
return;
}
if (TryMatchDup<8>(shuffle, &index)) {
InstructionOperand src = index < 8 ? g.UseRegister(node->InputAt(0))
: g.UseRegister(node->InputAt(1));
Emit(kArm64S128Dup, g.DefineAsRegister(node), src, g.UseImmediate(8),
g.UseImmediate(index % 8));
return;
}
if (TryMatchDup<16>(shuffle, &index)) {
InstructionOperand src = index < 16 ? g.UseRegister(node->InputAt(0))
: g.UseRegister(node->InputAt(1));
Emit(kArm64S128Dup, g.DefineAsRegister(node), src, g.UseImmediate(16),
g.UseImmediate(index % 16));
return;
}
// Code generator uses vtbl, arrange sources to form a valid lookup table.
......
@@ -2788,6 +2788,79 @@ FrameStateDescriptor* InstructionSelector::GetFrameStateDescriptor(
state_info.shared_info(), outer_state);
}
// static
bool InstructionSelector::TryMatch32x4Shuffle(const uint8_t* shuffle,
uint8_t* shuffle32x4) {
for (int i = 0; i < 4; ++i) {
if (shuffle[i * 4] % 4 != 0) return false;
for (int j = 1; j < 4; ++j) {
if (shuffle[i * 4 + j] - shuffle[i * 4 + j - 1] != 1) return false;
}
shuffle32x4[i] = shuffle[i * 4] / 4;
}
return true;
}
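// Example: the byte shuffle {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11}
// matches, producing shuffle32x4 = {1, 0, 3, 2}.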
// static
bool InstructionSelector::TryMatchConcat(const uint8_t* shuffle, uint8_t mask,
uint8_t* vext) {
uint8_t start = shuffle[0];
int i = 1;
for (; i < 16 - start; ++i) {
if ((shuffle[i] & mask) != ((shuffle[i - 1] + 1) & mask)) return false;
}
uint8_t wrap = 16;
for (; i < 16; ++i, ++wrap) {
if ((shuffle[i] & mask) != (wrap & mask)) return false;
}
*vext = start;
return true;
}
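// Example: with mask 31, the byte shuffle
// {4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19} is a concatenation starting
// at byte 4, so *vext becomes 4.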
// Canonicalize shuffles to make pattern matching simpler. Returns a mask that
// will ignore the high bit of indices in some cases.
uint8_t InstructionSelector::CanonicalizeShuffle(Node* node) {
static const int kMaxLaneIndex = 15;
static const int kMaxShuffleIndex = 31;
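// For a unary shuffle, masking with kMaxLaneIndex folds indices 16-31 onto
// 0-15, e.g. index 20 & 15 == 4 refers to lane 4 of the single source.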
const uint8_t* shuffle = OpParameter<uint8_t*>(node);
uint8_t mask = kMaxShuffleIndex;
// If shuffle is unary, set 'mask' to ignore the high bit of the indices.
// Replace any unused source with the other.
if (GetVirtualRegister(node->InputAt(0)) ==
GetVirtualRegister(node->InputAt(1))) {
// unary, src0 == src1.
mask = kMaxLaneIndex;
} else {
bool src0_is_used = false;
bool src1_is_used = false;
for (int i = 0; i < 16; ++i) {
if (shuffle[i] < 16) {
src0_is_used = true;
} else {
src1_is_used = true;
}
}
if (src0_is_used && !src1_is_used) {
node->ReplaceInput(1, node->InputAt(0));
mask = kMaxLaneIndex;
} else if (src1_is_used && !src0_is_used) {
node->ReplaceInput(0, node->InputAt(1));
mask = kMaxLaneIndex;
}
}
return mask;
}
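// Example: Pack4Lanes({0, 1, 2, 3}, 0xff) packs the indices little-endian,
// yielding 0x03020100.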
// static
int32_t InstructionSelector::Pack4Lanes(const uint8_t* shuffle, uint8_t mask) {
int32_t result = 0;
for (int i = 3; i >= 0; --i) {
result <<= 8;
result |= shuffle[i] & mask;
}
return result;
}
} // namespace compiler
} // namespace internal
......
@@ -353,6 +353,51 @@ class V8_EXPORT_PRIVATE InstructionSelector final {
void EmitIdentity(Node* node);
bool CanProduceSignalingNaN(Node* node);
// ===========================================================================
// ============= Vector instruction (SIMD) helper fns. =======================
// ===========================================================================
// Tries to match a byte shuffle to a scalar splat operation. If successful,
// writes the index of the duplicated lane.
template <int LANES>
static bool TryMatchDup(const uint8_t* shuffle, int* index) {
const int kBytesPerLane = kSimd128Size / LANES;
// Get the first lane's worth of bytes and check that indices start at a
// lane boundary and are consecutive.
uint8_t lane0[kBytesPerLane];
lane0[0] = shuffle[0];
if (lane0[0] % kBytesPerLane != 0) return false;
for (int i = 1; i < kBytesPerLane; ++i) {
lane0[i] = shuffle[i];
if (lane0[i] != lane0[0] + i) return false;
}
// Now check that the other lanes are identical to lane0.
for (int i = 1; i < LANES; ++i) {
for (int j = 0; j < kBytesPerLane; ++j) {
if (lane0[j] != shuffle[i * kBytesPerLane + j]) return false;
}
}
*index = lane0[0] / kBytesPerLane;
return true;
}
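// Example: TryMatchDup<4>({4,5,6,7, 4,5,6,7, 4,5,6,7, 4,5,6,7}, &index)
// returns true and sets index to 1 (a 32-bit splat of lane 1).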
// Tries to match 8x16 byte shuffle to an equivalent 32x4 word shuffle. If
// successful, it writes the 32x4 shuffle word indices.
static bool TryMatch32x4Shuffle(const uint8_t* shuffle, uint8_t* shuffle32x4);
// Tries to match a byte shuffle to a concatenate operation. If successful,
// it writes the byte offset.
static bool TryMatchConcat(const uint8_t* shuffle, uint8_t mask,
uint8_t* offset);
// Packs 4 bytes of shuffle into a 32 bit immediate, using a mask from
// CanonicalizeShuffle to convert unary shuffles.
static int32_t Pack4Lanes(const uint8_t* shuffle, uint8_t mask);
// Canonicalize shuffles to make pattern matching simpler. Returns a mask that
// will ignore the high bit of indices if shuffle is unary.
uint8_t CanonicalizeShuffle(Node* node);
// ===========================================================================
Schedule* schedule() const { return schedule_; }
......
@@ -2107,35 +2107,6 @@ void InstructionSelector::VisitS128Select(Node* node) {
namespace {
// Tries to match 8x16 byte shuffle to equivalent 32x4 word shuffle.
bool TryMatch32x4Shuffle(const uint8_t* shuffle, uint8_t* shuffle32x4) {
static const int kLanes = 4;
static const int kLaneSize = 4;
for (int i = 0; i < kLanes; ++i) {
if (shuffle[i * kLaneSize] % kLaneSize != 0) return false;
for (int j = 1; j < kLaneSize; ++j) {
if (shuffle[i * kLaneSize + j] - shuffle[i * kLaneSize + j - 1] != 1)
return false;
}
shuffle32x4[i] = shuffle[i * kLaneSize] / kLaneSize;
}
return true;
}
// Tries to match byte shuffle to concatenate (sldi) operation.
bool TryMatchConcat(const uint8_t* shuffle, uint8_t mask, uint8_t* offset) {
uint8_t start = shuffle[0];
for (int i = 1; i < kSimd128Size - start; ++i) {
if ((shuffle[i] & mask) != ((shuffle[i - 1] + 1) & mask)) return false;
}
uint8_t wrap = kSimd128Size;
for (int i = kSimd128Size - start; i < kSimd128Size; ++i, ++wrap) {
if ((shuffle[i] & mask) != (wrap & mask)) return false;
}
*offset = start;
return true;
}
struct ShuffleEntry {
uint8_t shuffle[kSimd128Size];
ArchOpcode opcode;
@@ -2204,53 +2175,11 @@ bool TryMatchArchShuffle(const uint8_t* shuffle, const ShuffleEntry* table,
return false;
}
// Canonicalize shuffles to make pattern matching simpler. Returns a mask that
// will ignore the high bit of indices in some cases.
uint8_t CanonicalizeShuffle(InstructionSelector* selector, Node* node) {
static const int kUnaryShuffleMask = kSimd128Size - 1;
const uint8_t* shuffle = OpParameter<uint8_t*>(node);
uint8_t mask = 0xff;
// If shuffle is unary, set 'mask' to ignore the high bit of the indices.
// Replace any unused source with the other.
if (selector->GetVirtualRegister(node->InputAt(0)) ==
selector->GetVirtualRegister(node->InputAt(1))) {
// unary, src0 == src1.
mask = kUnaryShuffleMask;
} else {
bool src0_is_used = false;
bool src1_is_used = false;
for (int i = 0; i < kSimd128Size; i++) {
if (shuffle[i] < kSimd128Size) {
src0_is_used = true;
} else {
src1_is_used = true;
}
}
if (src0_is_used && !src1_is_used) {
node->ReplaceInput(1, node->InputAt(0));
mask = kUnaryShuffleMask;
} else if (src1_is_used && !src0_is_used) {
node->ReplaceInput(0, node->InputAt(1));
mask = kUnaryShuffleMask;
}
}
return mask;
}
int32_t Pack4Lanes(const uint8_t* shuffle, uint8_t mask) {
int32_t result = 0;
for (int i = 3; i >= 0; --i) {
result <<= 8;
result |= shuffle[i] & mask;
}
return result;
}
} // namespace
void InstructionSelector::VisitS8x16Shuffle(Node* node) {
const uint8_t* shuffle = OpParameter<uint8_t*>(node);
uint8_t mask = CanonicalizeShuffle(this, node);
uint8_t mask = CanonicalizeShuffle(node);
uint8_t shuffle32x4[4];
ArchOpcode opcode;
if (TryMatchArchShuffle(shuffle, arch_shuffles, arraysize(arch_shuffles),
......
@@ -2800,35 +2800,6 @@ void InstructionSelector::VisitS128Select(Node* node) {
namespace {
// Tries to match 8x16 byte shuffle to equivalent 32x4 word shuffle.
bool TryMatch32x4Shuffle(const uint8_t* shuffle, uint8_t* shuffle32x4) {
static const int kLanes = 4;
static const int kLaneSize = 4;
for (int i = 0; i < kLanes; ++i) {
if (shuffle[i * kLaneSize] % kLaneSize != 0) return false;
for (int j = 1; j < kLaneSize; ++j) {
if (shuffle[i * kLaneSize + j] - shuffle[i * kLaneSize + j - 1] != 1)
return false;
}
shuffle32x4[i] = shuffle[i * kLaneSize] / kLaneSize;
}
return true;
}
// Tries to match byte shuffle to concatenate (sldi) operation.
bool TryMatchConcat(const uint8_t* shuffle, uint8_t mask, uint8_t* offset) {
uint8_t start = shuffle[0];
for (int i = 1; i < kSimd128Size - start; ++i) {
if ((shuffle[i] & mask) != ((shuffle[i - 1] + 1) & mask)) return false;
}
uint8_t wrap = kSimd128Size;
for (int i = kSimd128Size - start; i < kSimd128Size; ++i, ++wrap) {
if ((shuffle[i] & mask) != (wrap & mask)) return false;
}
*offset = start;
return true;
}
struct ShuffleEntry {
uint8_t shuffle[kSimd128Size];
ArchOpcode opcode;
@@ -2902,53 +2873,11 @@ bool TryMatchArchShuffle(const uint8_t* shuffle, const ShuffleEntry* table,
return false;
}
// Canonicalize shuffles to make pattern matching simpler. Returns a mask that
// will ignore the high bit of indices in some cases.
uint8_t CanonicalizeShuffle(InstructionSelector* selector, Node* node) {
static const int kUnaryShuffleMask = kSimd128Size - 1;
const uint8_t* shuffle = OpParameter<uint8_t*>(node);
uint8_t mask = 0xff;
// If shuffle is unary, set 'mask' to ignore the high bit of the indices.
// Replace any unused source with the other.
if (selector->GetVirtualRegister(node->InputAt(0)) ==
selector->GetVirtualRegister(node->InputAt(1))) {
// unary, src0 == src1.
mask = kUnaryShuffleMask;
} else {
bool src0_is_used = false;
bool src1_is_used = false;
for (int i = 0; i < kSimd128Size; i++) {
if (shuffle[i] < kSimd128Size) {
src0_is_used = true;
} else {
src1_is_used = true;
}
}
if (src0_is_used && !src1_is_used) {
node->ReplaceInput(1, node->InputAt(0));
mask = kUnaryShuffleMask;
} else if (src1_is_used && !src0_is_used) {
node->ReplaceInput(0, node->InputAt(1));
mask = kUnaryShuffleMask;
}
}
return mask;
}
int32_t Pack4Lanes(const uint8_t* shuffle, uint8_t mask) {
int32_t result = 0;
for (int i = 3; i >= 0; --i) {
result <<= 8;
result |= shuffle[i] & mask;
}
return result;
}
} // namespace
void InstructionSelector::VisitS8x16Shuffle(Node* node) {
const uint8_t* shuffle = OpParameter<uint8_t*>(node);
uint8_t mask = CanonicalizeShuffle(this, node);
uint8_t mask = CanonicalizeShuffle(node);
uint8_t shuffle32x4[4];
ArchOpcode opcode;
if (TryMatchArchShuffle(shuffle, arch_shuffles, arraysize(arch_shuffles),
......
@@ -1591,6 +1591,14 @@ WASM_SIMD_TEST(F32x4AddHoriz) {
V8_TARGET_ARCH_MIPS64
// Test some regular shuffles that may have special handling on some targets.
// Test normal and unary versions (where the second operand isn't used).
WASM_SIMD_TEST(S32x4Dup) {
RunBinaryLaneOpTest<int8_t>(
kExprS8x16Shuffle,
{{16, 17, 18, 19, 16, 17, 18, 19, 16, 17, 18, 19, 16, 17, 18, 19}});
RunBinaryLaneOpTest<int8_t>(
kExprS8x16Shuffle, {{4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7}});
}
WASM_SIMD_TEST(S32x4ZipLeft) {
RunBinaryLaneOpTest<int8_t>(
kExprS8x16Shuffle,
@@ -1657,6 +1665,14 @@ WASM_SIMD_TEST(S32x4Irregular) {
kExprS8x16Shuffle, {{0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7}});
}
WASM_SIMD_TEST(S16x8Dup) {
RunBinaryLaneOpTest<int8_t>(
kExprS8x16Shuffle,
{{18, 19, 18, 19, 18, 19, 18, 19, 18, 19, 18, 19, 18, 19, 18, 19}});
RunBinaryLaneOpTest<int8_t>(
kExprS8x16Shuffle, {{6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7, 6, 7}});
}
WASM_SIMD_TEST(S16x8ZipLeft) {
RunBinaryLaneOpTest<int8_t>(
kExprS8x16Shuffle,
@@ -1726,6 +1742,14 @@ WASM_SIMD_TEST(S16x8Irregular) {
kExprS8x16Shuffle, {{0, 1, 0, 1, 0, 1, 0, 1, 4, 5, 4, 5, 6, 7, 6, 7}});
}
WASM_SIMD_TEST(S8x16Dup) {
RunBinaryLaneOpTest<int8_t>(
kExprS8x16Shuffle,
{{19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19}});
RunBinaryLaneOpTest<int8_t>(
kExprS8x16Shuffle, {{7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7}});
}
WASM_SIMD_TEST(S8x16ZipLeft) {
RunBinaryLaneOpTest<int8_t>(
kExprS8x16Shuffle,
......