Commit a60707f5 authored by Ng Zhi An, committed by Commit Bot

[wasm-simd][arm] Bitmask instructions

Implement i8x16.bitmask, i16x8.bitmask, i32x4.bitmask on arm.

Bug: v8:10308
Change-Id: Ifa2439522b74a310d98621104deda80f3dc25b33
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2101697
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#66909}
parent bbe51873
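For context: each bitmask instruction packs the sign bits of all lanes into a scalar, with lane i landing in bit i of the result. A minimal scalar sketch of that semantics (illustrative only; the helper name is made up, not code from this patch):

#include <cstdint>

// Scalar model of i32x4.bitmask: bit i of the result is the sign bit
// of lane i. i16x8 and i8x16 follow the same pattern with 8 and 16
// lanes. (Hypothetical reference helper, not V8 code.)
uint32_t I32x4BitMaskRef(const int32_t lanes[4]) {
  uint32_t result = 0;
  for (int i = 0; i < 4; ++i) {
    if (lanes[i] < 0) result |= uint32_t{1} << i;
  }
  return result;
}

// Example: lanes {INT32_MIN, 1, -1, 7} -> bits 0 and 2 set -> 5.

The NEON sequences below compute the same thing without a scalar loop: shift to spread each sign bit across its lane, AND with per-lane bit constants, then pairwise-add to reduce.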
@@ -2353,6 +2353,23 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ vabs(Neon32, i.OutputSimd128Register(), i.InputSimd128Register(0));
break;
}
case kArmI32x4BitMask: {
Register dst = i.OutputRegister();
Simd128Register src = i.InputSimd128Register(0);
Simd128Register tmp2 = i.TempSimd128Register(0);
Simd128Register mask = i.TempSimd128Register(1);
__ vshr(NeonS32, tmp2, src, 31);
// Set the i-th bit of each lane i. ANDing with tmp2 keeps bit i for
// the negative lanes (tmp2 is all ones there) and zeroes the rest.
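// For example (illustrative): src = {INT32_MIN, 1, -1, 7} shifts to
// {-1, 0, -1, 0}; ANDed with the mask {1, 2, 4, 8} loaded below this
// becomes {1, 0, 4, 0}, and the two vpadds sum the lanes to 5 (0b0101).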
__ vmov(mask.low(), Double((uint64_t)0x0000'0002'0000'0001));
__ vmov(mask.high(), Double((uint64_t)0x0000'0008'0000'0004));
__ vand(tmp2, mask, tmp2);
__ vpadd(Neon32, tmp2.low(), tmp2.low(), tmp2.high());
__ vpadd(Neon32, tmp2.low(), tmp2.low(), kDoubleRegZero);
__ VmovLow(dst, tmp2.low());
break;
}
case kArmI16x8Splat: {
__ vdup(Neon16, i.OutputSimd128Register(), i.InputRegister(0));
break;
@@ -2513,6 +2530,24 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ vabs(Neon16, i.OutputSimd128Register(), i.InputSimd128Register(0));
break;
}
case kArmI16x8BitMask: {
Register dst = i.OutputRegister();
Simd128Register src = i.InputSimd128Register(0);
Simd128Register tmp2 = i.TempSimd128Register(0);
Simd128Register mask = i.TempSimd128Register(1);
__ vshr(NeonS16, tmp2, src, 15);
// Set the i-th bit of each lane i. ANDing with tmp2 keeps bit i for
// the negative lanes (tmp2 is all ones there) and zeroes the rest.
__ vmov(mask.low(), Double((uint64_t)0x0008'0004'0002'0001));
__ vmov(mask.high(), Double((uint64_t)0x0080'0040'0020'0010));
__ vand(tmp2, mask, tmp2);
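// The three pairwise adds below fold the eight 16-bit lanes into lane
// 0; since every lane carries a distinct bit, the sum equals their OR.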
__ vpadd(Neon16, tmp2.low(), tmp2.low(), tmp2.high());
__ vpadd(Neon16, tmp2.low(), tmp2.low(), tmp2.low());
__ vpadd(Neon16, tmp2.low(), tmp2.low(), tmp2.low());
__ vmov(NeonU16, dst, tmp2.low(), 0);
break;
}
case kArmI8x16Splat: {
__ vdup(Neon8, i.OutputSimd128Register(), i.InputRegister(0));
break;
@@ -2649,6 +2684,26 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ vabs(Neon8, i.OutputSimd128Register(), i.InputSimd128Register(0));
break;
}
case kArmI8x16BitMask: {
Register dst = i.OutputRegister();
Simd128Register src = i.InputSimd128Register(0);
Simd128Register tmp2 = i.TempSimd128Register(0);
Simd128Register mask = i.TempSimd128Register(1);
__ vshr(NeonS8, tmp2, src, 7);
// Set the i-th bit of each lane i. ANDing with tmp2 keeps bit i for
// the negative lanes (tmp2 is all ones there) and zeroes the rest.
__ vmov(mask.low(), Double((uint64_t)0x8040'2010'0804'0201));
__ vmov(mask.high(), Double((uint64_t)0x8040'2010'0804'0201));
__ vand(tmp2, mask, tmp2);
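// vext rotates tmp2 by 8 bytes and vzip interleaves the halves, so each
// 16-bit lane of tmp2 becomes low_byte_i | (high_byte_i << 8). The
// 16-bit pairwise adds then accumulate lanes 0-7 into the low byte and
// lanes 8-15 into the high byte of the final value.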
__ vext(mask, tmp2, tmp2, 8);
__ vzip(Neon8, mask, tmp2);
__ vpadd(Neon16, tmp2.low(), tmp2.low(), tmp2.high());
__ vpadd(Neon16, tmp2.low(), tmp2.low(), tmp2.low());
__ vpadd(Neon16, tmp2.low(), tmp2.low(), tmp2.low());
__ vmov(NeonU16, dst, tmp2.low(), 0);
break;
}
case kArmS128Zero: {
__ veor(i.OutputSimd128Register(), i.OutputSimd128Register(),
i.OutputSimd128Register());
......
@@ -202,6 +202,7 @@ namespace compiler {
V(ArmI32x4GtU) \
V(ArmI32x4GeU) \
V(ArmI32x4Abs) \
V(ArmI32x4BitMask) \
V(ArmI16x8Splat) \
V(ArmI16x8ExtractLaneS) \
V(ArmI16x8ReplaceLane) \
@@ -236,6 +237,7 @@ namespace compiler {
V(ArmI16x8GeU) \
V(ArmI16x8RoundingAverageU) \
V(ArmI16x8Abs) \
V(ArmI16x8BitMask) \
V(ArmI8x16Splat) \
V(ArmI8x16ExtractLaneS) \
V(ArmI8x16ReplaceLane) \
@@ -265,6 +267,7 @@ namespace compiler {
V(ArmI8x16GeU) \
V(ArmI8x16RoundingAverageU) \
V(ArmI8x16Abs) \
V(ArmI8x16BitMask) \
V(ArmS128Zero) \
V(ArmS128Dup) \
V(ArmS128And) \
......
@@ -182,6 +182,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArmI32x4GtU:
case kArmI32x4GeU:
case kArmI32x4Abs:
case kArmI32x4BitMask:
case kArmI16x8Splat:
case kArmI16x8ExtractLaneS:
case kArmI16x8ReplaceLane:
@@ -216,6 +217,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArmI16x8GeU:
case kArmI16x8RoundingAverageU:
case kArmI16x8Abs:
case kArmI16x8BitMask:
case kArmI8x16Splat:
case kArmI8x16ExtractLaneS:
case kArmI8x16ReplaceLane:
@@ -245,6 +247,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArmI8x16GeU:
case kArmI8x16RoundingAverageU:
case kArmI8x16Abs:
case kArmI8x16BitMask:
case kArmS128Zero:
case kArmS128Dup:
case kArmS128And:
......
@@ -2915,6 +2915,29 @@ void InstructionSelector::VisitInt64AbsWithOverflow(Node* node) {
UNREACHABLE();
}
namespace {
template <ArchOpcode opcode>
void VisitBitMask(InstructionSelector* selector, Node* node) {
ArmOperandGenerator g(selector);
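// Two Q-register temps, matching the scratch (tmp2) and constant-mask
// registers used by the code generator cases above.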
InstructionOperand temps[] = {g.TempSimd128Register(),
g.TempSimd128Register()};
selector->Emit(opcode, g.DefineAsRegister(node),
g.UseRegister(node->InputAt(0)), arraysize(temps), temps);
}
} // namespace
void InstructionSelector::VisitI8x16BitMask(Node* node) {
VisitBitMask<kArmI8x16BitMask>(this, node);
}
void InstructionSelector::VisitI16x8BitMask(Node* node) {
VisitBitMask<kArmI16x8BitMask>(this, node);
}
void InstructionSelector::VisitI32x4BitMask(Node* node) {
VisitBitMask<kArmI32x4BitMask>(this, node);
}
// static
MachineOperatorBuilder::Flags
InstructionSelector::SupportedMachineOperatorFlags() {
......
@@ -2634,11 +2634,11 @@ void InstructionSelector::VisitI64x2MinU(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2MaxU(Node* node) { UNIMPLEMENTED(); }
#endif // !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_S390X
-#if !V8_TARGET_ARCH_ARM64
+#if !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM
void InstructionSelector::VisitI8x16BitMask(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI16x8BitMask(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI32x4BitMask(Node* node) { UNIMPLEMENTED(); }
-#endif // !V8_TARGET_ARCH_ARM64
+#endif // !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_ARM
void InstructionSelector::VisitFinishRegion(Node* node) { EmitIdentity(node); }
......
@@ -1661,7 +1661,7 @@ WASM_SIMD_TEST(I16x8ReplaceLane) {
}
}
-#if V8_TARGET_ARCH_ARM64
+#if V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM
WASM_SIMD_TEST_NO_LOWERING(I8x16BitMask) {
FLAG_SCOPE(wasm_simd_post_mvp);
WasmRunner<int32_t, int32_t> r(execution_tier, lower_simd);
@@ -1721,7 +1721,7 @@ WASM_SIMD_TEST_NO_LOWERING(I32x4BitMask) {
CHECK_EQ(actual, expected);
}
}
-#endif // V8_TARGET_ARCH_ARM64
+#endif // V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM
WASM_SIMD_TEST(I8x16Splat) {
WasmRunner<int32_t, int32_t> r(execution_tier, lower_simd);
......