Commit d22326bb authored by Ng Zhi An, committed by Commit Bot

[wasm-simd] Implement rounding average for arm

Bug: v8:10039
Change-Id: If7c9668821a1cdfd5968f1533c3412247567bf3e
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1955550
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Reviewed-by: Jakob Kummerow <jkummerow@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#65615}
parent 5eb4fada
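
For context: the ops wired up below (I16x8RoundingAverageU and I8x16RoundingAverageU, i.e. the Wasm SIMD avgr_u instructions) take the per-lane unsigned average of two vectors, rounding halves up, which NEON's vrhadd computes directly. A minimal scalar sketch of the lane semantics, mirroring what the base::RoundingAverageUnsigned calls in this diff compute (an illustration, not V8's exact helper):

#include <cstdint>
#include <type_traits>

// Per-lane semantics: avg_u(a, b) = (a + b + 1) >> 1, with halves rounded up.
template <typename T>
T RoundingAverageUnsigned(T a, T b) {
  static_assert(std::is_unsigned<T>::value, "defined for unsigned lanes only");
  // Widen to 64 bits so a + b + 1 cannot wrap for 8/16/32-bit lanes.
  return static_cast<T>((static_cast<uint64_t>(a) + b + 1) >> 1);
}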
@@ -4143,7 +4143,8 @@ enum IntegerBinOp {
VTST,
VCEQ,
VCGE,
VCGT,
VRHADD
};
static Instr EncodeNeonBinOp(IntegerBinOp op, NeonDataType dt,
@@ -4184,6 +4185,9 @@ static Instr EncodeNeonBinOp(IntegerBinOp op, NeonDataType dt,
case VCGT:
op_encoding = 0x3 * B8;
break;
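// Per ARM DDI 0406C.b, VRHADD's fixed opcode field (bits 11..8) is 0b0001,
// so the only opcode bit to set is B8.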
case VRHADD:
op_encoding = B8;
break;
default:
UNREACHABLE();
}
@@ -4583,6 +4587,14 @@ void Assembler::vcgt(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src1,
emit(EncodeNeonBinOp(VCGT, dt, dst, src1, src2));
}
void Assembler::vrhadd(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src1,
QwNeonRegister src2) {
DCHECK(IsEnabled(NEON));
// Qd = vrhadd(Qn, Qm) SIMD integer rounding halving add.
// Instruction details available in ARM DDI 0406C.b, A8-1030.
emit(EncodeNeonBinOp(VRHADD, dt, dst, src1, src2));
}
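
The new assembler entry point accepts all three unsigned lane sizes, although the Wasm ops only need u8 and u16. A usage sketch matching the disassembler expectations further down (hypothetical call sites, same shape as the codegen cases below):

// assembler.vrhadd(NeonU8,  q0,  q1, q2);  // emits vrhadd.u8  q0,  q1, q2
// assembler.vrhadd(NeonU16, q1,  q2, q8);  // emits vrhadd.u16 q1,  q2, q8
// assembler.vrhadd(NeonU32, q15, q0, q8);  // emits vrhadd.u32 q15, q0, q8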
void Assembler::vext(QwNeonRegister dst, QwNeonRegister src1,
QwNeonRegister src2, int bytes) {
DCHECK(IsEnabled(NEON));
......
@@ -925,6 +925,8 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void vcgt(QwNeonRegister dst, QwNeonRegister src1, QwNeonRegister src2);
void vcgt(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src1,
QwNeonRegister src2);
void vrhadd(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src1,
QwNeonRegister src2);
void vext(QwNeonRegister dst, QwNeonRegister src1, QwNeonRegister src2,
int bytes);
void vzip(NeonSize size, DwVfpRegister src1, DwVfpRegister src2);
......
@@ -2465,6 +2465,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputSimd128Register(1));
break;
}
case kArmI16x8RoundingAverageU: {
__ vrhadd(NeonU16, i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1));
break;
}
case kArmI8x16Splat: {
__ vdup(Neon8, i.OutputSimd128Register(), i.InputRegister(0));
break;
@@ -2612,6 +2617,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputSimd128Register(1));
break;
}
case kArmI8x16RoundingAverageU: {
__ vrhadd(NeonU8, i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1));
break;
}
case kArmS128Zero: {
__ veor(i.OutputSimd128Register(), i.OutputSimd128Register(),
i.OutputSimd128Register());
......
@@ -232,6 +232,7 @@ namespace compiler {
V(ArmI16x8MaxU) \
V(ArmI16x8GtU) \
V(ArmI16x8GeU) \
V(ArmI16x8RoundingAverageU) \
V(ArmI8x16Splat) \
V(ArmI8x16ExtractLaneS) \
V(ArmI8x16ReplaceLane) \
@@ -259,6 +260,7 @@ namespace compiler {
V(ArmI8x16MaxU) \
V(ArmI8x16GtU) \
V(ArmI8x16GeU) \
V(ArmI8x16RoundingAverageU) \
V(ArmS128Zero) \
V(ArmS128Dup) \
V(ArmS128And) \
......
@@ -212,6 +212,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArmI16x8MaxU:
case kArmI16x8GtU:
case kArmI16x8GeU:
case kArmI16x8RoundingAverageU:
case kArmI8x16Splat:
case kArmI8x16ExtractLaneS:
case kArmI8x16ReplaceLane:
@@ -239,6 +240,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArmI8x16MaxU:
case kArmI8x16GtU:
case kArmI8x16GeU:
case kArmI8x16RoundingAverageU:
case kArmS128Zero:
case kArmS128Dup:
case kArmS128And:
......
@@ -2491,84 +2491,86 @@ void InstructionSelector::VisitWord32AtomicPairCompareExchange(Node* node) {
V(I8x16ShrS) \
V(I8x16ShrU)
#define SIMD_BINOP_LIST(V) \
V(F64x2Add, kArmF64x2Add) \
V(F64x2Sub, kArmF64x2Sub) \
V(F64x2Mul, kArmF64x2Mul) \
V(F64x2Div, kArmF64x2Div) \
V(F64x2Min, kArmF64x2Min) \
V(F64x2Max, kArmF64x2Max) \
V(F64x2Eq, kArmF64x2Eq) \
V(F64x2Ne, kArmF64x2Ne) \
V(F64x2Lt, kArmF64x2Lt) \
V(F64x2Le, kArmF64x2Le) \
V(F32x4Add, kArmF32x4Add) \
V(F32x4AddHoriz, kArmF32x4AddHoriz) \
V(F32x4Sub, kArmF32x4Sub) \
V(F32x4Mul, kArmF32x4Mul) \
V(F32x4Min, kArmF32x4Min) \
V(F32x4Max, kArmF32x4Max) \
V(F32x4Eq, kArmF32x4Eq) \
V(F32x4Ne, kArmF32x4Ne) \
V(F32x4Lt, kArmF32x4Lt) \
V(F32x4Le, kArmF32x4Le) \
V(I64x2Add, kArmI64x2Add) \
V(I64x2Sub, kArmI64x2Sub) \
V(I32x4Add, kArmI32x4Add) \
V(I32x4AddHoriz, kArmI32x4AddHoriz) \
V(I32x4Sub, kArmI32x4Sub) \
V(I32x4Mul, kArmI32x4Mul) \
V(I32x4MinS, kArmI32x4MinS) \
V(I32x4MaxS, kArmI32x4MaxS) \
V(I32x4Eq, kArmI32x4Eq) \
V(I32x4Ne, kArmI32x4Ne) \
V(I32x4GtS, kArmI32x4GtS) \
V(I32x4GeS, kArmI32x4GeS) \
V(I32x4MinU, kArmI32x4MinU) \
V(I32x4MaxU, kArmI32x4MaxU) \
V(I32x4GtU, kArmI32x4GtU) \
V(I32x4GeU, kArmI32x4GeU) \
V(I16x8SConvertI32x4, kArmI16x8SConvertI32x4) \
V(I16x8Add, kArmI16x8Add) \
V(I16x8AddSaturateS, kArmI16x8AddSaturateS) \
V(I16x8AddHoriz, kArmI16x8AddHoriz) \
V(I16x8Sub, kArmI16x8Sub) \
V(I16x8SubSaturateS, kArmI16x8SubSaturateS) \
V(I16x8Mul, kArmI16x8Mul) \
V(I16x8MinS, kArmI16x8MinS) \
V(I16x8MaxS, kArmI16x8MaxS) \
V(I16x8Eq, kArmI16x8Eq) \
V(I16x8Ne, kArmI16x8Ne) \
V(I16x8GtS, kArmI16x8GtS) \
V(I16x8GeS, kArmI16x8GeS) \
V(I16x8UConvertI32x4, kArmI16x8UConvertI32x4) \
V(I16x8AddSaturateU, kArmI16x8AddSaturateU) \
V(I16x8SubSaturateU, kArmI16x8SubSaturateU) \
V(I16x8MinU, kArmI16x8MinU) \
V(I16x8MaxU, kArmI16x8MaxU) \
V(I16x8GtU, kArmI16x8GtU) \
V(I16x8GeU, kArmI16x8GeU) \
V(I16x8RoundingAverageU, kArmI16x8RoundingAverageU) \
V(I8x16SConvertI16x8, kArmI8x16SConvertI16x8) \
V(I8x16Add, kArmI8x16Add) \
V(I8x16AddSaturateS, kArmI8x16AddSaturateS) \
V(I8x16Sub, kArmI8x16Sub) \
V(I8x16SubSaturateS, kArmI8x16SubSaturateS) \
V(I8x16Mul, kArmI8x16Mul) \
V(I8x16MinS, kArmI8x16MinS) \
V(I8x16MaxS, kArmI8x16MaxS) \
V(I8x16Eq, kArmI8x16Eq) \
V(I8x16Ne, kArmI8x16Ne) \
V(I8x16GtS, kArmI8x16GtS) \
V(I8x16GeS, kArmI8x16GeS) \
V(I8x16UConvertI16x8, kArmI8x16UConvertI16x8) \
V(I8x16AddSaturateU, kArmI8x16AddSaturateU) \
V(I8x16SubSaturateU, kArmI8x16SubSaturateU) \
V(I8x16MinU, kArmI8x16MinU) \
V(I8x16MaxU, kArmI8x16MaxU) \
V(I8x16GtU, kArmI8x16GtU) \
V(I8x16GeU, kArmI8x16GeU) \
V(I8x16RoundingAverageU, kArmI8x16RoundingAverageU) \
V(S128And, kArmS128And) \
V(S128Or, kArmS128Or) \
V(S128Xor, kArmS128Xor)
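
For readers new to these X-macro lists: each (Wasm op, ARM opcode) pair above is stamped into an InstructionSelector::Visit* method, so the two new RoundingAverageU entries need no per-op selector code. A sketch of the consuming macro, assuming the usual V8 pattern (names are illustrative):

#define SIMD_VISIT_BINOP(Name, instruction)           \
  void InstructionSelector::Visit##Name(Node* node) { \
    VisitRRR(this, instruction, node);                \
  }
SIMD_BINOP_LIST(SIMD_VISIT_BINOP)
#undef SIMD_VISIT_BINOP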
void InstructionSelector::VisitS128Zero(Node* node) {
......
@@ -2635,13 +2635,13 @@ void InstructionSelector::VisitF64x2UConvertI64x2(Node* node) {
UNIMPLEMENTED();
}
#if !V8_TARGET_ARCH_ARM64
#if !V8_TARGET_ARCH_ARM
void InstructionSelector::VisitI16x8RoundingAverageU(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitI8x16RoundingAverageU(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitLoadTransform(Node* node) { UNIMPLEMENTED(); }
#endif // !V8_TARGET_ARCH_ARM
#if !V8_TARGET_ARCH_IA32
......
@@ -2089,6 +2089,16 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"veor q%d, q%d, q%d", Vd, Vn, Vm);
}
} else if (instr->Bit(4) == 0) {
if (instr->Bit(6) == 1) {
// vrhadd.u<size> Qd, Qm, Qn.
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_,
"vrhadd.u%d q%d, q%d, q%d", size, Vd, Vn, Vm);
} else {
// vrhadd.u<size> Dd, Dm, Dn.
Unknown(instr);
}
} else {
Unknown(instr);
}
......
@@ -4225,6 +4225,20 @@ void PairwiseAdd(Simulator* simulator, int Vd, int Vm, int Vn) {
simulator->set_neon_register<T, kDoubleSize>(Vd, dst);
}
template <typename T, int SIZE = kSimd128Size>
void RoundingAverageUnsigned(Simulator* simulator, int Vd, int Vm, int Vn) {
static_assert(std::is_unsigned<T>::value,
"Implemented only for unsigned types.");
static const int kElems = SIZE / sizeof(T);
T src1[kElems], src2[kElems];
simulator->get_neon_register<T, SIZE>(Vn, src1);
simulator->get_neon_register<T, SIZE>(Vm, src2);
for (int i = 0; i < kElems; i++) {
src1[i] = base::RoundingAverageUnsigned(src1[i], src2[i]);
}
simulator->set_neon_register<T, SIZE>(Vd, src1);
}
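
A few concrete lane values make the rounding behavior of the helper above explicit (standalone sketch, plain C++):

// Halves round up, and the widening in the helper prevents wraparound.
constexpr uint32_t AvgRoundU(uint32_t a, uint32_t b) { return (a + b + 1) >> 1; }
static_assert(AvgRoundU(1, 2) == 2, "0.5 rounds up");
static_assert(AvgRoundU(255, 1) == 128, "no 8-bit wraparound");
static_assert(AvgRoundU(255, 255) == 255, "max is preserved");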
void Simulator::DecodeSpecialCondition(Instruction* instr) {
switch (instr->SpecialValue()) {
case 4: {
@@ -4721,6 +4735,27 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
for (int i = 0; i < 4; i++) src1[i] ^= src2[i];
set_neon_register(Vd, src1);
}
} else if (instr->Bit(4) == 0) {
if (instr->Bit(6) == 0) {
// vrhadd.u<size> Dd, Dm, Dn.
UNIMPLEMENTED();
}
// vrhadd.u<size> Qd, Qm, Qn.
NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20));
switch (size) {
case Neon8:
RoundingAverageUnsigned<uint8_t>(this, Vd, Vm, Vn);
break;
case Neon16:
RoundingAverageUnsigned<uint16_t>(this, Vd, Vm, Vn);
break;
case Neon32:
RoundingAverageUnsigned<uint32_t>(this, Vd, Vm, Vn);
break;
default:
UNREACHABLE();
break;
}
} else {
UNIMPLEMENTED();
}
......
@@ -1233,6 +1233,12 @@ TEST(Neon) {
"f3142360 vcgt.u16 q1, q2, q8");
COMPARE(vcgt(NeonS32, q15, q0, q8),
"f260e360 vcgt.s32 q15, q0, q8");
COMPARE(vrhadd(NeonU8, q0, q1, q2),
"f3020144 vrhadd.u8 q0, q1, q2");
COMPARE(vrhadd(NeonU16, q1, q2, q8),
"f3142160 vrhadd.u16 q1, q2, q8");
COMPARE(vrhadd(NeonU32, q15, q0, q8),
"f360e160 vrhadd.u32 q15, q0, q8");
COMPARE(vbsl(q0, q1, q2),
"f3120154 vbsl q0, q1, q2");
COMPARE(vbsl(q15, q0, q8),
......
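
The expected machine words above can be cross-checked against the encoding and decoding choices earlier in the diff: opcode nibble 0b0001 (bit 8 set), bit 4 clear, and Q (bit 6) set for the quadword form. A small standalone check, for illustration only:

#include <cassert>
#include <cstdint>

int main() {
  // Expected vrhadd encodings from the COMPARE lines above.
  const uint32_t kVrhaddEncodings[] = {0xf3020144, 0xf3142160, 0xf360e160};
  for (uint32_t instr : kVrhaddEncodings) {
    assert(((instr >> 8) & 0xF) == 0x1);  // opcode nibble 0b0001, i.e. B8
    assert(((instr >> 4) & 0x1) == 0);    // bit 4 clear, per the decoder
    assert(((instr >> 6) & 0x1) == 1);    // Q = 1: 128-bit (q-register) form
  }
  return 0;
}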
@@ -2181,13 +2181,13 @@ WASM_SIMD_TEST(I16x8LeU) {
UnsignedLessEqual);
}
#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM
WASM_SIMD_TEST_NO_LOWERING(I16x8RoundingAverageU) {
RunI16x8BinOpTest<uint16_t>(execution_tier, lower_simd,
kExprI16x8RoundingAverageU,
base::RoundingAverageUnsigned);
}
#endif // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM
void RunI16x8ShiftOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
WasmOpcode opcode, Int16ShiftOp expected_op) {
@@ -2407,13 +2407,13 @@ WASM_SIMD_TEST(I8x16Mul) {
base::MulWithWraparound);
}
#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM
WASM_SIMD_TEST_NO_LOWERING(I8x16RoundingAverageU) {
RunI8x16BinOpTest<uint8_t>(execution_tier, lower_simd,
kExprI8x16RoundingAverageU,
base::RoundingAverageUnsigned);
}
#endif // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM
void RunI8x16ShiftOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
WasmOpcode opcode, Int8ShiftOp expected_op) {
......