Commit 9ff2de44 authored by Ng Zhi An, committed by Commit Bot

[wasm-simd] Implement i64x2.mul on arm

Bug: v8:9813
Change-Id: I0436c6a90284559a110e99476c12ae39183c961e
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1994382
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Reviewed-by: Jakob Kummerow <jkummerow@chromium.org>
Cr-Commit-Position: refs/heads/master@{#65846}
parent 73c9a994
@@ -4267,6 +4267,25 @@ void Assembler::vqsub(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src1,
emit(EncodeNeonBinOp(VQSUB, dt, dst, src1, src2));
}
void Assembler::vmlal(NeonDataType dt, QwNeonRegister dst, DwVfpRegister src1,
DwVfpRegister src2) {
DCHECK(IsEnabled(NEON));
// Qd = vmlal(Dn, Dm) Vector Multiply Accumulate Long (integer)
// Instruction details available in ARM DDI 0406C.b, A8-931.
int vd, d;
dst.split_code(&vd, &d);
int vn, n;
src1.split_code(&vn, &n);
int vm, m;
src2.split_code(&vm, &m);
int size = NeonSz(dt);
int u = NeonU(dt);
if (!u) UNIMPLEMENTED();
DCHECK_NE(size, 3); // SEE "Related encodings"
emit(0xFU * B28 | B25 | u * B24 | B23 | d * B22 | size * B20 | vn * B16 |
vd * B12 | 0x8 * B8 | n * B7 | m * B5 | vm);
}
void Assembler::vmul(QwNeonRegister dst, QwNeonRegister src1,
QwNeonRegister src2) {
DCHECK(IsEnabled(NEON));
@@ -4283,6 +4302,24 @@ void Assembler::vmul(NeonSize size, QwNeonRegister dst, QwNeonRegister src1,
emit(EncodeNeonBinOp(VMUL, size, dst, src1, src2));
}
void Assembler::vmull(NeonDataType dt, QwNeonRegister dst, DwVfpRegister src1,
DwVfpRegister src2) {
DCHECK(IsEnabled(NEON));
// Qd = vmull(Dn, Dm) Vector Multiply Long (integer).
// Instruction details available in ARM DDI 0406C.b, A8-960.
int vd, d;
dst.split_code(&vd, &d);
int vn, n;
src1.split_code(&vn, &n);
int vm, m;
src2.split_code(&vm, &m);
int size = NeonSz(dt);
int u = NeonU(dt);
if (!u) UNIMPLEMENTED();
emit(0xFU * B28 | B25 | u * B24 | B23 | d * B22 | size * B20 | vn * B16 |
vd * B12 | 0xC * B8 | n * B7 | m * B5 | vm);
}
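// Reference semantics for the two encodings above with NeonU32 operands (an
// illustrative sketch based on ARM DDI 0406C.b; it mirrors the simulator
// code added further down): Dn and Dm each hold two 32-bit lanes, Qd holds
// two 64-bit lanes.
//
//   vmull.u32 Qd, Dn, Dm   =>   Qd[i]  = Dn[i] * Dm[i]   for i = 0, 1
//   vmlal.u32 Qd, Dn, Dm   =>   Qd[i] += Dn[i] * Dm[i]   for i = 0, 1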
void Assembler::vmin(QwNeonRegister dst, QwNeonRegister src1,
QwNeonRegister src2) {
DCHECK(IsEnabled(NEON));
...
@@ -888,9 +888,13 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
QwNeonRegister src2);
void vqsub(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src1,
QwNeonRegister src2);
void vmlal(NeonDataType size, QwNeonRegister dst, DwVfpRegister src1,
DwVfpRegister src2);
void vmul(QwNeonRegister dst, QwNeonRegister src1, QwNeonRegister src2);
void vmul(NeonSize size, QwNeonRegister dst, QwNeonRegister src1,
QwNeonRegister src2);
void vmull(NeonDataType size, QwNeonRegister dst, DwVfpRegister src1,
DwVfpRegister src2);
void vmin(QwNeonRegister dst, QwNeonRegister src1, QwNeonRegister src2);
void vmin(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src1,
QwNeonRegister src2);
...
@@ -1975,6 +1975,51 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputSimd128Register(1));
break;
}
case kArmI64x2Mul: {
QwNeonRegister dst = i.OutputSimd128Register();
QwNeonRegister left = i.InputSimd128Register(0);
QwNeonRegister right = i.InputSimd128Register(1);
QwNeonRegister tmp1 = i.TempSimd128Register(0);
QwNeonRegister tmp2 = i.TempSimd128Register(1);
// This algorithm uses vector operations to perform 64-bit integer
// multiplication, splitting each 64-bit integer into its high and low
// 32-bit halves. The tricky part is getting the low and high halves into
// the right lanes of a NEON register, so that we need as few vmull and
// vmlal instructions as possible.
// Move left and right into temporaries; they will be modified by vtrn.
__ vmov(tmp1, left);
__ vmov(tmp2, right);
// This diagram shows how the 64-bit integers fit into NEON registers.
//
// [q.high()| q.low()]
// left/tmp1: [ a3, a2 | a1, a0 ]
// right/tmp2: [ b3, b2 | b1, b0 ]
//
// We want to multiply the low 32 bits of left with the high 32 bits of
// right, for each lane, i.e. a2 * b3, a0 * b1. However, vmull takes two
// input d registers and multiplies the corresponding 32-bit lanes,
// producing 64-bit results: a1 * b1, a0 * b0. To make this work we
// transpose the vectors, so that the low 32 bits of each 64-bit integer
// end up in the same lane, and similarly for the high 32 bits.
__ vtrn(Neon32, tmp1.low(), tmp1.high());
// tmp1: [ a3, a1 | a2, a0 ]
__ vtrn(Neon32, tmp2.low(), tmp2.high());
// tmp2: [ b3, b1 | b2, b0 ]
__ vmull(NeonU32, dst, tmp1.low(), tmp2.high());
// dst: [ a2*b3 | a0*b1 ]
__ vmlal(NeonU32, dst, tmp1.high(), tmp2.low());
// dst: [ a2*b3 + a3*b2 | a0*b1 + a1*b0 ]
__ vshl(NeonU64, dst, dst, 32);
// dst: [ (a2*b3 + a3*b2) << 32 | (a0*b1 + a1*b0) << 32 ]
__ vmlal(NeonU32, dst, tmp1.low(), tmp2.low());
// dst: [ (a2*b3 + a3*b2)<<32 + (a2*b2) | (a0*b1 + a1*b0)<<32 + (a0*b0) ]
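// For reference, a scalar sketch of the identity the sequence above relies
// on (illustrative only, not part of the generated code; the a_hi * b_hi
// term is omitted because it only contributes to bits >= 64):
//
//   uint64_t I64MulReference(uint64_t a, uint64_t b) {
//     uint64_t a_lo = a & 0xFFFFFFFF, a_hi = a >> 32;
//     uint64_t b_lo = b & 0xFFFFFFFF, b_hi = b >> 32;
//     return ((a_lo * b_hi + a_hi * b_lo) << 32) + a_lo * b_lo;
//   }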
break;
}
case kArmI64x2Neg: {
Simd128Register dst = i.OutputSimd128Register();
__ vmov(dst, static_cast<uint64_t>(0));
...
@@ -172,6 +172,7 @@ namespace compiler {
V(ArmI64x2ShrS) \
V(ArmI64x2Add) \
V(ArmI64x2Sub) \
V(ArmI64x2Mul) \
V(ArmI64x2ShrU) \
V(ArmI32x4Splat) \
V(ArmI32x4ExtractLane) \
...
@@ -152,6 +152,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArmI64x2ShrS:
case kArmI64x2Add:
case kArmI64x2Sub:
case kArmI64x2Mul:
case kArmI64x2ShrU:
case kArmI32x4Splat:
case kArmI32x4ExtractLane:
...
@@ -2656,6 +2656,15 @@ void InstructionSelector::VisitI64x2Neg(Node* node) {
g.UseUniqueRegister(node->InputAt(0)));
}
void InstructionSelector::VisitI64x2Mul(Node* node) {
ArmOperandGenerator g(this);
InstructionOperand temps[] = {g.TempSimd128Register(),
g.TempSimd128Register()};
Emit(kArmI64x2Mul, g.DefineAsRegister(node),
g.UseUniqueRegister(node->InputAt(0)),
g.UseUniqueRegister(node->InputAt(1)), arraysize(temps), temps);
}
void InstructionSelector::VisitF32x4Sqrt(Node* node) {
ArmOperandGenerator g(this);
// Use fixed registers in the lower 8 Q-registers so we can directly access
...
@@ -2635,9 +2635,6 @@ void InstructionSelector::VisitF64x2UConvertI64x2(Node* node) {
#if !V8_TARGET_ARCH_ARM
void InstructionSelector::VisitS128AndNot(Node* node) { UNIMPLEMENTED(); }
#endif // !V8_TARGET_ARCH_ARM
#if !V8_TARGET_ARCH_IA32
void InstructionSelector::VisitI64x2Mul(Node* node) { UNIMPLEMENTED(); }
#endif // !V8_TARGET_ARCH_IA32
void InstructionSelector::VisitI64x2Splat(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2ExtractLane(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2ReplaceLane(Node* node) { UNIMPLEMENTED(); }
...
@@ -2395,6 +2395,26 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vs%ci.%d d%d, d%d, #%d",
direction, size, Vd, Vm, shift);
} else if (instr->Bits(11, 8) == 0x8 && instr->Bit(6) == 0 &&
instr->Bit(4) == 0) {
// vmlal.u<size> <Qd>, <Dn>, <Dm>
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kDoublePrecision);
int Vm = instr->VFPMRegValue(kDoublePrecision);
int size = 8 << instr->Bits(21, 20);
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vmlal.u%d q%d, d%d, d%d",
size, Vd, Vn, Vm);
} else if (instr->Bits(11, 8) == 0xC && instr->Bit(6) == 0 &&
instr->Bit(4) == 0) {
// vmull.u<size> <Qd>, <Dn>, <Dm>
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kDoublePrecision);
int Vm = instr->VFPMRegValue(kDoublePrecision);
int size = 8 << instr->Bits(21, 20);
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vmull.u%d q%d, d%d, d%d",
size, Vd, Vn, Vm);
} else {
Unknown(instr);
}
...
@@ -5519,6 +5519,39 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
UNREACHABLE();
break;
}
} else if (instr->Bits(11, 8) == 0x8 && instr->Bit(6) == 0 &&
instr->Bit(4) == 0) {
// vmlal.u<size> Qd, Dn, Dm
NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20));
if (size != Neon32) UNIMPLEMENTED();
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kDoublePrecision);
int Vm = instr->VFPMRegValue(kDoublePrecision);
uint64_t src1, src2, dst[2];
get_neon_register<uint64_t>(Vd, dst);
get_d_register(Vn, &src1);
get_d_register(Vm, &src2);
dst[0] += (src1 & 0xFFFFFFFFULL) * (src2 & 0xFFFFFFFFULL);
dst[1] += (src1 >> 32) * (src2 >> 32);
set_neon_register<uint64_t>(Vd, dst);
} else if (instr->Bits(11, 8) == 0xC && instr->Bit(6) == 0 &&
instr->Bit(4) == 0) {
// vmull.u<size> Qd, Dn, Dm
NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20));
if (size != Neon32) UNIMPLEMENTED();
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kDoublePrecision);
int Vm = instr->VFPMRegValue(kDoublePrecision);
uint64_t src1, src2, dst[2];
get_d_register(Vn, &src1);
get_d_register(Vm, &src2);
dst[0] = (src1 & 0xFFFFFFFFULL) * (src2 & 0xFFFFFFFFULL);
dst[1] = (src1 >> 32) * (src2 >> 32);
set_neon_register<uint64_t>(Vd, dst);
} else {
UNIMPLEMENTED();
}
...
@@ -1171,6 +1171,12 @@ TEST(Neon) {
"f2142970 vmul.i16 q1, q2, q8");
COMPARE(vmul(Neon32, q15, q0, q8),
"f260e970 vmul.i32 q15, q0, q8");
COMPARE(vmull(NeonU32, q15, d0, d8),
"f3e0ec08 vmull.u32 q15, d0, d8");
COMPARE(vmlal(NeonU32, q15, d0, d8),
"f3e0e808 vmlal.u32 q15, d0, d8");
COMPARE(vshl(NeonS8, q15, q0, 6),
"f2cee550 vshl.i8 q15, q0, #6");
COMPARE(vshl(NeonU16, q15, q0, 10),
...
@@ -1454,7 +1454,6 @@ WASM_SIMD_TEST_NO_LOWERING(F64x2Max) {
RunF64x2BinOpTest(execution_tier, lower_simd, kExprF64x2Max, JSMax);
}
#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_IA32
WASM_SIMD_TEST_NO_LOWERING(I64x2Mul) {
RunI64x2BinOpTest(execution_tier, lower_simd, kExprI64x2Mul,
base::MulWithWraparound);
@@ -1528,10 +1527,7 @@ WASM_SIMD_TEST_NO_LOWERING(F64x2Qfms) {
}
}
}
#endif // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64
#endif // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_IA32
#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64
WASM_SIMD_TEST_NO_LOWERING(F64x2ConvertI64x2) {
WasmRunner<int32_t, int64_t> r(execution_tier, lower_simd);
// Create two output vectors to hold signed and unsigned results.
...