Commit b7ade853 authored by Joey Gouly, committed by Commit Bot

[arm64][wasm] Implement I64x2 multiply

Implement I64x2 multiply using 32-bit multiplies.

This approach takes two fewer cycles (0.88x) on Cortex-A53 and three fewer cycles (0.86x)
on Cortex-A72 than moving the values to general-purpose registers and doing two 64-bit multiplies.

Based on a patch by Zhi An Ng.
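
For illustration, a minimal scalar sketch (hypothetical code, not part of this
commit; Mul64Via32 is a made-up name) of the identity the vector sequence
relies on: writing x = a + 2^32(b) and y = c + 2^32(d), the low 64 bits of
x * y are ac + 2^32(ad + bc), and only the low 32 bits of (ad + bc) matter.

// Minimal scalar sketch of the decomposition (illustrative only).
#include <cassert>
#include <cstdint>

uint64_t Mul64Via32(uint64_t x, uint64_t y) {
  uint32_t a = static_cast<uint32_t>(x);        // low half of x
  uint32_t b = static_cast<uint32_t>(x >> 32);  // high half of x
  uint32_t c = static_cast<uint32_t>(y);        // low half of y
  uint32_t d = static_cast<uint32_t>(y >> 32);  // high half of y
  uint64_t ac = static_cast<uint64_t>(a) * c;
  // Only the low 32 bits of (ad + bc) survive: the term is shifted up by
  // 32, so its upper bits fall off the top of the 64-bit result.
  uint32_t ad_bc = a * d + b * c;
  return ac + (static_cast<uint64_t>(ad_bc) << 32);
}

int main() {
  // Unsigned 64-bit multiplication wraps modulo 2^64, matching wasm
  // i64x2.mul lane semantics.
  assert(Mul64Via32(0x123456789abcdef0ull, 0xfedcba9876543210ull) ==
         0x123456789abcdef0ull * 0xfedcba9876543210ull);
  return 0;
}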

Bug: v8:8460
Change-Id: I9c8d3bb77f0d751eec2d85823522558b7f173628
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1781696
Commit-Queue: Martyn Capewell <martyn.capewell@arm.com>
Reviewed-by: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Cr-Commit-Position: refs/heads/master@{#63558}
parent a14e2f12
...@@ -1903,6 +1903,65 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
SIMD_BINOP_CASE(kArm64I64x2Add, Add, 2D);
SIMD_BINOP_CASE(kArm64I64x2Sub, Sub, 2D);
case kArm64I64x2Mul: {
UseScratchRegisterScope scope(tasm());
VRegister dst = i.OutputSimd128Register();
VRegister src1 = i.InputSimd128Register(0);
VRegister src2 = i.InputSimd128Register(1);
VRegister tmp1 = scope.AcquireSameSizeAs(dst);
VRegister tmp2 = scope.AcquireSameSizeAs(dst);
VRegister tmp3 = i.ToSimd128Register(instr->TempAt(0));
// This 2x64-bit multiplication is performed with several 32-bit
// multiplications.
// 64-bit numbers x and y can be represented as:
// x = a + 2^32(b)
// y = c + 2^32(d)
// A 64-bit multiplication is:
// x * y = ac + 2^32(ad + bc) + 2^64(bd)
// note: the `2^64(bd)` term can be ignored, as it lies entirely above
// the low 64 bits of the result.
// This sequence implements a 2x64-bit multiply, where the registers
// `src1` and `src2` are split up into 32-bit components:
// src1 = |d|c|b|a|
// src2 = |h|g|f|e|
//
// src1 * src2 = |cg + 2^32(ch + dg)|ae + 2^32(af + be)|
// Reverse the 32-bit elements in the 64-bit words.
// tmp2 = |g|h|e|f|
__ Rev64(tmp2.V4S(), src2.V4S());
// Calculate the high half components.
// tmp2 = |dg|ch|be|af|
__ Mul(tmp2.V4S(), tmp2.V4S(), src1.V4S());
// Extract the low half components of src1.
// tmp1 = |c|a|
__ Xtn(tmp1.V2S(), src1.V2D());
// Sum the respective high half components.
// tmp2 = |dg+ch|be+af|dg+ch|be+af|
__ Addp(tmp2.V4S(), tmp2.V4S(), tmp2.V4S());
// Extract the low half components of src2.
// tmp3 = |g|e|
__ Xtn(tmp3.V2S(), src2.V2D());
// Shift the high half components into the high half.
// dst = |(dg+ch) << 32|(be+af) << 32|
__ Shll(dst.V2D(), tmp2.V2S(), 32);
// Multiply the low components together, and accumulate with the high
// half.
// dst = |dst[1] + cg|dst[0] + ae|
__ Umlal(dst.V2D(), tmp3.V2S(), tmp1.V2S());
break;
}
SIMD_BINOP_CASE(kArm64I64x2Eq, Cmeq, 2D);
case kArm64I64x2Ne: {
VRegister dst = i.OutputSimd128Register().V2D();
......
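
Aside: a hypothetical lane-by-lane scalar simulation of the sequence above
(illustrative only; I64x2MulSim is a made-up name). Lanes are stored
low-to-high, so for src1 = |d|c|b|a| we have src1[0] = a through src1[3] = d.

// Hypothetical scalar simulation of the Rev64/Mul/Xtn/Addp/Shll/Umlal
// sequence (illustrative only, not V8 code).
#include <cstdint>

void I64x2MulSim(const uint32_t src1[4], const uint32_t src2[4],
                 uint64_t dst[2]) {
  // Rev64: swap the 32-bit elements within each 64-bit word of src2.
  // |h|g|f|e| -> |g|h|e|f|
  uint32_t tmp2[4] = {src2[1], src2[0], src2[3], src2[2]};
  // Mul (4S): elementwise 32-bit multiply with src1 -> |dg|ch|be|af|.
  for (int i = 0; i < 4; ++i) tmp2[i] *= src1[i];
  // Xtn: narrow each 64-bit word to its low 32 bits.
  uint32_t tmp1[2] = {src1[0], src1[2]};  // |c|a|
  uint32_t tmp3[2] = {src2[0], src2[2]};  // |g|e|
  // Addp: pairwise add; the Shll step consumes only the low two sums.
  uint32_t hi[2] = {tmp2[0] + tmp2[1], tmp2[2] + tmp2[3]};  // |dg+ch|be+af|
  // Shll widens the sums and shifts them into the high half; Umlal then
  // accumulates the widening product of the low 32-bit halves.
  dst[0] = (static_cast<uint64_t>(hi[0]) << 32) +
           static_cast<uint64_t>(tmp3[0]) * tmp1[0];  // ae + 2^32(af+be)
  dst[1] = (static_cast<uint64_t>(hi[1]) << 32) +
           static_cast<uint64_t>(tmp3[1]) * tmp1[1];  // cg + 2^32(ch+dg)
}

Each dst lane is the corresponding 64-bit lane product modulo 2^64, the
wraparound behaviour the I64x2Mul test below exercises via
base::MulWithWraparound.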
...@@ -213,6 +213,7 @@ namespace compiler {
V(Arm64I64x2ShrS) \
V(Arm64I64x2Add) \
V(Arm64I64x2Sub) \
V(Arm64I64x2Mul) \
V(Arm64I64x2Eq) \
V(Arm64I64x2Ne) \
V(Arm64I64x2GtS) \
......
...@@ -180,6 +180,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArm64I64x2ShrS:
case kArm64I64x2Add:
case kArm64I64x2Sub:
case kArm64I64x2Mul:
case kArm64I64x2Eq:
case kArm64I64x2Ne:
case kArm64I64x2GtS:
......
...@@ -3236,6 +3236,14 @@ SIMD_BINOP_LIST(SIMD_VISIT_BINOP)
#undef SIMD_VISIT_BINOP
#undef SIMD_BINOP_LIST
void InstructionSelector::VisitI64x2Mul(Node* node) {
Arm64OperandGenerator g(this);
InstructionOperand temps[] = {g.TempSimd128Register()};
Emit(kArm64I64x2Mul, g.DefineAsRegister(node),
g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)),
arraysize(temps), temps);
}
void InstructionSelector::VisitS128Select(Node* node) {
Arm64OperandGenerator g(this);
Emit(kArm64S128Select, g.DefineSameAsFirst(node),
......
...@@ -2630,6 +2630,7 @@ void InstructionSelector::VisitI64x2Shl(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2ShrS(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2Add(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2Sub(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2Mul(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2Eq(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2Ne(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2ShrU(Node* node) { UNIMPLEMENTED(); }
...@@ -2640,7 +2641,6 @@ void InstructionSelector::VisitI64x2GeU(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitS1x2AnyTrue(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitS1x2AllTrue(Node* node) { UNIMPLEMENTED(); }
#endif  // !V8_TARGET_ARCH_ARM64
void InstructionSelector::VisitI64x2Mul(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2MinS(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2MaxS(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2MinU(Node* node) { UNIMPLEMENTED(); }
......
...@@ -1249,12 +1249,12 @@ WASM_SIMD_TEST_NO_LOWERING(F64x2Max) {
RunF64x2BinOpTest(execution_tier, lower_simd, kExprF64x2Max, JSMax);
}
#if V8_TARGET_ARCH_X64
WASM_SIMD_TEST_NO_LOWERING(I64x2Mul) {
RunI64x2BinOpTest(execution_tier, lower_simd, kExprI64x2Mul,
base::MulWithWraparound);
}
#if V8_TARGET_ARCH_X64
WASM_SIMD_TEST_NO_LOWERING(I64x2MinS) {
RunI64x2BinOpTest(execution_tier, lower_simd, kExprI64x2MinS, Minimum);
}
......