Commit dddfcfd0 authored by bbudge, committed by Commit bot

[WASM SIMD] Remove opcodes that are slow on some platforms.

These can be synthesized from existing operations and scheduled for
better performance than if we have to generate blocks of instructions
that take many cycles to complete.
- Remove F32x4RecipRefine, F32x4RecipSqrtRefine. Clients are better off
  synthesizing these from splats, multiplies and adds.
- Remove F32x4Div, F32x4Sqrt, F32x4MinNum, F32x4MaxNum. Clients are
  better off synthesizing these or using the reciprocal approximations,
  possibly with a refinement step.

LOG=N
BUG=v8:6020

Review-Url: https://codereview.chromium.org/2827143002
Cr-Commit-Position: refs/heads/master@{#44784}
parent 75ce09b5
......@@ -1631,16 +1631,6 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputSimd128Register(1));
break;
}
case kArmF32x4RecipRefine: {
__ vrecps(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1));
break;
}
case kArmF32x4RecipSqrtRefine: {
__ vrsqrts(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1));
break;
}
case kArmF32x4Eq: {
__ vceq(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputSimd128Register(1));
......
......@@ -138,8 +138,6 @@ namespace compiler {
V(ArmF32x4Mul) \
V(ArmF32x4Min) \
V(ArmF32x4Max) \
V(ArmF32x4RecipRefine) \
V(ArmF32x4RecipSqrtRefine) \
V(ArmF32x4Eq) \
V(ArmF32x4Ne) \
V(ArmF32x4Lt) \
......
......@@ -122,8 +122,6 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArmF32x4Mul:
case kArmF32x4Min:
case kArmF32x4Max:
case kArmF32x4RecipRefine:
case kArmF32x4RecipSqrtRefine:
case kArmF32x4Eq:
case kArmF32x4Ne:
case kArmF32x4Lt:
......
......@@ -2447,8 +2447,6 @@ VISIT_ATOMIC_BINOP(Xor)
V(F32x4Mul, kArmF32x4Mul) \
V(F32x4Min, kArmF32x4Min) \
V(F32x4Max, kArmF32x4Max) \
V(F32x4RecipRefine, kArmF32x4RecipRefine) \
V(F32x4RecipSqrtRefine, kArmF32x4RecipSqrtRefine) \
V(F32x4Eq, kArmF32x4Eq) \
V(F32x4Ne, kArmF32x4Ne) \
V(F32x4Lt, kArmF32x4Lt) \
......
......@@ -1505,8 +1505,8 @@ void InstructionSelector::VisitNode(Node* node) {
return MarkAsSimd128(node), VisitF32x4Neg(node);
case IrOpcode::kF32x4RecipApprox:
return MarkAsSimd128(node), VisitF32x4RecipApprox(node);
case IrOpcode::kF32x4RecipRefine:
return MarkAsSimd128(node), VisitF32x4RecipRefine(node);
case IrOpcode::kF32x4RecipSqrtApprox:
return MarkAsSimd128(node), VisitF32x4RecipSqrtApprox(node);
case IrOpcode::kF32x4Add:
return MarkAsSimd128(node), VisitF32x4Add(node);
case IrOpcode::kF32x4Sub:
......@@ -1517,10 +1517,6 @@ void InstructionSelector::VisitNode(Node* node) {
return MarkAsSimd128(node), VisitF32x4Min(node);
case IrOpcode::kF32x4Max:
return MarkAsSimd128(node), VisitF32x4Max(node);
case IrOpcode::kF32x4RecipSqrtApprox:
return MarkAsSimd128(node), VisitF32x4RecipSqrtApprox(node);
case IrOpcode::kF32x4RecipSqrtRefine:
return MarkAsSimd128(node), VisitF32x4RecipSqrtRefine(node);
case IrOpcode::kF32x4Eq:
return MarkAsSimd1x4(node), VisitF32x4Eq(node);
case IrOpcode::kF32x4Ne:
......@@ -2152,10 +2148,6 @@ void InstructionSelector::VisitF32x4RecipSqrtApprox(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitF32x4RecipSqrtRefine(Node* node) {
UNIMPLEMENTED();
}
void InstructionSelector::VisitF32x4Add(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF32x4Sub(Node* node) { UNIMPLEMENTED(); }
......@@ -2168,8 +2160,6 @@ void InstructionSelector::VisitF32x4Min(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF32x4RecipApprox(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF32x4RecipRefine(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF32x4Eq(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF32x4Ne(Node* node) { UNIMPLEMENTED(); }
......
......@@ -229,19 +229,13 @@ MachineType AtomicOpRepresentationOf(Operator const* op) {
V(F32x4UConvertI32x4, Operator::kNoProperties, 1, 0, 1) \
V(F32x4Abs, Operator::kNoProperties, 1, 0, 1) \
V(F32x4Neg, Operator::kNoProperties, 1, 0, 1) \
V(F32x4Sqrt, Operator::kNoProperties, 1, 0, 1) \
V(F32x4RecipApprox, Operator::kNoProperties, 1, 0, 1) \
V(F32x4RecipSqrtApprox, Operator::kNoProperties, 1, 0, 1) \
V(F32x4Add, Operator::kCommutative, 2, 0, 1) \
V(F32x4Sub, Operator::kNoProperties, 2, 0, 1) \
V(F32x4Mul, Operator::kCommutative, 2, 0, 1) \
V(F32x4Div, Operator::kNoProperties, 2, 0, 1) \
V(F32x4Min, Operator::kCommutative, 2, 0, 1) \
V(F32x4Max, Operator::kCommutative, 2, 0, 1) \
V(F32x4MinNum, Operator::kCommutative, 2, 0, 1) \
V(F32x4MaxNum, Operator::kCommutative, 2, 0, 1) \
V(F32x4RecipRefine, Operator::kNoProperties, 2, 0, 1) \
V(F32x4RecipSqrtRefine, Operator::kNoProperties, 2, 0, 1) \
V(F32x4Eq, Operator::kCommutative, 2, 0, 1) \
V(F32x4Ne, Operator::kCommutative, 2, 0, 1) \
V(F32x4Lt, Operator::kNoProperties, 2, 0, 1) \
......
......@@ -441,7 +441,6 @@ class V8_EXPORT_PRIVATE MachineOperatorBuilder final
const Operator* F32x4UConvertI32x4();
const Operator* F32x4Abs();
const Operator* F32x4Neg();
const Operator* F32x4Sqrt();
const Operator* F32x4RecipApprox();
const Operator* F32x4RecipSqrtApprox();
const Operator* F32x4Add();
......@@ -450,10 +449,6 @@ class V8_EXPORT_PRIVATE MachineOperatorBuilder final
const Operator* F32x4Div();
const Operator* F32x4Min();
const Operator* F32x4Max();
const Operator* F32x4MinNum();
const Operator* F32x4MaxNum();
const Operator* F32x4RecipRefine();
const Operator* F32x4RecipSqrtRefine();
const Operator* F32x4Eq();
const Operator* F32x4Ne();
const Operator* F32x4Lt();
......
......@@ -1782,34 +1782,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ frcp_w(i.OutputSimd128Register(), i.InputSimd128Register(0));
break;
}
case kMipsF32x4RecipRefine: {
CpuFeatureScope msa_scope(masm(), MIPS_SIMD);
Simd128Register dst = i.OutputSimd128Register();
// Emulate with 2.0f - a * b
__ ldi_w(kSimd128ScratchReg, 2);
__ ffint_u_w(kSimd128ScratchReg, kSimd128ScratchReg);
__ fmul_w(dst, i.InputSimd128Register(0), i.InputSimd128Register(1));
__ fsub_w(dst, kSimd128ScratchReg, dst);
break;
}
case kMipsF32x4RecipSqrtApprox: {
CpuFeatureScope msa_scope(masm(), MIPS_SIMD);
__ frsqrt_w(i.OutputSimd128Register(), i.InputSimd128Register(0));
break;
}
case kMipsF32x4RecipSqrtRefine: {
CpuFeatureScope msa_scope(masm(), MIPS_SIMD);
Simd128Register dst = i.OutputSimd128Register();
// Emulate with (3.0f - a * b) * 0.5f;
__ ldi_w(kSimd128ScratchReg, 3);
__ ffint_u_w(kSimd128ScratchReg, kSimd128ScratchReg);
__ fmul_w(dst, i.InputSimd128Register(0), i.InputSimd128Register(1));
__ fsub_w(dst, kSimd128ScratchReg, dst);
__ ldi_w(kSimd128ScratchReg, 0x3f);
__ slli_w(kSimd128ScratchReg, kSimd128ScratchReg, 24);
__ fmul_w(dst, dst, kSimd128ScratchReg);
break;
}
case kMipsF32x4Add: {
CpuFeatureScope msa_scope(masm(), MIPS_SIMD);
__ fadd_w(i.OutputSimd128Register(), i.InputSimd128Register(0),
......
......@@ -158,9 +158,7 @@ namespace compiler {
V(MipsF32x4Abs) \
V(MipsF32x4Neg) \
V(MipsF32x4RecipApprox) \
V(MipsF32x4RecipRefine) \
V(MipsF32x4RecipSqrtApprox) \
V(MipsF32x4RecipSqrtRefine) \
V(MipsF32x4Add) \
V(MipsF32x4Sub) \
V(MipsF32x4Mul) \
......
......@@ -2047,18 +2047,10 @@ void InstructionSelector::VisitF32x4RecipApprox(Node* node) {
VisitRR(this, kMipsF32x4RecipApprox, node);
}
void InstructionSelector::VisitF32x4RecipRefine(Node* node) {
VisitRRR(this, kMipsF32x4RecipRefine, node);
}
void InstructionSelector::VisitF32x4RecipSqrtApprox(Node* node) {
VisitRR(this, kMipsF32x4RecipSqrtApprox, node);
}
void InstructionSelector::VisitF32x4RecipSqrtRefine(Node* node) {
VisitRRR(this, kMipsF32x4RecipSqrtRefine, node);
}
void InstructionSelector::VisitF32x4Add(Node* node) {
VisitRRR(this, kMipsF32x4Add, node);
}
......
......@@ -2112,34 +2112,11 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ frcp_w(i.OutputSimd128Register(), i.InputSimd128Register(0));
break;
}
case kMips64F32x4RecipRefine: {
CpuFeatureScope msa_scope(masm(), MIPS_SIMD);
Simd128Register dst = i.OutputSimd128Register();
// Emulate with 2.0f - a * b
__ ldi_w(kSimd128ScratchReg, 2);
__ ffint_u_w(kSimd128ScratchReg, kSimd128ScratchReg);
__ fmul_w(dst, i.InputSimd128Register(0), i.InputSimd128Register(1));
__ fsub_w(dst, kSimd128ScratchReg, dst);
break;
}
case kMips64F32x4RecipSqrtApprox: {
CpuFeatureScope msa_scope(masm(), MIPS_SIMD);
__ frsqrt_w(i.OutputSimd128Register(), i.InputSimd128Register(0));
break;
}
case kMips64F32x4RecipSqrtRefine: {
CpuFeatureScope msa_scope(masm(), MIPS_SIMD);
Simd128Register dst = i.OutputSimd128Register();
// Emulate with (3.0f - a * b) * 0.5f;
__ ldi_w(kSimd128ScratchReg, 3);
__ ffint_u_w(kSimd128ScratchReg, kSimd128ScratchReg);
__ fmul_w(dst, i.InputSimd128Register(0), i.InputSimd128Register(1));
__ fsub_w(dst, kSimd128ScratchReg, dst);
__ ldi_w(kSimd128ScratchReg, 0x3f);
__ slli_w(kSimd128ScratchReg, kSimd128ScratchReg, 24);
__ fmul_w(dst, dst, kSimd128ScratchReg);
break;
}
case kMips64F32x4Add: {
CpuFeatureScope msa_scope(masm(), MIPS_SIMD);
__ fadd_w(i.OutputSimd128Register(), i.InputSimd128Register(0),
......
......@@ -192,9 +192,7 @@ namespace compiler {
V(Mips64F32x4Abs) \
V(Mips64F32x4Neg) \
V(Mips64F32x4RecipApprox) \
V(Mips64F32x4RecipRefine) \
V(Mips64F32x4RecipSqrtApprox) \
V(Mips64F32x4RecipSqrtRefine) \
V(Mips64F32x4Add) \
V(Mips64F32x4Sub) \
V(Mips64F32x4Mul) \
......
......@@ -2798,18 +2798,10 @@ void InstructionSelector::VisitF32x4RecipApprox(Node* node) {
VisitRR(this, kMips64F32x4RecipApprox, node);
}
void InstructionSelector::VisitF32x4RecipRefine(Node* node) {
VisitRRR(this, kMips64F32x4RecipRefine, node);
}
void InstructionSelector::VisitF32x4RecipSqrtApprox(Node* node) {
VisitRR(this, kMips64F32x4RecipSqrtApprox, node);
}
void InstructionSelector::VisitF32x4RecipSqrtRefine(Node* node) {
VisitRRR(this, kMips64F32x4RecipSqrtRefine, node);
}
void InstructionSelector::VisitF32x4Add(Node* node) {
VisitRRR(this, kMips64F32x4Add, node);
}
......
......@@ -575,19 +575,13 @@
V(F32x4UConvertI32x4) \
V(F32x4Abs) \
V(F32x4Neg) \
V(F32x4Sqrt) \
V(F32x4RecipApprox) \
V(F32x4RecipSqrtApprox) \
V(F32x4Add) \
V(F32x4Sub) \
V(F32x4Mul) \
V(F32x4Div) \
V(F32x4Min) \
V(F32x4Max) \
V(F32x4MinNum) \
V(F32x4MaxNum) \
V(F32x4RecipRefine) \
V(F32x4RecipSqrtRefine) \
V(F32x4Eq) \
V(F32x4Ne) \
V(F32x4Lt) \
......
......@@ -96,7 +96,6 @@ void SimdScalarLowering::LowerGraph() {
V(F32x4Add) \
V(F32x4Sub) \
V(F32x4Mul) \
V(F32x4Div) \
V(F32x4Min) \
V(F32x4Max)
......@@ -646,7 +645,6 @@ void SimdScalarLowering::LowerNode(Node* node) {
F32X4_BINOP_CASE(Add)
F32X4_BINOP_CASE(Sub)
F32X4_BINOP_CASE(Mul)
F32X4_BINOP_CASE(Div)
F32X4_BINOP_CASE(Min)
F32X4_BINOP_CASE(Max)
#undef F32X4_BINOP_CASE
......@@ -657,7 +655,6 @@ void SimdScalarLowering::LowerNode(Node* node) {
}
F32X4_UNOP_CASE(Abs)
F32X4_UNOP_CASE(Neg)
F32X4_UNOP_CASE(Sqrt)
#undef F32x4_UNOP_CASE
case IrOpcode::kF32x4SConvertI32x4: {
LowerUnaryOp(node, SimdType::kInt32, machine()->RoundInt32ToFloat32());
......
......@@ -3188,8 +3188,6 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode,
return graph()->NewNode(jsgraph()->machine()->F32x4Abs(), inputs[0]);
case wasm::kExprF32x4Neg:
return graph()->NewNode(jsgraph()->machine()->F32x4Neg(), inputs[0]);
case wasm::kExprF32x4Sqrt:
return graph()->NewNode(jsgraph()->machine()->F32x4Sqrt(), inputs[0]);
case wasm::kExprF32x4RecipApprox:
return graph()->NewNode(jsgraph()->machine()->F32x4RecipApprox(),
inputs[0]);
......@@ -3205,21 +3203,12 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode,
case wasm::kExprF32x4Mul:
return graph()->NewNode(jsgraph()->machine()->F32x4Mul(), inputs[0],
inputs[1]);
case wasm::kExprF32x4Div:
return graph()->NewNode(jsgraph()->machine()->F32x4Div(), inputs[0],
inputs[1]);
case wasm::kExprF32x4Min:
return graph()->NewNode(jsgraph()->machine()->F32x4Min(), inputs[0],
inputs[1]);
case wasm::kExprF32x4Max:
return graph()->NewNode(jsgraph()->machine()->F32x4Max(), inputs[0],
inputs[1]);
case wasm::kExprF32x4RecipRefine:
return graph()->NewNode(jsgraph()->machine()->F32x4RecipRefine(),
inputs[0], inputs[1]);
case wasm::kExprF32x4RecipSqrtRefine:
return graph()->NewNode(jsgraph()->machine()->F32x4RecipSqrtRefine(),
inputs[0], inputs[1]);
case wasm::kExprF32x4Eq:
return graph()->NewNode(jsgraph()->machine()->F32x4Eq(), inputs[0],
inputs[1]);
......
......@@ -175,16 +175,10 @@ const char* WasmOpcodes::OpcodeName(WasmOpcode opcode) {
CASE_SIMD_OP(Sub, "sub")
CASE_SIMD_OP(Mul, "mul")
CASE_F32x4_OP(Abs, "abs")
CASE_F32x4_OP(Sqrt, "sqrt")
CASE_F32x4_OP(Div, "div")
CASE_F32x4_OP(RecipApprox, "recip_approx")
CASE_F32x4_OP(RecipRefine, "recip_refine")
CASE_F32x4_OP(RecipSqrtApprox, "recip_sqrt_approx")
CASE_F32x4_OP(RecipSqrtRefine, "recip_sqrt_refine")
CASE_F32x4_OP(Min, "min")
CASE_F32x4_OP(Max, "max")
CASE_F32x4_OP(MinNum, "min_num")
CASE_F32x4_OP(MaxNum, "max_num")
CASE_F32x4_OP(Lt, "lt")
CASE_F32x4_OP(Le, "le")
CASE_F32x4_OP(Gt, "gt")
......
......@@ -286,19 +286,13 @@ constexpr WasmCodePosition kNoCodePosition = -1;
V(F32x4Splat, 0xe500, s_f) \
V(F32x4Abs, 0xe503, s_s) \
V(F32x4Neg, 0xe504, s_s) \
V(F32x4Sqrt, 0xe505, s_s) \
V(F32x4RecipApprox, 0xe506, s_s) \
V(F32x4RecipSqrtApprox, 0xe507, s_s) \
V(F32x4Add, 0xe508, s_ss) \
V(F32x4Sub, 0xe509, s_ss) \
V(F32x4Mul, 0xe50a, s_ss) \
V(F32x4Div, 0xe50b, s_ss) \
V(F32x4Min, 0xe50c, s_ss) \
V(F32x4Max, 0xe50d, s_ss) \
V(F32x4MinNum, 0xe50e, s_ss) \
V(F32x4MaxNum, 0xe50f, s_ss) \
V(F32x4RecipRefine, 0xe592, s_ss) \
V(F32x4RecipSqrtRefine, 0xe593, s_ss) \
V(F32x4Eq, 0xe510, s1x4_ss) \
V(F32x4Ne, 0xe511, s1x4_ss) \
V(F32x4Lt, 0xe512, s1x4_ss) \
......
......@@ -266,16 +266,6 @@ T RecipSqrt(T a) {
return 1.0f / std::sqrt(a);
}
template <typename T>
T RecipRefine(T estimate, T input) {
  // One Newton-Raphson refinement step for a reciprocal approximation.
  const T product = estimate * input;
  return 2.0f - product;
}
template <typename T>
T RecipSqrtRefine(T estimate, T input) {
  // One Newton-Raphson refinement step for a reciprocal-square-root
  // approximation: (3 - a*b) / 2.
  const T product = estimate * input;
  const T half = 0.5f;
  return (3.0f - product) * half;
}
} // namespace
#define WASM_SIMD_CHECK_LANE(TYPE, value, LANE_TYPE, lane_value, lane_index) \
......@@ -518,10 +508,6 @@ WASM_EXEC_COMPILED_TEST(F32x4Neg) { RunF32x4UnOpTest(kExprF32x4Neg, Negate); }
#endif // V8_TARGET_ARCH_ARM || SIMD_LOWERING_TARGET || V8_TARGET_ARCH_MIPS ||
// V8_TARGET_ARCH_MIPS64
#if SIMD_LOWERING_TARGET
WASM_EXEC_COMPILED_TEST(F32x4Sqrt) { RunF32x4UnOpTest(kExprF32x4Sqrt, Sqrt); }
#endif // SIMD_LOWERING_TARGET
#if V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_MIPS || V8_TARGET_ARCH_MIPS64
static const float kApproxError = 0.01f;
......@@ -573,20 +559,6 @@ WASM_EXEC_COMPILED_TEST(F32x4_Max) {
#endif // V8_TARGET_ARCH_ARM || SIMD_LOWERING_TARGET || V8_TARGET_ARCH_MIPS ||
// V8_TARGET_ARCH_MIPS64
#if SIMD_LOWERING_TARGET
WASM_EXEC_COMPILED_TEST(F32x4Div) { RunF32x4BinOpTest(kExprF32x4Div, Div); }
#endif // SIMD_LOWERING_TARGET
#if V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_MIPS || V8_TARGET_ARCH_MIPS64
WASM_EXEC_COMPILED_TEST(F32x4RecipRefine) {
RunF32x4BinOpTest(kExprF32x4RecipRefine, RecipRefine);
}
WASM_EXEC_COMPILED_TEST(F32x4RecipSqrtRefine) {
RunF32x4BinOpTest(kExprF32x4RecipSqrtRefine, RecipSqrtRefine);
}
#endif // V8_TARGET_ARCH_ARM || V8_TARGET_ARCH_MIPS || V8_TARGET_ARCH_MIPS64
#if V8_TARGET_ARCH_ARM || SIMD_LOWERING_TARGET || V8_TARGET_ARCH_MIPS || \
V8_TARGET_ARCH_MIPS64
void RunF32x4CompareOpTest(WasmOpcode simd_op, FloatCompareOp expected_op) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment