Commit 2cf821cc authored by Ng Zhi An, committed by Commit Bot

[wasm-simd] Implement QFMA and QFMS on x64

Quasi Fused Multiply-Add (QFMA) and Quasi Fused Multiply-Subtract (QFMS) compute, on floats, a + b * c and a - b * c respectively.
When the whole operation is rounded only once, it is fused. "Quasi" means the result may be either fused (one rounding) or unfused (two roundings), depending on hardware support.
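
For illustration, here is a minimal standalone C++ sketch (not part of this commit) of the fused/unfused difference. It uses std::fma for the single-rounding case, and the values mirror the float overflow case used in the tests below; note that an aggressive compiler mode (e.g. -ffast-math) could still contract the "unfused" statements.

#include <cmath>
#include <cstdio>
#include <limits>

int main() {
  float a = -std::numeric_limits<float>::infinity();
  float b = 1e20f;
  float c = 1e20f;
  // Fused: std::fma rounds once, so b * c is kept exact (no overflow) and
  // adding it to -inf still gives -inf.
  float fused = std::fma(b, c, a);
  // Unfused: storing the product forces a rounding to float, which overflows
  // to +inf, and -inf + inf is NaN.
  float product = b * c;
  float unfused = a + product;
  std::printf("fused = %f, unfused = %f\n", fused, unfused);
}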

Writing the test is tricky because we need to calculate the expected value, and there is no easy way to express a fused or unfused operation in C++; i.e.,
we cannot guarantee that float expected = a + b * c performs a fused or an unfused operation (unless we use intrinsics; see the sketch after this paragraph).
Thus the test has a list of simple checks, plus interesting values that we know will produce different results depending on whether the operation was fused.
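
A hedged sketch of that intrinsics escape hatch (not part of this commit; it assumes an FMA3-capable target and a compiler flag such as -mfma, and the helper name fused_madd is made up for illustration):

#include <immintrin.h>

// Force a fused a + b * c in the low lane via the x64 FMA3 intrinsic.
float fused_madd(float a, float b, float c) {
  __m128 va = _mm_set_ss(a);
  __m128 vb = _mm_set_ss(b);
  __m128 vc = _mm_set_ss(c);
  __m128 vr = _mm_fmadd_ss(vb, vc, va);  // b * c + a, rounded once
  return _mm_cvtss_f32(vr);
}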

The difference between the 32x4 and 64x2 qfma/qfms tests is the element type, and also the values of b and c that cause an overflow, so that the intermediate rounding affects the final result.
The same array could be copy-pasted for both types, but with a bit of templating we can avoid that duplication (see the sketch below).
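
A simplified sketch of that templating (condensed from the test code added below; it omits V8's Vector/ArrayVector wrappers):

#include <limits>

// Per-type constant whose square overflows T, specialized for float/double.
template <typename T>
constexpr T large_n = T(0);
template <>
constexpr double large_n<double> = 1e200;  // 1e200 * 1e200 overflows double
template <>
constexpr float large_n<float> = 1e20f;    // 1e20 * 1e20 overflows float

template <typename T>
struct FMOperation {
  T a, b, c;
  T fused_result;
  T unfused_result;
};

// One parameterized list works for both F32x4 (float) and F64x2 (double);
// only large_n differs between the instantiations.
template <typename T>
constexpr FMOperation<T> qfma_cases[] = {
    {1.0f, 2.0f, 3.0f, 7.0f, 7.0f},
    {-std::numeric_limits<T>::infinity(), large_n<T>, large_n<T>,
     -std::numeric_limits<T>::infinity(),
     std::numeric_limits<T>::quiet_NaN()}};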

Change-Id: I0973a3d28468d25f310b593c72f21bff54d809a7
Bug: v8:9415
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1779325
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Reviewed-by: Michael Starzinger <mstarzinger@chromium.org>
Cr-Commit-Position: refs/heads/master@{#63878}
parent 72b8a49f
......@@ -4107,6 +4107,42 @@ void Assembler::vfmass(byte op, XMMRegister dst, XMMRegister src1,
emit_sse_operand(dst, src2);
}
void Assembler::vfmaps(byte op, XMMRegister dst, XMMRegister src1,
XMMRegister src2) {
DCHECK(IsEnabled(FMA3));
EnsureSpace ensure_space(this);
emit_vex_prefix(dst, src1, src2, kL128, k66, k0F38, kW0);
emit(op);
emit_sse_operand(dst, src2);
}
void Assembler::vfmaps(byte op, XMMRegister dst, XMMRegister src1,
Operand src2) {
DCHECK(IsEnabled(FMA3));
EnsureSpace ensure_space(this);
emit_vex_prefix(dst, src1, src2, kL128, k66, k0F38, kW0);
emit(op);
emit_sse_operand(dst, src2);
}
void Assembler::vfmapd(byte op, XMMRegister dst, XMMRegister src1,
XMMRegister src2) {
DCHECK(IsEnabled(FMA3));
EnsureSpace ensure_space(this);
emit_vex_prefix(dst, src1, src2, kL128, k66, k0F38, kW1);
emit(op);
emit_sse_operand(dst, src2);
}
void Assembler::vfmapd(byte op, XMMRegister dst, XMMRegister src1,
Operand src2) {
DCHECK(IsEnabled(FMA3));
EnsureSpace ensure_space(this);
emit_vex_prefix(dst, src1, src2, kL128, k66, k0F38, kW1);
emit(op);
emit_sse_operand(dst, src2);
}
void Assembler::vmovd(XMMRegister dst, Register src) {
DCHECK(IsEnabled(AVX));
EnsureSpace ensure_space(this);
......
......@@ -1300,6 +1300,36 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void vfmass(byte op, XMMRegister dst, XMMRegister src1, XMMRegister src2);
void vfmass(byte op, XMMRegister dst, XMMRegister src1, Operand src2);
void vfmadd231ps(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vfmaps(0xb8, dst, src1, src2);
}
void vfmadd231ps(XMMRegister dst, XMMRegister src1, Operand src2) {
vfmaps(0xb8, dst, src1, src2);
}
void vfnmadd231ps(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vfmaps(0xbc, dst, src1, src2);
}
void vfnmadd231ps(XMMRegister dst, XMMRegister src1, Operand src2) {
vfmaps(0xbc, dst, src1, src2);
}
void vfmaps(byte op, XMMRegister dst, XMMRegister src1, XMMRegister src2);
void vfmaps(byte op, XMMRegister dst, XMMRegister src1, Operand src2);
void vfmadd231pd(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vfmapd(0xb8, dst, src1, src2);
}
void vfmadd231pd(XMMRegister dst, XMMRegister src1, Operand src2) {
vfmapd(0xb8, dst, src1, src2);
}
void vfnmadd231pd(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
vfmapd(0xbc, dst, src1, src2);
}
void vfnmadd231pd(XMMRegister dst, XMMRegister src1, Operand src2) {
vfmapd(0xbc, dst, src1, src2);
}
void vfmapd(byte op, XMMRegister dst, XMMRegister src1, XMMRegister src2);
void vfmapd(byte op, XMMRegister dst, XMMRegister src1, Operand src2);
void vmovd(XMMRegister dst, Register src);
void vmovd(XMMRegister dst, Operand src);
void vmovd(Register dst, XMMRegister src);
......
......@@ -1881,6 +1881,10 @@ void InstructionSelector::VisitNode(Node* node) {
return MarkAsSimd128(node), VisitF64x2Lt(node);
case IrOpcode::kF64x2Le:
return MarkAsSimd128(node), VisitF64x2Le(node);
case IrOpcode::kF64x2Qfma:
return MarkAsSimd128(node), VisitF64x2Qfma(node);
case IrOpcode::kF64x2Qfms:
return MarkAsSimd128(node), VisitF64x2Qfms(node);
case IrOpcode::kF32x4Splat:
return MarkAsSimd128(node), VisitF32x4Splat(node);
case IrOpcode::kF32x4ExtractLane:
......@@ -1923,6 +1927,10 @@ void InstructionSelector::VisitNode(Node* node) {
return MarkAsSimd128(node), VisitF32x4Lt(node);
case IrOpcode::kF32x4Le:
return MarkAsSimd128(node), VisitF32x4Le(node);
case IrOpcode::kF32x4Qfma:
return MarkAsSimd128(node), VisitF32x4Qfma(node);
case IrOpcode::kF32x4Qfms:
return MarkAsSimd128(node), VisitF32x4Qfms(node);
case IrOpcode::kI64x2Splat:
return MarkAsSimd128(node), VisitI64x2Splat(node);
case IrOpcode::kI64x2ExtractLane:
......@@ -2655,6 +2663,10 @@ void InstructionSelector::VisitI64x2MinS(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2MaxS(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2MinU(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI64x2MaxU(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF64x2Qfma(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF64x2Qfms(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF32x4Qfma(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF32x4Qfms(Node* node) { UNIMPLEMENTED(); }
#endif // !V8_TARGET_ARCH_X64
void InstructionSelector::VisitFinishRegion(Node* node) { EmitIdentity(node); }
......
......@@ -2371,6 +2371,32 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ cmplepd(i.OutputSimd128Register(), i.InputSimd128Register(1));
break;
}
case kX64F64x2Qfma: {
if (CpuFeatures::IsSupported(FMA3)) {
CpuFeatureScope fma3_scope(tasm(), FMA3);
__ vfmadd231pd(i.OutputSimd128Register(), i.InputSimd128Register(1),
i.InputSimd128Register(2));
} else {
XMMRegister tmp = i.TempSimd128Register(0);
__ movapd(tmp, i.InputSimd128Register(2));
__ mulpd(tmp, i.InputSimd128Register(1));
__ addpd(i.OutputSimd128Register(), tmp);
}
break;
}
case kX64F64x2Qfms: {
if (CpuFeatures::IsSupported(FMA3)) {
CpuFeatureScope fma3_scope(tasm(), FMA3);
__ vfnmadd231pd(i.OutputSimd128Register(), i.InputSimd128Register(1),
i.InputSimd128Register(2));
} else {
XMMRegister tmp = i.TempSimd128Register(0);
__ movapd(tmp, i.InputSimd128Register(2));
__ mulpd(tmp, i.InputSimd128Register(1));
__ subpd(i.OutputSimd128Register(), tmp);
}
break;
}
// TODO(gdeepti): Get rid of redundant moves for F32x4Splat/Extract below
case kX64F32x4Splat: {
XMMRegister dst = i.OutputSimd128Register();
......@@ -2545,6 +2571,32 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ cmpleps(i.OutputSimd128Register(), i.InputSimd128Register(1));
break;
}
case kX64F32x4Qfma: {
if (CpuFeatures::IsSupported(FMA3)) {
CpuFeatureScope fma3_scope(tasm(), FMA3);
__ vfmadd231ps(i.OutputSimd128Register(), i.InputSimd128Register(1),
i.InputSimd128Register(2));
} else {
XMMRegister tmp = i.TempSimd128Register(0);
__ movaps(tmp, i.InputSimd128Register(2));
__ mulps(tmp, i.InputSimd128Register(1));
__ addps(i.OutputSimd128Register(), tmp);
}
break;
}
case kX64F32x4Qfms: {
if (CpuFeatures::IsSupported(FMA3)) {
CpuFeatureScope fma3_scope(tasm(), FMA3);
__ vfnmadd231ps(i.OutputSimd128Register(), i.InputSimd128Register(1),
i.InputSimd128Register(2));
} else {
XMMRegister tmp = i.TempSimd128Register(0);
__ movaps(tmp, i.InputSimd128Register(2));
__ mulps(tmp, i.InputSimd128Register(1));
__ subps(i.OutputSimd128Register(), tmp);
}
break;
}
case kX64I64x2Splat: {
CpuFeatureScope sse_scope(tasm(), SSE3);
XMMRegister dst = i.OutputSimd128Register();
......
......@@ -171,6 +171,8 @@ namespace compiler {
V(X64F64x2Ne) \
V(X64F64x2Lt) \
V(X64F64x2Le) \
V(X64F64x2Qfma) \
V(X64F64x2Qfms) \
V(X64F32x4Splat) \
V(X64F32x4ExtractLane) \
V(X64F32x4ReplaceLane) \
......@@ -192,6 +194,8 @@ namespace compiler {
V(X64F32x4Ne) \
V(X64F32x4Lt) \
V(X64F32x4Le) \
V(X64F32x4Qfma) \
V(X64F32x4Qfms) \
V(X64I64x2Splat) \
V(X64I64x2ExtractLane) \
V(X64I64x2ReplaceLane) \
......
......@@ -140,6 +140,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kX64F64x2Ne:
case kX64F64x2Lt:
case kX64F64x2Le:
case kX64F64x2Qfma:
case kX64F64x2Qfms:
case kX64F32x4Splat:
case kX64F32x4ExtractLane:
case kX64F32x4ReplaceLane:
......@@ -161,6 +163,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kX64F32x4Ne:
case kX64F32x4Lt:
case kX64F32x4Le:
case kX64F32x4Qfma:
case kX64F32x4Qfms:
case kX64I64x2Splat:
case kX64I64x2ExtractLane:
case kX64I64x2ReplaceLane:
......
......@@ -2878,6 +2878,27 @@ void InstructionSelector::VisitF32x4UConvertI32x4(Node* node) {
g.UseRegister(node->InputAt(0)));
}
#define VISIT_SIMD_QFMOP(Opcode) \
void InstructionSelector::Visit##Opcode(Node* node) { \
X64OperandGenerator g(this); \
if (CpuFeatures::IsSupported(FMA3)) { \
Emit(kX64##Opcode, g.DefineSameAsFirst(node), \
g.UseRegister(node->InputAt(0)), g.UseRegister(node->InputAt(1)), \
g.UseRegister(node->InputAt(2))); \
} else { \
InstructionOperand temps[] = {g.TempSimd128Register()}; \
Emit(kX64##Opcode, g.DefineSameAsFirst(node), \
g.UseUniqueRegister(node->InputAt(0)), \
g.UseUniqueRegister(node->InputAt(1)), \
g.UseRegister(node->InputAt(2)), arraysize(temps), temps); \
} \
}
VISIT_SIMD_QFMOP(F64x2Qfma)
VISIT_SIMD_QFMOP(F64x2Qfms)
VISIT_SIMD_QFMOP(F32x4Qfma)
VISIT_SIMD_QFMOP(F32x4Qfms)
#undef VISIT_SIMD_QFMOP
void InstructionSelector::VisitI64x2ShrS(Node* node) {
X64OperandGenerator g(this);
InstructionOperand temps[] = {g.TempRegister()};
......
......@@ -267,6 +267,8 @@ MachineType AtomicOpType(Operator const* op) {
V(F64x2Ne, Operator::kCommutative, 2, 0, 1) \
V(F64x2Lt, Operator::kNoProperties, 2, 0, 1) \
V(F64x2Le, Operator::kNoProperties, 2, 0, 1) \
V(F64x2Qfma, Operator::kNoProperties, 3, 0, 1) \
V(F64x2Qfms, Operator::kNoProperties, 3, 0, 1) \
V(F32x4Splat, Operator::kNoProperties, 1, 0, 1) \
V(F32x4SConvertI32x4, Operator::kNoProperties, 1, 0, 1) \
V(F32x4UConvertI32x4, Operator::kNoProperties, 1, 0, 1) \
......@@ -286,6 +288,8 @@ MachineType AtomicOpType(Operator const* op) {
V(F32x4Ne, Operator::kCommutative, 2, 0, 1) \
V(F32x4Lt, Operator::kNoProperties, 2, 0, 1) \
V(F32x4Le, Operator::kNoProperties, 2, 0, 1) \
V(F32x4Qfma, Operator::kNoProperties, 3, 0, 1) \
V(F32x4Qfms, Operator::kNoProperties, 3, 0, 1) \
V(I64x2Splat, Operator::kNoProperties, 1, 0, 1) \
V(I64x2Neg, Operator::kNoProperties, 1, 0, 1) \
V(I64x2Shl, Operator::kNoProperties, 2, 0, 1) \
......
......@@ -496,6 +496,8 @@ class V8_EXPORT_PRIVATE MachineOperatorBuilder final
const Operator* F64x2Ne();
const Operator* F64x2Lt();
const Operator* F64x2Le();
const Operator* F64x2Qfma();
const Operator* F64x2Qfms();
const Operator* F32x4Splat();
const Operator* F32x4ExtractLane(int32_t);
......@@ -518,6 +520,8 @@ class V8_EXPORT_PRIVATE MachineOperatorBuilder final
const Operator* F32x4Ne();
const Operator* F32x4Lt();
const Operator* F32x4Le();
const Operator* F32x4Qfma();
const Operator* F32x4Qfms();
const Operator* I64x2Splat();
const Operator* I64x2ExtractLane(int32_t);
......
......@@ -761,6 +761,8 @@
V(F64x2Ne) \
V(F64x2Lt) \
V(F64x2Le) \
V(F64x2Qfma) \
V(F64x2Qfms) \
V(F32x4Splat) \
V(F32x4ExtractLane) \
V(F32x4ReplaceLane) \
......@@ -784,6 +786,8 @@
V(F32x4Le) \
V(F32x4Gt) \
V(F32x4Ge) \
V(F32x4Qfma) \
V(F32x4Qfms) \
V(I64x2Splat) \
V(I64x2ExtractLane) \
V(I64x2ReplaceLane) \
......
......@@ -4010,6 +4010,12 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode, Node* const* inputs) {
case wasm::kExprF64x2Ge:
return graph()->NewNode(mcgraph()->machine()->F64x2Le(), inputs[1],
inputs[0]);
case wasm::kExprF64x2Qfma:
return graph()->NewNode(mcgraph()->machine()->F64x2Qfma(), inputs[0],
inputs[1], inputs[2]);
case wasm::kExprF64x2Qfms:
return graph()->NewNode(mcgraph()->machine()->F64x2Qfms(), inputs[0],
inputs[1], inputs[2]);
case wasm::kExprF32x4Splat:
return graph()->NewNode(mcgraph()->machine()->F32x4Splat(), inputs[0]);
case wasm::kExprF32x4SConvertI32x4:
......@@ -4069,6 +4075,12 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode, Node* const* inputs) {
case wasm::kExprF32x4Ge:
return graph()->NewNode(mcgraph()->machine()->F32x4Le(), inputs[1],
inputs[0]);
case wasm::kExprF32x4Qfma:
return graph()->NewNode(mcgraph()->machine()->F32x4Qfma(), inputs[0],
inputs[1], inputs[2]);
case wasm::kExprF32x4Qfms:
return graph()->NewNode(mcgraph()->machine()->F32x4Qfms(), inputs[0],
inputs[1], inputs[2]);
case wasm::kExprI64x2Splat:
return graph()->NewNode(mcgraph()->machine()->I64x2Splat(), inputs[0]);
case wasm::kExprI64x2Neg:
......
......@@ -2613,6 +2613,23 @@ class ThreadImpl {
REDUCTION_CASE(S1x8AllTrue, i16x8, int8, 8, &)
REDUCTION_CASE(S1x16AllTrue, i8x16, int16, 16, &)
#undef REDUCTION_CASE
#define QFM_CASE(op, name, stype, count, operation) \
case kExpr##op: { \
stype c = Pop().to_s128().to_##name(); \
stype b = Pop().to_s128().to_##name(); \
stype a = Pop().to_s128().to_##name(); \
stype res; \
for (size_t i = 0; i < count; i++) { \
res.val[i] = a.val[i] operation(b.val[i] * c.val[i]); \
} \
Push(WasmValue(Simd128(res))); \
return true; \
}
QFM_CASE(F32x4Qfma, f32x4, float4, 4, +)
QFM_CASE(F32x4Qfms, f32x4, float4, 4, -)
QFM_CASE(F64x2Qfma, f64x2, float2, 2, +)
QFM_CASE(F64x2Qfms, f64x2, float2, 2, -)
#undef QFM_CASE
default:
return false;
}
......
......@@ -313,6 +313,10 @@ const char* WasmOpcodes::OpcodeName(WasmOpcode opcode) {
CASE_S1x8_OP(AllTrue, "all_true")
CASE_S1x16_OP(AnyTrue, "any_true")
CASE_S1x16_OP(AllTrue, "all_true")
CASE_F64x2_OP(Qfma, "qfma")
CASE_F64x2_OP(Qfms, "qfms")
CASE_F32x4_OP(Qfma, "qfma")
CASE_F32x4_OP(Qfms, "qfms")
// Atomic operations.
CASE_OP(AtomicNotify, "atomic.notify")
......
......@@ -397,8 +397,8 @@ bool IsJSCompatibleSignature(const FunctionSig* sig, const WasmFeatures&);
V(F32x4Abs, 0xfd95, s_s) \
V(F32x4Neg, 0xfd96, s_s) \
V(F32x4Sqrt, 0xfd97, s_s) \
V(F32x4RecipApprox, 0xfd98, s_s) \
V(F32x4RecipSqrtApprox, 0xfd99, s_s) \
V(F32x4Qfma, 0xfd98, s_sss) \
V(F32x4Qfms, 0xfd99, s_sss) \
V(F32x4Add, 0xfd9a, s_ss) \
V(F32x4Sub, 0xfd9b, s_ss) \
V(F32x4Mul, 0xfd9c, s_ss) \
......@@ -408,6 +408,8 @@ bool IsJSCompatibleSignature(const FunctionSig* sig, const WasmFeatures&);
V(F64x2Abs, 0xfda0, s_s) \
V(F64x2Neg, 0xfda1, s_s) \
V(F64x2Sqrt, 0xfda2, s_s) \
V(F64x2Qfma, 0xfda3, s_sss) \
V(F64x2Qfms, 0xfda4, s_sss) \
V(F64x2Add, 0xfda5, s_ss) \
V(F64x2Sub, 0xfda6, s_ss) \
V(F64x2Mul, 0xfda7, s_ss) \
......@@ -432,7 +434,9 @@ bool IsJSCompatibleSignature(const FunctionSig* sig, const WasmFeatures&);
V(I32x4UConvertI16x8High, 0xfdd1, s_s) \
V(I16x8AddHoriz, 0xfdbd, s_ss) \
V(I32x4AddHoriz, 0xfdbe, s_ss) \
V(F32x4AddHoriz, 0xfdbf, s_ss)
V(F32x4AddHoriz, 0xfdbf, s_ss) \
V(F32x4RecipApprox, 0xfde0, s_s) \
V(F32x4RecipSqrtApprox, 0xfde1, s_s)
#define FOREACH_SIMD_1_OPERAND_1_PARAM_OPCODE(V) \
V(I8x16ExtractLane, 0xfd05, _) \
......
......@@ -301,6 +301,87 @@ int64_t Less(double a, double b) { return a < b ? -1 : 0; }
int64_t LessEqual(double a, double b) { return a <= b ? -1 : 0; }
#endif // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64
#if V8_TARGET_ARCH_X64
// Only used for qfma and qfms tests below.
// FMOperation holds the params (a, b, c) for a Multiply-Add or
// Multiply-Subtract operation, and the expected result if the operation was
// fused, rounded only once for the entire operation, or unfused, rounded after
// multiply and again after add/subtract.
template <typename T>
struct FMOperation {
const T a;
const T b;
const T c;
const T fused_result;
const T unfused_result;
};
// large_n is a large number that overflows T when multiplied by itself; this
// is a useful constant for testing fused/unfused behavior.
template <typename T>
constexpr T large_n = T(0);
template <>
constexpr double large_n<double> = 1e200;
template <>
constexpr float large_n<float> = 1e20;
// Fused Multiply-Add performs a + b * c.
template <typename T>
static constexpr FMOperation<T> qfma_array[] = {
{1.0f, 2.0f, 3.0f, 7.0f, 7.0f},
// fused: a + b * c = -inf + (positive overflow) = -inf
// unfused: a + b * c = -inf + inf = NaN
{-std::numeric_limits<T>::infinity(), large_n<T>, large_n<T>,
-std::numeric_limits<T>::infinity(), std::numeric_limits<T>::quiet_NaN()},
// fused: a + b * c = inf + (negative overflow) = inf
// unfused: a + b * c = inf + -inf = NaN
{std::numeric_limits<T>::infinity(), -large_n<T>, large_n<T>,
std::numeric_limits<T>::infinity(), std::numeric_limits<T>::quiet_NaN()},
// NaN
{std::numeric_limits<T>::quiet_NaN(), 2.0f, 3.0f,
std::numeric_limits<T>::quiet_NaN(), std::numeric_limits<T>::quiet_NaN()},
// -NaN
{-std::numeric_limits<T>::quiet_NaN(), 2.0f, 3.0f,
std::numeric_limits<T>::quiet_NaN(), std::numeric_limits<T>::quiet_NaN()}};
template <typename T>
static constexpr Vector<const FMOperation<T>> qfma_vector() {
return ArrayVector(qfma_array<T>);
}
// Fused Multiply-Subtract performs a - b * c.
template <typename T>
static constexpr FMOperation<T> qfms_array[]{
{1.0f, 2.0f, 3.0f, -5.0f, -5.0f},
// fused: a - b * c = inf - (positive overflow) = inf
// unfused: a - b * c = inf - inf = NaN
{std::numeric_limits<T>::infinity(), large_n<T>, large_n<T>,
std::numeric_limits<T>::infinity(), std::numeric_limits<T>::quiet_NaN()},
// fused: a - b * c = -inf - (negative overflow) = -inf
// unfused: a - b * c = -inf - -inf = NaN
{-std::numeric_limits<T>::infinity(), -large_n<T>, large_n<T>,
-std::numeric_limits<T>::infinity(), std::numeric_limits<T>::quiet_NaN()},
// NaN
{std::numeric_limits<T>::quiet_NaN(), 2.0f, 3.0f,
std::numeric_limits<T>::quiet_NaN(), std::numeric_limits<T>::quiet_NaN()},
// -NaN
{-std::numeric_limits<T>::quiet_NaN(), 2.0f, 3.0f,
std::numeric_limits<T>::quiet_NaN(), std::numeric_limits<T>::quiet_NaN()}};
template <typename T>
static constexpr Vector<const FMOperation<T>> qfms_vector() {
return ArrayVector(qfms_array<T>);
}
// Results are fused only when the FMA3 feature is enabled and we are running on TurboFan.
bool ExpectFused(ExecutionTier tier) {
return CpuFeatures::IsSupported(FMA3) && (tier == ExecutionTier::kTurbofan);
}
#endif // V8_TARGET_ARCH_X64
} // namespace
#define WASM_SIMD_CHECK_LANE(TYPE, value, LANE_TYPE, lane_value, lane_index) \
......@@ -367,6 +448,11 @@ int64_t LessEqual(double a, double b) { return a <= b ? -1 : 0; }
#define WASM_SIMD_STORE_MEM(index, val) \
index, val, WASM_SIMD_OP(kExprS128StoreMem), ZERO_ALIGNMENT, ZERO_OFFSET
#define WASM_SIMD_F64x2_QFMA(a, b, c) a, b, c, WASM_SIMD_OP(kExprF64x2Qfma)
#define WASM_SIMD_F64x2_QFMS(a, b, c) a, b, c, WASM_SIMD_OP(kExprF64x2Qfms)
#define WASM_SIMD_F32x4_QFMA(a, b, c) a, b, c, WASM_SIMD_OP(kExprF32x4Qfma)
#define WASM_SIMD_F32x4_QFMS(a, b, c) a, b, c, WASM_SIMD_OP(kExprF32x4Qfms)
// Runs tests of compiled code, using the interpreter as a reference.
#define WASM_SIMD_COMPILED_TEST(name) \
void RunWasm_##name##_Impl(LowerSimd lower_simd, \
......@@ -737,6 +823,56 @@ WASM_SIMD_TEST(F32x4Le) {
RunF32x4CompareOpTest(execution_tier, lower_simd, kExprF32x4Le, LessEqual);
}
#ifdef V8_TARGET_ARCH_X64
WASM_SIMD_TEST_NO_LOWERING(F32x4Qfma) {
WasmRunner<int32_t, float, float, float> r(execution_tier, lower_simd);
// Set up global to hold the output.
float* g = r.builder().AddGlobal<float>(kWasmS128);
// Build fn to splat test values, perform the qfma op, and write the result.
byte value1 = 0, value2 = 1, value3 = 2;
BUILD(r,
WASM_SET_GLOBAL(0, WASM_SIMD_F32x4_QFMA(
WASM_SIMD_F32x4_SPLAT(WASM_GET_LOCAL(value1)),
WASM_SIMD_F32x4_SPLAT(WASM_GET_LOCAL(value2)),
WASM_SIMD_F32x4_SPLAT(WASM_GET_LOCAL(value3)))),
WASM_ONE);
for (FMOperation<float> x : qfma_vector<float>()) {
r.Call(x.a, x.b, x.c);
float expected =
ExpectFused(execution_tier) ? x.fused_result : x.unfused_result;
for (int i = 0; i < 4; i++) {
float actual = ReadLittleEndianValue<float>(&g[i]);
CheckFloatResult(x.a, x.b, expected, actual, true /* exact */);
}
}
}
WASM_SIMD_TEST_NO_LOWERING(F32x4Qfms) {
WasmRunner<int32_t, float, float, float> r(execution_tier, lower_simd);
// Set up global to hold the output.
float* g = r.builder().AddGlobal<float>(kWasmS128);
// Build fn to splat test values, perform the qfms op, and write the result.
byte value1 = 0, value2 = 1, value3 = 2;
BUILD(r,
WASM_SET_GLOBAL(0, WASM_SIMD_F32x4_QFMS(
WASM_SIMD_F32x4_SPLAT(WASM_GET_LOCAL(value1)),
WASM_SIMD_F32x4_SPLAT(WASM_GET_LOCAL(value2)),
WASM_SIMD_F32x4_SPLAT(WASM_GET_LOCAL(value3)))),
WASM_ONE);
for (FMOperation<float> x : qfms_vector<float>()) {
r.Call(x.a, x.b, x.c);
float expected =
ExpectFused(execution_tier) ? x.fused_result : x.unfused_result;
for (int i = 0; i < 4; i++) {
float actual = ReadLittleEndianValue<float>(&g[i]);
CheckFloatResult(x.a, x.b, expected, actual, true /* exact */);
}
}
}
#endif // V8_TARGET_ARCH_X64
#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64
WASM_SIMD_TEST_NO_LOWERING(I64x2Splat) {
WasmRunner<int32_t, int64_t> r(execution_tier, lower_simd);
......@@ -1292,6 +1428,54 @@ WASM_SIMD_TEST_NO_LOWERING(I64x2MaxU) {
RunI64x2BinOpTest(execution_tier, lower_simd, kExprI64x2MaxU,
UnsignedMaximum);
}
WASM_SIMD_TEST_NO_LOWERING(F64x2Qfma) {
WasmRunner<int32_t, double, double, double> r(execution_tier, lower_simd);
// Set up global to hold the output.
double* g = r.builder().AddGlobal<double>(kWasmS128);
// Build fn to splat test values, perform the qfma op, and write the result.
byte value1 = 0, value2 = 1, value3 = 2;
BUILD(r,
WASM_SET_GLOBAL(0, WASM_SIMD_F64x2_QFMA(
WASM_SIMD_F64x2_SPLAT(WASM_GET_LOCAL(value1)),
WASM_SIMD_F64x2_SPLAT(WASM_GET_LOCAL(value2)),
WASM_SIMD_F64x2_SPLAT(WASM_GET_LOCAL(value3)))),
WASM_ONE);
for (FMOperation<double> x : qfma_vector<double>()) {
r.Call(x.a, x.b, x.c);
double expected =
ExpectFused(execution_tier) ? x.fused_result : x.unfused_result;
for (int i = 0; i < 2; i++) {
double actual = ReadLittleEndianValue<double>(&g[i]);
CheckDoubleResult(x.a, x.b, expected, actual, true /* exact */);
}
}
}
WASM_SIMD_TEST_NO_LOWERING(F64x2Qfms) {
WasmRunner<int32_t, double, double, double> r(execution_tier, lower_simd);
// Set up global to hold the output.
double* g = r.builder().AddGlobal<double>(kWasmS128);
// Build fn to splat test values, perform the qfms op, and write the result.
byte value1 = 0, value2 = 1, value3 = 2;
BUILD(r,
WASM_SET_GLOBAL(0, WASM_SIMD_F64x2_QFMS(
WASM_SIMD_F64x2_SPLAT(WASM_GET_LOCAL(value1)),
WASM_SIMD_F64x2_SPLAT(WASM_GET_LOCAL(value2)),
WASM_SIMD_F64x2_SPLAT(WASM_GET_LOCAL(value3)))),
WASM_ONE);
for (FMOperation<double> x : qfms_vector<double>()) {
r.Call(x.a, x.b, x.c);
double expected =
ExpectFused(execution_tier) ? x.fused_result : x.unfused_result;
for (int i = 0; i < 2; i++) {
double actual = ReadLittleEndianValue<double>(&g[i]);
CheckDoubleResult(x.a, x.b, expected, actual, true /* exact */);
}
}
}
#endif // V8_TARGET_ARCH_X64
#endif // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64
......@@ -3101,6 +3285,10 @@ WASM_SIMD_TEST_NO_LOWERING(I16x8GtUMixed) {
#undef WASM_SIMD_TEST_NO_LOWERING
#undef WASM_SIMD_ANYTRUE_TEST
#undef WASM_SIMD_ALLTRUE_TEST
#undef WASM_SIMD_F64x2_QFMA
#undef WASM_SIMD_F64x2_QFMS
#undef WASM_SIMD_F32x4_QFMA
#undef WASM_SIMD_F32x4_QFMS
} // namespace test_run_wasm_simd
} // namespace wasm
......