Commit d9381fd6 authored by Ng Zhi An, committed by Commit Bot

[wasm-simd][arm] Prototype f32x4.ceil

Prototype f32x4.ceil on ARM for both ARM v7 and ARM v8. ARM v8 has
support for vrintp, and for ARM v7 we fallback to runtime.

Since ARM v8 uses vrintp, which is the same instruction used for F32
Ceil (scalar), wasm-compiler reuses the Float32Round check, rather than
creating new F32x4Round optional operators.

Implementation for vrintp (Advanced SIMD version that takes Q
registers), assembler, disassembler support. Incomplete for now, but
more will be added as we add other rounding modes.

Bug: v8:10553
Change-Id: I4563608b9501f6f57c3a8325b17de89da7058a43
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2248779
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Reviewed-by: Jakob Kummerow <jkummerow@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#68419}
parent 0d9eb105
...@@ -3596,6 +3596,23 @@ void Assembler::vrintp(const DwVfpRegister dst, const DwVfpRegister src) { ...@@ -3596,6 +3596,23 @@ void Assembler::vrintp(const DwVfpRegister dst, const DwVfpRegister src) {
vd * B12 | 0x5 * B9 | B8 | B6 | m * B5 | vm); vd * B12 | 0x5 * B9 | B8 | B6 | m * B5 | vm);
} }
// Advanced SIMD VRINTP: round each lane of the Q-register |src| toward
// +infinity (ceil) and write the result to |dst|. Requires ARMv8
// (kSpecialCondition encoding space); currently only the F32 lane type
// is supported.
void Assembler::vrintp(NeonDataType dt, const QwNeonRegister dst,
                       const QwNeonRegister src) {
  // cond=kSpecialCondition(31-28) | 00111(27-23)| D(22) | 11(21-20) |
  // size(19-18) | 10(17-16) | Vd(15-12) | 01(11-10) | 7(9-7) | 1(6) | M(5) |
  // 0(4) | Vm(3-0)
  DCHECK(IsEnabled(ARMv8));
  int vd, d;
  dst.split_code(&vd, &d);
  int vm, m;
  src.split_code(&vm, &m);
  // size encodes the lane width; 0x2 selects 32-bit lanes.
  int size = NeonSz(dt);
  // Only F32 is implemented for now.
  DCHECK_EQ(0x2, dt);
  emit(kSpecialCondition | 0x7 * B23 | d * B22 | 0x3 * B20 | size * B18 |
       0x2 * B16 | vd * B12 | 0x1 * B10 | 0x7 * B7 | B6 | m * B5 | vm);
}
void Assembler::vrintm(const SwVfpRegister dst, const SwVfpRegister src) { void Assembler::vrintm(const SwVfpRegister dst, const SwVfpRegister src) {
// cond=kSpecialCondition(31-28) | 11101(27-23)| D(22) | 11(21-20) | // cond=kSpecialCondition(31-28) | 11101(27-23)| D(22) | 11(21-20) |
// 10(19-18) | RM=11(17-16) | Vd(15-12) | 101(11-9) | sz=0(8) | 01(7-6) | // 10(19-18) | RM=11(17-16) | Vd(15-12) | 101(11-9) | sz=0(8) | 01(7-6) |
......
...@@ -820,7 +820,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase { ...@@ -820,7 +820,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void vsqrt(const SwVfpRegister dst, const SwVfpRegister src, void vsqrt(const SwVfpRegister dst, const SwVfpRegister src,
const Condition cond = al); const Condition cond = al);
// ARMv8 rounding instructions. // ARMv8 rounding instructions (Scalar).
void vrinta(const SwVfpRegister dst, const SwVfpRegister src); void vrinta(const SwVfpRegister dst, const SwVfpRegister src);
void vrinta(const DwVfpRegister dst, const DwVfpRegister src); void vrinta(const DwVfpRegister dst, const DwVfpRegister src);
void vrintn(const SwVfpRegister dst, const SwVfpRegister src); void vrintn(const SwVfpRegister dst, const SwVfpRegister src);
...@@ -908,6 +908,11 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase { ...@@ -908,6 +908,11 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
DwVfpRegister src2); DwVfpRegister src2);
void vpmax(NeonDataType dt, DwVfpRegister dst, DwVfpRegister src1, void vpmax(NeonDataType dt, DwVfpRegister dst, DwVfpRegister src1,
DwVfpRegister src2); DwVfpRegister src2);
// ARMv8 rounding instructions (NEON).
void vrintp(NeonDataType dt, const QwNeonRegister dst,
const QwNeonRegister src);
void vshl(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src, int shift); void vshl(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src, int shift);
void vshl(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src, void vshl(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src,
QwNeonRegister shift); QwNeonRegister shift);
......
...@@ -297,6 +297,7 @@ FUNCTION_REFERENCE(wasm_word32_rol, wasm::word32_rol_wrapper) ...@@ -297,6 +297,7 @@ FUNCTION_REFERENCE(wasm_word32_rol, wasm::word32_rol_wrapper)
FUNCTION_REFERENCE(wasm_word32_ror, wasm::word32_ror_wrapper) FUNCTION_REFERENCE(wasm_word32_ror, wasm::word32_ror_wrapper)
FUNCTION_REFERENCE(wasm_word64_rol, wasm::word64_rol_wrapper) FUNCTION_REFERENCE(wasm_word64_rol, wasm::word64_rol_wrapper)
FUNCTION_REFERENCE(wasm_word64_ror, wasm::word64_ror_wrapper) FUNCTION_REFERENCE(wasm_word64_ror, wasm::word64_ror_wrapper)
FUNCTION_REFERENCE(wasm_f32x4_ceil, wasm::f32x4_ceil_wrapper)
FUNCTION_REFERENCE(wasm_memory_init, wasm::memory_init_wrapper) FUNCTION_REFERENCE(wasm_memory_init, wasm::memory_init_wrapper)
FUNCTION_REFERENCE(wasm_memory_copy, wasm::memory_copy_wrapper) FUNCTION_REFERENCE(wasm_memory_copy, wasm::memory_copy_wrapper)
FUNCTION_REFERENCE(wasm_memory_fill, wasm::memory_fill_wrapper) FUNCTION_REFERENCE(wasm_memory_fill, wasm::memory_fill_wrapper)
......
...@@ -206,6 +206,7 @@ class StatsCounter; ...@@ -206,6 +206,7 @@ class StatsCounter;
V(wasm_word64_ror, "wasm::word64_ror") \ V(wasm_word64_ror, "wasm::word64_ror") \
V(wasm_word64_ctz, "wasm::word64_ctz") \ V(wasm_word64_ctz, "wasm::word64_ctz") \
V(wasm_word64_popcnt, "wasm::word64_popcnt") \ V(wasm_word64_popcnt, "wasm::word64_popcnt") \
V(wasm_f32x4_ceil, "wasm::f32x4_ceil_wrapper") \
V(wasm_memory_init, "wasm::memory_init") \ V(wasm_memory_init, "wasm::memory_init") \
V(wasm_memory_copy, "wasm::memory_copy") \ V(wasm_memory_copy, "wasm::memory_copy") \
V(wasm_memory_fill, "wasm::memory_fill") \ V(wasm_memory_fill, "wasm::memory_fill") \
......
...@@ -1466,7 +1466,12 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction( ...@@ -1466,7 +1466,12 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
} }
case kArmVrintpF32: { case kArmVrintpF32: {
CpuFeatureScope scope(tasm(), ARMv8); CpuFeatureScope scope(tasm(), ARMv8);
__ vrintp(i.OutputFloatRegister(), i.InputFloatRegister(0)); if (instr->InputAt(0)->IsSimd128Register()) {
__ vrintp(NeonS32, i.OutputSimd128Register(),
i.InputSimd128Register(0));
} else {
__ vrintp(i.OutputFloatRegister(), i.InputFloatRegister(0));
}
break; break;
} }
case kArmVrintpF64: { case kArmVrintpF64: {
......
...@@ -1495,7 +1495,8 @@ void InstructionSelector::VisitUint32Mod(Node* node) { ...@@ -1495,7 +1495,8 @@ void InstructionSelector::VisitUint32Mod(Node* node) {
V(Float64RoundTruncate, kArmVrintzF64) \ V(Float64RoundTruncate, kArmVrintzF64) \
V(Float64RoundTiesAway, kArmVrintaF64) \ V(Float64RoundTiesAway, kArmVrintaF64) \
V(Float32RoundTiesEven, kArmVrintnF32) \ V(Float32RoundTiesEven, kArmVrintnF32) \
V(Float64RoundTiesEven, kArmVrintnF64) V(Float64RoundTiesEven, kArmVrintnF64) \
V(F32x4Ceil, kArmVrintpF32)
#define RRR_OP_LIST(V) \ #define RRR_OP_LIST(V) \
V(Int32MulHigh, kArmSmmul) \ V(Int32MulHigh, kArmSmmul) \
......
...@@ -2690,11 +2690,15 @@ void InstructionSelector::VisitF64x2Pmax(Node* node) { UNIMPLEMENTED(); } ...@@ -2690,11 +2690,15 @@ void InstructionSelector::VisitF64x2Pmax(Node* node) { UNIMPLEMENTED(); }
#if !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_S390X && \ #if !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_S390X && \
!V8_TARGET_ARCH_IA32 !V8_TARGET_ARCH_IA32
// TODO(v8:10553) Prototyping floating point rounding instructions. // TODO(v8:10553) Prototyping floating point rounding instructions.
// TODO(zhin): Temporary convoluted way to for unimplemented opcodes on ARM as
// we are implementing them one at a time.
#if !V8_TARGET_ARCH_ARM
void InstructionSelector::VisitF32x4Ceil(Node* node) { UNIMPLEMENTED(); }
#endif // !V8_TARGET_ARCH_ARM
void InstructionSelector::VisitF64x2Ceil(Node* node) { UNIMPLEMENTED(); } void InstructionSelector::VisitF64x2Ceil(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF64x2Floor(Node* node) { UNIMPLEMENTED(); } void InstructionSelector::VisitF64x2Floor(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF64x2Trunc(Node* node) { UNIMPLEMENTED(); } void InstructionSelector::VisitF64x2Trunc(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF64x2NearestInt(Node* node) { UNIMPLEMENTED(); } void InstructionSelector::VisitF64x2NearestInt(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF32x4Ceil(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF32x4Floor(Node* node) { UNIMPLEMENTED(); } void InstructionSelector::VisitF32x4Floor(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF32x4Trunc(Node* node) { UNIMPLEMENTED(); } void InstructionSelector::VisitF32x4Trunc(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF32x4NearestInt(Node* node) { UNIMPLEMENTED(); } void InstructionSelector::VisitF32x4NearestInt(Node* node) { UNIMPLEMENTED(); }
......
...@@ -4040,6 +4040,12 @@ Node* WasmGraphBuilder::BuildAsmjsStoreMem(MachineType type, Node* index, ...@@ -4040,6 +4040,12 @@ Node* WasmGraphBuilder::BuildAsmjsStoreMem(MachineType type, Node* index,
return val; return val;
} }
// Lowers f32x4.ceil via a call to the C runtime wrapper; used when the
// target has no direct instruction support for SIMD rounding.
Node* WasmGraphBuilder::BuildF32x4Ceil(Node* input) {
  return BuildCFuncInstruction(ExternalReference::wasm_f32x4_ceil(),
                               MachineType::Simd128(), input);
}
void WasmGraphBuilder::PrintDebugName(Node* node) { void WasmGraphBuilder::PrintDebugName(Node* node) {
PrintF("#%d:%s", node->id(), node->op()->mnemonic()); PrintF("#%d:%s", node->id(), node->op()->mnemonic());
} }
...@@ -4281,6 +4287,9 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode, Node* const* inputs) { ...@@ -4281,6 +4287,9 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode, Node* const* inputs) {
return graph()->NewNode(mcgraph()->machine()->F32x4Pmax(), inputs[0], return graph()->NewNode(mcgraph()->machine()->F32x4Pmax(), inputs[0],
inputs[1]); inputs[1]);
case wasm::kExprF32x4Ceil: case wasm::kExprF32x4Ceil:
// Architecture support for F32x4Ceil and Float32RoundUp is the same.
if (!mcgraph()->machine()->Float32RoundUp().IsSupported())
return BuildF32x4Ceil(inputs[0]);
return graph()->NewNode(mcgraph()->machine()->F32x4Ceil(), inputs[0]); return graph()->NewNode(mcgraph()->machine()->F32x4Ceil(), inputs[0]);
case wasm::kExprF32x4Floor: case wasm::kExprF32x4Floor:
return graph()->NewNode(mcgraph()->machine()->F32x4Floor(), inputs[0]); return graph()->NewNode(mcgraph()->machine()->F32x4Floor(), inputs[0]);
......
...@@ -553,6 +553,9 @@ class WasmGraphBuilder { ...@@ -553,6 +553,9 @@ class WasmGraphBuilder {
Node* BuildAsmjsLoadMem(MachineType type, Node* index); Node* BuildAsmjsLoadMem(MachineType type, Node* index);
Node* BuildAsmjsStoreMem(MachineType type, Node* index, Node* val); Node* BuildAsmjsStoreMem(MachineType type, Node* index, Node* val);
// Wasm SIMD.
Node* BuildF32x4Ceil(Node* input);
void BuildEncodeException32BitValue(Node* values_array, uint32_t* index, void BuildEncodeException32BitValue(Node* values_array, uint32_t* index,
Node* value); Node* value);
Node* BuildDecodeException32BitValue(Node* values_array, uint32_t* index); Node* BuildDecodeException32BitValue(Node* values_array, uint32_t* index);
......
...@@ -2264,6 +2264,21 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) { ...@@ -2264,6 +2264,21 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
out_buffer_pos_ += out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "%s.%c%i d%d, q%d", name, SNPrintF(out_buffer_ + out_buffer_pos_, "%s.%c%i d%d, q%d", name,
type, size, Vd, Vm); type, size, Vd, Vm);
} else if (instr->Bits(17, 16) == 0x2 && instr->Bit(10) == 1) {
// vrintp
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
bool dp_op = instr->Bit(6) == 0;
int rounding_mode = instr->Bits(9, 7);
if (rounding_mode != 7) {
UNIMPLEMENTED();
}
if (dp_op) {
Format(instr, "vrintp.f32.f32 'Dd, 'Dm");
} else {
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vrintp.f32.f32 q%d, q%d", Vd, Vm);
}
} else { } else {
int Vd, Vm; int Vd, Vm;
if (instr->Bit(6) == 0) { if (instr->Bit(6) == 0) {
......
...@@ -5442,6 +5442,33 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) { ...@@ -5442,6 +5442,33 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
UNIMPLEMENTED(); UNIMPLEMENTED();
break; break;
} }
} else if (instr->Bits(17, 16) == 0x2 && instr->Bit(10) == 1) {
// vrint<q>.<dt> <Dd>, <Dm>
// vrint<q>.<dt> <Qd>, <Qm>
// See F6.1.205
int regs = instr->Bit(6) + 1;
int rounding_mode = instr->Bits(9, 7);
float (*fproundint)(float) = nullptr;
switch (rounding_mode) {
case 7:
fproundint = &ceilf;
break;
default:
UNIMPLEMENTED();
}
int vm = instr->VFPMRegValue(kDoublePrecision);
int vd = instr->VFPDRegValue(kDoublePrecision);
float floats[2];
for (int r = 0; r < regs; r++) {
// We cannot simply use GetVFPSingleValue since our Q registers
// might not map to any S registers at all.
get_neon_register<float, kDoubleSize>(vm + r, floats);
for (int e = 0; e < 2; e++) {
floats[e] = canonicalizeNaN(fproundint(floats[e]));
}
set_neon_register<float, kDoubleSize>(vd + r, floats);
}
} else { } else {
UNIMPLEMENTED(); UNIMPLEMENTED();
} }
......
...@@ -401,6 +401,20 @@ void float64_pow_wrapper(Address data) { ...@@ -401,6 +401,20 @@ void float64_pow_wrapper(Address data) {
WriteUnalignedValue<double>(data, base::ieee754::pow(x, y)); WriteUnalignedValue<double>(data, base::ieee754::pow(x, y));
} }
// Applies |float_round_op| independently to every T-typed lane of the
// 128-bit SIMD value stored (possibly unaligned) at |data|, writing the
// rounded lanes back in place.
template <typename T, T (*float_round_op)(T)>
void simd_float_round_wrapper(Address data) {
  constexpr int kLaneCount = kSimd128Size / sizeof(T);
  for (int lane = 0; lane < kLaneCount; ++lane) {
    const Address lane_addr = data + lane * sizeof(T);
    const T rounded = float_round_op(ReadUnalignedValue<T>(lane_addr));
    WriteUnalignedValue<T>(lane_addr, rounded);
  }
}
// Runtime fallback for f32x4.ceil: rounds each of the four f32 lanes at
// |data| up to the nearest integral value using the C library's ceilf.
void f32x4_ceil_wrapper(Address data) {
  simd_float_round_wrapper<float, &ceilf>(data);
}
namespace { namespace {
class ThreadNotInWasmScope { class ThreadNotInWasmScope {
// Asan on Windows triggers exceptions to allocate shadow memory lazily. When // Asan on Windows triggers exceptions to allocate shadow memory lazily. When
......
...@@ -79,6 +79,8 @@ V8_EXPORT_PRIVATE void word64_ror_wrapper(Address data); ...@@ -79,6 +79,8 @@ V8_EXPORT_PRIVATE void word64_ror_wrapper(Address data);
V8_EXPORT_PRIVATE void float64_pow_wrapper(Address data); V8_EXPORT_PRIVATE void float64_pow_wrapper(Address data);
V8_EXPORT_PRIVATE void f32x4_ceil_wrapper(Address data);
// The return type is {int32_t} instead of {bool} to enforce the compiler to // The return type is {int32_t} instead of {bool} to enforce the compiler to
// zero-extend the result in the return register. // zero-extend the result in the return register.
int32_t memory_init_wrapper(Address data); int32_t memory_init_wrapper(Address data);
......
...@@ -916,6 +916,9 @@ TEST(ARMv8_vrintX_disasm) { ...@@ -916,6 +916,9 @@ TEST(ARMv8_vrintX_disasm) {
COMPARE(vrintz(d0, d0), "eeb60bc0 vrintz.f64.f64 d0, d0"); COMPARE(vrintz(d0, d0), "eeb60bc0 vrintz.f64.f64 d0, d0");
COMPARE(vrintz(d2, d3, ne), "1eb62bc3 vrintzne.f64.f64 d2, d3"); COMPARE(vrintz(d2, d3, ne), "1eb62bc3 vrintzne.f64.f64 d2, d3");
// Advanced SIMD
COMPARE(vrintp(NeonS32, q0, q3), "f3ba07c6 vrintp.f32.f32 q0, q3");
} }
VERIFY_RUN(); VERIFY_RUN();
......
...@@ -692,12 +692,15 @@ WASM_SIMD_TEST(F32x4RecipSqrtApprox) { ...@@ -692,12 +692,15 @@ WASM_SIMD_TEST(F32x4RecipSqrtApprox) {
// TODO(v8:10553) Prototyping floating-point rounding instructions. // TODO(v8:10553) Prototyping floating-point rounding instructions.
#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_S390X || \ #if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_S390X || \
V8_TARGET_ARCH_IA32 V8_TARGET_ARCH_IA32 || V8_TARGET_ARCH_ARM
WASM_SIMD_TEST_NO_LOWERING(F32x4Ceil) { WASM_SIMD_TEST_NO_LOWERING(F32x4Ceil) {
FLAG_SCOPE(wasm_simd_post_mvp); FLAG_SCOPE(wasm_simd_post_mvp);
RunF32x4UnOpTest(execution_tier, lower_simd, kExprF32x4Ceil, ceilf, true); RunF32x4UnOpTest(execution_tier, lower_simd, kExprF32x4Ceil, ceilf, true);
} }
// TODO(zhin): Temporary convoluted way to exclude running these tests on ARM as
// we are implementing each opcode one at a time.
#if !V8_TARGET_ARCH_ARM
WASM_SIMD_TEST_NO_LOWERING(F32x4Floor) { WASM_SIMD_TEST_NO_LOWERING(F32x4Floor) {
FLAG_SCOPE(wasm_simd_post_mvp); FLAG_SCOPE(wasm_simd_post_mvp);
RunF32x4UnOpTest(execution_tier, lower_simd, kExprF32x4Floor, floorf, true); RunF32x4UnOpTest(execution_tier, lower_simd, kExprF32x4Floor, floorf, true);
...@@ -713,8 +716,9 @@ WASM_SIMD_TEST_NO_LOWERING(F32x4NearestInt) { ...@@ -713,8 +716,9 @@ WASM_SIMD_TEST_NO_LOWERING(F32x4NearestInt) {
RunF32x4UnOpTest(execution_tier, lower_simd, kExprF32x4NearestInt, nearbyintf, RunF32x4UnOpTest(execution_tier, lower_simd, kExprF32x4NearestInt, nearbyintf,
true); true);
} }
#endif // !V8_TARGET_ARCH_ARM
#endif // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_S390X || #endif // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_S390X ||
// V8_TARGET_ARCH_IA32 // V8_TARGET_ARCH_IA32 || V8_TARGET_ARCH_ARM
void RunF32x4BinOpTest(ExecutionTier execution_tier, LowerSimd lower_simd, void RunF32x4BinOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
WasmOpcode opcode, FloatBinOp expected_op) { WasmOpcode opcode, FloatBinOp expected_op) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment