Commit d9381fd6 authored by Ng Zhi An, committed by Commit Bot

[wasm-simd][arm] Prototype f32x4.ceil

Prototype f32x4.ceil on ARM for both ARM v7 and ARM v8. ARM v8 has
support for vrintp; on ARM v7 we fall back to the runtime.

Since ARM v8 uses vrintp, which is the same instruction used for F32
Ceil (scalar), the wasm compiler reuses the Float32Round support check
rather than creating new F32x4Round optional operators.

This adds assembler, disassembler, and simulator support for vrintp
(the Advanced SIMD version that takes Q registers). The implementation
is incomplete for now; more will be added as other rounding modes are
implemented.
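
For reference, f32x4.ceil rounds each of the four f32 lanes up to the
nearest integer (toward +infinity), e.g. [0.5, -0.5, 1.25, -2.75]
becomes [1.0, -0.0, 2.0, -2.0].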

Bug: v8:10553
Change-Id: I4563608b9501f6f57c3a8325b17de89da7058a43
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2248779
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Reviewed-by: Jakob Kummerow <jkummerow@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#68419}
parent 0d9eb105
......@@ -3596,6 +3596,23 @@ void Assembler::vrintp(const DwVfpRegister dst, const DwVfpRegister src) {
vd * B12 | 0x5 * B9 | B8 | B6 | m * B5 | vm);
}
void Assembler::vrintp(NeonDataType dt, const QwNeonRegister dst,
const QwNeonRegister src) {
// cond=kSpecialCondition(31-28) | 00111(27-23)| D(22) | 11(21-20) |
// size(19-18) | 10(17-16) | Vd(15-12) | 01(11-10) | 7(9-7) | 1(6) | M(5) |
// 0(4) | Vm(3-0)
DCHECK(IsEnabled(ARMv8));
int vd, d;
dst.split_code(&vd, &d);
int vm, m;
src.split_code(&vm, &m);
int size = NeonSz(dt);
// Only F32 is implemented for now.
DCHECK_EQ(0x2, size);
emit(kSpecialCondition | 0x7 * B23 | d * B22 | 0x3 * B20 | size * B18 |
0x2 * B16 | vd * B12 | 0x1 * B10 | 0x7 * B7 | B6 | m * B5 | vm);
}
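
As a sanity check (not part of this CL), the bit arithmetic above can be
reproduced standalone. The register fields below follow split_code(): q0 maps
to d0 (Vd=0, D=0) and q3 maps to d6 (Vm=6, M=0), and size is 0x2 for F32
lanes. The result matches the disassembler test added later in this CL:

#include <cstdint>
#include <cstdio>

int main() {
  // Encode vrintp.f32 q0, q3 using the field layout documented above.
  const uint32_t kSpecialCondition = 0xFu << 28;
  const uint32_t d = 0, vd = 0, m = 0, vm = 6, size = 2;
  const uint32_t insn = kSpecialCondition | 0x7u << 23 | d << 22 |
                        0x3u << 20 | size << 18 | 0x2u << 16 | vd << 12 |
                        0x1u << 10 | 0x7u << 7 | 1u << 6 | m << 5 | vm;
  std::printf("%08x\n", insn);  // Prints f3ba07c6, matching the disasm test.
  return 0;
}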
void Assembler::vrintm(const SwVfpRegister dst, const SwVfpRegister src) {
// cond=kSpecialCondition(31-28) | 11101(27-23)| D(22) | 11(21-20) |
// 10(19-18) | RM=11(17-16) | Vd(15-12) | 101(11-9) | sz=0(8) | 01(7-6) |
......
......@@ -820,7 +820,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void vsqrt(const SwVfpRegister dst, const SwVfpRegister src,
const Condition cond = al);
// ARMv8 rounding instructions.
// ARMv8 rounding instructions (Scalar).
void vrinta(const SwVfpRegister dst, const SwVfpRegister src);
void vrinta(const DwVfpRegister dst, const DwVfpRegister src);
void vrintn(const SwVfpRegister dst, const SwVfpRegister src);
......@@ -908,6 +908,11 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
DwVfpRegister src2);
void vpmax(NeonDataType dt, DwVfpRegister dst, DwVfpRegister src1,
DwVfpRegister src2);
// ARMv8 rounding instructions (NEON).
void vrintp(NeonDataType dt, const QwNeonRegister dst,
const QwNeonRegister src);
void vshl(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src, int shift);
void vshl(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src,
QwNeonRegister shift);
......
......@@ -297,6 +297,7 @@ FUNCTION_REFERENCE(wasm_word32_rol, wasm::word32_rol_wrapper)
FUNCTION_REFERENCE(wasm_word32_ror, wasm::word32_ror_wrapper)
FUNCTION_REFERENCE(wasm_word64_rol, wasm::word64_rol_wrapper)
FUNCTION_REFERENCE(wasm_word64_ror, wasm::word64_ror_wrapper)
FUNCTION_REFERENCE(wasm_f32x4_ceil, wasm::f32x4_ceil_wrapper)
FUNCTION_REFERENCE(wasm_memory_init, wasm::memory_init_wrapper)
FUNCTION_REFERENCE(wasm_memory_copy, wasm::memory_copy_wrapper)
FUNCTION_REFERENCE(wasm_memory_fill, wasm::memory_fill_wrapper)
......
......@@ -206,6 +206,7 @@ class StatsCounter;
V(wasm_word64_ror, "wasm::word64_ror") \
V(wasm_word64_ctz, "wasm::word64_ctz") \
V(wasm_word64_popcnt, "wasm::word64_popcnt") \
V(wasm_f32x4_ceil, "wasm::f32x4_ceil_wrapper") \
V(wasm_memory_init, "wasm::memory_init") \
V(wasm_memory_copy, "wasm::memory_copy") \
V(wasm_memory_fill, "wasm::memory_fill") \
......
......@@ -1466,7 +1466,12 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kArmVrintpF32: {
CpuFeatureScope scope(tasm(), ARMv8);
__ vrintp(i.OutputFloatRegister(), i.InputFloatRegister(0));
if (instr->InputAt(0)->IsSimd128Register()) {
__ vrintp(NeonS32, i.OutputSimd128Register(),
i.InputSimd128Register(0));
} else {
__ vrintp(i.OutputFloatRegister(), i.InputFloatRegister(0));
}
break;
}
case kArmVrintpF64: {
......
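Note: the instruction selector below maps both the scalar Float32RoundUp and
the new F32x4Ceil to the same kArmVrintpF32 opcode, so the code generator
distinguishes the two forms by the register representation of the input
operand, as seen in the check above.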
......@@ -1495,7 +1495,8 @@ void InstructionSelector::VisitUint32Mod(Node* node) {
V(Float64RoundTruncate, kArmVrintzF64) \
V(Float64RoundTiesAway, kArmVrintaF64) \
V(Float32RoundTiesEven, kArmVrintnF32) \
V(Float64RoundTiesEven, kArmVrintnF64)
V(Float64RoundTiesEven, kArmVrintnF64) \
V(F32x4Ceil, kArmVrintpF32)
#define RRR_OP_LIST(V) \
V(Int32MulHigh, kArmSmmul) \
......
......@@ -2690,11 +2690,15 @@ void InstructionSelector::VisitF64x2Pmax(Node* node) { UNIMPLEMENTED(); }
#if !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_ARM64 && !V8_TARGET_ARCH_S390X && \
!V8_TARGET_ARCH_IA32
// TODO(v8:10553) Prototyping floating point rounding instructions.
// TODO(zhin): Temporary convoluted way to guard unimplemented opcodes on ARM as
// we are implementing them one at a time.
#if !V8_TARGET_ARCH_ARM
void InstructionSelector::VisitF32x4Ceil(Node* node) { UNIMPLEMENTED(); }
#endif // !V8_TARGET_ARCH_ARM
void InstructionSelector::VisitF64x2Ceil(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF64x2Floor(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF64x2Trunc(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF64x2NearestInt(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF32x4Ceil(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF32x4Floor(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF32x4Trunc(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF32x4NearestInt(Node* node) { UNIMPLEMENTED(); }
......
......@@ -4040,6 +4040,12 @@ Node* WasmGraphBuilder::BuildAsmjsStoreMem(MachineType type, Node* index,
return val;
}
Node* WasmGraphBuilder::BuildF32x4Ceil(Node* input) {
MachineType type = MachineType::Simd128();
ExternalReference ref = ExternalReference::wasm_f32x4_ceil();
return BuildCFuncInstruction(ref, type, input);
}
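
Roughly, BuildCFuncInstruction (an existing helper) spills the 128-bit input
to a stack slot, passes the slot's address to the C wrapper through the
external reference, and reloads the result the wrapper writes back in place;
this is the fallback path taken on ARM v7.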
void WasmGraphBuilder::PrintDebugName(Node* node) {
PrintF("#%d:%s", node->id(), node->op()->mnemonic());
}
......@@ -4281,6 +4287,9 @@ Node* WasmGraphBuilder::SimdOp(wasm::WasmOpcode opcode, Node* const* inputs) {
return graph()->NewNode(mcgraph()->machine()->F32x4Pmax(), inputs[0],
inputs[1]);
case wasm::kExprF32x4Ceil:
// Architecture support for F32x4Ceil and Float32RoundUp is the same.
if (!mcgraph()->machine()->Float32RoundUp().IsSupported())
return BuildF32x4Ceil(inputs[0]);
return graph()->NewNode(mcgraph()->machine()->F32x4Ceil(), inputs[0]);
case wasm::kExprF32x4Floor:
return graph()->NewNode(mcgraph()->machine()->F32x4Floor(), inputs[0]);
......
......@@ -553,6 +553,9 @@ class WasmGraphBuilder {
Node* BuildAsmjsLoadMem(MachineType type, Node* index);
Node* BuildAsmjsStoreMem(MachineType type, Node* index, Node* val);
// Wasm SIMD.
Node* BuildF32x4Ceil(Node* input);
void BuildEncodeException32BitValue(Node* values_array, uint32_t* index,
Node* value);
Node* BuildDecodeException32BitValue(Node* values_array, uint32_t* index);
......
......@@ -2264,6 +2264,21 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "%s.%c%i d%d, q%d", name,
type, size, Vd, Vm);
} else if (instr->Bits(17, 16) == 0x2 && instr->Bit(10) == 1) {
// vrintp
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
bool dp_op = instr->Bit(6) == 0;
int rounding_mode = instr->Bits(9, 7);
if (rounding_mode != 7) {
UNIMPLEMENTED();
}
if (dp_op) {
Format(instr, "vrintp.f32.f32 'Dd, 'Dm");
} else {
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vrintp.f32.f32 q%d, q%d", Vd, Vm);
}
} else {
int Vd, Vm;
if (instr->Bit(6) == 0) {
......
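For reference, bits 9-7 of this encoding select the rounding mode, and 0b111
is round toward +infinity (vrintp); that is why only rounding_mode == 7 is
handled here and in the simulator change below.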
......@@ -5442,6 +5442,33 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
UNIMPLEMENTED();
break;
}
} else if (instr->Bits(17, 16) == 0x2 && instr->Bit(10) == 1) {
// vrint<q>.<dt> <Dd>, <Dm>
// vrint<q>.<dt> <Qd>, <Qm>
// See F6.1.205
int regs = instr->Bit(6) + 1;
int rounding_mode = instr->Bits(9, 7);
float (*fproundint)(float) = nullptr;
switch (rounding_mode) {
case 7:
fproundint = &ceilf;
break;
default:
UNIMPLEMENTED();
}
int vm = instr->VFPMRegValue(kDoublePrecision);
int vd = instr->VFPDRegValue(kDoublePrecision);
float floats[2];
for (int r = 0; r < regs; r++) {
// We cannot simply use GetVFPSingleValue since our Q registers
// might not map to any S registers at all.
get_neon_register<float, kDoubleSize>(vm + r, floats);
for (int e = 0; e < 2; e++) {
floats[e] = canonicalizeNaN(fproundint(floats[e]));
}
set_neon_register<float, kDoubleSize>(vd + r, floats);
}
} else {
UNIMPLEMENTED();
}
......
......@@ -401,6 +401,20 @@ void float64_pow_wrapper(Address data) {
WriteUnalignedValue<double>(data, base::ieee754::pow(x, y));
}
template <typename T, T (*float_round_op)(T)>
void simd_float_round_wrapper(Address data) {
constexpr int n = kSimd128Size / sizeof(T);
for (int i = 0; i < n; i++) {
WriteUnalignedValue<T>(
data + (i * sizeof(T)),
float_round_op(ReadUnalignedValue<T>(data + (i * sizeof(T)))));
}
}
void f32x4_ceil_wrapper(Address data) {
simd_float_round_wrapper<float, &ceilf>(data);
}
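
To illustrate what the runtime fallback computes, here is a standalone sketch
(plain C++, outside V8) of the same in-place rounding over a 16-byte buffer;
kSimd128Size is 16, so n is 4 for float lanes:

#include <math.h>
#include <cstdio>
#include <cstring>

// Round each float lane of a 16-byte buffer in place, like the wrapper above.
template <typename T, T (*float_round_op)(T)>
void simd_round(unsigned char* data) {
  constexpr int n = 16 / sizeof(T);
  for (int i = 0; i < n; i++) {
    T v;
    std::memcpy(&v, data + i * sizeof(T), sizeof(T));  // unaligned-safe read
    v = float_round_op(v);
    std::memcpy(data + i * sizeof(T), &v, sizeof(T));
  }
}

int main() {
  float lanes[4] = {0.5f, -0.5f, 1.25f, -2.75f};
  simd_round<float, &ceilf>(reinterpret_cast<unsigned char*>(lanes));
  std::printf("%g %g %g %g\n", lanes[0], lanes[1], lanes[2], lanes[3]);
  // Prints: 1 -0 2 -2
  return 0;
}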
namespace {
class ThreadNotInWasmScope {
// Asan on Windows triggers exceptions to allocate shadow memory lazily. When
......
......@@ -79,6 +79,8 @@ V8_EXPORT_PRIVATE void word64_ror_wrapper(Address data);
V8_EXPORT_PRIVATE void float64_pow_wrapper(Address data);
V8_EXPORT_PRIVATE void f32x4_ceil_wrapper(Address data);
// The return type is {int32_t} instead of {bool} to force the compiler to
// zero-extend the result in the return register.
int32_t memory_init_wrapper(Address data);
......
......@@ -916,6 +916,9 @@ TEST(ARMv8_vrintX_disasm) {
COMPARE(vrintz(d0, d0), "eeb60bc0 vrintz.f64.f64 d0, d0");
COMPARE(vrintz(d2, d3, ne), "1eb62bc3 vrintzne.f64.f64 d2, d3");
// Advanced SIMD
COMPARE(vrintp(NeonS32, q0, q3), "f3ba07c6 vrintp.f32.f32 q0, q3");
}
VERIFY_RUN();
......
......@@ -692,12 +692,15 @@ WASM_SIMD_TEST(F32x4RecipSqrtApprox) {
// TODO(v8:10553) Prototyping floating-point rounding instructions.
#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_S390X || \
V8_TARGET_ARCH_IA32
V8_TARGET_ARCH_IA32 || V8_TARGET_ARCH_ARM
WASM_SIMD_TEST_NO_LOWERING(F32x4Ceil) {
FLAG_SCOPE(wasm_simd_post_mvp);
RunF32x4UnOpTest(execution_tier, lower_simd, kExprF32x4Ceil, ceilf, true);
}
// TODO(zhin): Temporary convoluted way to exclude running these tests on ARM as
// we are implementing each opcode one at a time.
#if !V8_TARGET_ARCH_ARM
WASM_SIMD_TEST_NO_LOWERING(F32x4Floor) {
FLAG_SCOPE(wasm_simd_post_mvp);
RunF32x4UnOpTest(execution_tier, lower_simd, kExprF32x4Floor, floorf, true);
......@@ -713,8 +716,9 @@ WASM_SIMD_TEST_NO_LOWERING(F32x4NearestInt) {
RunF32x4UnOpTest(execution_tier, lower_simd, kExprF32x4NearestInt, nearbyintf,
true);
}
#endif // !V8_TARGET_ARCH_ARM
#endif // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_S390X ||
// V8_TARGET_ARCH_IA32
// V8_TARGET_ARCH_IA32 || V8_TARGET_ARCH_ARM
void RunF32x4BinOpTest(ExecutionTier execution_tier, LowerSimd lower_simd,
WasmOpcode opcode, FloatBinOp expected_op) {
......