Commit 996aadbd authored by Zhi An Ng, committed by Commit Bot

[wasm-simd][arm] Prototype load lane and store lane

Prototype v128.{load,store}{8,16,32,64}_lane on arm.
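For context, the lane semantics being prototyped, as a standalone C++ sketch (not part of this CL; helper names are illustrative): a lane load replaces one lane of a 128-bit value with data read from memory and leaves every other lane intact; a lane store writes one lane out to memory.

#include <cstdint>
#include <cstring>

// Reference semantics of v128.load16_lane (illustrative, little-endian lane
// order as in wasm): replace lane `idx` with a 16-bit value read from `mem`.
void Load16Lane(uint8_t v128[16], const void* mem, int idx) {
  uint16_t lane;
  std::memcpy(&lane, mem, sizeof(lane));             // unaligned-safe load
  std::memcpy(v128 + idx * 2, &lane, sizeof(lane));  // touch only lane idx
}

// Reference semantics of v128.store16_lane: write lane `idx` to memory.
void Store16Lane(const uint8_t v128[16], void* mem, int idx) {
  std::memcpy(mem, v128 + idx * 2, 2);
}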

Bug: v8:10975
Change-Id: I649f567f39f8a5ba6992a86b761f93f62619c139
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2565079
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Reviewed-by: Jakob Kummerow <jkummerow@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71628}
parent 7e67c9a8
......@@ -3749,6 +3749,22 @@ void Assembler::vst1(NeonSize size, const NeonListOperand& src,
src.type() * B8 | size * B6 | dst.align() * B4 | dst.rm().code());
}
void Assembler::vst1s(NeonSize size, const NeonListOperand& src, uint8_t index,
const NeonMemOperand& dst) {
// Instruction details available in ARM DDI 0487F.b F6.1.236.
// 1111(31-28) | 01001(27-23) | D(22) | 00(21-20) | Rn(19-16) |
// Vd(15-12) | size(11-10) | 00(9-8) | index_align(7-4) | Rm(3-0)
DCHECK(IsEnabled(NEON));
DCHECK_NE(size, 0x3);
DCHECK_GT(1 << (3 - size), index);
// Specifying alignment not supported, use standard alignment.
uint8_t index_align = index << (size + 1);
int vd, d;
src.base().split_code(&vd, &d);
emit(0xFU * B28 | 9 * B23 | d * B22 | dst.rn().code() * B16 | vd * B12 |
size * B10 | index_align * B4 | dst.rm().code());
}
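A worked check of the index_align computation above (standalone sketch, not part of this CL): with standard alignment the lane index is shifted left by size + 1, so bits 7-4 carry the index with zero alignment bits, matching the disassembler tests added further down.

#include <cassert>
#include <cstdint>

// Mirrors the index_align computation in vst1s; size is the NeonSize value
// (Neon8 = 0, Neon16 = 1, Neon32 = 2).
uint8_t IndexAlign(int size, uint8_t index) { return index << (size + 1); }

int main() {
  assert(IndexAlign(0, 1) == 0b0010);  // vst1.8  {d4[1]}: the 2 in f481402f
  assert(IndexAlign(1, 2) == 0b1000);  // vst1.16 {d4[2]}: the 8 in f481448f
  assert(IndexAlign(2, 0) == 0b0000);  // vst1.32 {d4[0]}: the 0 in f481480f
  return 0;
}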
void Assembler::vmovl(NeonDataType dt, QwNeonRegister dst, DwVfpRegister src) {
// Instruction details available in ARM DDI 0406C.b, A8.8.346.
// 1111(31-28) | 001(27-25) | U(24) | 1(23) | D(22) | imm3(21-19) |
......
......@@ -846,6 +846,9 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
const NeonMemOperand& src);
void vst1(NeonSize size, const NeonListOperand& src,
const NeonMemOperand& dst);
// vst1s (single element from one lane).
void vst1s(NeonSize size, const NeonListOperand& src, uint8_t index,
const NeonMemOperand& dst);
// dt represents the narrower type
void vmovl(NeonDataType dt, QwNeonRegister dst, DwVfpRegister src);
// dst_dt represents the narrower type, src_dt represents the src type.
......
......@@ -1162,6 +1162,26 @@ void TurboAssembler::ReplaceLane(QwNeonRegister dst, QwNeonRegister src,
vmov(double_dst, src_lane);
}
void TurboAssembler::LoadLane(NeonSize sz, NeonListOperand dst_list,
uint8_t lane, NeonMemOperand src) {
if (sz == Neon64) {
// vld1s is not valid for Neon64.
vld1(Neon64, dst_list, src);
} else {
vld1s(sz, dst_list, lane, src);
}
}
void TurboAssembler::StoreLane(NeonSize sz, NeonListOperand src_list,
uint8_t lane, NeonMemOperand dst) {
if (sz == Neon64) {
// vst1s is not valid for Neon64.
vst1(Neon64, src_list, dst);
} else {
vst1s(sz, src_list, lane, dst);
}
}
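Usage sketch for these helpers (operands illustrative, wrapped in an assumed free function, not part of this CL): 8/16/32-bit lanes dispatch to vld1s/vst1s, while a 64-bit lane is an entire D register, so the Neon64 case falls back to a plain vld1/vst1 of whichever D register the caller selected.

// Hypothetical helper showing the dispatch.
void StoreLaneExamples(TurboAssembler* tasm) {
  // 16-bit lane 2 of d4 -> [r1]; emits vst1s, i.e. vst1.16 {d4[2]}, [r1].
  tasm->StoreLane(Neon16, NeonListOperand(d4), 2, NeonMemOperand(r1));
  // 64-bit lane: the caller already picked the low/high D register, so this
  // emits a whole-register vst1 of d4.
  tasm->StoreLane(Neon64, NeonListOperand(d4), 0, NeonMemOperand(r1));
}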
void TurboAssembler::LslPair(Register dst_low, Register dst_high,
Register src_low, Register src_high,
Register shift) {
......
......@@ -463,6 +463,11 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
void ReplaceLane(QwNeonRegister dst, QwNeonRegister src,
DwVfpRegister src_lane, int lane);
void LoadLane(NeonSize sz, NeonListOperand dst_list, uint8_t lane,
NeonMemOperand src);
void StoreLane(NeonSize sz, NeonListOperand src_list, uint8_t lane,
NeonMemOperand dst);
// Register move. May do nothing if the registers are identical.
void Move(Register dst, Smi smi);
void Move(Register dst, Handle<HeapObject> value);
......
......@@ -2,6 +2,7 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "src/codegen/arm/constants-arm.h"
#include "src/codegen/assembler-inl.h"
#include "src/codegen/macro-assembler.h"
#include "src/codegen/optimized-compilation-info.h"
......@@ -3299,6 +3300,36 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ vld1(Neon64, NeonListOperand(dst.low()), i.NeonInputOperand(0));
break;
}
case kArmS128LoadLaneLow: {
Simd128Register dst = i.OutputSimd128Register();
DCHECK_EQ(dst, i.InputSimd128Register(0));
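// The instruction selector defines the output same-as-first input (see
// DefineSameAsFirst in VisitLoadLane below), so vld1s overwrites only the
// addressed lane and the remaining lanes of dst pass through unchanged.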
auto sz = static_cast<NeonSize>(MiscField::decode(instr->opcode()));
NeonListOperand dst_list = NeonListOperand(dst.low());
__ LoadLane(sz, dst_list, i.InputUint8(1), i.NeonInputOperand(2));
break;
}
case kArmS128LoadLaneHigh: {
Simd128Register dst = i.OutputSimd128Register();
DCHECK_EQ(dst, i.InputSimd128Register(0));
auto sz = static_cast<NeonSize>(MiscField::decode(instr->opcode()));
NeonListOperand dst_list = NeonListOperand(dst.high());
__ LoadLane(sz, dst_list, i.InputUint8(1), i.NeonInputOperand(2));
break;
}
case kArmS128StoreLaneLow: {
Simd128Register src = i.InputSimd128Register(0);
NeonListOperand src_list = NeonListOperand(src.low());
auto sz = static_cast<NeonSize>(MiscField::decode(instr->opcode()));
__ StoreLane(sz, src_list, i.InputUint8(1), i.NeonInputOperand(2));
break;
}
case kArmS128StoreLaneHigh: {
Simd128Register src = i.InputSimd128Register(0);
NeonListOperand src_list = NeonListOperand(src.high());
auto sz = static_cast<NeonSize>(MiscField::decode(instr->opcode()));
__ StoreLane(sz, src_list, i.InputUint8(1), i.NeonInputOperand(2));
break;
}
case kWord32AtomicLoadInt8:
ASSEMBLE_ATOMIC_LOAD_INTEGER(ldrsb);
break;
......
......@@ -335,6 +335,10 @@ namespace compiler {
V(ArmS128Load32x2U) \
V(ArmS128Load32Zero) \
V(ArmS128Load64Zero) \
V(ArmS128LoadLaneLow) \
V(ArmS128LoadLaneHigh) \
V(ArmS128StoreLaneLow) \
V(ArmS128StoreLaneHigh) \
V(ArmWord32AtomicPairLoad) \
V(ArmWord32AtomicPairStore) \
V(ArmWord32AtomicPairAdd) \
......
......@@ -328,6 +328,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArmS128Load32x2U:
case kArmS128Load32Zero:
case kArmS128Load64Zero:
case kArmS128LoadLaneLow:
case kArmS128LoadLaneHigh:
return kIsLoadOperation;
case kArmVstrF32:
......@@ -349,6 +351,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArmWord32AtomicPairXor:
case kArmWord32AtomicPairExchange:
case kArmWord32AtomicPairCompareExchange:
case kArmS128StoreLaneLow:
case kArmS128StoreLaneHigh:
return kHasSideEffect;
#define CASE(Name) case k##Name:
......
......@@ -6,6 +6,7 @@
#include "src/base/enum-set.h"
#include "src/base/iterator.h"
#include "src/base/platform/wrappers.h"
#include "src/codegen/machine-type.h"
#include "src/compiler/backend/instruction-selector-impl.h"
#include "src/compiler/node-matchers.h"
#include "src/compiler/node-properties.h"
......@@ -500,6 +501,76 @@ void InstructionSelector::VisitAbortCSAAssert(Node* node) {
Emit(kArchAbortCSAAssert, g.NoOutput(), g.UseFixed(node->InputAt(0), r1));
}
namespace {
// Helper struct for load lane and store lane: indicates which opcode to use,
// what memory size to encode in the opcode, and the remapped lane index.
struct LoadStoreLaneParams {
bool low_op;
NeonSize sz;
uint8_t laneidx;
LoadStoreLaneParams(uint8_t laneidx, NeonSize sz, int lanes)
: low_op(laneidx < lanes), sz(sz), laneidx(laneidx % lanes) {}
};
// The register mapping on ARM (1 Q register aliases 2 D registers) means that
// loading/storing the high lanes of a Q register is equivalent to
// loading/storing the high D register, with the lane index taken modulo the
// number of lanes in a D register. This function decides, based on the laneidx
// and load/store size, whether the low or high D register is accessed, and
// what the new lane index is.
LoadStoreLaneParams GetLoadStoreLaneParams(MachineRepresentation rep,
uint8_t laneidx) {
if (rep == MachineRepresentation::kWord8) {
return LoadStoreLaneParams(laneidx, Neon8, 8);
} else if (rep == MachineRepresentation::kWord16) {
return LoadStoreLaneParams(laneidx, Neon16, 4);
} else if (rep == MachineRepresentation::kWord32) {
return LoadStoreLaneParams(laneidx, Neon32, 2);
} else if (rep == MachineRepresentation::kWord64) {
return LoadStoreLaneParams(laneidx, Neon64, 1);
} else {
UNREACHABLE();
}
}
} // namespace
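To make the remapping concrete, a worked example using the helper above (sketch, not part of the CL): a D register holds four 16-bit lanes, so Q-register lane 5 of a 16-bit access resolves to the high D register at lane 1.

LoadStoreLaneParams p =
    GetLoadStoreLaneParams(MachineRepresentation::kWord16, 5);
// p.low_op  == false      -> kArmS128{Load,Store}LaneHigh (high D register)
// p.sz      == Neon16
// p.laneidx == 5 % 4 == 1 -> lane index within that D register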
void InstructionSelector::VisitStoreLane(Node* node) {
StoreLaneParameters params = StoreLaneParametersOf(node->op());
LoadStoreLaneParams f = GetLoadStoreLaneParams(params.rep, params.laneidx);
InstructionCode opcode =
f.low_op ? kArmS128StoreLaneLow : kArmS128StoreLaneHigh;
opcode |= MiscField::encode(f.sz);
ArmOperandGenerator g(this);
InstructionOperand inputs[4];
size_t input_count = 4;
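// The store node's inputs are (base, index, value): the vector to store goes
// in inputs[0], the remapped lane immediate in inputs[1], and the two address
// operands last, which EmitAddBeforeS128LoadStore may fold into a single
// register by emitting an add first.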
inputs[0] = g.UseRegister(node->InputAt(2));
inputs[1] = g.UseImmediate(f.laneidx);
inputs[2] = g.UseRegister(node->InputAt(0));
inputs[3] = g.UseRegister(node->InputAt(1));
EmitAddBeforeS128LoadStore(this, &opcode, &input_count, &inputs[2]);
Emit(opcode, 0, nullptr, input_count, inputs);
}
void InstructionSelector::VisitLoadLane(Node* node) {
LoadLaneParameters params = LoadLaneParametersOf(node->op());
LoadStoreLaneParams f =
GetLoadStoreLaneParams(params.rep.representation(), params.laneidx);
InstructionCode opcode =
f.low_op ? kArmS128LoadLaneLow : kArmS128LoadLaneHigh;
opcode |= MiscField::encode(f.sz);
ArmOperandGenerator g(this);
InstructionOperand output = g.DefineSameAsFirst(node);
InstructionOperand inputs[4];
size_t input_count = 4;
inputs[0] = g.UseRegister(node->InputAt(2));
inputs[1] = g.UseImmediate(f.laneidx);
inputs[2] = g.UseRegister(node->InputAt(0));
inputs[3] = g.UseRegister(node->InputAt(1));
EmitAddBeforeS128LoadStore(this, &opcode, &input_count, &inputs[2]);
Emit(opcode, 1, &output, input_count, inputs);
}
void InstructionSelector::VisitLoadTransform(Node* node) {
LoadTransformParameters params = LoadTransformParametersOf(node->op());
InstructionCode opcode = kArchNop;
......
......@@ -2820,11 +2820,13 @@ void InstructionSelector::VisitPrefetchNonTemporal(Node* node) {
}
#endif // !V8_TARGET_ARCH_ARM64
-#if !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_IA32 && !V8_TARGET_ARCH_ARM64
+#if !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_IA32 && !V8_TARGET_ARCH_ARM64 && \
+    !V8_TARGET_ARCH_ARM
// TODO(v8:10975): Prototyping load lane and store lane.
void InstructionSelector::VisitLoadLane(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitStoreLane(Node* node) { UNIMPLEMENTED(); }
#endif // !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_IA32 && !V8_TARGET_ARCH_ARM64
// && !V8_TARGET_ARCH_ARM
#if !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_IA32
// TODO(v8:10983) Prototyping sign select.
......
......@@ -2501,14 +2501,15 @@ void Decoder::DecodeAdvancedSIMDElementOrStructureLoadStore(
} else if (op1 != 0b11) {
// Advanced SIMD load/store single structure to one lane.
int size = op1; // size and op1 occupy the same bits in decoding.
- if (l && n == 0b00) {
- // VLD1 (single element to one lane) - A1, A2, A3
- int index_align = instr->Bits(7, 4);
- int index = index_align >> (size + 1);
+ int index_align = instr->Bits(7, 4);
+ int index = index_align >> (size + 1);
+ if (n == 0b00) {
+ // vld1 (single element to one lane) - A1, A2, A3.
+ // vst1 (single element from one lane) - A1, A2, A3.
// Omit alignment.
out_buffer_pos_ +=
- SNPrintF(out_buffer_ + out_buffer_pos_, "vld1.%d {d%d[%d]}",
- (1 << size) << 3, Vd, index);
+ SNPrintF(out_buffer_ + out_buffer_pos_, "v%s1.%d {d%d[%d]}",
+ (l ? "ld" : "st"), (1 << size) << 3, Vd, index);
Print(", ");
FormatNeonMemory(Rn, 0, Rm);
} else {
......
......@@ -5702,16 +5702,16 @@ void Simulator::DecodeAdvancedSIMDElementOrStructureLoadStore(
}
break;
case 9: {
+ int Vd = instr->VFPDRegValue(kDoublePrecision);
+ int Rn = instr->VnValue();
+ int Rm = instr->VmValue();
+ int32_t address = get_register(Rn);
if (instr->Bits(21, 20) == 2) {
// Bits(11, 8) is the B field in A7.7 Advanced SIMD element or structure
// load/store instructions. See table A7-21.
if (instr->Bits(11, 8) == 0xC) {
// vld1 (single element to all lanes).
DCHECK_EQ(instr->Bits(11, 8), 0b1100); // Type field.
- int Vd = (instr->Bit(22) << 4) | instr->VdValue();
- int Rn = instr->VnValue();
- int Rm = instr->VmValue();
- int32_t address = get_register(Rn);
int regs = instr->Bit(5) + 1;
int size = instr->Bits(7, 6);
uint32_t q_data[2];
......@@ -5753,10 +5753,6 @@ void Simulator::DecodeAdvancedSIMDElementOrStructureLoadStore(
} else if (instr->Bits(11, 8) == 8 ||
((instr->Bits(11, 8) & 0b1011) == 0)) {
// vld1 (single element to one lane)
- int Vd = (instr->Bit(22) << 4) | instr->VdValue();
- int Rn = instr->VnValue();
- int Rm = instr->VmValue();
- int32_t address = get_register(Rn);
int size = instr->Bits(11, 10);
uint64_t dreg;
get_d_register(Vd, &dreg);
......@@ -5800,6 +5796,41 @@ void Simulator::DecodeAdvancedSIMDElementOrStructureLoadStore(
} else {
UNIMPLEMENTED();
}
} else if (instr->Bits(21, 20) == 0) {
// TODO(zhin): Refactor this function to follow decoding guide.
// vst1s (single element from one lane).
int size = instr->Bits(11, 10);
DCHECK_NE(3, size);
uint64_t dreg;
get_d_register(Vd, &dreg);
switch (size) {
case Neon8: {
DCHECK_EQ(0, instr->Bit(4));
int i = instr->Bits(7, 5) * 8;
dreg = (dreg >> i) & 0xff;
WriteB(address, static_cast<uint8_t>(dreg));
break;
}
case Neon16: {
DCHECK_EQ(0, instr->Bits(5, 4)); // Alignment not supported.
int i = instr->Bits(7, 6) * 16;
dreg = (dreg >> i) & 0xffff;
WriteH(address, static_cast<uint16_t>(dreg));
break;
}
case Neon32: {
DCHECK_EQ(0, instr->Bits(6, 4)); // Alignment not supported.
int i = instr->Bit(7) * 32;
dreg = (dreg >> i) & 0xffffffff;
WriteW(address, bit_cast<int>(static_cast<uint32_t>(dreg)));
break;
}
case Neon64: {
// Should have been handled by vst1 (single element to all lanes).
UNREACHABLE();
}
}
} else {
UNIMPLEMENTED();
}
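A standalone arithmetic check of the extraction above (register contents hypothetical, not part of this CL): for a Neon16 store of lane 2, the simulator shifts the 64-bit D-register value right by 2 * 16 and masks to 16 bits.

#include <cassert>
#include <cstdint>

int main() {
  uint64_t dreg = 0x4444333322221111;  // 16-bit lanes 3..0, high to low
  int i = 2 * 16;                      // lane 2, from instr->Bits(7, 6)
  uint16_t lane = static_cast<uint16_t>((dreg >> i) & 0xffff);
  assert(lane == 0x3333);              // the value WriteH stores
  return 0;
}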
......
......@@ -1018,6 +1018,12 @@ TEST(Neon) {
"f406460f vst1.8 {d4, d5, d6}, [r6]")
COMPARE(vst1(Neon16, NeonListOperand(d17, 4), NeonMemOperand(r9)),
"f449124f vst1.16 {d17, d18, d19, d20}, [r9]");
COMPARE(vst1s(Neon8, NeonListOperand(d4), 1, NeonMemOperand(r1)),
"f481402f vst1.8 {d4[1]}, [r1]");
COMPARE(vst1s(Neon16, NeonListOperand(d4), 2, NeonMemOperand(r1)),
"f481448f vst1.16 {d4[2]}, [r1]");
COMPARE(vst1s(Neon32, NeonListOperand(d4), 0, NeonMemOperand(r1)),
"f481480f vst1.32 {d4[0]}, [r1]");
COMPARE(vmovl(NeonU8, q3, d1), "f3886a11 vmovl.u8 q3, d1");
COMPARE(vmovl(NeonU8, q4, d2), "f3888a12 vmovl.u8 q4, d2");
COMPARE(vmovl(NeonS16, q4, d2), "f2908a12 vmovl.s16 q4, d2");
......
......@@ -3974,7 +3974,8 @@ WASM_SIMD_TEST(S128Load64Zero) {
RunLoadZeroTest<int64_t>(execution_tier, lower_simd, kExprS128Load64Zero);
}
-#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_IA32 || V8_TARGET_ARCH_ARM64
+#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_IA32 || V8_TARGET_ARCH_ARM64 || \
+    V8_TARGET_ARCH_ARM
// TODO(v8:10975): Prototyping load lane and store lane.
template <typename T>
void RunLoadLaneTest(TestExecutionTier execution_tier, LowerSimd lower_simd,
......@@ -4179,7 +4180,8 @@ WASM_SIMD_TEST_NO_LOWERING(S128Store64Lane) {
kExprI64x2Splat);
}
-#endif  // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_IA32 || V8_TARGET_ARCH_ARM64
+#endif  // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_IA32 || V8_TARGET_ARCH_ARM64 ||
+        // V8_TARGET_ARCH_ARM
#define WASM_SIMD_ANYTRUE_TEST(format, lanes, max, param_type) \
WASM_SIMD_TEST(S##format##AnyTrue) { \
......