Commit 9124b7f9 authored by Zhi An Ng, committed by Commit Bot

[wasm-simd][arm] Implement v128.load32_zero v128.load64_zero

Implement these two instructions on arm.
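
On ARM each instruction lowers to two machine instructions, using a new
vld1s (single element to one lane) assembler helper and a DwVfpRegister
overload of vmov with an immediate: v128.load32_zero zeroes the whole
destination and loads 32 bits into lane 0, while v128.load64_zero zeroes
the high d-register and loads 64 bits into the low d-register. Roughly
(an illustrative sketch; register assignment and the exact vmov-immediate
form are up to the code generator), with the destination in q0 and the
address in r1:

  @ v128.load32_zero
  vmov.i32 q0, 0          @ clear all four lanes
  vld1.32 {d0[0]}, [r1]   @ load 32 bits into lane 0; other lanes stay zero

  @ v128.load64_zero
  vmov.i32 d1, 0          @ clear the upper 64 bits
  vld1.64 {d0}, [r1]      @ load 64 bits into the lower half

Because vld1 (single element to one lane) leaves the untouched lanes
intact, pre-zeroing the destination gives the *_zero semantics without
any extra masking or shuffling.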

Bug: v8:11038
Change-Id: I23d9c7291f60e29415cfbebced1bff323fd2465a
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2485250
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Jakob Kummerow <jkummerow@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Cr-Commit-Position: refs/heads/master@{#70779}
parent eee329ac
@@ -2649,12 +2649,30 @@ static bool FitsVmovIntImm(uint64_t imm, uint32_t* encoding, uint8_t* cmode) {
return false;
}
void Assembler::vmov(const DwVfpRegister dst, uint64_t imm) {
uint32_t enc;
uint8_t cmode;
uint8_t op = 0;
if (CpuFeatures::IsSupported(NEON) && FitsVmovIntImm(imm, &enc, &cmode)) {
CpuFeatureScope scope(this, NEON);
// Instruction details available in ARM DDI 0406C.b, A8-937.
// 001i1(27-23) | D(22) | 000(21-19) | imm3(18-16) | Vd(15-12) | cmode(11-8)
// | 0(7) | 0(6) | op(5) | 4(1) | imm4(3-0)
int vd, d;
dst.split_code(&vd, &d);
emit(kSpecialCondition | 0x05 * B23 | d * B22 | vd * B12 | cmode * B8 |
op * B5 | 0x1 * B4 | enc);
} else {
UNIMPLEMENTED();
}
}
void Assembler::vmov(const QwNeonRegister dst, uint64_t imm) {
uint32_t enc;
uint8_t cmode;
uint8_t op = 0;
if (CpuFeatures::IsSupported(VFPv3) && FitsVmovIntImm(imm, &enc, &cmode)) {
CpuFeatureScope scope(this, VFPv3);
if (CpuFeatures::IsSupported(NEON) && FitsVmovIntImm(imm, &enc, &cmode)) {
CpuFeatureScope scope(this, NEON);
// Instruction details available in ARM DDI 0406C.b, A8-937.
// 001i1(27-23) | D(22) | 000(21-19) | imm3(18-16) | Vd(15-12) | cmode(11-8)
// | 0(7) | Q(6) | op(5) | 4(1) | imm4(3-0)
@@ -3677,6 +3695,28 @@ void Assembler::vld1(NeonSize size, const NeonListOperand& dst,
src.rm().code());
}
// vld1s(ingle element to one lane).
void Assembler::vld1s(NeonSize size, const NeonListOperand& dst, uint8_t index,
const NeonMemOperand& src) {
// Instruction details available in ARM DDI 0406C.b, A8.8.322.
// 1111(31-28) | 01001(27-23) | D(22) | 10(21-20) | Rn(19-16) |
// Vd(15-12) | size(11-10) | index_align(7-4) | Rm(3-0)
// See vld1 (single element to all lanes) if size == 0x3, implemented as
// vld1r(eplicate).
DCHECK_NE(size, 0x3);
// Check for valid lane indices.
DCHECK_GT(1 << (3 - size), index);
// Specifying alignment not supported, use standard alignment.
uint8_t index_align = index << (size + 1);
DCHECK(IsEnabled(NEON));
int vd, d;
dst.base().split_code(&vd, &d);
emit(0xFU * B28 | 4 * B24 | 1 * B23 | d * B22 | 2 * B20 |
src.rn().code() * B16 | vd * B12 | size * B10 | index_align * B4 |
src.rm().code());
}
// vld1r(eplicate)
void Assembler::vld1r(NeonSize size, const NeonListOperand& dst,
const NeonMemOperand& src) {
......
@@ -839,6 +839,9 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
// All these APIs support D0 to D31 and Q0 to Q15.
void vld1(NeonSize size, const NeonListOperand& dst,
const NeonMemOperand& src);
// vld1s(ingle element to one lane).
void vld1s(NeonSize size, const NeonListOperand& dst, uint8_t index,
const NeonMemOperand& src);
void vld1r(NeonSize size, const NeonListOperand& dst,
const NeonMemOperand& src);
void vst1(NeonSize size, const NeonListOperand& src,
@@ -853,6 +856,7 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void vmov(NeonDataType dt, DwVfpRegister dst, int index, Register src);
void vmov(NeonDataType dt, Register dst, DwVfpRegister src, int index);
void vmov(DwVfpRegister dst, uint64_t imm);
void vmov(QwNeonRegister dst, uint64_t imm);
void vmov(QwNeonRegister dst, QwNeonRegister src);
void vdup(NeonSize size, QwNeonRegister dst, Register src);
......
@@ -3264,6 +3264,18 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ vmovl(NeonU32, dst, dst.low());
break;
}
case kArmS128Load32Zero: {
Simd128Register dst = i.OutputSimd128Register();
__ vmov(dst, 0);
__ vld1s(Neon32, NeonListOperand(dst.low()), 0, i.NeonInputOperand(0));
break;
}
case kArmS128Load64Zero: {
Simd128Register dst = i.OutputSimd128Register();
__ vmov(dst.high(), 0);
__ vld1(Neon64, NeonListOperand(dst.low()), i.NeonInputOperand(0));
break;
}
case kWord32AtomicLoadInt8:
ASSEMBLE_ATOMIC_LOAD_INTEGER(ldrsb);
break;
......
@@ -331,6 +331,8 @@ namespace compiler {
V(ArmS128Load16x4U) \
V(ArmS128Load32x2S) \
V(ArmS128Load32x2U) \
V(ArmS128Load32Zero) \
V(ArmS128Load64Zero) \
V(ArmWord32AtomicPairLoad) \
V(ArmWord32AtomicPairStore) \
V(ArmWord32AtomicPairAdd) \
......
@@ -324,6 +324,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kArmS128Load16x4U:
case kArmS128Load32x2S:
case kArmS128Load32x2U:
case kArmS128Load32Zero:
case kArmS128Load64Zero:
return kIsLoadOperation;
case kArmVstrF32:
......
@@ -532,6 +532,12 @@ void InstructionSelector::VisitLoadTransform(Node* node) {
case LoadTransformation::kS128Load32x2U:
opcode = kArmS128Load32x2U;
break;
case LoadTransformation::kS128Load32Zero:
opcode = kArmS128Load32Zero;
break;
case LoadTransformation::kS128Load64Zero:
opcode = kArmS128Load64Zero;
break;
default:
UNIMPLEMENTED();
}
......
@@ -2514,9 +2514,10 @@ void Decoder::DecodeMemoryHintsAndBarriers(Instruction* instr) {
void Decoder::DecodeAdvancedSIMDElementOrStructureLoadStore(
Instruction* instr) {
int op0 = instr->Bit(23);
int op1 = instr->Bits(11, 10);
int l = instr->Bit(21);
if (op0 == 0) {
// Advanced SIMD load/store multiple structures.
int l = instr->Bit(21);
int itype = instr->Bits(11, 8);
if (itype == 0b0010) {
// vld1/vst1
@@ -2534,6 +2535,26 @@ void Decoder::DecodeAdvancedSIMDElementOrStructureLoadStore(
} else {
Unknown(instr);
}
} else if (op1 != 0b11) {
// Advanced SIMD load/store single structure to one lane.
int size = op1; // size and op1 occupy the same bits in decoding.
int n = instr->Bits(9, 8);
if (l && n == 0b00) {
// VLD1 (single element to one lane) - A1, A2, A3
int Vd = instr->VFPDRegValue(kDoublePrecision);
int Rn = instr->VnValue();
int Rm = instr->VmValue();
int index_align = instr->Bits(7, 4);
int index = index_align >> (size + 1);
// Omit alignment.
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vld1.%d {d%d[%d]}",
(1 << size) << 3, Vd, index);
Print(", ");
FormatNeonMemory(Rn, 0, Rm);
} else {
Unknown(instr);
}
} else {
Unknown(instr);
}
......
@@ -4046,7 +4046,9 @@ uint16_t Multiply(uint16_t a, uint16_t b) {
void VmovImmediate(Simulator* simulator, Instruction* instr) {
byte cmode = instr->Bits(11, 8);
int vd = instr->VFPDRegValue(kSimd128Precision);
int vd = instr->VFPDRegValue(kDoublePrecision);
int q = instr->Bit(6);
int regs = q ? 2 : 1;
uint8_t imm = instr->Bit(24) << 7; // i
imm |= instr->Bits(18, 16) << 4; // imm3
imm |= instr->Bits(3, 0); // imm4
@@ -4054,14 +4056,20 @@ void VmovImmediate(Simulator* simulator, Instruction* instr) {
case 0: {
// Set the LSB of each 64-bit halves.
uint64_t imm64 = imm;
simulator->set_neon_register(vd, {imm64, imm64});
for (int r = 0; r < regs; r++) {
simulator->set_d_register(vd + r, &imm64);
}
break;
}
case 0xe: {
uint8_t imms[kSimd128Size];
// Set all bytes of register.
std::fill_n(imms, kSimd128Size, imm);
simulator->set_neon_register(vd, imms);
uint64_t imm64;
memcpy(&imm64, imms, 8);
for (int r = 0; r < regs; r++) {
simulator->set_d_register(vd + r, &imm64);
}
break;
}
default: {
@@ -5743,7 +5751,7 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
case 9: {
if (instr->Bits(21, 20) == 2) {
// Bits(11, 8) is the B field in A7.7 Advanced SIMD element or structure
// load/store instructions.
// load/store instructions. See table A7-21.
if (instr->Bits(11, 8) == 0xC) {
// vld1 (single element to all lanes).
DCHECK_EQ(instr->Bits(11, 8), 0b1100); // Type field.
@@ -5789,6 +5797,53 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
set_register(Rn, get_register(Rn) + get_register(Rm));
}
}
} else if (instr->Bits(11, 8) == 8 ||
((instr->Bits(11, 8) & 0b1011) == 0)) {
// vld1 (single element to one lane)
int Vd = (instr->Bit(22) << 4) | instr->VdValue();
int Rn = instr->VnValue();
int Rm = instr->VmValue();
int32_t address = get_register(Rn);
int size = instr->Bits(11, 10);
uint64_t dreg;
get_d_register(Vd, &dreg);
switch (size) {
case Neon8: {
uint64_t data = ReadBU(address);
DCHECK_EQ(0, instr->Bit(4));
int i = instr->Bits(7, 5) * 8;
dreg = (dreg & ~(uint64_t{0xff} << i)) | (data << i);
break;
}
case Neon16: {
DCHECK_EQ(0, instr->Bits(5, 4)); // Alignment not supported.
uint64_t data = ReadHU(address);
int i = instr->Bits(7, 6) * 16;
dreg = (dreg & ~(uint64_t{0xffff} << i)) | (data << i);
break;
}
case Neon32: {
DCHECK_EQ(0, instr->Bits(6, 4)); // Alignment not supported.
uint64_t data = static_cast<unsigned>(ReadW(address));
int i = instr->Bit(7) * 32;
dreg = (dreg & ~(uint64_t{0xffffffff} << i)) | (data << i);
break;
}
case Neon64: {
// Should have been handled by vld1 (single element to all lanes).
UNREACHABLE();
}
}
set_d_register(Vd, &dreg);
// write back
if (Rm != 15) {
if (Rm == 13) {
set_register(Rn, address);
} else {
set_register(Rn, get_register(Rn) + get_register(Rm));
}
}
} else {
UNIMPLEMENTED();
}
......
@@ -996,6 +996,10 @@ TEST(Neon) {
CpuFeatureScope scope(&assm, NEON);
COMPARE(vld1(Neon8, NeonListOperand(d4, 4), NeonMemOperand(r1)),
"f421420f vld1.8 {d4, d5, d6, d7}, [r1]");
COMPARE(vld1s(Neon32, NeonListOperand(d4, 1), 0, NeonMemOperand(r1)),
"f4a1480f vld1.32 {d4[0]}, [r1]");
COMPARE(vld1s(Neon16, NeonListOperand(d4, 1), 3, NeonMemOperand(r1)),
"f4a144cf vld1.16 {d4[3]}, [r1]");
COMPARE(vst1(Neon16, NeonListOperand(d17, 4), NeonMemOperand(r9)),
"f449124f vst1.16 {d17, d18, d19, d20}, [r9]");
COMPARE(vmovl(NeonU8, q3, d1), "f3886a11 vmovl.u8 q3, d1");
@@ -1055,6 +1059,8 @@ TEST(Neon) {
"f2812052 vmov.i32 q1, 18");
COMPARE(vmov(q0, 0xffffffffffffffff),
"f3870e5f vmov.i8 q0, 255");
COMPARE(vmov(d0, 0xffffffffffffffff),
"f3870e1f vmov.i8 q0, 255");
COMPARE(vmvn(q0, q15),
"f3b005ee vmvn q0, q15");
COMPARE(vmvn(q8, q9),
......
@@ -3697,7 +3697,8 @@ WASM_SIMD_TEST(S128Load32x2S) {
}
// TODO(v8:10713): Prototyping v128.load32_zero and v128.load64_zero.
#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_IA32
#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_IA32 || \
V8_TARGET_ARCH_ARM
template <typename S>
void RunLoadZeroTest(TestExecutionTier execution_tier, LowerSimd lower_simd,
WasmOpcode op) {
@@ -3777,7 +3778,8 @@ WASM_SIMD_TEST_NO_LOWERING(S128Load32Zero) {
WASM_SIMD_TEST_NO_LOWERING(S128Load64Zero) {
RunLoadZeroTest<int64_t>(execution_tier, lower_simd, kExprS128Load64Zero);
}
#endif // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_IA32
#endif // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_IA32 ||
// V8_TARGET_ARCH_ARM
#if V8_TARGET_ARCH_X64
// TODO(v8:10975): Prototyping load lane and store lane.
......