Commit 72b68dee authored by Ng Zhi An, committed by Commit Bot

[wasm-simd] Implement load splat and load extend on arm

Bug: v8:9886
Change-Id: Idd44fb99be54c56385db55895dba58b35c1b660e
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1928150
Reviewed-by: Michael Starzinger <mstarzinger@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#65275}
parent 5d80a202
@@ -3663,6 +3663,17 @@ void Assembler::vld1(NeonSize size, const NeonListOperand& dst,
       src.rm().code());
}
// vld1r(eplicate): load a single element from memory and replicate it to
// all lanes of the destination register list.
void Assembler::vld1r(NeonSize size, const NeonListOperand& dst,
                      const NeonMemOperand& src) {
  DCHECK(IsEnabled(NEON));
  int vd, d;
  dst.base().split_code(&vd, &d);
  emit(0xFU * B28 | 4 * B24 | 1 * B23 | d * B22 | 2 * B20 |
       src.rn().code() * B16 | vd * B12 | 0xC * B8 | size * B6 |
       dst.length() * B5 | src.rm().code());
}
void Assembler::vst1(NeonSize size, const NeonListOperand& src,
                     const NeonMemOperand& dst) {
  // Instruction details available in ARM DDI 0406C.b, A8.8.404.
...
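To make the field packing above concrete, here is a minimal standalone sketch (illustrative, not V8 code) that reproduces the emit() expression of vld1r for one concrete instruction. The B(n) helper stands in for V8's Bn bit-position constants (Bn == 1 << n), and the register and field values are assumptions chosen for the example:

```cpp
#include <cstdint>
#include <cstdio>

// Stand-in for V8's Bn constants: bit position n.
constexpr uint32_t B(int n) { return 1u << n; }

int main() {
  // Example: vld1.8 {d0[], d1[]}, [r4] -- load one byte and splat it to
  // all 16 lanes of q0, with no writeback (Rm == 15).
  int vd = 0, d = 0;  // dst.base().split_code(&vd, &d) for q0 (base d0)
  int rn = 4;         // base address register r4
  int rm = 15;        // 0b1111: no writeback
  int size = 0;       // Neon8
  int length = 1;     // NeonListOperand of a Q register spans two D registers

  uint32_t instr = 0xFu * B(28) | 4 * B(24) | 1 * B(23) | d * B(22) |
                   2 * B(20) | rn * B(16) | vd * B(12) | 0xCu * B(8) |
                   size * B(6) | length * B(5) | rm;
  std::printf("0x%08X\n", instr);  // Prints 0xF4A40C2F.
  return 0;
}
```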
@@ -839,6 +839,8 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
  // All these APIs support D0 to D31 and Q0 to Q15.
  void vld1(NeonSize size, const NeonListOperand& dst,
            const NeonMemOperand& src);
  void vld1r(NeonSize size, const NeonListOperand& dst,
             const NeonMemOperand& src);
  void vst1(NeonSize size, const NeonListOperand& src,
            const NeonMemOperand& dst);
  // dt represents the narrower type
...
@@ -2957,6 +2957,63 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      __ mov(i.OutputRegister(), Operand(1), LeaveCC, ne);
      break;
    }
    case kArmS8x16LoadSplat: {
      __ vld1r(Neon8, NeonListOperand(i.OutputSimd128Register()),
               i.NeonInputOperand(0));
      break;
    }
    case kArmS16x8LoadSplat: {
      __ vld1r(Neon16, NeonListOperand(i.OutputSimd128Register()),
               i.NeonInputOperand(0));
      break;
    }
    case kArmS32x4LoadSplat: {
      __ vld1r(Neon32, NeonListOperand(i.OutputSimd128Register()),
               i.NeonInputOperand(0));
      break;
    }
    case kArmS64x2LoadSplat: {
      Simd128Register dst = i.OutputSimd128Register();
      __ vld1(Neon32, NeonListOperand(dst.low()), i.NeonInputOperand(0));
      __ Move(dst.high(), dst.low());
      break;
    }
    case kArmI16x8Load8x8S: {
      Simd128Register dst = i.OutputSimd128Register();
      __ vld1(Neon8, NeonListOperand(dst.low()), i.NeonInputOperand(0));
      __ vmovl(NeonS8, dst, dst.low());
      break;
    }
    case kArmI16x8Load8x8U: {
      Simd128Register dst = i.OutputSimd128Register();
      __ vld1(Neon8, NeonListOperand(dst.low()), i.NeonInputOperand(0));
      __ vmovl(NeonU8, dst, dst.low());
      break;
    }
    case kArmI32x4Load16x4S: {
      Simd128Register dst = i.OutputSimd128Register();
      __ vld1(Neon16, NeonListOperand(dst.low()), i.NeonInputOperand(0));
      __ vmovl(NeonS16, dst, dst.low());
      break;
    }
    case kArmI32x4Load16x4U: {
      Simd128Register dst = i.OutputSimd128Register();
      __ vld1(Neon16, NeonListOperand(dst.low()), i.NeonInputOperand(0));
      __ vmovl(NeonU16, dst, dst.low());
      break;
    }
    case kArmI64x2Load32x2S: {
      Simd128Register dst = i.OutputSimd128Register();
      __ vld1(Neon32, NeonListOperand(dst.low()), i.NeonInputOperand(0));
      __ vmovl(NeonS32, dst, dst.low());
      break;
    }
    case kArmI64x2Load32x2U: {
      Simd128Register dst = i.OutputSimd128Register();
      __ vld1(Neon32, NeonListOperand(dst.low()), i.NeonInputOperand(0));
      __ vmovl(NeonU32, dst, dst.low());
      break;
    }
    case kWord32AtomicLoadInt8:
      ASSEMBLE_ATOMIC_LOAD_INTEGER(ldrsb);
      break;
...
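As a semantic reference for the new cases, the following standalone sketch (not V8 code; the function names are mine) models one splat and one signed extend. Splat broadcasts a single loaded element to every lane, which is what one vld1r does for 8/16/32-bit lanes; 64-bit splat has no vld1r encoding, hence the vld1 into the low D register plus the Move to the high one. Extend loads a 64-bit half vector and widens each lane to twice its width, the job vmovl performs after the vld1 of the low half; the unsigned variants differ only in zero-extending:

```cpp
#include <cstdint>
#include <cstdio>

// S8x16LoadSplat: read one byte, broadcast it to all 16 lanes.
void S8x16LoadSplat(const uint8_t* mem, uint8_t out[16]) {
  for (int i = 0; i < 16; ++i) out[i] = mem[0];
}

// I16x8Load8x8S: read 8 bytes, sign-extend each to a 16-bit lane.
void I16x8Load8x8S(const uint8_t* mem, int16_t out[8]) {
  for (int i = 0; i < 8; ++i) out[i] = static_cast<int8_t>(mem[i]);
}

int main() {
  const uint8_t bytes[8] = {0x80, 1, 2, 3, 4, 5, 6, 7};
  int16_t lanes[8];
  I16x8Load8x8S(bytes, lanes);
  std::printf("%d %d\n", lanes[0], lanes[1]);  // -128 1: 0x80 sign-extends.
  return 0;
}
```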
@@ -296,6 +296,16 @@ namespace compiler {
  V(ArmS1x8AllTrue)                \
  V(ArmS1x16AnyTrue)               \
  V(ArmS1x16AllTrue)               \
  V(ArmS8x16LoadSplat)             \
  V(ArmS16x8LoadSplat)             \
  V(ArmS32x4LoadSplat)             \
  V(ArmS64x2LoadSplat)             \
  V(ArmI16x8Load8x8S)              \
  V(ArmI16x8Load8x8U)              \
  V(ArmI32x4Load16x4S)             \
  V(ArmI32x4Load16x4U)             \
  V(ArmI64x2Load32x2S)             \
  V(ArmI64x2Load32x2U)             \
  V(ArmWord32AtomicPairLoad)       \
  V(ArmWord32AtomicPairStore)      \
  V(ArmWord32AtomicPairAdd)        \
...
@@ -289,6 +289,16 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kArmLdr:
    case kArmPeek:
    case kArmWord32AtomicPairLoad:
    case kArmS8x16LoadSplat:
    case kArmS16x8LoadSplat:
    case kArmS32x4LoadSplat:
    case kArmS64x2LoadSplat:
    case kArmI16x8Load8x8S:
    case kArmI16x8Load8x8U:
    case kArmI32x4Load16x4S:
    case kArmI32x4Load16x4U:
    case kArmI64x2Load32x2S:
    case kArmI64x2Load32x2U:
      return kIsLoadOperation;
    case kArmVstrF32:
...
@@ -361,7 +361,6 @@ void EmitAddBeforeS128LoadStore(InstructionSelector* selector,
                                InstructionCode* opcode_return,
                                size_t* input_count_return,
                                InstructionOperand* inputs) {
  DCHECK(*opcode_return == kArmVld1S128 || *opcode_return == kArmVst1S128);
  ArmOperandGenerator g(selector);
  InstructionOperand addr = g.TempRegister();
  InstructionCode op = kArmAdd;
@@ -473,6 +472,54 @@ void InstructionSelector::VisitAbortCSAAssert(Node* node) {
  Emit(kArchAbortCSAAssert, g.NoOutput(), g.UseFixed(node->InputAt(0), r1));
}
void InstructionSelector::VisitLoadTransform(Node* node) {
  LoadTransformParameters params = LoadTransformParametersOf(node->op());
  InstructionCode opcode = kArchNop;
  switch (params.transformation) {
    case LoadTransformation::kS8x16LoadSplat:
      opcode = kArmS8x16LoadSplat;
      break;
    case LoadTransformation::kS16x8LoadSplat:
      opcode = kArmS16x8LoadSplat;
      break;
    case LoadTransformation::kS32x4LoadSplat:
      opcode = kArmS32x4LoadSplat;
      break;
    case LoadTransformation::kS64x2LoadSplat:
      opcode = kArmS64x2LoadSplat;
      break;
    case LoadTransformation::kI16x8Load8x8S:
      opcode = kArmI16x8Load8x8S;
      break;
    case LoadTransformation::kI16x8Load8x8U:
      opcode = kArmI16x8Load8x8U;
      break;
    case LoadTransformation::kI32x4Load16x4S:
      opcode = kArmI32x4Load16x4S;
      break;
    case LoadTransformation::kI32x4Load16x4U:
      opcode = kArmI32x4Load16x4U;
      break;
    case LoadTransformation::kI64x2Load32x2S:
      opcode = kArmI64x2Load32x2S;
      break;
    case LoadTransformation::kI64x2Load32x2U:
      opcode = kArmI64x2Load32x2U;
      break;
    default:
      UNIMPLEMENTED();
  }

  ArmOperandGenerator g(this);
  InstructionOperand output = g.DefineAsRegister(node);
  InstructionOperand inputs[2];
  size_t input_count = 2;
  inputs[0] = g.UseRegister(node->InputAt(0));
  inputs[1] = g.UseRegister(node->InputAt(1));
  EmitAddBeforeS128LoadStore(this, &opcode, &input_count, &inputs[0]);
  Emit(opcode, 1, &output, input_count, inputs);
}
void InstructionSelector::VisitLoad(Node* node) {
  LoadRepresentation load_rep = LoadRepresentationOf(node->op());
  ArmOperandGenerator g(this);
...
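Note the shape of the emitted instruction: the selector passes both the base and the index as register inputs, and EmitAddBeforeS128LoadStore folds them into a single temp register with a kArmAdd first, since the vld1/vld1r forms used by the code generator address memory through one base register only. The DCHECK removed from that helper had restricted it to kArmVld1S128/kArmVst1S128; it now also serves the new load-transform opcodes.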
@@ -2627,7 +2627,9 @@ void InstructionSelector::VisitF64x2UConvertI64x2(Node* node) {
  UNIMPLEMENTED();
}
#if !V8_TARGET_ARCH_ARM64
#if !V8_TARGET_ARCH_ARM
void InstructionSelector::VisitLoadTransform(Node* node) { UNIMPLEMENTED(); }
#endif // !V8_TARGET_ARCH_ARM
#if !V8_TARGET_ARCH_IA32
void InstructionSelector::VisitF64x2Min(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF64x2Max(Node* node) { UNIMPLEMENTED(); }
...
@@ -5556,6 +5556,63 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
        UNIMPLEMENTED();
      }
      break;
    case 9: {
      if (instr->Bits(21, 20) == 2) {
        // Bits(11, 8) is the B field in A7.7 Advanced SIMD element or
        // structure load/store instructions.
        if (instr->Bits(11, 8) == 0xC) {
          // vld1 (single element to all lanes).
          DCHECK_EQ(instr->Bits(11, 8), 0b1100);  // Type field.
          int Vd = (instr->Bit(22) << 4) | instr->VdValue();
          int Rn = instr->VnValue();
          int Rm = instr->VmValue();
          int32_t address = get_register(Rn);
          int regs = instr->Bit(5) + 1;
          int size = instr->Bits(7, 6);
          uint32_t q_data[4];
          switch (size) {
            case Neon8: {
              uint8_t data = ReadBU(address);
              uint8_t* dst = reinterpret_cast<uint8_t*>(q_data);
              for (int i = 0; i < 16; i++) {
                dst[i] = data;
              }
              break;
            }
            case Neon16: {
              uint16_t data = ReadHU(address);
              uint16_t* dst = reinterpret_cast<uint16_t*>(q_data);
              for (int i = 0; i < 8; i++) {
                dst[i] = data;
              }
              break;
            }
            case Neon32: {
              uint32_t data = ReadW(address);
              for (int i = 0; i < 4; i++) {
                q_data[i] = data;
              }
              break;
            }
            default:
              UNREACHABLE();  // Neon64 is not a valid vld1r size.
          }
          for (int r = 0; r < regs; r++) {
            set_neon_register(Vd + r, q_data);
          }
          if (Rm != 15) {
            if (Rm == 13) {
              // Post-index writeback: advance by the one element read.
              set_register(Rn, address + (1 << size));
            } else {
              set_register(Rn, get_register(Rn) + get_register(Rm));
            }
          }
        } else {
          UNIMPLEMENTED();
        }
      } else {
        UNIMPLEMENTED();
      }
      break;
    }
    case 0xA:
    case 0xB:
      if ((instr->Bits(22, 20) == 5) && (instr->Bits(15, 12) == 0xF)) {
...
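Closing the loop with the earlier encoding sketch, this standalone snippet (illustrative only, not V8 code) extracts the same fields the simulator reads above from the example word 0xF4A40C2F:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t instr = 0xF4A40C2F;  // vld1.8 {d0[], d1[]}, [r4]
  // Mirrors Instruction::Bits(hi, lo): bits hi..lo inclusive.
  auto bits = [instr](int hi, int lo) {
    return (instr >> lo) & ((1u << (hi - lo + 1)) - 1);
  };
  std::printf("op=%u B=0x%X Vd=%u Rn=%u Rm=%u size=%u regs=%u\n",
              bits(21, 20),                        // 2: load, all-lanes group
              bits(11, 8),                         // 0xC: type field
              (bits(22, 22) << 4) | bits(15, 12),  // D:Vd = 0 (q0)
              bits(19, 16),                        // 4: base register r4
              bits(3, 0),                          // 15: no writeback
              bits(7, 6),                          // 0: Neon8
              bits(5, 5) + 1);                     // 2 D registers
  return 0;
}
```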
@@ -3264,7 +3264,7 @@ WASM_SIMD_TEST(SimdLoadStoreLoadMemargOffset) {
  }
}
#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM
template <typename T>
void RunLoadSplatTest(ExecutionTier execution_tier, LowerSimd lower_simd,
                      WasmOpcode op) {
@@ -3354,7 +3354,7 @@ WASM_SIMD_TEST_NO_LOWERING(I64x2Load32x2S) {
  RunLoadExtendTest<int32_t, int64_t>(execution_tier, lower_simd,
                                      kExprI64x2Load32x2S);
}
#endif  // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM
#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_IA32 || V8_TARGET_ARCH_ARM64 || \
    V8_TARGET_ARCH_ARM
...