Commit 72b68dee authored by Ng Zhi An, committed by Commit Bot

[wasm-simd] Implement load splat and load extend on arm
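
Load splat loads a single 8/16/32/64-bit value and duplicates it into
every lane of a 128-bit register; load extend loads eight/four/two
narrow lanes and sign- or zero-extends each one to twice its width.
The 8/16/32-bit splats map onto a new vld1r assembler instruction; the
64-bit splat and the extends are a vld1 into the low D register
followed by a move to the high half or a vmovl.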

Bug: v8:9886
Change-Id: Idd44fb99be54c56385db55895dba58b35c1b660e
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/1928150
Reviewed-by: Michael Starzinger <mstarzinger@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#65275}
parent 5d80a202
@@ -3663,6 +3663,17 @@ void Assembler::vld1(NeonSize size, const NeonListOperand& dst,
       src.rm().code());
}

// vld1r(eplicate): load a single element from memory and replicate it into
// all lanes of the destination register list.
void Assembler::vld1r(NeonSize size, const NeonListOperand& dst,
                      const NeonMemOperand& src) {
  DCHECK(IsEnabled(NEON));
  int vd, d;
  dst.base().split_code(&vd, &d);
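  // Encoding per "Advanced SIMD element or structure load/store" in the ARM
  // ARM: 0xC in bits 11-8 selects the single-element-to-all-lanes form, bits
  // 7-6 hold the element size, and bit 5 (dst.length()) is the T field that
  // picks one or two D registers.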
  emit(0xFU * B28 | 4 * B24 | 1 * B23 | d * B22 | 2 * B20 |
       src.rn().code() * B16 | vd * B12 | 0xC * B8 | size * B6 |
       dst.length() * B5 | src.rm().code());
}

void Assembler::vst1(NeonSize size, const NeonListOperand& src,
                     const NeonMemOperand& dst) {
  // Instruction details available in ARM DDI 0406C.b, A8.8.404.
@@ -839,6 +839,8 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
  // All these APIs support D0 to D31 and Q0 to Q15.
  void vld1(NeonSize size, const NeonListOperand& dst,
            const NeonMemOperand& src);
  void vld1r(NeonSize size, const NeonListOperand& dst,
             const NeonMemOperand& src);
  void vst1(NeonSize size, const NeonListOperand& src,
            const NeonMemOperand& dst);
  // dt represents the narrower type
@@ -2957,6 +2957,63 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
      __ mov(i.OutputRegister(), Operand(1), LeaveCC, ne);
      break;
    }
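    // Load splats: vld1r loads one element and replicates it into every lane
    // of the destination Q register.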
    case kArmS8x16LoadSplat: {
      __ vld1r(Neon8, NeonListOperand(i.OutputSimd128Register()),
               i.NeonInputOperand(0));
      break;
    }
    case kArmS16x8LoadSplat: {
      __ vld1r(Neon16, NeonListOperand(i.OutputSimd128Register()),
               i.NeonInputOperand(0));
      break;
    }
    case kArmS32x4LoadSplat: {
      __ vld1r(Neon32, NeonListOperand(i.OutputSimd128Register()),
               i.NeonInputOperand(0));
      break;
    }
    case kArmS64x2LoadSplat: {
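      // There is no 64-bit vld1r form, so load 64 bits into the low D
      // register and copy them to the high one.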
      Simd128Register dst = i.OutputSimd128Register();
      __ vld1(Neon32, NeonListOperand(dst.low()), i.NeonInputOperand(0));
      __ Move(dst.high(), dst.low());
      break;
    }
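    // Load extends: load 64 bits into the low half of the destination, then
    // widen each lane to twice its size with a signed or unsigned vmovl.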
    case kArmI16x8Load8x8S: {
      Simd128Register dst = i.OutputSimd128Register();
      __ vld1(Neon8, NeonListOperand(dst.low()), i.NeonInputOperand(0));
      __ vmovl(NeonS8, dst, dst.low());
      break;
    }
    case kArmI16x8Load8x8U: {
      Simd128Register dst = i.OutputSimd128Register();
      __ vld1(Neon8, NeonListOperand(dst.low()), i.NeonInputOperand(0));
      __ vmovl(NeonU8, dst, dst.low());
      break;
    }
    case kArmI32x4Load16x4S: {
      Simd128Register dst = i.OutputSimd128Register();
      __ vld1(Neon16, NeonListOperand(dst.low()), i.NeonInputOperand(0));
      __ vmovl(NeonS16, dst, dst.low());
      break;
    }
    case kArmI32x4Load16x4U: {
      Simd128Register dst = i.OutputSimd128Register();
      __ vld1(Neon16, NeonListOperand(dst.low()), i.NeonInputOperand(0));
      __ vmovl(NeonU16, dst, dst.low());
      break;
    }
    case kArmI64x2Load32x2S: {
      Simd128Register dst = i.OutputSimd128Register();
      __ vld1(Neon32, NeonListOperand(dst.low()), i.NeonInputOperand(0));
      __ vmovl(NeonS32, dst, dst.low());
      break;
    }
    case kArmI64x2Load32x2U: {
      Simd128Register dst = i.OutputSimd128Register();
      __ vld1(Neon32, NeonListOperand(dst.low()), i.NeonInputOperand(0));
      __ vmovl(NeonU32, dst, dst.low());
      break;
    }
    case kWord32AtomicLoadInt8:
      ASSEMBLE_ATOMIC_LOAD_INTEGER(ldrsb);
      break;
@@ -296,6 +296,16 @@ namespace compiler {
  V(ArmS1x8AllTrue)                            \
  V(ArmS1x16AnyTrue)                           \
  V(ArmS1x16AllTrue)                           \
  V(ArmS8x16LoadSplat)                         \
  V(ArmS16x8LoadSplat)                         \
  V(ArmS32x4LoadSplat)                         \
  V(ArmS64x2LoadSplat)                         \
  V(ArmI16x8Load8x8S)                          \
  V(ArmI16x8Load8x8U)                          \
  V(ArmI32x4Load16x4S)                         \
  V(ArmI32x4Load16x4U)                         \
  V(ArmI64x2Load32x2S)                         \
  V(ArmI64x2Load32x2U)                         \
  V(ArmWord32AtomicPairLoad)                   \
  V(ArmWord32AtomicPairStore)                  \
  V(ArmWord32AtomicPairAdd)                    \
@@ -289,6 +289,16 @@ int InstructionScheduler::GetTargetInstructionFlags(
    case kArmLdr:
    case kArmPeek:
    case kArmWord32AtomicPairLoad:
    case kArmS8x16LoadSplat:
    case kArmS16x8LoadSplat:
    case kArmS32x4LoadSplat:
    case kArmS64x2LoadSplat:
    case kArmI16x8Load8x8S:
    case kArmI16x8Load8x8U:
    case kArmI32x4Load16x4S:
    case kArmI32x4Load16x4U:
    case kArmI64x2Load32x2S:
    case kArmI64x2Load32x2U:
      return kIsLoadOperation;

    case kArmVstrF32:
@@ -361,7 +361,6 @@ void EmitAddBeforeS128LoadStore(InstructionSelector* selector,
                                InstructionCode* opcode_return,
                                size_t* input_count_return,
                                InstructionOperand* inputs) {
-  DCHECK(*opcode_return == kArmVld1S128 || *opcode_return == kArmVst1S128);
  ArmOperandGenerator g(selector);
  InstructionOperand addr = g.TempRegister();
  InstructionCode op = kArmAdd;
@@ -473,6 +472,54 @@ void InstructionSelector::VisitAbortCSAAssert(Node* node) {
  Emit(kArchAbortCSAAssert, g.NoOutput(), g.UseFixed(node->InputAt(0), r1));
}

void InstructionSelector::VisitLoadTransform(Node* node) {
  LoadTransformParameters params = LoadTransformParametersOf(node->op());
  InstructionCode opcode = kArchNop;
  switch (params.transformation) {
    case LoadTransformation::kS8x16LoadSplat:
      opcode = kArmS8x16LoadSplat;
      break;
    case LoadTransformation::kS16x8LoadSplat:
      opcode = kArmS16x8LoadSplat;
      break;
    case LoadTransformation::kS32x4LoadSplat:
      opcode = kArmS32x4LoadSplat;
      break;
    case LoadTransformation::kS64x2LoadSplat:
      opcode = kArmS64x2LoadSplat;
      break;
    case LoadTransformation::kI16x8Load8x8S:
      opcode = kArmI16x8Load8x8S;
      break;
    case LoadTransformation::kI16x8Load8x8U:
      opcode = kArmI16x8Load8x8U;
      break;
    case LoadTransformation::kI32x4Load16x4S:
      opcode = kArmI32x4Load16x4S;
      break;
    case LoadTransformation::kI32x4Load16x4U:
      opcode = kArmI32x4Load16x4U;
      break;
    case LoadTransformation::kI64x2Load32x2S:
      opcode = kArmI64x2Load32x2S;
      break;
    case LoadTransformation::kI64x2Load32x2U:
      opcode = kArmI64x2Load32x2U;
      break;
    default:
      UNIMPLEMENTED();
  }
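
  // vld1/vld1r have no immediate-offset addressing mode, so fold the index
  // into the base register with an add before emitting the load.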
  ArmOperandGenerator g(this);
  InstructionOperand output = g.DefineAsRegister(node);
  InstructionOperand inputs[2];
  size_t input_count = 2;
  inputs[0] = g.UseRegister(node->InputAt(0));
  inputs[1] = g.UseRegister(node->InputAt(1));
  EmitAddBeforeS128LoadStore(this, &opcode, &input_count, &inputs[0]);
  Emit(opcode, 1, &output, input_count, inputs);
}

void InstructionSelector::VisitLoad(Node* node) {
  LoadRepresentation load_rep = LoadRepresentationOf(node->op());
  ArmOperandGenerator g(this);
@@ -2627,7 +2627,9 @@ void InstructionSelector::VisitF64x2UConvertI64x2(Node* node) {
  UNIMPLEMENTED();
}

#if !V8_TARGET_ARCH_ARM64
#if !V8_TARGET_ARCH_ARM
void InstructionSelector::VisitLoadTransform(Node* node) { UNIMPLEMENTED(); }
#endif  // !V8_TARGET_ARCH_ARM
#if !V8_TARGET_ARCH_IA32
void InstructionSelector::VisitF64x2Min(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitF64x2Max(Node* node) { UNIMPLEMENTED(); }
@@ -5556,6 +5556,63 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
        UNIMPLEMENTED();
      }
      break;
    case 9: {
      if (instr->Bits(21, 20) == 2) {
        // Bits(11, 8) is the B field in A7.7 Advanced SIMD element or structure
        // load/store instructions.
        if (instr->Bits(11, 8) == 0xC) {
          // vld1 (single element to all lanes).
          DCHECK_EQ(instr->Bits(11, 8), 0b1100);  // Type field.
          int Vd = (instr->Bit(22) << 4) | instr->VdValue();
          int Rn = instr->VnValue();
          int Rm = instr->VmValue();
          int32_t address = get_register(Rn);
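          // Bit 5 is the T field (one or two destination registers); bits
          // 7-6 are the element size.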
          int regs = instr->Bit(5) + 1;
          int size = instr->Bits(7, 6);
          uint32_t q_data[4];
          switch (size) {
            case Neon8: {
              uint8_t data = ReadBU(address);
              uint8_t* dst = reinterpret_cast<uint8_t*>(q_data);
              for (int i = 0; i < 16; i++) {
                dst[i] = data;
              }
              break;
            }
            case Neon16: {
              uint16_t data = ReadHU(address);
              uint16_t* dst = reinterpret_cast<uint16_t*>(q_data);
              for (int i = 0; i < 8; i++) {
                dst[i] = data;
              }
              break;
            }
            case Neon32: {
              uint32_t data = ReadW(address);
              for (int i = 0; i < 4; i++) {
                q_data[i] = data;
              }
              break;
            }
          }
          for (int r = 0; r < regs; r++) {
            set_neon_register(Vd + r, q_data);
          }
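          // Optional base writeback, selected by Rm: 15 means none, 13 means
          // write the address back to Rn, anything else adds Rm to Rn.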
          if (Rm != 15) {
            if (Rm == 13) {
              set_register(Rn, address);
            } else {
              set_register(Rn, get_register(Rn) + get_register(Rm));
            }
          }
        } else {
          UNIMPLEMENTED();
        }
      } else {
        UNIMPLEMENTED();
      }
      break;
    }
    case 0xA:
    case 0xB:
      if ((instr->Bits(22, 20) == 5) && (instr->Bits(15, 12) == 0xF)) {
@@ -3264,7 +3264,7 @@ WASM_SIMD_TEST(SimdLoadStoreLoadMemargOffset) {
  }
}

-#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64
+#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM
template <typename T>
void RunLoadSplatTest(ExecutionTier execution_tier, LowerSimd lower_simd,
                      WasmOpcode op) {
@@ -3354,7 +3354,7 @@ WASM_SIMD_TEST_NO_LOWERING(I64x2Load32x2S) {
  RunLoadExtendTest<int32_t, int64_t>(execution_tier, lower_simd,
                                      kExprI64x2Load32x2S);
}
-#endif  // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64
+#endif  // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_ARM64 || V8_TARGET_ARCH_ARM

#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_IA32 || V8_TARGET_ARCH_ARM64 || \
    V8_TARGET_ARCH_ARM