Commit 03f33f2e authored by bbudge, committed by Commit bot

[Turbofan] Add ARM NEON instructions for implementing SIMD.

- Adds NEON instructions to assembler, disassembler, simulator.
- Adds ExtractLane, ReplaceLane functions to macro assembler.

LOG=N
BUG=v8:4124

Review-Url: https://codereview.chromium.org/2546933002
Cr-Commit-Position: refs/heads/master@{#41737}
parent 250e85f8
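
For orientation, a minimal usage sketch of the new macro-assembler helpers (an added illustration, not part of the diff; the register choices and the usual masm/__ setup are assumed):

  __ vdup(Neon32, q0, r1);                 // q0 = {r1, r1, r1, r1}
  __ ExtractLane(r2, q0, NeonS32, 3);      // r2 = q0[3]
  __ ReplaceLane(q1, q0, r3, NeonS32, 0);  // q1 = q0, then q1[0] = r3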
......@@ -640,12 +640,26 @@ class NeonMemOperand BASE_EMBEDDED {
// Class NeonListOperand represents a list of NEON registers
class NeonListOperand BASE_EMBEDDED {
public:
explicit NeonListOperand(DoubleRegister base, int register_count = 1)
: base_(base), register_count_(register_count) {}
explicit NeonListOperand(QwNeonRegister q_reg)
: base_(q_reg.low()), register_count_(2) {}
DoubleRegister base() const { return base_; }
int register_count() { return register_count_; }
int length() const { return register_count_ - 1; }
NeonListType type() const {
switch (register_count_) {
default: UNREACHABLE();
// Fall through.
case 1: return nlt_1;
case 2: return nlt_2;
case 3: return nlt_3;
case 4: return nlt_4;
}
}
private:
DoubleRegister base_;
int register_count_;
};
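
For clarity, a hypothetical use of the count-based interface (added illustration, not from the diff):

  NeonListOperand list(d1, 3);  // the register list {d1, d2, d3}
  // list.type() == nlt_3; list.length() == 2 (the count, encoded as count - 1)
  NeonListOperand pair(q0);     // a Q register viewed as {d0, d1}, type nlt_2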
......@@ -1149,6 +1163,8 @@ class Assembler : public AssemblerBase {
void vmov(const DwVfpRegister dst,
const DwVfpRegister src,
const Condition cond = al);
// TODO(bbudge) Replace uses of these with the more general core register to
// scalar register vmov's.
void vmov(const DwVfpRegister dst,
const VmovIndex index,
const Register src,
......@@ -1329,11 +1345,43 @@ class Assembler : public AssemblerBase {
const NeonMemOperand& dst);
void vmovl(NeonDataType dt, QwNeonRegister dst, DwVfpRegister src);
// Only unconditional core <-> scalar moves are currently supported.
void vmov(NeonDataType dt, DwVfpRegister dst, int index, Register src);
void vmov(NeonDataType dt, Register dst, DwVfpRegister src, int index);
void vmov(const QwNeonRegister dst, const QwNeonRegister src);
void vmvn(const QwNeonRegister dst, const QwNeonRegister src);
void vswp(DwVfpRegister dst, DwVfpRegister src);
void vswp(QwNeonRegister dst, QwNeonRegister src);
// vdup conditional execution isn't supported.
void vdup(NeonSize size, const QwNeonRegister dst, const Register src);
void vdup(const QwNeonRegister dst, const SwVfpRegister src);
void vcvt_f32_s32(const QwNeonRegister dst, const QwNeonRegister src);
void vcvt_f32_u32(const QwNeonRegister dst, const QwNeonRegister src);
void vcvt_s32_f32(const QwNeonRegister dst, const QwNeonRegister src);
void vcvt_u32_f32(const QwNeonRegister dst, const QwNeonRegister src);
void veor(DwVfpRegister dst, DwVfpRegister src1, DwVfpRegister src2);
void veor(QwNeonRegister dst, QwNeonRegister src1, QwNeonRegister src2);
void vadd(const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
void vadd(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
void vsub(const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
void vsub(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
void vtst(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
void vceq(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
void vbsl(const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
void vtbl(const DwVfpRegister dst, const NeonListOperand& list,
const DwVfpRegister index);
void vtbx(const DwVfpRegister dst, const NeonListOperand& list,
const DwVfpRegister index);
// Pseudo instructions
......
......@@ -190,6 +190,7 @@ enum {
B7 = 1 << 7,
B8 = 1 << 8,
B9 = 1 << 9,
B10 = 1 << 10,
B12 = 1 << 12,
B16 = 1 << 16,
B17 = 1 << 17,
......@@ -218,7 +219,6 @@ enum {
kOff8Mask = (1 << 8) - 1
};
enum BarrierOption {
OSHLD = 0x1,
OSHST = 0x2,
......@@ -327,12 +327,12 @@ enum LFlag {
// NEON data type
enum NeonDataType {
NeonS8 = 0x1, // U = 0, imm3 = 0b001
NeonS16 = 0x2, // U = 0, imm3 = 0b010
NeonS32 = 0x4, // U = 0, imm3 = 0b100
NeonU8 = 1 << 24 | 0x1, // U = 1, imm3 = 0b001
NeonU16 = 1 << 24 | 0x2, // U = 1, imm3 = 0b010
NeonU32 = 1 << 24 | 0x4, // U = 1, imm3 = 0b100
NeonDataTypeSizeMask = 0x7,
NeonDataTypeUMask = 1 << 24
};
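
The enum packs the element size into the low three bits and the signedness into bit 24, so the two masks decompose a NeonDataType; a brief illustration (not from the diff):

  NeonDataType dt = NeonU16;
  int bytes_per_lane = dt & NeonDataTypeSizeMask;    // == 2
  bool is_unsigned = (dt & NeonDataTypeUMask) != 0;  // == true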
......@@ -667,7 +667,7 @@ class Instruction {
private:
// Join split register codes, depending on register precision.
// four_bit is the position of the least-significant bit of the four
// bit specifier. one_bit is the position of the additional single bit
// specifier.
......
......@@ -1419,6 +1419,9 @@ int Decoder::DecodeType7(Instruction* instr) {
// Sd = vsqrt(Sm)
// vmrs
// vmsr
// Qd = vdup.size(Qd, Rt)
// vmov.size: Dd[i] = Rt
// vmov.sign.size: Rt = Dn[i]
void Decoder::DecodeTypeVFP(Instruction* instr) {
VERIFY((instr->TypeValue() == 7) && (instr->Bit(24) == 0x0));
VERIFY(instr->Bits(11, 9) == 0x5);
......@@ -1531,21 +1534,71 @@ void Decoder::DecodeTypeVFP(Instruction* instr) {
if ((instr->VCValue() == 0x0) &&
(instr->VAValue() == 0x0)) {
DecodeVMOVBetweenCoreAndSinglePrecisionRegisters(instr);
} else if ((instr->VLValue() == 0x0) && (instr->VCValue() == 0x1)) {
if (instr->Bit(23) == 0) {
int opc1_opc2 = (instr->Bits(22, 21) << 2) | instr->Bits(6, 5);
if ((opc1_opc2 & 0xb) == 0) {
// NeonS32/NeonU32
if (instr->Bit(21) == 0x0) {
Format(instr, "vmov'cond.32 'Dd[0], 'rt");
} else {
Format(instr, "vmov'cond.32 'Dd[1], 'rt");
}
} else {
int vd = instr->VFPNRegValue(kDoublePrecision);
int rt = instr->RtValue();
if ((opc1_opc2 & 0x8) != 0) {
// NeonS8 / NeonU8
int i = opc1_opc2 & 0x7;
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vmov.8 d%d[%d], r%d", vd, i, rt);
} else if ((opc1_opc2 & 0x1) != 0) {
// NeonS16 / NeonU16
int i = (opc1_opc2 >> 1) & 0x3;
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vmov.16 d%d[%d], r%d", vd, i, rt);
} else {
Unknown(instr);
}
}
} else {
Format(instr, "vmov'cond.32 'Dd[1], 'rt");
}
} else if ((instr->VLValue() == 0x1) &&
(instr->VCValue() == 0x1) &&
(instr->Bit(23) == 0x0)) {
if (instr->Bit(21) == 0x0) {
Format(instr, "vmov'cond.32 'rt, 'Dd[0]");
int size = 32;
if (instr->Bit(5) != 0)
size = 16;
else if (instr->Bit(22) != 0)
size = 8;
int Vd = instr->VFPNRegValue(kSimd128Precision);
int Rt = instr->RtValue();
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vdup.%i q%d, r%d", size, Vd, Rt);
}
} else if ((instr->VLValue() == 0x1) && (instr->VCValue() == 0x1)) {
int opc1_opc2 = (instr->Bits(22, 21) << 2) | instr->Bits(6, 5);
if ((opc1_opc2 & 0xb) == 0) {
// NeonS32 / NeonU32
if (instr->Bit(21) == 0x0) {
Format(instr, "vmov'cond.32 'rt, 'Dd[0]");
} else {
Format(instr, "vmov'cond.32 'rt, 'Dd[1]");
}
} else {
Format(instr, "vmov'cond.32 'rt, 'Dd[1]");
const char* sign = instr->Bit(23) != 0 ? "u" : "s";
int rt = instr->RtValue();
int vn = instr->VFPNRegValue(kDoublePrecision);
if ((opc1_opc2 & 0x8) != 0) {
// NeonS8 / NeonU8
int i = opc1_opc2 & 0x7;
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vmov.%s8 r%d, d%d[%d]", sign, rt, vn, i);
} else if ((opc1_opc2 & 0x1) != 0) {
// NeonS16 / NeonU16
int i = (opc1_opc2 >> 1) & 0x3;
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vmov.%s16 r%d, d%d[%d]",
sign, rt, vn, i);
} else {
Unknown(instr);
}
}
} else if ((instr->VCValue() == 0x0) &&
(instr->VAValue() == 0x7) &&
......@@ -1563,6 +1616,8 @@ void Decoder::DecodeTypeVFP(Instruction* instr) {
Format(instr, "vmrs'cond 'rt, FPSCR");
}
}
} else {
Unknown(instr); // Not used by V8.
}
}
}
......@@ -1809,6 +1864,25 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
int Vm = instr->VFPMRegValue(kSimd128Precision);
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vmov q%d, q%d", Vd, Vm);
} else if (instr->Bits(11, 8) == 8) {
const char* op = (instr->Bit(4) == 0) ? "vadd" : "vtst";
int size = kBitsPerByte * (1 << instr->Bits(21, 20));
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision);
// vadd/vtst.i<size> Qd, Qm, Qn.
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "%s.i%d q%d, q%d, q%d", op,
size, Vd, Vn, Vm);
} else if (instr->Bits(11, 8) == 0xd && instr->Bit(4) == 0) {
const char* op = (instr->Bits(21, 20) == 0) ? "vadd" : "vsub";
int size = kBitsPerByte * (1 << instr->Bits(21, 20));
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision);
// vadd/vsub.f32 Qd, Qm, Qn.
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"%s.f32 q%d, q%d, q%d", op, Vd, Vn, Vm);
} else {
Unknown(instr);
}
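// One worked decode to check the bit slicing (added illustration; the
// encoding comes from the disassembler test later in this diff):
//   0xf2020844: Bits(11, 8) == 8, Bit(4) == 0 -> "vadd"
//               Bits(21, 20) == 0             -> size = 8 * (1 << 0) = 8
//               D:Vd, N:Vn, M:Vm = 0, 2, 4    -> q0, q1, q2 (codes halved)
//   => "vadd.i8 q0, q1, q2"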
......@@ -1828,8 +1902,29 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
}
break;
case 6:
if (instr->Bits(11, 8) == 8) {
int size = kBitsPerByte * (1 << instr->Bits(21, 20));
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision);
if (instr->Bit(4) == 0) {
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vsub.i%d q%d, q%d, q%d",
size, Vd, Vn, Vm);
} else {
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vceq.i%d q%d, q%d, q%d",
size, Vd, Vn, Vm);
}
} else if (instr->Bits(21, 20) == 1 && instr->Bits(11, 8) == 1 &&
instr->Bit(4) == 1) {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision);
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vbsl q%d, q%d, q%d", Vd, Vn, Vm);
} else if (instr->Bits(21, 20) == 0 && instr->Bits(11, 8) == 1 &&
instr->Bit(4) == 1) {
if (instr->Bit(6) == 0) {
// veor Dd, Dn, Dm
int Vd = instr->VFPDRegValue(kDoublePrecision);
......@@ -1860,6 +1955,35 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
int imm3 = instr->Bits(21, 19);
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vmovl.u%d q%d, d%d", imm3*8, Vd, Vm);
} else if (instr->Opc1Value() == 7 && instr->Bits(19, 16) == 0 &&
instr->Bits(11, 6) == 0x17 && instr->Bit(4) == 0) {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vmvn q%d, q%d", Vd, Vm);
} else if (instr->Opc1Value() == 7 && instr->Bits(19, 16) == 0xB &&
instr->Bits(11, 9) == 0x3 && instr->Bit(6) == 1 &&
instr->Bit(4) == 0) {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
const char* suffix = nullptr;
int op = instr->Bits(8, 7);
switch (op) {
case 0:
suffix = "f32.s32";
break;
case 1:
suffix = "f32.u32";
break;
case 2:
suffix = "s32.f32";
break;
case 3:
suffix = "u32.f32";
break;
}
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vcvt.%s q%d, q%d", suffix, Vd, Vm);
} else if ((instr->Bits(21, 16) == 0x32) && (instr->Bits(11, 7) == 0) &&
(instr->Bit(4) == 0)) {
if (instr->Bit(6) == 0) {
......@@ -1873,6 +1997,26 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vswp q%d, q%d", Vd, Vm);
}
} else if (instr->Opc1Value() == 0x7 && instr->Bits(11, 7) == 0x18 &&
instr->Bit(4) == 0x0) {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kDoublePrecision);
int index = instr->Bit(19);
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vdup q%d, d%d[%d]", Vd, Vm, index);
} else if (instr->Opc1Value() == 0x7 && instr->Bits(11, 10) == 0x2 &&
instr->Bit(4) == 0x0) {
int Vd = instr->VFPDRegValue(kDoublePrecision);
int Vn = instr->VFPNRegValue(kDoublePrecision);
int Vm = instr->VFPMRegValue(kDoublePrecision);
int len = instr->Bits(9, 8);
NeonListOperand list(DwVfpRegister::from_code(Vn), len + 1);
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "%s d%d, ",
instr->Bit(6) == 0 ? "vtbl.8" : "vtbx.8", Vd);
FormatNeonList(Vn, list.type());
Print(", ");
PrintDRegister(Vm);
} else {
Unknown(instr);
}
......
......@@ -1081,8 +1081,8 @@ void MacroAssembler::VmovLow(DwVfpRegister dst, Register src) {
}
void MacroAssembler::VmovExtended(Register dst, int src_code) {
DCHECK_LE(SwVfpRegister::kMaxNumRegisters, src_code);
DCHECK_GT(SwVfpRegister::kMaxNumRegisters * 2, src_code);
if (src_code & 0x1) {
VmovHigh(dst, DwVfpRegister::from_code(src_code / 2));
} else {
......@@ -1091,8 +1091,8 @@ void MacroAssembler::VmovExtended(Register dst, int src_code) {
}
void MacroAssembler::VmovExtended(int dst_code, Register src) {
DCHECK_LE(SwVfpRegister::kMaxNumRegisters, dst_code);
DCHECK_GT(SwVfpRegister::kMaxNumRegisters * 2, dst_code);
if (dst_code & 0x1) {
VmovHigh(DwVfpRegister::from_code(dst_code / 2), src);
} else {
......@@ -1102,22 +1102,23 @@ void MacroAssembler::VmovExtended(int dst_code, Register src) {
void MacroAssembler::VmovExtended(int dst_code, int src_code,
Register scratch) {
if (src_code < SwVfpRegister::kMaxNumRegisters &&
dst_code < SwVfpRegister::kMaxNumRegisters) {
// src and dst are both s-registers.
vmov(SwVfpRegister::from_code(dst_code),
SwVfpRegister::from_code(src_code));
} else if (src_code < SwVfpRegister::kMaxNumRegisters) {
// src is an s-register.
vmov(scratch, SwVfpRegister::from_code(src_code));
VmovExtended(dst_code, scratch);
} else if (dst_code < SwVfpRegister::kMaxNumRegisters) {
// dst is an s-register.
VmovExtended(scratch, src_code);
vmov(SwVfpRegister::from_code(dst_code), scratch);
} else {
// Neither src nor dst is an s-register.
DCHECK_GT(SwVfpRegister::kMaxNumRegisters * 2, src_code);
DCHECK_GT(SwVfpRegister::kMaxNumRegisters * 2, dst_code);
VmovExtended(scratch, src_code);
VmovExtended(dst_code, scratch);
}
......@@ -1125,7 +1126,7 @@ void MacroAssembler::VmovExtended(int dst_code, int src_code,
void MacroAssembler::VmovExtended(int dst_code, const MemOperand& src,
Register scratch) {
if (dst_code >= SwVfpRegister::kMaxNumRegisters) {
ldr(scratch, src);
VmovExtended(dst_code, scratch);
} else {
......@@ -1135,7 +1136,7 @@ void MacroAssembler::VmovExtended(int dst_code, const MemOperand& src,
void MacroAssembler::VmovExtended(const MemOperand& dst, int src_code,
Register scratch) {
if (src_code >= SwVfpRegister::kMaxNumRegisters) {
VmovExtended(scratch, src_code);
str(scratch, dst);
} else {
......@@ -1143,6 +1144,47 @@ void MacroAssembler::VmovExtended(const MemOperand& dst, int src_code,
}
}
void MacroAssembler::ExtractLane(Register dst, QwNeonRegister src,
NeonDataType dt, int lane) {
int bytes_per_lane = dt & NeonDataTypeSizeMask; // 1, 2, 4
int log2_bytes_per_lane = bytes_per_lane / 2; // 0, 1, 2
int byte = lane << log2_bytes_per_lane;
int double_word = byte >> kDoubleSizeLog2;
int double_byte = byte & (kDoubleSize - 1);
int double_lane = double_byte >> log2_bytes_per_lane;
DwVfpRegister double_source =
DwVfpRegister::from_code(src.code() * 2 + double_word);
vmov(dt, dst, double_source, double_lane);
}
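
To make the index arithmetic above concrete, one worked case (an added illustration): extracting NeonS16 lane 5 from q1:

  // dt = NeonS16 (0x2): bytes_per_lane = 2, log2_bytes_per_lane = 1
  // byte = 5 << 1 = 10; double_word = 10 >> 3 = 1 (the high half of q1, d3)
  // double_byte = 10 & 7 = 2; double_lane = 2 >> 1 = 1
  // => vmov(NeonS16, dst, d3, 1), i.e. vmov.s16 dst, d3[1]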
void MacroAssembler::ExtractLane(SwVfpRegister dst, QwNeonRegister src,
Register scratch, int lane) {
int s_code = src.code() * 4 + lane;
VmovExtended(dst.code(), s_code, scratch);
}
void MacroAssembler::ReplaceLane(QwNeonRegister dst, QwNeonRegister src,
Register src_lane, NeonDataType dt, int lane) {
Move(dst, src);
int bytes_per_lane = dt & NeonDataTypeSizeMask; // 1, 2, 4
int log2_bytes_per_lane = bytes_per_lane / 2; // 0, 1, 2
int byte = lane << log2_bytes_per_lane;
int double_word = byte >> kDoubleSizeLog2;
int double_byte = byte & (kDoubleSize - 1);
int double_lane = double_byte >> log2_bytes_per_lane;
DwVfpRegister double_dst =
DwVfpRegister::from_code(dst.code() * 2 + double_word);
vmov(dt, double_dst, double_lane, src_lane);
}
void MacroAssembler::ReplaceLane(QwNeonRegister dst, QwNeonRegister src,
SwVfpRegister src_lane, Register scratch,
int lane) {
Move(dst, src);
int s_code = dst.code() * 4 + lane;
VmovExtended(s_code, src_lane.code(), scratch);
}
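
Note that both ReplaceLane overloads copy src into dst first (via Move), so the untouched lanes are preserved; a hypothetical call:

  // q2 becomes a copy of q1 with lane 2 overwritten by r0.
  masm->ReplaceLane(q2, q1, r0, NeonS32, 2);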
void MacroAssembler::LslPair(Register dst_low, Register dst_high,
Register src_low, Register src_high,
Register scratch, Register shift) {
......
......@@ -561,6 +561,14 @@ class MacroAssembler: public Assembler {
void VmovExtended(int dst_code, const MemOperand& src, Register scratch);
void VmovExtended(const MemOperand& dst, int src_code, Register scratch);
void ExtractLane(Register dst, QwNeonRegister src, NeonDataType dt, int lane);
void ExtractLane(SwVfpRegister dst, QwNeonRegister src, Register scratch,
int lane);
void ReplaceLane(QwNeonRegister dst, QwNeonRegister src, Register src_lane,
NeonDataType dt, int lane);
void ReplaceLane(QwNeonRegister dst, QwNeonRegister src,
SwVfpRegister src_lane, Register scratch, int lane);
void LslPair(Register dst_low, Register dst_high, Register src_low,
Register src_high, Register scratch, Register shift);
void LslPair(Register dst_low, Register dst_high, Register src_low,
......
......@@ -339,6 +339,8 @@ class Simulator {
void DecodeVMOVBetweenCoreAndSinglePrecisionRegisters(Instruction* instr);
void DecodeVCMP(Instruction* instr);
void DecodeVCVTBetweenDoubleAndSingle(Instruction* instr);
int32_t ConvertDoubleToInt(double val, bool unsigned_integer,
VFPRoundingMode mode);
void DecodeVCVTBetweenFloatingPointAndInteger(Instruction* instr);
// Executes one instruction.
......
......@@ -936,10 +936,45 @@ TEST(Neon) {
"f3886a11 vmovl.u8 q3, d1");
COMPARE(vmovl(NeonU8, q4, d2),
"f3888a12 vmovl.u8 q4, d2");
COMPARE(vmov(NeonS8, d0, 0, r0), "ee400b10 vmov.8 d0[0], r0");
COMPARE(vmov(NeonU8, d1, 1, r1), "ee411b30 vmov.8 d1[1], r1");
COMPARE(vmov(NeonS8, d2, 2, r2), "ee422b50 vmov.8 d2[2], r2");
COMPARE(vmov(NeonU8, d3, 3, r8), "ee438b70 vmov.8 d3[3], r8");
COMPARE(vmov(NeonS8, d4, 4, r0), "ee640b10 vmov.8 d4[4], r0");
COMPARE(vmov(NeonU8, d5, 5, r1), "ee651b30 vmov.8 d5[5], r1");
COMPARE(vmov(NeonS8, d6, 6, r2), "ee662b50 vmov.8 d6[6], r2");
COMPARE(vmov(NeonU8, d7, 7, r8), "ee678b70 vmov.8 d7[7], r8");
COMPARE(vmov(NeonS16, d0, 0, r0), "ee000b30 vmov.16 d0[0], r0");
COMPARE(vmov(NeonS16, d1, 1, r1), "ee011b70 vmov.16 d1[1], r1");
COMPARE(vmov(NeonS16, d2, 2, r2), "ee222b30 vmov.16 d2[2], r2");
COMPARE(vmov(NeonS16, d3, 3, r7), "ee237b70 vmov.16 d3[3], r7");
COMPARE(vmov(NeonS32, d0, 0, r0), "ee000b10 vmov.32 d0[0], r0");
COMPARE(vmov(NeonU32, d0, 1, r0), "ee200b10 vmov.32 d0[1], r0");
COMPARE(vmov(NeonS8, r0, d0, 0), "ee500b10 vmov.s8 r0, d0[0]");
COMPARE(vmov(NeonU8, r1, d1, 1), "eed11b30 vmov.u8 r1, d1[1]");
COMPARE(vmov(NeonS8, r2, d2, 2), "ee522b50 vmov.s8 r2, d2[2]");
COMPARE(vmov(NeonU8, r8, d3, 3), "eed38b70 vmov.u8 r8, d3[3]");
COMPARE(vmov(NeonS8, r0, d4, 4), "ee740b10 vmov.s8 r0, d4[4]");
COMPARE(vmov(NeonU8, r1, d5, 5), "eef51b30 vmov.u8 r1, d5[5]");
COMPARE(vmov(NeonS8, r2, d6, 6), "ee762b50 vmov.s8 r2, d6[6]");
COMPARE(vmov(NeonU8, r8, d7, 7), "eef78b70 vmov.u8 r8, d7[7]");
COMPARE(vmov(NeonS16, r0, d0, 0), "ee100b30 vmov.s16 r0, d0[0]");
COMPARE(vmov(NeonU16, r1, d1, 1), "ee911b70 vmov.u16 r1, d1[1]");
COMPARE(vmov(NeonS16, r2, d2, 2), "ee322b30 vmov.s16 r2, d2[2]");
COMPARE(vmov(NeonU16, r7, d3, 3), "eeb37b70 vmov.u16 r7, d3[3]");
COMPARE(vmov(NeonS32, r2, d15, 0), "ee1f2b10 vmov.32 r2, d15[0]");
COMPARE(vmov(NeonS32, r3, d14, 1), "ee3e3b10 vmov.32 r3, d14[1]");
COMPARE(vmov(q0, q15),
"f22e01fe vmov q0, q15");
COMPARE(vmov(q8, q9),
"f26201f2 vmov q8, q9");
COMPARE(vmvn(q0, q15),
"f3b005ee vmvn q0, q15");
COMPARE(vmvn(q8, q9),
"f3f005e2 vmvn q8, q9");
COMPARE(vswp(d0, d31),
"f3b2002f vswp d0, d31");
COMPARE(vswp(d16, d14),
......@@ -948,6 +983,24 @@ TEST(Neon) {
"f3b2006e vswp q0, q15");
COMPARE(vswp(q8, q9),
"f3f20062 vswp q8, q9");
COMPARE(vdup(Neon8, q0, r0),
"eee00b10 vdup.8 q0, r0");
COMPARE(vdup(Neon16, q1, r4),
"eea24b30 vdup.16 q1, r4");
COMPARE(vdup(Neon32, q15, r1),
"eeae1b90 vdup.32 q15, r1");
COMPARE(vdup(q0, s3),
"f3bc0c41 vdup q0, d1[1]");
COMPARE(vdup(q15, s2),
"f3f4ec41 vdup q15, d1[0]");
COMPARE(vcvt_f32_s32(q15, q1),
"f3fbe642 vcvt.f32.s32 q15, q1");
COMPARE(vcvt_f32_u32(q8, q9),
"f3fb06e2 vcvt.f32.u32 q8, q9");
COMPARE(vcvt_s32_f32(q15, q1),
"f3fbe742 vcvt.s32.f32 q15, q1");
COMPARE(vcvt_u32_f32(q8, q9),
"f3fb07e2 vcvt.u32.f32 q8, q9");
COMPARE(veor(d0, d1, d2),
"f3010112 veor d0, d1, d2");
COMPARE(veor(d0, d30, d31),
......@@ -956,6 +1009,54 @@ TEST(Neon) {
"f3020154 veor q0, q1, q2");
COMPARE(veor(q15, q0, q8),
"f340e170 veor q15, q0, q8");
COMPARE(vadd(q15, q0, q8),
"f240ed60 vadd.f32 q15, q0, q8");
COMPARE(vadd(Neon8, q0, q1, q2),
"f2020844 vadd.i8 q0, q1, q2");
COMPARE(vadd(Neon16, q1, q2, q8),
"f2142860 vadd.i16 q1, q2, q8");
COMPARE(vadd(Neon32, q15, q0, q8),
"f260e860 vadd.i32 q15, q0, q8");
COMPARE(vsub(q15, q0, q8),
"f260ed60 vsub.f32 q15, q0, q8");
COMPARE(vsub(Neon8, q0, q1, q2),
"f3020844 vsub.i8 q0, q1, q2");
COMPARE(vsub(Neon16, q1, q2, q8),
"f3142860 vsub.i16 q1, q2, q8");
COMPARE(vsub(Neon32, q15, q0, q8),
"f360e860 vsub.i32 q15, q0, q8");
COMPARE(vtst(Neon8, q0, q1, q2),
"f2020854 vtst.i8 q0, q1, q2");
COMPARE(vtst(Neon16, q1, q2, q8),
"f2142870 vtst.i16 q1, q2, q8");
COMPARE(vtst(Neon32, q15, q0, q8),
"f260e870 vtst.i32 q15, q0, q8");
COMPARE(vceq(Neon8, q0, q1, q2),
"f3020854 vceq.i8 q0, q1, q2");
COMPARE(vceq(Neon16, q1, q2, q8),
"f3142870 vceq.i16 q1, q2, q8");
COMPARE(vceq(Neon32, q15, q0, q8),
"f360e870 vceq.i32 q15, q0, q8");
COMPARE(vbsl(q0, q1, q2),
"f3120154 vbsl q0, q1, q2");
COMPARE(vbsl(q15, q0, q8),
"f350e170 vbsl q15, q0, q8");
COMPARE(vtbl(d0, NeonListOperand(d1, 1), d2),
"f3b10802 vtbl.8 d0, {d1}, d2");
COMPARE(vtbl(d31, NeonListOperand(d0, 2), d4),
"f3f0f904 vtbl.8 d31, {d0, d1}, d4");
COMPARE(vtbl(d15, NeonListOperand(d1, 3), d5),
"f3b1fa05 vtbl.8 d15, {d1, d2, d3}, d5");
COMPARE(vtbl(d15, NeonListOperand(d1, 4), d5),
"f3b1fb05 vtbl.8 d15, {d1, d2, d3, d4}, d5");
COMPARE(vtbx(d0, NeonListOperand(d1, 1), d2),
"f3b10842 vtbx.8 d0, {d1}, d2");
COMPARE(vtbx(d31, NeonListOperand(d0, 2), d4),
"f3f0f944 vtbx.8 d31, {d0, d1}, d4");
COMPARE(vtbx(d15, NeonListOperand(d1, 3), d5),
"f3b1fa45 vtbx.8 d15, {d1, d2, d3}, d5");
COMPARE(vtbx(d15, NeonListOperand(d1, 4), d5),
"f3b1fb45 vtbx.8 d15, {d1, d2, d3, d4}, d5");
}
VERIFY_RUN();
......
......@@ -42,6 +42,7 @@ typedef void* (*F)(int x, int y, int p2, int p3, int p4);
#define __ masm->
typedef Object* (*F3)(void* p0, int p1, int p2, int p3, int p4);
typedef int (*F5)(void*, void*, void*, void*, void*);
......@@ -134,4 +135,248 @@ TEST(LoadAndStoreWithRepresentation) {
CHECK(!CALL_GENERATED_CODE(isolate, f, 0, 0, 0, 0, 0));
}
TEST(ExtractLane) {
if (!CpuFeatures::IsSupported(NEON)) return;
// Allocate an executable page of memory.
size_t actual_size;
byte* buffer = static_cast<byte*>(v8::base::OS::Allocate(
Assembler::kMinimalBufferSize, &actual_size, true));
CHECK(buffer);
Isolate* isolate = CcTest::i_isolate();
HandleScope handles(isolate);
MacroAssembler assembler(isolate, buffer, static_cast<int>(actual_size),
v8::internal::CodeObjectRequired::kYes);
MacroAssembler* masm = &assembler; // Create a pointer for the __ macro.
typedef struct {
int32_t i32x4_low[4];
int32_t i32x4_high[4];
int32_t i16x8_low[8];
int32_t i16x8_high[8];
int32_t i8x16_low[16];
int32_t i8x16_high[16];
int32_t f32x4_low[4];
int32_t f32x4_high[4];
} T;
T t;
__ stm(db_w, sp, r4.bit() | r5.bit() | lr.bit());
for (int i = 0; i < 4; i++) {
__ mov(r4, Operand(i));
__ vdup(Neon32, q1, r4);
__ ExtractLane(r5, q1, NeonS32, i);
__ str(r5, MemOperand(r0, offsetof(T, i32x4_low) + 4 * i));
SwVfpRegister si = SwVfpRegister::from_code(i);
__ ExtractLane(si, q1, r4, i);
__ vstr(si, r0, offsetof(T, f32x4_low) + 4 * i);
}
for (int i = 0; i < 8; i++) {
__ mov(r4, Operand(i));
__ vdup(Neon16, q1, r4);
__ ExtractLane(r5, q1, NeonS16, i);
__ str(r5, MemOperand(r0, offsetof(T, i16x8_low) + 4 * i));
}
for (int i = 0; i < 16; i++) {
__ mov(r4, Operand(i));
__ vdup(Neon8, q1, r4);
__ ExtractLane(r5, q1, NeonS8, i);
__ str(r5, MemOperand(r0, offsetof(T, i8x16_low) + 4 * i));
}
if (CpuFeatures::IsSupported(VFP32DREGS)) {
for (int i = 0; i < 4; i++) {
__ mov(r4, Operand(-i));
__ vdup(Neon32, q15, r4);
__ ExtractLane(r5, q15, NeonS32, i);
__ str(r5, MemOperand(r0, offsetof(T, i32x4_high) + 4 * i));
SwVfpRegister si = SwVfpRegister::from_code(i);
__ ExtractLane(si, q15, r4, i);
__ vstr(si, r0, offsetof(T, f32x4_high) + 4 * i);
}
for (int i = 0; i < 8; i++) {
__ mov(r4, Operand(-i));
__ vdup(Neon16, q15, r4);
__ ExtractLane(r5, q15, NeonS16, i);
__ str(r5, MemOperand(r0, offsetof(T, i16x8_high) + 4 * i));
}
for (int i = 0; i < 16; i++) {
__ mov(r4, Operand(-i));
__ vdup(Neon8, q15, r4);
__ ExtractLane(r5, q15, NeonS8, i);
__ str(r5, MemOperand(r0, offsetof(T, i8x16_high) + 4 * i));
}
}
__ ldm(ia_w, sp, r4.bit() | r5.bit() | pc.bit());
CodeDesc desc;
masm->GetCode(&desc);
Handle<Code> code = isolate->factory()->NewCode(
desc, Code::ComputeFlags(Code::STUB), Handle<Code>());
#ifdef DEBUG
OFStream os(stdout);
code->Print(os);
#endif
F3 f = FUNCTION_CAST<F3>(code->entry());
Object* dummy = CALL_GENERATED_CODE(isolate, f, &t, 0, 0, 0, 0);
USE(dummy);
for (int i = 0; i < 4; i++) {
CHECK_EQ(i, t.i32x4_low[i]);
CHECK_EQ(i, t.f32x4_low[i]);
}
for (int i = 0; i < 8; i++) {
CHECK_EQ(i, t.i16x8_low[i]);
}
for (int i = 0; i < 16; i++) {
CHECK_EQ(i, t.i8x16_low[i]);
}
if (CpuFeatures::IsSupported(VFP32DREGS)) {
for (int i = 0; i < 4; i++) {
CHECK_EQ(-i, t.i32x4_high[i]);
CHECK_EQ(-i, t.f32x4_high[i]);
}
for (int i = 0; i < 8; i++) {
CHECK_EQ(-i, t.i16x8_high[i]);
}
for (int i = 0; i < 16; i++) {
CHECK_EQ(-i, t.i8x16_high[i]);
}
}
}
TEST(ReplaceLane) {
if (!CpuFeatures::IsSupported(NEON)) return;
// Allocate an executable page of memory.
size_t actual_size;
byte* buffer = static_cast<byte*>(v8::base::OS::Allocate(
Assembler::kMinimalBufferSize, &actual_size, true));
CHECK(buffer);
Isolate* isolate = CcTest::i_isolate();
HandleScope handles(isolate);
MacroAssembler assembler(isolate, buffer, static_cast<int>(actual_size),
v8::internal::CodeObjectRequired::kYes);
MacroAssembler* masm = &assembler; // Create a pointer for the __ macro.
typedef struct {
int32_t i32x4_low[4];
int32_t i32x4_high[4];
int16_t i16x8_low[8];
int16_t i16x8_high[8];
int8_t i8x16_low[16];
int8_t i8x16_high[16];
int32_t f32x4_low[4];
int32_t f32x4_high[4];
} T;
T t;
__ stm(db_w, sp, r4.bit() | r5.bit() | r6.bit() | r7.bit() | lr.bit());
const Register kScratch = r5;
__ veor(q0, q0, q0); // Zero
__ veor(q1, q1, q1); // Zero
for (int i = 0; i < 4; i++) {
__ mov(r4, Operand(i));
__ ReplaceLane(q0, q0, r4, NeonS32, i);
SwVfpRegister si = SwVfpRegister::from_code(i);
__ vmov(si, r4);
__ ReplaceLane(q1, q1, si, kScratch, i);
}
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, i32x4_low))));
__ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, f32x4_low))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
__ veor(q0, q0, q0); // Zero
for (int i = 0; i < 8; i++) {
__ mov(r4, Operand(i));
__ ReplaceLane(q0, q0, r4, NeonS16, i);
}
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, i16x8_low))));
__ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
__ veor(q0, q0, q0); // Zero
for (int i = 0; i < 16; i++) {
__ mov(r4, Operand(i));
__ ReplaceLane(q0, q0, r4, NeonS8, i);
}
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, i8x16_low))));
__ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
if (CpuFeatures::IsSupported(VFP32DREGS)) {
__ veor(q14, q14, q14); // Zero
__ veor(q15, q15, q15); // Zero
for (int i = 0; i < 4; i++) {
__ mov(r4, Operand(-i));
__ ReplaceLane(q14, q14, r4, NeonS32, i);
SwVfpRegister si = SwVfpRegister::from_code(i);
__ vmov(si, r4);
__ ReplaceLane(q15, q15, si, kScratch, i);
}
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, i32x4_high))));
__ vst1(Neon8, NeonListOperand(q14), NeonMemOperand(r4));
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, f32x4_high))));
__ vst1(Neon8, NeonListOperand(q15), NeonMemOperand(r4));
__ veor(q14, q14, q14); // Zero
for (int i = 0; i < 8; i++) {
__ mov(r4, Operand(-i));
__ ReplaceLane(q14, q14, r4, NeonS16, i);
}
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, i16x8_high))));
__ vst1(Neon8, NeonListOperand(q14), NeonMemOperand(r4));
__ veor(q14, q14, q14); // Zero
for (int i = 0; i < 16; i++) {
__ mov(r4, Operand(-i));
__ ReplaceLane(q14, q14, r4, NeonS8, i);
}
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, i8x16_high))));
__ vst1(Neon8, NeonListOperand(q14), NeonMemOperand(r4));
}
__ ldm(ia_w, sp, r4.bit() | r5.bit() | r6.bit() | r7.bit() | pc.bit());
CodeDesc desc;
masm->GetCode(&desc);
Handle<Code> code = isolate->factory()->NewCode(
desc, Code::ComputeFlags(Code::STUB), Handle<Code>());
#ifdef DEBUG
OFStream os(stdout);
code->Print(os);
#endif
F3 f = FUNCTION_CAST<F3>(code->entry());
Object* dummy = CALL_GENERATED_CODE(isolate, f, &t, 0, 0, 0, 0);
USE(dummy);
for (int i = 0; i < 4; i++) {
CHECK_EQ(i, t.i32x4_low[i]);
CHECK_EQ(i, t.f32x4_low[i]);
}
for (int i = 0; i < 8; i++) {
CHECK_EQ(i, t.i16x8_low[i]);
}
for (int i = 0; i < 16; i++) {
CHECK_EQ(i, t.i8x16_low[i]);
}
if (CpuFeatures::IsSupported(VFP32DREGS)) {
for (int i = 0; i < 4; i++) {
CHECK_EQ(-i, t.i32x4_high[i]);
CHECK_EQ(-i, t.f32x4_high[i]);
}
for (int i = 0; i < 8; i++) {
CHECK_EQ(-i, t.i16x8_high[i]);
}
for (int i = 0; i < 16; i++) {
CHECK_EQ(-i, t.i8x16_high[i]);
}
}
}
#undef __