Commit b3acc272 authored by bbudge, committed by Commit bot

[ARM] Improve VFP register moves.

- Adds vdup.<size> Dd/Qd, Dm[i] instructions.
- Adds vsli, vsri instructions.
- Changes VMovExtended to use these to avoid moves to core registers (see the sketch below).

LOG=N
BUG=v8:6020

Review-Url: https://codereview.chromium.org/2868603002
Cr-Commit-Position: refs/heads/master@{#45351}
parent 211cc585
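For orientation (not part of the original commit), here is a minimal, self-contained C++ sketch of the d-register lane moves that the new VmovExtended path relies on. The helper names vsli64_32, vsri64_32 and vdup32 are hypothetical; they only model the semantics of vsli.64 #32, vsri.64 #32 and vdup.32 Dd, Dm[i] that the assembler, simulator and tests below exercise.

// Hypothetical stand-alone model (not V8 code) of the NEON shift-and-insert
// lane moves. A d-register is modeled as a uint64_t with lane 0 (the even
// s-register) in the low 32 bits and lane 1 (the odd s-register) in the
// high 32 bits.
#include <cassert>
#include <cstdint>

// vsli.64 Dd, Dm, #32: dst.high = src.low; dst.low is preserved.
uint64_t vsli64_32(uint64_t dst, uint64_t src) {
  return (src << 32) | (dst & 0xFFFFFFFFu);
}

// vsri.64 Dd, Dm, #32: dst.low = src.high; dst.high is preserved.
uint64_t vsri64_32(uint64_t dst, uint64_t src) {
  return (src >> 32) | (dst & 0xFFFFFFFF00000000u);
}

// vdup.32 Dd, Dm[index]: both lanes of dst receive the selected source lane.
uint64_t vdup32(uint64_t src, int index) {
  uint64_t lane = (src >> (32 * index)) & 0xFFFFFFFFu;
  return (lane << 32) | lane;
}

int main() {
  uint64_t d0 = 0x1111111122222222u;  // s1 = 0x11111111, s0 = 0x22222222
  uint64_t d1 = 0x3333333344444444u;  // s3 = 0x33333333, s2 = 0x44444444
  d1 = vsli64_32(d1, d0);             // s3 <- s0, s2 untouched
  assert(d1 == 0x2222222244444444u);
  d1 = vsri64_32(d1, d0);             // s2 <- s1, s3 untouched
  assert(d1 == 0x2222222211111111u);
  assert(vdup32(d0, 1) == 0x1111111111111111u);  // broadcast d0[1]
  return 0;
}

With these three forms a single 32-bit lane can be copied between d-registers without touching a core register, which is what the NEON branch of VmovExtended now emits.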
......@@ -3910,19 +3910,47 @@ void Assembler::vdup(NeonSize size, QwNeonRegister dst, Register src) {
0xB * B8 | d * B7 | E * B5 | B4);
}
void Assembler::vdup(QwNeonRegister dst, SwVfpRegister src) {
DCHECK(IsEnabled(NEON));
// Instruction details available in ARM DDI 0406C.b, A8-884.
int index = src.code() & 1;
int d_reg = src.code() / 2;
int imm4 = 4 | index << 3; // esize = 32, index in bit 3.
enum NeonRegType { NEON_D, NEON_Q };
void NeonSplitCode(NeonRegType type, int code, int* vm, int* m, int* encoding) {
if (type == NEON_D) {
DwVfpRegister::split_code(code, vm, m);
} else {
DCHECK_EQ(type, NEON_Q);
QwNeonRegister::split_code(code, vm, m);
*encoding |= B6;
}
}
static Instr EncodeNeonDupOp(NeonSize size, NeonRegType reg_type, int dst_code,
DwVfpRegister src, int index) {
DCHECK_NE(Neon64, size);
int sz = static_cast<int>(size);
DCHECK_LE(0, index);
DCHECK_GT(kSimd128Size / (1 << sz), index);
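// The lowest set bit of imm4 encodes the element size (1 -> 8, 2 -> 16, 4 -> 32);
// the bits above it hold the lane index (ARM DDI 0406C.b, A8-884).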
int imm4 = (1 << sz) | ((index << (sz + 1)) & 0xF);
int qbit = 0;
int vd, d;
dst.split_code(&vd, &d);
NeonSplitCode(reg_type, dst_code, &vd, &d, &qbit);
int vm, m;
DwVfpRegister::from_code(d_reg).split_code(&vm, &m);
src.split_code(&vm, &m);
emit(0x1E7U * B23 | d * B22 | 0x3 * B20 | imm4 * B16 | vd * B12 | 0x18 * B7 |
B6 | m * B5 | vm);
return 0x1E7U * B23 | d * B22 | 0x3 * B20 | imm4 * B16 | vd * B12 |
0x18 * B7 | qbit | m * B5 | vm;
}
void Assembler::vdup(NeonSize size, DwVfpRegister dst, DwVfpRegister src,
int index) {
DCHECK(IsEnabled(NEON));
// Instruction details available in ARM DDI 0406C.b, A8-884.
emit(EncodeNeonDupOp(size, NEON_D, dst.code(), src, index));
}
void Assembler::vdup(NeonSize size, QwNeonRegister dst, DwVfpRegister src,
int index) {
// Instruction details available in ARM DDI 0406C.b, A8-884.
DCHECK(IsEnabled(NEON));
emit(EncodeNeonDupOp(size, NEON_Q, dst.code(), src, index));
}
// Encode NEON vcvt.src_type.dst_type instruction.
......@@ -3977,18 +4005,6 @@ void Assembler::vcvt_u32_f32(QwNeonRegister dst, QwNeonRegister src) {
emit(EncodeNeonVCVT(U32, dst, F32, src));
}
enum NeonRegType { NEON_D, NEON_Q };
void NeonSplitCode(NeonRegType type, int code, int* vm, int* m, int* encoding) {
if (type == NEON_D) {
DwVfpRegister::split_code(code, vm, m);
} else {
DCHECK_EQ(type, NEON_Q);
QwNeonRegister::split_code(code, vm, m);
*encoding |= B6;
}
}
enum UnaryOp { VMVN, VSWP, VABS, VABSF, VNEG, VNEGF };
static Instr EncodeNeonUnaryOp(UnaryOp op, NeonRegType reg_type, NeonSize size,
......@@ -4403,30 +4419,55 @@ void Assembler::vmax(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src1,
emit(EncodeNeonBinOp(VMAX, dt, dst, src1, src2));
}
enum NeonShiftOp { VSHL, VSHR };
enum NeonShiftOp { VSHL, VSHR, VSLI, VSRI };
static Instr EncodeNeonShiftOp(NeonShiftOp op, NeonDataType dt,
QwNeonRegister dst, QwNeonRegister src,
static Instr EncodeNeonShiftOp(NeonShiftOp op, NeonSize size, bool is_unsigned,
NeonRegType reg_type, int dst_code, int src_code,
int shift) {
int vd, d;
dst.split_code(&vd, &d);
int vm, m;
src.split_code(&vm, &m);
int size_in_bits = kBitsPerByte << NeonSz(dt);
int op_encoding = 0;
int imm6 = 0;
if (op == VSHL) {
DCHECK(shift >= 0 && size_in_bits > shift);
imm6 = size_in_bits + shift;
op_encoding = 0x5 * B8;
} else {
DCHECK_EQ(VSHR, op);
DCHECK(shift > 0 && size_in_bits >= shift);
imm6 = 2 * size_in_bits - shift;
op_encoding = NeonU(dt) * B24;
int size_in_bits = kBitsPerByte << static_cast<int>(size);
int op_encoding = 0;
switch (op) {
case VSHL: {
DCHECK(shift >= 0 && size_in_bits > shift);
imm6 = size_in_bits + shift;
op_encoding = 0x5 * B8;
break;
}
case VSHR: {
DCHECK(shift > 0 && size_in_bits >= shift);
imm6 = 2 * size_in_bits - shift;
if (is_unsigned) op_encoding |= B24;
break;
}
case VSLI: {
DCHECK(shift >= 0 && size_in_bits > shift);
imm6 = size_in_bits + shift;
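// A 64-bit element size overflows imm6; the extra bit becomes the L bit (bit 7).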
int L = imm6 >> 6;
imm6 &= 0x3F;
op_encoding = B24 | 0x5 * B8 | L * B7;
break;
}
case VSRI: {
DCHECK(shift > 0 && size_in_bits >= shift);
imm6 = 2 * size_in_bits - shift;
int L = imm6 >> 6;
imm6 &= 0x3F;
op_encoding = B24 | 0x4 * B8 | L * B7;
break;
}
default:
UNREACHABLE();
break;
}
return 0x1E5U * B23 | d * B22 | imm6 * B16 | vd * B12 | B6 | m * B5 | B4 |
vm | op_encoding;
int vd, d;
NeonSplitCode(reg_type, dst_code, &vd, &d, &op_encoding);
int vm, m;
NeonSplitCode(reg_type, src_code, &vm, &m, &op_encoding);
return 0x1E5U * B23 | d * B22 | imm6 * B16 | vd * B12 | m * B5 | B4 | vm |
op_encoding;
}
void Assembler::vshl(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src,
......@@ -4434,7 +4475,8 @@ void Assembler::vshl(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src,
DCHECK(IsEnabled(NEON));
// Qd = vshl(Qm, bits) SIMD shift left immediate.
// Instruction details available in ARM DDI 0406C.b, A8-1046.
emit(EncodeNeonShiftOp(VSHL, dt, dst, src, shift));
emit(EncodeNeonShiftOp(VSHL, NeonDataTypeToSize(dt), false, NEON_Q,
dst.code(), src.code(), shift));
}
void Assembler::vshr(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src,
......@@ -4442,7 +4484,26 @@ void Assembler::vshr(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src,
DCHECK(IsEnabled(NEON));
// Qd = vshl(Qm, bits) SIMD shift right immediate.
// Instruction details available in ARM DDI 0406C.b, A8-1052.
emit(EncodeNeonShiftOp(VSHR, dt, dst, src, shift));
emit(EncodeNeonShiftOp(VSHR, NeonDataTypeToSize(dt), NeonU(dt), NEON_Q,
dst.code(), src.code(), shift));
}
void Assembler::vsli(NeonSize size, DwVfpRegister dst, DwVfpRegister src,
int shift) {
DCHECK(IsEnabled(NEON));
// Dd = vsli(Dm, bits) SIMD shift left and insert.
// Instruction details available in ARM DDI 0406C.b, A8-1056.
emit(EncodeNeonShiftOp(VSLI, size, false, NEON_D, dst.code(), src.code(),
shift));
}
void Assembler::vsri(NeonSize size, DwVfpRegister dst, DwVfpRegister src,
int shift) {
DCHECK(IsEnabled(NEON));
// Dd = vsri(Dm, bits) SIMD shift right and insert.
// Instruction details available in ARM DDI 0406C.b, A8-1062.
emit(EncodeNeonShiftOp(VSRI, size, false, NEON_D, dst.code(), src.code(),
shift));
}
static Instr EncodeNeonEstimateOp(bool is_rsqrt, QwNeonRegister dst,
......@@ -4539,7 +4600,7 @@ void Assembler::vpadd(NeonSize size, DwVfpRegister dst, DwVfpRegister src1,
DCHECK(IsEnabled(NEON));
// Dd = vpadd(Dn, Dm) SIMD integer pairwise ADD.
// Instruction details available in ARM DDI 0406C.b, A8-980.
emit(EncodeNeonPairwiseOp(VPADD, NeonSizeToDatatype(size), dst, src1, src2));
emit(EncodeNeonPairwiseOp(VPADD, NeonSizeToDataType(size), dst, src1, src2));
}
void Assembler::vpmin(NeonDataType dt, DwVfpRegister dst, DwVfpRegister src1,
......
......@@ -426,9 +426,10 @@ constexpr LowDwVfpRegister kLastCalleeSavedDoubleReg = d15;
constexpr LowDwVfpRegister kDoubleRegZero = d13;
constexpr LowDwVfpRegister kScratchDoubleReg = d14;
// This scratch q-register aliases d14 (kScratchDoubleReg) and d15, but is only
// used when NEON is supported. d15 is still allocatable if there are only 16
// VFP registers.
// used if NEON is supported, which implies VFP32DREGS. When there are only 16
// d-registers, d15 is still allocatable.
constexpr QwNeonRegister kScratchQuadReg = q7;
constexpr LowDwVfpRegister kScratchDoubleReg2 = d15;
// Coprocessor register
struct CRegister {
......@@ -1331,7 +1332,8 @@ class Assembler : public AssemblerBase {
void vmov(QwNeonRegister dst, QwNeonRegister src);
void vdup(NeonSize size, QwNeonRegister dst, Register src);
void vdup(QwNeonRegister dst, SwVfpRegister src);
void vdup(NeonSize size, QwNeonRegister dst, DwVfpRegister src, int index);
void vdup(NeonSize size, DwVfpRegister dst, DwVfpRegister src, int index);
void vcvt_f32_s32(QwNeonRegister dst, QwNeonRegister src);
void vcvt_f32_u32(QwNeonRegister dst, QwNeonRegister src);
......@@ -1380,6 +1382,8 @@ class Assembler : public AssemblerBase {
DwVfpRegister src2);
void vshl(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src, int shift);
void vshr(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src, int shift);
void vsli(NeonSize size, DwVfpRegister dst, DwVfpRegister src, int shift);
void vsri(NeonSize size, DwVfpRegister dst, DwVfpRegister src, int shift);
// vrecpe and vrsqrte only support floating point lanes.
void vrecpe(QwNeonRegister dst, QwNeonRegister src);
void vrsqrte(QwNeonRegister dst, QwNeonRegister src);
......
......@@ -342,10 +342,15 @@ inline int NeonU(NeonDataType dt) { return static_cast<int>(dt) >> 2; }
inline int NeonSz(NeonDataType dt) { return static_cast<int>(dt) & 0x3; }
// Convert sizes to data types (U bit is clear).
inline NeonDataType NeonSizeToDatatype(NeonSize size) {
inline NeonDataType NeonSizeToDataType(NeonSize size) {
DCHECK_NE(Neon64, size);
return static_cast<NeonDataType>(size);
}
inline NeonSize NeonDataTypeToSize(NeonDataType dt) {
return static_cast<NeonSize>(NeonSz(dt));
}
enum NeonListType {
nlt_1 = 0x7,
nlt_2 = 0xA,
......
......@@ -2211,11 +2211,30 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
"vmovl.u%d q%d, d%d", imm3 * 8, Vd, Vm);
} else if (instr->Opc1Value() == 7 && instr->Bit(4) == 0) {
if (instr->Bits(11, 7) == 0x18) {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kDoublePrecision);
int index = instr->Bit(19);
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vdup q%d, d%d[%d]", Vd, Vm, index);
int imm4 = instr->Bits(19, 16);
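// The lowest set bit of imm4 gives the element size; the bits above it are the lane index.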
int size = 0, index = 0;
if ((imm4 & 0x1) != 0) {
size = 8;
index = imm4 >> 1;
} else if ((imm4 & 0x2) != 0) {
size = 16;
index = imm4 >> 2;
} else {
size = 32;
index = imm4 >> 3;
}
if (instr->Bit(6) == 0) {
int Vd = instr->VFPDRegValue(kDoublePrecision);
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vdup.%i d%d, d%d[%d]",
size, Vd, Vm, index);
} else {
int Vd = instr->VFPDRegValue(kSimd128Precision);
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vdup.%i q%d, d%d[%d]",
size, Vd, Vm, index);
}
} else if (instr->Bits(11, 10) == 0x2) {
int Vd = instr->VFPDRegValue(kDoublePrecision);
int Vn = instr->VFPNRegValue(kDoublePrecision);
......@@ -2346,6 +2365,27 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vshr.u%d q%d, q%d, #%d",
size, Vd, Vm, shift);
} else if (instr->Bit(10) == 1 && instr->Bit(6) == 0 &&
instr->Bit(4) == 1) {
// vsli.<size> Dd, Dm, shift
// vsri.<size> Dd, Dm, shift
int imm7 = instr->Bits(21, 16);
if (instr->Bit(7) != 0) imm7 += 64;
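// imm7 encodes size + shift (vsli) or 2 * size - shift (vsri); rounding down
// to a power of two recovers the element size.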
int size = base::bits::RoundDownToPowerOfTwo32(imm7);
int shift;
char direction;
if (instr->Bit(8) == 1) {
shift = imm7 - size;
direction = 'l'; // vsli
} else {
shift = 2 * size - imm7;
direction = 'r'; // vsri
}
int Vd = instr->VFPDRegValue(kDoublePrecision);
int Vm = instr->VFPMRegValue(kDoublePrecision);
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vs%ci.%d d%d, d%d, #%d",
direction, size, Vd, Vm, shift);
} else {
Unknown(instr);
}
......
......@@ -1079,47 +1079,90 @@ void MacroAssembler::VmovExtended(int dst_code, Register src) {
}
}
void MacroAssembler::VmovExtended(int dst_code, int src_code,
Register scratch) {
void MacroAssembler::VmovExtended(int dst_code, int src_code) {
if (src_code == dst_code) return;
if (src_code < SwVfpRegister::kMaxNumRegisters &&
dst_code < SwVfpRegister::kMaxNumRegisters) {
// src and dst are both s-registers.
vmov(SwVfpRegister::from_code(dst_code),
SwVfpRegister::from_code(src_code));
} else if (src_code < SwVfpRegister::kMaxNumRegisters) {
// src is an s-register.
vmov(scratch, SwVfpRegister::from_code(src_code));
VmovExtended(dst_code, scratch);
return;
}
DwVfpRegister dst_d_reg = DwVfpRegister::from_code(dst_code / 2);
DwVfpRegister src_d_reg = DwVfpRegister::from_code(src_code / 2);
int dst_offset = dst_code & 1;
int src_offset = src_code & 1;
if (CpuFeatures::IsSupported(NEON)) {
// On Neon we can shift and insert from d-registers.
if (src_offset == dst_offset) {
// Offsets are the same, use vdup to copy the source to the opposite lane.
vdup(Neon32, kScratchDoubleReg, src_d_reg, src_offset);
src_d_reg = kScratchDoubleReg;
src_offset = dst_offset ^ 1;
}
if (dst_offset) {
if (dst_d_reg.is(src_d_reg)) {
vdup(Neon32, dst_d_reg, src_d_reg, 0);
} else {
vsli(Neon64, dst_d_reg, src_d_reg, 32);
}
} else {
if (dst_d_reg.is(src_d_reg)) {
vdup(Neon32, dst_d_reg, src_d_reg, 1);
} else {
vsri(Neon64, dst_d_reg, src_d_reg, 32);
}
}
return;
}
// Without Neon, use the scratch registers to move src and/or dst into
// s-registers.
int scratchSCode = kScratchDoubleReg.low().code();
int scratchSCode2 = kScratchDoubleReg2.low().code();
if (src_code < SwVfpRegister::kMaxNumRegisters) {
// src is an s-register, dst is not.
vmov(kScratchDoubleReg, dst_d_reg);
vmov(SwVfpRegister::from_code(scratchSCode + dst_offset),
SwVfpRegister::from_code(src_code));
vmov(dst_d_reg, kScratchDoubleReg);
} else if (dst_code < SwVfpRegister::kMaxNumRegisters) {
// dst is an s-register.
VmovExtended(scratch, src_code);
vmov(SwVfpRegister::from_code(dst_code), scratch);
// dst is an s-register, src is not.
vmov(kScratchDoubleReg, src_d_reg);
vmov(SwVfpRegister::from_code(dst_code),
SwVfpRegister::from_code(scratchSCode + src_offset));
} else {
// Neither src or dst are s-registers.
DCHECK_GT(SwVfpRegister::kMaxNumRegisters * 2, src_code);
DCHECK_GT(SwVfpRegister::kMaxNumRegisters * 2, dst_code);
VmovExtended(scratch, src_code);
VmovExtended(dst_code, scratch);
// Neither src nor dst is an s-register. Both scratch double registers are
// available when there are 32 VFP registers.
vmov(kScratchDoubleReg, src_d_reg);
vmov(kScratchDoubleReg2, dst_d_reg);
vmov(SwVfpRegister::from_code(scratchSCode + dst_offset),
SwVfpRegister::from_code(scratchSCode2 + src_offset));
vmov(dst_d_reg, kScratchQuadReg.high());
}
}
void MacroAssembler::VmovExtended(int dst_code, const MemOperand& src,
Register scratch) {
if (dst_code >= SwVfpRegister::kMaxNumRegisters) {
ldr(scratch, src);
VmovExtended(dst_code, scratch);
} else {
void MacroAssembler::VmovExtended(int dst_code, const MemOperand& src) {
if (dst_code < SwVfpRegister::kMaxNumRegisters) {
vldr(SwVfpRegister::from_code(dst_code), src);
} else {
// TODO(bbudge) If Neon supported, use load single lane form of vld1.
int dst_s_code = kScratchDoubleReg.low().code() + (dst_code & 1);
vmov(kScratchDoubleReg, DwVfpRegister::from_code(dst_code / 2));
vldr(SwVfpRegister::from_code(dst_s_code), src);
vmov(DwVfpRegister::from_code(dst_code / 2), kScratchDoubleReg);
}
}
void MacroAssembler::VmovExtended(const MemOperand& dst, int src_code,
Register scratch) {
if (src_code >= SwVfpRegister::kMaxNumRegisters) {
VmovExtended(scratch, src_code);
str(scratch, dst);
} else {
void MacroAssembler::VmovExtended(const MemOperand& dst, int src_code) {
if (src_code < SwVfpRegister::kMaxNumRegisters) {
vstr(SwVfpRegister::from_code(src_code), dst);
} else {
// TODO(bbudge) If Neon supported, use store single lane form of vst1.
int src_s_code = kScratchDoubleReg.low().code() + (src_code & 1);
vmov(kScratchDoubleReg, DwVfpRegister::from_code(src_code / 2));
vstr(SwVfpRegister::from_code(src_s_code), dst);
}
}
......@@ -1145,9 +1188,9 @@ void MacroAssembler::ExtractLane(Register dst, DwVfpRegister src,
}
void MacroAssembler::ExtractLane(SwVfpRegister dst, QwNeonRegister src,
Register scratch, int lane) {
int lane) {
int s_code = src.code() * 4 + lane;
VmovExtended(dst.code(), s_code, scratch);
VmovExtended(dst.code(), s_code);
}
void MacroAssembler::ReplaceLane(QwNeonRegister dst, QwNeonRegister src,
......@@ -1164,11 +1207,10 @@ void MacroAssembler::ReplaceLane(QwNeonRegister dst, QwNeonRegister src,
}
void MacroAssembler::ReplaceLane(QwNeonRegister dst, QwNeonRegister src,
SwVfpRegister src_lane, Register scratch,
int lane) {
SwVfpRegister src_lane, int lane) {
Move(dst, src);
int s_code = dst.code() * 4 + lane;
VmovExtended(s_code, src_lane.code(), scratch);
VmovExtended(s_code, src_lane.code());
}
void MacroAssembler::LslPair(Register dst_low, Register dst_high,
......
......@@ -559,18 +559,17 @@ class MacroAssembler: public Assembler {
void VmovExtended(Register dst, int src_code);
void VmovExtended(int dst_code, Register src);
// Move between s-registers and imaginary s-registers.
void VmovExtended(int dst_code, int src_code, Register scratch);
void VmovExtended(int dst_code, const MemOperand& src, Register scratch);
void VmovExtended(const MemOperand& dst, int src_code, Register scratch);
void VmovExtended(int dst_code, int src_code);
void VmovExtended(int dst_code, const MemOperand& src);
void VmovExtended(const MemOperand& dst, int src_code);
void ExtractLane(Register dst, QwNeonRegister src, NeonDataType dt, int lane);
void ExtractLane(Register dst, DwVfpRegister src, NeonDataType dt, int lane);
void ExtractLane(SwVfpRegister dst, QwNeonRegister src, Register scratch,
int lane);
void ExtractLane(SwVfpRegister dst, QwNeonRegister src, int lane);
void ReplaceLane(QwNeonRegister dst, QwNeonRegister src, Register src_lane,
NeonDataType dt, int lane);
void ReplaceLane(QwNeonRegister dst, QwNeonRegister src,
SwVfpRegister src_lane, Register scratch, int lane);
SwVfpRegister src_lane, int lane);
void LslPair(Register dst_low, Register dst_high, Register src_low,
Register src_high, Register scratch, Register shift);
......
......@@ -4222,6 +4222,34 @@ void ArithmeticShiftRight(Simulator* simulator, int Vd, int Vm, int shift) {
simulator->set_neon_register<T, SIZE>(Vd, src);
}
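// vsli: shift each source element left and insert it into the destination,
// keeping the destination bits below the inserted value.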
template <typename T, int SIZE>
void ShiftLeftAndInsert(Simulator* simulator, int Vd, int Vm, int shift) {
static const int kElems = SIZE / sizeof(T);
T src[kElems];
T dst[kElems];
simulator->get_neon_register<T, SIZE>(Vm, src);
simulator->get_neon_register<T, SIZE>(Vd, dst);
uint64_t mask = (1llu << shift) - 1llu;
for (int i = 0; i < kElems; i++) {
dst[i] = (src[i] << shift) | (dst[i] & mask);
}
simulator->set_neon_register<T, SIZE>(Vd, dst);
}
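// vsri: shift each source element right and insert it into the destination,
// keeping the destination bits above the inserted value.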
template <typename T, int SIZE>
void ShiftRightAndInsert(Simulator* simulator, int Vd, int Vm, int shift) {
static const int kElems = SIZE / sizeof(T);
T src[kElems];
T dst[kElems];
simulator->get_neon_register<T, SIZE>(Vm, src);
simulator->get_neon_register<T, SIZE>(Vd, dst);
uint64_t mask = ~((1llu << (kBitsPerByte * SIZE - shift)) - 1llu);
for (int i = 0; i < kElems; i++) {
dst[i] = (src[i] >> shift) | (dst[i] & mask);
}
simulator->set_neon_register<T, SIZE>(Vd, dst);
}
template <typename T, int SIZE>
void CompareEqual(Simulator* simulator, int Vd, int Vm, int Vn) {
static const int kElems = SIZE / sizeof(T);
......@@ -4995,14 +5023,40 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
set_neon_register(vd, mval);
}
} else if (instr->Bits(11, 7) == 0x18) {
// vdup.32 Qd, Sm.
int vd = instr->VFPDRegValue(kSimd128Precision);
// vdup.<size> Dd, Dm[index].
// vdup.<size> Qd, Dm[index].
int vm = instr->VFPMRegValue(kDoublePrecision);
int index = instr->Bit(19);
uint32_t s_data = get_s_register(vm * 2 + index);
uint32_t q_data[4];
for (int i = 0; i < 4; i++) q_data[i] = s_data;
set_neon_register(vd, q_data);
int imm4 = instr->Bits(19, 16);
int size = 0, index = 0, mask = 0;
if ((imm4 & 0x1) != 0) {
size = 8;
index = imm4 >> 1;
mask = 0xffu;
} else if ((imm4 & 0x2) != 0) {
size = 16;
index = imm4 >> 2;
mask = 0xffffu;
} else {
size = 32;
index = imm4 >> 3;
mask = 0xffffffffu;
}
uint64_t d_data;
get_d_register(vm, &d_data);
uint32_t scalar = (d_data >> (size * index)) & mask;
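// Replicate the extracted lane across a 32-bit word; 8- and 16-bit lanes are
// duplicated, a 32-bit lane is used as-is.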
uint32_t duped = scalar;
for (int i = 1; i < 32 / size; i++) {
scalar <<= size;
duped |= scalar;
}
uint32_t result[4] = {duped, duped, duped, duped};
if (instr->Bit(6) == 0) {
int vd = instr->VFPDRegValue(kDoublePrecision);
set_d_register(vd, result);
} else {
int vd = instr->VFPDRegValue(kSimd128Precision);
set_neon_register(vd, result);
}
} else if (instr->Bits(19, 16) == 0 && instr->Bits(11, 6) == 0x17) {
// vmvn Qd, Qm.
int vd = instr->VFPDRegValue(kSimd128Precision);
......@@ -5379,6 +5433,58 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
UNREACHABLE();
break;
}
} else if (instr->Bits(11, 8) == 0x5 && instr->Bit(6) == 0 &&
instr->Bit(4) == 1) {
// vsli.<size> Dd, Dm, shift
int imm7 = instr->Bits(21, 16);
if (instr->Bit(7) != 0) imm7 += 64;
int size = base::bits::RoundDownToPowerOfTwo32(imm7);
int shift = imm7 - size;
int Vd = instr->VFPDRegValue(kDoublePrecision);
int Vm = instr->VFPMRegValue(kDoublePrecision);
switch (size) {
case 8:
ShiftLeftAndInsert<uint8_t, kDoubleSize>(this, Vd, Vm, shift);
break;
case 16:
ShiftLeftAndInsert<uint16_t, kDoubleSize>(this, Vd, Vm, shift);
break;
case 32:
ShiftLeftAndInsert<uint32_t, kDoubleSize>(this, Vd, Vm, shift);
break;
case 64:
ShiftLeftAndInsert<uint64_t, kDoubleSize>(this, Vd, Vm, shift);
break;
default:
UNREACHABLE();
break;
}
} else if (instr->Bits(11, 8) == 0x4 && instr->Bit(6) == 0 &&
instr->Bit(4) == 1) {
// vsri.<size> Dd, Dm, shift
int imm7 = instr->Bits(21, 16);
if (instr->Bit(7) != 0) imm7 += 64;
int size = base::bits::RoundDownToPowerOfTwo32(imm7);
int shift = 2 * size - imm7;
int Vd = instr->VFPDRegValue(kDoublePrecision);
int Vm = instr->VFPMRegValue(kDoublePrecision);
switch (size) {
case 8:
ShiftRightAndInsert<uint8_t, kDoubleSize>(this, Vd, Vm, shift);
break;
case 16:
ShiftRightAndInsert<uint16_t, kDoubleSize>(this, Vd, Vm, shift);
break;
case 32:
ShiftRightAndInsert<uint32_t, kDoubleSize>(this, Vd, Vm, shift);
break;
case 64:
ShiftRightAndInsert<uint64_t, kDoubleSize>(this, Vd, Vm, shift);
break;
default:
UNREACHABLE();
break;
}
} else {
UNIMPLEMENTED();
}
......
......@@ -1612,17 +1612,19 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kArmF32x4Splat: {
__ vdup(i.OutputSimd128Register(), i.InputFloatRegister(0));
int src_code = i.InputFloatRegister(0).code();
__ vdup(Neon32, i.OutputSimd128Register(),
DwVfpRegister::from_code(src_code / 2), src_code & 0x1);
break;
}
case kArmF32x4ExtractLane: {
__ ExtractLane(i.OutputFloatRegister(), i.InputSimd128Register(0),
kScratchReg, i.InputInt8(1));
i.InputInt8(1));
break;
}
case kArmF32x4ReplaceLane: {
__ ReplaceLane(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.InputFloatRegister(2), kScratchReg, i.InputInt8(1));
i.InputFloatRegister(2), i.InputInt8(1));
break;
}
case kArmF32x4SConvertI32x4: {
......@@ -2219,7 +2221,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
src_code = src1_code;
lane &= 0x3;
}
__ VmovExtended(dst_code + i, src_code + lane, kScratchReg);
__ VmovExtended(dst_code + i, src_code + lane);
shuffle >>= 8;
}
break;
......@@ -3038,10 +3040,10 @@ void CodeGenerator::AssembleMove(InstructionOperand* source,
int src_code = LocationOperand::cast(source)->register_code();
if (destination->IsFloatRegister()) {
int dst_code = LocationOperand::cast(destination)->register_code();
__ VmovExtended(dst_code, src_code, kScratchReg);
__ VmovExtended(dst_code, src_code);
} else {
DCHECK(destination->IsFloatStackSlot());
__ VmovExtended(g.ToMemOperand(destination), src_code, kScratchReg);
__ VmovExtended(g.ToMemOperand(destination), src_code);
}
} else {
DCHECK_EQ(MachineRepresentation::kSimd128, rep);
......@@ -3068,7 +3070,7 @@ void CodeGenerator::AssembleMove(InstructionOperand* source,
// GapResolver may give us reg codes that don't map to actual
// s-registers. Generate code to work around those cases.
int dst_code = LocationOperand::cast(destination)->register_code();
__ VmovExtended(dst_code, src, kScratchReg);
__ VmovExtended(dst_code, src);
} else {
DCHECK_EQ(MachineRepresentation::kSimd128, rep);
QwNeonRegister dst = g.ToSimd128Register(destination);
......@@ -3152,14 +3154,14 @@ void CodeGenerator::AssembleSwap(InstructionOperand* source,
int src_code = LocationOperand::cast(source)->register_code();
if (destination->IsFPRegister()) {
int dst_code = LocationOperand::cast(destination)->register_code();
__ VmovExtended(temp.low().code(), src_code, kScratchReg);
__ VmovExtended(src_code, dst_code, kScratchReg);
__ VmovExtended(dst_code, temp.low().code(), kScratchReg);
__ VmovExtended(temp.low().code(), src_code);
__ VmovExtended(src_code, dst_code);
__ VmovExtended(dst_code, temp.low().code());
} else {
DCHECK(destination->IsFPStackSlot());
MemOperand dst = g.ToMemOperand(destination);
__ VmovExtended(temp.low().code(), src_code, kScratchReg);
__ VmovExtended(src_code, dst, kScratchReg);
__ VmovExtended(temp.low().code(), src_code);
__ VmovExtended(src_code, dst);
__ vstr(temp.low(), dst);
}
} else {
......
......@@ -1298,6 +1298,7 @@ TEST(15) {
uint32_t vneg_s8[4], vneg_s16[4], vneg_s32[4];
uint32_t veor[4], vand[4], vorr[4];
float vdupf[4], vaddf[4], vpaddf[2], vsubf[4], vmulf[4];
uint32_t vdupf_16[2], vdupf_8[4];
uint32_t vmin_s8[4], vmin_u16[4], vmin_s32[4];
uint32_t vmax_s8[4], vmax_u16[4], vmax_s32[4];
uint32_t vpadd_i8[2], vpadd_i16[2], vpadd_i32[2];
......@@ -1310,6 +1311,7 @@ TEST(15) {
uint32_t vmul8[4], vmul16[4], vmul32[4];
uint32_t vshl8[4], vshl16[4], vshl32[5];
uint32_t vshr_s8[4], vshr_u16[4], vshr_s32[5];
uint32_t vsli_64[2], vsri_64[2], vsli_32[2], vsri_32[2];
uint32_t vceq[4], vceqf[4], vcgef[4], vcgtf[4];
uint32_t vcge_s8[4], vcge_u16[4], vcge_s32[4];
uint32_t vcgt_s8[4], vcgt_u16[4], vcgt_s32[4];
......@@ -1440,7 +1442,7 @@ TEST(15) {
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vcvt_f32_u32))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vdup (integer).
// vdup (from register).
__ mov(r4, Operand(0xa));
__ vdup(Neon8, q0, r4);
__ vdup(Neon16, q1, r4);
......@@ -1452,11 +1454,16 @@ TEST(15) {
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vdup32))));
__ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
// vdup (float).
// vdup (from scalar).
__ vmov(s0, -1.0);
__ vdup(q0, s0);
__ vdup(Neon32, q1, d0, 0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vdupf))));
__ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
__ vdup(Neon16, d2, d0, 1);
__ vstr(d2, r0, offsetof(T, vdupf_16));
__ vdup(Neon8, q1, d0, 3);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vdupf_8))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vabs (float).
__ vmov(s0, -1.0);
......@@ -1525,24 +1532,24 @@ TEST(15) {
// vmin (float).
__ vmov(s4, 2.0);
__ vdup(q0, s4);
__ vdup(Neon32, q0, d2, 0);
__ vmov(s4, 1.0);
__ vdup(q1, s4);
__ vdup(Neon32, q1, d2, 0);
__ vmin(q1, q1, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vminf))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vmax (float).
__ vmov(s4, 2.0);
__ vdup(q0, s4);
__ vdup(Neon32, q0, d2, 0);
__ vmov(s4, 1.0);
__ vdup(q1, s4);
__ vdup(Neon32, q1, d2, 0);
__ vmax(q1, q1, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmaxf))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vadd (float).
__ vmov(s4, 1.0);
__ vdup(q0, s4);
__ vdup(q1, s4);
__ vdup(Neon32, q0, d2, 0);
__ vdup(Neon32, q1, d2, 0);
__ vadd(q1, q1, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vaddf))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
......@@ -1555,51 +1562,51 @@ TEST(15) {
__ vstr(d2, r0, offsetof(T, vpaddf));
// vsub (float).
__ vmov(s4, 2.0);
__ vdup(q0, s4);
__ vdup(Neon32, q0, d2, 0);
__ vmov(s4, 1.0);
__ vdup(q1, s4);
__ vdup(Neon32, q1, d2, 0);
__ vsub(q1, q1, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vsubf))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vmul (float).
__ vmov(s4, 2.0);
__ vdup(q0, s4);
__ vdup(q1, s4);
__ vdup(Neon32, q0, d2, 0);
__ vdup(Neon32, q1, d2, 0);
__ vmul(q1, q1, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmulf))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vrecpe.
__ vmov(s4, 2.0);
__ vdup(q0, s4);
__ vdup(Neon32, q0, d2, 0);
__ vrecpe(q1, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrecpe))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vrecps.
__ vmov(s4, 2.0);
__ vdup(q0, s4);
__ vdup(Neon32, q0, d2, 0);
__ vmov(s4, 1.5);
__ vdup(q1, s4);
__ vdup(Neon32, q1, d2, 0);
__ vrecps(q1, q0, q1);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrecps))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vrsqrte.
__ vmov(s4, 4.0);
__ vdup(q0, s4);
__ vdup(Neon32, q0, d2, 0);
__ vrsqrte(q1, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrsqrte))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vrsqrts.
__ vmov(s4, 2.0);
__ vdup(q0, s4);
__ vdup(Neon32, q0, d2, 0);
__ vmov(s4, 2.5);
__ vdup(q1, s4);
__ vdup(Neon32, q1, d2, 0);
__ vrsqrts(q1, q0, q1);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vrsqrts))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vceq (float).
__ vmov(s4, 1.0);
__ vdup(q0, s4);
__ vdup(q1, s4);
__ vdup(Neon32, q0, d2, 0);
__ vdup(Neon32, q1, d2, 0);
__ vceq(q1, q1, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vceqf))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
......@@ -1608,7 +1615,7 @@ TEST(15) {
__ vmov(s1, -1.0);
__ vmov(s2, -0.0);
__ vmov(s3, 0.0);
__ vdup(q1, s3);
__ vdup(Neon32, q1, d1, 1);
__ vcge(q2, q1, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vcgef))));
__ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
......@@ -1814,6 +1821,26 @@ TEST(15) {
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vshr_s32))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vsli, vsri.
__ mov(r4, Operand(0xffffffff));
__ mov(r5, Operand(0x1));
__ vmov(d0, r4, r5);
__ vmov(d1, r5, r5);
__ vsli(Neon64, d1, d0, 32);
__ vstr(d1, r0, offsetof(T, vsli_64));
__ vmov(d0, r5, r4);
__ vmov(d1, r5, r5);
__ vsri(Neon64, d1, d0, 32);
__ vstr(d1, r0, offsetof(T, vsri_64));
__ vmov(d0, r4, r5);
__ vmov(d1, r5, r5);
__ vsli(Neon32, d1, d0, 16);
__ vstr(d1, r0, offsetof(T, vsli_32));
__ vmov(d0, r5, r4);
__ vmov(d1, r5, r5);
__ vsri(Neon32, d1, d0, 16);
__ vstr(d1, r0, offsetof(T, vsri_32));
// vceq.
__ mov(r4, Operand(0x03));
__ vdup(Neon8, q0, r4);
......@@ -2107,7 +2134,9 @@ TEST(15) {
CHECK_EQ_SPLAT(vdup8, 0x0a0a0a0au);
CHECK_EQ_SPLAT(vdup16, 0x000a000au);
CHECK_EQ_SPLAT(vdup32, 0x0000000au);
CHECK_EQ_SPLAT(vdupf, -1.0);
CHECK_EQ_SPLAT(vdupf, -1.0); // bit pattern is 0xbf800000.
CHECK_EQ_32X2(vdupf_16, 0xbf80bf80u, 0xbf80bf80u);
CHECK_EQ_SPLAT(vdupf_8, 0xbfbfbfbfu);
// src: [-1, -1, 1, 1]
CHECK_EQ_32X4(vcvt_s32_f32, -1, -1, 1, 1);
......@@ -2189,6 +2218,10 @@ TEST(15) {
CHECK_EQ_SPLAT(vshr_s8, 0xc0c0c0c0u);
CHECK_EQ_SPLAT(vshr_u16, 0x00400040u);
CHECK_EQ_SPLAT(vshr_s32, 0xffffc040u);
CHECK_EQ_32X2(vsli_64, 0x01u, 0xffffffffu);
CHECK_EQ_32X2(vsri_64, 0xffffffffu, 0x01u);
CHECK_EQ_32X2(vsli_32, 0xffff0001u, 0x00010001u);
CHECK_EQ_32X2(vsri_32, 0x00000000u, 0x0000ffffu);
CHECK_EQ_SPLAT(vceq, 0x00ff00ffu);
// [0, 3, 0, 3, ...] >= [3, 3, 3, 3, ...]
CHECK_EQ_SPLAT(vcge_s8, 0x00ff00ffu);
......@@ -3840,11 +3873,8 @@ TEST(vswp) {
const uint32_t test_2 = 0x89abcdef;
__ mov(r4, Operand(test_1));
__ mov(r5, Operand(test_2));
// TODO(bbudge) replace with vdup when implemented.
__ vmov(d8, r4, r4);
__ vmov(d9, r4, r4); // q4 = [1.0, 1.0]
__ vmov(d10, r5, r5);
__ vmov(d11, r5, r5); // q5 = [-1.0, -1.0]
__ vdup(Neon32, q4, r4);
__ vdup(Neon32, q5, r5);
__ vswp(q4, q5);
__ add(r6, r0, Operand(static_cast<int32_t>(offsetof(T, vswp_q4))));
__ vst1(Neon8, NeonListOperand(q4), NeonMemOperand(r6));
......
......@@ -994,10 +994,14 @@ TEST(Neon) {
"eea24b30 vdup.16 q1, r4");
COMPARE(vdup(Neon32, q15, r1),
"eeae1b90 vdup.32 q15, r1");
COMPARE(vdup(q0, s3),
"f3bc0c41 vdup q0, d1[1]");
COMPARE(vdup(q15, s2),
"f3f4ec41 vdup q15, d1[0]");
COMPARE(vdup(Neon32, q0, d1, 1),
"f3bc0c41 vdup.32 q0, d1[1]");
COMPARE(vdup(Neon32, q15, d1, 0),
"f3f4ec41 vdup.32 q15, d1[0]");
COMPARE(vdup(Neon16, q7, d8, 3),
"f3beec48 vdup.16 q7, d8[3]");
COMPARE(vdup(Neon32, d0, d30, 0),
"f3b40c2e vdup.32 d0, d30[0]");
COMPARE(vcvt_f32_s32(q15, q1),
"f3fbe642 vcvt.f32.s32 q15, q1");
COMPARE(vcvt_f32_u32(q8, q9),
......@@ -1106,6 +1110,14 @@ TEST(Neon) {
"f3d6e050 vshr.u16 q15, q0, #10");
COMPARE(vshr(NeonS32, q15, q0, 17),
"f2efe050 vshr.s32 q15, q0, #17");
COMPARE(vsli(Neon64, d2, d0, 32),
"f3a02590 vsli.64 d2, d0, #32");
COMPARE(vsli(Neon32, d7, d8, 17),
"f3b17518 vsli.32 d7, d8, #17");
COMPARE(vsri(Neon64, d2, d0, 32),
"f3a02490 vsri.64 d2, d0, #32");
COMPARE(vsri(Neon16, d7, d8, 8),
"f3987418 vsri.16 d7, d8, #8");
COMPARE(vrecpe(q15, q0),
"f3fbe540 vrecpe.f32 q15, q0");
COMPARE(vrecps(q15, q0, q8),
......
......@@ -169,7 +169,7 @@ TEST(ExtractLane) {
__ ExtractLane(r5, q1, NeonS32, i);
__ str(r5, MemOperand(r0, offsetof(T, i32x4_low) + 4 * i));
SwVfpRegister si = SwVfpRegister::from_code(i);
__ ExtractLane(si, q1, r4, i);
__ ExtractLane(si, q1, i);
__ vstr(si, r0, offsetof(T, f32x4_low) + 4 * i);
}
......@@ -203,7 +203,7 @@ TEST(ExtractLane) {
__ ExtractLane(r5, q15, NeonS32, i);
__ str(r5, MemOperand(r0, offsetof(T, i32x4_high) + 4 * i));
SwVfpRegister si = SwVfpRegister::from_code(i);
__ ExtractLane(si, q15, r4, i);
__ ExtractLane(si, q15, i);
__ vstr(si, r0, offsetof(T, f32x4_high) + 4 * i);
}
......@@ -304,8 +304,6 @@ TEST(ReplaceLane) {
__ stm(db_w, sp, r4.bit() | r5.bit() | r6.bit() | r7.bit() | lr.bit());
const Register kScratch = r5;
__ veor(q0, q0, q0); // Zero
__ veor(q1, q1, q1); // Zero
for (int i = 0; i < 4; i++) {
......@@ -313,7 +311,7 @@ TEST(ReplaceLane) {
__ ReplaceLane(q0, q0, r4, NeonS32, i);
SwVfpRegister si = SwVfpRegister::from_code(i);
__ vmov(si, r4);
__ ReplaceLane(q1, q1, si, kScratch, i);
__ ReplaceLane(q1, q1, si, i);
}
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, i32x4_low))));
__ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
......@@ -344,7 +342,7 @@ TEST(ReplaceLane) {
__ ReplaceLane(q14, q14, r4, NeonS32, i);
SwVfpRegister si = SwVfpRegister::from_code(i);
__ vmov(si, r4);
__ ReplaceLane(q15, q15, si, kScratch, i);
__ ReplaceLane(q15, q15, si, i);
}
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, i32x4_high))));
__ vst1(Neon8, NeonListOperand(q14), NeonMemOperand(r4));
......