Commit 431223f3 authored by bbudge, committed by Commit bot

[ARM] Add NEON instructions for implementing SIMD.

- Adds vabs, vneg, vmul, vext, vzip, vrev instructions.
- Adds Swizzle function to macro assembler.
- Simplifies the if-else logic for Neon special-condition instructions in the disassembler and simulator.
- Some refactoring of Neon assembler, macro-assembler tests.

LOG=N
BUG=v8:4124

Review-Url: https://codereview.chromium.org/2579913002
Cr-Commit-Position: refs/heads/master@{#41781}
parent 01a21606
......@@ -4074,6 +4074,50 @@ void Assembler::vcvt_u32_f32(const QwNeonRegister dst,
emit(EncodeNeonVCVT(U32, dst, F32, src));
}
// Builds the instruction word shared by the quad-register NEON unary
// operations (used by vabs and vneg). |op| is placed starting at bit 7,
// |is_float| drives bit 10 and |size| is placed starting at bit 18. The
// floating point form is only accepted for 32 bit lanes.
static Instr EncodeNeonUnaryOp(int op, bool is_float, NeonSize size,
                               const QwNeonRegister dst,
                               const QwNeonRegister src) {
  DCHECK_IMPLIES(is_float, size == Neon32);

  // split_code yields the register field plus its separate extension bit.
  int dst_field, dst_ext;
  dst.split_code(&dst_field, &dst_ext);
  int src_field, src_ext;
  src.split_code(&src_field, &src_ext);

  const int float_bit = is_float ? 1 : 0;
  const int size_bits = size * B18;
  const int dst_bits = dst_ext * B22 | dst_field * B12;
  const int src_bits = src_ext * B5 | src_field;
  const int op_bits = float_bit * B10 | B8 | op * B7;

  // Fixed opcode bits first, then the variable fields.
  return 0x1E7U * B23 | 0x3 * B20 | B16 | B6 | size_bits | dst_bits |
         op_bits | src_bits;
}
void Assembler::vabs(const QwNeonRegister dst, const QwNeonRegister src) {
// Qd = vabs.f<size>(Qn, Qm) SIMD floating point absolute value.
// Instruction details available in ARM DDI 0406C.b, A8.8.824.
DCHECK(IsEnabled(NEON));
emit(EncodeNeonUnaryOp(0x6, true, Neon32, dst, src));
}
void Assembler::vabs(NeonSize size, const QwNeonRegister dst,
                     const QwNeonRegister src) {
  // Qd = vabs.s<size>(Qm) SIMD integer absolute value.
  // Instruction details available in ARM DDI 0406C.b, A8.8.824.
  DCHECK(IsEnabled(NEON));
  const int kVabsOp = 0x6;
  const bool kIsFloat = false;
  const Instr encoding = EncodeNeonUnaryOp(kVabsOp, kIsFloat, size, dst, src);
  emit(encoding);
}
void Assembler::vneg(const QwNeonRegister dst, const QwNeonRegister src) {
// Qd = vabs.f<size>(Qn, Qm) SIMD floating point negate.
// Instruction details available in ARM DDI 0406C.b, A8.8.968.
DCHECK(IsEnabled(NEON));
emit(EncodeNeonUnaryOp(0x7, true, Neon32, dst, src));
}
void Assembler::vneg(NeonSize size, const QwNeonRegister dst,
                     const QwNeonRegister src) {
  // Qd = vneg.s<size>(Qm) SIMD integer negate.
  // Instruction details available in ARM DDI 0406C.b, A8.8.968.
  DCHECK(IsEnabled(NEON));
  const int kVnegOp = 0x7;
  const bool kIsFloat = false;
  const Instr encoding = EncodeNeonUnaryOp(kVnegOp, kIsFloat, size, dst, src);
  emit(encoding);
}
void Assembler::veor(DwVfpRegister dst, DwVfpRegister src1,
DwVfpRegister src2) {
// Dd = veor(Dn, Dm) 64 bit integer exclusive OR.
......@@ -4166,6 +4210,37 @@ void Assembler::vsub(NeonSize size, QwNeonRegister dst,
n * B7 | B6 | m * B5 | vm);
}
void Assembler::vmul(QwNeonRegister dst, const QwNeonRegister src1,
                     const QwNeonRegister src2) {
  // Qd = vmul.f32(Qn, Qm) SIMD floating point multiply.
  // Instruction details available in ARM DDI 0406C.b, A8-958.
  DCHECK(IsEnabled(NEON));

  // split_code yields the register field plus its separate extension bit.
  int dst_field, dst_ext;
  dst.split_code(&dst_field, &dst_ext);
  int lhs_field, lhs_ext;
  src1.split_code(&lhs_field, &lhs_ext);
  int rhs_field, rhs_ext;
  src2.split_code(&rhs_field, &rhs_ext);

  const int dst_bits = dst_ext * B22 | dst_field * B12;
  const int lhs_bits = lhs_field * B16 | lhs_ext * B7;
  const int rhs_bits = rhs_ext * B5 | rhs_field;
  emit(0x1E6U * B23 | 0xD * B8 | B6 | B4 | dst_bits | lhs_bits | rhs_bits);
}
void Assembler::vmul(NeonSize size, QwNeonRegister dst,
                     const QwNeonRegister src1, const QwNeonRegister src2) {
  // Qd = vmul.i<size>(Qn, Qm) SIMD integer multiply.
  // Instruction details available in ARM DDI 0406C.b, A8-960.
  DCHECK(IsEnabled(NEON));

  // split_code yields the register field plus its separate extension bit.
  int dst_field, dst_ext;
  dst.split_code(&dst_field, &dst_ext);
  int lhs_field, lhs_ext;
  src1.split_code(&lhs_field, &lhs_ext);
  int rhs_field, rhs_ext;
  src2.split_code(&rhs_field, &rhs_ext);

  const int size_bits = static_cast<int>(size) * B20;
  const int dst_bits = dst_ext * B22 | dst_field * B12;
  const int lhs_bits = lhs_field * B16 | lhs_ext * B7;
  const int rhs_bits = rhs_ext * B5 | rhs_field;
  emit(0x1E4U * B23 | 0x9 * B8 | B6 | B4 | size_bits | dst_bits | lhs_bits |
       rhs_bits);
}
void Assembler::vtst(NeonSize size, QwNeonRegister dst,
const QwNeonRegister src1, const QwNeonRegister src2) {
DCHECK(IsEnabled(NEON));
......@@ -4185,7 +4260,7 @@ void Assembler::vtst(NeonSize size, QwNeonRegister dst,
void Assembler::vceq(NeonSize size, QwNeonRegister dst,
const QwNeonRegister src1, const QwNeonRegister src2) {
DCHECK(IsEnabled(NEON));
// Qd = vceq(Qn, Qm) SIMD integer compare equal.
// Qd = vceq(Qn, Qm) SIMD bitwise compare equal.
// Instruction details available in ARM DDI 0406C.b, A8-844.
int vd, d;
dst.split_code(&vd, &d);
......@@ -4214,6 +4289,70 @@ void Assembler::vbsl(QwNeonRegister dst, const QwNeonRegister src1,
n * B7 | B6 | m * B5 | B4 | vm);
}
void Assembler::vext(QwNeonRegister dst, const QwNeonRegister src1,
                     const QwNeonRegister src2, int bytes) {
  DCHECK(IsEnabled(NEON));
  // Qd = vext(Qn, Qm) SIMD byte extract.
  // Instruction details available in ARM DDI 0406C.b, A8-890.
  //
  // |bytes| is written into the 4 bit field starting at bit 8, so it has to
  // lie in [0, 15]. The comparison is done on the unsigned value so that a
  // negative count is rejected too instead of spilling into higher bits of
  // the instruction word.
  DCHECK_GT(16u, static_cast<uint32_t>(bytes));
  int vd, d;
  dst.split_code(&vd, &d);
  int vn, n;
  src1.split_code(&vn, &n);
  int vm, m;
  src2.split_code(&vm, &m);
  emit(0x1E5U * B23 | d * B22 | 0x3 * B20 | vn * B16 | vd * B12 | bytes * B8 |
       n * B7 | B6 | m * B5 | vm);
}
void Assembler::vzip(NeonSize size, QwNeonRegister dst,
                     const QwNeonRegister src) {
  // vzip.<size>(Qd, Qm) SIMD zip (interleave).
  // Instruction details available in ARM DDI 0406C.b, A8-1102.
  DCHECK(IsEnabled(NEON));

  // split_code yields the register field plus its separate extension bit.
  int dst_field, dst_ext;
  dst.split_code(&dst_field, &dst_ext);
  int src_field, src_ext;
  src.split_code(&src_field, &src_ext);

  const int size_bits = static_cast<int>(size) * B18;
  const int dst_bits = dst_ext * B22 | dst_field * B12;
  const int src_bits = src_ext * B5 | src_field;
  emit(0x1E7U * B23 | 0x3 * B20 | 2 * B16 | 0x3 * B7 | B6 | size_bits |
       dst_bits | src_bits);
}
// Builds the instruction word for Qd = vrev<op_size>.<size>(Qm), the SIMD
// reverse. |op_size| is the width of the region being reversed and must be
// larger than the element |size|.
// Instruction details available in ARM DDI 0406C.b, A8-1028.
static Instr EncodeNeonVREV(NeonSize op_size, NeonSize size,
                            const QwNeonRegister dst,
                            const QwNeonRegister src) {
  DCHECK_GT(op_size, static_cast<int>(size));

  // split_code yields the register field plus its separate extension bit.
  int dst_field, dst_ext;
  dst.split_code(&dst_field, &dst_ext);
  int src_field, src_ext;
  src.split_code(&src_field, &src_ext);

  // The op field counts down from the 64 bit variant: vrev64 -> 0,
  // vrev32 -> 1, vrev16 -> 2.
  const int op_bits =
      (static_cast<int>(Neon64) - static_cast<int>(op_size)) * B7;
  const int size_bits = static_cast<int>(size) * B18;
  const int dst_bits = dst_ext * B22 | dst_field * B12;
  const int src_bits = src_ext * B5 | src_field;
  return 0x1E7U * B23 | 0x3 * B20 | B6 | size_bits | dst_bits | op_bits |
         src_bits;
}
void Assembler::vrev16(NeonSize size, const QwNeonRegister dst,
                       const QwNeonRegister src) {
  // Qd = vrev16.<size>(Qm), encoded with a 16 bit operation size.
  DCHECK(IsEnabled(NEON));
  const Instr encoding = EncodeNeonVREV(Neon16, size, dst, src);
  emit(encoding);
}
void Assembler::vrev32(NeonSize size, const QwNeonRegister dst,
                       const QwNeonRegister src) {
  // Qd = vrev32.<size>(Qm), encoded with a 32 bit operation size.
  DCHECK(IsEnabled(NEON));
  const Instr encoding = EncodeNeonVREV(Neon32, size, dst, src);
  emit(encoding);
}
void Assembler::vrev64(NeonSize size, const QwNeonRegister dst,
                       const QwNeonRegister src) {
  // Qd = vrev64.<size>(Qm), encoded with a 64 bit operation size.
  DCHECK(IsEnabled(NEON));
  const Instr encoding = EncodeNeonVREV(Neon64, size, dst, src);
  emit(encoding);
}
// Encode NEON vtbl / vtbx instruction.
static Instr EncodeNeonVTB(const DwVfpRegister dst, const NeonListOperand& list,
const DwVfpRegister index, bool vtbx) {
......
......@@ -1362,6 +1362,10 @@ class Assembler : public AssemblerBase {
void vcvt_s32_f32(const QwNeonRegister dst, const QwNeonRegister src);
void vcvt_u32_f32(const QwNeonRegister dst, const QwNeonRegister src);
// SIMD absolute value on a quad register. The two-argument form is the
// floating point (f32) variant; the form taking a NeonSize is the integer
// variant for the given lane size.
void vabs(const QwNeonRegister dst, const QwNeonRegister src);
void vabs(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src);
// SIMD negate on a quad register. The two-argument form is the floating
// point (f32) variant; the form taking a NeonSize is the integer variant for
// the given lane size.
void vneg(const QwNeonRegister dst, const QwNeonRegister src);
void vneg(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src);
void veor(DwVfpRegister dst, DwVfpRegister src1, DwVfpRegister src2);
void veor(QwNeonRegister dst, QwNeonRegister src1, QwNeonRegister src2);
void vadd(const QwNeonRegister dst, const QwNeonRegister src1,
......@@ -1372,12 +1376,25 @@ class Assembler : public AssemblerBase {
const QwNeonRegister src2);
void vsub(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
// SIMD multiply on quad registers. The three-argument form is the floating
// point variant; the form taking a NeonSize is the integer variant for the
// given lane size.
void vmul(const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
void vmul(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
void vtst(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
void vceq(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
void vbsl(const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
// SIMD byte extract from the pair src1, src2. |bytes| is the byte offset
// encoded as the instruction's immediate and must be less than 16.
void vext(const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2, int bytes);
// SIMD zip (interleave) of the two quad registers for the given lane size.
void vzip(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src);
// SIMD reverse within 16, 32 or 64 bit regions respectively. |size| is the
// element size and must be smaller than the region size.
void vrev16(NeonSize size, const QwNeonRegister dst,
const QwNeonRegister src);
void vrev32(NeonSize size, const QwNeonRegister dst,
const QwNeonRegister src);
void vrev64(NeonSize size, const QwNeonRegister dst,
const QwNeonRegister src);
void vtbl(const DwVfpRegister dst, const NeonListOperand& list,
const DwVfpRegister index);
void vtbx(const DwVfpRegister dst, const NeonListOperand& list,
......
......@@ -1883,6 +1883,15 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
// vadd/vsub.f32 Qd, Qm, Qn.
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"%s.f32 q%d, q%d, q%d", op, Vd, Vn, Vm);
} else if (instr->Bits(11, 8) == 0x9 && instr->Bit(6) == 1 &&
instr->Bit(4) == 1) {
int size = kBitsPerByte * (1 << instr->Bits(21, 20));
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision);
// vmul.i<size> Qd, Qm, Qn.
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vmul.i%d q%d, q%d, q%d", size, Vd, Vn, Vm);
} else {
Unknown(instr);
}
......@@ -1897,6 +1906,15 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
int imm3 = instr->Bits(21, 19);
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vmovl.s%d q%d, d%d", imm3*8, Vd, Vm);
} else if (instr->Bits(21, 20) == 3 && instr->Bit(4) == 0) {
// vext.8 Qd, Qm, Qn, imm4
int imm4 = instr->Bits(11, 8);
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision);
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vext.8 q%d, q%d, q%d, #%d",
Vd, Vn, Vm, imm4);
} else {
Unknown(instr);
}
......@@ -1941,6 +1959,14 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"veor q%d, q%d, q%d", Vd, Vn, Vm);
}
} else if (instr->Bit(21) == 0 && instr->Bits(11, 8) == 0xd &&
instr->Bit(6) == 1 && instr->Bit(4) == 1) {
// vmul.f32 Qd, Qn, Qm
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vmul.f32 q%d, q%d, q%d", Vd, Vn, Vm);
} else {
Unknown(instr);
}
......@@ -1955,68 +1981,102 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
int imm3 = instr->Bits(21, 19);
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vmovl.u%d q%d, d%d", imm3*8, Vd, Vm);
} else if (instr->Opc1Value() == 7 && instr->Bits(19, 16) == 0 &&
instr->Bits(11, 6) == 0x17 && instr->Bit(4) == 0) {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vmvn q%d, q%d", Vd, Vm);
} else if (instr->Opc1Value() == 7 && instr->Bits(19, 16) == 0xB &&
instr->Bits(11, 9) == 0x3 && instr->Bit(6) == 1 &&
} else if (instr->Opc1Value() == 7 && instr->Bits(21, 20) == 0x3 &&
instr->Bit(4) == 0) {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
const char* suffix = nullptr;
int op = instr->Bits(8, 7);
switch (op) {
case 0:
suffix = "f32.s32";
break;
case 1:
suffix = "f32.u32";
break;
case 2:
suffix = "s32.f32";
break;
case 3:
suffix = "u32.f32";
break;
}
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vcvt.%s q%d, q%d", suffix, Vd, Vm);
} else if ((instr->Bits(21, 16) == 0x32) && (instr->Bits(11, 7) == 0) &&
(instr->Bit(4) == 0)) {
if (instr->Bit(6) == 0) {
int Vd = instr->VFPDRegValue(kDoublePrecision);
if (instr->Bits(17, 16) == 0x2 && instr->Bits(11, 7) == 0) {
if (instr->Bit(6) == 0) {
int Vd = instr->VFPDRegValue(kDoublePrecision);
int Vm = instr->VFPMRegValue(kDoublePrecision);
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vswp d%d, d%d", Vd, Vm);
} else {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vswp q%d, q%d", Vd, Vm);
}
} else if (instr->Bits(11, 7) == 0x18) {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kDoublePrecision);
int index = instr->Bit(19);
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vdup q%d, d%d[%d]", Vd, Vm, index);
} else if (instr->Bits(19, 16) == 0 && instr->Bits(11, 6) == 0x17) {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vswp d%d, d%d", Vd, Vm);
} else {
SNPrintF(out_buffer_ + out_buffer_pos_, "vmvn q%d, q%d", Vd, Vm);
} else if (instr->Bits(19, 16) == 0xB && instr->Bits(11, 9) == 0x3 &&
instr->Bit(6) == 1) {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
const char* suffix = nullptr;
int op = instr->Bits(8, 7);
switch (op) {
case 0:
suffix = "f32.s32";
break;
case 1:
suffix = "f32.u32";
break;
case 2:
suffix = "s32.f32";
break;
case 3:
suffix = "u32.f32";
break;
}
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vcvt.%s q%d, q%d", suffix, Vd, Vm);
} else if (instr->Bits(11, 10) == 0x2) {
int Vd = instr->VFPDRegValue(kDoublePrecision);
int Vn = instr->VFPNRegValue(kDoublePrecision);
int Vm = instr->VFPMRegValue(kDoublePrecision);
int len = instr->Bits(9, 8);
NeonListOperand list(DwVfpRegister::from_code(Vn), len + 1);
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vswp q%d, q%d", Vd, Vm);
SNPrintF(out_buffer_ + out_buffer_pos_, "%s d%d, ",
instr->Bit(6) == 0 ? "vtbl.8" : "vtbx.8", Vd);
FormatNeonList(Vn, list.type());
Print(", ");
PrintDRegister(Vm);
} else if (instr->Bits(17, 16) == 0x2 && instr->Bits(11, 6) == 0x7) {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int size = kBitsPerByte * (1 << instr->Bits(19, 18));
// vzip.<size> Qd, Qm.
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vzip.%d q%d, q%d", size, Vd, Vm);
} else if (instr->Bits(17, 16) == 0 && instr->Bits(11, 9) == 0) {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int size = kBitsPerByte * (1 << instr->Bits(19, 18));
int op = kBitsPerByte
<< (static_cast<int>(Neon64) - instr->Bits(8, 7));
// vrev<op>.<size> Qd, Qm.
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vrev%d.%d q%d, q%d", op, size, Vd, Vm);
} else if (instr->Bits(17, 16) == 0x1 && instr->Bit(11) == 0) {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int size = kBitsPerByte * (1 << instr->Bits(19, 18));
const char* type = instr->Bit(10) != 0 ? "f" : "s";
if (instr->Bits(9, 6) == 0xd) {
// vabs<type>.<size> Qd, Qm.
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vabs.%s%d q%d, q%d",
type, size, Vd, Vm);
} else if (instr->Bits(9, 6) == 0xf) {
// vneg<type>.<size> Qd, Qm.
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vneg.%s%d q%d, q%d",
type, size, Vd, Vm);
} else {
Unknown(instr);
}
} else {
Unknown(instr);
}
} else if (instr->Opc1Value() == 0x7 && instr->Bits(11, 7) == 0x18 &&
instr->Bit(4) == 0x0) {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kDoublePrecision);
int index = instr->Bit(19);
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vdup q%d, d%d[%d]", Vd, Vm, index);
} else if (instr->Opc1Value() == 0x7 && instr->Bits(11, 10) == 0x2 &&
instr->Bit(4) == 0x0) {
int Vd = instr->VFPDRegValue(kDoublePrecision);
int Vn = instr->VFPNRegValue(kDoublePrecision);
int Vm = instr->VFPMRegValue(kDoublePrecision);
int len = instr->Bits(9, 8);
NeonListOperand list(DwVfpRegister::from_code(Vn), len + 1);
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "%s d%d, ",
instr->Bit(6) == 0 ? "vtbl.8" : "vtbx.8", Vd);
FormatNeonList(Vn, list.type());
Print(", ");
PrintDRegister(Vm);
} else {
Unknown(instr);
}
......
......@@ -1185,6 +1185,64 @@ void MacroAssembler::ReplaceLane(QwNeonRegister dst, QwNeonRegister src,
VmovExtended(s_code, src_lane.code(), scratch);
}
// Emits code that permutes the 32 bit lanes of |src| into |dst|.
//
// |lanes| holds one hex digit per destination lane, lowest digit first; each
// digit is the index of the source lane to copy, so 0x3210 is the identity.
// Only Neon32 is supported for now. |scratch| is handed to VmovExtended for
// the lane moves that need it. A few common patterns map onto single NEON
// instructions; everything else is done lane by lane.
void MacroAssembler::Swizzle(QwNeonRegister dst, QwNeonRegister src,
                             Register scratch, NeonSize size, uint32_t lanes) {
  // TODO(bbudge) Handle Int16x8, Int8x16 vectors.
  DCHECK_EQ(Neon32, size);
  // Each of the four selector digits must name a source lane in 0..3 and no
  // higher bits may be set; otherwise the generic path below would address
  // S registers outside |src|.
  DCHECK_IMPLIES(size == Neon32, (lanes & ~0x3333u) == 0);
  if (size == Neon32) {
    switch (lanes) {
      // TODO(bbudge) Handle more special cases.
      case 0x3210:  // Identity.
        Move(dst, src);
        return;
      case 0x1032:  // Swap top and bottom.
        vext(dst, src, src, 8);
        return;
      case 0x2103:  // Rotation.
        vext(dst, src, src, 12);
        return;
      case 0x0321:  // Rotation.
        vext(dst, src, src, 4);
        return;
      case 0x0000:  // Equivalent to vdup.
      case 0x1111:
      case 0x2222:
      case 0x3333: {
        // All digits are equal, so the lowest one identifies the lane.
        int lane_code = src.code() * 4 + (lanes & 0xF);
        if (lane_code >= SwVfpRegister::kMaxNumRegisters) {
          // The lane has no S register name; copy it into the scratch
          // double register's low half first.
          // TODO(bbudge) use vdup (vdup.32 dst, D<src>[lane]) once implemented.
          int temp_code = kScratchDoubleReg.code() * 2;
          VmovExtended(temp_code, lane_code, scratch);
          lane_code = temp_code;
        }
        vdup(dst, SwVfpRegister::from_code(lane_code));
        return;
      }
      case 0x2301:  // Swap lanes 0, 1 and lanes 2, 3.
        vrev64(Neon32, dst, src);
        return;
      default: {  // Handle all other cases with vmovs.
        int src_code = src.code() * 4;
        int dst_code = dst.code() * 4;
        bool in_place = src.is(dst);
        if (in_place) {
          // Writing lanes one at a time would overwrite lanes still to be
          // read, so read from a copy of the source instead.
          vmov(kScratchQuadReg, src);
          src_code = kScratchQuadReg.code() * 4;
        }
        for (int i = 0; i < 4; i++) {
          int lane = (lanes >> (i * 4)) & 0xF;
          VmovExtended(dst_code + i, src_code + lane, scratch);
        }
        if (in_place) {
          // Restore zero reg (presumably clobbered by the copy into
          // kScratchQuadReg above).
          veor(kDoubleRegZero, kDoubleRegZero, kDoubleRegZero);
        }
        return;
      }
    }
  }
}
void MacroAssembler::LslPair(Register dst_low, Register dst_high,
Register src_low, Register src_high,
Register scratch, Register shift) {
......
......@@ -568,6 +568,8 @@ class MacroAssembler: public Assembler {
NeonDataType dt, int lane);
void ReplaceLane(QwNeonRegister dst, QwNeonRegister src,
SwVfpRegister src_lane, Register scratch, int lane);
// Permutes the lanes of |src| into |dst|. |lanes| holds one hex digit per
// destination lane (lowest digit first) naming the source lane to copy;
// 0x3210 is the identity. Only Neon32 is currently supported.
void Swizzle(QwNeonRegister dst, QwNeonRegister src, Register scratch,
NeonSize size, uint32_t lanes);
void LslPair(Register dst_low, Register dst_high, Register src_low,
Register src_high, Register scratch, Register shift);
......
This diff is collapsed.
This diff is collapsed.
......@@ -1001,6 +1001,22 @@ TEST(Neon) {
"f3fbe742 vcvt.s32.f32 q15, q1");
COMPARE(vcvt_u32_f32(q8, q9),
"f3fb07e2 vcvt.u32.f32 q8, q9");
COMPARE(vabs(q0, q1),
"f3b90742 vabs.f32 q0, q1");
COMPARE(vabs(Neon8, q6, q7),
"f3b1c34e vabs.s8 q6, q7");
COMPARE(vabs(Neon16, q0, q1),
"f3b50342 vabs.s16 q0, q1");
COMPARE(vabs(Neon32, q0, q1),
"f3b90342 vabs.s32 q0, q1");
COMPARE(vneg(q0, q1),
"f3b907c2 vneg.f32 q0, q1");
COMPARE(vneg(Neon8, q6, q7),
"f3b1c3ce vneg.s8 q6, q7");
COMPARE(vneg(Neon16, q0, q1),
"f3b503c2 vneg.s16 q0, q1");
COMPARE(vneg(Neon32, q0, q1),
"f3b903c2 vneg.s32 q0, q1");
COMPARE(veor(d0, d1, d2),
"f3010112 veor d0, d1, d2");
COMPARE(veor(d0, d30, d31),
......@@ -1025,6 +1041,14 @@ TEST(Neon) {
"f3142860 vsub.i16 q1, q2, q8");
COMPARE(vsub(Neon32, q15, q0, q8),
"f360e860 vsub.i32 q15, q0, q8");
COMPARE(vmul(q0, q1, q2),
"f3020d54 vmul.f32 q0, q1, q2");
COMPARE(vmul(Neon8, q0, q1, q2),
"f2020954 vmul.i8 q0, q1, q2");
COMPARE(vmul(Neon16, q1, q2, q8),
"f2142970 vmul.i16 q1, q2, q8");
COMPARE(vmul(Neon32, q15, q0, q8),
"f260e970 vmul.i32 q15, q0, q8");
COMPARE(vtst(Neon8, q0, q1, q2),
"f2020854 vtst.i8 q0, q1, q2");
COMPARE(vtst(Neon16, q1, q2, q8),
......@@ -1041,6 +1065,12 @@ TEST(Neon) {
"f3120154 vbsl q0, q1, q2");
COMPARE(vbsl(q15, q0, q8),
"f350e170 vbsl q15, q0, q8");
COMPARE(vext(q15, q0, q8, 3),
"f2f0e360 vext.8 q15, q0, q8, #3");
COMPARE(vzip(Neon16, q15, q0),
"f3f6e1c0 vzip.16 q15, q0");
COMPARE(vrev64(Neon8, q15, q0),
"f3f0e040 vrev64.8 q15, q0");
COMPARE(vtbl(d0, NeonListOperand(d1, 1), d2),
"f3b10802 vtbl.8 d0, {d1}, d2");
COMPARE(vtbl(d31, NeonListOperand(d0, 2), d4),
......
......@@ -379,4 +379,115 @@ TEST(ReplaceLane) {
}
}
// Checks that the four 32 bit lanes stored in t.<field> equal v0..v3. Expects
// a variable named |t| in the invoking scope. Wrapped in do/while so that the
// macro expands to a single statement and stays safe inside an unbraced
// if/else; the caller supplies the trailing semicolon.
#define CHECK_EQ_32X4(field, v0, v1, v2, v3) \
  do {                                       \
    CHECK_EQ(v0, t.field[0]);                \
    CHECK_EQ(v1, t.field[1]);                \
    CHECK_EQ(v2, t.field[2]);                \
    CHECK_EQ(v3, t.field[3]);                \
  } while (false)
// Generates a stub that applies MacroAssembler::Swizzle to the vector
// [0, 1, 2, 3] for a set of lane patterns, stores each result into a field of
// the struct passed in r0, and then checks the stored lanes.
TEST(Swizzle) {
  if (!CpuFeatures::IsSupported(NEON)) return;

  // Allocate an executable page of memory.
  size_t actual_size;
  byte* buffer = static_cast<byte*>(v8::base::OS::Allocate(
      Assembler::kMinimalBufferSize, &actual_size, true));
  CHECK(buffer);
  Isolate* isolate = CcTest::i_isolate();
  HandleScope handles(isolate);
  MacroAssembler assembler(isolate, buffer, static_cast<int>(actual_size),
                           v8::internal::CodeObjectRequired::kYes);
  MacroAssembler* masm = &assembler;  // Create a pointer for the __ macro.

  // One result slot per swizzle pattern; filled in by the generated code.
  struct T {
    int32_t _32x4_3210[4];           // identity
    int32_t _32x4_1032[4];           // high / low swap
    int32_t _32x4_0000[4];           // vdup's
    int32_t _32x4_1111[4];
    int32_t _32x4_2222[4];
    int32_t _32x4_3333[4];
    int32_t _32x4_2103[4];           // rotate left
    int32_t _32x4_0321[4];           // rotate right
    int32_t _32x4_1132[4];           // irregular
    int32_t _32x4_1132_in_place[4];  // irregular, in-place
  };
  T t;

  // Swizzles from q1 into q0, in the order the code is emitted.
  struct SwizzleCase {
    uint32_t lanes;
    size_t offset;
  };
  const SwizzleCase kOutOfPlaceCases[] = {
      {0x3210, offsetof(T, _32x4_3210)}, {0x1032, offsetof(T, _32x4_1032)},
      {0x0000, offsetof(T, _32x4_0000)}, {0x1111, offsetof(T, _32x4_1111)},
      {0x2222, offsetof(T, _32x4_2222)}, {0x3333, offsetof(T, _32x4_3333)},
      {0x2103, offsetof(T, _32x4_2103)}, {0x0321, offsetof(T, _32x4_0321)},
      {0x1132, offsetof(T, _32x4_1132)},
  };

  __ stm(db_w, sp, r4.bit() | r5.bit() | r6.bit() | r7.bit() | lr.bit());

  const Register kScratch = r5;

  // Stores q0 at |offset| bytes past the struct pointer held in r0.
  auto store_q0 = [&](size_t offset) {
    __ add(r4, r0, Operand(static_cast<int32_t>(offset)));
    __ vst1(Neon8, NeonListOperand(q0), NeonMemOperand(r4));
  };

  // Make test vector [0, 1, 2, 3]
  __ veor(q1, q1, q1);  // Zero
  for (int lane = 0; lane < 4; lane++) {
    __ mov(r4, Operand(lane));
    __ ReplaceLane(q1, q1, r4, NeonS32, lane);
  }

  for (const SwizzleCase& swizzle_case : kOutOfPlaceCases) {
    __ Swizzle(q0, q1, kScratch, Neon32, swizzle_case.lanes);
    store_q0(swizzle_case.offset);
  }

  // Same irregular pattern again, with source and destination aliased.
  __ vmov(q0, q1);
  __ Swizzle(q0, q0, kScratch, Neon32, 0x1132);
  store_q0(offsetof(T, _32x4_1132_in_place));

  __ ldm(ia_w, sp, r4.bit() | r5.bit() | r6.bit() | r7.bit() | pc.bit());

  CodeDesc desc;
  masm->GetCode(&desc);
  Handle<Code> code = isolate->factory()->NewCode(
      desc, Code::ComputeFlags(Code::STUB), Handle<Code>());
#ifdef DEBUG
  OFStream os(stdout);
  code->Print(os);
#endif
  F3 f = FUNCTION_CAST<F3>(code->entry());
  Object* dummy = CALL_GENERATED_CODE(isolate, f, &t, 0, 0, 0, 0);
  USE(dummy);

  CHECK_EQ_32X4(_32x4_3210, 0, 1, 2, 3);
  CHECK_EQ_32X4(_32x4_1032, 2, 3, 0, 1);
  CHECK_EQ_32X4(_32x4_0000, 0, 0, 0, 0);
  CHECK_EQ_32X4(_32x4_1111, 1, 1, 1, 1);
  CHECK_EQ_32X4(_32x4_2222, 2, 2, 2, 2);
  CHECK_EQ_32X4(_32x4_3333, 3, 3, 3, 3);
  CHECK_EQ_32X4(_32x4_2103, 3, 0, 1, 2);
  CHECK_EQ_32X4(_32x4_0321, 1, 2, 3, 0);
  CHECK_EQ_32X4(_32x4_1132, 2, 3, 1, 1);
  CHECK_EQ_32X4(_32x4_1132_in_place, 2, 3, 1, 1);
}
#undef __
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment