Commit a7e67924, authored by bbudge, committed by Commit bot

[ARM] Add vmin, vmax NEON instructions.

- Adds vmin, vmax for FP and integer vectors, both signed and unsigned.
- Regularizes switching logic in disasm and simulator for special codes
4 and 6.
- Factors vrecpe, vrsqrte, vrecps, vrsqrts into helper fns.

LOG=N
BUG=v8:4124

Review-Url: https://codereview.chromium.org/2623993006
Cr-Commit-Position: refs/heads/master@{#42385}
parent 39e455db
......@@ -4302,58 +4302,118 @@ void Assembler::vmul(NeonSize size, QwNeonRegister dst,
n * B7 | B6 | m * B5 | B4 | vm);
}
void Assembler::vrecpe(const QwNeonRegister dst, const QwNeonRegister src) {
DCHECK(IsEnabled(NEON));
// Qd = vadd(Qn, Qm) SIMD reciprocal estimate.
// Instruction details available in ARM DDI 0406C.b, A8-1024.
static Instr EncodeNeonMinMax(bool is_min, QwNeonRegister dst,
QwNeonRegister src1, QwNeonRegister src2) {
int vd, d;
dst.split_code(&vd, &d);
int vn, n;
src1.split_code(&vn, &n);
int vm, m;
src.split_code(&vm, &m);
emit(0x1E7U * B23 | d * B22 | 0x3B * B16 | vd * B12 | 0x5 * B8 | B6 | m * B5 |
vm);
src2.split_code(&vm, &m);
int min = is_min ? 1 : 0;
return 0x1E4U * B23 | d * B22 | min * B21 | vn * B16 | vd * B12 | 0xF * B8 |
n * B7 | B6 | m * B5 | vm;
}
void Assembler::vrsqrte(const QwNeonRegister dst, const QwNeonRegister src) {
// Builds the instruction word for the integer NEON vmin/vmax on Q registers
// (Qd = vmin/vmax(Qn, Qm)).
//   is_min:     true selects vmin, false selects vmax (encoded at B4).
//   dt:         lane data type; the bits selected by NeonDataTypeSizeMask are
//               halved and placed at B20, and the bits selected by
//               NeonDataTypeUMask are OR-ed in unshifted.
//   dst:        destination register Qd.
//   src1, src2: source registers Qn and Qm.
// Returns the encoded instruction; the caller emits it.
// Instruction details available in ARM DDI 0406C.b, A8-926.
static Instr EncodeNeonMinMax(bool is_min, NeonDataType dt, QwNeonRegister dst,
                              QwNeonRegister src1, QwNeonRegister src2) {
  int vd, d;
  dst.split_code(&vd, &d);
  int vn, n;
  src1.split_code(&vn, &n);
  int vm, m;
  src2.split_code(&vm, &m);
  int min = is_min ? 1 : 0;
  int size = (dt & NeonDataTypeSizeMask) / 2;
  int U = dt & NeonDataTypeUMask;
  // n * B7 carries the extra register bit of Qn, mirroring d * B22 for Qd and
  // m * B5 for Qm. Leaving it out would encode the wrong first source
  // register whenever split_code yields a non-zero n.
  return 0x1E4U * B23 | U | d * B22 | size * B20 | vn * B16 | vd * B12 |
         0x6 * B8 | n * B7 | B6 | m * B5 | min * B4 | vm;
}
void Assembler::vmin(const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2) {
DCHECK(IsEnabled(NEON));
// Qd = vadd(Qn, Qm) SIMD reciprocal square root estimate.
// Instruction details available in ARM DDI 0406C.b, A8-1038.
// Qd = vmin(Qn, Qm) SIMD floating point MIN.
// Instruction details available in ARM DDI 0406C.b, A8-928.
emit(EncodeNeonMinMax(true, dst, src1, src2));
}
void Assembler::vmin(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src1,
                     QwNeonRegister src2) {
  // SIMD integer MIN: Qd = vmin(Qn, Qm), with lanes described by dt.
  // Encoding reference: ARM DDI 0406C.b, A8-926.
  DCHECK(IsEnabled(NEON));
  const Instr encoding = EncodeNeonMinMax(true, dt, dst, src1, src2);
  emit(encoding);
}
void Assembler::vmax(QwNeonRegister dst, QwNeonRegister src1,
                     QwNeonRegister src2) {
  // SIMD floating point MAX: Qd = vmax(Qn, Qm).
  // Encoding reference: ARM DDI 0406C.b, A8-928.
  DCHECK(IsEnabled(NEON));
  const Instr encoding = EncodeNeonMinMax(false, dst, src1, src2);
  emit(encoding);
}
void Assembler::vmax(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src1,
                     QwNeonRegister src2) {
  // SIMD integer MAX: Qd = vmax(Qn, Qm), with lanes described by dt.
  // Encoding reference: ARM DDI 0406C.b, A8-926.
  DCHECK(IsEnabled(NEON));
  const Instr encoding = EncodeNeonMinMax(false, dt, dst, src1, src2);
  emit(encoding);
}
static Instr EncodeNeonEstimateOp(bool is_rsqrt, QwNeonRegister dst,
QwNeonRegister src) {
int vd, d;
dst.split_code(&vd, &d);
int vm, m;
src.split_code(&vm, &m);
emit(0x1E7U * B23 | d * B22 | 0x3B * B16 | vd * B12 | 0x5 * B8 | 0x3 * B6 |
m * B5 | vm);
int rsqrt = is_rsqrt ? 1 : 0;
return 0x1E7U * B23 | d * B22 | 0x3B * B16 | vd * B12 | 0x5 * B8 |
rsqrt * B7 | B6 | m * B5 | vm;
}
void Assembler::vrecps(const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2) {
void Assembler::vrecpe(const QwNeonRegister dst, const QwNeonRegister src) {
DCHECK(IsEnabled(NEON));
// Qd = vadd(Qn, Qm) SIMD reciprocal refinement step.
// Instruction details available in ARM DDI 0406C.b, A8-1026.
// Qd = vrecpe(Qm) SIMD reciprocal estimate.
// Instruction details available in ARM DDI 0406C.b, A8-1024.
emit(EncodeNeonEstimateOp(false, dst, src));
}
void Assembler::vrsqrte(const QwNeonRegister dst, const QwNeonRegister src) {
DCHECK(IsEnabled(NEON));
// Qd = vrsqrte(Qm) SIMD reciprocal square root estimate.
// Instruction details available in ARM DDI 0406C.b, A8-1038.
emit(EncodeNeonEstimateOp(true, dst, src));
}
static Instr EncodeNeonRefinementOp(bool is_rsqrt, QwNeonRegister dst,
QwNeonRegister src1, QwNeonRegister src2) {
int vd, d;
dst.split_code(&vd, &d);
int vn, n;
src1.split_code(&vn, &n);
int vm, m;
src2.split_code(&vm, &m);
emit(0x1E4U * B23 | d * B22 | vn * B16 | vd * B12 | 0xF * B8 | n * B7 | B6 |
m * B5 | B4 | vm);
int rsqrt = is_rsqrt ? 1 : 0;
return 0x1E4U * B23 | d * B22 | rsqrt * B21 | vn * B16 | vd * B12 | 0xF * B8 |
n * B7 | B6 | m * B5 | B4 | vm;
}
void Assembler::vrecps(const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2) {
DCHECK(IsEnabled(NEON));
// Qd = vrecps(Qn, Qm) SIMD reciprocal refinement step.
// Instruction details available in ARM DDI 0406C.b, A8-1026.
emit(EncodeNeonRefinementOp(false, dst, src1, src2));
}
void Assembler::vrsqrts(const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2) {
DCHECK(IsEnabled(NEON));
// Qd = vadd(Qn, Qm) SIMD reciprocal square root refinement step.
// Qd = vrsqrts(Qn, Qm) SIMD reciprocal square root refinement step.
// Instruction details available in ARM DDI 0406C.b, A8-1040.
int vd, d;
dst.split_code(&vd, &d);
int vn, n;
src1.split_code(&vn, &n);
int vm, m;
src2.split_code(&vm, &m);
emit(0x1E4U * B23 | d * B22 | B21 | vn * B16 | vd * B12 | 0xF * B8 | n * B7 |
B6 | m * B5 | B4 | vm);
emit(EncodeNeonRefinementOp(true, dst, src1, src2));
}
void Assembler::vtst(NeonSize size, QwNeonRegister dst,
......
......@@ -1383,6 +1383,14 @@ class Assembler : public AssemblerBase {
const QwNeonRegister src2);
void vmul(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
// vmin and vmax: the overloads without a NeonDataType operate on floating
// point lanes; the overloads taking a NeonDataType operate on integer lanes
// described by dt. Each computes Qd = op(Qn, Qm) with dst = Qd, src1 = Qn and
// src2 = Qm.
void vmin(const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
void vmin(NeonDataType dt, const QwNeonRegister dst,
const QwNeonRegister src1, const QwNeonRegister src2);
void vmax(const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
void vmax(NeonDataType dt, const QwNeonRegister dst,
const QwNeonRegister src1, const QwNeonRegister src2);
// vrecpe and vrsqrte only support floating point lanes.
void vrecpe(const QwNeonRegister dst, const QwNeonRegister src);
void vrsqrte(const QwNeonRegister dst, const QwNeonRegister src);
......
......@@ -1857,7 +1857,7 @@ static const char* const barrier_option_names[] = {
void Decoder::DecodeSpecialCondition(Instruction* instr) {
switch (instr->SpecialValue()) {
case 4:
if (instr->Bits(21, 20) == 2 && instr->Bits(11, 8) == 1 &&
if (instr->Bits(11, 8) == 1 && instr->Bits(21, 20) == 2 &&
instr->Bit(6) == 1 && instr->Bit(4) == 1) {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
......@@ -1898,7 +1898,7 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
// vmul.i<size> Qd, Qm, Qn.
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vmul.i%d q%d, q%d, q%d", size, Vd, Vn, Vm);
} else if (instr->Bits(21, 20) == 0 && instr->Bits(11, 8) == 0xe &&
} else if (instr->Bits(11, 8) == 0xe && instr->Bits(21, 20) == 0 &&
instr->Bit(4) == 0) {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
......@@ -1924,15 +1924,32 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "%s.s%d q%d, q%d, q%d", op,
size, Vd, Vn, Vm);
} else if (instr->Bit(20) == 0 && instr->Bits(11, 8) == 0xf &&
instr->Bit(6) == 1 && instr->Bit(4) == 1) {
} else if (instr->Bits(11, 8) == 0xf && instr->Bit(20) == 0 &&
instr->Bit(6) == 1) {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision);
const char* op = instr->Bit(21) == 0 ? "vrecps" : "vrsqrts";
// vrecps/vrsqrts.f32 Qd, Qm, Qn.
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"%s.f32 q%d, q%d, q%d", op, Vd, Vn, Vm);
if (instr->Bit(4) == 1) {
// vrecps/vrsqrts.f32 Qd, Qm, Qn.
const char* op = instr->Bit(21) == 0 ? "vrecps" : "vrsqrts";
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"%s.f32 q%d, q%d, q%d", op, Vd, Vn, Vm);
} else {
// vmin/max.f32 Qd, Qm, Qn.
const char* op = instr->Bit(21) == 1 ? "vmin" : "vmax";
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"%s.f32 q%d, q%d, q%d", op, Vd, Vn, Vm);
}
} else if (instr->Bits(11, 8) == 0x6) {
int size = kBitsPerByte * (1 << instr->Bits(21, 20));
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision);
// vmin/vmax.s<size> Qd, Qm, Qn.
const char* op = instr->Bit(4) == 1 ? "vmin" : "vmax";
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "%s.s%d q%d, q%d, q%d", op,
size, Vd, Vn, Vm);
} else {
Unknown(instr);
}
......@@ -1975,14 +1992,14 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
SNPrintF(out_buffer_ + out_buffer_pos_, "vceq.i%d q%d, q%d, q%d",
size, Vd, Vn, Vm);
}
} else if (instr->Bits(21, 20) == 1 && instr->Bits(11, 8) == 1 &&
} else if (instr->Bits(11, 8) == 1 && instr->Bits(21, 20) == 1 &&
instr->Bit(4) == 1) {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision);
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vbsl q%d, q%d, q%d", Vd, Vn, Vm);
} else if (instr->Bits(21, 20) == 0 && instr->Bits(11, 8) == 1 &&
} else if (instr->Bits(11, 8) == 1 && instr->Bits(21, 20) == 0 &&
instr->Bit(4) == 1) {
if (instr->Bit(6) == 0) {
// veor Dd, Dn, Dm
......@@ -2000,7 +2017,7 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"veor q%d, q%d, q%d", Vd, Vn, Vm);
}
} else if (instr->Bit(21) == 0 && instr->Bits(11, 8) == 0xd &&
} else if (instr->Bits(11, 8) == 0xd && instr->Bit(21) == 0 &&
instr->Bit(6) == 1 && instr->Bit(4) == 1) {
// vmul.f32 Qd, Qn, Qm
int Vd = instr->VFPDRegValue(kSimd128Precision);
......@@ -2008,7 +2025,7 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
int Vm = instr->VFPMRegValue(kSimd128Precision);
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vmul.f32 q%d, q%d, q%d", Vd, Vn, Vm);
} else if (instr->Bit(20) == 0 && instr->Bits(11, 8) == 0xe &&
} else if (instr->Bits(11, 8) == 0xe && instr->Bit(20) == 0 &&
instr->Bit(4) == 0) {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
......@@ -2027,6 +2044,16 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "%s.u%d q%d, q%d, q%d", op,
size, Vd, Vn, Vm);
} else if (instr->Bits(11, 8) == 0x6) {
int size = kBitsPerByte * (1 << instr->Bits(21, 20));
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision);
// vmin/vmax.u<size> Qd, Qm, Qn.
const char* op = instr->Bit(4) == 1 ? "vmin" : "vmax";
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "%s.u%d q%d, q%d, q%d", op,
size, Vd, Vn, Vm);
} else {
Unknown(instr);
}
......
......@@ -3830,7 +3830,7 @@ void Simulator::DecodeType6CoprocessorIns(Instruction* instr) {
void Simulator::DecodeSpecialCondition(Instruction* instr) {
switch (instr->SpecialValue()) {
case 4:
if (instr->Bits(21, 20) == 2 && instr->Bits(11, 8) == 1 &&
if (instr->Bits(11, 8) == 1 && instr->Bits(21, 20) == 2 &&
instr->Bit(4) == 1) {
// vmov Qd, Qm.
// vorr, Qd, Qm, Qn.
......@@ -3928,7 +3928,7 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
break;
}
}
} else if (instr->Bit(20) == 0 && instr->Bits(11, 8) == 0xd &&
} else if (instr->Bits(11, 8) == 0xd && instr->Bit(20) == 0 &&
instr->Bit(4) == 0) {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
......@@ -3988,7 +3988,7 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
UNIMPLEMENTED();
break;
}
} else if (instr->Bits(21, 20) == 0 && instr->Bits(11, 8) == 0xe &&
} else if (instr->Bits(11, 8) == 0xe && instr->Bits(21, 20) == 0 &&
instr->Bit(4) == 0) {
// vceq.f32.
int Vd = instr->VFPDRegValue(kSimd128Precision);
......@@ -4066,26 +4066,91 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
UNREACHABLE();
break;
}
} else if (instr->Bit(20) == 0 && instr->Bits(11, 8) == 0xf &&
instr->Bit(6) == 1 && instr->Bit(4) == 1) {
} else if (instr->Bits(11, 8) == 0xf && instr->Bit(20) == 0 &&
instr->Bit(6) == 1) {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision);
float src1[4], src2[4];
get_q_register(Vn, src1);
get_q_register(Vm, src2);
if (instr->Bit(21) == 0) {
// vrecps.f32 Qd, Qm, Qn.
for (int i = 0; i < 4; i++) {
src1[i] = 2.0f - src1[i] * src2[i];
if (instr->Bit(4) == 1) {
if (instr->Bit(21) == 0) {
// vrecps.f32 Qd, Qm, Qn.
for (int i = 0; i < 4; i++) {
src1[i] = 2.0f - src1[i] * src2[i];
}
} else {
// vrsqrts.f32 Qd, Qm, Qn.
for (int i = 0; i < 4; i++) {
src1[i] = (3.0f - src1[i] * src2[i]) * 0.5f;
}
}
} else {
// vrsqrts.f32 Qd, Qm, Qn.
for (int i = 0; i < 4; i++) {
src1[i] = (3.0f - src1[i] * src2[i]) * 0.5f;
if (instr->Bit(21) == 1) {
// vmin.f32 Qd, Qm, Qn.
for (int i = 0; i < 4; i++) {
src1[i] = std::min(src1[i], src2[i]);
}
} else {
// vmax.f32 Qd, Qm, Qn.
for (int i = 0; i < 4; i++) {
src1[i] = std::max(src1[i], src2[i]);
}
}
}
set_q_register(Vd, src1);
} else if (instr->Bits(11, 8) == 0x6) {
// vmin/vmax.s<size> Qd, Qm, Qn.
NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20));
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision);
bool min = instr->Bit(4) != 0;
switch (size) {
case Neon8: {
int8_t src1[16], src2[16];
get_q_register(Vn, src1);
get_q_register(Vm, src2);
for (int i = 0; i < 16; i++) {
if (min)
src1[i] = std::min(src1[i], src2[i]);
else
src1[i] = std::max(src1[i], src2[i]);
}
set_q_register(Vd, src1);
break;
}
case Neon16: {
int16_t src1[8], src2[8];
get_q_register(Vn, src1);
get_q_register(Vm, src2);
for (int i = 0; i < 8; i++) {
if (min)
src1[i] = std::min(src1[i], src2[i]);
else
src1[i] = std::max(src1[i], src2[i]);
}
set_q_register(Vd, src1);
break;
}
case Neon32: {
int32_t src1[4], src2[4];
get_q_register(Vn, src1);
get_q_register(Vm, src2);
for (int i = 0; i < 4; i++) {
if (min)
src1[i] = std::min(src1[i], src2[i]);
else
src1[i] = std::max(src1[i], src2[i]);
}
set_q_register(Vd, src1);
break;
}
default:
UNREACHABLE();
break;
}
} else {
UNIMPLEMENTED();
}
......@@ -4215,7 +4280,7 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
UNREACHABLE();
break;
}
} else if (instr->Bits(21, 20) == 1 && instr->Bits(11, 8) == 1 &&
} else if (instr->Bits(11, 8) == 1 && instr->Bits(21, 20) == 1 &&
instr->Bit(4) == 1) {
// vbsl.size Qd, Qm, Qn.
int Vd = instr->VFPDRegValue(kSimd128Precision);
......@@ -4229,7 +4294,7 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
dst[i] = (dst[i] & src1[i]) | (~dst[i] & src2[i]);
}
set_q_register(Vd, dst);
} else if (instr->Bits(21, 20) == 0 && instr->Bits(11, 8) == 1 &&
} else if (instr->Bits(11, 8) == 1 && instr->Bits(21, 20) == 0 &&
instr->Bit(4) == 1) {
if (instr->Bit(6) == 0) {
// veor Dd, Dn, Dm
......@@ -4253,7 +4318,7 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
for (int i = 0; i < 4; i++) src1[i] ^= src2[i];
set_q_register(Vd, src1);
}
} else if (instr->Bit(21) == 0 && instr->Bits(11, 8) == 0xd &&
} else if (instr->Bits(11, 8) == 0xd && instr->Bit(21) == 0 &&
instr->Bit(6) == 1 && instr->Bit(4) == 1) {
// vmul.f32 Qd, Qn, Qm
int Vd = instr->VFPDRegValue(kSimd128Precision);
......@@ -4266,7 +4331,7 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
src1[i] = src1[i] * src2[i];
}
set_q_register(Vd, src1);
} else if (instr->Bit(20) == 0 && instr->Bits(11, 8) == 0xe &&
} else if (instr->Bits(11, 8) == 0xe && instr->Bit(20) == 0 &&
instr->Bit(4) == 0) {
// vcge/vcgt.f32 Qd, Qm, Qn
bool ge = instr->Bit(21) == 0;
......@@ -4336,6 +4401,57 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
UNREACHABLE();
break;
}
} else if (instr->Bits(11, 8) == 0x6) {
// vmin/vmax.u<size> Qd, Qm, Qn.
NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20));
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision);
bool min = instr->Bit(4) != 0;
switch (size) {
case Neon8: {
uint8_t src1[16], src2[16];
get_q_register(Vn, src1);
get_q_register(Vm, src2);
for (int i = 0; i < 16; i++) {
if (min)
src1[i] = std::min(src1[i], src2[i]);
else
src1[i] = std::max(src1[i], src2[i]);
}
set_q_register(Vd, src1);
break;
}
case Neon16: {
uint16_t src1[8], src2[8];
get_q_register(Vn, src1);
get_q_register(Vm, src2);
for (int i = 0; i < 8; i++) {
if (min)
src1[i] = std::min(src1[i], src2[i]);
else
src1[i] = std::max(src1[i], src2[i]);
}
set_q_register(Vd, src1);
break;
}
case Neon32: {
uint32_t src1[4], src2[4];
get_q_register(Vn, src1);
get_q_register(Vm, src2);
for (int i = 0; i < 4; i++) {
if (min)
src1[i] = std::min(src1[i], src2[i]);
else
src1[i] = std::max(src1[i], src2[i]);
}
set_q_register(Vd, src1);
break;
}
default:
UNREACHABLE();
break;
}
} else {
UNIMPLEMENTED();
}
......
......@@ -1296,6 +1296,8 @@ TEST(15) {
uint32_t vneg_s8[4], vneg_s16[4], vneg_s32[4];
uint32_t veor[4], vand[4], vorr[4];
float vdupf[4], vaddf[4], vsubf[4], vmulf[4];
uint32_t vmin_s8[4], vmin_u16[4], vmin_s32[4];
uint32_t vmax_s8[4], vmax_u16[4], vmax_s32[4];
uint32_t vadd8[4], vadd16[4], vadd32[4];
uint32_t vsub8[4], vsub16[4], vsub32[4];
uint32_t vmul8[4], vmul16[4], vmul32[4];
......@@ -1303,6 +1305,7 @@ TEST(15) {
uint32_t vcge_s8[4], vcge_u16[4], vcge_s32[4];
uint32_t vcgt_s8[4], vcgt_u16[4], vcgt_s32[4];
float vrecpe[4], vrecps[4], vrsqrte[4], vrsqrts[4];
float vminf[4], vmaxf[4];
uint32_t vtst[4], vbsl[4];
uint32_t vext[4];
uint32_t vzip8a[4], vzip8b[4], vzip16a[4], vzip16b[4], vzip32a[4],
......@@ -1490,6 +1493,22 @@ TEST(15) {
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vorr))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vmin (float).
__ vmov(s4, 2.0);
__ vdup(q0, s4);
__ vmov(s4, 1.0);
__ vdup(q1, s4);
__ vmin(q1, q1, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vminf))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vmax (float).
__ vmov(s4, 2.0);
__ vdup(q0, s4);
__ vmov(s4, 1.0);
__ vdup(q1, s4);
__ vmax(q1, q1, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmaxf))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vadd (float).
__ vmov(s4, 1.0);
__ vdup(q0, s4);
......@@ -1560,6 +1579,35 @@ TEST(15) {
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vcgtf))));
__ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
// vmin/vmax integer.
__ mov(r4, Operand(0x03));
__ vdup(Neon16, q0, r4);
__ vdup(Neon8, q1, r4);
__ vmin(NeonS8, q2, q0, q1);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmin_s8))));
__ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
__ vmax(NeonS8, q2, q0, q1);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmax_s8))));
__ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
__ mov(r4, Operand(0xff));
__ vdup(Neon16, q0, r4);
__ vdup(Neon8, q1, r4);
__ vmin(NeonU16, q2, q0, q1);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmin_u16))));
__ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
__ vmax(NeonU16, q2, q0, q1);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmax_u16))));
__ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
__ mov(r4, Operand(0xff));
__ vdup(Neon32, q0, r4);
__ vdup(Neon8, q1, r4);
__ vmin(NeonS32, q2, q0, q1);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmin_s32))));
__ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
__ vmax(NeonS32, q2, q0, q1);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmax_s32))));
__ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
// vadd (integer).
__ mov(r4, Operand(0x81));
__ vdup(Neon8, q0, r4);
......@@ -1631,7 +1679,7 @@ TEST(15) {
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vceq))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vcge/vcgt.
// vcge/vcgt (integer).
__ mov(r4, Operand(0x03));
__ vdup(Neon16, q0, r4);
__ vdup(Neon8, q1, r4);
......@@ -1848,6 +1896,8 @@ TEST(15) {
CHECK_EQ_SPLAT(vand, 0x00fe00feu);
CHECK_EQ_SPLAT(vorr, 0x00ff00ffu);
CHECK_EQ_SPLAT(vaddf, 2.0);
CHECK_EQ_SPLAT(vminf, 1.0);
CHECK_EQ_SPLAT(vmaxf, 2.0);
CHECK_EQ_SPLAT(vsubf, -1.0);
CHECK_EQ_SPLAT(vmulf, 4.0);
CHECK_ESTIMATE_SPLAT(vrecpe, 0.5f, 0.1f); // 1 / 2
......@@ -1858,6 +1908,15 @@ TEST(15) {
// [0] >= [-1, 1, -0, 0]
CHECK_EQ_32X4(vcgef, 0u, 0xffffffffu, 0xffffffffu, 0xffffffffu);
CHECK_EQ_32X4(vcgtf, 0u, 0xffffffffu, 0u, 0u);
// [0, 3, 0, 3, ...] and [3, 3, 3, 3, ...]
CHECK_EQ_SPLAT(vmin_s8, 0x00030003u);
CHECK_EQ_SPLAT(vmax_s8, 0x03030303u);
// [0x00ff, 0x00ff, ...] and [0xffff, 0xffff, ...]
CHECK_EQ_SPLAT(vmin_u16, 0x00ff00ffu);
CHECK_EQ_SPLAT(vmax_u16, 0xffffffffu);
// [0x000000ff, 0x000000ff, ...] and [0xffffffff, 0xffffffff, ...]
CHECK_EQ_SPLAT(vmin_s32, 0xffffffffu);
CHECK_EQ_SPLAT(vmax_s32, 0xffu);
CHECK_EQ_SPLAT(vadd8, 0x03030303u);
CHECK_EQ_SPLAT(vadd16, 0x00030003u);
CHECK_EQ_SPLAT(vadd32, 0x00000003u);
......
......@@ -1029,6 +1029,16 @@ TEST(Neon) {
"f240e170 vand q15, q0, q8");
COMPARE(vorr(q15, q0, q8),
"f260e170 vorr q15, q0, q8");
COMPARE(vmin(q15, q0, q8),
"f260ef60 vmin.f32 q15, q0, q8");
COMPARE(vmax(q15, q0, q8),
"f240ef60 vmax.f32 q15, q0, q8");
COMPARE(vmax(NeonS8, q0, q1, q2),
"f2020644 vmax.s8 q0, q1, q2");
COMPARE(vmin(NeonU16, q1, q2, q8),
"f3142670 vmin.u16 q1, q2, q8");
COMPARE(vmax(NeonS32, q15, q0, q8),
"f260e660 vmax.s32 q15, q0, q8");
COMPARE(vadd(q15, q0, q8),
"f240ed60 vadd.f32 q15, q0, q8");
COMPARE(vadd(Neon8, q0, q1, q2),
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment