Commit a7e67924, authored by bbudge, committed by Commit bot

[ARM] Add vmin, vmax NEON instructions.

- Adds vmin, vmax for FP and integer vectors, both signed and unsigned.
- Regularizes switching logic in disasm and simulator for special codes
4 and 6.
- Factors vrecpe, vrsqrte, vrecps, vrsqrts into helper fns.

LOG=N
BUG=v8:4124

Review-Url: https://codereview.chromium.org/2623993006
Cr-Commit-Position: refs/heads/master@{#42385}
parent 39e455db
...@@ -4302,58 +4302,118 @@ void Assembler::vmul(NeonSize size, QwNeonRegister dst, ...@@ -4302,58 +4302,118 @@ void Assembler::vmul(NeonSize size, QwNeonRegister dst,
n * B7 | B6 | m * B5 | B4 | vm); n * B7 | B6 | m * B5 | B4 | vm);
} }
void Assembler::vrecpe(const QwNeonRegister dst, const QwNeonRegister src) { static Instr EncodeNeonMinMax(bool is_min, QwNeonRegister dst,
DCHECK(IsEnabled(NEON)); QwNeonRegister src1, QwNeonRegister src2) {
// Qd = vadd(Qn, Qm) SIMD reciprocal estimate.
// Instruction details available in ARM DDI 0406C.b, A8-1024.
int vd, d; int vd, d;
dst.split_code(&vd, &d); dst.split_code(&vd, &d);
int vn, n;
src1.split_code(&vn, &n);
int vm, m; int vm, m;
src.split_code(&vm, &m); src2.split_code(&vm, &m);
emit(0x1E7U * B23 | d * B22 | 0x3B * B16 | vd * B12 | 0x5 * B8 | B6 | m * B5 | int min = is_min ? 1 : 0;
vm); return 0x1E4U * B23 | d * B22 | min * B21 | vn * B16 | vd * B12 | 0xF * B8 |
n * B7 | B6 | m * B5 | vm;
} }
void Assembler::vrsqrte(const QwNeonRegister dst, const QwNeonRegister src) { static Instr EncodeNeonMinMax(bool is_min, NeonDataType dt, QwNeonRegister dst,
QwNeonRegister src1, QwNeonRegister src2) {
int vd, d;
dst.split_code(&vd, &d);
int vn, n;
src1.split_code(&vn, &n);
int vm, m;
src2.split_code(&vm, &m);
int min = is_min ? 1 : 0;
int size = (dt & NeonDataTypeSizeMask) / 2;
int U = dt & NeonDataTypeUMask;
return 0x1E4U * B23 | U | d * B22 | size * B20 | vn * B16 | vd * B12 |
0x6 * B8 | B6 | m * B5 | min * B4 | vm;
}
void Assembler::vmin(const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2) {
DCHECK(IsEnabled(NEON)); DCHECK(IsEnabled(NEON));
// Qd = vadd(Qn, Qm) SIMD reciprocal square root estimate. // Qd = vmin(Qn, Qm) SIMD floating point MIN.
// Instruction details available in ARM DDI 0406C.b, A8-1038. // Instruction details available in ARM DDI 0406C.b, A8-928.
emit(EncodeNeonMinMax(true, dst, src1, src2));
}
// Emits the NEON integer vmin on Q registers: Qd = min(Qn, Qm), with
// dst -> Qd, src1 -> Qn, src2 -> Qm.
// |dt| supplies the lane size and the unsigned flag that EncodeNeonMinMax
// folds into the instruction word. NEON support must be enabled (DCHECK).
void Assembler::vmin(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src1,
QwNeonRegister src2) {
DCHECK(IsEnabled(NEON));
// Qd = vmin(Qn, Qm) SIMD integer MIN.
// Instruction details available in ARM DDI 0406C.b, A8-926.
// First argument is is_min: true encodes vmin, false would encode vmax.
emit(EncodeNeonMinMax(true, dt, dst, src1, src2));
}
// Emits the NEON floating-point vmax on Q registers: Qd = max(Qn, Qm), with
// dst -> Qd, src1 -> Qn, src2 -> Qm. This overload takes no data type; the
// disassembler prints the resulting instruction as vmax.f32.
// NEON support must be enabled (DCHECK).
void Assembler::vmax(QwNeonRegister dst, QwNeonRegister src1,
QwNeonRegister src2) {
DCHECK(IsEnabled(NEON));
// Qd = vmax(Qn, Qm) SIMD floating point MAX.
// Instruction details available in ARM DDI 0406C.b, A8-928.
// First argument is is_min: false encodes vmax, true would encode vmin.
emit(EncodeNeonMinMax(false, dst, src1, src2));
}
// Emits the NEON integer vmax on Q registers: Qd = max(Qn, Qm), with
// dst -> Qd, src1 -> Qn, src2 -> Qm.
// |dt| supplies the lane size and the unsigned flag that EncodeNeonMinMax
// folds into the instruction word. NEON support must be enabled (DCHECK).
void Assembler::vmax(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src1,
QwNeonRegister src2) {
DCHECK(IsEnabled(NEON));
// Qd = vmax(Qn, Qm) SIMD integer MAX.
// Instruction details available in ARM DDI 0406C.b, A8-926.
// First argument is is_min: false encodes vmax, true would encode vmin.
emit(EncodeNeonMinMax(false, dt, dst, src1, src2));
}
static Instr EncodeNeonEstimateOp(bool is_rsqrt, QwNeonRegister dst,
QwNeonRegister src) {
int vd, d; int vd, d;
dst.split_code(&vd, &d); dst.split_code(&vd, &d);
int vm, m; int vm, m;
src.split_code(&vm, &m); src.split_code(&vm, &m);
emit(0x1E7U * B23 | d * B22 | 0x3B * B16 | vd * B12 | 0x5 * B8 | 0x3 * B6 | int rsqrt = is_rsqrt ? 1 : 0;
m * B5 | vm); return 0x1E7U * B23 | d * B22 | 0x3B * B16 | vd * B12 | 0x5 * B8 |
rsqrt * B7 | B6 | m * B5 | vm;
} }
void Assembler::vrecps(const QwNeonRegister dst, const QwNeonRegister src1, void Assembler::vrecpe(const QwNeonRegister dst, const QwNeonRegister src) {
const QwNeonRegister src2) {
DCHECK(IsEnabled(NEON)); DCHECK(IsEnabled(NEON));
// Qd = vadd(Qn, Qm) SIMD reciprocal refinement step. // Qd = vrecpe(Qm) SIMD reciprocal estimate.
// Instruction details available in ARM DDI 0406C.b, A8-1026. // Instruction details available in ARM DDI 0406C.b, A8-1024.
emit(EncodeNeonEstimateOp(false, dst, src));
}
// Emits the NEON reciprocal square root estimate on Q registers, reading
// |src| (Qm) and writing |dst| (Qd). The encoding is shared with vrecpe via
// EncodeNeonEstimateOp; only the is_rsqrt flag differs.
// NEON support must be enabled (DCHECK).
void Assembler::vrsqrte(const QwNeonRegister dst, const QwNeonRegister src) {
DCHECK(IsEnabled(NEON));
// Qd = vrsqrte(Qm) SIMD reciprocal square root estimate.
// Instruction details available in ARM DDI 0406C.b, A8-1038.
// First argument is is_rsqrt: true encodes vrsqrte, false encodes vrecpe.
emit(EncodeNeonEstimateOp(true, dst, src));
}
static Instr EncodeNeonRefinementOp(bool is_rsqrt, QwNeonRegister dst,
QwNeonRegister src1, QwNeonRegister src2) {
int vd, d; int vd, d;
dst.split_code(&vd, &d); dst.split_code(&vd, &d);
int vn, n; int vn, n;
src1.split_code(&vn, &n); src1.split_code(&vn, &n);
int vm, m; int vm, m;
src2.split_code(&vm, &m); src2.split_code(&vm, &m);
emit(0x1E4U * B23 | d * B22 | vn * B16 | vd * B12 | 0xF * B8 | n * B7 | B6 | int rsqrt = is_rsqrt ? 1 : 0;
m * B5 | B4 | vm); return 0x1E4U * B23 | d * B22 | rsqrt * B21 | vn * B16 | vd * B12 | 0xF * B8 |
n * B7 | B6 | m * B5 | B4 | vm;
}
void Assembler::vrecps(const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2) {
DCHECK(IsEnabled(NEON));
// Qd = vrecps(Qn, Qm) SIMD reciprocal refinement step.
// Instruction details available in ARM DDI 0406C.b, A8-1026.
emit(EncodeNeonRefinementOp(false, dst, src1, src2));
} }
void Assembler::vrsqrts(const QwNeonRegister dst, const QwNeonRegister src1, void Assembler::vrsqrts(const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2) { const QwNeonRegister src2) {
DCHECK(IsEnabled(NEON)); DCHECK(IsEnabled(NEON));
// Qd = vadd(Qn, Qm) SIMD reciprocal square root refinement step. // Qd = vrsqrts(Qn, Qm) SIMD reciprocal square root refinement step.
// Instruction details available in ARM DDI 0406C.b, A8-1040. // Instruction details available in ARM DDI 0406C.b, A8-1040.
int vd, d; emit(EncodeNeonRefinementOp(true, dst, src1, src2));
dst.split_code(&vd, &d);
int vn, n;
src1.split_code(&vn, &n);
int vm, m;
src2.split_code(&vm, &m);
emit(0x1E4U * B23 | d * B22 | B21 | vn * B16 | vd * B12 | 0xF * B8 | n * B7 |
B6 | m * B5 | B4 | vm);
} }
void Assembler::vtst(NeonSize size, QwNeonRegister dst, void Assembler::vtst(NeonSize size, QwNeonRegister dst,
......
...@@ -1383,6 +1383,14 @@ class Assembler : public AssemblerBase { ...@@ -1383,6 +1383,14 @@ class Assembler : public AssemblerBase {
const QwNeonRegister src2); const QwNeonRegister src2);
void vmul(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1, void vmul(NeonSize size, const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2); const QwNeonRegister src2);
void vmin(const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
void vmin(NeonDataType dt, const QwNeonRegister dst,
const QwNeonRegister src1, const QwNeonRegister src2);
void vmax(const QwNeonRegister dst, const QwNeonRegister src1,
const QwNeonRegister src2);
void vmax(NeonDataType dt, const QwNeonRegister dst,
const QwNeonRegister src1, const QwNeonRegister src2);
// vrecpe and vrsqrte only support floating point lanes. // vrecpe and vrsqrte only support floating point lanes.
void vrecpe(const QwNeonRegister dst, const QwNeonRegister src); void vrecpe(const QwNeonRegister dst, const QwNeonRegister src);
void vrsqrte(const QwNeonRegister dst, const QwNeonRegister src); void vrsqrte(const QwNeonRegister dst, const QwNeonRegister src);
......
...@@ -1857,7 +1857,7 @@ static const char* const barrier_option_names[] = { ...@@ -1857,7 +1857,7 @@ static const char* const barrier_option_names[] = {
void Decoder::DecodeSpecialCondition(Instruction* instr) { void Decoder::DecodeSpecialCondition(Instruction* instr) {
switch (instr->SpecialValue()) { switch (instr->SpecialValue()) {
case 4: case 4:
if (instr->Bits(21, 20) == 2 && instr->Bits(11, 8) == 1 && if (instr->Bits(11, 8) == 1 && instr->Bits(21, 20) == 2 &&
instr->Bit(6) == 1 && instr->Bit(4) == 1) { instr->Bit(6) == 1 && instr->Bit(4) == 1) {
int Vd = instr->VFPDRegValue(kSimd128Precision); int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision); int Vm = instr->VFPMRegValue(kSimd128Precision);
...@@ -1898,7 +1898,7 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) { ...@@ -1898,7 +1898,7 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
// vmul.i<size> Qd, Qm, Qn. // vmul.i<size> Qd, Qm, Qn.
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_, out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vmul.i%d q%d, q%d, q%d", size, Vd, Vn, Vm); "vmul.i%d q%d, q%d, q%d", size, Vd, Vn, Vm);
} else if (instr->Bits(21, 20) == 0 && instr->Bits(11, 8) == 0xe && } else if (instr->Bits(11, 8) == 0xe && instr->Bits(21, 20) == 0 &&
instr->Bit(4) == 0) { instr->Bit(4) == 0) {
int Vd = instr->VFPDRegValue(kSimd128Precision); int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision); int Vm = instr->VFPMRegValue(kSimd128Precision);
...@@ -1924,15 +1924,32 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) { ...@@ -1924,15 +1924,32 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
out_buffer_pos_ += out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "%s.s%d q%d, q%d, q%d", op, SNPrintF(out_buffer_ + out_buffer_pos_, "%s.s%d q%d, q%d, q%d", op,
size, Vd, Vn, Vm); size, Vd, Vn, Vm);
} else if (instr->Bit(20) == 0 && instr->Bits(11, 8) == 0xf && } else if (instr->Bits(11, 8) == 0xf && instr->Bit(20) == 0 &&
instr->Bit(6) == 1 && instr->Bit(4) == 1) { instr->Bit(6) == 1) {
int Vd = instr->VFPDRegValue(kSimd128Precision); int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision); int Vm = instr->VFPMRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision); int Vn = instr->VFPNRegValue(kSimd128Precision);
const char* op = instr->Bit(21) == 0 ? "vrecps" : "vrsqrts"; if (instr->Bit(4) == 1) {
// vrecps/vrsqrts.f32 Qd, Qm, Qn. // vrecps/vrsqrts.f32 Qd, Qm, Qn.
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_, const char* op = instr->Bit(21) == 0 ? "vrecps" : "vrsqrts";
"%s.f32 q%d, q%d, q%d", op, Vd, Vn, Vm); out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"%s.f32 q%d, q%d, q%d", op, Vd, Vn, Vm);
} else {
// vmin/max.f32 Qd, Qm, Qn.
const char* op = instr->Bit(21) == 1 ? "vmin" : "vmax";
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"%s.f32 q%d, q%d, q%d", op, Vd, Vn, Vm);
}
} else if (instr->Bits(11, 8) == 0x6) {
int size = kBitsPerByte * (1 << instr->Bits(21, 20));
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision);
// vmin/vmax.s<size> Qd, Qm, Qn.
const char* op = instr->Bit(4) == 1 ? "vmin" : "vmax";
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "%s.s%d q%d, q%d, q%d", op,
size, Vd, Vn, Vm);
} else { } else {
Unknown(instr); Unknown(instr);
} }
...@@ -1975,14 +1992,14 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) { ...@@ -1975,14 +1992,14 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
SNPrintF(out_buffer_ + out_buffer_pos_, "vceq.i%d q%d, q%d, q%d", SNPrintF(out_buffer_ + out_buffer_pos_, "vceq.i%d q%d, q%d, q%d",
size, Vd, Vn, Vm); size, Vd, Vn, Vm);
} }
} else if (instr->Bits(21, 20) == 1 && instr->Bits(11, 8) == 1 && } else if (instr->Bits(11, 8) == 1 && instr->Bits(21, 20) == 1 &&
instr->Bit(4) == 1) { instr->Bit(4) == 1) {
int Vd = instr->VFPDRegValue(kSimd128Precision); int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision); int Vm = instr->VFPMRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision); int Vn = instr->VFPNRegValue(kSimd128Precision);
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_, out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vbsl q%d, q%d, q%d", Vd, Vn, Vm); "vbsl q%d, q%d, q%d", Vd, Vn, Vm);
} else if (instr->Bits(21, 20) == 0 && instr->Bits(11, 8) == 1 && } else if (instr->Bits(11, 8) == 1 && instr->Bits(21, 20) == 0 &&
instr->Bit(4) == 1) { instr->Bit(4) == 1) {
if (instr->Bit(6) == 0) { if (instr->Bit(6) == 0) {
// veor Dd, Dn, Dm // veor Dd, Dn, Dm
...@@ -2000,7 +2017,7 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) { ...@@ -2000,7 +2017,7 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_, out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"veor q%d, q%d, q%d", Vd, Vn, Vm); "veor q%d, q%d, q%d", Vd, Vn, Vm);
} }
} else if (instr->Bit(21) == 0 && instr->Bits(11, 8) == 0xd && } else if (instr->Bits(11, 8) == 0xd && instr->Bit(21) == 0 &&
instr->Bit(6) == 1 && instr->Bit(4) == 1) { instr->Bit(6) == 1 && instr->Bit(4) == 1) {
// vmul.f32 Qd, Qn, Qm // vmul.f32 Qd, Qn, Qm
int Vd = instr->VFPDRegValue(kSimd128Precision); int Vd = instr->VFPDRegValue(kSimd128Precision);
...@@ -2008,7 +2025,7 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) { ...@@ -2008,7 +2025,7 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
int Vm = instr->VFPMRegValue(kSimd128Precision); int Vm = instr->VFPMRegValue(kSimd128Precision);
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_, out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vmul.f32 q%d, q%d, q%d", Vd, Vn, Vm); "vmul.f32 q%d, q%d, q%d", Vd, Vn, Vm);
} else if (instr->Bit(20) == 0 && instr->Bits(11, 8) == 0xe && } else if (instr->Bits(11, 8) == 0xe && instr->Bit(20) == 0 &&
instr->Bit(4) == 0) { instr->Bit(4) == 0) {
int Vd = instr->VFPDRegValue(kSimd128Precision); int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision); int Vm = instr->VFPMRegValue(kSimd128Precision);
...@@ -2027,6 +2044,16 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) { ...@@ -2027,6 +2044,16 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
out_buffer_pos_ += out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "%s.u%d q%d, q%d, q%d", op, SNPrintF(out_buffer_ + out_buffer_pos_, "%s.u%d q%d, q%d, q%d", op,
size, Vd, Vn, Vm); size, Vd, Vn, Vm);
} else if (instr->Bits(11, 8) == 0x6) {
int size = kBitsPerByte * (1 << instr->Bits(21, 20));
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision);
// vmin/vmax.u<size> Qd, Qm, Qn.
const char* op = instr->Bit(4) == 1 ? "vmin" : "vmax";
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "%s.u%d q%d, q%d, q%d", op,
size, Vd, Vn, Vm);
} else { } else {
Unknown(instr); Unknown(instr);
} }
......
...@@ -3830,7 +3830,7 @@ void Simulator::DecodeType6CoprocessorIns(Instruction* instr) { ...@@ -3830,7 +3830,7 @@ void Simulator::DecodeType6CoprocessorIns(Instruction* instr) {
void Simulator::DecodeSpecialCondition(Instruction* instr) { void Simulator::DecodeSpecialCondition(Instruction* instr) {
switch (instr->SpecialValue()) { switch (instr->SpecialValue()) {
case 4: case 4:
if (instr->Bits(21, 20) == 2 && instr->Bits(11, 8) == 1 && if (instr->Bits(11, 8) == 1 && instr->Bits(21, 20) == 2 &&
instr->Bit(4) == 1) { instr->Bit(4) == 1) {
// vmov Qd, Qm. // vmov Qd, Qm.
// vorr, Qd, Qm, Qn. // vorr, Qd, Qm, Qn.
...@@ -3928,7 +3928,7 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) { ...@@ -3928,7 +3928,7 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
break; break;
} }
} }
} else if (instr->Bit(20) == 0 && instr->Bits(11, 8) == 0xd && } else if (instr->Bits(11, 8) == 0xd && instr->Bit(20) == 0 &&
instr->Bit(4) == 0) { instr->Bit(4) == 0) {
int Vd = instr->VFPDRegValue(kSimd128Precision); int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision); int Vm = instr->VFPMRegValue(kSimd128Precision);
...@@ -3988,7 +3988,7 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) { ...@@ -3988,7 +3988,7 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
UNIMPLEMENTED(); UNIMPLEMENTED();
break; break;
} }
} else if (instr->Bits(21, 20) == 0 && instr->Bits(11, 8) == 0xe && } else if (instr->Bits(11, 8) == 0xe && instr->Bits(21, 20) == 0 &&
instr->Bit(4) == 0) { instr->Bit(4) == 0) {
// vceq.f32. // vceq.f32.
int Vd = instr->VFPDRegValue(kSimd128Precision); int Vd = instr->VFPDRegValue(kSimd128Precision);
...@@ -4066,26 +4066,91 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) { ...@@ -4066,26 +4066,91 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
UNREACHABLE(); UNREACHABLE();
break; break;
} }
} else if (instr->Bit(20) == 0 && instr->Bits(11, 8) == 0xf && } else if (instr->Bits(11, 8) == 0xf && instr->Bit(20) == 0 &&
instr->Bit(6) == 1 && instr->Bit(4) == 1) { instr->Bit(6) == 1) {
int Vd = instr->VFPDRegValue(kSimd128Precision); int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision); int Vm = instr->VFPMRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision); int Vn = instr->VFPNRegValue(kSimd128Precision);
float src1[4], src2[4]; float src1[4], src2[4];
get_q_register(Vn, src1); get_q_register(Vn, src1);
get_q_register(Vm, src2); get_q_register(Vm, src2);
if (instr->Bit(21) == 0) { if (instr->Bit(4) == 1) {
// vrecps.f32 Qd, Qm, Qn. if (instr->Bit(21) == 0) {
for (int i = 0; i < 4; i++) { // vrecps.f32 Qd, Qm, Qn.
src1[i] = 2.0f - src1[i] * src2[i]; for (int i = 0; i < 4; i++) {
src1[i] = 2.0f - src1[i] * src2[i];
}
} else {
// vrsqrts.f32 Qd, Qm, Qn.
for (int i = 0; i < 4; i++) {
src1[i] = (3.0f - src1[i] * src2[i]) * 0.5f;
}
} }
} else { } else {
// vrsqrts.f32 Qd, Qm, Qn. if (instr->Bit(21) == 1) {
for (int i = 0; i < 4; i++) { // vmin.f32 Qd, Qm, Qn.
src1[i] = (3.0f - src1[i] * src2[i]) * 0.5f; for (int i = 0; i < 4; i++) {
src1[i] = std::min(src1[i], src2[i]);
}
} else {
// vmax.f32 Qd, Qm, Qn.
for (int i = 0; i < 4; i++) {
src1[i] = std::max(src1[i], src2[i]);
}
} }
} }
set_q_register(Vd, src1); set_q_register(Vd, src1);
} else if (instr->Bits(11, 8) == 0x6) {
// vmin/vmax.s<size> Qd, Qm, Qn.
NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20));
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision);
bool min = instr->Bit(4) != 0;
switch (size) {
case Neon8: {
int8_t src1[16], src2[16];
get_q_register(Vn, src1);
get_q_register(Vm, src2);
for (int i = 0; i < 16; i++) {
if (min)
src1[i] = std::min(src1[i], src2[i]);
else
src1[i] = std::max(src1[i], src2[i]);
}
set_q_register(Vd, src1);
break;
}
case Neon16: {
int16_t src1[8], src2[8];
get_q_register(Vn, src1);
get_q_register(Vm, src2);
for (int i = 0; i < 8; i++) {
if (min)
src1[i] = std::min(src1[i], src2[i]);
else
src1[i] = std::max(src1[i], src2[i]);
}
set_q_register(Vd, src1);
break;
}
case Neon32: {
int32_t src1[4], src2[4];
get_q_register(Vn, src1);
get_q_register(Vm, src2);
for (int i = 0; i < 4; i++) {
if (min)
src1[i] = std::min(src1[i], src2[i]);
else
src1[i] = std::max(src1[i], src2[i]);
}
set_q_register(Vd, src1);
break;
}
default:
UNREACHABLE();
break;
}
} else { } else {
UNIMPLEMENTED(); UNIMPLEMENTED();
} }
...@@ -4215,7 +4280,7 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) { ...@@ -4215,7 +4280,7 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
UNREACHABLE(); UNREACHABLE();
break; break;
} }
} else if (instr->Bits(21, 20) == 1 && instr->Bits(11, 8) == 1 && } else if (instr->Bits(11, 8) == 1 && instr->Bits(21, 20) == 1 &&
instr->Bit(4) == 1) { instr->Bit(4) == 1) {
// vbsl.size Qd, Qm, Qn. // vbsl.size Qd, Qm, Qn.
int Vd = instr->VFPDRegValue(kSimd128Precision); int Vd = instr->VFPDRegValue(kSimd128Precision);
...@@ -4229,7 +4294,7 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) { ...@@ -4229,7 +4294,7 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
dst[i] = (dst[i] & src1[i]) | (~dst[i] & src2[i]); dst[i] = (dst[i] & src1[i]) | (~dst[i] & src2[i]);
} }
set_q_register(Vd, dst); set_q_register(Vd, dst);
} else if (instr->Bits(21, 20) == 0 && instr->Bits(11, 8) == 1 && } else if (instr->Bits(11, 8) == 1 && instr->Bits(21, 20) == 0 &&
instr->Bit(4) == 1) { instr->Bit(4) == 1) {
if (instr->Bit(6) == 0) { if (instr->Bit(6) == 0) {
// veor Dd, Dn, Dm // veor Dd, Dn, Dm
...@@ -4253,7 +4318,7 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) { ...@@ -4253,7 +4318,7 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
for (int i = 0; i < 4; i++) src1[i] ^= src2[i]; for (int i = 0; i < 4; i++) src1[i] ^= src2[i];
set_q_register(Vd, src1); set_q_register(Vd, src1);
} }
} else if (instr->Bit(21) == 0 && instr->Bits(11, 8) == 0xd && } else if (instr->Bits(11, 8) == 0xd && instr->Bit(21) == 0 &&
instr->Bit(6) == 1 && instr->Bit(4) == 1) { instr->Bit(6) == 1 && instr->Bit(4) == 1) {
// vmul.f32 Qd, Qn, Qm // vmul.f32 Qd, Qn, Qm
int Vd = instr->VFPDRegValue(kSimd128Precision); int Vd = instr->VFPDRegValue(kSimd128Precision);
...@@ -4266,7 +4331,7 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) { ...@@ -4266,7 +4331,7 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
src1[i] = src1[i] * src2[i]; src1[i] = src1[i] * src2[i];
} }
set_q_register(Vd, src1); set_q_register(Vd, src1);
} else if (instr->Bit(20) == 0 && instr->Bits(11, 8) == 0xe && } else if (instr->Bits(11, 8) == 0xe && instr->Bit(20) == 0 &&
instr->Bit(4) == 0) { instr->Bit(4) == 0) {
// vcge/vcgt.f32 Qd, Qm, Qn // vcge/vcgt.f32 Qd, Qm, Qn
bool ge = instr->Bit(21) == 0; bool ge = instr->Bit(21) == 0;
...@@ -4336,6 +4401,57 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) { ...@@ -4336,6 +4401,57 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
UNREACHABLE(); UNREACHABLE();
break; break;
} }
} else if (instr->Bits(11, 8) == 0x6) {
// vmin/vmax.u<size> Qd, Qm, Qn.
NeonSize size = static_cast<NeonSize>(instr->Bits(21, 20));
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int Vn = instr->VFPNRegValue(kSimd128Precision);
bool min = instr->Bit(4) != 0;
switch (size) {
case Neon8: {
uint8_t src1[16], src2[16];
get_q_register(Vn, src1);
get_q_register(Vm, src2);
for (int i = 0; i < 16; i++) {
if (min)
src1[i] = std::min(src1[i], src2[i]);
else
src1[i] = std::max(src1[i], src2[i]);
}
set_q_register(Vd, src1);
break;
}
case Neon16: {
uint16_t src1[8], src2[8];
get_q_register(Vn, src1);
get_q_register(Vm, src2);
for (int i = 0; i < 8; i++) {
if (min)
src1[i] = std::min(src1[i], src2[i]);
else
src1[i] = std::max(src1[i], src2[i]);
}
set_q_register(Vd, src1);
break;
}
case Neon32: {
uint32_t src1[4], src2[4];
get_q_register(Vn, src1);
get_q_register(Vm, src2);
for (int i = 0; i < 4; i++) {
if (min)
src1[i] = std::min(src1[i], src2[i]);
else
src1[i] = std::max(src1[i], src2[i]);
}
set_q_register(Vd, src1);
break;
}
default:
UNREACHABLE();
break;
}
} else { } else {
UNIMPLEMENTED(); UNIMPLEMENTED();
} }
......
...@@ -1296,6 +1296,8 @@ TEST(15) { ...@@ -1296,6 +1296,8 @@ TEST(15) {
uint32_t vneg_s8[4], vneg_s16[4], vneg_s32[4]; uint32_t vneg_s8[4], vneg_s16[4], vneg_s32[4];
uint32_t veor[4], vand[4], vorr[4]; uint32_t veor[4], vand[4], vorr[4];
float vdupf[4], vaddf[4], vsubf[4], vmulf[4]; float vdupf[4], vaddf[4], vsubf[4], vmulf[4];
uint32_t vmin_s8[4], vmin_u16[4], vmin_s32[4];
uint32_t vmax_s8[4], vmax_u16[4], vmax_s32[4];
uint32_t vadd8[4], vadd16[4], vadd32[4]; uint32_t vadd8[4], vadd16[4], vadd32[4];
uint32_t vsub8[4], vsub16[4], vsub32[4]; uint32_t vsub8[4], vsub16[4], vsub32[4];
uint32_t vmul8[4], vmul16[4], vmul32[4]; uint32_t vmul8[4], vmul16[4], vmul32[4];
...@@ -1303,6 +1305,7 @@ TEST(15) { ...@@ -1303,6 +1305,7 @@ TEST(15) {
uint32_t vcge_s8[4], vcge_u16[4], vcge_s32[4]; uint32_t vcge_s8[4], vcge_u16[4], vcge_s32[4];
uint32_t vcgt_s8[4], vcgt_u16[4], vcgt_s32[4]; uint32_t vcgt_s8[4], vcgt_u16[4], vcgt_s32[4];
float vrecpe[4], vrecps[4], vrsqrte[4], vrsqrts[4]; float vrecpe[4], vrecps[4], vrsqrte[4], vrsqrts[4];
float vminf[4], vmaxf[4];
uint32_t vtst[4], vbsl[4]; uint32_t vtst[4], vbsl[4];
uint32_t vext[4]; uint32_t vext[4];
uint32_t vzip8a[4], vzip8b[4], vzip16a[4], vzip16b[4], vzip32a[4], uint32_t vzip8a[4], vzip8b[4], vzip16a[4], vzip16b[4], vzip32a[4],
...@@ -1490,6 +1493,22 @@ TEST(15) { ...@@ -1490,6 +1493,22 @@ TEST(15) {
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vorr)))); __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vorr))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4)); __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vmin (float).
__ vmov(s4, 2.0);
__ vdup(q0, s4);
__ vmov(s4, 1.0);
__ vdup(q1, s4);
__ vmin(q1, q1, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vminf))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vmax (float).
__ vmov(s4, 2.0);
__ vdup(q0, s4);
__ vmov(s4, 1.0);
__ vdup(q1, s4);
__ vmax(q1, q1, q0);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmaxf))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vadd (float). // vadd (float).
__ vmov(s4, 1.0); __ vmov(s4, 1.0);
__ vdup(q0, s4); __ vdup(q0, s4);
...@@ -1560,6 +1579,35 @@ TEST(15) { ...@@ -1560,6 +1579,35 @@ TEST(15) {
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vcgtf)))); __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vcgtf))));
__ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4)); __ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
// vmin/vmax integer.
__ mov(r4, Operand(0x03));
__ vdup(Neon16, q0, r4);
__ vdup(Neon8, q1, r4);
__ vmin(NeonS8, q2, q0, q1);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmin_s8))));
__ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
__ vmax(NeonS8, q2, q0, q1);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmax_s8))));
__ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
__ mov(r4, Operand(0xff));
__ vdup(Neon16, q0, r4);
__ vdup(Neon8, q1, r4);
__ vmin(NeonU16, q2, q0, q1);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmin_u16))));
__ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
__ vmax(NeonU16, q2, q0, q1);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmax_u16))));
__ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
__ mov(r4, Operand(0xff));
__ vdup(Neon32, q0, r4);
__ vdup(Neon8, q1, r4);
__ vmin(NeonS32, q2, q0, q1);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmin_s32))));
__ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
__ vmax(NeonS32, q2, q0, q1);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmax_s32))));
__ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
// vadd (integer). // vadd (integer).
__ mov(r4, Operand(0x81)); __ mov(r4, Operand(0x81));
__ vdup(Neon8, q0, r4); __ vdup(Neon8, q0, r4);
...@@ -1631,7 +1679,7 @@ TEST(15) { ...@@ -1631,7 +1679,7 @@ TEST(15) {
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vceq)))); __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vceq))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4)); __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
// vcge/vcgt. // vcge/vcgt (integer).
__ mov(r4, Operand(0x03)); __ mov(r4, Operand(0x03));
__ vdup(Neon16, q0, r4); __ vdup(Neon16, q0, r4);
__ vdup(Neon8, q1, r4); __ vdup(Neon8, q1, r4);
...@@ -1848,6 +1896,8 @@ TEST(15) { ...@@ -1848,6 +1896,8 @@ TEST(15) {
CHECK_EQ_SPLAT(vand, 0x00fe00feu); CHECK_EQ_SPLAT(vand, 0x00fe00feu);
CHECK_EQ_SPLAT(vorr, 0x00ff00ffu); CHECK_EQ_SPLAT(vorr, 0x00ff00ffu);
CHECK_EQ_SPLAT(vaddf, 2.0); CHECK_EQ_SPLAT(vaddf, 2.0);
CHECK_EQ_SPLAT(vminf, 1.0);
CHECK_EQ_SPLAT(vmaxf, 2.0);
CHECK_EQ_SPLAT(vsubf, -1.0); CHECK_EQ_SPLAT(vsubf, -1.0);
CHECK_EQ_SPLAT(vmulf, 4.0); CHECK_EQ_SPLAT(vmulf, 4.0);
CHECK_ESTIMATE_SPLAT(vrecpe, 0.5f, 0.1f); // 1 / 2 CHECK_ESTIMATE_SPLAT(vrecpe, 0.5f, 0.1f); // 1 / 2
...@@ -1858,6 +1908,15 @@ TEST(15) { ...@@ -1858,6 +1908,15 @@ TEST(15) {
// [0] >= [-1, 1, -0, 0] // [0] >= [-1, 1, -0, 0]
CHECK_EQ_32X4(vcgef, 0u, 0xffffffffu, 0xffffffffu, 0xffffffffu); CHECK_EQ_32X4(vcgef, 0u, 0xffffffffu, 0xffffffffu, 0xffffffffu);
CHECK_EQ_32X4(vcgtf, 0u, 0xffffffffu, 0u, 0u); CHECK_EQ_32X4(vcgtf, 0u, 0xffffffffu, 0u, 0u);
// [0, 3, 0, 3, ...] and [3, 3, 3, 3, ...]
CHECK_EQ_SPLAT(vmin_s8, 0x00030003u);
CHECK_EQ_SPLAT(vmax_s8, 0x03030303u);
// [0x00ff, 0x00ff, ...] and [0xffff, 0xffff, ...]
CHECK_EQ_SPLAT(vmin_u16, 0x00ff00ffu);
CHECK_EQ_SPLAT(vmax_u16, 0xffffffffu);
// [0x000000ff, 0x000000ff, ...] and [0xffffffff, 0xffffffff, ...]
CHECK_EQ_SPLAT(vmin_s32, 0xffffffffu);
CHECK_EQ_SPLAT(vmax_s32, 0xffu);
CHECK_EQ_SPLAT(vadd8, 0x03030303u); CHECK_EQ_SPLAT(vadd8, 0x03030303u);
CHECK_EQ_SPLAT(vadd16, 0x00030003u); CHECK_EQ_SPLAT(vadd16, 0x00030003u);
CHECK_EQ_SPLAT(vadd32, 0x00000003u); CHECK_EQ_SPLAT(vadd32, 0x00000003u);
......
...@@ -1029,6 +1029,16 @@ TEST(Neon) { ...@@ -1029,6 +1029,16 @@ TEST(Neon) {
"f240e170 vand q15, q0, q8"); "f240e170 vand q15, q0, q8");
COMPARE(vorr(q15, q0, q8), COMPARE(vorr(q15, q0, q8),
"f260e170 vorr q15, q0, q8"); "f260e170 vorr q15, q0, q8");
COMPARE(vmin(q15, q0, q8),
"f260ef60 vmin.f32 q15, q0, q8");
COMPARE(vmax(q15, q0, q8),
"f240ef60 vmax.f32 q15, q0, q8");
COMPARE(vmax(NeonS8, q0, q1, q2),
"f2020644 vmax.s8 q0, q1, q2");
COMPARE(vmin(NeonU16, q1, q2, q8),
"f3142670 vmin.u16 q1, q2, q8");
COMPARE(vmax(NeonS32, q15, q0, q8),
"f260e660 vmax.s32 q15, q0, q8");
COMPARE(vadd(q15, q0, q8), COMPARE(vadd(q15, q0, q8),
"f240ed60 vadd.f32 q15, q0, q8"); "f240ed60 vadd.f32 q15, q0, q8");
COMPARE(vadd(Neon8, q0, q1, q2), COMPARE(vadd(Neon8, q0, q1, q2),
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment