Commit d7a09280 authored by bbudge, committed by Commit bot

[ARM] Implement widening and narrowing integer moves, vmovl, vqmovn.

- Fixes vmovl for widening 16 to 32, 32 to 64.
- Adds vqmovn.

LOG=N
BUG=v8:6020

Review-Url: https://codereview.chromium.org/2773303002
Cr-Commit-Position: refs/heads/master@{#44156}
parent bd8447af
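
For context: vmovl takes each lane of a 64-bit D register and sign- or zero-extends it into a lane of twice the width in a 128-bit Q register; vqmovn goes the other way, narrowing each Q-register lane to half its width and saturating values that do not fit. A standalone sketch of the per-lane widening semantics (illustrative lane values, not data from this CL):

  #include <cstdint>
  // vmovl.s8 q, d: each int8_t lane sign-extends to an int16_t lane.
  int main() {
    int8_t d[8] = {-1, 2, -3, 4, -5, 6, -7, 8};  // one D register's lanes
    int16_t q[8];                                // one Q register's lanes
    for (int i = 0; i < 8; i++) q[i] = d[i];     // implicit sign extension
    return q[0] == -1 && q[7] == 8 ? 0 : 1;
  }
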
......@@ -3919,6 +3919,21 @@ void Assembler::vmovl(NeonDataType dt, QwNeonRegister dst, DwVfpRegister src) {
0xA * B8 | m * B5 | B4 | vm);
}
void Assembler::vqmovn(NeonDataType dt, DwVfpRegister dst, QwNeonRegister src) {
// Instruction details available in ARM DDI 0406C.b, A8.8.1004.
// vqmovn.<type><size> Dd, Qm. ARM vector narrowing move with saturation.
DCHECK(IsEnabled(NEON));
int vd, d;
dst.split_code(&vd, &d);
int vm, m;
src.split_code(&vm, &m);
int size = NeonSz(dt);
int u = NeonU(dt);
int op = u != 0 ? 3 : 2;
emit(0x1E7U * B23 | d * B22 | 0x3 * B20 | size * B18 | 0x2 * B16 | vd * B12 |
0x2 * B8 | op * B6 | m * B5 | vm);
}
static int EncodeScalar(NeonDataType dt, int index) {
int opc1_opc2 = 0;
DCHECK_LE(0, index);
......
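To sanity-check the bit layout in the emit() call above, the word it produces for vqmovn(NeonU8, d16, q8) can be recomputed with plain shifts and compared against the byte pattern the disassembler test below expects (f3f202e0). This is a standalone sketch: the B* constants and the d/vd, m/vm register splits are inlined here rather than taken from V8's headers.

  #include <cstdint>
  #include <cstdio>
  int main() {
    const uint32_t B5 = 1u << 5, B6 = 1u << 6, B8 = 1u << 8, B12 = 1u << 12,
                   B16 = 1u << 16, B18 = 1u << 18, B20 = 1u << 20,
                   B22 = 1u << 22, B23 = 1u << 23;
    // dst d16 splits into D:Vd = 1:0000; src q8 encodes as d16, M:Vm = 1:0000.
    int d = 1, vd = 0, m = 1, vm = 0;
    int size = 0;             // NeonU8: 8-bit destination lanes
    int u = 1;                // unsigned saturation
    int op = u != 0 ? 3 : 2;
    uint32_t instr = 0x1E7U * B23 | d * B22 | 0x3 * B20 | size * B18 |
                     0x2 * B16 | vd * B12 | 0x2 * B8 | op * B6 | m * B5 | vm;
    printf("%08x\n", instr);  // f3f202e0, as in test-disasm-arm.cc below
    return instr == 0xF3F202E0u ? 0 : 1;
  }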
......@@ -1321,7 +1321,10 @@ class Assembler : public AssemblerBase {
void vst1(NeonSize size,
const NeonListOperand& src,
const NeonMemOperand& dst);
// dt represents the narrower type.
void vmovl(NeonDataType dt, QwNeonRegister dst, DwVfpRegister src);
// dt represents the narrower type.
void vqmovn(NeonDataType dt, DwVfpRegister dst, QwNeonRegister src);
// Only unconditional core <-> scalar moves are currently supported.
void vmov(NeonDataType dt, DwVfpRegister dst, int index, Register src);
......
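Both declarations follow the convention that dt names the narrower element type, whichever direction the data moves. A usage sketch mirroring the macro-assembler tests later in this commit:

  __ vmovl(NeonS8, q1, d1);    // widen: s8 lanes in d1 -> s16 lanes in q1
  __ vqmovn(NeonU16, d0, q2);  // narrow: u32 lanes in q2 -> u16 lanes in d0,
                               // saturating lanes that do not fit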
......@@ -1576,19 +1576,19 @@ void Decoder::DecodeTypeVFP(Instruction* instr) {
Format(instr, "vmov'cond.32 'rt, 'Dd[1]");
}
} else {
const char* sign = instr->Bit(23) != 0 ? "u" : "s";
char sign = instr->Bit(23) != 0 ? 'u' : 's';
int rt = instr->RtValue();
int vn = instr->VFPNRegValue(kDoublePrecision);
if ((opc1_opc2 & 0x8) != 0) {
// NeonS8 / NeonU8
int i = opc1_opc2 & 0x7;
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vmov.%s8 r%d, d%d[%d]", sign, rt, vn, i);
"vmov.%c8 r%d, d%d[%d]", sign, rt, vn, i);
} else if ((opc1_opc2 & 0x1) != 0) {
// NeonS16 / NeonU16
int i = (opc1_opc2 >> 1) & 0x3;
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vmov.%s16 r%d, d%d[%d]",
SNPrintF(out_buffer_ + out_buffer_pos_, "vmov.%c16 r%d, d%d[%d]",
sign, rt, vn, i);
} else {
Unknown(instr);
......@@ -2166,8 +2166,7 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
int imm3 = instr->Bits(21, 19);
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"vmovl.u%d q%d, d%d", imm3 * 8, Vd, Vm);
} else if (instr->Opc1Value() == 7 && instr->Bits(21, 20) == 0x3 &&
instr->Bit(4) == 0) {
} else if (instr->Opc1Value() == 7 && instr->Bit(4) == 0) {
if (instr->Bits(17, 16) == 0x2 && instr->Bits(11, 7) == 0) {
if (instr->Bit(6) == 0) {
int Vd = instr->VFPDRegValue(kDoublePrecision);
......@@ -2256,16 +2255,16 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
int size = kBitsPerByte * (1 << instr->Bits(19, 18));
const char* type = instr->Bit(10) != 0 ? "f" : "s";
char type = instr->Bit(10) != 0 ? 'f' : 's';
if (instr->Bits(9, 6) == 0xd) {
// vabs<type>.<size> Qd, Qm.
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vabs.%s%d q%d, q%d",
SNPrintF(out_buffer_ + out_buffer_pos_, "vabs.%c%d q%d, q%d",
type, size, Vd, Vm);
} else if (instr->Bits(9, 6) == 0xf) {
// vneg<type>.<size> Qd, Qm.
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vneg.%s%d q%d, q%d",
SNPrintF(out_buffer_ + out_buffer_pos_, "vneg.%c%d q%d, q%d",
type, size, Vd, Vm);
} else {
Unknown(instr);
......@@ -2278,6 +2277,16 @@ void Decoder::DecodeSpecialCondition(Instruction* instr) {
const char* op = instr->Bit(7) == 0 ? "vrecpe" : "vrsqrte";
out_buffer_pos_ += SNPrintF(out_buffer_ + out_buffer_pos_,
"%s.f32 q%d, q%d", op, Vd, Vm);
} else if (instr->Bits(17, 16) == 0x2 && instr->Bits(11, 8) == 0x2 &&
instr->Bits(7, 6) != 0) {
// vqmovn.<type><size> Dd, Qm.
int Vd = instr->VFPDRegValue(kDoublePrecision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
char type = instr->Bit(6) != 0 ? 'u' : 's';
int size = 2 * kBitsPerByte * (1 << instr->Bits(19, 18));
out_buffer_pos_ +=
SNPrintF(out_buffer_ + out_buffer_pos_, "vqmovn.%c%i d%d, q%d",
type, size, Vd, Vm);
} else {
Unknown(instr);
}
......
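As a worked example of this decode path, extracting the fields of f3f202e0 (the encoding checked in test-disasm-arm.cc below) with bare shifts reproduces the expected text. The sketch inlines what Bits(), VFPDRegValue(kDoublePrecision), and VFPMRegValue(kSimd128Precision) compute:

  #include <cstdint>
  #include <cstdio>
  int main() {
    uint32_t instr = 0xF3F202E0u;
    int vd = ((instr >> 22) & 1) << 4 | ((instr >> 12) & 0xF);  // D:Vd = 16 -> d16
    int vm = (((instr >> 5) & 1) << 4 | (instr & 0xF)) >> 1;    // M:Vm = 16 -> q8
    char type = ((instr >> 6) & 1) != 0 ? 'u' : 's';            // bit 6: unsigned
    int size = 2 * 8 * (1 << ((instr >> 18) & 0x3));            // source lane width
    printf("vqmovn.%c%d d%d, q%d\n", type, size, vd, vm);       // vqmovn.u16 d16, q8
    return 0;
  }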
......@@ -3996,10 +3996,24 @@ void Simulator::DecodeType6CoprocessorIns(Instruction* instr) {
// Templated operations for NEON instructions.
// TODO(bbudge) Add more templates for use in DecodeSpecialCondition.
template <typename T>
int64_t Widen(T value) {
template <typename T, typename U>
U Widen(T value) {
static_assert(sizeof(int64_t) > sizeof(T), "T must be int32_t or smaller");
return static_cast<int64_t>(value);
static_assert(sizeof(U) > sizeof(T), "T must be smaller than U");
return static_cast<U>(value);
}
template <typename T, typename U>
U Narrow(T value) {
static_assert(sizeof(int8_t) < sizeof(T), "T must be int16_t or larger");
static_assert(sizeof(U) < sizeof(T), "T must be larger than U");
static_assert(std::is_unsigned<T>() == std::is_unsigned<U>(),
"Signed-ness of T and U must match");
// Make sure value can be expressed in the smaller type; otherwise, the
// casted result is implementation defined.
DCHECK_LE(std::numeric_limits<U>::min(), value);
DCHECK_GE(std::numeric_limits<U>::max(), value);
return static_cast<U>(value);
}
template <typename T>
......@@ -4016,6 +4030,30 @@ T MinMax(T a, T b, bool is_min) {
return is_min ? std::min(a, b) : std::max(a, b);
}
template <typename T, typename U>
void Widen(Simulator* simulator, int Vd, int Vm) {
static const int kLanes = 8 / sizeof(T);
T src[kLanes];
U dst[kLanes];
simulator->get_d_register(Vm, src);
for (int i = 0; i < kLanes; i++) {
dst[i] = Widen<T, U>(src[i]);
}
simulator->set_q_register(Vd, dst);
}
template <typename T, typename U>
void SaturatingNarrow(Simulator* simulator, int Vd, int Vm) {
static const int kLanes = 16 / sizeof(T);
T src[kLanes];
U dst[kLanes];
simulator->get_q_register(Vm, src);
for (int i = 0; i < kLanes; i++) {
dst[i] = Narrow<T, U>(Clamp<U>(src[i]));
}
simulator->set_d_register(Vd, dst);
}
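
SaturatingNarrow leans on Clamp, an existing helper in this file that the hunk does not show, to pull each wide lane into the destination type's range before Narrow performs the now-safe cast. A sketch of that clamp-then-cast step, with the assumed Clamp behavior re-implemented via std::min/std::max:

  #include <algorithm>
  #include <cstdint>
  #include <limits>
  int main() {
    int16_t lane = 300;  // does not fit in int8_t
    int64_t lo = std::numeric_limits<int8_t>::min();
    int64_t hi = std::numeric_limits<int8_t>::max();
    int64_t clamped = std::max(lo, std::min(hi, static_cast<int64_t>(lane)));
    int8_t narrowed = static_cast<int8_t>(clamped);  // 127, not the wrapped 44
    return narrowed == 127 ? 0 : 1;
  }
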
template <typename T>
void AddSaturate(Simulator* simulator, int Vd, int Vm, int Vn) {
static const int kLanes = 16 / sizeof(T);
......@@ -4023,7 +4061,7 @@ void AddSaturate(Simulator* simulator, int Vd, int Vm, int Vn) {
simulator->get_q_register(Vn, src1);
simulator->get_q_register(Vm, src2);
for (int i = 0; i < kLanes; i++) {
src1[i] = Clamp<T>(Widen(src1[i]) + Widen(src2[i]));
src1[i] = Clamp<T>(Widen<T, int64_t>(src1[i]) + Widen<T, int64_t>(src2[i]));
}
simulator->set_q_register(Vd, src1);
}
......@@ -4035,7 +4073,7 @@ void SubSaturate(Simulator* simulator, int Vd, int Vm, int Vn) {
simulator->get_q_register(Vn, src1);
simulator->get_q_register(Vm, src2);
for (int i = 0; i < kLanes; i++) {
src1[i] = Clamp<T>(Widen(src1[i]) - Widen(src2[i]));
src1[i] = Clamp<T>(Widen<T, int64_t>(src1[i]) - Widen<T, int64_t>(src2[i]));
}
simulator->set_q_register(Vd, src1);
}
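
The explicit Widen<T, int64_t> here is what keeps the saturating add and subtract correct: combining two int32_t lanes directly could overflow, which is undefined behavior for signed types, whereas the 64-bit result is always exact and can then be clamped back into range. A minimal illustration:

  #include <cstdint>
  int main() {
    int32_t a = INT32_MAX, b = 1;               // a + b would overflow as int32_t
    int64_t sum = static_cast<int64_t>(a) + b;  // exact 64-bit sum
    int32_t sat = sum > INT32_MAX   ? INT32_MAX
                  : sum < INT32_MIN ? INT32_MIN
                                    : static_cast<int32_t>(sum);
    return sat == INT32_MAX ? 0 : 1;            // vqadd.s32 saturates, not wraps
  }
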
......@@ -4464,21 +4502,23 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
(instr->Bit(4) == 1)) {
// vmovl signed
if ((instr->VdValue() & 1) != 0) UNIMPLEMENTED();
int Vd = (instr->Bit(22) << 3) | (instr->VdValue() >> 1);
int Vm = (instr->Bit(5) << 4) | instr->VmValue();
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kDoublePrecision);
int imm3 = instr->Bits(21, 19);
if ((imm3 != 1) && (imm3 != 2) && (imm3 != 4)) UNIMPLEMENTED();
int esize = 8 * imm3;
int elements = 64 / esize;
int8_t from[8];
get_d_register(Vm, reinterpret_cast<uint64_t*>(from));
int16_t to[8];
int e = 0;
while (e < elements) {
to[e] = from[e];
e++;
switch (imm3) {
case 1:
Widen<int8_t, int16_t>(this, Vd, Vm);
break;
case 2:
Widen<int16_t, int32_t>(this, Vd, Vm);
break;
case 4:
Widen<int32_t, int64_t>(this, Vd, Vm);
break;
default:
UNIMPLEMENTED();
break;
}
set_q_register(Vd, reinterpret_cast<uint64_t*>(to));
} else if (instr->Bits(21, 20) == 3 && instr->Bit(4) == 0) {
// vext.
int imm4 = instr->Bits(11, 8);
......@@ -4930,21 +4970,23 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
(instr->Bit(4) == 1)) {
// vmovl unsigned
if ((instr->VdValue() & 1) != 0) UNIMPLEMENTED();
int Vd = (instr->Bit(22) << 3) | (instr->VdValue() >> 1);
int Vm = (instr->Bit(5) << 4) | instr->VmValue();
int Vd = instr->VFPDRegValue(kSimd128Precision);
int Vm = instr->VFPMRegValue(kDoublePrecision);
int imm3 = instr->Bits(21, 19);
if ((imm3 != 1) && (imm3 != 2) && (imm3 != 4)) UNIMPLEMENTED();
int esize = 8 * imm3;
int elements = 64 / esize;
uint8_t from[8];
get_d_register(Vm, reinterpret_cast<uint64_t*>(from));
uint16_t to[8];
int e = 0;
while (e < elements) {
to[e] = from[e];
e++;
switch (imm3) {
case 1:
Widen<uint8_t, uint16_t>(this, Vd, Vm);
break;
case 2:
Widen<uint16_t, uint32_t>(this, Vd, Vm);
break;
case 4:
Widen<uint32_t, uint64_t>(this, Vd, Vm);
break;
default:
UNIMPLEMENTED();
break;
}
set_q_register(Vd, reinterpret_cast<uint64_t*>(to));
} else if (instr->Opc1Value() == 7 && instr->Bit(4) == 0) {
if (instr->Bits(19, 16) == 0xB && instr->Bits(11, 9) == 0x3 &&
instr->Bit(6) == 1) {
......@@ -5392,6 +5434,42 @@ void Simulator::DecodeSpecialCondition(Instruction* instr) {
}
}
set_q_register(Vd, src);
} else if (instr->Bits(17, 16) == 0x2 && instr->Bits(11, 8) == 0x2 &&
instr->Bits(7, 6) != 0) {
// vqmovn.<type><size> Dd, Qm.
int Vd = instr->VFPDRegValue(kDoublePrecision);
int Vm = instr->VFPMRegValue(kSimd128Precision);
NeonSize size = static_cast<NeonSize>(instr->Bits(19, 18));
bool is_unsigned = instr->Bit(6) != 0;
switch (size) {
case Neon8: {
if (is_unsigned) {
SaturatingNarrow<uint16_t, uint8_t>(this, Vd, Vm);
} else {
SaturatingNarrow<int16_t, int8_t>(this, Vd, Vm);
}
break;
}
case Neon16: {
if (is_unsigned) {
SaturatingNarrow<uint32_t, uint16_t>(this, Vd, Vm);
} else {
SaturatingNarrow<int32_t, int16_t>(this, Vd, Vm);
}
break;
}
case Neon32: {
if (is_unsigned) {
SaturatingNarrow<uint64_t, uint32_t>(this, Vd, Vm);
} else {
SaturatingNarrow<int64_t, int32_t>(this, Vd, Vm);
}
break;
}
default:
UNIMPLEMENTED();
break;
}
} else {
UNIMPLEMENTED();
}
......
......@@ -1281,16 +1281,14 @@ TEST(15) {
uint32_t dstA1;
uint32_t dstA2;
uint32_t dstA3;
uint32_t dstA4;
uint32_t dstA5;
uint32_t dstA6;
uint32_t dstA7;
uint32_t lane_test[4];
uint64_t vmov_to_scalar1, vmov_to_scalar2;
uint32_t vmov_from_scalar_s8, vmov_from_scalar_u8;
uint32_t vmov_from_scalar_s16, vmov_from_scalar_u16;
uint32_t vmov_from_scalar_32;
uint32_t vmov[4], vmvn[4];
uint32_t vmovl_s8[4], vmovl_u16[4], vmovl_s32[4];
uint32_t vqmovn_s8[2], vqmovn_u16[2], vqmovn_s32[2];
int32_t vcvt_s32_f32[4];
uint32_t vcvt_u32_f32[4];
float vcvt_f32_s32[4], vcvt_f32_u32[4];
......@@ -1354,9 +1352,23 @@ TEST(15) {
// The same expansion, but with different source and destination registers.
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, srcA0))));
__ vld1(Neon8, NeonListOperand(d1), NeonMemOperand(r4));
__ vmovl(NeonU8, q1, d1);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, dstA4))));
__ vst1(Neon8, NeonListOperand(d2, 2), NeonMemOperand(r4));
__ vmovl(NeonS8, q1, d1);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmovl_s8))));
__ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
__ vmovl(NeonU16, q2, d3);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmovl_u16))));
__ vst1(Neon8, NeonListOperand(q2), NeonMemOperand(r4));
__ vmovl(NeonS32, q3, d4);
__ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vmovl_s32))));
__ vst1(Neon8, NeonListOperand(q3), NeonMemOperand(r4));
// Narrow what we widened.
__ vqmovn(NeonU16, d0, q2);
__ vstr(d0, r0, offsetof(T, vqmovn_u16));
__ vmov(d1, d0);
__ vqmovn(NeonS8, d2, q0);
__ vstr(d2, r0, offsetof(T, vqmovn_s8));
__ vqmovn(NeonS32, d4, q3);
__ vstr(d4, r0, offsetof(T, vqmovn_s32));
// ARM core register to scalar.
__ mov(r4, Operand(0xfffffff8));
......@@ -1987,10 +1999,6 @@ TEST(15) {
t.dstA1 = 0;
t.dstA2 = 0;
t.dstA3 = 0;
t.dstA4 = 0;
t.dstA5 = 0;
t.dstA6 = 0;
t.dstA7 = 0;
t.lane_test[0] = 0x03020100;
t.lane_test[1] = 0x07060504;
t.lane_test[2] = 0x0b0a0908;
......@@ -2010,10 +2018,13 @@ TEST(15) {
CHECK_EQ(0x00410042u, t.dstA1);
CHECK_EQ(0x00830084u, t.dstA2);
CHECK_EQ(0x00810082u, t.dstA3);
CHECK_EQ(0x00430044u, t.dstA4);
CHECK_EQ(0x00410042u, t.dstA5);
CHECK_EQ(0x00830084u, t.dstA6);
CHECK_EQ(0x00810082u, t.dstA7);
CHECK_EQ_32X4(vmovl_s8, 0x00430044u, 0x00410042u, 0xff83ff84u, 0xff81ff82u);
CHECK_EQ_32X4(vmovl_u16, 0xff84u, 0xff83u, 0xff82u, 0xff81u);
CHECK_EQ_32X4(vmovl_s32, 0xff84u, 0x0u, 0xff83u, 0x0u);
CHECK_EQ_32X2(vqmovn_u16, 0xff83ff84u, 0xff81ff82u);
CHECK_EQ_32X2(vqmovn_s8, 0x81828384u, 0x81828384u);
CHECK_EQ_32X2(vqmovn_s32, 0xff84u, 0xff83u);
CHECK_EQ(0xfffffff8fff8f800u, t.vmov_to_scalar1);
CHECK_EQ(0xfff80000f8000000u, t.vmov_to_scalar2);
......
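The vqmovn_s8 expectation can be derived by hand: the u16 lanes 0xff84..0xff81 produced by the earlier vqmovn.u16 reinterpret as int16_t -124..-127, all within int8_t range, so nothing saturates and the bytes 0x84..0x81 pack into 0x81828384 in both words. A quick check (assuming the usual two's-complement reinterpretation):

  #include <cstdint>
  int main() {
    uint16_t lanes[4] = {0xff84, 0xff83, 0xff82, 0xff81};
    uint32_t word = 0;
    for (int i = 0; i < 4; i++) {
      int16_t s = static_cast<int16_t>(lanes[i]);  // -124, -125, -126, -127
      int8_t n = static_cast<int8_t>(s);           // in range: no saturation
      word |= static_cast<uint32_t>(static_cast<uint8_t>(n)) << (8 * i);
    }
    return word == 0x81828384u ? 0 : 1;
  }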
......@@ -933,10 +933,14 @@ TEST(Neon) {
"f421420f vld1.8 {d4, d5, d6, d7}, [r1]");
COMPARE(vst1(Neon16, NeonListOperand(d17, 4), NeonMemOperand(r9)),
"f449124f vst1.16 {d17, d18, d19, d20}, [r9]");
COMPARE(vmovl(NeonU8, q3, d1),
"f3886a11 vmovl.u8 q3, d1");
COMPARE(vmovl(NeonU8, q4, d2),
"f3888a12 vmovl.u8 q4, d2");
COMPARE(vmovl(NeonU8, q3, d1), "f3886a11 vmovl.u8 q3, d1");
COMPARE(vmovl(NeonU8, q4, d2), "f3888a12 vmovl.u8 q4, d2");
COMPARE(vmovl(NeonS16, q4, d2), "f2908a12 vmovl.s16 q4, d2");
COMPARE(vmovl(NeonU32, q4, d2), "f3a08a12 vmovl.u32 q4, d2");
COMPARE(vqmovn(NeonU8, d16, q8), "f3f202e0 vqmovn.u16 d16, q8");
COMPARE(vqmovn(NeonS16, d16, q8), "f3f602a0 vqmovn.s32 d16, q8");
COMPARE(vqmovn(NeonU32, d2, q4), "f3ba22c8 vqmovn.u64 d2, q4");
COMPARE(vmov(NeonS8, d0, 0, r0), "ee400b10 vmov.8 d0[0], r0");
COMPARE(vmov(NeonU8, d1, 1, r1), "ee411b30 vmov.8 d1[1], r1");
......