Commit b7df78f3 authored by bbudge, committed by Commit bot

[ARM] Add Neon saturating add and subtract instructions.

- Adds vqadd.s/u, vqsub.s/u for all integer lane sizes.
- Refactors disassembler and simulator, using switches instead
of long if-else chains.

LOG=N
BUG=v8:4124

Review-Url: https://codereview.chromium.org/2649323012
Cr-Commit-Position: refs/heads/master@{#42865}
parent 5a02d3e8
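For context, vqadd and vqsub clamp each lane's result to the representable range of the lane type instead of wrapping. A minimal sketch of the per-lane semantics for signed 8-bit lanes (illustrative C++, not part of this change):

#include <algorithm>
#include <cstdint>

// Saturating signed 8-bit add/subtract, i.e. what vqadd.s8 / vqsub.s8
// compute for each of the 16 lanes of a Q register.
int8_t SaturatingAddS8(int8_t a, int8_t b) {
  int32_t sum = int32_t{a} + int32_t{b};
  return static_cast<int8_t>(std::min<int32_t>(127, std::max<int32_t>(-128, sum)));
}

int8_t SaturatingSubS8(int8_t a, int8_t b) {
  int32_t diff = int32_t{a} - int32_t{b};
  return static_cast<int8_t>(std::min<int32_t>(127, std::max<int32_t>(-128, diff)));
}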
@@ -4272,7 +4272,19 @@ static Instr EncodeNeonBinOp(FPBinOp op, QwNeonRegister dst,
          vm | op_encoding;
 }
 
-enum IntegerBinOp { VADD, VSUB, VMUL, VMIN, VMAX, VTST, VCEQ, VCGE, VCGT };
+enum IntegerBinOp {
+  VADD,
+  VQADD,
+  VSUB,
+  VQSUB,
+  VMUL,
+  VMIN,
+  VMAX,
+  VTST,
+  VCEQ,
+  VCGE,
+  VCGT
+};
 
 static Instr EncodeNeonBinOp(IntegerBinOp op, NeonDataType dt,
                              const QwNeonRegister dst,
@@ -4283,9 +4295,15 @@ static Instr EncodeNeonBinOp(IntegerBinOp op, NeonDataType dt,
     case VADD:
      op_encoding = 0x8 * B8;
      break;
+    case VQADD:
+      op_encoding = B4;
+      break;
    case VSUB:
      op_encoding = B24 | 0x8 * B8;
      break;
+    case VQSUB:
+      op_encoding = 0x2 * B8 | B4;
+      break;
    case VMUL:
      op_encoding = 0x9 * B8 | B4;
      break;
@@ -4348,6 +4366,14 @@ void Assembler::vadd(NeonSize size, QwNeonRegister dst, QwNeonRegister src1,
   emit(EncodeNeonBinOp(VADD, size, dst, src1, src2));
 }
 
+void Assembler::vqadd(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src1,
+                      QwNeonRegister src2) {
+  DCHECK(IsEnabled(NEON));
+  // Qd = vqadd(Qn, Qm) SIMD integer saturating addition.
+  // Instruction details available in ARM DDI 0406C.b, A8-996.
+  emit(EncodeNeonBinOp(VQADD, dt, dst, src1, src2));
+}
+
 void Assembler::vsub(QwNeonRegister dst, QwNeonRegister src1,
                      QwNeonRegister src2) {
   DCHECK(IsEnabled(NEON));
@@ -4364,6 +4390,14 @@ void Assembler::vsub(NeonSize size, QwNeonRegister dst, QwNeonRegister src1,
   emit(EncodeNeonBinOp(VSUB, size, dst, src1, src2));
 }
 
+void Assembler::vqsub(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src1,
+                      QwNeonRegister src2) {
+  DCHECK(IsEnabled(NEON));
+  // Qd = vqsub(Qn, Qm) SIMD integer saturating subtraction.
+  // Instruction details available in ARM DDI 0406C.b, A8-1020.
+  emit(EncodeNeonBinOp(VQSUB, dt, dst, src1, src2));
+}
+
 void Assembler::vmul(QwNeonRegister dst, QwNeonRegister src1,
                      QwNeonRegister src2) {
   DCHECK(IsEnabled(NEON));
...
@@ -1374,9 +1374,13 @@ class Assembler : public AssemblerBase {
   void vadd(QwNeonRegister dst, QwNeonRegister src1, QwNeonRegister src2);
   void vadd(NeonSize size, QwNeonRegister dst, QwNeonRegister src1,
             QwNeonRegister src2);
+  void vqadd(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src1,
+             QwNeonRegister src2);
   void vsub(QwNeonRegister dst, QwNeonRegister src1, QwNeonRegister src2);
   void vsub(NeonSize size, QwNeonRegister dst, QwNeonRegister src1,
             QwNeonRegister src2);
+  void vqsub(NeonDataType dt, QwNeonRegister dst, QwNeonRegister src1,
+             QwNeonRegister src2);
   void vmul(QwNeonRegister dst, QwNeonRegister src1,
             QwNeonRegister src2);
   void vmul(NeonSize size, QwNeonRegister dst, QwNeonRegister src1,
...
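As with the existing vadd/vsub overloads, the new methods take a NeonDataType so the lane size and signedness select the encoding. A minimal usage sketch (assuming a live Assembler and the usual __ macro, mirroring the test code further below; illustrative only):

  __ vqadd(NeonS8, q0, q1, q2);   // q0 = saturating q1 + q2, signed 8-bit lanes
  __ vqsub(NeonU16, q3, q4, q5);  // q3 = saturating q4 - q5, unsigned 16-bit lanes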
(Two additional diffs are collapsed; per the commit message, these cover the disassembler and simulator refactoring.)
@@ -1299,7 +1299,9 @@ TEST(15) {
   uint32_t vmin_s8[4], vmin_u16[4], vmin_s32[4];
   uint32_t vmax_s8[4], vmax_u16[4], vmax_s32[4];
   uint32_t vadd8[4], vadd16[4], vadd32[4];
+  uint32_t vqadd_s8[4], vqadd_u16[4], vqadd_s32[4];
   uint32_t vsub8[4], vsub16[4], vsub32[4];
+  uint32_t vqsub_u8[4], vqsub_s16[4], vqsub_u32[4];
   uint32_t vmul8[4], vmul16[4], vmul32[4];
   uint32_t vshl8[4], vshl16[4], vshl32[5];
   uint32_t vshr_s8[4], vshr_u16[4], vshr_s32[5];
@@ -1633,6 +1635,28 @@ TEST(15) {
   __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vadd32))));
   __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
 
+  // vqadd.
+  __ mov(r4, Operand(0x81));
+  __ vdup(Neon8, q0, r4);
+  __ mov(r4, Operand(0x82));
+  __ vdup(Neon8, q1, r4);
+  __ vqadd(NeonS8, q1, q1, q0);
+  __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vqadd_s8))));
+  __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+  __ mov(r4, Operand(0x8000));
+  __ vdup(Neon16, q0, r4);
+  __ vdup(Neon16, q1, r4);
+  __ vqadd(NeonU16, q1, q1, q0);
+  __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vqadd_u16))));
+  __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+  __ mov(r4, Operand(0x80000001));
+  __ vdup(Neon32, q0, r4);
+  __ mov(r4, Operand(0x80000002));
+  __ vdup(Neon32, q1, r4);
+  __ vqadd(NeonS32, q1, q1, q0);
+  __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vqadd_s32))));
+  __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+
   // vsub (integer).
   __ mov(r4, Operand(0x01));
   __ vdup(Neon8, q0, r4);
@@ -1656,6 +1680,29 @@ TEST(15) {
   __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vsub32))));
   __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
 
+  // vqsub.
+  __ mov(r4, Operand(0x7f));
+  __ vdup(Neon8, q0, r4);
+  __ mov(r4, Operand(0x3f));
+  __ vdup(Neon8, q1, r4);
+  __ vqsub(NeonU8, q1, q1, q0);
+  __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vqsub_u8))));
+  __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+  __ mov(r4, Operand(0x8000));
+  __ vdup(Neon16, q0, r4);
+  __ mov(r4, Operand(0x7fff));
+  __ vdup(Neon16, q1, r4);
+  __ vqsub(NeonS16, q1, q1, q0);
+  __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vqsub_s16))));
+  __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+  __ mov(r4, Operand(0x80000001));
+  __ vdup(Neon32, q0, r4);
+  __ mov(r4, Operand(0x80000000));
+  __ vdup(Neon32, q1, r4);
+  __ vqsub(NeonU32, q1, q1, q0);
+  __ add(r4, r0, Operand(static_cast<int32_t>(offsetof(T, vqsub_u32))));
+  __ vst1(Neon8, NeonListOperand(q1), NeonMemOperand(r4));
+
   // vmul (integer).
   __ mov(r4, Operand(0x02));
   __ vdup(Neon8, q0, r4);
@@ -1948,6 +1995,12 @@ TEST(15) {
     CHECK_EQ_SPLAT(vadd8, 0x03030303u);
     CHECK_EQ_SPLAT(vadd16, 0x00030003u);
     CHECK_EQ_SPLAT(vadd32, 0x00000003u);
+    CHECK_EQ_SPLAT(vqadd_s8, 0x80808080u);
+    CHECK_EQ_SPLAT(vqadd_u16, 0xffffffffu);
+    CHECK_EQ_SPLAT(vqadd_s32, 0x80000000u);
+    CHECK_EQ_SPLAT(vqsub_u8, 0x00000000u);
+    CHECK_EQ_SPLAT(vqsub_s16, 0x7fff7fffu);
+    CHECK_EQ_SPLAT(vqsub_u32, 0x00000000u);
     CHECK_EQ_SPLAT(vsub8, 0xfefefefeu);
     CHECK_EQ_SPLAT(vsub16, 0xfffefffeu);
     CHECK_EQ_SPLAT(vsub32, 0xfffffffeu);
...
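The expected splat values above follow from the saturating semantics and the test inputs; worked out here for illustration (not part of the change):

#include <algorithm>
#include <cassert>

int main() {
  // vqadd.s8: 0x81 (-127) + 0x82 (-126) = -253, saturates to -128 (0x80 per byte).
  assert(std::max(-128, std::min(127, -127 + -126)) == -128);
  // vqadd.u16: 0x8000 + 0x8000 = 0x10000, saturates to 0xffff.
  assert(std::min(0xffff, 0x8000 + 0x8000) == 0xffff);
  // vqsub.u8: 0x3f - 0x7f would go negative, saturates to 0x00.
  assert(std::max(0, 0x3f - 0x7f) == 0);
  // vqsub.s16: 0x7fff - (-0x8000) = 65535, saturates to 0x7fff.
  assert(std::min(0x7fff, 0x7fff + 0x8000) == 0x7fff);
  return 0;
}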
@@ -1047,6 +1047,12 @@ TEST(Neon) {
             "f2142860 vadd.i16 q1, q2, q8");
     COMPARE(vadd(Neon32, q15, q0, q8),
             "f260e860 vadd.i32 q15, q0, q8");
+    COMPARE(vqadd(NeonU8, q0, q1, q2),
+            "f3020054 vqadd.u8 q0, q1, q2");
+    COMPARE(vqadd(NeonS16, q1, q2, q8),
+            "f2142070 vqadd.s16 q1, q2, q8");
+    COMPARE(vqadd(NeonU32, q15, q0, q8),
+            "f360e070 vqadd.u32 q15, q0, q8");
     COMPARE(vsub(q15, q0, q8),
             "f260ed60 vsub.f32 q15, q0, q8");
     COMPARE(vsub(Neon8, q0, q1, q2),
@@ -1055,6 +1061,12 @@ TEST(Neon) {
             "f3142860 vsub.i16 q1, q2, q8");
     COMPARE(vsub(Neon32, q15, q0, q8),
             "f360e860 vsub.i32 q15, q0, q8");
+    COMPARE(vqsub(NeonU8, q0, q1, q2),
+            "f3020254 vqsub.u8 q0, q1, q2");
+    COMPARE(vqsub(NeonS16, q1, q2, q8),
+            "f2142270 vqsub.s16 q1, q2, q8");
+    COMPARE(vqsub(NeonU32, q15, q0, q8),
+            "f360e270 vqsub.u32 q15, q0, q8");
     COMPARE(vmul(q0, q1, q2),
             "f3020d54 vmul.f32 q0, q1, q2");
     COMPARE(vmul(Neon8, q0, q1, q2),
...