Commit 72f449d2 authored by Ng Zhi An, committed by Commit Bot

[ia32][x64] Use *ps instruction on SSE

On SSE:

- use movaps (instead of movapd, movdqa)
- use movups (instead of movupd, movdqu)
- use andps (instead of andpd, pand)
- use andnps (instead of andnpd, pandn)
- use orps (instead of orpd, por)
- use xorps (instead of xorpd, pxor)

These *ps instructions are 1 byte shorter than the *pd or p*
instructions, and on systems without AVX, most SSE-level processors
don't differentiate between the integer and floating-point domains,
so there is no penalty for using the floating-point forms.
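
As an illustration (encodings per the standard SSE reference tables; not part of this CL), the saved byte comes from the mandatory 66/F3 prefix on the pd/integer forms:

// andps  xmm, xmm  ->    0F 54 /r  (3 bytes)
// andpd  xmm, xmm  -> 66 0F 54 /r  (4 bytes)
// pand   xmm, xmm  -> 66 0F DB /r  (4 bytes)
// movaps xmm, xmm  ->    0F 28 /r  (3 bytes)
// movdqa xmm, xmm  -> 66 0F 6F /r  (4 bytes)
// movdqu xmm, xmm  -> F3 0F 6F /r  (4 bytes)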

On AVX systems, we use the instructions appropriate for the domain we
are operating in.
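
A minimal sketch of the resulting dispatch pattern, assuming a TurboAssembler-style macro (the helper name S128AndSketch is hypothetical; the real macros are in the ia32/x64 macro-assembler files changed below):

void TurboAssembler::S128AndSketch(XMMRegister dst, XMMRegister src1,
                                   XMMRegister src2) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope avx_scope(this, AVX);
    // AVX: keep the domain-correct, three-operand form.
    vpand(dst, src1, src2);
  } else {
    // SSE: movaps/andps are one byte shorter and carry no domain penalty here.
    if (dst != src1) movaps(dst, src1);
    andps(dst, src2);
  }
}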

Related to b/175399220.

Bug: v8:11384
Change-Id: I332a2e741652f6c063ea1b84b0d9d41226d641ea
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2773787
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#73553}
parent 06b0f23d
......@@ -635,7 +635,7 @@ void TurboAssembler::Pmulhrsw(XMMRegister dst, XMMRegister src1,
vpmulhrsw(dst, src1, src2);
} else {
if (dst != src1) {
movdqu(dst, src1);
movaps(dst, src1);
}
CpuFeatureScope sse_scope(this, SSSE3);
pmulhrsw(dst, src2);
......@@ -689,7 +689,7 @@ void TurboAssembler::I32x4ExtMul(XMMRegister dst, XMMRegister src1,
low ? vpunpcklwd(dst, scratch, dst) : vpunpckhwd(dst, scratch, dst);
} else {
DCHECK_EQ(dst, src1);
movdqu(scratch, src1);
movaps(scratch, src1);
pmullw(dst, src2);
is_signed ? pmulhw(scratch, src2) : pmulhuw(scratch, src2);
low ? punpcklwd(dst, scratch) : punpckhwd(dst, scratch);
......@@ -1259,7 +1259,7 @@ void TurboAssembler::I32x4ExtAddPairwiseI16x8U(XMMRegister dst, XMMRegister src,
pcmpeqd(tmp, tmp);
psrld(tmp, byte{16});
// tmp =|0|b|0|d|0|f|0|h|
pand(tmp, src);
andps(tmp, src);
// dst = |0|a|0|c|0|e|0|g|
if (dst != src) {
movaps(dst, src);
......@@ -2275,7 +2275,7 @@ void TurboAssembler::Pinsrb(XMMRegister dst, XMMRegister src1, Operand src2,
if (CpuFeatures::IsSupported(SSE4_1)) {
CpuFeatureScope sse_scope(this, SSE4_1);
if (dst != src1) {
movdqu(dst, src1);
movaps(dst, src1);
}
pinsrb(dst, src2, imm8);
return;
......@@ -2291,7 +2291,7 @@ void TurboAssembler::Pinsrd(XMMRegister dst, XMMRegister src1, Operand src2,
return;
}
if (dst != src1) {
movdqu(dst, src1);
movaps(dst, src1);
}
if (CpuFeatures::IsSupported(SSE4_1)) {
CpuFeatureScope sse_scope(this, SSE4_1);
......@@ -2333,7 +2333,7 @@ void TurboAssembler::Pinsrw(XMMRegister dst, XMMRegister src1, Operand src2,
return;
} else {
if (dst != src1) {
movdqu(dst, src1);
movaps(dst, src1);
}
pinsrw(dst, src2, imm8);
return;
......
......@@ -1939,7 +1939,7 @@ void PinsrHelper(Assembler* assm, AvxFn<Src> avx, NoAvxFn<Src> noavx,
}
if (dst != src1) {
assm->movdqu(dst, src1);
assm->movaps(dst, src1);
}
if (feature.has_value()) {
DCHECK(CpuFeatures::IsSupported(*feature));
......@@ -2111,7 +2111,7 @@ void TurboAssembler::Pshufb(XMMRegister dst, XMMRegister src,
// Make sure these are different so that we won't overwrite mask.
DCHECK_NE(dst, mask);
if (dst != src) {
movapd(dst, src);
movaps(dst, src);
}
CpuFeatureScope sse_scope(this, SSSE3);
pshufb(dst, mask);
......@@ -2296,7 +2296,7 @@ void TurboAssembler::I32x4ExtMul(XMMRegister dst, XMMRegister src1,
: vpunpckhwd(dst, kScratchDoubleReg, dst);
} else {
DCHECK_EQ(dst, src1);
movdqu(kScratchDoubleReg, src1);
movaps(kScratchDoubleReg, src1);
pmullw(dst, src2);
is_signed ? pmulhw(kScratchDoubleReg, src2)
: pmulhuw(kScratchDoubleReg, src2);
......@@ -2689,7 +2689,7 @@ void TurboAssembler::I32x4ExtAddPairwiseI16x8U(XMMRegister dst,
pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
psrld(kScratchDoubleReg, byte{16});
// kScratchDoubleReg =|0|b|0|d|0|f|0|h|
pand(kScratchDoubleReg, src);
andps(kScratchDoubleReg, src);
// dst = |0|a|0|c|0|e|0|g|
if (dst != src) {
movaps(dst, src);
......
......@@ -1488,7 +1488,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
XMMRegister tmp = i.TempSimd128Register(0);
__ pcmpeqd(tmp, tmp);
__ psrlq(tmp, 1);
__ andpd(i.OutputDoubleRegister(), tmp);
__ andps(i.OutputDoubleRegister(), tmp);
break;
}
case kSSEFloat64Neg: {
......@@ -1496,7 +1496,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
XMMRegister tmp = i.TempSimd128Register(0);
__ pcmpeqd(tmp, tmp);
__ psllq(tmp, 63);
__ xorpd(i.OutputDoubleRegister(), tmp);
__ xorps(i.OutputDoubleRegister(), tmp);
break;
}
case kSSEFloat64Sqrt:
......@@ -1657,7 +1657,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
break;
}
case kSSEFloat64SilenceNaN:
__ xorpd(kScratchDoubleReg, kScratchDoubleReg);
__ xorps(kScratchDoubleReg, kScratchDoubleReg);
__ subsd(i.InputDoubleRegister(0), kScratchDoubleReg);
break;
case kIA32Movsxbl:
......@@ -2687,7 +2687,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ pcmpeqd(i.OutputSimd128Register(), i.InputOperand(1));
__ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
__ pxor(i.OutputSimd128Register(), kScratchDoubleReg);
__ xorps(i.OutputSimd128Register(), kScratchDoubleReg);
break;
}
case kAVXI32x4Ne: {
......@@ -2733,7 +2733,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
XMMRegister dst = i.OutputSimd128Register();
XMMRegister tmp = i.TempSimd128Register(0);
// NAN->0, negative->0
__ pxor(kScratchDoubleReg, kScratchDoubleReg);
__ xorps(kScratchDoubleReg, kScratchDoubleReg);
__ maxps(dst, kScratchDoubleReg);
// scratch: float representation of max_signed
__ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
......@@ -2746,8 +2746,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ subps(tmp, kScratchDoubleReg);
__ cmpleps(kScratchDoubleReg, tmp);
__ cvttps2dq(tmp, tmp);
__ pxor(tmp, kScratchDoubleReg);
__ pxor(kScratchDoubleReg, kScratchDoubleReg);
__ xorps(tmp, kScratchDoubleReg);
__ xorps(kScratchDoubleReg, kScratchDoubleReg);
__ pmaxsd(tmp, kScratchDoubleReg);
// convert. Overflow lanes above max_signed will be 0x80000000
__ cvttps2dq(dst, dst);
......@@ -2827,7 +2827,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ pmaxud(dst, src);
__ pcmpeqd(dst, src);
__ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
__ pxor(dst, kScratchDoubleReg);
__ xorps(dst, kScratchDoubleReg);
break;
}
case kAVXI32x4GtU: {
......@@ -3016,7 +3016,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ pcmpeqw(i.OutputSimd128Register(), i.InputOperand(1));
__ pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
__ pxor(i.OutputSimd128Register(), kScratchDoubleReg);
__ xorps(i.OutputSimd128Register(), kScratchDoubleReg);
break;
}
case kAVXI16x8Ne: {
......@@ -3135,7 +3135,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ pmaxuw(dst, src);
__ pcmpeqw(dst, src);
__ pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
__ pxor(dst, kScratchDoubleReg);
__ xorps(dst, kScratchDoubleReg);
break;
}
case kAVXI16x8GtU: {
......@@ -3386,7 +3386,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ pcmpeqb(i.OutputSimd128Register(), i.InputOperand(1));
__ pcmpeqb(kScratchDoubleReg, kScratchDoubleReg);
__ pxor(i.OutputSimd128Register(), kScratchDoubleReg);
__ xorps(i.OutputSimd128Register(), kScratchDoubleReg);
break;
}
case kAVXI8x16Ne: {
......@@ -3493,7 +3493,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ pmaxub(dst, src);
__ pcmpeqb(dst, src);
__ pcmpeqb(kScratchDoubleReg, kScratchDoubleReg);
__ pxor(dst, kScratchDoubleReg);
__ xorps(dst, kScratchDoubleReg);
break;
}
case kAVXI8x16GtU: {
......@@ -3577,7 +3577,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kSSES128And: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ pand(i.OutputSimd128Register(), i.InputOperand(1));
__ andps(i.OutputSimd128Register(), i.InputOperand(1));
break;
}
case kAVXS128And: {
......@@ -3588,7 +3588,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kSSES128Or: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ por(i.OutputSimd128Register(), i.InputOperand(1));
__ orps(i.OutputSimd128Register(), i.InputOperand(1));
break;
}
case kAVXS128Or: {
......@@ -3599,7 +3599,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
}
case kSSES128Xor: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ pxor(i.OutputSimd128Register(), i.InputOperand(1));
__ xorps(i.OutputSimd128Register(), i.InputOperand(1));
break;
}
case kAVXS128Xor: {
......@@ -3864,7 +3864,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
XMMRegister dst = i.OutputSimd128Register();
XMMRegister src2 = dst;
DCHECK_EQ(dst, i.InputSimd128Register(0));
__ pxor(kScratchDoubleReg, kScratchDoubleReg);
__ xorps(kScratchDoubleReg, kScratchDoubleReg);
if (instr->InputCount() == 2) {
__ pblendw(kScratchDoubleReg, i.InputOperand(1), 0x55);
src2 = kScratchDoubleReg;
......@@ -3953,7 +3953,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ psllw(kScratchDoubleReg, 8);
}
__ psrlw(dst, 8);
__ por(dst, kScratchDoubleReg);
__ orps(dst, kScratchDoubleReg);
break;
}
case kAVXS8x16TransposeLow: {
......@@ -3983,7 +3983,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ psrlw(kScratchDoubleReg, 8);
}
__ psllw(kScratchDoubleReg, 8);
__ por(dst, kScratchDoubleReg);
__ orps(dst, kScratchDoubleReg);
break;
}
case kAVXS8x16TransposeHigh: {
......@@ -4016,7 +4016,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
__ movaps(kScratchDoubleReg, dst);
__ psrlw(kScratchDoubleReg, 8);
__ psllw(dst, 8);
__ por(dst, kScratchDoubleReg);
__ orps(dst, kScratchDoubleReg);
break;
}
case kAVXS8x2Reverse:
......
......@@ -3148,7 +3148,7 @@ void LiftoffAssembler::emit_i64x2_gt_s(LiftoffRegister dst, LiftoffRegister lhs,
} else if (CpuFeatures::IsSupported(SSE4_2)) {
// 2. SSE4_2, dst == lhs.
if (dst != lhs) {
movdqa(dst.fp(), lhs.fp());
movaps(dst.fp(), lhs.fp());
}
I64x2GtS(dst.fp(), dst.fp(), rhs.fp(), liftoff::kScratchDoubleReg);
} else {
......@@ -3177,7 +3177,7 @@ void LiftoffAssembler::emit_i64x2_ge_s(LiftoffRegister dst, LiftoffRegister lhs,
LiftoffRegList::ForRegs(lhs));
// macro-assembler uses kScratchDoubleReg, so don't use it.
I64x2GeS(tmp.fp(), lhs.fp(), rhs.fp(), liftoff::kScratchDoubleReg);
movdqa(dst.fp(), tmp.fp());
movaps(dst.fp(), tmp.fp());
} else {
I64x2GeS(dst.fp(), lhs.fp(), rhs.fp(), liftoff::kScratchDoubleReg);
}
......@@ -3293,11 +3293,11 @@ void LiftoffAssembler::emit_s128_select(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2,
LiftoffRegister mask) {
// Ensure that we don't overwrite any inputs with the movdqu below.
// Ensure that we don't overwrite any inputs with the movaps below.
DCHECK_NE(dst, src1);
DCHECK_NE(dst, src2);
if (!CpuFeatures::IsSupported(AVX) && dst != mask) {
movdqu(dst.fp(), mask.fp());
movaps(dst.fp(), mask.fp());
S128Select(dst.fp(), dst.fp(), src1.fp(), src2.fp(),
liftoff::kScratchDoubleReg);
} else {
......@@ -3353,7 +3353,7 @@ void LiftoffAssembler::emit_i8x16_shl(LiftoffRegister dst, LiftoffRegister lhs,
vpand(dst.fp(), lhs.fp(), liftoff::kScratchDoubleReg);
} else {
if (dst.fp() != lhs.fp()) movaps(dst.fp(), lhs.fp());
pand(dst.fp(), liftoff::kScratchDoubleReg);
andps(dst.fp(), liftoff::kScratchDoubleReg);
}
sub(tmp.gp(), Immediate(8));
Movd(tmp_simd.fp(), tmp.gp());
......@@ -4368,7 +4368,7 @@ void LiftoffAssembler::emit_i32x4_sconvert_f32x4(LiftoffRegister dst,
movaps(liftoff::kScratchDoubleReg, src.fp());
cmpeqps(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg);
if (dst.fp() != src.fp()) movaps(dst.fp(), src.fp());
pand(dst.fp(), liftoff::kScratchDoubleReg);
andps(dst.fp(), liftoff::kScratchDoubleReg);
}
// Set top bit if >= 0 (but not -0.0!).
Pxor(liftoff::kScratchDoubleReg, dst.fp());
......
......@@ -2735,7 +2735,7 @@ void LiftoffAssembler::emit_i64x2_gt_s(LiftoffRegister dst, LiftoffRegister lhs,
} else if (CpuFeatures::IsSupported(SSE4_2)) {
// 2. SSE4_2, dst == lhs.
if (dst != lhs) {
movdqa(dst.fp(), lhs.fp());
movaps(dst.fp(), lhs.fp());
}
I64x2GtS(dst.fp(), dst.fp(), rhs.fp());
} else {
......@@ -2761,7 +2761,7 @@ void LiftoffAssembler::emit_i64x2_ge_s(LiftoffRegister dst, LiftoffRegister lhs,
if (dst == lhs) {
// macro-assembler uses kScratchDoubleReg, so don't use it.
I64x2GeS(liftoff::kScratchDoubleReg2, lhs.fp(), rhs.fp());
movdqa(dst.fp(), liftoff::kScratchDoubleReg2);
movaps(dst.fp(), liftoff::kScratchDoubleReg2);
} else {
I64x2GeS(dst.fp(), lhs.fp(), rhs.fp());
}
......@@ -2870,11 +2870,11 @@ void LiftoffAssembler::emit_s128_select(LiftoffRegister dst,
LiftoffRegister src1,
LiftoffRegister src2,
LiftoffRegister mask) {
// Ensure that we don't overwrite any inputs with the movdqu below.
// Ensure that we don't overwrite any inputs with the movaps below.
DCHECK_NE(dst, src1);
DCHECK_NE(dst, src2);
if (!CpuFeatures::IsSupported(AVX) && dst != mask) {
movdqu(dst.fp(), mask.fp());
movaps(dst.fp(), mask.fp());
S128Select(dst.fp(), dst.fp(), src1.fp(), src2.fp());
} else {
S128Select(dst.fp(), mask.fp(), src1.fp(), src2.fp());
......@@ -2926,7 +2926,7 @@ void LiftoffAssembler::emit_i8x16_shl(LiftoffRegister dst, LiftoffRegister lhs,
vpand(dst.fp(), lhs.fp(), kScratchDoubleReg);
} else {
if (dst.fp() != lhs.fp()) movaps(dst.fp(), lhs.fp());
pand(dst.fp(), kScratchDoubleReg);
andps(dst.fp(), kScratchDoubleReg);
}
subq(kScratchRegister, Immediate(8));
Movq(tmp_simd.fp(), kScratchRegister);
......@@ -3427,7 +3427,7 @@ void LiftoffAssembler::emit_i64x2_neg(LiftoffRegister dst,
vpsubq(dst.fp(), reg, src.fp());
} else {
psubq(reg, src.fp());
if (dst.fp() != reg) movapd(dst.fp(), reg);
if (dst.fp() != reg) movaps(dst.fp(), reg);
}
}
......@@ -3813,13 +3813,13 @@ void LiftoffAssembler::emit_f64x2_min(LiftoffRegister dst, LiftoffRegister lhs,
vminpd(dst.fp(), rhs.fp(), lhs.fp());
} else if (dst.fp() == lhs.fp() || dst.fp() == rhs.fp()) {
XMMRegister src = dst.fp() == lhs.fp() ? rhs.fp() : lhs.fp();
movapd(kScratchDoubleReg, src);
movaps(kScratchDoubleReg, src);
minpd(kScratchDoubleReg, dst.fp());
minpd(dst.fp(), src);
} else {
movapd(kScratchDoubleReg, lhs.fp());
movaps(kScratchDoubleReg, lhs.fp());
minpd(kScratchDoubleReg, rhs.fp());
movapd(dst.fp(), rhs.fp());
movaps(dst.fp(), rhs.fp());
minpd(dst.fp(), lhs.fp());
}
// propagate -0's and NaNs, which may be non-canonical.
......@@ -3841,13 +3841,13 @@ void LiftoffAssembler::emit_f64x2_max(LiftoffRegister dst, LiftoffRegister lhs,
vmaxpd(dst.fp(), rhs.fp(), lhs.fp());
} else if (dst.fp() == lhs.fp() || dst.fp() == rhs.fp()) {
XMMRegister src = dst.fp() == lhs.fp() ? rhs.fp() : lhs.fp();
movapd(kScratchDoubleReg, src);
movaps(kScratchDoubleReg, src);
maxpd(kScratchDoubleReg, dst.fp());
maxpd(dst.fp(), src);
} else {
movapd(kScratchDoubleReg, lhs.fp());
movaps(kScratchDoubleReg, lhs.fp());
maxpd(kScratchDoubleReg, rhs.fp());
movapd(dst.fp(), rhs.fp());
movaps(dst.fp(), rhs.fp());
maxpd(dst.fp(), lhs.fp());
}
// Find discrepancies.
......@@ -3902,7 +3902,7 @@ void LiftoffAssembler::emit_i32x4_sconvert_f32x4(LiftoffRegister dst,
movaps(kScratchDoubleReg, src.fp());
cmpeqps(kScratchDoubleReg, kScratchDoubleReg);
if (dst.fp() != src.fp()) movaps(dst.fp(), src.fp());
pand(dst.fp(), kScratchDoubleReg);
andps(dst.fp(), kScratchDoubleReg);
}
// Set top bit if >= 0 (but not -0.0!).
Pxor(kScratchDoubleReg, dst.fp());
......