Commit 72f449d2 authored by Ng Zhi An, committed by Commit Bot

[ia32][x64] Use *ps instruction on SSE

On SSE:

- use movaps (instead of movapd, movdqa)
- use movups (instead of movupd, movdqu)
- use andps (instead of andpd, pand)
- use andnps (instead of andnpd, pandn)
- use orps (instead of orpd, por)
- use xorps (instead of xorpd, pxor)

These *ps instructions are 1 byte shorter than the *pd or p*
instructions, and on systems without AVX there is no penalty for using
them: most SSE-level processors don't differentiate between the integer
and floating-point domains.
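
For example, andps xmm0, xmm1 encodes as 0F 54 C1 (3 bytes), while
andpd is 66 0F 54 C1 and pand is 66 0F DB C1 (4 bytes each, per the
Intel SDM); the 66 operand-size prefix accounts for the extra byte.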

For AVX systems, we use the instructions appropriate for the domain we
are operating in.
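
A minimal sketch of the dispatch shape this CL relies on (the helper
name and exact signature here are illustrative, not actual V8 code):

  void MacroAssembler::S128And(XMMRegister dst, XMMRegister src) {
    if (CpuFeatures::IsSupported(AVX)) {
      CpuFeatureScope avx_scope(this, AVX);
      vpand(dst, dst, src);  // AVX: keep the integer-domain instruction.
    } else {
      andps(dst, src);  // SSE: 1 byte shorter, no domain penalty.
    }
  }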

Related to b/175399220.

Bug: v8:11384
Change-Id: I332a2e741652f6c063ea1b84b0d9d41226d641ea
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2773787
Reviewed-by: Bill Budge <bbudge@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#73553}
parent 06b0f23d
@@ -635,7 +635,7 @@ void TurboAssembler::Pmulhrsw(XMMRegister dst, XMMRegister src1,
     vpmulhrsw(dst, src1, src2);
   } else {
     if (dst != src1) {
-      movdqu(dst, src1);
+      movaps(dst, src1);
     }
     CpuFeatureScope sse_scope(this, SSSE3);
     pmulhrsw(dst, src2);
@@ -689,7 +689,7 @@ void TurboAssembler::I32x4ExtMul(XMMRegister dst, XMMRegister src1,
     low ? vpunpcklwd(dst, scratch, dst) : vpunpckhwd(dst, scratch, dst);
   } else {
     DCHECK_EQ(dst, src1);
-    movdqu(scratch, src1);
+    movaps(scratch, src1);
     pmullw(dst, src2);
     is_signed ? pmulhw(scratch, src2) : pmulhuw(scratch, src2);
     low ? punpcklwd(dst, scratch) : punpckhwd(dst, scratch);
@@ -1259,7 +1259,7 @@ void TurboAssembler::I32x4ExtAddPairwiseI16x8U(XMMRegister dst, XMMRegister src,
   pcmpeqd(tmp, tmp);
   psrld(tmp, byte{16});
   // tmp =|0|b|0|d|0|f|0|h|
-  pand(tmp, src);
+  andps(tmp, src);
   // dst = |0|a|0|c|0|e|0|g|
   if (dst != src) {
     movaps(dst, src);
@@ -2275,7 +2275,7 @@ void TurboAssembler::Pinsrb(XMMRegister dst, XMMRegister src1, Operand src2,
   if (CpuFeatures::IsSupported(SSE4_1)) {
     CpuFeatureScope sse_scope(this, SSE4_1);
     if (dst != src1) {
-      movdqu(dst, src1);
+      movaps(dst, src1);
     }
     pinsrb(dst, src2, imm8);
     return;
@@ -2291,7 +2291,7 @@ void TurboAssembler::Pinsrd(XMMRegister dst, XMMRegister src1, Operand src2,
     return;
   }
   if (dst != src1) {
-    movdqu(dst, src1);
+    movaps(dst, src1);
   }
   if (CpuFeatures::IsSupported(SSE4_1)) {
     CpuFeatureScope sse_scope(this, SSE4_1);
@@ -2333,7 +2333,7 @@ void TurboAssembler::Pinsrw(XMMRegister dst, XMMRegister src1, Operand src2,
     return;
   } else {
     if (dst != src1) {
-      movdqu(dst, src1);
+      movaps(dst, src1);
     }
     pinsrw(dst, src2, imm8);
     return;
...
@@ -1939,7 +1939,7 @@ void PinsrHelper(Assembler* assm, AvxFn<Src> avx, NoAvxFn<Src> noavx,
   }
   if (dst != src1) {
-    assm->movdqu(dst, src1);
+    assm->movaps(dst, src1);
   }
   if (feature.has_value()) {
     DCHECK(CpuFeatures::IsSupported(*feature));
@@ -2111,7 +2111,7 @@ void TurboAssembler::Pshufb(XMMRegister dst, XMMRegister src,
   // Make sure these are different so that we won't overwrite mask.
   DCHECK_NE(dst, mask);
   if (dst != src) {
-    movapd(dst, src);
+    movaps(dst, src);
   }
   CpuFeatureScope sse_scope(this, SSSE3);
   pshufb(dst, mask);
@@ -2296,7 +2296,7 @@ void TurboAssembler::I32x4ExtMul(XMMRegister dst, XMMRegister src1,
         : vpunpckhwd(dst, kScratchDoubleReg, dst);
   } else {
     DCHECK_EQ(dst, src1);
-    movdqu(kScratchDoubleReg, src1);
+    movaps(kScratchDoubleReg, src1);
     pmullw(dst, src2);
     is_signed ? pmulhw(kScratchDoubleReg, src2)
               : pmulhuw(kScratchDoubleReg, src2);
@@ -2689,7 +2689,7 @@ void TurboAssembler::I32x4ExtAddPairwiseI16x8U(XMMRegister dst,
   pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
   psrld(kScratchDoubleReg, byte{16});
   // kScratchDoubleReg =|0|b|0|d|0|f|0|h|
-  pand(kScratchDoubleReg, src);
+  andps(kScratchDoubleReg, src);
   // dst = |0|a|0|c|0|e|0|g|
   if (dst != src) {
     movaps(dst, src);
...
@@ -1488,7 +1488,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       XMMRegister tmp = i.TempSimd128Register(0);
       __ pcmpeqd(tmp, tmp);
       __ psrlq(tmp, 1);
-      __ andpd(i.OutputDoubleRegister(), tmp);
+      __ andps(i.OutputDoubleRegister(), tmp);
       break;
     }
     case kSSEFloat64Neg: {
@@ -1496,7 +1496,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       XMMRegister tmp = i.TempSimd128Register(0);
      __ pcmpeqd(tmp, tmp);
      __ psllq(tmp, 63);
-      __ xorpd(i.OutputDoubleRegister(), tmp);
+      __ xorps(i.OutputDoubleRegister(), tmp);
       break;
     }
     case kSSEFloat64Sqrt:
@@ -1657,7 +1657,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       break;
     }
     case kSSEFloat64SilenceNaN:
-      __ xorpd(kScratchDoubleReg, kScratchDoubleReg);
+      __ xorps(kScratchDoubleReg, kScratchDoubleReg);
       __ subsd(i.InputDoubleRegister(0), kScratchDoubleReg);
       break;
     case kIA32Movsxbl:
@@ -2687,7 +2687,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ pcmpeqd(i.OutputSimd128Register(), i.InputOperand(1));
       __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
-      __ pxor(i.OutputSimd128Register(), kScratchDoubleReg);
+      __ xorps(i.OutputSimd128Register(), kScratchDoubleReg);
       break;
     }
     case kAVXI32x4Ne: {
@@ -2733,7 +2733,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister tmp = i.TempSimd128Register(0);
       // NAN->0, negative->0
-      __ pxor(kScratchDoubleReg, kScratchDoubleReg);
+      __ xorps(kScratchDoubleReg, kScratchDoubleReg);
       __ maxps(dst, kScratchDoubleReg);
       // scratch: float representation of max_signed
       __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
@@ -2746,8 +2746,8 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ subps(tmp, kScratchDoubleReg);
       __ cmpleps(kScratchDoubleReg, tmp);
       __ cvttps2dq(tmp, tmp);
-      __ pxor(tmp, kScratchDoubleReg);
-      __ pxor(kScratchDoubleReg, kScratchDoubleReg);
+      __ xorps(tmp, kScratchDoubleReg);
+      __ xorps(kScratchDoubleReg, kScratchDoubleReg);
       __ pmaxsd(tmp, kScratchDoubleReg);
       // convert. Overflow lanes above max_signed will be 0x80000000
       __ cvttps2dq(dst, dst);
@@ -2827,7 +2827,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ pmaxud(dst, src);
       __ pcmpeqd(dst, src);
       __ pcmpeqd(kScratchDoubleReg, kScratchDoubleReg);
-      __ pxor(dst, kScratchDoubleReg);
+      __ xorps(dst, kScratchDoubleReg);
       break;
     }
     case kAVXI32x4GtU: {
@@ -3016,7 +3016,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ pcmpeqw(i.OutputSimd128Register(), i.InputOperand(1));
       __ pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
-      __ pxor(i.OutputSimd128Register(), kScratchDoubleReg);
+      __ xorps(i.OutputSimd128Register(), kScratchDoubleReg);
       break;
     }
     case kAVXI16x8Ne: {
@@ -3135,7 +3135,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ pmaxuw(dst, src);
       __ pcmpeqw(dst, src);
       __ pcmpeqw(kScratchDoubleReg, kScratchDoubleReg);
-      __ pxor(dst, kScratchDoubleReg);
+      __ xorps(dst, kScratchDoubleReg);
       break;
     }
     case kAVXI16x8GtU: {
@@ -3386,7 +3386,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
       __ pcmpeqb(i.OutputSimd128Register(), i.InputOperand(1));
       __ pcmpeqb(kScratchDoubleReg, kScratchDoubleReg);
-      __ pxor(i.OutputSimd128Register(), kScratchDoubleReg);
+      __ xorps(i.OutputSimd128Register(), kScratchDoubleReg);
       break;
     }
     case kAVXI8x16Ne: {
@@ -3493,7 +3493,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ pmaxub(dst, src);
       __ pcmpeqb(dst, src);
       __ pcmpeqb(kScratchDoubleReg, kScratchDoubleReg);
-      __ pxor(dst, kScratchDoubleReg);
+      __ xorps(dst, kScratchDoubleReg);
       break;
     }
     case kAVXI8x16GtU: {
@@ -3577,7 +3577,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     }
     case kSSES128And: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
-      __ pand(i.OutputSimd128Register(), i.InputOperand(1));
+      __ andps(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXS128And: {
@@ -3588,7 +3588,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     }
     case kSSES128Or: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
-      __ por(i.OutputSimd128Register(), i.InputOperand(1));
+      __ orps(i.OutputSimd128Register(), i.InputOperand(1));
      break;
     }
     case kAVXS128Or: {
@@ -3599,7 +3599,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
     }
     case kSSES128Xor: {
       DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
-      __ pxor(i.OutputSimd128Register(), i.InputOperand(1));
+      __ xorps(i.OutputSimd128Register(), i.InputOperand(1));
       break;
     }
     case kAVXS128Xor: {
@@ -3864,7 +3864,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       XMMRegister dst = i.OutputSimd128Register();
       XMMRegister src2 = dst;
       DCHECK_EQ(dst, i.InputSimd128Register(0));
-      __ pxor(kScratchDoubleReg, kScratchDoubleReg);
+      __ xorps(kScratchDoubleReg, kScratchDoubleReg);
       if (instr->InputCount() == 2) {
         __ pblendw(kScratchDoubleReg, i.InputOperand(1), 0x55);
         src2 = kScratchDoubleReg;
@@ -3953,7 +3953,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
         __ psllw(kScratchDoubleReg, 8);
       }
       __ psrlw(dst, 8);
-      __ por(dst, kScratchDoubleReg);
+      __ orps(dst, kScratchDoubleReg);
       break;
     }
     case kAVXS8x16TransposeLow: {
@@ -3983,7 +3983,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
         __ psrlw(kScratchDoubleReg, 8);
       }
       __ psllw(kScratchDoubleReg, 8);
-      __ por(dst, kScratchDoubleReg);
+      __ orps(dst, kScratchDoubleReg);
       break;
     }
     case kAVXS8x16TransposeHigh: {
@@ -4016,7 +4016,7 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ movaps(kScratchDoubleReg, dst);
       __ psrlw(kScratchDoubleReg, 8);
       __ psllw(dst, 8);
-      __ por(dst, kScratchDoubleReg);
+      __ orps(dst, kScratchDoubleReg);
       break;
     }
     case kAVXS8x2Reverse:
...
@@ -3148,7 +3148,7 @@ void LiftoffAssembler::emit_i64x2_gt_s(LiftoffRegister dst, LiftoffRegister lhs,
   } else if (CpuFeatures::IsSupported(SSE4_2)) {
     // 2. SSE4_2, dst == lhs.
     if (dst != lhs) {
-      movdqa(dst.fp(), lhs.fp());
+      movaps(dst.fp(), lhs.fp());
     }
     I64x2GtS(dst.fp(), dst.fp(), rhs.fp(), liftoff::kScratchDoubleReg);
   } else {
@@ -3177,7 +3177,7 @@ void LiftoffAssembler::emit_i64x2_ge_s(LiftoffRegister dst, LiftoffRegister lhs,
                                  LiftoffRegList::ForRegs(lhs));
     // macro-assembler uses kScratchDoubleReg, so don't use it.
     I64x2GeS(tmp.fp(), lhs.fp(), rhs.fp(), liftoff::kScratchDoubleReg);
-    movdqa(dst.fp(), tmp.fp());
+    movaps(dst.fp(), tmp.fp());
   } else {
     I64x2GeS(dst.fp(), lhs.fp(), rhs.fp(), liftoff::kScratchDoubleReg);
   }
@@ -3293,11 +3293,11 @@ void LiftoffAssembler::emit_s128_select(LiftoffRegister dst,
                                         LiftoffRegister src1,
                                         LiftoffRegister src2,
                                         LiftoffRegister mask) {
-  // Ensure that we don't overwrite any inputs with the movdqu below.
+  // Ensure that we don't overwrite any inputs with the movaps below.
   DCHECK_NE(dst, src1);
   DCHECK_NE(dst, src2);
   if (!CpuFeatures::IsSupported(AVX) && dst != mask) {
-    movdqu(dst.fp(), mask.fp());
+    movaps(dst.fp(), mask.fp());
     S128Select(dst.fp(), dst.fp(), src1.fp(), src2.fp(),
               liftoff::kScratchDoubleReg);
   } else {
@@ -3353,7 +3353,7 @@ void LiftoffAssembler::emit_i8x16_shl(LiftoffRegister dst, LiftoffRegister lhs,
     vpand(dst.fp(), lhs.fp(), liftoff::kScratchDoubleReg);
   } else {
     if (dst.fp() != lhs.fp()) movaps(dst.fp(), lhs.fp());
-    pand(dst.fp(), liftoff::kScratchDoubleReg);
+    andps(dst.fp(), liftoff::kScratchDoubleReg);
   }
   sub(tmp.gp(), Immediate(8));
   Movd(tmp_simd.fp(), tmp.gp());
@@ -4368,7 +4368,7 @@ void LiftoffAssembler::emit_i32x4_sconvert_f32x4(LiftoffRegister dst,
     movaps(liftoff::kScratchDoubleReg, src.fp());
     cmpeqps(liftoff::kScratchDoubleReg, liftoff::kScratchDoubleReg);
     if (dst.fp() != src.fp()) movaps(dst.fp(), src.fp());
-    pand(dst.fp(), liftoff::kScratchDoubleReg);
+    andps(dst.fp(), liftoff::kScratchDoubleReg);
   }
   // Set top bit if >= 0 (but not -0.0!).
   Pxor(liftoff::kScratchDoubleReg, dst.fp());
...
@@ -2735,7 +2735,7 @@ void LiftoffAssembler::emit_i64x2_gt_s(LiftoffRegister dst, LiftoffRegister lhs,
   } else if (CpuFeatures::IsSupported(SSE4_2)) {
     // 2. SSE4_2, dst == lhs.
     if (dst != lhs) {
-      movdqa(dst.fp(), lhs.fp());
+      movaps(dst.fp(), lhs.fp());
     }
     I64x2GtS(dst.fp(), dst.fp(), rhs.fp());
   } else {
@@ -2761,7 +2761,7 @@ void LiftoffAssembler::emit_i64x2_ge_s(LiftoffRegister dst, LiftoffRegister lhs,
   if (dst == lhs) {
     // macro-assembler uses kScratchDoubleReg, so don't use it.
     I64x2GeS(liftoff::kScratchDoubleReg2, lhs.fp(), rhs.fp());
-    movdqa(dst.fp(), liftoff::kScratchDoubleReg2);
+    movaps(dst.fp(), liftoff::kScratchDoubleReg2);
   } else {
     I64x2GeS(dst.fp(), lhs.fp(), rhs.fp());
   }
@@ -2870,11 +2870,11 @@ void LiftoffAssembler::emit_s128_select(LiftoffRegister dst,
                                         LiftoffRegister src1,
                                         LiftoffRegister src2,
                                         LiftoffRegister mask) {
-  // Ensure that we don't overwrite any inputs with the movdqu below.
+  // Ensure that we don't overwrite any inputs with the movaps below.
   DCHECK_NE(dst, src1);
   DCHECK_NE(dst, src2);
   if (!CpuFeatures::IsSupported(AVX) && dst != mask) {
-    movdqu(dst.fp(), mask.fp());
+    movaps(dst.fp(), mask.fp());
     S128Select(dst.fp(), dst.fp(), src1.fp(), src2.fp());
   } else {
     S128Select(dst.fp(), mask.fp(), src1.fp(), src2.fp());
@@ -2926,7 +2926,7 @@ void LiftoffAssembler::emit_i8x16_shl(LiftoffRegister dst, LiftoffRegister lhs,
     vpand(dst.fp(), lhs.fp(), kScratchDoubleReg);
   } else {
     if (dst.fp() != lhs.fp()) movaps(dst.fp(), lhs.fp());
-    pand(dst.fp(), kScratchDoubleReg);
+    andps(dst.fp(), kScratchDoubleReg);
   }
   subq(kScratchRegister, Immediate(8));
   Movq(tmp_simd.fp(), kScratchRegister);
@@ -3427,7 +3427,7 @@ void LiftoffAssembler::emit_i64x2_neg(LiftoffRegister dst,
     vpsubq(dst.fp(), reg, src.fp());
   } else {
     psubq(reg, src.fp());
-    if (dst.fp() != reg) movapd(dst.fp(), reg);
+    if (dst.fp() != reg) movaps(dst.fp(), reg);
   }
 }
@@ -3813,13 +3813,13 @@ void LiftoffAssembler::emit_f64x2_min(LiftoffRegister dst, LiftoffRegister lhs,
     vminpd(dst.fp(), rhs.fp(), lhs.fp());
   } else if (dst.fp() == lhs.fp() || dst.fp() == rhs.fp()) {
     XMMRegister src = dst.fp() == lhs.fp() ? rhs.fp() : lhs.fp();
-    movapd(kScratchDoubleReg, src);
+    movaps(kScratchDoubleReg, src);
     minpd(kScratchDoubleReg, dst.fp());
     minpd(dst.fp(), src);
   } else {
-    movapd(kScratchDoubleReg, lhs.fp());
+    movaps(kScratchDoubleReg, lhs.fp());
     minpd(kScratchDoubleReg, rhs.fp());
-    movapd(dst.fp(), rhs.fp());
+    movaps(dst.fp(), rhs.fp());
     minpd(dst.fp(), lhs.fp());
   }
   // propagate -0's and NaNs, which may be non-canonical.
@@ -3841,13 +3841,13 @@ void LiftoffAssembler::emit_f64x2_max(LiftoffRegister dst, LiftoffRegister lhs,
     vmaxpd(dst.fp(), rhs.fp(), lhs.fp());
   } else if (dst.fp() == lhs.fp() || dst.fp() == rhs.fp()) {
     XMMRegister src = dst.fp() == lhs.fp() ? rhs.fp() : lhs.fp();
-    movapd(kScratchDoubleReg, src);
+    movaps(kScratchDoubleReg, src);
     maxpd(kScratchDoubleReg, dst.fp());
     maxpd(dst.fp(), src);
   } else {
-    movapd(kScratchDoubleReg, lhs.fp());
+    movaps(kScratchDoubleReg, lhs.fp());
     maxpd(kScratchDoubleReg, rhs.fp());
-    movapd(dst.fp(), rhs.fp());
+    movaps(dst.fp(), rhs.fp());
     maxpd(dst.fp(), lhs.fp());
   }
   // Find discrepancies.
@@ -3902,7 +3902,7 @@ void LiftoffAssembler::emit_i32x4_sconvert_f32x4(LiftoffRegister dst,
     movaps(kScratchDoubleReg, src.fp());
     cmpeqps(kScratchDoubleReg, kScratchDoubleReg);
     if (dst.fp() != src.fp()) movaps(dst.fp(), src.fp());
-    pand(dst.fp(), kScratchDoubleReg);
+    andps(dst.fp(), kScratchDoubleReg);
   }
   // Set top bit if >= 0 (but not -0.0!).
   Pxor(kScratchDoubleReg, dst.fp());
...