Commit e21f9ab4 authored by bmeurer, committed by Commit bot

[x86] Allow (v)divsd->(v)mulsd to execute in parallel.

This tweak was already present in CrankShaft for the non-AVX case. As it
turns out, it is relevant even with AVX, so the same optimization is now
applied in TurboFan as well.

R=dcarney@chromium.org

Review URL: https://codereview.chromium.org/1081033003

Cr-Commit-Position: refs/heads/master@{#27774}
parent 1dbc4327
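
For illustration only (not part of the CL): the snippet below is a minimal standalone sketch that mirrors the instruction sequence the code generator emits for the SSE double-precision case once a multiply consumes the division's result -- the divsd, the self-move the comments say not to delete, and the dependent mulsd. It uses GCC/Clang extended inline asm and is x86-64 only; the function name and values are invented for this example.

#include <cstdio>

// Sketch: roughly the three instructions the generated code contains for
// kSSEFloat64Div when a multiply depends on the division's result.
static double DivMovMul(double x, double a, double b) {
  asm("divsd  %1, %0\n\t"
      "movaps %0, %0\n\t"  // the "don't delete this mov" from the CL
      "mulsd  %2, %0"
      : "+x"(x)            // x stays in an XMM register and is updated in place
      : "x"(a), "x"(b));
  return x;
}

int main() {
  std::printf("%f\n", DivMovMul(10.0, 4.0, 3.0));  // prints 7.500000
  return 0;
}
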
@@ -468,6 +468,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
       break;
     case kSSEFloat32Div:
       __ divss(i.InputDoubleRegister(0), i.InputOperand(1));
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulss depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
       break;
     case kSSEFloat32Max:
       __ maxss(i.InputDoubleRegister(0), i.InputOperand(1));
@@ -506,6 +509,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
       break;
     case kSSEFloat64Div:
       __ divsd(i.InputDoubleRegister(0), i.InputOperand(1));
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulsd depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
       break;
     case kSSEFloat64Max:
       __ maxsd(i.InputDoubleRegister(0), i.InputOperand(1));
@@ -629,6 +635,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
       CpuFeatureScope avx_scope(masm(), AVX);
       __ vdivss(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                 i.InputOperand(1));
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulss depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
       break;
     }
     case kAVXFloat32Max: {
@@ -665,6 +674,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
       CpuFeatureScope avx_scope(masm(), AVX);
       __ vdivsd(i.OutputDoubleRegister(), i.InputDoubleRegister(0),
                 i.InputOperand(1));
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulsd depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
       break;
     }
     case kAVXFloat64Max: {
......
@@ -726,6 +726,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
       break;
     case kSSEFloat32Div:
       ASSEMBLE_SSE_BINOP(divss);
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulss depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
       break;
     case kSSEFloat32Abs: {
       // TODO(bmeurer): Use RIP relative 128-bit constants.
@@ -767,6 +770,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
       break;
     case kSSEFloat64Div:
       ASSEMBLE_SSE_BINOP(divsd);
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulsd depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
       break;
     case kSSEFloat64Mod: {
       __ subq(rsp, Immediate(kDoubleSize));
@@ -919,6 +925,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
       break;
     case kAVXFloat32Div:
       ASSEMBLE_AVX_BINOP(vdivss);
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulss depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
       break;
     case kAVXFloat32Max:
       ASSEMBLE_AVX_BINOP(vmaxss);
@@ -946,6 +955,9 @@ void CodeGenerator::AssembleArchInstruction(Instruction* instr) {
       break;
     case kAVXFloat64Div:
       ASSEMBLE_AVX_BINOP(vdivsd);
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulsd depending on the result.
+      __ movaps(i.OutputDoubleRegister(), i.OutputDoubleRegister());
       break;
     case kAVXFloat64Max:
       ASSEMBLE_AVX_BINOP(vmaxsd);
......
@@ -2040,10 +2040,10 @@ void LCodeGen::DoArithmeticD(LArithmeticD* instr) {
       } else {
         DCHECK(result.is(left));
         __ divsd(left, right);
-        // Don't delete this mov. It may improve performance on some CPUs,
-        // when there is a mulsd depending on the result
-        __ movaps(left, left);
       }
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulsd depending on the result
+      __ movaps(result, result);
       break;
     case Token::MOD: {
       // Pass two doubles as arguments on the stack.
......
@@ -2076,10 +2076,10 @@ void LCodeGen::DoArithmeticD(LArithmeticD* instr) {
       } else {
         DCHECK(result.is(left));
         __ divsd(left, right);
-        // Don't delete this mov. It may improve performance on some CPUs,
-        // when there is a mulsd depending on the result
-        __ movaps(left, left);
       }
+      // Don't delete this mov. It may improve performance on some CPUs,
+      // when there is a (v)mulsd depending on the result
+      __ movaps(result, result);
       break;
     case Token::MOD: {
       XMMRegister xmm_scratch = double_scratch0();
......
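
The CrankShaft hunks above hoist the self-move out of the non-AVX branch so that the AVX vdivsd path is covered as well, matching the TurboFan change. At the source level, "a (v)mulsd depending on the result" is nothing more exotic than a multiply fed by a division, as in the hypothetical function below (name and values are invented for illustration):

#include <cstdio>

// Hypothetical example: on x86-64 the body compiles to a (v)divsd whose
// output immediately feeds a (v)mulsd -- the pattern this commit targets.
static double ScaleRatio(double num, double den, double scale) {
  return (num / den) * scale;
}

int main() {
  std::printf("%f\n", ScaleRatio(10.0, 4.0, 3.0));  // prints 7.500000
  return 0;
}
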