Commit 40d86c61 authored by lrn@chromium.org's avatar lrn@chromium.org

X64: Remove more fpu code. Unroll more local initialization loops.

Review URL: http://codereview.chromium.org/2815028

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@4934 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent 52a25749
......@@ -2738,6 +2738,28 @@ void Assembler::cvtsd2ss(XMMRegister dst, XMMRegister src) {
}
// Emit CVTSD2SI: convert the scalar double in |src| to a signed 32-bit
// integer in |dst| (encoding: F2 [REX] 0F 2D /r). The F2 prefix must be
// emitted before any REX prefix, as done here.
void Assembler::cvtsd2si(Register dst, XMMRegister src) {
EnsureSpace ensure_space(this);
// Record the start of this instruction (used by the assembler's
// last-instruction bookkeeping).
last_pc_ = pc_;
emit(0xF2);  // Mandatory scalar-double prefix.
// REX byte only if needed (extended registers); 32-bit operand size.
emit_optional_rex_32(dst, src);
emit(0x0F);
emit(0x2D);  // CVTSD2SI opcode.
emit_sse_operand(dst, src);  // ModR/M for the reg, xmm operand pair.
}
// Emit 64-bit CVTSD2SI: convert the scalar double in |src| to a signed
// 64-bit integer in |dst| (encoding: F2 REX.W 0F 2D /r). A NaN or
// out-of-range input yields the integer indefinite value
// 0x8000000000000000 per the instruction's semantics.
void Assembler::cvtsd2siq(Register dst, XMMRegister src) {
EnsureSpace ensure_space(this);
// Record the start of this instruction (used by the assembler's
// last-instruction bookkeeping).
last_pc_ = pc_;
emit(0xF2);  // Mandatory scalar-double prefix (must precede REX).
emit_rex_64(dst, src);  // REX.W is required for the 64-bit destination.
emit(0x0F);
emit(0x2D);  // CVTSD2SI opcode.
emit_sse_operand(dst, src);  // ModR/M for the reg, xmm operand pair.
}
void Assembler::addsd(XMMRegister dst, XMMRegister src) {
EnsureSpace ensure_space(this);
last_pc_ = pc_;
......
......@@ -1128,6 +1128,9 @@ class Assembler : public Malloced {
void cvtss2sd(XMMRegister dst, const Operand& src);
void cvtsd2ss(XMMRegister dst, XMMRegister src);
void cvtsd2si(Register dst, XMMRegister src);
void cvtsd2siq(Register dst, XMMRegister src);
void addsd(XMMRegister dst, XMMRegister src);
void subsd(XMMRegister dst, XMMRegister src);
void mulsd(XMMRegister dst, XMMRegister src);
......
......@@ -2641,7 +2641,7 @@ void CodeGenerator::VisitArrayLiteral(ArrayLiteral* node) {
// Generate code to set the elements in the array that are not
// literals.
for (int i = 0; i < node->values()->length(); i++) {
for (int i = 0; i < length; i++) {
Expression* value = node->values()->at(i);
// If value is a literal the property value is already set in the
......
......@@ -791,7 +791,6 @@ void KeyedLoadIC::GenerateExternalArray(MacroAssembler* masm,
// Allocate a HeapNumber for the int and perform int-to-double
// conversion.
ASSERT(array_type == kExternalUnsignedIntArray);
// The value is zero-extended since we loaded the value from memory
// with movl.
__ cvtqsi2sd(xmm0, rcx);
......@@ -1121,55 +1120,41 @@ void KeyedStoreIC::GenerateExternalArray(MacroAssembler* masm,
// The WebGL specification leaves the behavior of storing NaN and
// +/-Infinity into integer arrays basically undefined. For more
// reproducible behavior, convert these to zero.
__ fld_d(FieldOperand(rax, HeapNumber::kValueOffset));
__ movsd(xmm0, FieldOperand(rax, HeapNumber::kValueOffset));
__ movq(rbx, FieldOperand(rbx, ExternalArray::kExternalPointerOffset));
// rdi: untagged index
// rbx: base pointer of external storage
// top of FPU stack: value
if (array_type == kExternalFloatArray) {
__ fstp_s(Operand(rbx, rdi, times_4, 0));
__ cvtsd2ss(xmm0, xmm0);
__ movss(Operand(rbx, rdi, times_4, 0), xmm0);
__ ret(0);
} else {
// Need to perform float-to-int conversion.
// Test the top of the FP stack for NaN.
Label is_nan;
__ fucomi(0);
__ j(parity_even, &is_nan);
__ push(rdx); // Make room on the stack. Receiver is no longer needed.
// TODO(lrn): If the rounding of this conversion is not deliberate, maybe
// switch to xmm registers.
__ fistp_d(Operand(rsp, 0));
__ pop(rdx);
// Test the value for NaN.
// Convert to int32 and store the low byte/word.
// If the value is NaN or +/-infinity, the result is 0x80000000,
// which is automatically zero when taken mod 2^n, n < 32.
// rdx: value (converted to an untagged integer)
// rdi: untagged index
// rbx: base pointer of external storage
switch (array_type) {
case kExternalByteArray:
case kExternalUnsignedByteArray:
__ cvtsd2si(rdx, xmm0);
__ movb(Operand(rbx, rdi, times_1, 0), rdx);
break;
case kExternalShortArray:
case kExternalUnsignedShortArray:
__ cvtsd2si(rdx, xmm0);
__ movw(Operand(rbx, rdi, times_2, 0), rdx);
break;
case kExternalIntArray:
case kExternalUnsignedIntArray: {
// We also need to explicitly check for +/-Infinity. These are
// converted to MIN_INT, but we need to be careful not to
// confuse with legal uses of MIN_INT. Since MIN_INT truncated
// to 8 or 16 bits is zero, we only perform this test when storing
// 32-bit ints.
Label not_infinity;
// This test would apparently detect both NaN and Infinity,
// but we've already checked for NaN using the FPU hardware
// above.
__ movzxwq(rcx, FieldOperand(rax, HeapNumber::kValueOffset + 6));
__ and_(rcx, Immediate(0x7FF0));
__ cmpw(rcx, Immediate(0x7FF0));
__ j(not_equal, &not_infinity);
__ movq(rdx, Immediate(0));
__ bind(&not_infinity);
// Convert to int64, so that NaN and infinities become
// 0x8000000000000000, which is zero mod 2^32.
__ cvtsd2siq(rdx, xmm0);
__ movl(Operand(rbx, rdi, times_4, 0), rdx);
break;
}
......@@ -1178,31 +1163,6 @@ void KeyedStoreIC::GenerateExternalArray(MacroAssembler* masm,
break;
}
__ ret(0);
__ bind(&is_nan);
// rdi: untagged index
// rbx: base pointer of external storage
__ ffree();
__ fincstp();
__ Set(rdx, 0);
switch (array_type) {
case kExternalByteArray:
case kExternalUnsignedByteArray:
__ movb(Operand(rbx, rdi, times_1, 0), rdx);
break;
case kExternalShortArray:
case kExternalUnsignedShortArray:
__ movw(Operand(rbx, rdi, times_2, 0), rdx);
break;
case kExternalIntArray:
case kExternalUnsignedIntArray:
__ movl(Operand(rbx, rdi, times_4, 0), rdx);
break;
default:
UNREACHABLE();
break;
}
__ ret(0);
}
// Slow case: call runtime.
......
......@@ -115,25 +115,45 @@ void VirtualFrame::AllocateStackSlots() {
Handle<Object> undefined = Factory::undefined_value();
FrameElement initial_value =
FrameElement::ConstantElement(undefined, FrameElement::SYNCED);
if (count == 1) {
__ Push(undefined);
} else if (count < kLocalVarBound) {
// For less locals the unrolled loop is more compact.
__ movq(kScratchRegister, undefined, RelocInfo::EMBEDDED_OBJECT);
if (count < kLocalVarBound) {
// For fewer locals the unrolled loop is more compact.
// Hope for one of the first eight registers, where the push operation
// takes only one byte (kScratchRegister needs the REX.W bit).
Result tmp = cgen()->allocator()->Allocate();
ASSERT(tmp.is_valid());
__ movq(tmp.reg(), undefined, RelocInfo::EMBEDDED_OBJECT);
for (int i = 0; i < count; i++) {
__ push(kScratchRegister);
__ push(tmp.reg());
}
} else {
// For more locals a loop in generated code is more compact.
Label alloc_locals_loop;
Result cnt = cgen()->allocator()->Allocate();
ASSERT(cnt.is_valid());
__ movq(cnt.reg(), Immediate(count));
__ movq(kScratchRegister, undefined, RelocInfo::EMBEDDED_OBJECT);
#ifdef DEBUG
Label loop_size;
__ bind(&loop_size);
#endif
if (is_uint8(count)) {
// Loading imm8 is shorter than loading imm32.
// Loading only partial byte register, and using decb below.
__ movb(cnt.reg(), Immediate(count));
} else {
__ movl(cnt.reg(), Immediate(count));
}
__ bind(&alloc_locals_loop);
__ push(kScratchRegister);
__ decl(cnt.reg());
if (is_uint8(count)) {
__ decb(cnt.reg());
} else {
__ decl(cnt.reg());
}
__ j(not_zero, &alloc_locals_loop);
#ifdef DEBUG
CHECK(masm()->SizeOfCodeGeneratedSince(&loop_size) < kLocalVarBound);
#endif
}
for (int i = 0; i < count; i++) {
elements_.Add(initial_value);
......
......@@ -200,7 +200,7 @@ class VirtualFrame : public ZoneObject {
inline void PrepareForReturn();
// Number of local variables after when we use a loop for allocating.
static const int kLocalVarBound = 7;
static const int kLocalVarBound = 14;
// Allocate and initialize the frame-allocated locals.
void AllocateStackSlots();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment