Commit d3d295ef authored by lrn@chromium.org

Add optimized version of memcpy on ia32.

Only used in one place right now.
Still room for tweaking.

Review URL: http://codereview.chromium.org/2582001

git-svn-id: http://v8.googlecode.com/svn/branches/bleeding_edge@4796 ce2b1a6d-e550-0410-aec6-3dcde31c8c00
parent 0fc3dca6
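
In outline, the generated SSE2 fast path copies the first 16 bytes unaligned, advances the destination to a 16-byte boundary, moves 32 bytes per loop iteration, and finishes with one overlapping 16-byte copy. A rough standalone C++ sketch of that strategy (illustrative, not the emitted code; assumes size >= kMinComplexMemCopy and disjoint buffers):

#include <emmintrin.h>  // SSE2 intrinsics
#include <stddef.h>
#include <stdint.h>

void MemCopySketch(uint8_t* dst, const uint8_t* src, size_t size) {
  // 1. Copy the first 16 bytes unaligned, then round dst up to a
  //    16-byte boundary (consumes 1..16 bytes).
  _mm_storeu_si128(reinterpret_cast<__m128i*>(dst),
                   _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)));
  size_t skew = 16 - (reinterpret_cast<uintptr_t>(dst) & 15);
  dst += skew; src += skew; size -= skew;
  // 2. Main loop: 32 bytes per iteration. Stores are aligned; loads are
  //    unaligned here (the generated code uses movdqa when src is aligned).
  while (size >= 32) {
    __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
    __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + 16));
    _mm_store_si128(reinterpret_cast<__m128i*>(dst), a);
    _mm_store_si128(reinterpret_cast<__m128i*>(dst + 16), b);
    dst += 32; src += 32; size -= 32;
  }
  if (size >= 16) {  // one more aligned 16-byte block
    _mm_store_si128(reinterpret_cast<__m128i*>(dst),
                    _mm_loadu_si128(reinterpret_cast<const __m128i*>(src)));
    dst += 16; src += 16; size -= 16;
  }
  // 3. Tail: copy the last 16 bytes of the range, overlapping bytes that
  //    were already copied, instead of a byte-by-byte loop.
  _mm_storeu_si128(
      reinterpret_cast<__m128i*>(dst + size - 16),
      _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + size - 16)));
}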
@@ -647,7 +647,9 @@ F FUNCTION_CAST(Address addr) {
// Feature flags bit positions. They are mostly based on the CPUID spec.
// (We assign CPUID itself to one of the currently reserved bits --
// feel free to change this if needed.)
-enum CpuFeature { SSE3 = 32,   // x86
+// On X86/X64, values below 32 are bits in EDX, values above 32 are bits in ECX.
+enum CpuFeature { SSE4_1 = 32 + 19,  // x86
+                  SSE3 = 32 + 0,     // x86
SSE2 = 26, // x86
CMOV = 15, // x86
RDTSC = 4, // x86
......
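As an illustration of the bit layout described in the comment above (the helper below is hypothetical, not part of the commit): values below 32 index CPUID.1:EDX, values of 32 and up index CPUID.1:ECX.

struct CpuidBit { bool in_ecx; int bit; };

// SSE4_1 = 32 + 19 -> ECX bit 19; SSE2 = 26 -> EDX bit 26.
static CpuidBit DecodeFeature(int feature) {
  CpuidBit b;
  b.in_ecx = (feature >= 32);
  b.bit = b.in_ecx ? feature - 32 : feature;
  return b;
}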
@@ -2230,6 +2230,40 @@ void Assembler::movdqu(XMMRegister dst, const Operand& src) {
}
void Assembler::movntdqa(XMMRegister dst, const Operand& src) {
ASSERT(CpuFeatures::IsEnabled(SSE4_1));
EnsureSpace ensure_space(this);
last_pc_ = pc_;
EMIT(0x66);
EMIT(0x0F);
EMIT(0x38);
EMIT(0x2A);
emit_sse_operand(dst, src);
}
void Assembler::movntdq(const Operand& dst, XMMRegister src) {
ASSERT(CpuFeatures::IsEnabled(SSE2));
EnsureSpace ensure_space(this);
last_pc_ = pc_;
EMIT(0x66);
EMIT(0x0F);
EMIT(0xE7);
emit_sse_operand(src, dst);
}
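// For reference, the standard IA-32 encodings the two emitters above
// produce (worked examples, assuming the usual ModR/M forms):
//   movntdqa xmm1, [esi]  -> 66 0F 38 2A 0E  (ModR/M 0E: reg=1/xmm1, rm=110/[esi])
//   movntdq [edi], xmm0   -> 66 0F E7 07     (ModR/M 07: reg=0/xmm0, rm=111/[edi])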
void Assembler::prefetch(const Operand& src, int level) {
ASSERT(is_uint2(level));
EnsureSpace ensure_space(this);
last_pc_ = pc_;
EMIT(0x0F);
EMIT(0x18);
XMMRegister code = { level };  // Emit hint number in the reg field of ModR/M.
emit_sse_operand(code, src);
}
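// The prefetch hint travels in the reg field of the ModR/M byte. Worked
// example (standard encoding): prefetch(Operand(esi, 0x20), 1) should
// emit 0F 18 4E 20 (4E: mod=01/disp8, reg=001, rm=110/esi), i.e. what
// assemblers usually write as prefetcht0 [esi+0x20].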
void Assembler::movdbl(XMMRegister dst, const Operand& src) {
EnsureSpace ensure_space(this);
last_pc_ = pc_;
@@ -2309,7 +2343,6 @@ void Assembler::ptest(XMMRegister dst, XMMRegister src) {
emit_sse_operand(dst, src);
}
void Assembler::emit_sse_operand(XMMRegister reg, const Operand& adr) {
Register ireg = { reg.code() };
emit_operand(ireg, adr);
......
@@ -791,6 +791,15 @@ class Assembler : public Malloced {
void pxor(XMMRegister dst, XMMRegister src);
void ptest(XMMRegister dst, XMMRegister src);
// Parallel XMM operations.
void movntdqa(XMMRegister dst, const Operand& src);
void movntdq(const Operand& dst, XMMRegister src);
// Prefetch src position into cache level.
// Level 1, 2 or 3 specifies CPU cache level. Level 0 specifies a
// non-temporal prefetch (nta).
void prefetch(const Operand& src, int level);
// TODO(lrn): Need SFENCE for movnt?
// Debugging
void Print();
......
@@ -13498,6 +13498,211 @@ void StringCompareStub::Generate(MacroAssembler* masm) {
#undef __
#define __ masm.
MemCopyFunction CreateMemCopyFunction() {
size_t actual_size;
byte* buffer = static_cast<byte*>(OS::Allocate(Assembler::kMinimalBufferSize,
&actual_size,
true));
CHECK(buffer);
HandleScope handles;
MacroAssembler masm(buffer, static_cast<int>(actual_size));
// Generated code is put into a fixed, unmovable, buffer, and not into
// the V8 heap. We can't, and don't, refer to any relocatable addresses
// (e.g. the JavaScript nan-object).
// The 32-bit C calling convention passes all arguments on the stack.
// Stack layout:
// esp[12]: Third argument, size.
// esp[8]: Second argument, source pointer.
// esp[4]: First argument, destination pointer.
// esp[0]: return address
const int kDestinationOffset = 1 * kPointerSize;
const int kSourceOffset = 2 * kPointerSize;
const int kSizeOffset = 3 * kPointerSize;
int stack_offset = 0; // Update if we change the stack height.
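// Note: callers must pass size >= kMinComplexMemCopy (checked below in
// debug mode); the code relies on being able to copy whole 16-byte blocks.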
if (FLAG_debug_code) {
__ cmp(Operand(esp, kSizeOffset + stack_offset),
Immediate(kMinComplexMemCopy));
Label ok;
__ j(greater_equal, &ok);
__ int3();
__ bind(&ok);
}
if (CpuFeatures::IsSupported(SSE2)) {
CpuFeatures::Scope enable(SSE2);
__ push(edi);
__ push(esi);
stack_offset += 2 * kPointerSize;
Register dst = edi;
Register src = esi;
Register count = ecx;
__ mov(dst, Operand(esp, stack_offset + kDestinationOffset));
__ mov(src, Operand(esp, stack_offset + kSourceOffset));
__ mov(count, Operand(esp, stack_offset + kSizeOffset));
// Copy the first 16 bytes unaligned, then advance dst to the next
// 16-byte boundary.
__ movdqu(xmm0, Operand(src, 0));
__ movdqu(Operand(dst, 0), xmm0);
__ mov(edx, dst);
__ and_(edx, 0xF);
__ neg(edx);
__ add(Operand(edx), Immediate(16));  // edx = 16 - (dst & 15), in 1..16.
__ add(dst, Operand(edx));
__ add(src, Operand(edx));
__ sub(Operand(count), edx);
// edi is now aligned. Check if esi is also aligned.
Label unaligned_source;
__ test(Operand(src), Immediate(0x0F));
__ j(not_zero, &unaligned_source);
{
__ IncrementCounter(&Counters::memcopy_aligned, 1);
// Copy loop for aligned source and destination.
__ mov(edx, count);
Register loop_count = ecx;
Register count = edx;
__ shr(loop_count, 5);  // loop_count = number of 32-byte chunks.
{
// Main copy loop.
Label loop;
__ bind(&loop);
__ prefetch(Operand(src, 0x20), 1);
__ movdqa(xmm0, Operand(src, 0x00));
__ movdqa(xmm1, Operand(src, 0x10));
__ add(Operand(src), Immediate(0x20));
__ movdqa(Operand(dst, 0x00), xmm0);
__ movdqa(Operand(dst, 0x10), xmm1);
__ add(Operand(dst), Immediate(0x20));
__ dec(loop_count);
__ j(not_zero, &loop);
}
// At most 31 bytes to copy.
Label move_less_16;
__ test(Operand(count), Immediate(0x10));
__ j(zero, &move_less_16);
__ movdqa(xmm0, Operand(src, 0));
__ add(Operand(src), Immediate(0x10));
__ movdqa(Operand(dst, 0), xmm0);
__ add(Operand(dst), Immediate(0x10));
__ bind(&move_less_16);
// At most 15 bytes to copy. Copy 16 bytes at the end of the range.
__ and_(count, 0xF);
__ movdqu(xmm0, Operand(src, count, times_1, -0x10));
__ movdqu(Operand(dst, count, times_1, -0x10), xmm0);
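// (The two movdqu above rewrite up to 16 bytes that were already copied;
// since at least 16 bytes precede this tail, the -0x10 backup stays
// inside the copied range.)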
__ pop(esi);
__ pop(edi);
__ ret(0);
}
__ Align(16);
{
// Copy loop for unaligned source and aligned destination.
// If source is not aligned, we can't read it as efficiently.
__ bind(&unaligned_source);
__ IncrementCounter(&Counters::memcopy_unaligned, 1);
__ mov(edx, ecx);
Register loop_count = ecx;
Register count = edx;
__ shr(loop_count, 5);  // loop_count = number of 32-byte chunks.
{
// Main copy loop
Label loop;
__ bind(&loop);
__ prefetch(Operand(src, 0x20), 1);
__ movdqu(xmm0, Operand(src, 0x00));
__ movdqu(xmm1, Operand(src, 0x10));
__ add(Operand(src), Immediate(0x20));
__ movdqa(Operand(dst, 0x00), xmm0);
__ movdqa(Operand(dst, 0x10), xmm1);
__ add(Operand(dst), Immediate(0x20));
__ dec(loop_count);
__ j(not_zero, &loop);
}
// At most 31 bytes to copy.
Label move_less_16;
__ test(Operand(count), Immediate(0x10));
__ j(zero, &move_less_16);
__ movdqu(xmm0, Operand(src, 0));
__ add(Operand(src), Immediate(0x10));
__ movdqa(Operand(dst, 0), xmm0);
__ add(Operand(dst), Immediate(0x10));
__ bind(&move_less_16);
// At most 15 bytes to copy. Copy 16 bytes at the end of the range.
__ and_(count, 0x0F);
__ movdqu(xmm0, Operand(src, count, times_1, -0x10));
__ movdqu(Operand(dst, count, times_1, -0x10), xmm0);
__ pop(esi);
__ pop(edi);
__ ret(0);
}
} else {
__ IncrementCounter(&Counters::memcopy_noxmm, 1);
// SSE2 not supported. Unlikely to happen in practice.
__ push(edi);
__ push(esi);
stack_offset += 2 * kPointerSize;
__ cld();
Register dst = edi;
Register src = esi;
Register count = ecx;
__ mov(dst, Operand(esp, stack_offset + kDestinationOffset));
__ mov(src, Operand(esp, stack_offset + kSourceOffset));
__ mov(count, Operand(esp, stack_offset + kSizeOffset));
// Copy the first word.
__ mov(eax, Operand(src, 0));
__ mov(Operand(dst, 0), eax);
// Increment src and dst so that dst is aligned.
__ mov(edx, dst);
__ and_(edx, 0x03);
__ neg(edx);
__ add(Operand(edx), Immediate(4)); // edx = 4 - (dst & 3)
__ add(dst, Operand(edx));
__ add(src, Operand(edx));
__ sub(Operand(count), edx);
// edi is now aligned, ecx holds number of remaining bytes to copy.
__ mov(edx, count);
count = edx;
__ shr(ecx, 2);  // Convert byte count to dword count for rep movs.
__ rep_movs();
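// rep_movs emits rep movsd: it copies ecx dwords from [esi] to [edi],
// advancing both by 4 per step (direction flag cleared by cld above).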
// At most 3 bytes left to copy. Copy 4 bytes at the end of the range.
__ and_(count, 3);
__ mov(eax, Operand(src, count, times_1, -4));
__ mov(Operand(dst, count, times_1, -4), eax);
__ pop(esi);
__ pop(edi);
__ ret(0);
}
CodeDesc desc;
masm.GetCode(&desc);
// Call the function from C++.
return FUNCTION_CAST<MemCopyFunction>(buffer);
}
#undef __
} } // namespace v8::internal
#endif // V8_TARGET_ARCH_IA32
@@ -817,6 +817,7 @@ int DisassemblerIA32::RegisterFPUInstruction(int escape_opcode,
// Returns NULL if the instruction is not handled here.
static const char* F0Mnem(byte f0byte) {
switch (f0byte) {
case 0x18: return "prefetch";
case 0xA2: return "cpuid";
case 0x31: return "rdtsc";
case 0xBE: return "movsx_b";
@@ -942,7 +943,13 @@ int DisassemblerIA32::InstructionDecode(v8::internal::Vector<char> out_buffer,
case 0x0F:
{ byte f0byte = *(data+1);
const char* f0mnem = F0Mnem(f0byte);
if (f0byte == 0xA2 || f0byte == 0x31) {
if (f0byte == 0x18) {
data += 2;  // Skip the 0F 18 opcode bytes before reading ModR/M.
int mod, regop, rm;
get_modrm(*data, &mod, &regop, &rm);
const char* suffix[] = {"nta", "1", "2", "3"};
AppendToBuffer("%s%s ", f0mnem, suffix[regop & 0x03]);
data += PrintRightOperand(data);
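// E.g. 0F 18 4E 20 should disassemble as "prefetch1 [esi+0x20]" under
// this table (hint 1 prints as "1" rather than the conventional "t0").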
} else if (f0byte == 0xA2 || f0byte == 0x31) {
AppendToBuffer("%s", f0mnem);
data += 2;
} else if ((f0byte & 0xF0) == 0x80) {
@@ -1070,6 +1077,13 @@ int DisassemblerIA32::InstructionDecode(v8::internal::Vector<char> out_buffer,
NameOfXMMRegister(regop),
NameOfXMMRegister(rm));
data++;
} else if (*data == 0x2A) {
// movntdqa
data++;
int mod, regop, rm;
get_modrm(*data, &mod, &regop, &rm);
AppendToBuffer("movntdqa %s,", NameOfXMMRegister(regop));
data += PrintRightOperand(data);
} else {
UnimplementedInstruction();
}
@@ -1122,6 +1136,13 @@ int DisassemblerIA32::InstructionDecode(v8::internal::Vector<char> out_buffer,
get_modrm(*data, &mod, &regop, &rm);
data += PrintRightOperand(data);
AppendToBuffer(",%s", NameOfXMMRegister(regop));
} else if (*data == 0xE7) {
AppendToBuffer("movntdq ");
data++;
int mod, regop, rm;
get_modrm(*data, &mod, &regop, &rm);
data += PrintRightOperand(data);
AppendToBuffer(",%s", NameOfXMMRegister(regop));
} else if (*data == 0xEF) {
data++;
int mod, regop, rm;
......
@@ -525,12 +525,54 @@ class StringBuilder {
};
// Custom memcpy implementation for platforms where the standard version
// may not be good enough.
// TODO(lrn): Check whether some IA32 platforms should be excluded.
#if defined(V8_TARGET_ARCH_IA32)
// TODO(lrn): Extend to other platforms as needed.
typedef void (*MemCopyFunction)(void* dest, const void* src, size_t size);
// Implemented in codegen-<arch>.cc.
MemCopyFunction CreateMemCopyFunction();
// Copy memory area to disjoint memory area.
static inline void MemCopy(void* dest, const void* src, size_t size) {
static MemCopyFunction memcopy = CreateMemCopyFunction();
(*memcopy)(dest, src, size);
#ifdef DEBUG
CHECK_EQ(0, memcmp(dest, src, size));
#endif
}
// Limit below which the extra overhead of the MemCopy function is likely
// to outweigh the benefits of faster copying.
// TODO(lrn): Try to find a more precise value.
static const int kMinComplexMemCopy = 256;
#else // V8_TARGET_ARCH_IA32
static inline void MemCopy(void* dest, const void* src, size_t size) {
memcpy(dest, src, size);
}
static const int kMinComplexMemCopy = 256;
#endif // V8_TARGET_ARCH_IA32
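A hypothetical caller-side dispatch (CopyBytes is illustrative, not part of this change) showing how the threshold above is meant to be used: small copies stay on the libc memcpy, large ones go through the generated routine.

#include <string.h>

static inline void CopyBytes(void* dest, const void* src, size_t size) {
  if (size < static_cast<size_t>(kMinComplexMemCopy)) {
    memcpy(dest, src, size);   // small copies: plain memcpy is cheaper
  } else {
    MemCopy(dest, src, size);  // large copies: generated fast path
  }
}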
// Copy from ASCII/16bit chars to ASCII/16bit chars.
template <typename sourcechar, typename sinkchar>
static inline void CopyChars(sinkchar* dest, const sourcechar* src, int chars) {
sinkchar* limit = dest + chars;
#ifdef V8_HOST_CAN_READ_UNALIGNED
if (sizeof(*dest) == sizeof(*src)) {
if (chars >= static_cast<int>(kMinComplexMemCopy / sizeof(*dest))) {
MemCopy(dest, src, chars * sizeof(*dest));
return;
}
// Number of characters in a uintptr_t.
static const int kStepSize = sizeof(uintptr_t) / sizeof(*dest); // NOLINT
while (dest <= limit - kStepSize) {
......
@@ -157,6 +157,9 @@ namespace internal {
SC(array_function_runtime, V8.ArrayFunctionRuntime) \
SC(array_function_native, V8.ArrayFunctionNative) \
SC(for_in, V8.ForIn) \
SC(memcopy_aligned, V8.MemCopyAligned) \
SC(memcopy_unaligned, V8.MemCopyUnaligned) \
SC(memcopy_noxmm, V8.MemCopyNoXMM) \
SC(enum_cache_hits, V8.EnumCacheHits) \
SC(enum_cache_misses, V8.EnumCacheMisses) \
SC(reloc_info_count, V8.RelocInfoCount) \
......
@@ -79,3 +79,55 @@ TEST(SNPrintF) {
buffer.Dispose();
}
}
void TestMemCopy(Vector<byte> src,
Vector<byte> dst,
int source_alignment,
int destination_alignment,
int length_alignment) {
memset(dst.start(), 0xFF, dst.length());
byte* to = dst.start() + 32 + destination_alignment;
byte* from = src.start() + source_alignment;
int length = kMinComplexMemCopy + length_alignment;
MemCopy(to, from, static_cast<size_t>(length));
printf("[%d,%d,%d]\n",
source_alignment, destination_alignment, length_alignment);
for (int i = 0; i < length; i++) {
CHECK_EQ(from[i], to[i]);
}
CHECK_EQ(0xFF, to[-1]);
CHECK_EQ(0xFF, to[length]);
}
TEST(MemCopy) {
const int N = kMinComplexMemCopy + 128;
Vector<byte> buffer1 = Vector<byte>::New(N);
Vector<byte> buffer2 = Vector<byte>::New(N);
for (int i = 0; i < N; i++) {
buffer1[i] = static_cast<byte>(i & 0x7F);
}
// Same alignment.
for (int i = 0; i < 32; i++) {
TestMemCopy(buffer1, buffer2, i, i, i * 2);
}
// Different alignment.
for (int i = 0; i < 32; i++) {
for (int j = 1; j < 32; j++) {
TestMemCopy(buffer1, buffer2, i, (i + j) & 0x1F, 0);
}
}
// Different lengths
for (int i = 0; i < 32; i++) {
TestMemCopy(buffer1, buffer2, 3, 7, i);
}
buffer2.Dispose();
buffer1.Dispose();
}