// Copyright 2018 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef V8_UTILS_MEMCOPY_H_
#define V8_UTILS_MEMCOPY_H_

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include <algorithm>

#include "src/base/logging.h"
#include "src/base/macros.h"
#include "src/base/platform/wrappers.h"

namespace v8 {
namespace internal {

using Address = uintptr_t;

// ----------------------------------------------------------------------------
// Generated memcpy/memmove for ia32, arm, and mips.

void init_memcopy_functions();

#if defined(V8_TARGET_ARCH_IA32)
// Limit below which the extra overhead of the MemCopy function is likely
// to outweigh the benefits of faster copying.
const size_t kMinComplexMemCopy = 64;

// Copy memory area. No restrictions.
V8_EXPORT_PRIVATE void MemMove(void* dest, const void* src, size_t size);
using MemMoveFunction = void (*)(void* dest, const void* src, size_t size);

// Keep the distinction of "move" vs. "copy" for the benefit of other
// architectures.
V8_INLINE void MemCopy(void* dest, const void* src, size_t size) {
  MemMove(dest, src, size);
}
#elif defined(V8_HOST_ARCH_ARM)
using MemCopyUint8Function = void (*)(uint8_t* dest, const uint8_t* src,
                                      size_t size);
V8_EXPORT_PRIVATE extern MemCopyUint8Function memcopy_uint8_function;
V8_INLINE void MemCopyUint8Wrapper(uint8_t* dest, const uint8_t* src,
                                   size_t chars) {
  memcpy(dest, src, chars);
}
// For values < 16, the assembler function is slower than the inlined C code.
const size_t kMinComplexMemCopy = 16;
V8_INLINE void MemCopy(void* dest, const void* src, size_t size) {
  (*memcopy_uint8_function)(reinterpret_cast<uint8_t*>(dest),
                            reinterpret_cast<const uint8_t*>(src), size);
}
V8_EXPORT_PRIVATE V8_INLINE void MemMove(void* dest, const void* src,
                                         size_t size) {
  memmove(dest, src, size);
}

// For values < 12, the assembler function is slower than the inlined C code.
const int kMinComplexConvertMemCopy = 12;
#elif defined(V8_HOST_ARCH_MIPS)
using MemCopyUint8Function = void (*)(uint8_t* dest, const uint8_t* src,
                                      size_t size);
V8_EXPORT_PRIVATE extern MemCopyUint8Function memcopy_uint8_function;
V8_INLINE void MemCopyUint8Wrapper(uint8_t* dest, const uint8_t* src,
                                   size_t chars) {
  memcpy(dest, src, chars);
}
// For values < 16, the assembler function is slower than the inlined C code.
const size_t kMinComplexMemCopy = 16;
V8_INLINE void MemCopy(void* dest, const void* src, size_t size) {
  (*memcopy_uint8_function)(reinterpret_cast<uint8_t*>(dest),
                            reinterpret_cast<const uint8_t*>(src), size);
}
V8_EXPORT_PRIVATE V8_INLINE void MemMove(void* dest, const void* src,
                                         size_t size) {
  memmove(dest, src, size);
}
#else
// Copy memory area to disjoint memory area.
inline void MemCopy(void* dest, const void* src, size_t size) {
  // Fast path for small sizes. The compiler will expand the {memcpy} for small
  // fixed sizes to a sequence of move instructions. This avoids the overhead of
  // the general {memcpy} function.
  switch (size) {
#define CASE(N)           \
  case N:                 \
    memcpy(dest, src, N); \
    return;
    CASE(1)
    CASE(2)
    CASE(3)
    CASE(4)
    CASE(5)
    CASE(6)
    CASE(7)
    CASE(8)
    CASE(9)
    CASE(10)
    CASE(11)
    CASE(12)
    CASE(13)
    CASE(14)
    CASE(15)
    CASE(16)
#undef CASE
    default:
      memcpy(dest, src, size);
      return;
  }
}
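
// Illustrative usage sketch (the buffer names are hypothetical, not part of
// this header): when the size is a small compile-time constant, the switch
// above lets the call collapse into a short sequence of move instructions.
//
//   char header_copy[8];
//   const char header[8] = {1, 2, 3, 4, 5, 6, 7, 8};
//   MemCopy(header_copy, header, sizeof(header));  // Hits the size-8 case.
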
V8_EXPORT_PRIVATE inline void MemMove(void* dest, const void* src,
                                      size_t size) {
  // Fast path for small sizes. The compiler will expand the {memmove} for small
  // fixed sizes to a sequence of move instructions. This avoids the overhead of
  // the general {memmove} function.
  switch (size) {
#define CASE(N)            \
  case N:                  \
    memmove(dest, src, N); \
    return;
    CASE(1)
    CASE(2)
    CASE(3)
    CASE(4)
    CASE(5)
    CASE(6)
    CASE(7)
    CASE(8)
    CASE(9)
    CASE(10)
    CASE(11)
    CASE(12)
    CASE(13)
    CASE(14)
    CASE(15)
    CASE(16)
#undef CASE
    default:
      memmove(dest, src, size);
      return;
  }
}
const size_t kMinComplexMemCopy = 8;
#endif  // V8_TARGET_ARCH_IA32

// Copies words from |src| to |dst|. The data spans must not overlap.
// |src| and |dst| must be TWord-size aligned.
template <size_t kBlockCopyLimit, typename T>
inline void CopyImpl(T* dst_ptr, const T* src_ptr, size_t count) {
  constexpr int kTWordSize = sizeof(T);
#ifdef DEBUG
  Address dst = reinterpret_cast<Address>(dst_ptr);
  Address src = reinterpret_cast<Address>(src_ptr);
  DCHECK(IsAligned(dst, kTWordSize));
  DCHECK(IsAligned(src, kTWordSize));
  DCHECK(((src <= dst) && ((src + count * kTWordSize) <= dst)) ||
         ((dst <= src) && ((dst + count * kTWordSize) <= src)));
#endif
  if (count == 0) return;

  // Use the block-copying MemCopy if the segment we're copying is large
  // enough to justify the extra call/setup overhead.
  if (count < kBlockCopyLimit) {
    do {
      count--;
      *dst_ptr++ = *src_ptr++;
    } while (count > 0);
  } else {
    MemCopy(dst_ptr, src_ptr, count * kTWordSize);
  }
}
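
// Illustrative note: the first template argument is an element-count
// threshold, not a byte count. A hypothetical CopyImpl<16>(dst, src, n) copies
// element by element for n < 16 and defers to MemCopy above that, which is
// exactly how CopyWords below uses kBlockCopyLimit.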

// Copies kSystemPointerSize-sized words from |src| to |dst|. The data spans
// must not overlap. |src| and |dst| must be kSystemPointerSize-aligned.
inline void CopyWords(Address dst, const Address src, size_t num_words) {
  static const size_t kBlockCopyLimit = 16;
  CopyImpl<kBlockCopyLimit>(reinterpret_cast<Address*>(dst),
                            reinterpret_cast<const Address*>(src), num_words);
}
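
// Illustrative usage sketch (the buffer names are hypothetical): CopyWords
// takes raw addresses of naturally aligned, non-overlapping word spans.
//
//   Address dst_words[4];
//   Address src_words[4] = {1, 2, 3, 4};
//   CopyWords(reinterpret_cast<Address>(dst_words),
//             reinterpret_cast<Address>(src_words), arraysize(src_words));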

// Copies data from |src| to |dst|. The data spans must not overlap, and |T|
// must be a byte-sized type (sizeof(T) == 1).
template <typename T>
inline void CopyBytes(T* dst, const T* src, size_t num_bytes) {
  STATIC_ASSERT(sizeof(T) == 1);
  if (num_bytes == 0) return;
  CopyImpl<kMinComplexMemCopy>(dst, src, num_bytes);
}
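
// Illustrative usage sketch (the buffer names are hypothetical): CopyBytes
// only accepts byte-sized element types, so uint8_t spans are typical callers.
//
//   uint8_t scratch[32];
//   const uint8_t payload[32] = {};
//   CopyBytes(scratch, payload, sizeof(payload));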

inline void MemsetUint32(uint32_t* dest, uint32_t value, size_t counter) {
#if V8_HOST_ARCH_IA32 || V8_HOST_ARCH_X64
#define STOS "stosl"
#endif

#if defined(MEMORY_SANITIZER)
  // MemorySanitizer does not understand inline assembly.
#undef STOS
#endif

#if defined(__GNUC__) && defined(STOS)
  asm volatile(
      "cld;"
      "rep ; " STOS
      : "+&c"(counter), "+&D"(dest)
      : "a"(value)
      : "memory", "cc");
#else
  for (size_t i = 0; i < counter; i++) {
    dest[i] = value;
  }
#endif

#undef STOS
}
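
// Illustrative usage sketch (the buffer name is hypothetical): fills |counter|
// 32-bit slots with the same value, via "rep stosl" where the inline assembly
// path is enabled and via the plain loop otherwise.
//
//   uint32_t slots[128];
//   MemsetUint32(slots, 0xDEADBEEF, arraysize(slots));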

inline void MemsetPointer(Address* dest, Address value, size_t counter) {
#if V8_HOST_ARCH_IA32
#define STOS "stosl"
#elif V8_HOST_ARCH_X64
#define STOS "stosq"
#endif

#if defined(MEMORY_SANITIZER)
  // MemorySanitizer does not understand inline assembly.
#undef STOS
#endif

#if defined(__GNUC__) && defined(STOS)
  asm volatile(
      "cld;"
      "rep ; " STOS
      : "+&c"(counter), "+&D"(dest)
      : "a"(value)
      : "memory", "cc");
#else
  for (size_t i = 0; i < counter; i++) {
    dest[i] = value;
  }
#endif

#undef STOS
}

template <typename T, typename U>
inline void MemsetPointer(T** dest, U* value, size_t counter) {
#ifdef DEBUG
  T* a = nullptr;
  U* b = nullptr;
  a = b;  // Fake assignment to check assignability.
  USE(a);
#endif  // DEBUG
  MemsetPointer(reinterpret_cast<Address*>(dest),
                reinterpret_cast<Address>(value), counter);
}

template <typename T>
inline void MemsetPointer(T** dest, std::nullptr_t, size_t counter) {
  MemsetPointer(reinterpret_cast<Address*>(dest), Address{0}, counter);
}
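
// Illustrative usage sketch (the slot array is hypothetical): the templated
// overloads forward to the Address-based one; the debug-only assignment checks
// that |value| is actually assignable to the destination's element type.
//
//   int* slots[16];
//   int value = 0;
//   MemsetPointer(slots, &value, arraysize(slots));   // One pointer value.
//   MemsetPointer(slots, nullptr, arraysize(slots));  // Clear all 16 slots.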

// Copy from 8bit/16bit chars to 8bit/16bit chars. Values are zero-extended if
// needed. Ranges are not allowed to overlap.
// The separate declaration is needed for the V8_NONNULL, which is not allowed
// on a definition.
template <typename SrcType, typename DstType>
void CopyChars(DstType* dst, const SrcType* src, size_t count) V8_NONNULL(1, 2);

template <typename SrcType, typename DstType>
void CopyChars(DstType* dst, const SrcType* src, size_t count) {
  STATIC_ASSERT(std::is_integral<SrcType>::value);
  STATIC_ASSERT(std::is_integral<DstType>::value);
  using SrcTypeUnsigned = typename std::make_unsigned<SrcType>::type;
  using DstTypeUnsigned = typename std::make_unsigned<DstType>::type;

#ifdef DEBUG
  // Check for no overlap, otherwise {std::copy_n} cannot be used.
  Address src_start = reinterpret_cast<Address>(src);
  Address src_end = src_start + count * sizeof(SrcType);
  Address dst_start = reinterpret_cast<Address>(dst);
  Address dst_end = dst_start + count * sizeof(DstType);
  DCHECK(src_end <= dst_start || dst_end <= src_start);
#endif

  auto* dst_u = reinterpret_cast<DstTypeUnsigned*>(dst);
  auto* src_u = reinterpret_cast<const SrcTypeUnsigned*>(src);

  // In particular, Atom CPUs benefit from this explicit instantiation for
  // small counts. It gives up to a 20 percent improvement for microbenchmarks
  // such as joining an array of small integers (2019-10-16).
  switch (count) {
#define CASE(N)                   \
  case N:                         \
    std::copy_n(src_u, N, dst_u); \
    return;
    CASE(1)
    CASE(2)
    CASE(3)
    CASE(4)
    CASE(5)
    CASE(6)
    CASE(7)
    CASE(8)
    CASE(9)
    CASE(10)
    CASE(11)
    CASE(12)
    CASE(13)
    CASE(14)
    CASE(15)
    CASE(16)
#undef CASE
    default:
      std::copy_n(src_u, count, dst_u);
      return;
  }
}
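
// Illustrative usage sketch (the buffers are hypothetical): copying one-byte
// characters into a two-byte buffer zero-extends each element, because the
// copy goes through the unsigned counterparts of the element types.
//
//   const uint8_t one_byte[4] = {'a', 'b', 'c', 'd'};
//   uint16_t two_byte[4];
//   CopyChars(two_byte, one_byte, arraysize(one_byte));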

}  // namespace internal
}  // namespace v8

#endif  // V8_UTILS_MEMCOPY_H_