Commit fba61177 authored by Ng Zhi An, committed by V8 LUCI CQ

[x64][ia32] Share Pinsrd code

We need to support pre-SSE4.1 hardware for Pinsrd. Share the AVX and SSE4_1
code in the shared base class, but delegate the pre-SSE4.1 code to the
architecture-specific assembler (via CRTP).

Bug: v8:11589
Change-Id: I7563670174e44d8061182aefc311593d9578b0e4
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/3159183
Reviewed-by: Adam Klein <adamk@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/main@{#76827}
parent 66cbbb7a
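
The CRTP delegation described above, as a minimal standalone sketch (hypothetical names and placeholder bodies, not the actual V8 class layout): the shared base class owns the common SSE4.1/AVX path and reaches the architecture-specific pre-SSE4.1 fallback through a static downcast to the derived assembler.

// Minimal CRTP sketch (hypothetical names, placeholder bodies).
#include <cstdint>

template <typename Impl>
class SharedAssemblerBase {
 public:
  void Pinsrd(int dst, int src, uint8_t imm8) {
    if (have_sse41_) {
      // Shared SSE4.1/AVX encoding would be emitted here.
    } else {
      // Statically dispatched to the derived assembler; no virtual call.
      impl()->PinsrdPreSse41(dst, src, imm8);
    }
  }
  bool have_sse41_ = false;  // stand-in for CpuFeatures::IsSupported(SSE4_1)

 private:
  Impl* impl() { return static_cast<Impl*>(this); }
};

// The architecture-specific assembler supplies only the fallback.
class Ia32Assembler : public SharedAssemblerBase<Ia32Assembler> {
 public:
  void PinsrdPreSse41(int dst, int src, uint8_t imm8) {
    // ia32-specific sequence would go here (see the diff below).
  }
};

int main() {
  Ia32Assembler masm;
  masm.Pinsrd(0, 1, 1);  // takes the PinsrdPreSse41 fallback path
}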
src/codegen/ia32/macro-assembler-ia32.cc
@@ -1613,21 +1613,8 @@ void TurboAssembler::Pextrd(Register dst, XMMRegister src, uint8_t imm8) {
   add(esp, Immediate(kDoubleSize));
 }
 
-void TurboAssembler::Pinsrd(XMMRegister dst, XMMRegister src1, Operand src2,
-                            uint8_t imm8) {
-  if (CpuFeatures::IsSupported(AVX)) {
-    CpuFeatureScope scope(this, AVX);
-    vpinsrd(dst, src1, src2, imm8);
-    return;
-  }
-  if (dst != src1) {
-    movaps(dst, src1);
-  }
-  if (CpuFeatures::IsSupported(SSE4_1)) {
-    CpuFeatureScope sse_scope(this, SSE4_1);
-    pinsrd(dst, src2, imm8);
-    return;
-  }
+void TurboAssembler::PinsrdPreSse41(XMMRegister dst, Operand src, uint8_t imm8,
+                                    uint32_t* load_pc_offset) {
   // Without AVX or SSE, we can only have 64-bit values in xmm registers.
   // We don't have an xmm scratch register, so move the data via the stack. This
   // path is rarely required, so it's acceptable to be slow.
@@ -1636,10 +1623,10 @@ void TurboAssembler::Pinsrd(XMMRegister dst, XMMRegister src1, Operand src2,
   // Write original content of {dst} to the stack.
   movsd(Operand(esp, 0), dst);
   // Overwrite the portion specified in {imm8}.
-  if (src2.is_reg_only()) {
-    mov(Operand(esp, imm8 * kUInt32Size), src2.reg());
+  if (src.is_reg_only()) {
+    mov(Operand(esp, imm8 * kUInt32Size), src.reg());
   } else {
-    movss(dst, src2);
+    movss(dst, src);
     movss(Operand(esp, imm8 * kUInt32Size), dst);
   }
   // Load back the full value into {dst}.
@@ -1647,10 +1634,6 @@ void TurboAssembler::Pinsrd(XMMRegister dst, XMMRegister src1, Operand src2,
   add(esp, Immediate(kDoubleSize));
 }
 
-void TurboAssembler::Pinsrd(XMMRegister dst, Operand src, uint8_t imm8) {
-  Pinsrd(dst, dst, src, imm8);
-}
-
 void TurboAssembler::Lzcnt(Register dst, Operand src) {
   if (CpuFeatures::IsSupported(LZCNT)) {
     CpuFeatureScope scope(this, LZCNT);
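
A scalar model of the stack-based lane insert above (illustration only, not V8 code): spill the 64-bit register image to the stack, overwrite the 32-bit lane selected by {imm8}, then reload the whole value.

// Scalar model of the ia32 pre-SSE4.1 path (illustration only); the
// uint64_t stands in for the low 64 bits of the xmm register.
#include <cassert>
#include <cstdint>
#include <cstring>

uint64_t PinsrdViaStackModel(uint64_t dst, uint32_t src, uint8_t imm8) {
  assert(imm8 == 0 || imm8 == 1);
  uint32_t stack[2];
  std::memcpy(stack, &dst, sizeof stack);  // movsd [esp], dst
  stack[imm8] = src;                       // mov [esp + imm8 * 4], src
  std::memcpy(&dst, stack, sizeof stack);  // movsd dst, [esp]
  return dst;
}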
src/codegen/ia32/macro-assembler-ia32.h
@@ -337,12 +337,12 @@ class V8_EXPORT_PRIVATE TurboAssembler
   }
   void Pextrd(Register dst, XMMRegister src, uint8_t imm8);
-  void Pinsrd(XMMRegister dst, Register src, uint8_t imm8) {
-    Pinsrd(dst, Operand(src), imm8);
+  void PinsrdPreSse41(XMMRegister dst, Register src, uint8_t imm8,
+                      uint32_t* load_pc_offset) {
+    PinsrdPreSse41(dst, Operand(src), imm8, load_pc_offset);
   }
-  void Pinsrd(XMMRegister dst, Operand src, uint8_t imm8);
-  // Moves src1 to dst if AVX is not supported.
-  void Pinsrd(XMMRegister dst, XMMRegister src1, Operand src2, uint8_t imm8);
+  void PinsrdPreSse41(XMMRegister dst, Operand src, uint8_t imm8,
+                      uint32_t* load_pc_offset);
 
   // Expression support
   // cvtsi2sd instruction only writes to the low 64-bit of dst register, which
src/codegen/shared-ia32-x64/macro-assembler-shared-ia32-x64.h
@@ -528,6 +528,27 @@ class V8_EXPORT_PRIVATE SharedTurboAssemblerBase : public SharedTurboAssembler {
   }
 #undef FLOAT_UNOP
 
+  template <typename Op>
+  void Pinsrd(XMMRegister dst, XMMRegister src1, Op src2, uint8_t imm8,
+              uint32_t* load_pc_offset = nullptr) {
+    if (CpuFeatures::IsSupported(SSE4_1)) {
+      PinsrHelper(this, &Assembler::vpinsrd, &Assembler::pinsrd, dst, src1,
+                  src2, imm8, load_pc_offset,
+                  base::Optional<CpuFeature>(SSE4_1));
+    } else {
+      if (dst != src1) {
+        movaps(dst, src1);
+      }
+      impl()->PinsrdPreSse41(dst, src2, imm8, load_pc_offset);
+    }
+  }
+
+  template <typename Op>
+  void Pinsrd(XMMRegister dst, Op src, uint8_t imm8,
+              uint32_t* load_pc_offset = nullptr) {
+    Pinsrd(dst, dst, src, imm8, load_pc_offset);
+  }
+
   void F64x2ConvertLowI32x4U(XMMRegister dst, XMMRegister src,
                              Register scratch) {
     ASM_CODE_COMMENT(this);
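
PinsrdPreSse41Helper itself is defined in the anonymous namespace of the x64 diff below, but PinsrHelper is not part of this change. Judging from the call site above, its assumed shape prefers the three-operand AVX form and otherwise falls back to the two-operand SSE form under the optional CpuFeature. A standalone model (inferred, not the actual V8 helper):

// Model of the PinsrHelper dispatch (assumed shape, inferred from the
// call site above; not the actual V8 helper).
#include <functional>
#include <optional>

enum class CpuFeature { SSE4_1 };

void PinsrHelperModel(bool have_avx,
                      const std::function<void()>& avx_form,     // vpinsrd
                      const std::function<void()>& no_avx_form,  // pinsrd
                      std::optional<CpuFeature> feature) {
  if (have_avx) {
    avx_form();  // vpinsrd dst, src1, src2, imm8 (non-destructive)
    return;
  }
  // Without AVX the caller has already moved src1 into dst; {feature},
  // when present, would open a CpuFeatureScope (here SSE4_1).
  if (feature) { /* CpuFeatureScope scope(this, *feature); */ }
  no_avx_form();  // pinsrd dst, src2, imm8 (dst doubles as src1)
}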
src/codegen/x64/macro-assembler-x64.cc
@@ -2118,56 +2118,28 @@ void TurboAssembler::Pextrd(Register dst, XMMRegister src, uint8_t imm8) {
 }
 
-void TurboAssembler::Pinsrd(XMMRegister dst, XMMRegister src1, Register src2,
-                            uint8_t imm8, uint32_t* load_pc_offset) {
-  // Need a fall back when SSE4_1 is unavailable. Pinsrb and Pinsrq are used
-  // only by Wasm SIMD, which requires SSE4_1 already.
-  if (CpuFeatures::IsSupported(SSE4_1)) {
-    PinsrHelper(this, &Assembler::vpinsrd, &Assembler::pinsrd, dst, src1, src2,
-                imm8, load_pc_offset, {SSE4_1});
-    return;
-  }
-  Movd(kScratchDoubleReg, src2);
-  if (load_pc_offset) *load_pc_offset = pc_offset();
-  if (imm8 == 1) {
-    punpckldq(dst, kScratchDoubleReg);
-  } else {
-    DCHECK_EQ(0, imm8);
-    Movss(dst, kScratchDoubleReg);
-  }
-}
-
-void TurboAssembler::Pinsrd(XMMRegister dst, XMMRegister src1, Operand src2,
-                            uint8_t imm8, uint32_t* load_pc_offset) {
-  // Need a fall back when SSE4_1 is unavailable. Pinsrb and Pinsrq are used
-  // only by Wasm SIMD, which requires SSE4_1 already.
-  if (CpuFeatures::IsSupported(SSE4_1)) {
-    PinsrHelper(this, &Assembler::vpinsrd, &Assembler::pinsrd, dst, src1, src2,
-                imm8, load_pc_offset, {SSE4_1});
-    return;
-  }
-  Movd(kScratchDoubleReg, src2);
-  if (load_pc_offset) *load_pc_offset = pc_offset();
-  if (imm8 == 1) {
-    punpckldq(dst, kScratchDoubleReg);
-  } else {
-    DCHECK_EQ(0, imm8);
-    Movss(dst, kScratchDoubleReg);
-  }
-}
-
-void TurboAssembler::Pinsrd(XMMRegister dst, Register src2, uint8_t imm8,
-                            uint32_t* load_pc_offset) {
-  Pinsrd(dst, dst, src2, imm8, load_pc_offset);
+namespace {
+template <typename Op>
+void PinsrdPreSse41Helper(TurboAssembler* tasm, XMMRegister dst, Op src,
+                          uint8_t imm8, uint32_t* load_pc_offset) {
+  tasm->Movd(kScratchDoubleReg, src);
+  if (load_pc_offset) *load_pc_offset = tasm->pc_offset();
+  if (imm8 == 1) {
+    tasm->punpckldq(dst, kScratchDoubleReg);
+  } else {
+    DCHECK_EQ(0, imm8);
+    tasm->Movss(dst, kScratchDoubleReg);
+  }
+}
+}  // namespace
+
+void TurboAssembler::PinsrdPreSse41(XMMRegister dst, Register src, uint8_t imm8,
+                                    uint32_t* load_pc_offset) {
+  PinsrdPreSse41Helper(this, dst, src, imm8, load_pc_offset);
 }
 
-void TurboAssembler::Pinsrd(XMMRegister dst, Operand src2, uint8_t imm8,
-                            uint32_t* load_pc_offset) {
-  Pinsrd(dst, dst, src2, imm8, load_pc_offset);
+void TurboAssembler::PinsrdPreSse41(XMMRegister dst, Operand src, uint8_t imm8,
+                                    uint32_t* load_pc_offset) {
+  PinsrdPreSse41Helper(this, dst, src, imm8, load_pc_offset);
 }
 
 void TurboAssembler::Pinsrq(XMMRegister dst, XMMRegister src1, Register src2,
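
A scalar model of PinsrdPreSse41Helper above (illustration only): Movd zero-extends src into the scratch register; for imm8 == 1, punpckldq interleaves the low dwords so the scratch value lands in lane 1; for imm8 == 0, Movss replaces lane 0 only.

// Scalar model of the x64 pre-SSE4.1 sequence (illustration only); the
// uint64_t values stand in for the low 64 bits of xmm registers.
#include <cassert>
#include <cstdint>

uint64_t PinsrdPreSse41Model(uint64_t dst, uint32_t src, uint8_t imm8) {
  uint64_t scratch = src;  // Movd kScratchDoubleReg, src (zero-extends)
  if (imm8 == 1) {
    // punpckldq dst, scratch: lane0 = dst.lane0, lane1 = scratch.lane0.
    return (dst & 0xFFFFFFFFu) | (scratch << 32);
  }
  assert(imm8 == 0);
  // Movss dst, scratch: replace lane 0, keep lane 1.
  return (dst & ~uint64_t{0xFFFFFFFF}) | (scratch & 0xFFFFFFFFu);
}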
src/codegen/x64/macro-assembler-x64.h
@@ -415,14 +415,10 @@ class V8_EXPORT_PRIVATE TurboAssembler
   // Non-SSE2 instructions.
   void Pextrd(Register dst, XMMRegister src, uint8_t imm8);
-  void Pinsrd(XMMRegister dst, XMMRegister src1, Register src2, uint8_t imm8,
-              uint32_t* load_pc_offset = nullptr);
-  void Pinsrd(XMMRegister dst, XMMRegister src1, Operand src2, uint8_t imm8,
-              uint32_t* load_pc_offset = nullptr);
-  void Pinsrd(XMMRegister dst, Register src2, uint8_t imm8,
-              uint32_t* load_pc_offset = nullptr);
-  void Pinsrd(XMMRegister dst, Operand src2, uint8_t imm8,
-              uint32_t* load_pc_offset = nullptr);
+  void PinsrdPreSse41(XMMRegister dst, Register src2, uint8_t imm8,
+                      uint32_t* load_pc_offset = nullptr);
+  void PinsrdPreSse41(XMMRegister dst, Operand src2, uint8_t imm8,
+                      uint32_t* load_pc_offset = nullptr);
   void Pinsrq(XMMRegister dst, XMMRegister src1, Register src2, uint8_t imm8,
               uint32_t* load_pc_offset = nullptr);
   void Pinsrq(XMMRegister dst, XMMRegister src1, Operand src2, uint8_t imm8,