Commit c0c6078a authored by Zhi An Ng, committed by Commit Bot

[wasm-simd][ia32] Prototype load lane

Prototype v128.load{8,16,32,64}_lane on IA32 (stores will come later).

This is pretty similar to the x64 version, except that there is no signal
handler for OOB access, so kProtected is not a valid access mode.

Left some TODOs for myself to merge the new instruction codes
(kIA32Pinsrb) with the replace lane Wasm instructions.
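
For context: v128.load{N}_lane reads an N-bit scalar from memory and replaces a single lane of an existing 128-bit vector, leaving the other lanes untouched. A minimal stand-alone scalar model of the 32-bit variant (illustrative only, not V8 code; Simd128 and Load32Lane are made-up names):

```cpp
#include <cstdint>
#include <cstring>

struct Simd128 {
  uint8_t bytes[16];
};

// Scalar model of v128.load32_lane: read 32 bits from memory (wasm allows
// unaligned access, hence memcpy) and overwrite only lane `laneidx`.
Simd128 Load32Lane(const void* mem, Simd128 v, int laneidx) {
  uint32_t scalar;
  std::memcpy(&scalar, mem, sizeof(scalar));
  std::memcpy(v.bytes + laneidx * sizeof(scalar), &scalar, sizeof(scalar));
  return v;
}
```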

Bug: v8:10975
Change-Id: I5c9f9a45e2e7f06e8fab4a28cdfe1857ccc35880
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2557063
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Reviewed-by: Bill Budge <bbudge@chromium.org>
Cr-Commit-Position: refs/heads/master@{#71394}
parent 063ee234
@@ -2414,6 +2414,20 @@ void Assembler::shufpd(XMMRegister dst, XMMRegister src, byte imm8) {
EMIT(imm8);
}
void Assembler::movlps(XMMRegister dst, Operand src) {
EnsureSpace ensure_space(this);
EMIT(0x0F);
EMIT(0x12);
emit_sse_operand(dst, src);
}
void Assembler::movhps(XMMRegister dst, Operand src) {
EnsureSpace ensure_space(this);
EMIT(0x0F);
EMIT(0x16);
emit_sse_operand(dst, src);
}
void Assembler::movdqa(Operand dst, XMMRegister src) {
EnsureSpace ensure_space(this);
EMIT(0x66);
@@ -2835,6 +2849,14 @@ void Assembler::vshufpd(XMMRegister dst, XMMRegister src1, Operand src2,
EMIT(imm8);
}
void Assembler::vmovlps(XMMRegister dst, XMMRegister src1, Operand src2) {
vinstr(0x12, dst, src1, src2, kNone, k0F, kWIG);
}
void Assembler::vmovhps(XMMRegister dst, XMMRegister src1, Operand src2) {
vinstr(0x16, dst, src1, src2, kNone, k0F, kWIG);
}
void Assembler::vcmpps(XMMRegister dst, XMMRegister src1, Operand src2,
uint8_t cmp) {
vps(0xC2, dst, src1, src2);
......
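
The two new SSE loads use the unprefixed 0F opcode map: movlps xmm, m64 is 0F 12 /r and fills the low qword, movhps xmm, m64 is 0F 16 /r and fills the high qword; the other half of the register is preserved (the AVX forms instead merge the untouched half from src1, hence the kNone prefix in vinstr above). A stand-alone sketch of the byte pattern for the simplest addressing mode (EmitMovlps is a made-up helper; the real emit_sse_operand also handles SIB bytes and displacements):

```cpp
#include <cstdint>
#include <vector>

// "movlps xmm<reg>, [base]" for base registers other than esp/ebp (those
// require a SIB byte or a disp32 in ModRM encoding). Layout: opcode 0F 12,
// then ModRM with mod=00, reg=xmm index, rm=base register index.
std::vector<uint8_t> EmitMovlps(int xmm, int base) {
  uint8_t modrm = static_cast<uint8_t>((xmm << 3) | base);
  return {0x0F, 0x12, modrm};
}
```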
@@ -861,6 +861,9 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
void shufps(XMMRegister dst, XMMRegister src, byte imm8);
void shufpd(XMMRegister dst, XMMRegister src, byte imm8);
void movlps(XMMRegister dst, Operand src);
void movhps(XMMRegister dst, Operand src);
void maxss(XMMRegister dst, XMMRegister src) { maxss(dst, Operand(src)); }
void maxss(XMMRegister dst, Operand src);
void minss(XMMRegister dst, XMMRegister src) { minss(dst, Operand(src)); }
@@ -1369,6 +1372,9 @@ class V8_EXPORT_PRIVATE Assembler : public AssemblerBase {
}
void vshufpd(XMMRegister dst, XMMRegister src1, Operand src2, byte imm8);
void vmovlps(XMMRegister dst, XMMRegister src1, Operand src2);
void vmovhps(XMMRegister dst, XMMRegister src1, Operand src2);
void vpsllw(XMMRegister dst, XMMRegister src, uint8_t imm8);
void vpslld(XMMRegister dst, XMMRegister src, uint8_t imm8);
void vpsllq(XMMRegister dst, XMMRegister src, uint8_t imm8);
......
@@ -418,6 +418,9 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
AVX_PACKED_OP3_WITH_TYPE(Psrlq, psrlq, XMMRegister, uint8_t)
AVX_PACKED_OP3_WITH_TYPE(Psraw, psraw, XMMRegister, uint8_t)
AVX_PACKED_OP3_WITH_TYPE(Psrad, psrad, XMMRegister, uint8_t)
AVX_PACKED_OP3_WITH_TYPE(Movlps, movlps, XMMRegister, Operand)
AVX_PACKED_OP3_WITH_TYPE(Movhps, movhps, XMMRegister, Operand)
#undef AVX_PACKED_OP3_WITH_TYPE
// Non-SSE2 instructions.
......
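
AVX_PACKED_OP3_WITH_TYPE generates the usual TurboAssembler wrapper that prefers AVX and falls back to SSE. For Movlps the expansion is roughly the following sketch (assuming the macro has the same shape here as for its other users in this file):

```cpp
void Movlps(XMMRegister dst, XMMRegister src1, Operand src2) {
  if (CpuFeatures::IsSupported(AVX)) {
    CpuFeatureScope scope(this, AVX);
    vmovlps(dst, src1, src2);  // non-destructive three-operand AVX form
  } else {
    DCHECK_EQ(dst, src1);  // SSE movlps overwrites its first operand
    movlps(dst, src2);
  }
}
```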
@@ -509,6 +509,25 @@ class OutOfLineRecordWrite final : public OutOfLineCode {
} \
} while (false)
#define ASSEMBLE_SIMD_PINSR(OPCODE, CPU_FEATURE) \
do { \
XMMRegister dst = i.OutputSimd128Register(); \
XMMRegister src = i.InputSimd128Register(0); \
int8_t laneidx = i.InputInt8(1); \
if (HasAddressingMode(instr)) { \
if (CpuFeatures::IsSupported(AVX)) { \
CpuFeatureScope avx_scope(tasm(), AVX); \
__ v##OPCODE(dst, src, i.MemoryOperand(2), laneidx); \
} else { \
DCHECK_EQ(dst, src); \
CpuFeatureScope sse_scope(tasm(), CPU_FEATURE); \
__ OPCODE(dst, i.MemoryOperand(2), laneidx); \
} \
} else { \
UNIMPLEMENTED(); \
} \
} while (false)
void CodeGenerator::AssembleDeconstructFrame() {
__ mov(esp, ebp);
__ pop(ebp);
@@ -3174,6 +3193,33 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
i.InputOperand(2), i.InputInt8(1));
break;
}
case kIA32Pinsrb: {
// TODO(zhin): Move i8x16 replace lane into this opcode.
ASSEMBLE_SIMD_PINSR(pinsrb, SSE4_1);
break;
}
case kIA32Pinsrw: {
// TODO(zhin): Move i16x8 replace lane into this opcode.
ASSEMBLE_SIMD_PINSR(pinsrw, SSE4_1);
break;
}
case kIA32Pinsrd: {
// TODO(zhin): Move i32x4 replace lane into this opcode.
ASSEMBLE_SIMD_PINSR(pinsrd, SSE4_1);
break;
}
case kIA32Movlps: {
DCHECK(instr->HasOutput()); // Move to memory unimplemented for now.
__ Movlps(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.MemoryOperand(2));
break;
}
case kIA32Movhps: {
DCHECK(instr->HasOutput()); // Move to memory unimplemented for now.
__ Movhps(i.OutputSimd128Register(), i.InputSimd128Register(0),
i.MemoryOperand(2));
break;
}
case kSSEI8x16SConvertI16x8: {
DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
__ packsswb(i.OutputSimd128Register(), i.InputOperand(1));
......
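
The ASSEMBLE_SIMD_PINSR macro above chooses between two instruction shapes: the AVX vpinsr{b,w,d} form is non-destructive and takes a separate source register, while the SSE form reads and writes the same register, which is why the fallback path carries a DCHECK_EQ(dst, src). A stand-alone sketch of that dispatch (AssemblerState and AssemblePinsrb are illustrative stand-ins, not V8 types):

```cpp
#include <cassert>
#include <cstdio>

struct AssemblerState {
  bool has_avx;
};

// Mirrors the two paths of ASSEMBLE_SIMD_PINSR for the byte variant,
// printing the instruction that would be emitted.
void AssemblePinsrb(const AssemblerState& s, int dst, int src, int laneidx) {
  if (s.has_avx) {
    std::printf("vpinsrb xmm%d,xmm%d,[mem],%d\n", dst, src, laneidx);
  } else {
    assert(dst == src);  // mirrors DCHECK_EQ(dst, src) in the macro
    std::printf("pinsrb xmm%d,[mem],%d\n", dst, laneidx);
  }
}
```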
@@ -108,6 +108,8 @@ namespace compiler {
V(IA32Movss) \
V(IA32Movsd) \
V(IA32Movdqu) \
V(IA32Movlps) \
V(IA32Movhps) \
V(IA32BitcastFI) \
V(IA32BitcastIF) \
V(IA32Lea) \
@@ -296,6 +298,9 @@ namespace compiler {
V(IA32I8x16ExtractLaneS) \
V(SSEI8x16ReplaceLane) \
V(AVXI8x16ReplaceLane) \
V(IA32Pinsrb) \
V(IA32Pinsrw) \
V(IA32Pinsrd) \
V(SSEI8x16SConvertI16x8) \
V(AVXI8x16SConvertI16x8) \
V(IA32I8x16Neg) \
......
@@ -277,6 +277,9 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kIA32I8x16ExtractLaneS:
case kSSEI8x16ReplaceLane:
case kAVXI8x16ReplaceLane:
case kIA32Pinsrb:
case kIA32Pinsrw:
case kIA32Pinsrd:
case kSSEI8x16SConvertI16x8:
case kAVXI8x16SConvertI16x8:
case kIA32I8x16Neg:
@@ -398,6 +401,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
case kIA32Movss:
case kIA32Movsd:
case kIA32Movdqu:
case kIA32Movlps:
case kIA32Movhps:
// Moves are used for memory load/store operations.
case kIA32S128Load8Splat:
case kIA32S128Load16Splat:
......
@@ -364,6 +364,54 @@ void InstructionSelector::VisitAbortCSAAssert(Node* node) {
Emit(kArchAbortCSAAssert, g.NoOutput(), g.UseFixed(node->InputAt(0), edx));
}
void InstructionSelector::VisitLoadLane(Node* node) {
LoadLaneParameters params = LoadLaneParametersOf(node->op());
InstructionCode opcode = kArchNop;
if (params.rep == MachineType::Int8()) {
opcode = kIA32Pinsrb;
} else if (params.rep == MachineType::Int16()) {
opcode = kIA32Pinsrw;
} else if (params.rep == MachineType::Int32()) {
opcode = kIA32Pinsrd;
} else if (params.rep == MachineType::Int64()) {
// pinsrq not available on IA32.
if (params.laneidx == 0) {
opcode = kIA32Movlps;
} else {
DCHECK_EQ(1, params.laneidx);
opcode = kIA32Movhps;
}
} else {
UNREACHABLE();
}
IA32OperandGenerator g(this);
InstructionOperand outputs[] = {g.DefineAsRegister(node)};
// Input 0 is value node, 1 is lane idx, and GetEffectiveAddressMemoryOperand
// uses up to 3 inputs. This ordering is consistent with other operations that
// use the same opcode.
InstructionOperand inputs[5];
size_t input_count = 0;
inputs[input_count++] = g.UseRegister(node->InputAt(2));
inputs[input_count++] = g.UseImmediate(params.laneidx);
AddressingMode mode =
g.GetEffectiveAddressMemoryOperand(node, inputs, &input_count);
opcode |= AddressingModeField::encode(mode);
DCHECK_GE(5, input_count);
// IA32 supports unaligned loads.
DCHECK_NE(params.kind, MemoryAccessKind::kUnaligned);
// Trap handler is not supported on IA32.
DCHECK_NE(params.kind, MemoryAccessKind::kProtected);
Emit(opcode, 1, outputs, input_count, inputs);
}
void InstructionSelector::VisitStoreLane(Node* node) {}
void InstructionSelector::VisitLoadTransform(Node* node) {
LoadTransformParameters params = LoadTransformParametersOf(node->op());
InstructionCode opcode;
......
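
The selection logic in VisitLoadLane above reduces to a small lane-size table; here it is as a stand-alone sketch (a simplified enum instead of V8's InstructionCode/MachineType):

```cpp
#include <cassert>

enum Opcode { kPinsrb, kPinsrw, kPinsrd, kMovlps, kMovhps };

// Pick the IA32 opcode for v128.load{8,16,32,64}_lane. There is no pinsrq
// on IA32, so 64-bit lanes fall back to movlps (lane 0) / movhps (lane 1).
Opcode SelectLoadLaneOpcode(int lane_size_bytes, int laneidx) {
  switch (lane_size_bytes) {
    case 1:
      return kPinsrb;
    case 2:
      return kPinsrw;
    case 4:
      return kPinsrd;
    case 8:
      assert(laneidx == 0 || laneidx == 1);
      return laneidx == 0 ? kMovlps : kMovhps;
    default:
      assert(false);
      return kPinsrb;
  }
}
```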
@@ -2809,11 +2809,13 @@ void InstructionSelector::VisitPrefetchNonTemporal(Node* node) {
}
#endif // !V8_TARGET_ARCH_ARM64
#if !V8_TARGET_ARCH_X64
#if !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_IA32
// TODO(v8:10975): Prototyping load lane and store lane.
void InstructionSelector::VisitLoadLane(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitStoreLane(Node* node) { UNIMPLEMENTED(); }
#endif // !V8_TARGET_ARCH_X64 && !V8_TARGET_ARCH_IA32
#if !V8_TARGET_ARCH_X64
// TODO(v8:10983) Prototyping sign select.
void InstructionSelector::VisitI8x16SignSelect(Node* node) { UNIMPLEMENTED(); }
void InstructionSelector::VisitI16x8SignSelect(Node* node) { UNIMPLEMENTED(); }
......
@@ -4093,16 +4093,18 @@ Node* WasmGraphBuilder::LoadLane(MachineType memtype, Node* value, Node* index,
MemoryAccessKind load_kind =
GetMemoryAccessKind(mcgraph(), memtype, use_trap_handler());
// {offset} is validated to be within uintptr_t range in {BoundsCheckMem}.
uintptr_t capped_offset = static_cast<uintptr_t>(offset);
load = SetEffect(graph()->NewNode(
mcgraph()->machine()->LoadLane(load_kind, memtype, laneidx),
MemBuffer(offset), index, value, effect(), control()));
MemBuffer(capped_offset), index, value, effect(), control()));
if (load_kind == MemoryAccessKind::kProtected) {
SetSourcePosition(load, position);
}
if (FLAG_trace_wasm_memory) {
TraceMemoryOperation(false, memtype.representation(), index, offset,
TraceMemoryOperation(false, memtype.representation(), index, capped_offset,
position);
}
......
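
The wasm-compiler change threads the memory offset through a uintptr_t: the wasm offset is a 64-bit value, and on a 32-bit target the static_cast truncates it, which is safe only because BoundsCheckMem has already rejected offsets outside uintptr_t range (as the comment in the hunk notes). The invariant in miniature (CapOffset is an illustrative name):

```cpp
#include <cassert>
#include <cstdint>

// By the time LoadLane runs, BoundsCheckMem has ensured the 64-bit offset
// fits in uintptr_t, so this narrowing cast never changes the value.
uintptr_t CapOffset(uint64_t offset) {
  assert(offset <= UINTPTR_MAX);
  return static_cast<uintptr_t>(offset);
}
```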
@@ -1103,6 +1103,14 @@ int DisassemblerIA32::AVXInstruction(byte* data) {
current += PrintRightXMMOperand(current);
AppendToBuffer(",%s", NameOfXMMRegister(regop));
break;
case 0x12:
AppendToBuffer("vmovlps %s,", NameOfXMMRegister(regop));
current += PrintRightXMMOperand(current);
break;
case 0x16:
AppendToBuffer("vmovhps %s,", NameOfXMMRegister(regop));
current += PrintRightXMMOperand(current);
break;
case 0x28:
AppendToBuffer("vmovaps %s,", NameOfXMMRegister(regop));
current += PrintRightXMMOperand(current);
@@ -1820,7 +1828,15 @@ int DisassemblerIA32::InstructionDecode(v8::internal::Vector<char> out_buffer,
// Not every instruction uses this, and it is safe to index data+2 as all
// instructions are at least 3 bytes with operands.
get_modrm(*(data + 2), &mod, &regop, &rm);
if (f0byte == 0x18) {
if (f0byte == 0x12) {
data += 2;
AppendToBuffer("movlps %s,%s", NameOfXMMRegister(regop),
NameOfXMMRegister(rm));
} else if (f0byte == 0x16) {
data += 2;
AppendToBuffer("movhps %s,%s", NameOfXMMRegister(regop),
NameOfXMMRegister(rm));
} else if (f0byte == 0x18) {
data += 2;
const char* suffix[] = {"nta", "1", "2", "3"};
AppendToBuffer("%s%s ", f0mnem, suffix[regop & 0x03]);
......
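
Both decoder paths above lean on get_modrm to split the byte following the 0F escape into its three fields. The layout, as a reference sketch (DecodeModRM is a stand-in for V8's get_modrm):

```cpp
#include <cstdint>

// ModRM layout: mod in bits 7..6, reg (here regop, the XMM register) in
// bits 5..3, rm (the other register or the memory form) in bits 2..0.
inline void DecodeModRM(uint8_t b, int* mod, int* regop, int* rm) {
  *mod = b >> 6;
  *regop = (b >> 3) & 7;
  *rm = b & 7;
}
```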
@@ -396,6 +396,9 @@ TEST(DisasmIa320) {
__ cvtsd2ss(xmm0, Operand(ebx, ecx, times_4, 10000));
__ movq(xmm0, Operand(edx, 4));
__ movlps(xmm0, Operand(ebx, ecx, times_4, 10000));
__ movhps(xmm0, Operand(ebx, ecx, times_4, 10000));
// logic operation
__ andps(xmm0, xmm1);
__ andps(xmm0, Operand(ebx, ecx, times_4, 10000));
@@ -698,6 +701,9 @@ TEST(DisasmIa320) {
__ vhaddps(xmm0, xmm1, xmm2);
__ vhaddps(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000));
__ vmovlps(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000));
__ vmovhps(xmm0, xmm1, Operand(ebx, ecx, times_4, 10000));
__ vcmpeqps(xmm5, xmm4, xmm1);
__ vcmpeqps(xmm5, xmm4, Operand(ebx, ecx, times_4, 10000));
__ vcmpltps(xmm5, xmm4, xmm1);
......
@@ -3970,7 +3970,7 @@ WASM_SIMD_TEST(S128Load64Zero) {
RunLoadZeroTest<int64_t>(execution_tier, lower_simd, kExprS128Load64Zero);
}
#if V8_TARGET_ARCH_X64
#if V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_IA32
// TODO(v8:10975): Prototyping load lane and store lane.
template <typename T>
void RunLoadLaneTest(TestExecutionTier execution_tier, LowerSimd lower_simd,
@@ -4075,7 +4075,9 @@ WASM_SIMD_TEST_NO_LOWERING(S128Load64Lane) {
RunLoadLaneTest<int64_t>(execution_tier, lower_simd, kExprS128Load64Lane,
kExprI64x2Splat);
}
#endif // V8_TARGET_ARCH_X64 || V8_TARGET_ARCH_IA32
#if V8_TARGET_ARCH_X64
template <typename T>
void RunStoreLaneTest(TestExecutionTier execution_tier, LowerSimd lower_simd,
WasmOpcode store_op, WasmOpcode splat_op) {
......
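
RunLoadLaneTest (now shared with the x64 prototype via the widened #if) splats a base value into every lane, performs the lane load from memory, and checks that exactly the selected lane changed. A scalar model of that expectation (stand-alone, not the cctest harness):

```cpp
#include <array>
#include <cassert>
#include <cstdint>

int main() {
  constexpr uint32_t kSplat = 0xAAAAAAAA;  // value splatted into all lanes
  constexpr uint32_t kMem = 0x12345678;    // value "loaded" from memory
  for (int lane = 0; lane < 4; lane++) {
    std::array<uint32_t, 4> v;
    v.fill(kSplat);                        // i32x4.splat
    v[lane] = kMem;                        // s128.load32_lane
    for (int i = 0; i < 4; i++) {
      assert(v[i] == (i == lane ? kMem : kSplat));
    }
  }
  return 0;
}
```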