Commit 148b5391 authored by Ng Zhi An, committed by Commit Bot

[wasm-simd] Unify sse and avx impl for i16x8 shifts

The SSE and AVX implementations are essentially the same, so we instead
delegate to the macro assembler, which decides whether to emit the SSE
or the AVX instruction.

This unification will simplify optimization of constant shifts later on.

Bug: v8:10115
Change-Id: I68e60cb3fd51156438989812be189f71e6e47ba7
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2026470
Reviewed-by: Deepti Gandluri <gdeepti@chromium.org>
Commit-Queue: Zhi An Ng <zhin@chromium.org>
Cr-Commit-Position: refs/heads/master@{#66121}
parent 1775684e
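For context on the mechanism: the Psllw/Psraw/Psrlw entries added to the AVX_PACKED_OP3 list in the first hunk below generate macro-assembler wrappers that pick the SSE or AVX encoding at code-emission time. A minimal standalone sketch of that dispatch pattern (the classes and register arguments here are illustrative stand-ins, not V8's actual declarations):

#include <cstdio>

// Illustrative stand-in for V8's CpuFeatures machinery.
static bool have_avx = false;

struct Assembler {
  // SSE encoding: destructive, shifts dst in place by the count in tmp.
  void psllw(int dst, int tmp) {
    std::printf("psllw xmm%d, xmm%d\n", dst, tmp);
  }
  // AVX encoding: non-destructive three-operand form.
  void vpsllw(int dst, int src, int tmp) {
    std::printf("vpsllw xmm%d, xmm%d, xmm%d\n", dst, src, tmp);
  }
};

struct MacroAssembler : Assembler {
  // One call site for both ISAs: in spirit, this is what
  // AVX_PACKED_OP3(Psllw, psllw) expands to.
  void Psllw(int dst, int src, int tmp) {
    if (have_avx) {
      vpsllw(dst, src, tmp);  // dst may differ from src
    } else {
      // The SSE form is destructive; the instruction selector must
      // guarantee dst == src (DefineSameAsFirst).
      psllw(dst, tmp);
    }
  }
};

int main() {
  MacroAssembler masm;
  masm.Psllw(0, 0, 7);  // emits: psllw xmm0, xmm7
  have_avx = true;
  masm.Psllw(1, 0, 7);  // emits: vpsllw xmm1, xmm0, xmm7
}

This is also why the instruction-selector change below drops the AVX/SSE opcode split and always uses DefineSameAsFirst: the constraint is required by the SSE form and is harmless under AVX.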
@@ -335,10 +335,13 @@ class V8_EXPORT_PRIVATE TurboAssembler : public TurboAssemblerBase {
   AVX_PACKED_OP3(Minpd, minpd)
   AVX_PACKED_OP3(Maxpd, maxpd)
   AVX_PACKED_OP3(Cmpunordpd, cmpunordpd)
+  AVX_PACKED_OP3(Psllw, psllw)
   AVX_PACKED_OP3(Pslld, pslld)
   AVX_PACKED_OP3(Psllq, psllq)
+  AVX_PACKED_OP3(Psrlw, psrlw)
   AVX_PACKED_OP3(Psrld, psrld)
   AVX_PACKED_OP3(Psrlq, psrlq)
+  AVX_PACKED_OP3(Psraw, psraw)
   AVX_PACKED_OP3(Psrad, psrad)
   AVX_PACKED_OP3(Paddq, paddq)
   AVX_PACKED_OP3(Psubq, psubq)
@@ -2818,44 +2818,22 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       }
       break;
     }
-    case kSSEI16x8Shl: {
-      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
-      XMMRegister tmp = i.TempSimd128Register(0);
-      Register shift = i.InputRegister(1);
-      // Take shift value modulo 16.
-      __ and_(shift, 15);
-      __ movd(tmp, shift);
-      __ psllw(i.OutputSimd128Register(), tmp);
-      break;
-    }
-    case kAVXI16x8Shl: {
-      CpuFeatureScope avx_scope(tasm(), AVX);
+    case kIA32I16x8Shl: {
       XMMRegister tmp = i.TempSimd128Register(0);
       Register shift = i.InputRegister(1);
       // Take shift value modulo 16.
       __ and_(shift, 15);
-      __ movd(tmp, shift);
-      __ vpsllw(i.OutputSimd128Register(), i.InputSimd128Register(0), tmp);
+      __ Movd(tmp, shift);
+      __ Psllw(i.OutputSimd128Register(), i.InputSimd128Register(0), tmp);
       break;
     }
-    case kSSEI16x8ShrS: {
-      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
-      XMMRegister tmp = i.TempSimd128Register(0);
-      Register shift = i.InputRegister(1);
-      // Take shift value modulo 16.
-      __ and_(shift, 15);
-      __ movd(tmp, shift);
-      __ psraw(i.OutputSimd128Register(), tmp);
-      break;
-    }
-    case kAVXI16x8ShrS: {
-      CpuFeatureScope avx_scope(tasm(), AVX);
+    case kIA32I16x8ShrS: {
       XMMRegister tmp = i.TempSimd128Register(0);
       Register shift = i.InputRegister(1);
       // Take shift value modulo 16.
       __ and_(shift, 15);
-      __ movd(tmp, shift);
-      __ vpsraw(i.OutputSimd128Register(), i.InputSimd128Register(0), tmp);
+      __ Movd(tmp, shift);
+      __ Psraw(i.OutputSimd128Register(), i.InputSimd128Register(0), tmp);
       break;
     }
     case kSSEI16x8SConvertI32x4: {
@@ -3022,24 +3000,13 @@ CodeGenerator::CodeGenResult CodeGenerator::AssembleArchInstruction(
       __ Pmovzxbw(dst, dst);
       break;
     }
-    case kSSEI16x8ShrU: {
-      DCHECK_EQ(i.OutputSimd128Register(), i.InputSimd128Register(0));
-      XMMRegister tmp = i.TempSimd128Register(0);
-      Register shift = i.InputRegister(1);
-      // Take shift value modulo 16.
-      __ and_(shift, 15);
-      __ movd(tmp, shift);
-      __ psrlw(i.OutputSimd128Register(), tmp);
-      break;
-    }
-    case kAVXI16x8ShrU: {
-      CpuFeatureScope avx_scope(tasm(), AVX);
+    case kIA32I16x8ShrU: {
       XMMRegister tmp = i.TempSimd128Register(0);
       Register shift = i.InputRegister(1);
       // Take shift value modulo 16.
       __ and_(shift, 15);
-      __ movd(tmp, shift);
-      __ vpsrlw(i.OutputSimd128Register(), i.InputSimd128Register(0), tmp);
+      __ Movd(tmp, shift);
+      __ Psrlw(i.OutputSimd128Register(), i.InputSimd128Register(0), tmp);
       break;
     }
     case kSSEI16x8UConvertI32x4: {
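The `__ and_(shift, 15)` in each of these sequences implements the wasm-simd rule that a shift count is taken modulo the lane width (16 bits for i16x8); the masked count is then moved into an XMM register because the packed-shift instructions read it from there. A scalar model of the i16x8.shl per-lane semantics (plain C++, not V8 code):

#include <cstdint>
#include <cstdio>

// Scalar model of one i16x8.shl lane: the count wraps modulo the
// 16-bit lane width, matching the `and_(shift, 15)` above.
uint16_t i16_shl(uint16_t lane, uint32_t count) {
  return static_cast<uint16_t>(lane << (count & 15));
}

int main() {
  std::printf("%u\n", i16_shl(1, 20));  // 20 & 15 == 4, prints 16
  std::printf("%u\n", i16_shl(1, 4));   // same result, prints 16
}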
@@ -233,10 +233,8 @@ namespace compiler {
   V(IA32I16x8SConvertI8x16Low)  \
   V(IA32I16x8SConvertI8x16High) \
   V(IA32I16x8Neg)               \
-  V(SSEI16x8Shl)                \
-  V(AVXI16x8Shl)                \
-  V(SSEI16x8ShrS)               \
-  V(AVXI16x8ShrS)               \
+  V(IA32I16x8Shl)               \
+  V(IA32I16x8ShrS)              \
   V(SSEI16x8SConvertI32x4)      \
   V(AVXI16x8SConvertI32x4)      \
   V(SSEI16x8Add)                \

@@ -265,8 +263,7 @@ namespace compiler {
   V(AVXI16x8GeS)                \
   V(IA32I16x8UConvertI8x16Low)  \
   V(IA32I16x8UConvertI8x16High) \
-  V(SSEI16x8ShrU)               \
-  V(AVXI16x8ShrU)               \
+  V(IA32I16x8ShrU)              \
   V(SSEI16x8UConvertI32x4)      \
   V(AVXI16x8UConvertI32x4)      \
   V(SSEI16x8AddSaturateU)       \
@@ -214,10 +214,8 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kIA32I16x8SConvertI8x16Low:
     case kIA32I16x8SConvertI8x16High:
     case kIA32I16x8Neg:
-    case kSSEI16x8Shl:
-    case kAVXI16x8Shl:
-    case kSSEI16x8ShrS:
-    case kAVXI16x8ShrS:
+    case kIA32I16x8Shl:
+    case kIA32I16x8ShrS:
     case kSSEI16x8SConvertI32x4:
     case kAVXI16x8SConvertI32x4:
     case kSSEI16x8Add:

@@ -246,8 +244,7 @@ int InstructionScheduler::GetTargetInstructionFlags(
     case kAVXI16x8GeS:
     case kIA32I16x8UConvertI8x16Low:
     case kIA32I16x8UConvertI8x16High:
-    case kSSEI16x8ShrU:
-    case kAVXI16x8ShrU:
+    case kIA32I16x8ShrU:
     case kSSEI16x8UConvertI32x4:
     case kAVXI16x8UConvertI32x4:
     case kSSEI16x8AddSaturateU:
@@ -305,18 +305,13 @@ void VisitRRISimd(InstructionSelector* selector, Node* node,
 }
 
 void VisitRROSimdShift(InstructionSelector* selector, Node* node,
-                       ArchOpcode avx_opcode, ArchOpcode sse_opcode) {
+                       ArchOpcode opcode) {
   IA32OperandGenerator g(selector);
   InstructionOperand operand0 = g.UseUniqueRegister(node->InputAt(0));
   InstructionOperand operand1 = g.UseUniqueRegister(node->InputAt(1));
   InstructionOperand temps[] = {g.TempSimd128Register()};
-  if (selector->IsSupported(AVX)) {
-    selector->Emit(avx_opcode, g.DefineAsRegister(node), operand0, operand1,
-                   arraysize(temps), temps);
-  } else {
-    selector->Emit(sse_opcode, g.DefineSameAsFirst(node), operand0, operand1,
-                   arraysize(temps), temps);
-  }
+  selector->Emit(opcode, g.DefineSameAsFirst(node), operand0, operand1,
+                 arraysize(temps), temps);
 }
 
 void VisitRROI8x16SimdRightShift(InstructionSelector* selector, Node* node,
@@ -2130,17 +2125,15 @@ void InstructionSelector::VisitWord32AtomicPairCompareExchange(Node* node) {
   V(S1x8AllTrue) \
   V(S1x16AllTrue)
 
-#define SIMD_SHIFT_OPCODES(V) \
-  V(I16x8Shl)                 \
-  V(I16x8ShrS)                \
-  V(I16x8ShrU)
-
 #define SIMD_SHIFT_OPCODES_UNIFED_SSE_AVX(V) \
   V(I64x2Shl)                                \
   V(I64x2ShrU)                               \
   V(I32x4Shl)                                \
   V(I32x4ShrS)                               \
-  V(I32x4ShrU)
+  V(I32x4ShrU)                               \
+  V(I16x8Shl)                                \
+  V(I16x8ShrS)                               \
+  V(I16x8ShrU)
 
 #define SIMD_I8X16_RIGHT_SHIFT_OPCODES(V) \
   V(I8x16ShrS)                            \
@@ -2360,17 +2353,9 @@ VISIT_SIMD_REPLACE_LANE(F32x4)
 VISIT_SIMD_REPLACE_LANE_USE_REG(F64x2)
 #undef VISIT_SIMD_REPLACE_LANE_USE_REG
 
-#define VISIT_SIMD_SHIFT(Opcode)                               \
-  void InstructionSelector::Visit##Opcode(Node* node) {        \
-    VisitRROSimdShift(this, node, kAVX##Opcode, kSSE##Opcode); \
-  }
-SIMD_SHIFT_OPCODES(VISIT_SIMD_SHIFT)
-#undef VISIT_SIMD_SHIFT
-#undef SIMD_SHIFT_OPCODES
-
 #define VISIT_SIMD_SHIFT_UNIFIED_SSE_AVX(Opcode)                 \
   void InstructionSelector::Visit##Opcode(Node* node) {          \
-    VisitRROSimdShift(this, node, kIA32##Opcode, kIA32##Opcode); \
+    VisitRROSimdShift(this, node, kIA32##Opcode);                \
   }
 SIMD_SHIFT_OPCODES_UNIFED_SSE_AVX(VISIT_SIMD_SHIFT_UNIFIED_SSE_AVX)
 #undef VISIT_SIMD_SHIFT_UNIFIED_SSE_AVX
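The selector generates its visitors through an X-macro: one opcode list, one macro that stamps out a visitor per entry, as the hunk above shows. A self-contained illustration of the pattern (the names here are invented for the example):

#include <cstdio>

// One list of opcodes...
#define SHIFT_OPCODES(V) \
  V(I16x8Shl)            \
  V(I16x8ShrS)           \
  V(I16x8ShrU)

// ...one macro that generates a function per list entry.
#define DEFINE_VISITOR(Opcode) \
  void Visit##Opcode() { std::printf("visiting %s\n", #Opcode); }
SHIFT_OPCODES(DEFINE_VISITOR)
#undef DEFINE_VISITOR

int main() {
  VisitI16x8Shl();   // each generated by the macro expansion
  VisitI16x8ShrS();
  VisitI16x8ShrU();
}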