Commit 856e8577 authored by Igor Sheludko, committed by Commit Bot

[dict-proto] SIMD support for SwissNameDictionary in Torque

This CL adds a Torque counterpart for swiss_table::GroupSse2Impl. This
allows the Torque version of SwissNameDictionary to use SSE for lookups,
rather than needing to bail out to the runtime on x64/ia32.

Bug: v8:11330
Change-Id: I74e3f97c460a8b89031016967ec0e545265016a9
Reviewed-on: https://chromium-review.googlesource.com/c/v8/v8/+/2787485
Reviewed-by: Igor Sheludko <ishell@chromium.org>
Reviewed-by: Santiago Aboy Solanes <solanes@chromium.org>
Reviewed-by: Zhi An Ng <zhin@chromium.org>
Commit-Queue: Igor Sheludko <ishell@chromium.org>
Cr-Commit-Position: refs/heads/master@{#73727}
parent eff32ae8
......@@ -113,6 +113,9 @@ type bool generates 'TNode<BoolT>' constexpr 'bool';
type bint generates 'TNode<BInt>' constexpr 'BInt';
type string constexpr 'const char*';
// 128-bit SIMD value; generated CSA code uses TNode<Simd128T>.
type Simd128 generates 'TNode<Simd128T>';
// Subtype of Simd128 whose generated CSA type is TNode<I8x16T>.
type I8X16 extends Simd128 generates 'TNode<I8x16T>';
// Represents a std::function which produces the generated TNode type of T.
// Useful for passing values to and from CSA code that uses LazyNode<T>, which
// is a typedef for std::function<TNode<T>()>. Can be created with %MakeLazy and
......@@ -917,7 +920,7 @@ extern operator '*' macro ConstexprInt31Mul(
extern operator '-' macro Int32Sub(int16, int16): int32;
extern operator '-' macro Int32Sub(uint16, uint16): int32;
extern operator '-' macro Int32Sub(int32, int32): int32;
extern operator '-' macro UInt32Sub(uint32, uint32): uint32;
extern operator '-' macro Uint32Sub(uint32, uint32): uint32;
extern operator '*' macro Int32Mul(int32, int32): int32;
extern operator '*' macro Uint32Mul(uint32, uint32): uint32;
extern operator '/' macro Int32Div(int32, int32): int32;
......@@ -1050,6 +1053,7 @@ operator '==' macro PromiseStateEquals(
}
extern macro CountLeadingZeros64(uint64): int64;
extern macro CountTrailingZeros32(uint32): int32;
extern macro CountTrailingZeros64(uint64): int64;
extern macro TaggedIsSmi(Object): bool;
......@@ -1845,3 +1849,8 @@ extern operator '[]' macro LoadWeakFixedArrayElement(
const kNoHashSentinel:
constexpr int32 generates 'PropertyArray::kNoHashSentinel';
extern macro LoadNameHash(Name): uint32;
// SIMD primitives implemented on the CSA side and exposed to Torque.
// Loads a Simd128 value from the given raw pointer.
extern macro LoadSimd128(intptr): Simd128;
// Produces an int32 from an I8X16 value (used in this change to collect the
// MSB of each 8-bit lane into one bit of the result).
extern macro I8x16BitMask(I8X16): int32;
// Lane-wise equality comparison of two I8X16 values, yielding an I8X16 mask.
extern macro I8x16Eq(I8X16, I8X16): I8X16;
// Creates an I8X16 value from an int32 (used in this change to fill all 16
// 8-bit lanes with the same value).
extern macro I8x16Splat(int32): I8X16;
......@@ -335,3 +335,6 @@ Convert<PromiseState, int32>(s: int32): PromiseState {
Convert<ScopeFlags, Smi>(s: Smi): ScopeFlags {
return %RawDownCast<ScopeFlags>(Unsigned(SmiToInt32(s)));
}
// Reinterprets a generic Simd128 value as an I8X16. This is an unchecked
// %RawDownCast: no instructions are meant to be emitted for a type change of
// this kind, only the static Torque type differs.
Convert<I8X16, Simd128>(s: Simd128): I8X16 {
return %RawDownCast<I8X16>(s);
}
......@@ -1137,6 +1137,10 @@ class V8_EXPORT_PRIVATE CodeStubAssembler
Map::kConstructorOrBackPointerOrNativeContextOffset);
}
// Loads a Simd128T value from the raw address |ptr|. This is a thin wrapper
// around Load<Simd128T>() so that the operation can be declared as an extern
// macro in Torque.
TNode<Simd128T> LoadSimd128(TNode<IntPtrT> ptr) {
return Load<Simd128T>(ptr);
}
// Reference is the CSA-equivalent of a Torque reference value, representing
// an inner pointer into a HeapObject.
//
......
......@@ -110,6 +110,16 @@ struct BoolT : Word32T {};
template <class T1, class T2>
struct PairT {};
// Untagged TNode type for 128-bit SIMD values. Its machine representation is
// kSimd128 and its machine type is MachineType::Simd128().
struct Simd128T : UntaggedT {
static const MachineRepresentation kMachineRepresentation =
MachineRepresentation::kSimd128;
static constexpr MachineType kMachineType = MachineType::Simd128();
};
// Lane-typed refinements of Simd128T. They add no members; they only
// distinguish SIMD values by lane layout in the static type system.
struct I8x16T : Simd128T {};
struct I16x8T : Simd128T {};
// NOTE(review): the name I32x2T looks inconsistent with its siblings (a
// 128-bit value would hold four 32-bit lanes, i.e. I32x4T) — TODO confirm
// whether this name is intentional before relying on it.
struct I32x2T : Simd128T {};
inline constexpr MachineType CommonMachineType(MachineType type1,
MachineType type2) {
return (type1 == type2) ? type1
......
......@@ -270,6 +270,7 @@ class CodeAssemblerParameterizedLabel;
V(Float64Min, Float64T, Float64T, Float64T) \
V(Float64InsertLowWord32, Float64T, Float64T, Word32T) \
V(Float64InsertHighWord32, Float64T, Float64T, Word32T) \
V(I8x16Eq, I8x16T, I8x16T, I8x16T) \
V(IntPtrAdd, WordT, WordT, WordT) \
V(IntPtrSub, WordT, WordT, WordT) \
V(IntPtrMul, WordT, WordT, WordT) \
......@@ -374,6 +375,8 @@ TNode<Float64T> Float64Add(TNode<Float64T> a, TNode<Float64T> b);
V(Word32BitwiseNot, Word32T, Word32T) \
V(WordNot, WordT, WordT) \
V(Word64Not, Word64T, Word64T) \
V(I8x16BitMask, Int32T, I8x16T) \
V(I8x16Splat, I8x16T, Int32T) \
V(Int32AbsWithOverflow, PAIR_TYPE(Int32T, BoolT), Int32T) \
V(Int64AbsWithOverflow, PAIR_TYPE(Int64T, BoolT), Int64T) \
V(IntPtrAbsWithOverflow, PAIR_TYPE(IntPtrT, BoolT), IntPtrT) \
......
......@@ -272,6 +272,7 @@ class MachineRepresentationInferrer {
case IrOpcode::kFloat64ExtractLowWord32:
case IrOpcode::kFloat64ExtractHighWord32:
case IrOpcode::kWord32Popcnt:
case IrOpcode::kI8x16BitMask:
MACHINE_UNOP_32_LIST(LABEL)
MACHINE_BINOP_32_LIST(LABEL) {
representation_vector_[node->id()] =
......@@ -323,6 +324,8 @@ class MachineRepresentationInferrer {
break;
case IrOpcode::kI32x4ReplaceLane:
case IrOpcode::kI32x4Splat:
case IrOpcode::kI8x16Splat:
case IrOpcode::kI8x16Eq:
representation_vector_[node->id()] =
MachineRepresentation::kSimd128;
break;
......@@ -445,6 +448,7 @@ class MachineRepresentationChecker {
case IrOpcode::kI32x4ExtractLane:
case IrOpcode::kI16x8ExtractLaneU:
case IrOpcode::kI16x8ExtractLaneS:
case IrOpcode::kI8x16BitMask:
case IrOpcode::kI8x16ExtractLaneU:
case IrOpcode::kI8x16ExtractLaneS:
CheckValueInputRepresentationIs(node, 0,
......@@ -456,8 +460,16 @@ class MachineRepresentationChecker {
CheckValueInputForInt32Op(node, 1);
break;
case IrOpcode::kI32x4Splat:
case IrOpcode::kI8x16Splat:
CheckValueInputForInt32Op(node, 0);
break;
case IrOpcode::kI8x16Eq:
CheckValueInputRepresentationIs(node, 0,
MachineRepresentation::kSimd128);
CheckValueInputRepresentationIs(node, 1,
MachineRepresentation::kSimd128);
break;
#define LABEL(opcode) case IrOpcode::k##opcode:
case IrOpcode::kChangeInt32ToTagged:
case IrOpcode::kChangeUint32ToTagged:
......
......@@ -881,6 +881,12 @@ class V8_EXPORT_PRIVATE RawMachineAssembler {
Node* I16x8Splat(Node* a) { return AddNode(machine()->I16x8Splat(), a); }
Node* I8x16Splat(Node* a) { return AddNode(machine()->I8x16Splat(), a); }
// Adds a node for the machine operator I8x16BitMask with input |a|.
Node* I8x16BitMask(Node* a) { return AddNode(machine()->I8x16BitMask(), a); }
// Adds a node for the machine operator I8x16Eq with inputs |a| and |b|.
Node* I8x16Eq(Node* a, Node* b) {
return AddNode(machine()->I8x16Eq(), a, b);
}
// Stack operations.
Node* LoadFramePointer() { return AddNode(machine()->LoadFramePointer()); }
Node* LoadParentFramePointer() {
......
......@@ -349,11 +349,29 @@ struct GroupPortableImpl {
};
// Determine which Group implementation SwissNameDictionary uses.
#if defined(V8_ENABLE_SWISS_NAME_DICTIONARY) && DEBUG
// TODO(v8:11388) If v8_enable_swiss_name_dictionary is enabled, we are supposed
// to use SwissNameDictionary as the dictionary backing store. If we want to use
// the SIMD version of SwissNameDictionary, that would require us to compile SSE
// instructions into the snapshot that exceed the minimum requirements for V8
// SSE support. Therefore, this fails a DCHECK. However, given the experimental
// nature of v8_enable_swiss_name_dictionary mode, we only expect this to be run
// by developers/bots that always have the necessary instructions. This means
// that if v8_enable_swiss_name_dictionary is enabled and debug mode isn't, we
// ignore the DCHECK that would fail in debug mode. However, if both
// v8_enable_swiss_name_dictionary and debug mode are enabled, we must fall back
// to the non-SSE implementation. Given that V8 requires SSE2, there should be a
// solution that doesn't require the workaround present here. Instead, the
// backend should only use SSE2 when compiling the SIMD version of
// SwissNameDictionary into the builtin.
using Group = GroupPortableImpl;
#else
// Regular case: use the SSE2 group implementation whenever it is available,
// and the portable one otherwise.
#if SWISS_TABLE_HAVE_SSE2
using Group = GroupSse2Impl;
#else
using Group = GroupPortableImpl;
#endif  // SWISS_TABLE_HAVE_SSE2
#endif  // defined(V8_ENABLE_SWISS_NAME_DICTIONARY) && DEBUG
#undef SWISS_TABLE_HAVE_SSE2
#undef SWISS_TABLE_HAVE_SSE3
......
......@@ -45,6 +45,10 @@ struct ProbeSequence {
index: uint32;
}
// Returns |value| with its lowest set bit cleared, computed as
// value & (value - 1). T must support '&', '-' and FromConstexpr<T>(1).
macro ClearLowestSetBit<T: type>(value: T): T {
return value & (value - FromConstexpr<T>(1));
}
const kByteMaskShift: uint64 = 3;
// Counterpart to swiss_table::BitMask<uint64_t, kWidth, 3>, as used by
......@@ -61,12 +65,31 @@ struct ByteMask {
// Counterpart to operator++() in C++ version.
macro ClearLowestSetBit() {
this.mask = this.mask & (this.mask - FromConstexpr<uint64>(1));
this.mask = ClearLowestSetBit<uint64>(this.mask);
}
mask: uint64;
}
// Counterpart to swiss_table::BitMask<uint32_t, kWidth, 0>, as used by
// swiss_table::GroupSse2Impl in C++ implementation.
// Wraps a 32-bit mask and offers iteration over its set bits: check
// HasBitsSet(), read LowestBitSet(), then advance with ClearLowestSetBit().
struct BitMask {
// Returns true iff at least one bit of the mask is set.
macro HasBitsSet(): bool {
return this.mask != FromConstexpr<uint32>(0);
}
// Returns the number of trailing zero bits of the mask, i.e. the index of
// its lowest set bit.
macro LowestBitSet(): int32 {
return Convert<int32>(CountTrailingZeros32(this.mask));
}
// Counterpart to operator++() in C++ version.
// Removes the lowest set bit from the mask.
macro ClearLowestSetBit() {
this.mask = ClearLowestSetBit<uint32>(this.mask);
}
mask: uint32;
}
macro H1(hash: uint32): uint32 {
return hash >>> Unsigned(FromConstexpr<int32>(kH2Bits));
}
......@@ -80,6 +103,7 @@ const kLsbs: constexpr uint64
const kMsbs: constexpr uint64
generates 'swiss_table::GroupPortableImpl::kMsbs';
// Counterpart to swiss_table::GroupPortableImpl in C++.
struct GroupPortableImpl {
macro Match(h2: uint32): ByteMask {
const x = Word64Xor(this.ctrl, (kLsbs * Convert<uint64>(h2)));
......@@ -95,6 +119,45 @@ struct GroupPortableImpl {
const ctrl: uint64;
}
// Counterpart to swiss_table::GroupSse2Impl in C++. Note that the name is
// chosen for consistency, this struct is not actually SSE-specific.
// Holds one group of control bytes as an I8X16 value and answers queries about
// it by returning a BitMask with one bit per 8-bit lane.
struct GroupSse2Impl {
// Returns a BitMask whose i-th bit is set iff the i-th control byte equals
// |h2|.
macro Match(h2: uint32): BitMask {
// Fill 16 8-bit lanes with |h2|:
const searchPattern = I8x16Splat(Signed(h2));
// Create a 128 bit mask such that in each of the 16 8-bit lanes, the MSB
// indicates whether or not the corresponding lanes of |this.ctrl| and
// |searchPattern| have the same value:
const matches128 = I8x16Eq(searchPattern, this.ctrl);
// Turn the 128 bit mask into a 32 bit one, by turning the MSB of the i-th
// lane into the i-th bit in the output mask:
const matches32 = Unsigned(I8x16BitMask(matches128));
return BitMask{mask: matches32};
}
// Returns a BitMask whose i-th bit is set iff the i-th control byte equals
// ctrl::kEmpty. Uses the same splat/compare/bitmask sequence as Match().
macro MatchEmpty(): BitMask {
// TODO(v8:11330) The C++ implementation in
// swiss_table::GroupSse2Impl::MatchEmpty utilizes a special trick that is
// possible due to kEmpty being -128 and allows shaving off one SSE
// instruction. This depends on having access to _mm_cmpeq_epi8 aka PCMPEQB,
// which the V8 backend currently doesn't expose.
// NOTE(review): I8x16Eq below is itself a byte-wise equality compare, so the
// instruction named in the TODO above may be inaccurate (the C++ trick
// possibly relies on _mm_sign_epi8 aka PSIGNB instead) — TODO confirm
// against swiss_table::GroupSse2Impl::MatchEmpty.
// Fill 16 8-bit lanes with |kEmpty|:
const searchPattern =
I8x16Splat(Convert<int32>(FromConstexpr<uint8>(ctrl::kEmpty)));
// Create a 128 bit mask such that in each of the 16 8-bit lanes, the MSB
// indicates whether or not the corresponding lane of |this.ctrl| contains
// |kEmpty|:
const matches128 = I8x16Eq(searchPattern, this.ctrl);
// Turn the 128 bit mask into a 32 bit one, by turning the MSB of the i-th
// lane into the i-th bit in the output mask:
const matches32 = Unsigned(I8x16BitMask(matches128));
return BitMask{mask: matches32};
}
// The control bytes of this group, one per 8-bit lane.
const ctrl: I8X16;
}
struct GroupPortableLoader {
macro LoadGroup(ctrlPtr: intptr): GroupPortableImpl {
return GroupPortableImpl{
......@@ -102,4 +165,10 @@ struct GroupPortableLoader {
};
}
}
// Loader that builds a GroupSse2Impl from a raw pointer to control bytes.
// Used as the loader type parameter of FindEntry and Add in this change, in
// the same role as GroupPortableLoader.
struct GroupSse2Loader {
// Loads 128 bits starting at |ctrlPtr| and wraps them as a group.
macro LoadGroup(ctrlPtr: intptr): GroupSse2Impl {
return GroupSse2Impl{ctrl: Convert<I8X16>(LoadSimd128(ctrlPtr))};
}
}
}
......@@ -28,12 +28,6 @@ const kMax2ByteMetaTableCapacity: constexpr int32
const kNotFoundSentinel:
constexpr int32 generates 'SwissNameDictionary::kNotFoundSentinel';
extern macro LoadSwissNameDictionaryNumberOfElements(
SwissNameDictionary, intptr): intptr;
extern macro LoadSwissNameDictionaryNumberOfDeletedElements(
SwissNameDictionary, intptr): intptr;
extern macro LoadSwissNameDictionaryKey(SwissNameDictionary, intptr): Name;
extern macro StoreSwissNameDictionaryKeyAndValue(
......@@ -287,14 +281,8 @@ macro SwissNameDictionaryDelete(table: SwissNameDictionary, entry: intptr)
@export
macro SwissNameDictionaryFindEntrySIMD(table: SwissNameDictionary, key: Name):
never labels Found(intptr), NotFound {
// TODO(v8:11330) Not implemented in Torque, yet, doing runtime call
// instead.
const res = runtime::SwissTableFindEntry(kNoContext, table, key);
if (res == kNotFoundSentinel) {
goto NotFound;
} else {
goto Found(Convert<intptr>(res));
}
FindEntry<GroupSse2Loader>(table, key)
otherwise Found, NotFound;
}
@export
......@@ -317,26 +305,8 @@ Found(intptr),
macro SwissNameDictionaryAddSIMD(
table: SwissNameDictionary, key: Name, value: Object,
propertyDetails: uint8) labels Bailout {
// TODO(v8:11330) Not implemented in Torque, yet, doing runtime call
// instead. However, must bailout if the runtime call would allocate a new
// dictionary.
// Determine if bailout needed:
const capacity = Convert<intptr>(table.capacity);
const maxUsable = SwissNameDictionaryMaxUsableCapacity(capacity);
// Doing two independent accesses to the meta table here (repeating the
// branching), rather than combining the accesses. Accepting that due to
// the fact that this is a slow placeholder until the SIMD version
// replaces it.
const nof = LoadSwissNameDictionaryNumberOfElements(table, capacity);
const nod = LoadSwissNameDictionaryNumberOfDeletedElements(table, capacity);
const used = nof + nod;
if (used >= maxUsable) {
goto Bailout;
}
runtime::SwissTableAdd(
kNoContext, table, key, value,
Convert<Smi>(Convert<int32>(propertyDetails)));
Add<GroupSse2Loader>(table, key, value, propertyDetails)
otherwise Bailout;
}
@export
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment