Commit 80cb381

EE Cache: Use GSVector instead of intrinsics
1 parent 73a4d25 · commit 80cb381

1 file changed

pcsx2/vtlb.cpp

Lines changed: 14 additions & 77 deletions
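The change replaces the two hand-written SIMD paths in CheckCache() (SSE intrinsics on x86, NEON intrinsics on ARM64) with a single loop built on PCSX2's GSVector4i wrapper, which compiles down to the native instructions on either architecture. Each 128-bit iteration still tests four cached-TLB entries at once: an entry hits when addr falls inside either the even (PFN0) or odd (PFN1) physical page it maps and that page is flagged cache-enabled. Below is a minimal scalar sketch of the predicate each SIMD lane evaluates; entryHits() is a hypothetical helper for illustration and does not appear in the diff.

#include <cstdint>

using u32 = std::uint32_t;

// Scalar view of one SIMD lane in CheckCache(): does a TLB entry map addr
// through a cache-enabled page? Hypothetical helper for illustration only;
// the real code keeps these fields in parallel arrays inside cachedTlbs.
static bool entryHits(u32 addr, u32 pfn0, u32 pfn1, u32 pageMask,
	bool cached0, bool cached1)
{
	// A TLB entry maps an even/odd page pair (PFN0/PFN1) sharing one page
	// mask; each page covers the address range [PFN, PFN + pageMask].
	const bool hit0 = cached0 && addr >= pfn0 && addr <= pfn0 + pageMask;
	const bool hit1 = cached1 && addr >= pfn1 && addr <= pfn1 + pageMask;
	return hit0 || hit1;
}

The vector loop in the diff evaluates this predicate for four entries per iteration, and a scalar tail loop handles the remaining count % 4 entries.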
@@ -29,6 +29,8 @@
 
 #include "fmt/format.h"
 
+#include "GS/GSVector.h"
+
 #include <bit>
 #include <map>
 #include <unordered_set>
@@ -109,16 +111,6 @@ vtlb_private::VTLBVirtual::VTLBVirtual(VTLBPhysical phys, u32 paddr, u32 vaddr)
 	}
 }
 
-#if defined(_M_X86)
-#include <immintrin.h>
-#elif defined(_M_ARM64)
-#if defined(_MSC_VER) && !defined(__clang__)
-#include <arm64_neon.h>
-#else
-#include <arm_neon.h>
-#endif
-#endif
-
 __inline int CheckCache(u32 addr)
 {
 	// Check if the cache is enabled
@@ -130,83 +122,28 @@ __inline int CheckCache(u32 addr)
 	size_t i = 0;
 	const size_t size = cachedTlbs.count;
 
-#if defined(_M_X86)
 	const int stride = 4;
 
-	const __m128i addr_vec = _mm_set1_epi32(addr);
+	const GSVector4i addr_vec = GSVector4i(addr);
 
 	for (; i + stride <= size; i += stride)
 	{
-		const __m128i pfn1_vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&cachedTlbs.PFN1s[i]));
-		const __m128i pfn0_vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&cachedTlbs.PFN0s[i]));
-		const __m128i mask_vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&cachedTlbs.PageMasks[i]));
-
-		const __m128i cached1_vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&cachedTlbs.CacheEnabled1[i]));
-		const __m128i cached0_vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&cachedTlbs.CacheEnabled0[i]));
-
-		const __m128i pfn1_end_vec = _mm_add_epi32(pfn1_vec, mask_vec);
-		const __m128i pfn0_end_vec = _mm_add_epi32(pfn0_vec, mask_vec);
-
-		// pfn0 <= addr
-		const __m128i gteLowerBound0 = _mm_or_si128(
-			_mm_cmpgt_epi32(addr_vec, pfn0_vec),
-			_mm_cmpeq_epi32(addr_vec, pfn0_vec));
-		// pfn0 + mask >= addr
-		const __m128i gteUpperBound0 = _mm_or_si128(
-			_mm_cmpgt_epi32(pfn0_end_vec, addr_vec),
-			_mm_cmpeq_epi32(pfn0_end_vec, addr_vec));
-
-		// pfn1 <= addr
-		const __m128i gteUpperBound1 = _mm_or_si128(
-			_mm_cmpgt_epi32(pfn1_end_vec, addr_vec),
-			_mm_cmpeq_epi32(pfn1_end_vec, addr_vec));
-		// pfn1 + mask >= addr
-		const __m128i gteLowerBound1 = _mm_or_si128(
-			_mm_cmpgt_epi32(addr_vec, pfn1_vec),
-			_mm_cmpeq_epi32(addr_vec, pfn1_vec));
-
-		// pfn0 <= addr <= pfn0 + mask
-		__m128i cmp0 = _mm_and_si128(gteLowerBound0, gteUpperBound0);
-		// pfn1 <= addr <= pfn1 + mask
-		__m128i cmp1 = _mm_and_si128(gteLowerBound1, gteUpperBound1);
-
-		cmp1 = _mm_and_si128(cmp1, cached1_vec);
-		cmp0 = _mm_and_si128(cmp0, cached0_vec);
-
-		const __m128i cmp = _mm_or_si128(cmp1, cmp0);
-
-		if (!_mm_testz_si128(cmp, cmp))
-		{
-			return true;
-		}
-	}
-#elif defined(_M_ARM64)
-	const int stride = 4;
+		const GSVector4i pfn1_vec = GSVector4i::load<true>(&cachedTlbs.PFN1s[i]);
+		const GSVector4i pfn0_vec = GSVector4i::load<true>(&cachedTlbs.PFN0s[i]);
+		const GSVector4i mask_vec = GSVector4i::load<true>(&cachedTlbs.PageMasks[i]);
 
-	const uint32x4_t addr_vec = vld1q_dup_u32(&addr);
+		const GSVector4i cached1_enable_vec = GSVector4i::load<true>(&cachedTlbs.CacheEnabled1[i]);
+		const GSVector4i cached0_enable_vec = GSVector4i::load<true>(&cachedTlbs.CacheEnabled0[i]);
 
-	for (; i + stride <= size; i += stride)
-	{
-		const uint32x4_t pfn1_vec = vld1q_u32(&cachedTlbs.PFN1s[i]);
-		const uint32x4_t pfn0_vec = vld1q_u32(&cachedTlbs.PFN0s[i]);
-		const uint32x4_t mask_vec = vld1q_u32(&cachedTlbs.PageMasks[i]);
-
-		const uint32x4_t cached1_vec = vld1q_u32(&cachedTlbs.CacheEnabled1[i]);
-		const uint32x4_t cached0_vec = vld1q_u32(&cachedTlbs.CacheEnabled0[i]);
+		const GSVector4i cmp1 = addr_vec.ge32(pfn1_vec) & addr_vec.le32(pfn1_vec + mask_vec);
+		const GSVector4i cmp0 = addr_vec.ge32(pfn0_vec) & addr_vec.le32(pfn0_vec + mask_vec);
 
-		const uint32x4_t pfn1_end_vec = vaddq_u32(pfn1_vec, mask_vec);
-		const uint32x4_t pfn0_end_vec = vaddq_u32(pfn0_vec, mask_vec);
+		const GSVector4i lanes_enabled = (cmp1 & cached1_enable_vec) | (cmp0 & cached0_enable_vec);
 
-		const uint32x4_t cmp1 = vandq_u32(vcgeq_u32(addr_vec, pfn1_vec), vcleq_u32(addr_vec, pfn1_end_vec));
-		const uint32x4_t cmp0 = vandq_u32(vcgeq_u32(addr_vec, pfn0_vec), vcleq_u32(addr_vec, pfn0_end_vec));
-
-		const uint32x4_t lanes_enabled = vorrq_u32(vandq_u32(cached1_vec, cmp1), vandq_u32(cached0_vec, cmp0));
-
-		const uint32x2_t tmp = vorr_u32(vget_low_u32(lanes_enabled), vget_high_u32(lanes_enabled));
-		if (vget_lane_u32(vpmax_u32(tmp, tmp), 0))
+		if (!lanes_enabled.allfalse())
 			return true;
 	}
-#endif
+
 	for (; i < size; i++)
 	{
 		const u32 mask = cachedTlbs.PageMasks[i];
@@ -637,7 +574,7 @@ static void TAKES_R128 vtlbUnmappedVWriteLg(u32 addr, r128 data) { vtlb_Miss(add
 template <typename OperandType>
 static OperandType vtlbUnmappedPReadSm(u32 addr) {
 	vtlb_BusError(addr, 0);
-	if(!CHECK_EEREC && CHECK_CACHE && CheckCache(addr)){
+	if (!CHECK_EEREC && CHECK_CACHE && CheckCache(addr)){
 		switch (sizeof(OperandType)) {
 			case 1: return readCache8(addr, false);
 			case 2: return readCache16(addr, false);
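
For reference, the GSVector4i calls above correspond roughly to the intrinsics they replace. This mapping is inferred from the diff itself and is not an exhaustive description of the GSVector API:

// Removed intrinsic (SSE / NEON)                        GSVector4i equivalent
// _mm_set1_epi32(addr)   / vld1q_dup_u32(&addr)         GSVector4i(addr)          (broadcast)
// _mm_loadu_si128(p)     / vld1q_u32(p)                 GSVector4i::load<true>(p) (aligned load)
// _mm_add_epi32          / vaddq_u32                    operator+
// cmpgt/cmpeq pairs      / vcgeq_u32, vcleq_u32         a.ge32(b), a.le32(b)
// _mm_and_si128          / vandq_u32                    operator&
// _mm_or_si128           / vorrq_u32                    operator|
// !_mm_testz_si128(c, c) / vpmax_u32 + vget_lane_u32    !c.allfalse()

Two behavioral details stand out: GSVector4i::load<true> is an aligned load, while the removed x86 path used the unaligned _mm_loadu_si128, so the cachedTlbs arrays are presumably 16-byte aligned; and ge32/le32 are signed 32-bit comparisons, matching the old SSE path (the removed NEON path compared unsigned).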
