#include "fmt/format.h"

+ #include "GS/GSVector.h"
+
#include <bit>
#include <map>
#include <unordered_set>
@@ -109,16 +111,6 @@ vtlb_private::VTLBVirtual::VTLBVirtual(VTLBPhysical phys, u32 paddr, u32 vaddr)
}
}

- #if defined(_M_X86)
- #include <immintrin.h>
- #elif defined(_M_ARM64)
- #if defined(_MSC_VER) && !defined(__clang__)
- #include <arm64_neon.h>
- #else
- #include <arm_neon.h>
- #endif
- #endif
-
__inline int CheckCache(u32 addr)
{
	// Check if the cache is enabled
@@ -130,83 +122,28 @@ __inline int CheckCache(u32 addr)
	size_t i = 0;
	const size_t size = cachedTlbs.count;

- #if defined(_M_X86)
	const int stride = 4;

- 	const __m128i addr_vec = _mm_set1_epi32(addr);
+ 	const GSVector4i addr_vec = GSVector4i::load(addr);

	for (; i + stride <= size; i += stride)
	{
- 		const __m128i pfn1_vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&cachedTlbs.PFN1s[i]));
- 		const __m128i pfn0_vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&cachedTlbs.PFN0s[i]));
- 		const __m128i mask_vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&cachedTlbs.PageMasks[i]));
-
- 		const __m128i cached1_vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&cachedTlbs.CacheEnabled1[i]));
- 		const __m128i cached0_vec = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&cachedTlbs.CacheEnabled0[i]));
-
- 		const __m128i pfn1_end_vec = _mm_add_epi32(pfn1_vec, mask_vec);
- 		const __m128i pfn0_end_vec = _mm_add_epi32(pfn0_vec, mask_vec);
-
- 		// pfn0 <= addr
- 		const __m128i gteLowerBound0 = _mm_or_si128(
- 			_mm_cmpgt_epi32(addr_vec, pfn0_vec),
- 			_mm_cmpeq_epi32(addr_vec, pfn0_vec));
- 		// pfn0 + mask >= addr
- 		const __m128i gteUpperBound0 = _mm_or_si128(
- 			_mm_cmpgt_epi32(pfn0_end_vec, addr_vec),
- 			_mm_cmpeq_epi32(pfn0_end_vec, addr_vec));
-
- 		// pfn1 <= addr
- 		const __m128i gteUpperBound1 = _mm_or_si128(
- 			_mm_cmpgt_epi32(pfn1_end_vec, addr_vec),
- 			_mm_cmpeq_epi32(pfn1_end_vec, addr_vec));
- 		// pfn1 + mask >= addr
- 		const __m128i gteLowerBound1 = _mm_or_si128(
- 			_mm_cmpgt_epi32(addr_vec, pfn1_vec),
- 			_mm_cmpeq_epi32(addr_vec, pfn1_vec));
-
- 		// pfn0 <= addr <= pfn0 + mask
- 		__m128i cmp0 = _mm_and_si128(gteLowerBound0, gteUpperBound0);
- 		// pfn1 <= addr <= pfn1 + mask
- 		__m128i cmp1 = _mm_and_si128(gteLowerBound1, gteUpperBound1);
-
- 		cmp1 = _mm_and_si128(cmp1, cached1_vec);
- 		cmp0 = _mm_and_si128(cmp0, cached0_vec);
-
- 		const __m128i cmp = _mm_or_si128(cmp1, cmp0);
-
- 		if (!_mm_testz_si128(cmp, cmp))
- 		{
- 			return true;
- 		}
- 	}
- #elif defined(_M_ARM64)
- 	const int stride = 4;
+ 		const GSVector4i pfn1_vec = GSVector4i::load<true>(&cachedTlbs.PFN1s[i]);
+ 		const GSVector4i pfn0_vec = GSVector4i::load<true>(&cachedTlbs.PFN0s[i]);
+ 		const GSVector4i mask_vec = GSVector4i::load<true>(&cachedTlbs.PageMasks[i]);

- 	const uint32x4_t addr_vec = vld1q_dup_u32(&addr);
+ 		const GSVector4i cached1_enable_vec = GSVector4i::load<true>(&cachedTlbs.CacheEnabled1[i]);
+ 		const GSVector4i cached0_enable_vec = GSVector4i::load<true>(&cachedTlbs.CacheEnabled0[i]);

- 	for (; i + stride <= size; i += stride)
- 	{
- 		const uint32x4_t pfn1_vec = vld1q_u32(&cachedTlbs.PFN1s[i]);
- 		const uint32x4_t pfn0_vec = vld1q_u32(&cachedTlbs.PFN0s[i]);
- 		const uint32x4_t mask_vec = vld1q_u32(&cachedTlbs.PageMasks[i]);
-
- 		const uint32x4_t cached1_vec = vld1q_u32(&cachedTlbs.CacheEnabled1[i]);
- 		const uint32x4_t cached0_vec = vld1q_u32(&cachedTlbs.CacheEnabled0[i]);
+ 		const GSVector4i cmp1 = addr_vec.ge32(pfn1_vec) & addr_vec.le32(pfn1_vec + mask_vec);
+ 		const GSVector4i cmp0 = addr_vec.ge32(pfn0_vec) & addr_vec.le32(pfn0_vec + mask_vec);

- 		const uint32x4_t pfn1_end_vec = vaddq_u32(pfn1_vec, mask_vec);
- 		const uint32x4_t pfn0_end_vec = vaddq_u32(pfn0_vec, mask_vec);
+ 		const GSVector4i lanes_enabled = (cmp1 & cached1_enable_vec) | (cmp0 & cached0_enable_vec);

- 		const uint32x4_t cmp1 = vandq_u32(vcgeq_u32(addr_vec, pfn1_vec), vcleq_u32(addr_vec, pfn1_end_vec));
- 		const uint32x4_t cmp0 = vandq_u32(vcgeq_u32(addr_vec, pfn0_vec), vcleq_u32(addr_vec, pfn0_end_vec));
-
- 		const uint32x4_t lanes_enabled = vorrq_u32(vandq_u32(cached1_vec, cmp1), vandq_u32(cached0_vec, cmp0));
-
- 		const uint32x2_t tmp = vorr_u32(vget_low_u32(lanes_enabled), vget_high_u32(lanes_enabled));
- 		if (vget_lane_u32(vpmax_u32(tmp, tmp), 0))
+ 		if (!lanes_enabled.allfalse())
			return true;
	}
- #endif
+
	for (; i < size; i++)
	{
		const u32 mask = cachedTlbs.PageMasks[i];
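
The rewritten loop performs the same per-lane test as the intrinsics it replaces: for each cached TLB entry, is addr inside [PFN, PFN + PageMask] for the even (PFN0) or odd (PFN1) page, with caching enabled for that page. For readers less familiar with GSVector, a rough side-by-side sketch of the old and new styles of that bound check, assuming an SSE4.1 x86 build inside the PCSX2 tree; the helper names are hypothetical, and only operations that already appear in this diff are used:

// Hypothetical helpers contrasting the removed intrinsic style with the
// GSVector4i style used above: does any lane satisfy lo <= addr <= lo + mask?
#include "GS/GSVector.h" // PCSX2-internal header, added by this change
#include <immintrin.h>   // raw-intrinsic version (SSE4.1 for _mm_testz_si128)

static bool AnyLaneInRange_SSE(__m128i addr, __m128i lo, __m128i mask)
{
	const __m128i hi = _mm_add_epi32(lo, mask);
	// addr >= lo, written as (addr > lo) | (addr == lo)
	const __m128i ge = _mm_or_si128(_mm_cmpgt_epi32(addr, lo), _mm_cmpeq_epi32(addr, lo));
	// addr <= hi, written as (hi > addr) | (hi == addr)
	const __m128i le = _mm_or_si128(_mm_cmpgt_epi32(hi, addr), _mm_cmpeq_epi32(hi, addr));
	const __m128i in = _mm_and_si128(ge, le);
	return !_mm_testz_si128(in, in); // true if any lane is non-zero
}

static bool AnyLaneInRange_GSVector(const GSVector4i& addr, const GSVector4i& lo, const GSVector4i& mask)
{
	// ge32/le32 yield per-lane masks; allfalse() is true when every lane is zero.
	const GSVector4i in = addr.ge32(lo) & addr.le32(lo + mask);
	return !in.allfalse();
}

One subtlety carried over from the old x86 path: the epi32-style comparisons are signed, while the removed NEON path used unsigned compares (vcgeq_u32/vcleq_u32), so the two back-ends were not strictly identical for addresses at or above 0x80000000; the GSVector version inherits whichever signedness ge32/le32 wrap.
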
@@ -637,7 +574,7 @@ static void TAKES_R128 vtlbUnmappedVWriteLg(u32 addr, r128 data) { vtlb_Miss(add
template <typename OperandType>
static OperandType vtlbUnmappedPReadSm(u32 addr) {
	vtlb_BusError(addr, 0);
- 	if (!CHECK_EEREC && CHECK_CACHE && CheckCache(addr)){
+ 	if (!CHECK_EEREC && CHECK_CACHE && CheckCache(addr)){
		switch (sizeof(OperandType)) {
			case 1: return readCache8(addr, false);
			case 2: return readCache16(addr, false);
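
The hunk above appears to change only the formatting of the condition, but the surrounding template is worth a note: the switch is taken on sizeof(OperandType), a compile-time constant, so each instantiation of vtlbUnmappedPReadSm effectively keeps a single readCacheN call. A minimal stand-alone sketch of that dispatch pattern; the reader functions below are hypothetical stand-ins, not the real readCache8/readCache16:

#include <cstdint>
#include <cstdio>

// Hypothetical stand-ins for the width-specific cache readers.
static uint8_t  readStub8(uint32_t addr)  { return static_cast<uint8_t>(addr); }
static uint16_t readStub16(uint32_t addr) { return static_cast<uint16_t>(addr); }
static uint32_t readStub32(uint32_t addr) { return addr; }

template <typename OperandType>
static OperandType readDispatch(uint32_t addr)
{
	// sizeof(OperandType) is a compile-time constant, so the optimizer
	// reduces this switch to a single call per template instantiation.
	switch (sizeof(OperandType))
	{
		case 1: return static_cast<OperandType>(readStub8(addr));
		case 2: return static_cast<OperandType>(readStub16(addr));
		default: return static_cast<OperandType>(readStub32(addr));
	}
}

int main()
{
	// readDispatch<uint16_t> folds to the 16-bit stub; prints 22136 (0x5678).
	std::printf("%u\n", static_cast<unsigned>(readDispatch<uint16_t>(0x12345678u)));
	return 0;
}
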