@@ -338,25 +338,101 @@ extern const quat_t quatIdentity;
338
338
339
339
#define Q_ftol (x ) ((long )(x))
340
340
341
- // Overall relative error bound (ignoring unknown powerpc case): 5 * 10^-6
342
- // https://en.wikipedia.org/wiki/Fast_inverse_square_root#/media/File:2nd-iter.png
343
- inline float Q_rsqrt ( float number )
344
- {
345
- float x = 0 .5f * number;
346
- float y;
341
+ /* The original Q_rsqrt algorithm is:
342
+
343
+ float Q_rsqrt( float n )
344
+ {
345
+ uint32_t magic = 0x5f3759dful;
346
+ float a = 0.5f;
347
+ float b = 3.0f;
348
+ union { float f; uint32_t u; } o = {n};
349
+ o.u = magic - ( o.u >> 1 );
350
+ return a * o.f * ( b - n * o.f * o.f );
351
+ }
352
+
353
+ It could be written like this, this is what Quake 3 did:
354
+
355
+ float Q_rsqrt( float n )
356
+ {
357
+ uint32_t magic = 0x5f3759dful;
358
+ float a = 0.5f;
359
+ float b = 3.0f;
360
+ float c = a * b; // 1.5f
361
+ union { float f; uint32_t u; } o = {n};
362
+ o.u = magic - ( o.u >> 1);
363
+ float x = n * a;
364
+ return o.f * ( c - ( x * o.f * o.f ) );
365
+ o.f *= c - ( x * o.f * o.f );
366
+ // o.f *= c - ( x * o.f * o.f );
367
+ return o.f;
368
+ }
369
+
370
+ It was written with a second iteration commented out.
371
+
372
+ The relative error bound after the initial iteration was: 1.8×10⁻³
373
+ The relative error bound after a second iteration was: 5×10⁻⁶
374
+
375
+ The 0x5f3759df magic constant comes from the Quake 3 source code:
376
+ https://github.com/id-Software/Quake-III-Arena/blob/dbe4ddb/code/game/q_math.c#L56
377
+
378
+ That magic constant was good enough but better ones can be used.
347
379
348
- // compute approximate inverse square root
380
+ Chris lomont computed a better magic constant of 0x5f375a86 while
381
+ keeping the other values of 0.5 and 3.0 for all iterations:
382
+ https://www.lomont.org/papers/2003/InvSqrt.pdf
383
+
384
+ Jan Kadlec computed an ever better magic constant but it requires
385
+ different values for the first iteration: http://rrrola.wz.cz/inv_sqrt.html
386
+
387
+ float Q_rsqrt( float n )
388
+ {
389
+ uint32_t magic = 0x5f1ffff9ul:
390
+ float a = 0.703952253f;
391
+ float b = 2.38924456f;
392
+ union { float f; uint32_t u; } o = {n};
393
+ o.u = magic - ( o.u >> 1 );
394
+ return a * o.f * ( b - n * y.f * y.f );
395
+ }
396
+
397
+ The relative error bound is: 6.50196699×10⁻⁴ */
398
+
399
+ // Compute approximate inverse square root.
400
+ inline float Q_rsqrt_fast ( const float n )
401
+ {
349
402
#if defined(DAEMON_USE_ARCH_INTRINSICS_i686_sse)
350
- // SSE rsqrt relative error bound: 3.7 * 10^-4
351
- _mm_store_ss ( &y, _mm_rsqrt_ss ( _mm_load_ss ( &number ) ) );
403
+ float o;
404
+ // The SSE rsqrt relative error bound is 3.7×10⁻⁴.
405
+ _mm_store_ss ( &o, _mm_rsqrt_ss ( _mm_load_ss ( &n ) ) );
352
406
#else
353
- y = Util::bit_cast<float >( 0x5f3759df - ( Util::bit_cast<uint32_t >( number ) >> 1 ) );
354
- y *= ( 1 .5f - ( x * y * y ) ); // initial iteration
355
- // relative error bound after the initial iteration: 1.8 * 10^-3
407
+ /* Magic constants by Jan Kadlec, with a relative error bound
408
+ of 6.50196699×10⁻⁴.
409
+ See: http://rrrola.wz.cz/inv_sqrt.html */
410
+ constexpr float a = 0 .703952253f ;
411
+ constexpr float b = 2 .38924456f ;
412
+ constexpr uint32_t magic = 0x5f1ffff9ul ;
413
+ float o = Util::bit_cast<float >( magic - ( Util::bit_cast<uint32_t >( n ) >> 1 ) );
414
+ o *= a * ( b - n * o * o );
356
415
#endif
357
- y *= ( 1 .5f - ( x * y * y ) ); // second iteration for higher precision
358
- return y;
359
- }
416
+ return o;
417
+ }
418
+
419
+ inline float Q_rsqrt ( const float n )
420
+ {
421
+ /* When using the magic constants, the relative error bound after the
422
+ iteration is expected to be at most 5×10⁻⁶. It was achieved with the
423
+ less-good Quake 3 constants with a first iteration having originally
424
+ a relative error bound of 1.8×10⁻³.
425
+ Since the new magic constants provide a better relative error bound of
426
+ 6.50196699×10⁻⁴, the relative error bound is now expected to be smaller.
427
+ When using the SSE rsqrt, the initial error bound is 3.7×10⁻⁴ so after
428
+ the iteration it is also expected to be smaller. */
429
+ constexpr float a = 0 .5f ;
430
+ constexpr float b = 3 .0f ;
431
+ float o = Q_rsqrt_fast ( n );
432
+ // Do an iteration of Newton's method for finding the zero of: f(x) = 1÷x² - n
433
+ o *= a * ( b - n * o * o );
434
+ return o;
435
+ }
360
436
361
437
inline float Q_fabs ( float x )
362
438
{
@@ -617,7 +693,7 @@ inline vec_t VectorNormalize( vec3_t v )
617
693
// that length != 0, nor does it return length
618
694
inline void VectorNormalizeFast ( vec3_t v )
619
695
{
620
- vec_t ilength = Q_rsqrt ( DotProduct ( v, v ) );
696
+ vec_t ilength = Q_rsqrt_fast ( DotProduct ( v, v ) );
621
697
622
698
VectorScale ( v, ilength, v );
623
699
}
0 commit comments