[펌] FastCode

2004. 4. 28. 18:03

No, 215
이름:wodawoda (wodawoda@hitel.net) ( 남 )
레벨:기타
2002/6/5(수)
조회:571
평가:

Fast Code Library

안녕하세요. 이 자료는 각종 3D 엔진들에서 "빠르다"는 코드들을 모아 둔 것입니다. 3D 프로그래밍을 하시는 분에게 도움이 될까 하고 올립니다.

코드의 출처는 아래와 같습니다.

- Titan Engine (http://talika.fie.us.es/~titan/)
- Fast Game Programming (http://members.aol.com/form1/index.html)
- GameDev.net Forums (http://www.gamedev.net)
- nVidia Developer Section (http://www.nvidia.com/developer)
- Paul Hsieh's sqrt (www.azillionmonkeys.com/qed/sqroot.html)
- Golgotha Engine
- Unreal Engine (http://www.unreal.com/index2.html)
- Quake 3 Engine (www.idsoftware.com/archives/quake3arc.html)

여기서부터....^^........................................................................................

#ifndef __FAST_CODE_H_INCLUDED__
#define __FAST_CODE_H_INCLUDED__
#pragma warning (disable : 4035)
#include <windows.h>
#include <float.h>
#include <math.h>

////////////////////////////////////////////////////////////////////////
// Function prototypes
////////////////////////////////////////////////////////////////////////

// Forward declarations for every function defined in this header, so the
// definitions below may call each other regardless of their order.

// Fixed-point and memory helpers
__inline int FixMul(int a, int b);
__inline void FastMemZero(void *Dest, int Count);
__inline void FastMemCpy(void *Dest, const void *Src, int Count);
// Square roots and reciprocal square roots
__inline float __fastcall InverseSqrt(float a);
__inline int isqrt0 (unsigned long r);
__inline float RSqrt(float number);
__forceinline float fastsqrt(float n);
__inline double __fastcall Inv_Sqrt(double x);
// Vector helpers
__forceinline float __cdecl DotProduct(const float v1[3], const float v2[3]);
__inline void FastNormVect2(float *v);
__inline void FastNormVect3(float *v);
// Integer helpers
__inline DWORD Log2(DWORD val);
__inline DWORD NextPowerOfTwo(DWORD N);
// x87 instruction wrappers
__forceinline float __fastcall FastCos(float a);
__forceinline float __fastcall FastSin(float a);
__forceinline float __fastcall FastAbs(float a);
// Float -> integer conversions
__forceinline int asmifloor(float f);
__forceinline int __stdcall ifloor(float  x);
__forceinline BYTE __stdcall FloatToByte (float  x);
__forceinline int __stdcall FloatToIntRet(float  x);
__forceinline int RoundFloatToInt (float f);
__forceinline void FloatToInt(int *int_pointer, float f);
// Fills the lookup table that fastsqrt() reads
void BuildSqrtTable();

////////////////////////////////////////////////////////////////////////
// Casting - Taken from the www.gamedev.net forums and www.nvidia.com
////////////////////////////////////////////////////////////////////////

// Fast float to int conversion, NEVER cast with (int) when
// performance has any importance
//
// Converts f to an integer and stores it through int_pointer.
//   int_pointer - destination for the converted value
//   f           - value to convert
// The value is loaded onto the x87 stack, rounded to an integral value
// with FRNDINT and written out with FISTP.
// NOTE(review): FRNDINT/FISTP round according to the current x87 rounding
// mode, whereas a C (int) cast truncates toward zero, so results can differ
// from a cast for non-integral inputs - confirm callers expect rounding.
__forceinline void FloatToInt(int *int_pointer, float f)
{
__asm  fld f
__asm  mov edx, int_pointer
__asm  FRNDINT
__asm  fistp dword ptr [edx];
}

// Round a floating point number to an integer and return it.
// The value is loaded onto the x87 stack and stored to a local with FISTP.
// NOTE(review): FISTP rounds using the current x87 rounding mode. With the
// default round-to-nearest-even mode, (int + .5) is NOT always rounded to
// (int + 1): e.g. 2.5 becomes 2 while 3.5 becomes 4 - confirm the rounding
// mode the callers run with.
__forceinline int RoundFloatToInt (float f)
{
int i;
__asm fld [f]
__asm fistp [i]
return i;
}

// Doesn't take the pointer, is a bit faster
//
// Returns x converted to int: the value is loaded onto the x87 stack and
// stored to the local t with FISTP, then t is returned.
// NOTE(review): the conversion follows the current x87 rounding mode rather
// than truncating like a C (int) cast - confirm this is what callers want.
__forceinline int __stdcall FloatToIntRet(float  x)
{
int    t;
__asm  fld   x
__asm  fistp t
return t;
}

// Casting floats to unsigned chars is also very expensive, just
// NEVER cast with (unsigned char)
//
// Adds the bias 0xC00000 (1.5 * 2^23) to x, stores the sum as a float and
// returns the first byte of that float's in-memory representation.
// NOTE(review): this presumably relies on a little-endian IEEE-754 float so
// that the first byte holds the low mantissa bits, and on x being within
// 0..255 - confirm against callers.
__forceinline BYTE __stdcall FloatToByte(float  x)
{
    union
    {
        float value;
        BYTE  bytes[sizeof(float)];
    } biased;

    biased.value = x + (float) 0xC00000;

    return biased.bytes[0];
}

////////////////////////////////////////////////////////////////////////
// Floating point arithmetic - Taken from the www.gamedev.net forums and
// the Titan engine
////////////////////////////////////////////////////////////////////////

// Fast floor() for (x >= 0) && (x < 2^31). MUCH faster than the normal
// floor()
//
// Works on the raw bit pattern of x:
//   e - how far the mantissa must be shifted right, derived from the
//       biased exponent field ((0x7F + 31) - exponent)
//   m - the mantissa moved to the top of the word with the implicit
//       leading 1 restored (the sign bit is shifted out and ignored)
// Returns m >> e. When e >= 32 (x < 1.0, or an exponent too large so that
// the unsigned subtraction wraps) the result is 0; this case is handled
// explicitly because shifting a 32-bit value by 32 or more is undefined
// in C.
__forceinline int __stdcall ifloor(float  x)
{
    union
    {
        float f;
        DWORD u;
    } bits;
    DWORD e;
    DWORD m;

    bits.f = x;

    e = (0x7F + 31) - ((bits.u & 0x7F800000) >> 23);
    if (e >= 32)
        return 0;

    m = 0x80000000 | (bits.u << 8);

    return (int) (m >> e);
}

// Converts to integer equal to or less than, asm version
//
// Loads f onto the x87 stack, subtracts 0.5 and converts the result to an
// integer with FISTP, which is then returned.
// NOTE(review): FISTP uses the current x87 rounding mode. Under the default
// round-to-nearest-even mode an exact odd integer input such as 3.0 becomes
// 2.5 after the subtraction and would be stored as 2, not 3 - confirm
// whether callers can pass exact integer values.
__forceinline int asmifloor(float f)
{
static float Half = 0.5;
int i;
__asm fld [f]
__asm fsub [Half]
__asm fistp [i]
return i;
}

// Asm version of fabs()
//
// Returns the absolute value of a using the x87 FABS instruction.
// The argument is addressed by name and the result is returned through a
// local variable, so the compiler generates the stack access and the
// return sequence itself. This keeps the function valid when it is inlined
// into a caller or compiled with a stack frame; a hand-written
// "[esp+4]" / "ret 4" sequence is only correct for an out-of-line function
// without a prologue.
__forceinline float __fastcall FastAbs(float a)
{
    float result;

    __asm fld   a
    __asm fabs
    __asm fstp  result

    return result;
}

// Asm version of sinf()
//
// Returns the sine of a (in radians) using the x87 FSIN instruction.
// The argument is addressed by name and the result is returned through a
// local variable, so the compiler generates the stack access and the
// return sequence itself. This keeps the function valid when it is inlined
// into a caller or compiled with a stack frame; a hand-written
// "[esp+4]" / "ret 4" sequence is only correct for an out-of-line function
// without a prologue.
__forceinline float __fastcall FastSin(float a)
{
    float result;

    __asm fld   a
    __asm fsin
    __asm fstp  result

    return result;
}

// Asm version of cosf()
//
// Returns the cosine of a (in radians) using the x87 FCOS instruction.
// The argument is addressed by name and the result is returned through a
// local variable, so the compiler generates the stack access and the
// return sequence itself. This keeps the function valid when it is inlined
// into a caller or compiled with a stack frame; a hand-written
// "[esp+4]" / "ret 4" sequence is only correct for an out-of-line function
// without a prologue.
__forceinline float __fastcall FastCos(float a)
{
    float result;

    __asm fld   a
    __asm fcos
    __asm fstp  result

    return result;
}

// Allows you to set your FPU to single precision mode and back.
// This especially speeds up divisions and square roots. Be careful
// with these instructions, some of the optimized functions won't work
// in single precision mode.
//
// Both macros expand to a complete statement: the trailing semicolon is
// part of the macro, so writing "SET_TO_SINGLE_PRECISION;" yields an extra
// empty statement and the macros cannot be the unbraced body of an
// if that is followed by else.
// NOTE(review): SET_TO_DOUBLE_PRECISION passes _CW_DEFAULT with the mask
// 0xfffff, which looks like it resets more than just the precision field -
// confirm against the _controlfp documentation.
#define SET_TO_SINGLE_PRECISION _controlfp(_PC_24, MCW_PC);
#define SET_TO_DOUBLE_PRECISION _controlfp(_CW_DEFAULT, 0xfffff);

////////////////////////////////////////////////////////////////////////
// Integer arithmetic - Credits to the Titan and Unreal Engine
// (http://talika.fie.us.es/~titan/)
////////////////////////////////////////////////////////////////////////

// Find the smallest power of 2 that is >= N. (Unreal engine)
//
// Returns 0 when N is 0, and also when no power of two >= N can be
// represented in a DWORD. For every N up to 65536 the result is the same
// as the former unrolled comparison chain; larger values are now handled
// as well instead of returning 0.
__inline DWORD NextPowerOfTwo(DWORD N)
{
    DWORD power;

    if (N == 0)
        return 0;

    // power holds a single set bit; it becomes 0 once that bit has been
    // shifted out of the DWORD, which ends the search.
    for (power = 1; power != 0; power <<= 1)
    {
        if (N <= power)
            return power;
    }

    return 0;
}

// Fast logarithm: integer base-2 logarithm of val, rounded down.
// Counts how many times val can be halved while it is still greater
// than 1, so both 0 and 1 map to 0.
__inline DWORD Log2(DWORD val)
{
    DWORD exponent;

    for (exponent = 0; val > 1; val >>= 1)
        ++exponent;

    return exponent;
}

////////////////////////////////////////////////////////////////////////
// Vector math
////////////////////////////////////////////////////////////////////////

// Take the dot product of the two vectors. Is sometimes faster than
// the optimized asm dot product.
// v1 and v2 are 3-component vectors; each argument is parenthesized so
// that expressions such as "base + offset" can be passed safely.
// Both arguments are evaluated three times - avoid side effects.
#define DOT(v1, v2) \
    ((v1)[0] * (v2)[0] + (v1)[1] * (v2)[1] + (v1)[2] * (v2)[2])

// Take the cross product of the two vectors: out = v1 x v2.
// Components are indexed 0, 1, 2 (x, y, z), matching DOT, SUB and ADD.
// The expansion is a single braced block, so the macro behaves as one
// statement when used as the body of an if or a loop.
// out must not be the same array as v1 or v2: out[0] is written before
// the remaining components are read. Arguments are evaluated several
// times - avoid side effects.
#define CROSS(out, v1, v2) \
{ \
    (out)[0] = (v1)[1] * (v2)[2] - (v1)[2] * (v2)[1]; \
    (out)[1] = (v1)[2] * (v2)[0] - (v1)[0] * (v2)[2]; \
    (out)[2] = (v1)[0] * (v2)[1] - (v1)[1] * (v2)[0]; \
}

// Subtract the two vectors: dest = v1 - v2 (3 components).
// The expansion is a single braced block, so the macro behaves as one
// statement when used as the body of an if or a loop, and every argument
// is parenthesized so pointer expressions can be passed.
// Arguments are evaluated three times - avoid side effects.
#define SUB(dest, v1, v2) \
{ \
    (dest)[0] = (v1)[0] - (v2)[0]; \
    (dest)[1] = (v1)[1] - (v2)[1]; \
    (dest)[2] = (v1)[2] - (v2)[2]; \
}

// Add the two vectors: dest = v1 + v2 (3 components).
// The expansion is a single braced block, so the macro behaves as one
// statement when used as the body of an if or a loop, and every argument
// is parenthesized so pointer expressions can be passed.
// Arguments are evaluated three times - avoid side effects.
#define ADD(dest, v1, v2) \
{ \
    (dest)[0] = (v1)[0] + (v2)[0]; \
    (dest)[1] = (v1)[1] + (v2)[1]; \
    (dest)[2] = (v1)[2] + (v2)[2]; \
}

// Fast normalization of 3 component vector, in place.
// Scales every component of v by RSqrt(DotProduct(v, v)), i.e. by the
// approximate reciprocal of the vector's length.
// Does not test if the vector has 0 length
__inline void FastNormVect3(float *v)
{
    const float invLength = RSqrt(DotProduct(v, v));
    int axis;

    for (axis = 0; axis < 3; ++axis)
        v[axis] *= invLength;
}

// Fast normalization of 2 component vector, in place.
// Scales both components of v by RSqrt of the squared length, i.e. by
// the approximate reciprocal of the vector's length.
// Does not test if the vector has 0 length
__inline void FastNormVect2(float *v)
{
    const float invLength = RSqrt(v[0] * v[0] + v[1] * v[1]);
    int axis;

    for (axis = 0; axis < 2; ++axis)
        v[axis] *= invLength;
}

// Fast 15 cycle asm dot product, credits to Golgotha
//
// Returns v1[0]*v2[0] + v1[1]*v2[1] + v1[2]*v2[2].
//   v1, v2 - pointers to 3-component float vectors
// The three products are computed on the x87 stack with the loads and
// multiplies interleaved, then summed with two FADDP instructions and
// stored into the local dotret, which is returned.
// The cycle counts in the asm comments are the original author's figures.
__forceinline float __cdecl DotProduct(const float v1[3], const float v2[3])
{
FLOAT dotret;

__asm
{
  mov ecx, v1
  mov eax, v2

  ;optimized dot product  ;15 cycles
  fld dword ptr   [eax+0]     ;starts & ends on cycle 0
  fmul dword ptr  [ecx+0]     ;starts on cycle 1
  fld dword ptr   [eax+4]     ;starts & ends on cycle 2
  fmul dword ptr  [ecx+4]     ;starts on cycle 3
  fld dword ptr   [eax+8]     ;starts & ends on cycle 4
  fmul dword ptr  [ecx+8]     ;starts on cycle 5
  fxch            st(1)       ;no cost
  faddp           st(2),st(0) ;starts on cycle 6, stalls for cycles 7-8
  faddp           st(1),st(0) ;starts on cycle 9, stalls for cycles 10-12
  fstp dword ptr  [dotret]    ;starts on cycle 13, ends on cycle 14
}

return dotret;
}

////////////////////////////////////////////////////////////////////////
// Square root - Credits to the Titan Engine and the Q3 game code
// (http://talika.fie.us.es/~titan/)
////////////////////////////////////////////////////////////////////////

// Fast reciprocal square root (Quake 3 game code)
//
// Returns an approximation of 1 / sqrt(number) for positive number.
// The float's bit pattern is reinterpreted as an integer, turned into an
// initial guess with the constant 0x5f3759df, and then refined with one
// Newton-Raphson step.
// The reinterpretation goes through a union of float and int instead of a
// pointer cast to long: this avoids reading through an incompatible
// pointer type and does not read past the float on targets where long is
// wider than 32 bits. It assumes int and float are both 32 bits wide.
__inline float RSqrt(float number)
{
    union
    {
        float f;
        int   i;
    } conv;
    const float threehalfs = 1.5f;
    const float x2 = number * 0.5f;

    conv.f = number;
    conv.i = 0x5f3759df - (conv.i >> 1);                      // initial guess
    conv.f = conv.f * (threehalfs - (x2 * conv.f * conv.f));  // 1st iteration

    return conv.f;
}

// Fast inverse square root
//
// Approximates 1 / sqrt(a). A first estimate is built with integer
// arithmetic on the bit pattern of a (constant 0be6eb508h, subtract,
// shift right by one); the x87 code then refines it using the constant
// 1.5 and the halved argument, as labelled r1/r2/r3 in the asm comments.
// The result is left in ST(0).
//
// NOTE(review): the body addresses its argument as [esp+4], overwrites it
// in place, stores temporaries below the stack pointer at [esp-8] and
// [esp-12], and returns with its own "ret 4". This presumably requires the
// function to be emitted out-of-line without a compiler-generated prologue
// and with nothing else writing below ESP in between - confirm before
// relying on it, especially since the function is declared __inline.
__inline float __fastcall InverseSqrt(float a)
{
__asm
{
  mov  eax, 0be6eb508h
  mov  DWORD PTR [esp-12],03fc00000h ;  1.5 on the stack
  sub  eax, DWORD PTR [esp+4]; a
  sub  DWORD PTR [esp+4], 800000h ; a/2 a=Y0
  shr  eax, 1     ; firs approx in eax=R0
  mov  DWORD PTR [esp-8], eax

  fld  DWORD PTR [esp-8] ;r
  fmul st, st            ;r*r
  fld  DWORD PTR [esp-8] ;r
  fxch st(1)
  fmul DWORD PTR [esp+4];a ;r*r*y0
  fld  DWORD PTR [esp-12];load 1.5
  fld  st(0)
  fsub st,st(2)      ;r1 = 1.5 - y1
  ;x1 = st(3)
  ;y1 = st(2)
  ;1.5 = st(1)
  ;r1 = st(0)

  fld  st(1)
  fxch st(1)
  fmul st(3),st   ; y2=y1*r1*...
  fmul st(3),st   ; y2=y1*r1*r1
  fmulp st(4),st            ; x2=x1*r1
  fsub st,st(2)               ; r2=1.5-y2
  ;x2=st(3)
  ;y2=st(2)
  ;1.5=st(1)
  ;r2 = st(0)

  fmul st(2),st   ;y3=y2*r2*...
  fmul st(3),st   ;x3=x2*r2
  fmulp st(2),st   ;y3=y2*r2*r2
  fxch st(1)
  fsubp st(1),st   ;r3= 1.5 - y3
  ;x3 = st(1)
  ;r3 = st(0)
  fmulp st(1), st
  ret 4
}
}

// Another fast inverse square root
// Copyright (C) 1997 by Vesa Karvonen. All rights reserved.
// Use freely as long as my copyright is retained.
//
// Double-precision approximation of 1 / sqrt(x). An initial estimate is
// formed from the upper 32 bits of x (constant 0BFCDD6A1h, subtract, shift
// right by one) and then refined by three multiply/subtract rounds against
// the constant 1.5, which is pushed on the stack.
// The function contains no C return statement and no ret instruction: the
// result is left in ST(0) when the asm block ends.
//
// NOTE(review): the argument is read and rewritten through ESP-relative
// addresses ([esp+8], [esp+12]) and the code pushes two dwords that it
// later removes with "add esp,4" and "pop eax". This presumably requires
// the function to be compiled out-of-line with no prologue - confirm.
// The asm comments also state that the low 32 bits of the estimate come
// from uninitialized stack and that the argument is assumed to be 64-bit
// aligned. Cycle annotations are the original author's.
__inline double __fastcall Inv_Sqrt(double x)
{
__asm
{
  ; I'm assuming that the argument is aligned to a 64-bit boundary.

        mov     eax,0BFCDD6A1h  ; 1u    Constant from James Van Buskirk
        mov     edx,[esp+8]     ; 1v    Potential pms.
        sub     eax,edx         ; 2u
        push    03FC00000h      ; 2v    Constant 1.5, aligns stack
        shr     eax,1           ; 3u
        sub     edx,000100000h  ; 3v    =.5*x, biased exponent must > 1
        mov     [esp+12],edx    ; 4u
        push    eax             ; 4v

  ; The lower 32-bits of the estimate come from uninitialized stack.

        fld     QWORD PTR [esp-4]       ; 5     Potential pms
        fmul    st,st                   ; 6-8
        fld     QWORD PTR [esp-4]       ; 7
        fxch    st(1)                   ; 7x
        fmul    QWORD PTR [esp+12]      ; 9-11  Potential pms
        fld     DWORD PTR [esp+4]       ; 10
        add     esp,4                   ; 12    Faster on Pro/PII
        fsub    st,st(1)                ; 12-14

  fmul st(1),st ; 15-17
  fmul st(1),st ; 18-20
  fld DWORD PTR [esp] ; 19
  fxch st(1)  ; 19
  fmulp st(2),st ; 20-22
  fsub st,st(1) ; 21-23

  fmul st(1),st ; 24-26
  fmul st(1),st ; 27-29
  fld DWORD PTR [esp] ; 28
  fxch st(1)  ; 28
  fmulp st(2),st ; 29-31
  fsub st,st(1) ; 30-32

  fmul st(1),st ; 33-35
  pop eax  ; 34
  fmul st(1),st ; 36-38
  fld DWORD PTR [esp] ; 37
  fxch st(1)  ; 37
  fmulp st(2),st ; 38-40
  fsubrp st(1),st ; 39-41
  fmulp   st(1),st ; 42-44
}
}

// Fast integer square root. Doesn't seem to be any faster than the
// floating-point fastsqrt() from nVidia. Code from
// http://www.azillionmonkeys.com/qed/sqroot.html
//
// Returns an integer square root of r.
// Steps:
//   1. Convert r to float (rr) and build a first reciprocal-square-root
//      estimate from rr's bit pattern (constant 0xbe6f0000, shift by one).
//   2. Refine the estimate with x = 1.5*x - x^3 * (rr/2); a second
//      refinement is applied only when r > 101123.
//   3. Multiply by rr to turn 1/sqrt(rr) into sqrt(rr) and convert to an
//      integer with FISTP.
//   4. Subtract one if the square of the result exceeds r.
// NOTE(review): the bit-level accesses go through unsigned long pointers
// and FISTP stores into a long, so this presumably expects long to be
// 32 bits wide - confirm for the target compiler.
__inline int isqrt0 (unsigned long r)
{
double x, y;
float rr;
float tempf;
long is;

    rr = (float) r;
    y = rr * 0.5;
    // First estimate, built directly in the bits of tempf
    * (unsigned long *) &tempf = (0xbe6f0000 -* (unsigned long *) &rr) >> 1;
    x = tempf;
    x = (1.5 * x) - (x * x) * (x * y);

    // Larger inputs get a second refinement step
    if (r > 101123)
        x = (1.5 * x ) - (x * x) * (x * y);

    x *= rr;

    // Convert x to an integer using the FPU
    _asm
{
        fld x
        fistp is
    }

    // (r - is*is) is negative when is is one too large; the shifted sign
    // then adds -1 to is
    is += (((signed int) (r - is * is)) >> 31);

    return is;
}

////////////////////////////////////////////////////////////////////////
// Square root with lookup table (http://www.nvidia.com/developer)
////////////////////////////////////////////////////////////////////////

// Accesses the storage of the float lvalue fp as a DWORD, for reading or
// writing its raw bit pattern.
#define FP_BITS(fp) (* (DWORD *) &(fp))

// Gives access to the same storage either as a float (f) or as an
// unsigned integer bit pattern (i).
typedef union FastSqrtUnion
{
float f;
unsigned int i;
} FastSqrtUnion;

// Lookup table with 0x10000 entries, filled by BuildSqrtTable() and
// read by fastsqrt().
static unsigned int iFastSqrtTable[0x10000];

// Build the square root table
//
// Fills iFastSqrtTable with the mantissa bits of sqrt() for 0x8000
// mantissa patterns, once for each of two exponents:
//   entries 0x8000..0xFFFF - exponent 0 (stored as 127)
//   entries 0x0000..0x7FFF - exponent 1 (stored as 128)
void BuildSqrtTable()
{
    FastSqrtUnion bits;
    unsigned int  mant;

    for (mant = 0; mant < 0x8000; ++mant)
    {
        // Float with mant as the upper mantissa bits and an exponent
        // of 0 (stored as 127); keep only the mantissa of its root
        bits.i = (mant << 8) | (0x7F << 23);
        bits.f = (float) sqrt(bits.f);
        iFastSqrtTable[mant + 0x8000] = bits.i & 0x7FFFFF;

        // Same mantissa with an exponent of 1 (stored as 128)
        bits.i = (mant << 8) | (0x80 << 23);
        bits.f = (float) sqrt(bits.f);
        iFastSqrtTable[mant] = bits.i & 0x7FFFFF;
    }
}

__forceinline float fastsqrt(float n)
{
// Check for square root of 0
if (FP_BITS(n) == 0)
  return 0.0;

FP_BITS(n) = iFastSqrtTable[(FP_BITS(n) >> 8) & 0xFFFF] |
  ((((FP_BITS(n) - 0x3F800000) >> 1) +
  0x3F800000) & 0x7F800000);

return n;
}

////////////////////////////////////////////////////////////////////////
// Memory operation - Credits to the Unreal Engine
////////////////////////////////////////////////////////////////////////

// Most of the time, these functions seem to be slower than the one of
// MSVC's runtime library. Compare the speed before you use them !

// Fast memory copy
//
// Copies Count bytes from Src to Dest.
//   Dest  - destination buffer
//   Src   - source buffer
//   Count - number of bytes to copy
// The bulk is copied as Count / 4 dwords with "rep movsd"; the remaining
// Count % 4 bytes are copied with "rep movsb".
// NOTE(review): Count is shifted with shr, so a negative Count would
// presumably be treated as a very large unsigned length; the string
// instructions also presumably expect the direction flag to be clear and
// non-overlapping buffers - confirm against callers.
__inline void FastMemCpy(void *Dest, const void *Src, int Count)
{
__asm
{
  mov  ecx, Count
  mov  esi, Src
  mov  edi, Dest
  mov     ebx, ecx
  shr     ecx, 2
  and     ebx, 3
  rep     movsd
  mov     ecx, ebx
  rep     movsb
}
}

// Fast memory zero
//
// Sets Count bytes starting at Dest to zero.
//   Dest  - buffer to clear
//   Count - number of bytes to clear
// EAX is zeroed, then Count / 4 dwords are written with "rep stosd" and
// the remaining Count % 4 bytes with "rep stosb".
// NOTE(review): Count is shifted with shr, so a negative Count would
// presumably be treated as a very large unsigned length; the string
// instructions also presumably expect the direction flag to be clear -
// confirm against callers.
__inline void FastMemZero(void *Dest, int Count)
{
// FIXME: if dest is unaligned, that wont be very fast
__asm
{
  mov  ecx, [Count]
  mov  edi, [Dest]
  xor     eax, eax
  mov  ebx, ecx
  shr  ecx, 2
  and  ebx, 3
  rep     stosd
  mov     ecx, ebx
  rep     stosb
}
}

////////////////////////////////////////////////////////////////////////
// Fixed point math - Credits to Fast Game Programming
// (http://members.aol.com/form1/index.html)
////////////////////////////////////////////////////////////////////////

// Fixed point type with 8 fractional bits: the value v is stored as
// v * 256.
typedef long fixed;      // Our new fixed point type.

// Conversions. ftofx/dtofx only scale their argument; the result keeps
// the floating point type of x until it is assigned or cast to fixed.
#define itofx(x) ((x) << 8)    // Integer to fixed point
#define ftofx(x) ((x) * 256)   // Float to fixed point
#define dtofx(x) ((x) * 256)   // Double to fixed point
#define fxtoi(x) ((x) >> 8)    // Fixed point to integer
#define fxtof(x) ((float) (x) / 256) // Fixed point to float
#define fxtod(x) ((double) (x) / 256) // Fixed point to double

// Arithmetic. Note the argument order of divfx: it yields y / x.
// Every argument is parenthesized so that arbitrary expressions can be
// passed; the intermediate results are not widened, so large operands can
// overflow.
#define mulfx(x, y) (((y) * (x)) >> 8) // Multiply a fixed by a fixed
#define divfx(x, y) (((y) << 8) / (x))  // Divide a fixed (y) by a fixed (x)

// Taken from the Poly Engine source
//
// Fixed point multiply: returns the 64-bit signed product of a and b
// shifted right by 16 bits (low 32 bits of the shifted value).
// Note that this uses 16 fractional bits, whereas the itofx/mulfx macros
// above use 8 - the two are not interchangeable.
// IMUL leaves the product in EDX:EAX; SHRD then shifts EAX right by 16
// while filling from EDX, and the result is written back into a.
__inline int FixMul(int a, int b)
{
_asm
{
  xor edx, edx
  mov eax, a
  mov ebx, b
  imul ebx
  shrd eax, edx, 16
  mov a, eax
}

return a;
}

#endif // __FAST_CODE_H_INCLUDED__

genie: 옷. 감사합니다. 잘 쓰겠습니다....만 이거 상용 게임에 써도 되나요? [06/06-01:57]
ichiroh: 오오~ 짝짝짝.... [06/06-11:31]
아노아: + _+) 오옷 [06/07-01:05]
ccash: #define 을 사용한 몇가지는 약간 위험하지 않을까.. 생각됩니다.
CROSS 같은 경우에는,
#define CROSS(out, v1, v2) \
{ out[x] = v1[y] * v2[z] - v1[z] * v2[y]; \
out[y] = v1[z] * v2[x] - v1[x] * v2[z]; \
out[z] = v1[x] * v2[y] - v1[y] * v2[x]; }
와 같이 앞뒤에 { } 를 붙여 주는 것이 안전하지 않을까요 ? ^^;
[06/07-18:48]
ccash: 예를 들면, 만약..
if (DOT(v1, v2) > 0)
CROSS(v3, v1,v2);
이렇게 작성하면,
out[y] = v1[z] * v2[x] - v1[x] * v2[z];
out[z] = v1[x] * v2[y] - v1[y] * v2[x];
이 두개의 문장은 if 조건문과 상관없이 무조건 실행되니까요. 실제로 이
런 실수를 굉장히 많이 하기 때문에..
[06/07-18:52]
puru: 오훙.. 좋군요. 근데.. 이런거 쓸곳은..엔진 내부 약간뿐이고, 일반적인경우에는 절대 쓰지 않는다는걸 초보분들은 아셔야 될듯하네여..어셈을 잘 모르신다면 더더욱.. 나중에 황당한 버그가 종종 생깁니다. [06/09-09:37]

'KB > 기타' 카테고리의 다른 글

get all the fact (windows의 역습) (0)	2004.06.24
인터넷 사이트 분석 관련 (0)	2004.06.15
UltraEdit-32 와 ctags 사용 (0)	2004.03.20
div 절대좌표로 태그로 놓기 (0)	2004.03.19
[db] mssql에서 트리거 (0)	2004.03.19

broneri homebrew story

[펌] FastCode

'KB > 기타' 카테고리의 다른 글

+ Recent posts

티스토리툴바