Skip to content

Commit

Permalink
Replace gluProjectf/gluUnprojectf with own versions. Also changed gGL…
Browse files Browse the repository at this point in the history
…ModelView/gGLProjection and other related matrices to LLMatrix4a.
  • Loading branch information
Shyotl committed Jun 6, 2014
1 parent ee60a98 commit 24ca32f
Show file tree
Hide file tree
Showing 26 changed files with 453 additions and 310 deletions.
29 changes: 29 additions & 0 deletions LICENSES/LEGAL-intel_matrixlib.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
INTEL LICENSE AGREEMENT

IMPORTANT - READ BEFORE COPYING OR USING.
Do not use or load this library and any associated materials (collectively,
the "Software") until you have read the following terms and conditions. By
loading or using the Software, you agree to the terms of this Agreement. If
you do not wish to so agree, do not use the Software.

LICENSE: Subject to the restrictions below, Intel Corporation ("Intel")
grants to you the permission to use, copy, distribute and prepare derivative
works of this Software for any purpose and without fee, provided, that
Intel's copyright notice appear in all copies of the Software files.
The distribution of derivative works of the Software is also subject to the
following limitations: you (i) are solely responsible to your customers for
any liability which may arise from the distribution, (ii) do not make any
statement that your product is "certified", or that its performance is
guaranteed, by Intel, and (iii) do not use Intel's name or trademarks to
market your product without written permission.

EXCLUSION OF ALL WARRANTIES. The Software is provided "AS IS" without any
express or implies warranty of any kind including warranties of
merchantability, noninfringement, or fitness for a particular purpose.
Intel does not warrant or assume responsibility for the accuracy or
completeness of any information contained within the Software.
As this Software is given free of charge, in no event shall Intel be liable
for any damages whatsoever arising out of the use of or inability to use the
Software, even if Intel has been adviced of the possibility of such damages.
Intel does not assume any responsibility for any errors which may appear in
this Software nor any responsibility to update it.
160 changes: 159 additions & 1 deletion indra/llmath/llmatrix4a.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@ class LLMatrix4a
public:
LL_ALIGN_16(LLVector4a mMatrix[4]);

// Returns the float pointer of the first row (mMatrix[0]); the remaining
// rows follow it in the mMatrix array.
inline F32* getF32ptr()
{
	LLVector4a& first_row = mMatrix[0];
	return first_row.getF32ptr();
}

inline void clear()
{
mMatrix[0].clear();
Expand All @@ -44,13 +49,21 @@ class LLMatrix4a
mMatrix[3].clear();
}

// Sets this matrix to the 4x4 identity.
//
// The constants are plain locals rather than a function-local static: a
// static with a dynamic initializer is not initialized thread-safely on
// pre-C++11 compilers and costs a guard check per call on newer ones, while
// building the two registers directly is just as cheap.
inline void setIdentity()
{
	const __m128 zero = _mm_setzero_ps();
	// _mm_set_ps takes lanes high-to-low, so diag holds {1, 0, 0, 1}.
	const __m128 diag = _mm_set_ps(1.f, 0.f, 0.f, 1.f);

	mMatrix[0] = _mm_movelh_ps(diag, zero);	// {1, 0, 0, 0}
	mMatrix[1] = _mm_movehl_ps(zero, diag);	// {0, 1, 0, 0}
	mMatrix[2] = _mm_movelh_ps(zero, diag);	// {0, 0, 1, 0}
	mMatrix[3] = _mm_movehl_ps(diag, zero);	// {0, 0, 0, 1}
}

// Copies the four rows of src into this matrix using unaligned loads, so
// src does not need to be 16-byte aligned.
inline void loadu(const LLMatrix4& src)
{
	for (int row = 0; row < 4; ++row)
	{
		mMatrix[row] = _mm_loadu_ps(src.mMatrix[row]);
	}
}

inline void loadu(const LLMatrix3& src)
Expand All @@ -61,6 +74,14 @@ class LLMatrix4a
mMatrix[3].set(0,0,0,1.f);
}

// Loads 16 consecutive floats starting at src, four per row, using
// unaligned loads (src needs no particular alignment).
inline void loadu(const F32* src)
{
	for (int row = 0; row < 4; ++row)
	{
		mMatrix[row] = _mm_loadu_ps(src + 4 * row);
	}
}

inline void add(const LLMatrix4a& rhs)
{
mMatrix[0].add(rhs.mMatrix[0]);
Expand All @@ -84,6 +105,14 @@ class LLMatrix4a
mMatrix[3].setMul(m.mMatrix[3], s);
}

inline void setMul(const LLMatrix4a& m0, const LLMatrix4a& m1)
{
m0.rotate4(m1.mMatrix[0],mMatrix[0]);
m0.rotate4(m1.mMatrix[1],mMatrix[1]);
m0.rotate4(m1.mMatrix[2],mMatrix[2]);
m0.rotate4(m1.mMatrix[3],mMatrix[3]);
}

inline void setLerp(const LLMatrix4a& a, const LLMatrix4a& b, F32 w)
{
LLVector4a d0,d1,d2,d3;
Expand Down Expand Up @@ -158,6 +187,135 @@ class LLMatrix4a
z.add(mMatrix[3]);
res.setAdd(x,z);
}

// Transposes this matrix in place (rows become columns).
inline void transpose()
{
	// Interleave each pair of rows. With rows a,b,c,d:
	//   lo01 = {a0,b0,a1,b1}   hi01 = {a2,b2,a3,b3}
	//   lo23 = {c0,d0,c1,d1}   hi23 = {c2,d2,c3,d3}
	const __m128 lo01 = _mm_unpacklo_ps(mMatrix[0], mMatrix[1]);
	const __m128 hi01 = _mm_unpackhi_ps(mMatrix[0], mMatrix[1]);
	const __m128 lo23 = _mm_unpacklo_ps(mMatrix[2], mMatrix[3]);
	const __m128 hi23 = _mm_unpackhi_ps(mMatrix[2], mMatrix[3]);

	// Combine the matching halves to form each output row.
	mMatrix[0] = _mm_movelh_ps(lo01, lo23);	// {a0,b0,c0,d0}
	mMatrix[1] = _mm_movehl_ps(lo23, lo01);	// {a1,b1,c1,d1}
	mMatrix[2] = _mm_movelh_ps(hi01, hi23);	// {a2,b2,c2,d2}
	mMatrix[3] = _mm_movehl_ps(hi23, hi01);	// {a3,b3,c3,d3}
}

// Following procedure adapted from:
// http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/
//
// License/Copyright Statement:
//
// Copyright (c) 2001 Intel Corporation.
//
// Permition is granted to use, copy, distribute and prepare derivative works
// of this library for any purpose and without fee, provided, that the above
// copyright notice and this statement appear in all copies.
// Intel makes no representations about the suitability of this library for
// any purpose, and specifically disclaims all warranties.
// See LEGAL-intel_matrixlib.TXT for all the legal information.

// Inverts this matrix in place and returns the determinant computed along
// the way.
//
// Singular input (determinant == 0): the divisor is replaced by 1.0 so no
// division by zero occurs, which means the rows written back are NOT a valid
// inverse. If ZERO_SINGULAR is defined, the reciprocal is forced to 0
// instead. Callers that care should test the returned determinant.
inline float invert()
{
// Sign-bit pattern (+,-,-,+) that is XORed into the broadcast reciprocal
// before the final scaling.
LL_ALIGN_16(const unsigned int Sign_PNNP[4]) = { 0x00000000, 0x80000000, 0x80000000, 0x00000000 };

// The inverse is calculated using a "Divide and Conquer" technique. The
// original matrix is divided into four 2x2 sub-matrices. Since each
// register holds four matrix elements, each of the smaller matrices is
// represented as a single register. Hence we get better locality of the
// calculations.

LLVector4a A = _mm_movelh_ps(mMatrix[0], mMatrix[1]), // the four sub-matrices
B = _mm_movehl_ps(mMatrix[1], mMatrix[0]),
C = _mm_movelh_ps(mMatrix[2], mMatrix[3]),
D = _mm_movehl_ps(mMatrix[3], mMatrix[2]);
LLVector4a iA, iB, iC, iD, // partial inverse of the sub-matrices
DC, AB;
LLSimdScalar dA, dB, dC, dD; // determinant of the sub-matrices
LLSimdScalar det, d, d1, d2;
LLVector4a rd;

// AB = A# * B
AB.setMul(_mm_shuffle_ps(A,A,0x0F), B);
AB.sub(_mm_mul_ps(_mm_shuffle_ps(A,A,0xA5), _mm_shuffle_ps(B,B,0x4E)));
// DC = D# * C
DC.setMul(_mm_shuffle_ps(D,D,0x0F), C);
DC.sub(_mm_mul_ps(_mm_shuffle_ps(D,D,0xA5), _mm_shuffle_ps(C,C,0x4E)));

// dA = |A|
dA = _mm_mul_ps(_mm_shuffle_ps(A, A, 0x5F),A);
dA -= _mm_movehl_ps(dA,dA);
// dB = |B|
dB = _mm_mul_ps(_mm_shuffle_ps(B, B, 0x5F),B);
dB -= _mm_movehl_ps(dB,dB);

// dC = |C|
dC = _mm_mul_ps(_mm_shuffle_ps(C, C, 0x5F),C);
dC -= _mm_movehl_ps(dC,dC);
// dD = |D|
dD = _mm_mul_ps(_mm_shuffle_ps(D, D, 0x5F),D);
dD -= _mm_movehl_ps(dD,dD);

// d = trace(AB*DC) = trace(A#*B*D#*C)
d = _mm_mul_ps(_mm_shuffle_ps(DC,DC,0xD8),AB);

// iD = C*A#*B
iD.setMul(_mm_shuffle_ps(C,C,0xA0), _mm_movelh_ps(AB,AB));
iD.add(_mm_mul_ps(_mm_shuffle_ps(C,C,0xF5), _mm_movehl_ps(AB,AB)));
// iA = B*D#*C
iA.setMul(_mm_shuffle_ps(B,B,0xA0), _mm_movelh_ps(DC,DC));
iA.add(_mm_mul_ps(_mm_shuffle_ps(B,B,0xF5), _mm_movehl_ps(DC,DC)));

// d = trace(AB*DC) = trace(A#*B*D#*C) [continue]
d = _mm_add_ps(d, _mm_movehl_ps(d, d));
d += _mm_shuffle_ps(d, d, 1);
d1 = dA*dD;
d2 = dB*dC;

// iD = D*|A| - C*A#*B
iD.setSub(_mm_mul_ps(D,_mm_shuffle_ps(dA,dA,0)), iD);

// iA = A*|D| - B*D#*C;
iA.setSub(_mm_mul_ps(A,_mm_shuffle_ps(dD,dD,0)), iA);

// det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C)
det = d1+d2-d;

// rd = 1 / det, but where det compares equal to zero the divisor is swapped
// for 1.0 so the division never sees a zero.
__m128 is_zero_mask = _mm_cmpeq_ps(det,_mm_setzero_ps());
rd = _mm_div_ss(_mm_set_ss(1.f),_mm_or_ps(_mm_andnot_ps(is_zero_mask, det), _mm_and_ps(is_zero_mask, _mm_set_ss(1.f))));
#ifdef ZERO_SINGULAR
// Optional policy: clear the low-lane reciprocal when det == 0.
rd = _mm_and_ps(_mm_cmpneq_ss(det,_mm_setzero_ps()), rd);
#endif

// iB = D * (A#B)# = D*B#*A
iB.setMul(D, _mm_shuffle_ps(AB,AB,0x33));
iB.sub(_mm_mul_ps(_mm_shuffle_ps(D,D,0xB1), _mm_shuffle_ps(AB,AB,0x66)));
// iC = A * (D#C)# = A*C#*D
iC.setMul(A, _mm_shuffle_ps(DC,DC,0x33));
iC.sub(_mm_mul_ps(_mm_shuffle_ps(A,A,0xB1), _mm_shuffle_ps(DC,DC,0x66)));

// Broadcast the low-lane reciprocal to all four lanes, then flip the sign of
// lanes 1 and 2 using the Sign_PNNP pattern.
rd = _mm_shuffle_ps(rd,rd,0);
rd = _mm_xor_ps(rd, _mm_load_ps((const float*)Sign_PNNP));

// iB = C*|B| - D*B#*A
iB.setSub(_mm_mul_ps(C,_mm_shuffle_ps(dB,dB,0)), iB);

// iC = B*|C| - A*C#*D;
iC.setSub(_mm_mul_ps(B,_mm_shuffle_ps(dC,dC,0)), iC);


// iX = iX / det
iA.mul(rd);
iB.mul(rd);
iC.mul(rd);
iD.mul(rd);

// Reassemble the four scaled 2x2 blocks into the rows of this matrix.
mMatrix[0] = _mm_shuffle_ps(iA,iB,0x77);
mMatrix[1] = _mm_shuffle_ps(iA,iB,0x22);
mMatrix[2] = _mm_shuffle_ps(iC,iD,0x77);
mMatrix[3] = _mm_shuffle_ps(iC,iD,0x22);

// Returns the first float stored in det.
// NOTE(review): reads LLSimdScalar's storage directly via a pointer cast;
// assumes the scalar value sits at offset 0 of the object - confirm.
return *(float*)&det;
}
};

#endif
1 change: 0 additions & 1 deletion indra/llmath/llvolume.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@
#include "llstl.h"
#include "llsdserialize.h"
#include "llvector4a.h"
#include "llmatrix4a.h"
#include "lltimer.h"

#define DEBUG_SILHOUETTE_BINORMALS 0
Expand Down
1 change: 1 addition & 0 deletions indra/llprimitive/llmodel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include "LLConvexDecomposition.h"
#include "llsdserialize.h"
#include "llvector4a.h"
#include "llmatrix4a.h"
#if LL_MSVC
#pragma warning (push)
#pragma warning (disable : 4068)
Expand Down
12 changes: 7 additions & 5 deletions indra/llrender/llcubemap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include "v3dmath.h"
#include "m3math.h"
#include "m4math.h"
#include "llmatrix4a.h"

#include "llrender.h"
#include "llglslshader.h"
Expand Down Expand Up @@ -265,18 +266,19 @@ void LLCubeMap::setMatrix(S32 stage)
gGL.getTexUnit(stage)->activate();
}

LLVector3 x(gGLModelView+0);
LLVector3 y(gGLModelView+4);
LLVector3 z(gGLModelView+8);
LLVector3 x(gGLModelView.mMatrix[0].getF32ptr());
LLVector3 y(gGLModelView.mMatrix[1].getF32ptr());
LLVector3 z(gGLModelView.mMatrix[2].getF32ptr());

LLMatrix3 mat3;
mat3.setRows(x,y,z);
LLMatrix4 trans(mat3);
LLMatrix4a trans;
trans.loadu(mat3);
trans.transpose();

gGL.matrixMode(LLRender::MM_TEXTURE);
gGL.pushMatrix();
gGL.loadMatrix((F32 *)trans.mMatrix);
gGL.loadMatrix(trans.getF32ptr());
gGL.matrixMode(LLRender::MM_MODELVIEW);

/*if (stage > 0)
Expand Down
2 changes: 0 additions & 2 deletions indra/llrender/llgl.h
Original file line number Diff line number Diff line change
Expand Up @@ -455,8 +455,6 @@ class LLGLSyncFence : public LLGLFence
void wait();
};

extern LLMatrix4 gGLObliqueProjectionInverse;

#include "llglstates.h"

void init_glstates();
Expand Down
11 changes: 6 additions & 5 deletions indra/llrender/llpostprocess.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
#include "llsdutil_math.h"
#include "llvertexbuffer.h"
#include "llfasttimer.h"
#include "llmatrix4a.h"

extern LLGLSLShader gPostColorFilterProgram;
extern LLGLSLShader gPostNightVisionProgram;
Expand Down Expand Up @@ -305,16 +306,16 @@ class LLMotionShader : public LLPostProcessShader
{
addSetting(mStrength);
}
/*virtual*/ bool isEnabled() const { return LLPostProcessShader::isEnabled() && llabs(gGLModelView[0] - gGLPreviousModelView[0]) > .0000001; }
/*virtual*/ bool isEnabled() const { return LLPostProcessShader::isEnabled() && llabs(gGLModelView.getF32ptr()[0] - gGLPreviousModelView.getF32ptr()[0]) > .0000001; }
/*virtual*/ S32 getColorChannel() const { return 0; }
/*virtual*/ S32 getDepthChannel() const { return 1; }
/*virtual*/ QuadType preDraw()
{
glh::matrix4f inv_proj(gGLModelView);
inv_proj.mult_left(gGLProjection);
glh::matrix4f inv_proj(gGLModelView.getF32ptr());
inv_proj.mult_left(gGLProjection.getF32ptr());
inv_proj = inv_proj.inverse();
glh::matrix4f prev_proj(gGLPreviousModelView);
prev_proj.mult_left(gGLProjection);
glh::matrix4f prev_proj(gGLPreviousModelView.getF32ptr());
prev_proj.mult_left(gGLProjection.getF32ptr());

LLVector2 screen_rect = LLPostProcess::getInstance()->getDimensions();

Expand Down
Loading

0 comments on commit 24ca32f

Please sign in to comment.