Replace gluProjectf/gluUnprojectf with own versions. Also changed gGLModelView/gGLProjection and other related matrices to LLMatrix4a.
siana · Jun 6, 2014 · 24ca32f · 24ca32f
1 parent ee60a98
commit 24ca32f
Show file tree

Hide file tree

Showing 26 changed files with 453 additions and 310 deletions.
diff --git a/LICENSES/LEGAL-intel_matrixlib.txt b/LICENSES/LEGAL-intel_matrixlib.txt
@@ -0,0 +1,29 @@
+INTEL LICENSE AGREEMENT
+
+IMPORTANT - READ BEFORE COPYING OR USING. 
+Do not use or load this library and any associated materials (collectively, 
+the "Software") until you have read the following terms and conditions. By 
+loading or using the Software, you agree to the terms of this Agreement. If 
+you do not wish to so agree, do not use the Software.
+
+LICENSE:  Subject to the restrictions below, Intel Corporation ("Intel") 
+grants to you the permission to use, copy, distribute and prepare derivative 
+works of this Software for any purpose and without fee, provided, that 
+Intel's copyright notice appear in all copies of the Software files.
+The distribution of derivative works of the Software is also subject to the 
+following limitations:  you (i) are solely responsible to your customers for 
+any liability which may arise from the distribution, (ii) do not make any 
+statement that your product is "certified", or that its performance is 
+guaranteed, by Intel, and (iii) do not use Intel's name or trademarks to 
+market your product without written permission.
+
+EXCLUSION OF ALL WARRANTIES. The Software is provided "AS IS" without any 
+express or implies warranty of any kind including warranties of 
+merchantability, noninfringement, or fitness for a particular purpose.  
+Intel does not warrant or assume responsibility for the accuracy or 
+completeness of any information contained within the Software.
+As this Software is given free of charge, in no event shall Intel be liable 
+for any damages whatsoever arising out of the use of or inability to use the 
+Software, even if Intel has been adviced of the possibility of such damages. 
+Intel does not assume any responsibility for any errors which may appear in 
+this Software nor any responsibility to update it.
diff --git a/indra/llmath/llmatrix4a.h b/indra/llmath/llmatrix4a.h
@@ -36,6 +36,11 @@ class LLMatrix4a
 public:
 	LL_ALIGN_16(LLVector4a mMatrix[4]);
 
+	inline F32* getF32ptr()	// Returns a mutable float pointer to the start of the matrix storage (first element of mMatrix[0]).
+	{
+		return mMatrix[0].getF32ptr();	// Rows are stored in the mMatrix[4] array; presumably 16 consecutive F32s are reachable from here -- confirm LLVector4a layout.
+	}
+
 	inline void clear()
 	{
 		mMatrix[0].clear();
@@ -44,13 +49,21 @@ class LLMatrix4a
 		mMatrix[3].clear();
 	}
 
+	inline void setIdentity()	// Sets this matrix to the 4x4 identity.
+	{
+		static __m128 ones = _mm_set_ps(1.f,0.f,0.f,1.f);	// Lanes {1,0,0,1}; _mm_set_ps takes its arguments from highest lane to lowest.
+		mMatrix[0] = _mm_movelh_ps(ones,_mm_setzero_ps());	// {ones[0], ones[1], 0, 0} = {1,0,0,0}
+		mMatrix[1] = _mm_movehl_ps(_mm_setzero_ps(),ones);	// {ones[2], ones[3], 0, 0} = {0,1,0,0}
+		mMatrix[2] = _mm_movelh_ps(_mm_setzero_ps(),ones);	// {0, 0, ones[0], ones[1]} = {0,0,1,0}
+		mMatrix[3] = _mm_movehl_ps(ones,_mm_setzero_ps());	// {0, 0, ones[2], ones[3]} = {0,0,0,1}
+	}
+
 	inline void loadu(const LLMatrix4& src)
 	{
 		mMatrix[0] = _mm_loadu_ps(src.mMatrix[0]);
 		mMatrix[1] = _mm_loadu_ps(src.mMatrix[1]);
 		mMatrix[2] = _mm_loadu_ps(src.mMatrix[2]);
 		mMatrix[3] = _mm_loadu_ps(src.mMatrix[3]);
-
 	}
 
 	inline void loadu(const LLMatrix3& src)
@@ -61,6 +74,14 @@ class LLMatrix4a
 		mMatrix[3].set(0,0,0,1.f);
 	}
 
+	inline void loadu(const F32* src)	// Loads all four rows from 16 consecutive floats at src; src does not need to be 16-byte aligned.
+	{
+		mMatrix[0] = _mm_loadu_ps(src+0);	// Row 0 <- src[0..3] (unaligned load).
+		mMatrix[1] = _mm_loadu_ps(src+4);	// Row 1 <- src[4..7]
+		mMatrix[2] = _mm_loadu_ps(src+8);	// Row 2 <- src[8..11]
+		mMatrix[3] = _mm_loadu_ps(src+12);	// Row 3 <- src[12..15]; src must therefore reference at least 16 readable floats.
+	}
+
 	inline void add(const LLMatrix4a& rhs)
 	{
 		mMatrix[0].add(rhs.mMatrix[0]);
@@ -84,6 +105,14 @@ class LLMatrix4a
 		mMatrix[3].setMul(m.mMatrix[3], s);
 	}
 
+	inline void setMul(const LLMatrix4a& m0, const LLMatrix4a& m1)	// Sets row i of this matrix to the result of m0.rotate4() applied to row i of m1.
+	{
+		m0.rotate4(m1.mMatrix[0],mMatrix[0]);	// NOTE(review): rows of *this are written while m0 is still in use; if this == &m0 the later calls presumably read overwritten rows -- confirm callers never pass *this as m0.
+		m0.rotate4(m1.mMatrix[1],mMatrix[1]);	// Row i of m1 is read before row i of *this is written, so passing *this as m1 looks safe -- TODO confirm against rotate4.
+		m0.rotate4(m1.mMatrix[2],mMatrix[2]);
+		m0.rotate4(m1.mMatrix[3],mMatrix[3]);
+	}
+
 	inline void setLerp(const LLMatrix4a& a, const LLMatrix4a& b, F32 w)
 	{
 		LLVector4a d0,d1,d2,d3;
@@ -158,6 +187,135 @@ class LLMatrix4a
 		z.add(mMatrix[3]);
 		res.setAdd(x,z);
 	}
+
+	inline void transpose()	// Transposes this matrix in place (row i becomes column i).
+	{
+		__m128 q1 = _mm_unpackhi_ps(mMatrix[0],mMatrix[1]);	// {r0[2], r1[2], r0[3], r1[3]}
+		__m128 q2 = _mm_unpacklo_ps(mMatrix[0],mMatrix[1]);	// {r0[0], r1[0], r0[1], r1[1]}
+		__m128 q3 = _mm_unpacklo_ps(mMatrix[2],mMatrix[3]);	// {r2[0], r3[0], r2[1], r3[1]}
+		__m128 q4 = _mm_unpackhi_ps(mMatrix[2],mMatrix[3]);	// {r2[2], r3[2], r2[3], r3[3]}
+
+		mMatrix[0] = _mm_movelh_ps(q2,q3);	// Low halves of q2,q3  -> old column 0
+		mMatrix[1] = _mm_movehl_ps(q3,q2);	// High halves of q2,q3 -> old column 1
+		mMatrix[2] = _mm_movelh_ps(q1,q4);	// Low halves of q1,q4  -> old column 2
+		mMatrix[3] = _mm_movehl_ps(q4,q1);	// High halves of q1,q4 -> old column 3
+	}
+
+//  Following procedure adapted from:
+//		http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/
+//
+//  License/Copyright Statement:
+//		
+//			Copyright (c) 2001 Intel Corporation.
+//
+//		Permition is granted to use, copy, distribute and prepare derivative works 
+//		of this library for any purpose and without fee, provided, that the above 
+//		copyright notice and this statement appear in all copies.  
+//		Intel makes no representations about the suitability of this library for 
+//		any purpose, and specifically disclaims all warranties. 
+//		See LEGAL-intel_matrixlib.TXT for all the legal information.
+	inline float invert()	// Inverts this matrix in place and returns the value computed below as its determinant.
+	{
+		LL_ALIGN_16(const unsigned int Sign_PNNP[4]) = { 0x00000000, 0x80000000, 0x80000000, 0x00000000 };	// Sign-bit mask (+,-,-,+); XORed into rd below.
+
+		// The inverse is calculated using the "Divide and Conquer" technique. The
+		// original matrix is divided into four 2x2 sub-matrices. Since each
+		// register holds four matrix elements, the smaller matrices are
+		// represented as registers. Hence we get better locality of the
+		// calculations.
+
+		LLVector4a A = _mm_movelh_ps(mMatrix[0], mMatrix[1]),    // the four sub-matrices: A = top-left 2x2 (low halves of rows 0,1)
+				B = _mm_movehl_ps(mMatrix[1], mMatrix[0]),	// B = top-right 2x2 (high halves of rows 0,1)
+				C = _mm_movelh_ps(mMatrix[2], mMatrix[3]),	// C = bottom-left 2x2 (low halves of rows 2,3)
+				D = _mm_movehl_ps(mMatrix[3], mMatrix[2]);	// D = bottom-right 2x2 (high halves of rows 2,3)
+		LLVector4a iA, iB, iC, iD,					// partial inverse of the sub-matrices
+				DC, AB;
+		LLSimdScalar dA, dB, dC, dD;                 // determinant of the sub-matrices
+		LLSimdScalar det, d, d1, d2;
+		LLVector4a rd;	// reciprocal of the determinant, later broadcast and sign-adjusted
+
+		//  AB = A# * B
+		AB.setMul(_mm_shuffle_ps(A,A,0x0F), B);
+		AB.sub(_mm_mul_ps(_mm_shuffle_ps(A,A,0xA5), _mm_shuffle_ps(B,B,0x4E)));
+		//  DC = D# * C
+		DC.setMul(_mm_shuffle_ps(D,D,0x0F), C);
+		DC.sub(_mm_mul_ps(_mm_shuffle_ps(D,D,0xA5), _mm_shuffle_ps(C,C,0x4E)));
+
+		//  dA = |A|
+		dA = _mm_mul_ps(_mm_shuffle_ps(A, A, 0x5F),A);
+		dA -= _mm_movehl_ps(dA,dA);
+		//  dB = |B|
+		dB = _mm_mul_ps(_mm_shuffle_ps(B, B, 0x5F),B);
+		dB -= _mm_movehl_ps(dB,dB);
+
+		//  dC = |C|
+		dC = _mm_mul_ps(_mm_shuffle_ps(C, C, 0x5F),C);
+		dC -= _mm_movehl_ps(dC,dC);
+		//  dD = |D|
+		dD = _mm_mul_ps(_mm_shuffle_ps(D, D, 0x5F),D);
+		dD -= _mm_movehl_ps(dD,dD);
+
+		//  d = trace(AB*DC) = trace(A#*B*D#*C)
+		d = _mm_mul_ps(_mm_shuffle_ps(DC,DC,0xD8),AB);
+
+		//  iD = C*A#*B
+		iD.setMul(_mm_shuffle_ps(C,C,0xA0), _mm_movelh_ps(AB,AB));
+		iD.add(_mm_mul_ps(_mm_shuffle_ps(C,C,0xF5), _mm_movehl_ps(AB,AB)));
+		//  iA = B*D#*C
+		iA.setMul(_mm_shuffle_ps(B,B,0xA0), _mm_movelh_ps(DC,DC));
+		iA.add(_mm_mul_ps(_mm_shuffle_ps(B,B,0xF5), _mm_movehl_ps(DC,DC)));
+
+		//  d = trace(AB*DC) = trace(A#*B*D#*C) [continue]
+		d = _mm_add_ps(d, _mm_movehl_ps(d, d));
+		d += _mm_shuffle_ps(d, d, 1);
+		d1 = dA*dD;
+		d2 = dB*dC;
+
+		//  iD = D*|A| - C*A#*B
+		iD.setSub(_mm_mul_ps(D,_mm_shuffle_ps(dA,dA,0)), iD);
+
+		//  iA = A*|D| - B*D#*C;
+		iA.setSub(_mm_mul_ps(A,_mm_shuffle_ps(dD,dD,0)), iA);
+
+		//  det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C)
+		det = d1+d2-d;
+
+		__m128 is_zero_mask = _mm_cmpeq_ps(det,_mm_setzero_ps());	// All-ones in every lane where det == 0.
+		rd = _mm_div_ss(_mm_set_ss(1.f),_mm_or_ps(_mm_andnot_ps(is_zero_mask, det), _mm_and_ps(is_zero_mask, _mm_set_ss(1.f))));	// Lane 0 = 1/det, with 1.0 substituted for a zero det so no division by zero occurs (rd is then 1).
+#ifdef ZERO_SINGULAR
+		rd = _mm_and_ps(_mm_cmpneq_ss(det,_mm_setzero_ps()), rd);	// Optional: force rd (and hence the result) to zero when det == 0.
+#endif
+
+		//  iB = D * (A#B)# = D*B#*A
+		iB.setMul(D, _mm_shuffle_ps(AB,AB,0x33));
+		iB.sub(_mm_mul_ps(_mm_shuffle_ps(D,D,0xB1), _mm_shuffle_ps(AB,AB,0x66)));
+		//  iC = A * (D#C)# = A*C#*D
+		iC.setMul(A, _mm_shuffle_ps(DC,DC,0x33));
+		iC.sub(_mm_mul_ps(_mm_shuffle_ps(A,A,0xB1), _mm_shuffle_ps(DC,DC,0x66)));
+
+		rd = _mm_shuffle_ps(rd,rd,0);	// Broadcast lane 0 (the reciprocal) to all four lanes.
+		rd = _mm_xor_ps(rd, _mm_load_ps((const float*)Sign_PNNP));	// Flip the sign of lanes 1 and 2: (+,-,-,+).
+
+		//  iB = C*|B| - D*B#*A
+		iB.setSub(_mm_mul_ps(C,_mm_shuffle_ps(dB,dB,0)), iB);
+
+		//  iC = B*|C| - A*C#*D;
+		iC.setSub(_mm_mul_ps(B,_mm_shuffle_ps(dC,dC,0)), iC);
+
+
+		//  iX = iX / det
+		iA.mul(rd);
+		iB.mul(rd);
+		iC.mul(rd);
+		iD.mul(rd);
+
+		mMatrix[0] = _mm_shuffle_ps(iA,iB,0x77);	// Reassemble the four result rows from the 2x2 block results.
+		mMatrix[1] = _mm_shuffle_ps(iA,iB,0x22);
+		mMatrix[2] = _mm_shuffle_ps(iC,iD,0x77);
+		mMatrix[3] = _mm_shuffle_ps(iC,iD,0x22);
+
+		return *(float*)&det;	// Reads the first float stored in det; presumably lane 0 of the LLSimdScalar -- confirm its layout.
+	}
 };
 
 #endif
diff --git a/indra/llmath/llvolume.cpp b/indra/llmath/llvolume.cpp
@@ -50,7 +50,6 @@
 #include "llstl.h"
 #include "llsdserialize.h"
 #include "llvector4a.h"
-#include "llmatrix4a.h"
 #include "lltimer.h"
 
 #define DEBUG_SILHOUETTE_BINORMALS 0

diff --git a/indra/llprimitive/llmodel.cpp b/indra/llprimitive/llmodel.cpp
@@ -31,6 +31,7 @@
 #include "LLConvexDecomposition.h"
 #include "llsdserialize.h"
 #include "llvector4a.h"
+#include "llmatrix4a.h"
 #if LL_MSVC
 #pragma warning (push)
 #pragma warning (disable : 4068)

diff --git a/indra/llrender/llcubemap.cpp b/indra/llrender/llcubemap.cpp
@@ -34,6 +34,7 @@
 #include "v3dmath.h"
 #include "m3math.h"
 #include "m4math.h"
+#include "llmatrix4a.h"
 
 #include "llrender.h"
 #include "llglslshader.h"
@@ -265,18 +266,19 @@ void LLCubeMap::setMatrix(S32 stage)
 		gGL.getTexUnit(stage)->activate();
 	}
 
-	LLVector3 x(gGLModelView+0);
-	LLVector3 y(gGLModelView+4);
-	LLVector3 z(gGLModelView+8);
+	LLVector3 x(gGLModelView.mMatrix[0].getF32ptr());
+	LLVector3 y(gGLModelView.mMatrix[1].getF32ptr());
+	LLVector3 z(gGLModelView.mMatrix[2].getF32ptr());
 
 	LLMatrix3 mat3;
 	mat3.setRows(x,y,z);
-	LLMatrix4 trans(mat3);
+	LLMatrix4a trans;
+	trans.loadu(mat3);
 	trans.transpose();
 
 	gGL.matrixMode(LLRender::MM_TEXTURE);
 	gGL.pushMatrix();
-	gGL.loadMatrix((F32 *)trans.mMatrix);
+	gGL.loadMatrix(trans.getF32ptr());
 	gGL.matrixMode(LLRender::MM_MODELVIEW);
 
 	/*if (stage > 0)

diff --git a/indra/llrender/llgl.h b/indra/llrender/llgl.h
@@ -455,8 +455,6 @@ class LLGLSyncFence : public LLGLFence
 	void wait();
 };
 
-extern LLMatrix4 gGLObliqueProjectionInverse;
-
 #include "llglstates.h"
 
 void init_glstates();

diff --git a/indra/llrender/llpostprocess.cpp b/indra/llrender/llpostprocess.cpp
@@ -43,6 +43,7 @@
 #include "llsdutil_math.h"
 #include "llvertexbuffer.h"
 #include "llfasttimer.h"
+#include "llmatrix4a.h"
 
 extern LLGLSLShader			gPostColorFilterProgram;
 extern LLGLSLShader			gPostNightVisionProgram;
@@ -305,16 +306,16 @@ class LLMotionShader : public LLPostProcessShader
 	{
 		addSetting(mStrength);
 	}
-	/*virtual*/ bool isEnabled()		const	{ return LLPostProcessShader::isEnabled() && llabs(gGLModelView[0] - gGLPreviousModelView[0]) > .0000001; }
+	/*virtual*/ bool isEnabled()		const	{ return LLPostProcessShader::isEnabled() && llabs(gGLModelView.getF32ptr()[0] - gGLPreviousModelView.getF32ptr()[0]) > .0000001; }
 	/*virtual*/ S32 getColorChannel()	const	{ return 0; }
 	/*virtual*/ S32 getDepthChannel()	const	{ return 1; }
 	/*virtual*/ QuadType preDraw()
 	{
-		glh::matrix4f inv_proj(gGLModelView);
-		inv_proj.mult_left(gGLProjection);
+		glh::matrix4f inv_proj(gGLModelView.getF32ptr());
+		inv_proj.mult_left(gGLProjection.getF32ptr());
 		inv_proj = inv_proj.inverse();
-		glh::matrix4f prev_proj(gGLPreviousModelView);
-		prev_proj.mult_left(gGLProjection);
+		glh::matrix4f prev_proj(gGLPreviousModelView.getF32ptr());
+		prev_proj.mult_left(gGLProjection.getF32ptr());
 
 		LLVector2 screen_rect = LLPostProcess::getInstance()->getDimensions();