Tests of a SIMD version of DVector3

From GlueXWiki
Jump to: navigation, search
  • SIMD (Single-Instruction Multiple-Data) instructions provide means of doing multiple calculations with or manipulations of several pieces of data with a single operation
    • Size of special registers is 128 bits: can do 4 single-precision floating point calculations (SSE) or 2 double-precision floating point calculations (SSE2) at the same time
    • Basic arithmetic and logical operations are supported, as are square roots
  • Implemented DVector3 with SSE2 instructions
    • Three pairs of coordinates: xy,yz,zx
    • gcc compiler flags: -msse2 -mfpmath=sse -O2
  • Compare to TVector3 on ifarml6: 10 million events with randomly generated vectors
Operation        DVector3 time (s)    TVector3 time (s)     T3/D3
-----------------------------------------------------------------
 v2=-v1               0.054                 0.171            3.17
 v3=v1+v2             0.092                 0.191            2.08
 v3+=v1               0.056                 0.048            0.86 <-
 v3=v1-v2             0.092                 0.194            2.11
 v2=k v1              0.054                 0.174            3.24   
 v2*=k                0.030                 0.018            0.60 <-
 SetMag               0.198                 0.198            1.00
 SetXYZ               0.007                 0.008            1.14
 SetMagThetaPhi       0.680                 0.890            1.31  Expensive!
 Rotate(a,v)          0.625                 1.528            2.44   "
 RotateZ(a)           0.353                 0.402            1.14
 make orthogonal v    0.142                 0.273            1.92
 v3 = v1 x v2         0.107                 0.193            1.80
  • The basic unit is a union:
  union dvec{
    __m128d v;
    double d[2];
  };
  • A simple example:
// Add two vectors
inline DVector3 operator+(const DVector3 &v1,const DVector3 &v2){
  return DVector3(_mm_add_pd(v1.GetVxy(),v2.GetVxy()),
		  _mm_add_pd(v1.GetVzx(),v2.GetVzx()));
};
  • A more complicated example:

// Rotate by angle a about the axis specified by v.  The following ugly mess 
// of SIMD instructions does the following:
//            sa = sin(a), ca= cos(a)
//            dx=vx/|v|, dy=vy/|v|, dz=vz/|v|
// |x'| |ca+(1-ca)*dx*dx          (1-ca)*dx*dy-sa*dz    (1-ca)*dx*dz+sa*dy||x|
// |y'|=|   (1-ca)*dy*dx+sa*dz ca+(1-ca)*dy*dy          (1-ca)*dy*dz-sa*dx||y|
// |z'| |   (1-ca)*dz*dx-sa*dy    (1-ca)*dz*dy+sa*dx ca+(1-ca)*dz*dz      ||z|
//
  DVector3 Rotate(const double a,const DVector3 &v){
    if (a!=0){
      double vmag=v.Mag();
      if (vmag==0.0){
	//cerr << "axis vector has zero magnitude." <<endl;
      }
      else{
	__m128d xx=_mm_set1_pd(x());
	__m128d yy=_mm_set1_pd(y());
	__m128d zz=_mm_set1_pd(z());
	double sa=sin(a);
	double ca=cos(a);
	__m128d one_minus_ca=_mm_set1_pd(1.-ca);
	__m128d sa_zero=_mm_setr_pd(sa,0.);
	__m128d ca_zero=_mm_setr_pd(ca,0.);
	__m128d zero_sa=_mm_setr_pd(0.,sa);
	__m128d zero_ca=_mm_setr_pd(0.,ca);
	__m128d scale=_mm_set1_pd(1./vmag);
	__m128d dxdy=_mm_mul_pd(scale,v.GetVxy());
	__m128d dzdx=_mm_mul_pd(scale,v.GetVzx());
	__m128d dxdx=_mm_mul_pd(scale,_mm_set1_pd(v.x()));
	__m128d dydy=_mm_mul_pd(scale,_mm_set1_pd(v.y()));
	__m128d dzdz=_mm_mul_pd(scale,_mm_set1_pd(v.z()));
	__m128d AD=_mm_add_pd(_mm_add_pd(ca_zero,
					 _mm_mul_pd(one_minus_ca,
						    _mm_mul_pd(dxdy,dxdx))), 
			      _mm_mul_pd(zero_sa,dzdz));
	__m128d	BE=_mm_sub_pd(_mm_add_pd(zero_ca,
					 _mm_mul_pd(one_minus_ca,
						    _mm_mul_pd(dxdy,dydy))),
			      _mm_mul_pd(sa_zero,dzdz));
	__m128d CF=_mm_sub_pd(_mm_add_pd(_mm_mul_pd(one_minus_ca,
						    _mm_mul_pd(dxdy,dzdz)),
					 _mm_mul_pd(sa_zero,dydy)),
			      _mm_mul_pd(zero_sa,dxdx));
	__m128d IA=_mm_sub_pd(_mm_add_pd(zero_ca,
					 _mm_mul_pd(one_minus_ca,
						    _mm_mul_pd(dzdx,dxdx))),
			      _mm_mul_pd(sa_zero,dydy));
	__m128d	JB=_mm_sub_pd(_mm_add_pd(_mm_mul_pd(sa_zero,dxdx),
					 _mm_mul_pd(one_minus_ca,
						    _mm_mul_pd(dzdx,dydy))),
			      _mm_mul_pd(zero_sa,dzdz));
	__m128d KC=_mm_add_pd(_mm_add_pd(ca_zero,
					 _mm_mul_pd(one_minus_ca,
						    _mm_mul_pd(dzdx,dzdz))),
			      _mm_mul_pd(zero_sa,dydy));

	xy.v=_mm_add_pd(_mm_add_pd(_mm_mul_pd(AD,xx), _mm_mul_pd(BE,yy)),
			_mm_mul_pd(CF,zz));
	zx.v=_mm_add_pd(_mm_add_pd(_mm_mul_pd(IA,xx), _mm_mul_pd(JB,yy)),
			_mm_mul_pd(KC,zz));
	yz.v=_mm_setr_pd(y(),z());
	
      }
    }
    return *this;
  }