lammps/lib/gpu/ellipsoid_extra.h

// **************************************************************************
//                              ellipsoid_extra.h
//                             -------------------
//                               W. Michael Brown
//
//  Device code for Ellipsoid math routines
//
// __________________________________________________________________________
//    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
//    begin                :
//    email                : brownw@ornl.gov
// ***************************************************************************/

#ifndef ELLIPSOID_EXTRA_H
#define ELLIPSOID_EXTRA_H

enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};

#ifdef _DOUBLE_DOUBLE
#define numtyp double
#define numtyp2 double2
#define numtyp4 double4
#define acctyp double
#define acctyp4 double4
#endif

#ifdef _SINGLE_DOUBLE
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp double
#define acctyp4 double4
#endif

#ifndef numtyp
#define numtyp float
#define numtyp2 float2
#define numtyp4 float4
#define acctyp float
#define acctyp4 float4
#endif

#ifdef NV_KERNEL

#include "nv_kernel_def.h"

#else

#pragma OPENCL EXTENSION cl_khr_fp64: enable
#define GLOBAL_ID_X get_global_id(0)
#define THREAD_ID_X get_local_id(0)
#define BLOCK_ID_X get_group_id(0)
#define BLOCK_SIZE_X get_local_size(0)
#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)
#define __inline inline
#define BLOCK_PAIR 64
#define MAX_SHARED_TYPES 8

#endif

/* ----------------------------------------------------------------------
   dot product of 2 vectors
------------------------------------------------------------------------- */

__inline numtyp gpu_dot3(const numtyp *v1, const numtyp *v2)
{
  return v1[0]*v2[0]+v1[1]*v2[1]+v1[2]*v2[2];
}

/* ----------------------------------------------------------------------
   cross product of 2 vectors
------------------------------------------------------------------------- */

__inline void gpu_cross3(const numtyp *v1, const numtyp *v2, numtyp *ans)
{
  ans[0] = v1[1]*v2[2]-v1[2]*v2[1];
  ans[1] = v1[2]*v2[0]-v1[0]*v2[2];
  ans[2] = v1[0]*v2[1]-v1[1]*v2[0];
}

/* ----------------------------------------------------------------------
   determinant of a matrix
------------------------------------------------------------------------- */

__inline numtyp gpu_det3(const numtyp m[9])
{
  numtyp ans = m[0]*m[4]*m[8] - m[0]*m[5]*m[7] - 
    m[3]*m[1]*m[8] + m[3]*m[2]*m[7] + 
    m[6]*m[1]*m[5] - m[6]*m[2]*m[4];
  return ans;
}

/* ----------------------------------------------------------------------
   diagonal matrix times a full matrix
------------------------------------------------------------------------- */

__inline void gpu_diag_times3(const numtyp4 shape, const numtyp m[9], 
                              numtyp ans[9])
{
  ans[0] = shape.x*m[0];
  ans[1] = shape.x*m[1];
  ans[2] = shape.x*m[2];
  ans[3] = shape.y*m[3];
  ans[4] = shape.y*m[4];
  ans[5] = shape.y*m[5];
  ans[6] = shape.z*m[6];
  ans[7] = shape.z*m[7];
  ans[8] = shape.z*m[8];
}

/* ----------------------------------------------------------------------
   add two matrices
------------------------------------------------------------------------- */

__inline void gpu_plus3(const numtyp m[9], const numtyp m2[9], numtyp ans[9])
{
  ans[0] = m[0]+m2[0];
  ans[1] = m[1]+m2[1];
  ans[2] = m[2]+m2[2];
  ans[3] = m[3]+m2[3];
  ans[4] = m[4]+m2[4];
  ans[5] = m[5]+m2[5];
  ans[6] = m[6]+m2[6];
  ans[7] = m[7]+m2[7];
  ans[8] = m[8]+m2[8];
}

/* ----------------------------------------------------------------------
   multiply the transpose of mat1 times mat2
------------------------------------------------------------------------- */

__inline void gpu_transpose_times3(const numtyp m[9], const numtyp m2[9],
                                   numtyp ans[9])
{
  ans[0] = m[0]*m2[0]+m[3]*m2[3]+m[6]*m2[6];
  ans[1] = m[0]*m2[1]+m[3]*m2[4]+m[6]*m2[7];
  ans[2] = m[0]*m2[2]+m[3]*m2[5]+m[6]*m2[8];
  ans[3] = m[1]*m2[0]+m[4]*m2[3]+m[7]*m2[6];
  ans[4] = m[1]*m2[1]+m[4]*m2[4]+m[7]*m2[7];
  ans[5] = m[1]*m2[2]+m[4]*m2[5]+m[7]*m2[8];
  ans[6] = m[2]*m2[0]+m[5]*m2[3]+m[8]*m2[6];
  ans[7] = m[2]*m2[1]+m[5]*m2[4]+m[8]*m2[7];
  ans[8] = m[2]*m2[2]+m[5]*m2[5]+m[8]*m2[8];
}

/* ----------------------------------------------------------------------
   row vector times matrix
------------------------------------------------------------------------- */

__inline void gpu_row_times3(const numtyp *v, const numtyp m[9], numtyp *ans)
{
  ans[0] = m[0]*v[0]+v[1]*m[3]+v[2]*m[6];
  ans[1] = v[0]*m[1]+m[4]*v[1]+v[2]*m[7];
  ans[2] = v[0]*m[2]+v[1]*m[5]+m[8]*v[2];
}

/* ----------------------------------------------------------------------
   solve Ax = b or M ans = v
   use gaussian elimination & partial pivoting on matrix
   error_flag set to 2 if bad matrix inversion attempted
------------------------------------------------------------------------- */

__inline void gpu_mldivide3(const numtyp m[9], const numtyp *v, numtyp *ans,
                            __global int *error_flag)
{
  // create augmented matrix for pivoting

  numtyp aug[12], t;

  aug[3] = v[0];
  aug[0] = m[0];
  aug[1] = m[1];
  aug[2] = m[2];
  aug[7] = v[1];
  aug[4] = m[3];
  aug[5] = m[4];
  aug[6] = m[5];
  aug[11] = v[2];
  aug[8] = m[6];
  aug[9] = m[7];
  aug[10] = m[8];

  if (fabs(aug[4]) > fabs(aug[0])) {
    numtyp swapt;
    swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt;
    swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt;
    swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt;
    swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt;
  }
  if (fabs(aug[8]) > fabs(aug[0])) {
    numtyp swapt;
    swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt;
    swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt;
    swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt;
    swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt;
  }

  if (aug[0] != (numtyp)0.0) {
    if (0!=0) {
      numtyp swapt;
      swapt=aug[0]; aug[0]=aug[0]; aug[0]=swapt;
      swapt=aug[1]; aug[1]=aug[1]; aug[1]=swapt;
      swapt=aug[2]; aug[2]=aug[2]; aug[2]=swapt;
      swapt=aug[3]; aug[3]=aug[3]; aug[3]=swapt;
    }
  } else if (aug[4] != (numtyp)0.0) {
    if (1!=0) {
      numtyp swapt;
      swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt;
      swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt;
      swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt;
      swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt;
    }
  } else if (aug[8] != (numtyp)0.0) {
    if (2!=0) {
      numtyp swapt;
      swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt;
      swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt;
      swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt;
      swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt;
    }
  } else
    *error_flag=2;

  t = aug[4]/aug[0];
  aug[5]-=t*aug[1];
  aug[6]-=t*aug[2];
  aug[7]-=t*aug[3];
  t = aug[8]/aug[0];
  aug[9]-=t*aug[1];
  aug[10]-=t*aug[2];
  aug[11]-=t*aug[3];

  if (fabs(aug[9]) > fabs(aug[5])) {
    numtyp swapt;
    swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt;
    swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt;
    swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt;
    swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt;
  }

  if (aug[5] != (numtyp)0.0) {
    if (1!=1) {
      numtyp swapt;
      swapt=aug[4]; aug[4]=aug[4]; aug[4]=swapt;
      swapt=aug[5]; aug[5]=aug[5]; aug[5]=swapt;
      swapt=aug[6]; aug[6]=aug[6]; aug[6]=swapt;
      swapt=aug[7]; aug[7]=aug[7]; aug[7]=swapt;
    }
  } else if (aug[9] != (numtyp)0.0) {
    if (2!=1) {
      numtyp swapt;
      swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt;
      swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt;
      swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt;
      swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt;
    }
  }

  t = aug[9]/aug[5];
  aug[10]-=t*aug[6];
  aug[11]-=t*aug[7];
  
  if (aug[10] == (numtyp)0.0)
    *error_flag=2;

  ans[2] = aug[11]/aug[10];
  t = (numtyp)0.0;
  t += aug[6]*ans[2];
  ans[1] = (aug[7]-t) / aug[5];
  t = (numtyp)0.0;
  t += aug[1]*ans[1];
  t += aug[2]*ans[2];
  ans[0] = (aug[3]-t) / aug[0];
}

/* ----------------------------------------------------------------------
   compute rotation matrix from quaternion conjugate
   quat = [w i j k]
------------------------------------------------------------------------- */

__inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi, 
                                    numtyp mat[9])
{
  numtyp4 q=qif[qi];
  
  numtyp w2 = q.x*q.x;
  numtyp i2 = q.y*q.y;
  numtyp j2 = q.z*q.z;
  numtyp k2 = q.w*q.w;
  numtyp twoij = (numtyp)2.0*q.y*q.z;
  numtyp twoik = (numtyp)2.0*q.y*q.w;
  numtyp twojk = (numtyp)2.0*q.z*q.w;
  numtyp twoiw = (numtyp)2.0*q.y*q.x;
  numtyp twojw = (numtyp)2.0*q.z*q.x;
  numtyp twokw = (numtyp)2.0*q.w*q.x;

  mat[0] = w2+i2-j2-k2;
  mat[3] = twoij-twokw;
  mat[6] = twojw+twoik;

  mat[1] = twoij+twokw;
  mat[4] = w2-i2+j2-k2;
  mat[7] = twojk-twoiw;
	
  mat[2] = twoik-twojw;
  mat[5] = twojk+twoiw;
  mat[8] = w2-i2-j2+k2;
}

/* ----------------------------------------------------------------------
   transposed matrix times diagonal matrix
------------------------------------------------------------------------- */

__inline void gpu_transpose_times_diag3(const numtyp m[9],
                                        const numtyp4 d, numtyp ans[9])
{
  ans[0] = m[0]*d.x;
  ans[1] = m[3]*d.y;
  ans[2] = m[6]*d.z;
  ans[3] = m[1]*d.x;
  ans[4] = m[4]*d.y;
  ans[5] = m[7]*d.z;
  ans[6] = m[2]*d.x;
  ans[7] = m[5]*d.y;
  ans[8] = m[8]*d.z;
}

/* ----------------------------------------------------------------------
   multiply mat1 times mat2
------------------------------------------------------------------------- */

__inline void gpu_times3(const numtyp m[9], const numtyp m2[9],
                         numtyp ans[9])
{
  ans[0] = m[0]*m2[0] + m[1]*m2[3] + m[2]*m2[6];
  ans[1] = m[0]*m2[1] + m[1]*m2[4] + m[2]*m2[7];
  ans[2] = m[0]*m2[2] + m[1]*m2[5] + m[2]*m2[8];
  ans[3] = m[3]*m2[0] + m[4]*m2[3] + m[5]*m2[6];
  ans[4] = m[3]*m2[1] + m[4]*m2[4] + m[5]*m2[7];
  ans[5] = m[3]*m2[2] + m[4]*m2[5] + m[5]*m2[8];
  ans[6] = m[6]*m2[0] + m[7]*m2[3] + m[8]*m2[6];
  ans[7] = m[6]*m2[1] + m[7]*m2[4] + m[8]*m2[7];
  ans[8] = m[6]*m2[2] + m[7]*m2[5] + m[8]*m2[8];
}

/* ----------------------------------------------------------------------
   Apply principal rotation generator about x to rotation matrix m
------------------------------------------------------------------------- */

__inline void gpu_rotation_generator_x(const numtyp m[9], numtyp ans[9])
{
  ans[0] = 0;
  ans[1] = -m[2];
  ans[2] = m[1];
  ans[3] = 0;
  ans[4] = -m[5];
  ans[5] = m[4];
  ans[6] = 0;
  ans[7] = -m[8];
  ans[8] = m[7];
}

/* ----------------------------------------------------------------------
   Apply principal rotation generator about y to rotation matrix m
------------------------------------------------------------------------- */

__inline void gpu_rotation_generator_y(const numtyp m[9], numtyp ans[9])
{
  ans[0] = m[2];
  ans[1] = 0;
  ans[2] = -m[0];
  ans[3] = m[5];
  ans[4] = 0;
  ans[5] = -m[3];
  ans[6] = m[8];
  ans[7] = 0;
  ans[8] = -m[6];
}

/* ----------------------------------------------------------------------
   Apply principal rotation generator about z to rotation matrix m
------------------------------------------------------------------------- */

__inline void gpu_rotation_generator_z(const numtyp m[9], numtyp ans[9])
{
  ans[0] = -m[1];
  ans[1] = m[0];
  ans[2] = 0;
  ans[3] = -m[4];
  ans[4] = m[3];
  ans[5] = 0;
  ans[6] = -m[7];
  ans[7] = m[6];
  ans[8] = 0;
}

/* ----------------------------------------------------------------------
   matrix times vector
------------------------------------------------------------------------- */

__inline void gpu_times_column3(const numtyp m[9], const numtyp v[3],
                                numtyp ans[3]) 
{
  ans[0] = m[0]*v[0] + m[1]*v[1] + m[2]*v[2];
  ans[1] = m[3]*v[0] + m[4]*v[1] + m[5]*v[2];
  ans[2] = m[6]*v[0] + m[7]*v[1] + m[8]*v[2];
}


#endif
git-svn-id: svn://svn.icms.temple.edu/lammps-ro/trunk@6406 f3b2605a-c512-4ea7-a41b-209d697bcdaa 2011-06-14 21:33:48 +08:00			`// **************************************************************************`
			`// ellipsoid_extra.h`
			`// -------------------`
			`// W. Michael Brown`
			`//`
			`// Device code for Ellipsoid math routines`
			`//`
			`// __________________________________________________________________________`
			`// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)`
			`// __________________________________________________________________________`
			`//`
			`// begin :`
			`// email : brownw@ornl.gov`
			`// ***************************************************************************/`

			`#ifndef ELLIPSOID_EXTRA_H`
			`#define ELLIPSOID_EXTRA_H`

			`enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE};`

			`#ifdef _DOUBLE_DOUBLE`
			`#define numtyp double`
			`#define numtyp2 double2`
			`#define numtyp4 double4`
			`#define acctyp double`
			`#define acctyp4 double4`
			`#endif`

			`#ifdef _SINGLE_DOUBLE`
			`#define numtyp float`
			`#define numtyp2 float2`
			`#define numtyp4 float4`
			`#define acctyp double`
			`#define acctyp4 double4`
			`#endif`

			`#ifndef numtyp`
			`#define numtyp float`
			`#define numtyp2 float2`
			`#define numtyp4 float4`
			`#define acctyp float`
			`#define acctyp4 float4`
			`#endif`

			`#ifdef NV_KERNEL`

			`#include "nv_kernel_def.h"`

			`#else`

			`#pragma OPENCL EXTENSION cl_khr_fp64: enable`
			`#define GLOBAL_ID_X get_global_id(0)`
			`#define THREAD_ID_X get_local_id(0)`
			`#define BLOCK_ID_X get_group_id(0)`
			`#define BLOCK_SIZE_X get_local_size(0)`
			`#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE)`
			`#define __inline inline`
			`#define BLOCK_PAIR 64`
			`#define MAX_SHARED_TYPES 8`

			`#endif`

			`/* ----------------------------------------------------------------------`
			`dot product of 2 vectors`
			`------------------------------------------------------------------------- */`

			`__inline numtyp gpu_dot3(const numtyp v1, const numtyp v2)`
			`{`
			`return v1[0]v2[0]+v1[1]v2[1]+v1[2]*v2[2];`
			`}`

			`/* ----------------------------------------------------------------------`
			`cross product of 2 vectors`
			`------------------------------------------------------------------------- */`

			`__inline void gpu_cross3(const numtyp v1, const numtyp v2, numtyp *ans)`
			`{`
			`ans[0] = v1[1]v2[2]-v1[2]v2[1];`
			`ans[1] = v1[2]v2[0]-v1[0]v2[2];`
			`ans[2] = v1[0]v2[1]-v1[1]v2[0];`
			`}`

			`/* ----------------------------------------------------------------------`
			`determinant of a matrix`
			`------------------------------------------------------------------------- */`

			`__inline numtyp gpu_det3(const numtyp m[9])`
			`{`
			`numtyp ans = m[0]m[4]m[8] - m[0]m[5]m[7] -`
			`m[3]m[1]m[8] + m[3]m[2]m[7] +`
			`m[6]m[1]m[5] - m[6]m[2]m[4];`
			`return ans;`
			`}`

			`/* ----------------------------------------------------------------------`
			`diagonal matrix times a full matrix`
			`------------------------------------------------------------------------- */`

			`__inline void gpu_diag_times3(const numtyp4 shape, const numtyp m[9],`
			`numtyp ans[9])`
			`{`
			`ans[0] = shape.x*m[0];`
			`ans[1] = shape.x*m[1];`
			`ans[2] = shape.x*m[2];`
			`ans[3] = shape.y*m[3];`
			`ans[4] = shape.y*m[4];`
			`ans[5] = shape.y*m[5];`
			`ans[6] = shape.z*m[6];`
			`ans[7] = shape.z*m[7];`
			`ans[8] = shape.z*m[8];`
			`}`

			`/* ----------------------------------------------------------------------`
			`add two matrices`
			`------------------------------------------------------------------------- */`

			`__inline void gpu_plus3(const numtyp m[9], const numtyp m2[9], numtyp ans[9])`
			`{`
			`ans[0] = m[0]+m2[0];`
			`ans[1] = m[1]+m2[1];`
			`ans[2] = m[2]+m2[2];`
			`ans[3] = m[3]+m2[3];`
			`ans[4] = m[4]+m2[4];`
			`ans[5] = m[5]+m2[5];`
			`ans[6] = m[6]+m2[6];`
			`ans[7] = m[7]+m2[7];`
			`ans[8] = m[8]+m2[8];`
			`}`

			`/* ----------------------------------------------------------------------`
			`multiply the transpose of mat1 times mat2`
			`------------------------------------------------------------------------- */`

			`__inline void gpu_transpose_times3(const numtyp m[9], const numtyp m2[9],`
			`numtyp ans[9])`
			`{`
			`ans[0] = m[0]m2[0]+m[3]m2[3]+m[6]*m2[6];`
			`ans[1] = m[0]m2[1]+m[3]m2[4]+m[6]*m2[7];`
			`ans[2] = m[0]m2[2]+m[3]m2[5]+m[6]*m2[8];`
			`ans[3] = m[1]m2[0]+m[4]m2[3]+m[7]*m2[6];`
			`ans[4] = m[1]m2[1]+m[4]m2[4]+m[7]*m2[7];`
			`ans[5] = m[1]m2[2]+m[4]m2[5]+m[7]*m2[8];`
			`ans[6] = m[2]m2[0]+m[5]m2[3]+m[8]*m2[6];`
			`ans[7] = m[2]m2[1]+m[5]m2[4]+m[8]*m2[7];`
			`ans[8] = m[2]m2[2]+m[5]m2[5]+m[8]*m2[8];`
			`}`

			`/* ----------------------------------------------------------------------`
			`row vector times matrix`
			`------------------------------------------------------------------------- */`

			`__inline void gpu_row_times3(const numtyp v, const numtyp m[9], numtyp ans)`
			`{`
			`ans[0] = m[0]v[0]+v[1]m[3]+v[2]*m[6];`
			`ans[1] = v[0]m[1]+m[4]v[1]+v[2]*m[7];`
			`ans[2] = v[0]m[2]+v[1]m[5]+m[8]*v[2];`
			`}`

			`/* ----------------------------------------------------------------------`
			`solve Ax = b or M ans = v`
			`use gaussian elimination & partial pivoting on matrix`
			`error_flag set to 2 if bad matrix inversion attempted`
			`------------------------------------------------------------------------- */`

			`__inline void gpu_mldivide3(const numtyp m[9], const numtyp v, numtyp ans,`
			`__global int *error_flag)`
			`{`
			`// create augmented matrix for pivoting`

			`numtyp aug[12], t;`

			`aug[3] = v[0];`
			`aug[0] = m[0];`
			`aug[1] = m[1];`
			`aug[2] = m[2];`
			`aug[7] = v[1];`
			`aug[4] = m[3];`
			`aug[5] = m[4];`
			`aug[6] = m[5];`
			`aug[11] = v[2];`
			`aug[8] = m[6];`
			`aug[9] = m[7];`
			`aug[10] = m[8];`

			`if (fabs(aug[4]) > fabs(aug[0])) {`
			`numtyp swapt;`
			`swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt;`
			`swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt;`
			`swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt;`
			`swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt;`
			`}`
			`if (fabs(aug[8]) > fabs(aug[0])) {`
			`numtyp swapt;`
			`swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt;`
			`swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt;`
			`swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt;`
			`swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt;`
			`}`

			`if (aug[0] != (numtyp)0.0) {`
			`if (0!=0) {`
			`numtyp swapt;`
			`swapt=aug[0]; aug[0]=aug[0]; aug[0]=swapt;`
			`swapt=aug[1]; aug[1]=aug[1]; aug[1]=swapt;`
			`swapt=aug[2]; aug[2]=aug[2]; aug[2]=swapt;`
			`swapt=aug[3]; aug[3]=aug[3]; aug[3]=swapt;`
			`}`
			`} else if (aug[4] != (numtyp)0.0) {`
			`if (1!=0) {`
			`numtyp swapt;`
			`swapt=aug[0]; aug[0]=aug[4]; aug[4]=swapt;`
			`swapt=aug[1]; aug[1]=aug[5]; aug[5]=swapt;`
			`swapt=aug[2]; aug[2]=aug[6]; aug[6]=swapt;`
			`swapt=aug[3]; aug[3]=aug[7]; aug[7]=swapt;`
			`}`
			`} else if (aug[8] != (numtyp)0.0) {`
			`if (2!=0) {`
			`numtyp swapt;`
			`swapt=aug[0]; aug[0]=aug[8]; aug[8]=swapt;`
			`swapt=aug[1]; aug[1]=aug[9]; aug[9]=swapt;`
			`swapt=aug[2]; aug[2]=aug[10]; aug[10]=swapt;`
			`swapt=aug[3]; aug[3]=aug[11]; aug[11]=swapt;`
			`}`
			`} else`
			`*error_flag=2;`

			`t = aug[4]/aug[0];`
			`aug[5]-=t*aug[1];`
			`aug[6]-=t*aug[2];`
			`aug[7]-=t*aug[3];`
			`t = aug[8]/aug[0];`
			`aug[9]-=t*aug[1];`
			`aug[10]-=t*aug[2];`
			`aug[11]-=t*aug[3];`

			`if (fabs(aug[9]) > fabs(aug[5])) {`
			`numtyp swapt;`
			`swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt;`
			`swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt;`
			`swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt;`
			`swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt;`
			`}`

			`if (aug[5] != (numtyp)0.0) {`
			`if (1!=1) {`
			`numtyp swapt;`
			`swapt=aug[4]; aug[4]=aug[4]; aug[4]=swapt;`
			`swapt=aug[5]; aug[5]=aug[5]; aug[5]=swapt;`
			`swapt=aug[6]; aug[6]=aug[6]; aug[6]=swapt;`
			`swapt=aug[7]; aug[7]=aug[7]; aug[7]=swapt;`
			`}`
			`} else if (aug[9] != (numtyp)0.0) {`
			`if (2!=1) {`
			`numtyp swapt;`
			`swapt=aug[4]; aug[4]=aug[8]; aug[8]=swapt;`
			`swapt=aug[5]; aug[5]=aug[9]; aug[9]=swapt;`
			`swapt=aug[6]; aug[6]=aug[10]; aug[10]=swapt;`
			`swapt=aug[7]; aug[7]=aug[11]; aug[11]=swapt;`
			`}`
			`}`

			`t = aug[9]/aug[5];`
			`aug[10]-=t*aug[6];`
			`aug[11]-=t*aug[7];`

			`if (aug[10] == (numtyp)0.0)`
			`*error_flag=2;`

			`ans[2] = aug[11]/aug[10];`
			`t = (numtyp)0.0;`
			`t += aug[6]*ans[2];`
			`ans[1] = (aug[7]-t) / aug[5];`
			`t = (numtyp)0.0;`
			`t += aug[1]*ans[1];`
			`t += aug[2]*ans[2];`
			`ans[0] = (aug[3]-t) / aug[0];`
			`}`

			`/* ----------------------------------------------------------------------`
			`compute rotation matrix from quaternion conjugate`
			`quat = [w i j k]`
			`------------------------------------------------------------------------- */`

			`__inline void gpu_quat_to_mat_trans(__global const numtyp4 *qif, const int qi,`
			`numtyp mat[9])`
			`{`
			`numtyp4 q=qif[qi];`

			`numtyp w2 = q.x*q.x;`
			`numtyp i2 = q.y*q.y;`
			`numtyp j2 = q.z*q.z;`
			`numtyp k2 = q.w*q.w;`
			`numtyp twoij = (numtyp)2.0q.yq.z;`
			`numtyp twoik = (numtyp)2.0q.yq.w;`
			`numtyp twojk = (numtyp)2.0q.zq.w;`
			`numtyp twoiw = (numtyp)2.0q.yq.x;`
			`numtyp twojw = (numtyp)2.0q.zq.x;`
			`numtyp twokw = (numtyp)2.0q.wq.x;`

			`mat[0] = w2+i2-j2-k2;`
			`mat[3] = twoij-twokw;`
			`mat[6] = twojw+twoik;`

			`mat[1] = twoij+twokw;`
			`mat[4] = w2-i2+j2-k2;`
			`mat[7] = twojk-twoiw;`

			`mat[2] = twoik-twojw;`
			`mat[5] = twojk+twoiw;`
			`mat[8] = w2-i2-j2+k2;`
			`}`

			`/* ----------------------------------------------------------------------`
			`transposed matrix times diagonal matrix`
			`------------------------------------------------------------------------- */`

			`__inline void gpu_transpose_times_diag3(const numtyp m[9],`
			`const numtyp4 d, numtyp ans[9])`
			`{`
			`ans[0] = m[0]*d.x;`
			`ans[1] = m[3]*d.y;`
			`ans[2] = m[6]*d.z;`
			`ans[3] = m[1]*d.x;`
			`ans[4] = m[4]*d.y;`
			`ans[5] = m[7]*d.z;`
			`ans[6] = m[2]*d.x;`
			`ans[7] = m[5]*d.y;`
			`ans[8] = m[8]*d.z;`
			`}`

			`/* ----------------------------------------------------------------------`
			`multiply mat1 times mat2`
			`------------------------------------------------------------------------- */`

			`__inline void gpu_times3(const numtyp m[9], const numtyp m2[9],`
			`numtyp ans[9])`
			`{`
			`ans[0] = m[0]m2[0] + m[1]m2[3] + m[2]*m2[6];`
			`ans[1] = m[0]m2[1] + m[1]m2[4] + m[2]*m2[7];`
			`ans[2] = m[0]m2[2] + m[1]m2[5] + m[2]*m2[8];`
			`ans[3] = m[3]m2[0] + m[4]m2[3] + m[5]*m2[6];`
			`ans[4] = m[3]m2[1] + m[4]m2[4] + m[5]*m2[7];`
			`ans[5] = m[3]m2[2] + m[4]m2[5] + m[5]*m2[8];`
			`ans[6] = m[6]m2[0] + m[7]m2[3] + m[8]*m2[6];`
			`ans[7] = m[6]m2[1] + m[7]m2[4] + m[8]*m2[7];`
			`ans[8] = m[6]m2[2] + m[7]m2[5] + m[8]*m2[8];`
			`}`

			`/* ----------------------------------------------------------------------`
			`Apply principal rotation generator about x to rotation matrix m`
			`------------------------------------------------------------------------- */`

			`__inline void gpu_rotation_generator_x(const numtyp m[9], numtyp ans[9])`
			`{`
			`ans[0] = 0;`
			`ans[1] = -m[2];`
			`ans[2] = m[1];`
			`ans[3] = 0;`
			`ans[4] = -m[5];`
			`ans[5] = m[4];`
			`ans[6] = 0;`
			`ans[7] = -m[8];`
			`ans[8] = m[7];`
			`}`

			`/* ----------------------------------------------------------------------`
			`Apply principal rotation generator about y to rotation matrix m`
			`------------------------------------------------------------------------- */`

			`__inline void gpu_rotation_generator_y(const numtyp m[9], numtyp ans[9])`
			`{`
			`ans[0] = m[2];`
			`ans[1] = 0;`
			`ans[2] = -m[0];`
			`ans[3] = m[5];`
			`ans[4] = 0;`
			`ans[5] = -m[3];`
			`ans[6] = m[8];`
			`ans[7] = 0;`
			`ans[8] = -m[6];`
			`}`

			`/* ----------------------------------------------------------------------`
			`Apply principal rotation generator about z to rotation matrix m`
			`------------------------------------------------------------------------- */`

			`__inline void gpu_rotation_generator_z(const numtyp m[9], numtyp ans[9])`
			`{`
			`ans[0] = -m[1];`
			`ans[1] = m[0];`
			`ans[2] = 0;`
			`ans[3] = -m[4];`
			`ans[4] = m[3];`
			`ans[5] = 0;`
			`ans[6] = -m[7];`
			`ans[7] = m[6];`
			`ans[8] = 0;`
			`}`

			`/* ----------------------------------------------------------------------`
			`matrix times vector`
			`------------------------------------------------------------------------- */`

			`__inline void gpu_times_column3(const numtyp m[9], const numtyp v[3],`
			`numtyp ans[3])`
			`{`
			`ans[0] = m[0]v[0] + m[1]v[1] + m[2]*v[2];`
			`ans[1] = m[3]v[0] + m[4]v[1] + m[5]*v[2];`
			`ans[2] = m[6]v[0] + m[7]v[1] + m[8]*v[2];`
			`}`













			`#endif`