/* Forked from OSchip/llvm-project. 47 lines, 944 B, C. */
/* Matrix dimensions shared by every array and loop in this file. */
#define M 1024
#define N 1024
#define K 1024

/*
 * Operand and result matrices. They are file-scope objects without an
 * initializer, so they start out zero-filled.
 */
float A[K][M]; /* K rows of M floats */
float B[N][K]; /* N rows of K floats */
float C[M][N]; /* M rows of N floats; accumulation target */
/*
void matmul_vec(void) {
  int i, j, k;

  With much unrolling:
  for (i=0;i<=M;i++)
    for (j=0;j<=N;j+=4)
      for (k=0;k<=K;k+=8)
        for (kk=k;kk<=k+7;kk++)
          for (jj=j;jj<=j+3;jj++)
            C[i][jj] += A[kk][i] * B[jj][kk];
            vec_load    splat      scalar_load

  Without unrolling:
  for (i=0;i<=M;i++)
    for (j=0;j<=N;j+=4)
      for (k=0;k<=K;k++)
        for (jj=j;jj<=j+3;jj++)
          C[i][jj] += A[k][i] * B[jj][k];
          vec_load    splat     scalar_load
}
*/
int main()
|
||
|
{
|
||
|
int i, j, k;
|
||
|
//matmul_vec();
|
||
|
for(i=0; i<M/4; i++)
|
||
|
for(k=0; k<K; k++) {
|
||
|
for(j=0; j<N; j++)
|
||
|
C[i+0][j] += A[k][i+0] * B[j][k];
|
||
|
C[i+1][j] += A[k][i+1] * B[j][k];
|
||
|
C[i+2][j] += A[k][i+2] * B[j][k];
|
||
|
C[i+3][j] += A[k][i+3] * B[j][k];
|
||
|
}
|
||
|
|
||
|
return A[42][42];
|
||
|
}
|