高性能矩阵乘代码编写问题
我目前正在用 C 语言实现 BLAS 数学库中的 dgemm 函数。虽然目前已有很多高性能数学库的开源代码,但大部分都是用汇编代码和 Fortran 代码实现的。我用 C 语言实现后,在 Intel 平台上测试性能很低,只达到理论峰值的三分之一。希望在这方面有研究的大侠高手多多指教,比较急切,谢谢各位!
程序代码:
#include <climits>
#include <iostream>
#include <vector>
#define MAX 1024
using namespace std;
/**
 * Matrix-chain ordering solved with bottom-up dynamic programming.
 *
 * The constructor reads `n` dimension values p[0..n-1] from standard input.
 * They describe n-1 matrices A1..A(n-1), where Ai has p[i-1] rows and p[i]
 * columns. Matrix_Chain_Order() fills the cost and split tables, after which
 * Print_Optimal_Parens(1, n-1) writes the cheapest parenthesization to
 * standard output.
 *
 * All storage lives in std::vector, so the object cleans up after itself and
 * every table entry starts out as zero.
 */
class MatrixChain{
private:
    std::vector<int> p;                       // dimension sequence p[0..size-1]
    std::vector<std::vector<long long> > m;   // m[i][j]: minimal scalar multiplications for Ai..Aj
    std::vector<std::vector<int> > s;         // s[i][j]: split index k chosen for Ai..Aj
    const int size;                           // number of dimension values requested
public:
    /**
     * Allocates the tables and reads the dimension sequence from std::cin.
     *
     * @param n number of dimension values to read; a negative value is
     *          treated as an empty chain (nothing is read).
     */
    MatrixChain(int n):size(n)
    {
        const int count = n > 0 ? n : 0;
        p.resize(count + 1);
        for (int idx = 0; idx < count; idx++)
        {
            std::cin >> p[idx];
        }
        m.resize(count + 1, std::vector<long long>(count + 1));
        s.resize(count + 1, std::vector<int>(count + 1));
    }

    /**
     * Fills m and s for every sub-chain Ai..Aj with 1 <= i <= j <= size-1.
     * Does nothing when there are fewer than two dimension values.
     */
    void Matrix_Chain_Order()
    {
        const int n = size - 1;
        for (int i = 1; i <= n; i++)
        {
            m[i][i] = 0;  // a single matrix needs no multiplication
        }
        for (int len = 2; len <= n; len++)
        {
            for (int i = 1; i <= n - len + 1; i++)
            {
                const int j = i + len - 1;
                for (int k = i; k <= j - 1; k++)
                {
                    // Widen before multiplying: three int dimensions easily
                    // exceed the int range.
                    const long long cost = m[i][k] + m[k + 1][j]
                        + static_cast<long long>(p[i - 1]) * p[k] * p[j];
                    // The first candidate always seeds the cell; later ones
                    // replace it only when strictly cheaper, so ties keep
                    // the smallest k.
                    if (k == i || cost < m[i][j])
                    {
                        m[i][j] = cost;
                        s[i][j] = k;
                    }
                }
            }
        }
    }

    /**
     * Writes the parenthesization recorded in s for the sub-chain Ai..Aj to
     * std::cout, e.g. "((A1A2)A3)".
     *
     * Prints nothing when the range is empty, lies outside 1..size-1, or the
     * split table holds no valid split for it (for instance when
     * Matrix_Chain_Order() has not been called).
     *
     * @param i index of the first matrix in the sub-chain
     * @param j index of the last matrix in the sub-chain
     */
    void Print_Optimal_Parens( int i, int j) const
    {
        if (i == j)
        {
            std::cout << "A" << i;
            return;
        }
        if (i < 1 || j > size - 1 || i > j)
        {
            return;  // outside the computed part of the table
        }
        const int split = s[i][j];
        if (split < i || split >= j)
        {
            return;  // no usable split recorded; recursing would never shrink the range
        }
        std::cout << "(";
        Print_Optimal_Parens(i, split);
        Print_Optimal_Parens(split + 1, j);
        std::cout << ")";
    }
};
/**
 * Processes test cases from standard input until a count of 0, end of input,
 * or malformed input.
 *
 * Each test case is a count followed by that many dimension values (the
 * values are consumed by the MatrixChain constructor). For every case one
 * line is written: the optimal parenthesization of matrices A1..A(count-1),
 * or an empty line when the count describes no chain of at least one matrix.
 *
 * @return 0 always
 */
int main()
{
    int s;
    while (std::cin >> s)
    {
        if (s == 0)
            break;
        if (s < 0)
            continue;  // a negative count cannot describe a chain; nothing to read for it

        MatrixChain matrix(s);
        if (!std::cin)
            break;  // the dimension values were missing or not numeric

        // With a single dimension value there is no matrix A1 to print, so
        // the table lookups are skipped entirely.
        if (s >= 2)
        {
            matrix.Matrix_Chain_Order();
            matrix.Print_Optimal_Parens(1, s - 1);
        }
        // std::cin is tied to std::cout, so the line is flushed before the
        // next read even without std::endl.
        std::cout << '\n';
    }
    return 0;
}