编译器优化性能初步比较

OS：Windows XP 32 bit

CPU：Intel Mobile Core 2 Duo T6600

一、混合四则运算

main.c

#include <stdio.h>
#include <time.h>
/*
 * Mixed integer / float / double arithmetic micro-benchmark.
 *
 * Executes 1000 x 1000000 passes over a fixed chain of add / subtract /
 * multiply / divide operations, then prints the final values of e and f
 * followed by the elapsed processor time in seconds.
 *
 * Returns 0.
 */
int main()
{
      int i, j;
      /* The integer chain multiplies by 20 on every pass and leaves the range
         of int after only a few iterations.  Signed overflow is undefined
         behaviour in C, so the chain is carried in unsigned variables, whose
         wrap-around is well defined. */
      unsigned int a = 1u, b = 1u;
      float c = 1.0, d = 1.0;
      double e = 1.0, f = 1.0;
      clock_t start, finish;
      double duration;

      start = clock();
      for (i = 0; i < 1000; i++)
      {
            for (j = 0; j < 1000000; j++)
            {
                  a = a + 50u;
                  b = a - 100u;
                  a = b * 20u;
                  /* Reinterpret the wrapped value as a signed int before it
                     enters the floating-point chain (implementation-defined
                     conversion, not undefined behaviour). */
                  c = (int)a + 300.89;
                  d = c - 600.89;
                  c = d * 90.89;
                  d = c / 55.89;
                  e = c * 90.89;
                  f = e / 55.89;
            }
      }
      finish = clock();

      /* clock() counts ticks; divide by CLOCKS_PER_SEC to report seconds. */
      duration = (double)(finish - start) / CLOCKS_PER_SEC;
      printf("%f,%f\n",e,f);
      printf("%10e",duration);
      return 0;
}

耗时比较（单位：秒）

	O1	O2	O3（Ox）	优化集合（无快速浮点优化）	优化集合
VS2008 C/C++ Compiler	10.015	9.530	9.530	2.734	1.968
gcc4.4.4	10.250	10.250	10.265	7.203	5.328
gcc4.5.1	10.390	10.375	10.969	6.156	4.265
Intel C/C++ Compiler 11.1	9.375	9.343	9.343	9.015	8.843

优化集合为

VS2008 C/C++ Compiler	/Ox /Ob2 /Og /Oi /Ot /Oy /fp:fast /arch:SSE2
gcc4.4.4 gcc4.5.1	-O3 -ftracer -fivopts -ftree-loop-linear -ftree-vectorize -fforce-addr -fomit-frame-pointer -fno-bounds-check -funroll-loops -ffast-math -march=native -mfpmath=sse -mmmx -msse -msse2 -msse3
Intel C/C++ Compiler 11.1	/fast /O3 /Ot /Og /Oi /Qipo /QxHost /arch:SSE3 /Qunroll /Qvec /Quse-intel-optimized-headers /Qparallel /fp:fast=2 /Ob2 /GT /GA

二、三角函数

main.c(来源于Intel官方)

#include <stdio.h>
#include <stdlib.h> 
#include <time.h> 
#include <math.h>

/* Integrand |sin(x)|.  fabs() is required here: in C, abs() takes an int, so
   the value of sin(x) would be truncated to an integer before the absolute
   value is taken. */
#define INTEG_FUNC(x)  fabs(sin(x))

/*
 * Trapezoidal-rule integration of INTEG_FUNC over [0, 2*pi].
 *
 * For j = 2 .. 26 the interval is divided into N = 2^j steps and the
 * resulting approximation is printed as one table row (the exact value of
 * the integral of |sin(x)| over this interval is 4).  After the table the
 * total processor time consumed, in clock() ticks, is printed.
 *
 * Returns 0.
 */
int main(void)
{
   unsigned int i, j, N;
   double step, x_i, sum;
   clock_t start, finish;
   double duration;
   double interval_begin = 0.0;
   double interval_end = 2.0 * 3.141592653589793238;

   start = clock();

   printf("     \n");
   printf("    Number of    | Computed Integral | \n");
   printf(" Interior Points |                   | \n");
   for (j=2;j<27;j++)
   {
    printf("------------------------------------- \n");

     N =  1u << j;
     step = (interval_end - interval_begin) / N;

     /* The two end points carry half weight in the trapezoidal rule. */
     sum = INTEG_FUNC(interval_begin) * step / 2.0;

     for (i=1;i<N;i++)
     {
        x_i = i * step;
        sum += INTEG_FUNC(x_i) * step;
     }

     sum += INTEG_FUNC(interval_end) * step / 2.0;

     /* N is unsigned, so it is printed with %u. */
     printf(" %10u      |  %14e   | \n", N, sum);
   }
   finish = clock();

   /* Reported in raw clock() ticks, matching the "Application Clocks" label;
      divide by CLOCKS_PER_SEC to obtain seconds. */
   duration = (double)(finish - start);
   printf("     \n");
   printf("   Application Clocks   = %10e  \n", duration);
   printf("     \n");
   return 0;
}

耗时比较（单位：秒）

	O1	O2	O3（Ox）	优化集合（无快速浮点优化）	优化集合
VS2008 C/C++ Compiler	9.687	9.343	8.734	8.281	6.843
gcc4.4.4	20.219	20.296	20.593	15.062	15.046
gcc4.5.1	20.125	19.953	20.094	15.000	15.187
Intel C/C++ Compiler 11.1	6.640	4.828	4.828	4.812	4.812

优化集合同上

三、OpenMP测试

prime.cpp

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

/*
 * OpenMP benchmark: trial-division scan of the odd numbers in [start, end].
 *
 * Counts the candidates that pass the primality test and splits them into
 * the classes 4n+1 and 4n+3, then prints the elapsed clock() ticks and the
 * three counters.  The command-line arguments are not used.
 *
 * Returns 0.
 */
int main(int argc, char *argv[])
{
    int i;
    int start, end;
    int number_of_primes=0;
    int number_of_41primes=0;
    int number_of_43primes=0;
    double s1,s2;
    start = 1;
    end = 40000000;

    printf("Range to check for Primes: %d - %d\n\n",start, end);

    /* NOTE(review): clock() reports processor time; on platforms where that
       is accumulated over all threads it will not show the wall-clock
       speed-up from OpenMP -- confirm for the target platform. */
    s1=clock();

    /* A directive that spans several lines needs a trailing backslash on
       every line but the last; without it the reduction clause is parsed as
       an ordinary statement and the file fails to compile. */
#pragma omp parallel for schedule(dynamic,100) \
        reduction(+:number_of_primes,number_of_41primes,number_of_43primes)
    for (i = start; i <= end; i += 2) {
        int limit, j, prime;

        /* Odd trial divisors up to sqrt(i) + 1 are sufficient. */
        limit = (int) sqrt((float)i) + 1;
        prime = 1;
        j = 3;

        while (prime && (j <= limit)) {
            if (i%j == 0) prime = 0;
            j += 2;
        }

        /* NOTE: with start = 1 the value 1 passes this test and is counted
           (also as a 4n+1 value), while 2 is never visited because only odd
           candidates are scanned.  Kept as-is so the output stays comparable
           with the published figures. */
        if (prime) {
            number_of_primes++;
            if (i%4 == 1) number_of_41primes++;
            if (i%4 == 3) number_of_43primes++;
        }
    }
    s2=clock();
      printf("\n%10e\n",s2-s1);
    printf("\nProgram Done.\n %d primes found\n",number_of_primes);
    printf("\nNumber of 4n+1 primes found: %d\n",number_of_41primes);
    printf("\nNumber of 4n-1 primes found: %d\n",number_of_43primes);

    return 0;
}

采用优化集合+OpenMP参数

其中，VS2008为/openmp，gcc为-fopenmp，Intel Compiler为/Qopenmp。

VS2008 C/C++ Compiler	16.781
gcc4.4.4	16.828
gcc4.5.1	15.672
Intel C/C++ Compiler 11.1	16.703

四、Fortran Compiler测试

Fortran编译器和以上的结果类似，除了VS2008（不支持Fortran），

gfortran在普通计算上和intel compiler相差很少，

只是在三角函数运算上落后较多。

linpk标准测试

代码来源：http://www.polyhedron.com/compare0html

	O1	O2	O3	优化集合（无快速浮点优化）	优化集合
gfortran4.4.4	25.109	24.938	25.172	24.846	24.922
gfortran4.5.1	24.375	24.313	24.203	24.063	24.234
Intel Fortran Compiler 11.1	25.813	25.188	25.016	25.484	25.203

矩阵相乘测试(内置函数)

main.f90

!> Benchmark: time the intrinsic matmul on two n x n matrices filled with
!> random numbers and print the CPU time consumed by the product.
program main
  implicit none

  ! Portable double-precision kind (kind = 8 is a compiler-specific number).
  integer, parameter :: dp = selected_real_kind(15, 307)
  ! Matrix order used by the benchmark.
  integer, parameter :: n = 2000

  ! Three n x n matrices of 8-byte reals are roughly 96 MB in total, so they
  ! are allocated dynamically and the allocation status is checked.
  real(dp), allocatable :: a(:, :), b(:, :), c(:, :)
  real(dp) :: time_begin, time_end
  integer :: ierr

  allocate(a(n, n), b(n, n), c(n, n), stat=ierr)
  if (ierr /= 0) stop 'allocation of benchmark matrices failed'

  call random_seed()
  call random_number(a)
  call random_number(b)

  ! Only the matrix product itself lies between the two timer calls.
  ! NOTE(review): c is never read after this point; confirm that the
  ! optimiser does not discard the product at high optimisation levels.
  call cpu_time(time_begin)
  c = matmul(a, b)
  call cpu_time(time_end)
  write(*,*)"consumed CPU_time(s):", time_end - time_begin

  deallocate(a, b, c)
end program main

	O1	O2	O3	优化集合（无快速浮点优化）	优化集合
gfortran4.4.4	15.500	15.563	15.688	15.656	15.469
Intel Fortran Compiler 11.1	37.734	37.359	4.484	5.047	4.953

矩阵相乘测试（调用原始blas）

blas代码来源：http://www.netlib.org/lapack/

main.f90

!> Benchmark: time one call to the reference BLAS routine dgemm
!> (c = 1.0 * a * b + 0.0 * c) on n x n matrices filled with random numbers
!> and print the CPU time consumed by the call.
program main
  implicit none

  ! Portable double-precision kind (kind = 8 is a compiler-specific number).
  ! NOTE(review): dgemm expects double precision arguments; dp is assumed to
  ! be the same kind on the compilers being compared -- confirm.
  integer, parameter :: dp = selected_real_kind(15, 307)
  ! Matrix order; also passed to dgemm as every leading dimension.
  integer, parameter :: n = 2000

  ! Three n x n matrices of 8-byte reals are roughly 96 MB in total, so they
  ! are allocated dynamically and the allocation status is checked.
  real(dp), allocatable :: a(:, :), b(:, :), c(:, :)
  real(dp) :: time_begin, time_end
  integer :: ierr

  ! dgemm is linked from the external BLAS sources (implicit interface).
  external :: dgemm

  allocate(a(n, n), b(n, n), c(n, n), stat=ierr)
  if (ierr /= 0) stop 'allocation of benchmark matrices failed'

  call random_seed()
  call random_number(a)
  call random_number(b)

  ! Only the dgemm call itself lies between the two timer calls.
  call cpu_time(time_begin)
  call dgemm('N', 'N', n, n, n, 1.0_dp, a, n, b, n, 0.0_dp, c, n)
  call cpu_time(time_end)
  write(*,*)"consumed CPU_time(s):", time_end - time_begin

  deallocate(a, b, c)
end program main

	O1	O2	O3	优化集合（无快速浮点优化）	优化集合
gfortran4.4.4	18.500	17.844	17.391	17.016	17.156
Intel Fortran Compiler 11.1	14.938	13.969	13.938	18.227	18.430

五、结论

Intel Compiler在测试中表现良好，尤其对内置函数进行了比较多的优化，VS2008亦表现不错，

gcc除了在三角函数计算上明显落后之外，其他方面的性能表现也还不错；再考虑到gcc开源且跨平台，它

占有比Intel Compiler和Microsoft（VS2008）Compiler更重要的位置。

原文链接: https://www.cnblogs.com/xunxun1982/archive/2010/08/26/1808623.html

欢迎关注

微信关注下方公众号，第一时间获取干货硬货；公众号内回复【pdf】免费获取数百本计算机经典书籍

原创文章受到原创版权保护。转载请注明出处：https://www.ccppcoding.com/archives/14368

非原创文章文中已经注明原地址，如有侵权，联系删除

关注公众号【高性能架构探索】，第一时间获取最新文章

转载文章受原作者版权保护。转载请注明原作者出处！

编译器优化性能初步比较

相关推荐