OS:Windows XP 32 bit
CPU:Intel Mobile Core 2 Duo T6600
一、混合四则运算
main.c
#include <stdio.h>
#include <time.h>

/*
 * Mixed integer / float / double arithmetic micro-benchmark.
 *
 * Executes 1000 x 1000000 passes over a fixed chain of add, subtract,
 * multiply and divide operations, prints the final double results and then
 * the time measured by clock(), converted to seconds.
 */
int main(void)
{
    int i, j, a = 1, b = 1;
    float c = 1.0, d = 1.0;
    double e = 1.0, f = 1.0;
    clock_t start, finish;
    double duration;

    start = clock();
    for (i = 0; i < 1000; i++) {
        for (j = 0; j < 1000000; j++) {
            /*
             * The integer chain multiplies by 20 on every pass and leaves
             * the range of int after a few iterations. Signed overflow is
             * undefined behaviour, so the arithmetic is done in unsigned
             * (which wraps) and the result is converted back to int.
             */
            a = (int)((unsigned)a + 50u);
            b = (int)((unsigned)a - 100u);
            a = (int)((unsigned)b * 20u);

            /* Single-precision chain (operands are promoted to double,
             * results are narrowed back to float on assignment). */
            c = a + 300.89;
            d = c - 600.89;
            c = d * 90.89;
            d = c / 55.89;

            /* Double-precision chain. */
            e = c * 90.89;
            f = e / 55.89;
        }
    }
    finish = clock();

    /* clock() counts in ticks; CLOCKS_PER_SEC converts them to seconds. */
    duration = (double)(finish - start) / CLOCKS_PER_SEC;

    printf("%f,%f\n", e, f);
    printf("%10e", duration);
    return 0;
}
耗时比较(单位:秒)
编译器 | O1 | O2 | O3(Ox) | 优化集合(无快速浮点优化) | 优化集合 |
VS2008 C/C++ Compiler | 10.015 | 9.530 | 9.530 | 2.734 | 1.968 |
gcc4.4.4 | 10.250 | 10.250 | 10.265 | 7.203 | 5.328 |
gcc4.5.1 | 10.390 | 10.375 | 10.969 | 6.156 | 4.265 |
Intel C/C++ Compiler 11.1 | 9.375 | 9.343 | 9.343 | 9.015 | 8.843 |
优化集合为
VS2008 C/C++ Compiler | /Ox /Ob2 /Og /Oi /Ot /Oy /fp:fast /arch:SSE2 |
gcc4.4.4 / gcc4.5.1 | -O3 -ftracer -fivopts -ftree-loop-linear -ftree-vectorize -fforce-addr -fomit-frame-pointer -fno-bounds-check -funroll-loops -ffast-math -march=native -mfpmath=sse -mmmx -msse -msse2 -msse3 |
Intel C/C++ Compiler 11.1 | /fast /O3 /Ot /Og /Oi /Qipo /QxHost /arch:SSE3 /Qunroll /Qvec /Quse-intel-optimized-headers /Qparallel /fp:fast=2 /Ob2 /GT /GA |
二、三角函数
main.c(来源于Intel官方)
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>

/*
 * Integrand |sin(x)|.
 * fabs() is required here: abs() from <stdlib.h> takes an int, so
 * abs(sin(x)) would truncate the sine to 0 for almost every x.
 */
#define INTEG_FUNC(x) fabs(sin(x))

/*
 * Approximates the integral of |sin(x)| over [0, 2*pi] with the trapezoidal
 * rule, using N = 2^j subintervals for j = 2..26, and prints one table row
 * per N followed by the total number of clock() ticks consumed.
 */
int main(void)
{
    unsigned int i, j, N;
    double step, x_i, sum;
    clock_t start, finish;
    double duration;
    double interval_begin = 0.0;
    double interval_end = 2.0 * 3.141592653589793238;

    start = clock();

    printf(" \n");
    printf(" Number of | Computed Integral | \n");
    printf(" Interior Points | | \n");

    for (j = 2; j < 27; j++) {
        printf("------------------------------------- \n");

        N = 1u << j;
        step = (interval_end - interval_begin) / N;

        /* The two end points carry half weight in the trapezoidal rule. */
        sum = INTEG_FUNC(interval_begin) * step / 2.0;
        for (i = 1; i < N; i++) {
            x_i = i * step;
            sum += INTEG_FUNC(x_i) * step;
        }
        sum += INTEG_FUNC(interval_end) * step / 2.0;

        /* N is unsigned, so it is printed with %u. */
        printf(" %10u | %14e | \n", N, sum);
    }

    finish = clock();
    /* Reported in raw clock() ticks, as the output label states. */
    duration = (double)(finish - start);

    printf(" \n");
    printf(" Application Clocks = %10e \n", duration);
    printf(" \n");
    return 0;
}
耗时比较(单位:秒)
编译器 | O1 | O2 | O3(Ox) | 优化集合(无快速浮点优化) | 优化集合 |
VS2008 C/C++ Compiler | 9.687 | 9.343 | 8.734 | 8.281 | 6.843 |
gcc4.4.4 | 20.219 | 20.296 | 20.593 | 15.062 | 15.046 |
gcc4.5.1 | 20.125 | 19.953 | 20.094 | 15.000 | 15.187 |
Intel C/C++ Compiler 11.1 | 6.640 | 4.828 | 4.828 | 4.812 | 4.812 |
优化集合同上
三、OpenMP测试
prime.cpp
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#ifdef _OPENMP
#include <omp.h>
#endif

/*
 * Returns a time stamp in seconds.
 * With OpenMP enabled this is wall-clock time from omp_get_wtime(); clock()
 * may report CPU time accumulated over all threads, which does not measure
 * the elapsed time of a parallel region. Without OpenMP it falls back to
 * clock() converted to seconds.
 */
static double time_stamp_seconds(void)
{
#ifdef _OPENMP
    return omp_get_wtime();
#else
    return (double)clock() / CLOCKS_PER_SEC;
#endif
}

/*
 * Counts the primes in [start, end] by trial division, and separately the
 * primes of the form 4n+1 and 4n-1. The candidate loop is parallelised with
 * OpenMP when the compiler's OpenMP option is enabled. Prints the elapsed
 * time in seconds followed by the three counts.
 */
int main(int argc, char *argv[])
{
    int i;
    int start, end;
    int number_of_primes = 0;
    int number_of_41primes = 0;
    int number_of_43primes = 0;
    double s1, s2;

    (void)argc;
    (void)argv;

    start = 1;
    end = 40000000;
    printf("Range to check for Primes: %d - %d\n\n", start, end);

    s1 = time_stamp_seconds();

    /* Only odd candidates are visited (start is odd, step is 2). */
#pragma omp parallel for schedule(dynamic,100) reduction(+:number_of_primes,number_of_41primes,number_of_43primes)
    for (i = start; i <= end; i += 2) {
        int limit, j, prime;

        limit = (int) sqrt((float)i) + 1;
        /* 1 has no odd divisor <= limit but is not a prime. */
        prime = (i > 1);
        j = 3;
        while (prime && (j <= limit)) {
            if (i % j == 0)
                prime = 0;
            j += 2;
        }

        if (prime) {
            number_of_primes++;
            if (i % 4 == 1)
                number_of_41primes++;
            if (i % 4 == 3)
                number_of_43primes++;
        }
    }

    /* The loop skips even numbers, so the only even prime is added here.
     * 2 is neither 4n+1 nor 4n-1, so only the total changes. */
    if (start <= 2 && end >= 2)
        number_of_primes++;

    s2 = time_stamp_seconds();

    printf("\n%10e\n", s2 - s1);
    printf("\nProgram Done.\n %d primes found\n", number_of_primes);
    printf("\nNumber of 4n+1 primes found: %d\n", number_of_41primes);
    printf("\nNumber of 4n-1 primes found: %d\n", number_of_43primes);
    return 0;
}
采用优化集合+OpenMP参数
其中,VS2008为/openmp,gcc为-fopenmp,intel compiler为/Qopenmp。
VS2008 C/C++ Compiler | 16.781 |
gcc4.4.4 | 16.828 |
gcc4.5.1 | 15.672 |
Intel C/C++ Compiler 11.1 | 16.703 |
四、Fortran Compiler测试
Fortran编译器和以上的结果类似,除了VS2008(不支持Fortran),
gfortran在普通计算上和intel compiler相差很少,
只是在三角函数运算上落后较多。
linpk标准测试
代码来源:http://www.polyhedron.com/compare0html
编译器 | O1 | O2 | O3 | 优化集合(无快速浮点优化) | 优化集合 |
gfortran4.4.4 | 25.109 | 24.938 | 25.172 | 24.846 | 24.922 |
gfortran4.5.1 | 24.375 | 24.313 | 24.203 | 24.063 | 24.234 |
Intel Fortran Compiler 11.1 | 25.813 | 25.188 | 25.016 | 25.484 | 25.203 |
矩阵相乘测试(内置函数)
main.f90
! Times the intrinsic MATMUL on two 2000 x 2000 real(kind=8) matrices filled
! with random numbers and prints the CPU time spent in the product.
program main
  implicit none

  integer, parameter :: n = 2000
  real(kind=8), dimension(n, n) :: A, B, C
  real(kind=8) :: t_start, t_stop

  ! Fill both operands with pseudo-random values.
  call random_seed()
  call random_number(A)
  call random_number(B)

  ! Only the matrix product itself is timed.
  call cpu_time(t_start)
  C = matmul(A, B)
  call cpu_time(t_stop)

  write (*, *) "consumed CPU_time(s):", t_stop - t_start
end program main
编译器 | O1 | O2 | O3 | 优化集合(无快速浮点优化) | 优化集合 |
gfortran4.4.4 | 15.500 | 15.563 | 15.688 | 15.656 | 15.469 |
Intel Fortran Compiler 11.1 | 37.734 | 37.359 | 4.484 | 5.047 | 4.953 |
矩阵相乘测试(调用原始blas)
blas代码来源:http://www.netlib.org/lapack/
main.f90
! Times the reference BLAS routine DGEMM (C = 1.0*A*B + 0.0*C) on two
! 2000 x 2000 real(kind=8) matrices filled with random numbers and prints
! the CPU time spent in the call.
program main
  implicit none

  integer, parameter :: n = 2000
  real(kind=8), dimension(n, n) :: A, B, C
  real(kind=8) :: t_start, t_stop

  ! Fill both operands with pseudo-random values.
  call random_seed()
  call random_number(A)
  call random_number(B)

  ! Only the DGEMM call itself is timed; neither operand is transposed.
  call cpu_time(t_start)
  call dgemm('N', 'N', n, n, n, 1.0_8, A, n, &
             B, n, 0.0_8, C, n)
  call cpu_time(t_stop)

  write (*, *) "consumed CPU_time(s):", t_stop - t_start
end program main
编译器 | O1 | O2 | O3 | 优化集合(无快速浮点优化) | 优化集合 |
gfortran4.4.4 | 18.500 | 17.844 | 17.391 | 17.016 | 17.156 |
Intel Fortran Compiler 11.1 | 14.938 | 13.969 | 13.938 | 18.227 | 18.430 |
五、结论
Intel Compiler在测试中表现良好,尤其对内置函数做了较多优化;VS2008亦表现不错。
gcc除了在三角函数计算上明显落后外,其他方面的性能表现也还不错;考虑到gcc开源且跨平台,
它占有比Intel Compiler和微软(M$)编译器更重要的位置。
原文链接: https://www.cnblogs.com/xunxun1982/archive/2010/08/26/1808623.html
欢迎关注
微信关注下方公众号,第一时间获取干货硬货;公众号内回复【pdf】免费获取数百本计算机经典书籍
原创文章受到原创版权保护。转载请注明出处:https://www.ccppcoding.com/archives/14368
非原创文章文中已经注明原地址,如有侵权,联系删除
关注公众号【高性能架构探索】,第一时间获取最新文章
转载文章受原作者版权保护。转载请注明原作者出处!