假设 a1、b1、c1 和 d1 都指向堆内存,并且我的数值计算代码具有下列核心循环。
// Fused variant: both element-wise additions are performed in the same pass
// over j, so each iteration touches all four arrays.
const int n = 100000;
for (int j = 0; j < n; j++) {
    a1[j] += b1[j];
    c1[j] += d1[j];
}
该循环通过另一个外部for循环执行了10,000次。为了加快速度,我将代码更改为:
// Split variant: the first pass touches only a1 and b1 ...
for (int j = 0; j < n; j++) {
    a1[j] += b1[j];
}
// ... and the second pass touches only c1 and d1.
for (int j = 0; j < n; j++) {
    c1[j] += d1[j];
}
使用 MS Visual C++ 10.0 在开启完全优化、并为 32 位目标启用 SSE2 的条件下编译,在 Intel Core 2 Duo(x64)上运行时,第一个示例耗时 5.5 秒,而双循环示例仅耗时 1.9 秒。我的问题是:(请参阅底部我重新表述的问题)
PS:我不确定,这是否有帮助:
第一个循环的反汇编基本上是这样的(此块在整个程序中重复了大约五次):
; Each movsd/addsd/movsd triple loads one double into xmm0, adds the value at
; the destination address, and stores the sum back to that same address.
; Triples alternate between the edx->ecx and esi->eax register pairs; which
; register holds which array is not shown (presumably b1/a1 and d1/c1 -- confirm).
; The listing is cut off after the final addsd (its store is not shown).
movsd       xmm0,mmword ptr [edx+18h]
addsd       xmm0,mmword ptr [ecx+20h]
movsd       mmword ptr [ecx+20h],xmm0
movsd       xmm0,mmword ptr [esi+10h]
addsd       xmm0,mmword ptr [eax+30h]
movsd       mmword ptr [eax+30h],xmm0
movsd       xmm0,mmword ptr [edx+20h]
addsd       xmm0,mmword ptr [ecx+28h]
movsd       mmword ptr [ecx+28h],xmm0
movsd       xmm0,mmword ptr [esi+18h]
addsd       xmm0,mmword ptr [eax+38h]
双循环示例的每个循环都会生成此代码(以下块重复大约三遍):
; Same load/add/store pattern as the fused loop, but every triple reads its
; source operand via ecx and updates memory via eax only.
; The listing starts mid-triple: the load that precedes the first addsd is not shown.
addsd       xmm0,mmword ptr [eax+28h]
movsd       mmword ptr [eax+28h],xmm0
movsd       xmm0,mmword ptr [ecx+20h]
addsd       xmm0,mmword ptr [eax+30h]
movsd       mmword ptr [eax+30h],xmm0
movsd       xmm0,mmword ptr [ecx+28h]
addsd       xmm0,mmword ptr [eax+38h]
movsd       mmword ptr [eax+38h],xmm0
movsd       xmm0,mmword ptr [ecx+30h]
addsd       xmm0,mmword ptr [eax+40h]
movsd       mmword ptr [eax+40h],xmm0
事实证明原来的问题意义不大,因为程序的行为在很大程度上取决于数组的大小(n)和 CPU 缓存。因此,如果大家仍有兴趣,我把问题重新表述如下:
您能否对导致不同缓存行为的细节提供深入的了解,如下图的五个区域所示?
通过为这些CPU提供类似的图形来指出CPU /缓存体系结构之间的差异也可能很有趣。
PPS:这是完整的代码。它使用 TBB 的 tick_count 进行更高分辨率的计时;不定义 TBB_TIMING 宏即可禁用它:
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <iomanip>
#include <iostream>
#include <string>
//#define TBB_TIMING
#ifdef TBB_TIMING   
#include <tbb/tick_count.h>
using tbb::tick_count;
#else
#include <time.h>
#endif
using namespace std;
// Define preallocate_memory as one of the layout values below to allocate the
// arrays once in main() instead of on every call to plain().
//#define preallocate_memory new_cont

// Memory layouts understood by allo()/ff(). The values double as indices into
// the column-name table in main().
enum {
    new_cont = 0,  // one block holding all four arrays back to back
    new_sep  = 1   // four independent allocations
};

// Work arrays shared by allo(), ff() and plain().
double *a1;
double *b1;
double *c1;
double *d1;
// Allocates the global work arrays a1, b1, c1, d1 (n doubles each) and sets
// every element to 1.0.
//   cont == new_cont : one new double[n*4]; b1, c1 and d1 point into that
//                      block, each n elements after the previous array.
//   cont == new_sep  : four independent new double[n] allocations.
// For any other value of cont nothing is allocated before the fill loop runs.
void allo(int cont, int n)
{
    if (cont == new_cont) {
        double *block = new double[n*4];
        a1 = block;
        b1 = a1 + n;
        c1 = b1 + n;
        d1 = c1 + n;
    } else if (cont == new_sep) {
        a1 = new double[n];
        b1 = new double[n];
        c1 = new double[n];
        d1 = new double[n];
    }

    // Same starting value everywhere so every run begins from identical data.
    for (int k = 0; k < n; ++k)
        a1[k] = b1[k] = c1[k] = d1[k] = 1.0;
}
// Frees the arrays for the given layout.
//   new_sep  : b1, c1 and d1 are deleted, then a1.
//   new_cont : only a1 is deleted (allo() makes b1, c1, d1 point into a1's block).
// Any other value of cont does nothing.
void ff(int cont)
{
    if (cont != new_sep && cont != new_cont)
        return;

    if (cont == new_sep) {
        delete[] b1;
        delete[] c1;
        delete[] d1;
    }
    delete[] a1;
}
// Runs the benchmark kernel and returns its measured throughput.
//
//   n     : number of elements processed in each array.
//   m     : number of times the pass over the arrays is repeated.
//   cont  : layout handed to allo()/ff(); only used when preallocate_memory
//           is not defined.
//   loops : 1 performs both additions in a single inner loop; any other value
//           performs them in two consecutive inner loops.
//
// Returns 2*n*m divided by the elapsed time in seconds (two additions per
// element per pass). Time is taken with tbb::tick_count when TBB_TIMING is
// defined and with clock() otherwise; allocation and release are outside the
// timed region.
double plain(int n, int m, int cont, int loops)
{
#ifndef preallocate_memory
    allo(cont,n);
#endif
#ifdef TBB_TIMING   
    tick_count begun = tick_count::now();
#else
    clock_t begun = clock();
#endif
    if (loops != 1) {
        // Split form: one pass per pair of arrays.
        for (int pass = 0; pass < m; ++pass) {
            for (int k = 0; k < n; ++k)
                a1[k] += b1[k];
            for (int k = 0; k < n; ++k)
                c1[k] += d1[k];
        }
    } else {
        // Fused form: both pairs updated in the same pass.
        for (int pass = 0; pass < m; ++pass) {
            for (int k = 0; k < n; ++k) {
                a1[k] += b1[k];
                c1[k] += d1[k];
            }
        }
    }
#ifdef TBB_TIMING   
    tick_count finished = tick_count::now();
    const double rate = 2.0*double(n)*double(m)/(finished - begun).seconds();
#else
    clock_t finished = clock();
    const double rate = 2.0*double(n)*double(m)/double(finished - begun) *double(CLOCKS_PER_SEC);
#endif
#ifndef preallocate_memory
    ff(cont);
#endif
    return rate;
}
void main()
{   
    freopen("C:\\test.csv", "w", stdout);
    char *s = " ";
    string na[2] ={"new_cont", "new_sep"};
    cout << "n";
    for (int j = 0; j < 2; j++)
        for (int i = 1; i <= 2; i++)
#ifdef preallocate_memory
            cout << s << i << "_loops_" << na[preallocate_memory];
#else
            cout << s << i << "_loops_" << na[j];
#endif
    cout << endl;
    long long nmax = 1000000;
#ifdef preallocate_memory
    allo(preallocate_memory, nmax);
#endif
    for (long long n = 1L; n < nmax; n = max(n+1, long long(n*1.2)))
    {
        const long long m = 10000000/n;
        cout << n;
        for (int j = 0; j < 2; j++)
            for (int i = 1; i <= 2; i++)
                cout << s << plain(n, m, j, i);
        cout << endl;
    }
}
(图中显示了不同 n 值下的 FLOP/s。)

针对此类情况有 restrict 关键字。我不知道 MSVC 是否有类似的东西。当然,如果这确实是问题所在,那么 SSE 代码就是不正确的。
                d1[j] 可能与 a1[j] 互为别名,因此编译器可能会放弃某些内存优化。如果把写操作拆分到两个循环中,就不会出现这种情况。
                



