我决定亲自试一试 __builtin_prefetch()。我把代码作为答案贴在这里,方便其他人在自己的机器上测试。结果与 Jukka 的描述接近:与预取距离为 0(即不提前预取)相比,提前 20 个元素进行预取可使运行时间减少约 20%。
结果:
prefetch = 0, time = 1.58000
prefetch = 1, time = 1.47000
prefetch = 2, time = 1.39000
prefetch = 3, time = 1.34000
prefetch = 4, time = 1.31000
prefetch = 5, time = 1.30000
prefetch = 6, time = 1.27000
prefetch = 7, time = 1.28000
prefetch = 8, time = 1.26000
prefetch = 9, time = 1.27000
prefetch = 10, time = 1.27000
prefetch = 11, time = 1.27000
prefetch = 12, time = 1.30000
prefetch = 13, time = 1.29000
prefetch = 14, time = 1.30000
prefetch = 15, time = 1.28000
prefetch = 16, time = 1.24000
prefetch = 17, time = 1.28000
prefetch = 18, time = 1.29000
prefetch = 19, time = 1.25000
prefetch = 20, time = 1.24000
prefetch = 19, time = 1.26000
prefetch = 18, time = 1.27000
prefetch = 17, time = 1.26000
prefetch = 16, time = 1.27000
prefetch = 15, time = 1.28000
prefetch = 14, time = 1.29000
prefetch = 13, time = 1.26000
prefetch = 12, time = 1.28000
prefetch = 11, time = 1.30000
prefetch = 10, time = 1.31000
prefetch = 9, time = 1.27000
prefetch = 8, time = 1.32000
prefetch = 7, time = 1.31000
prefetch = 6, time = 1.30000
prefetch = 5, time = 1.27000
prefetch = 4, time = 1.33000
prefetch = 3, time = 1.38000
prefetch = 2, time = 1.41000
prefetch = 1, time = 1.41000
prefetch = 0, time = 1.59000
代码:
#include <stdlib.h>
#include <time.h>
#include <stdio.h>
/*
 * Gather-accumulate kernel: for every i in [0, n) add x[p[i]] to y[i],
 * issuing a software prefetch for the x element that will be needed
 * `pf` iterations later.
 *
 *   y  - destination array of n ints, updated in place
 *   x  - source array, indexed through p
 *   p  - array of n indices into x
 *   n  - number of elements to process (n <= 0 does nothing)
 *   pf - prefetch distance in iterations; negative values are treated
 *        as 0 (prefetch the element about to be used)
 *
 * The index array is never read outside [0, n): the last `pf` iterations
 * run in a separate tail loop without prefetching, so the hot loop stays
 * branch-free.
 */
void cracker(int *y, int *x, int *p, int n, int pf) {
    int i;
    /* local copy lets the compiler optimize address computations */
    int saved = pf < 0 ? 0 : pf;
    /* last iteration for which p[i + saved] is still inside the array */
    int limit = n > saved ? n - saved : 0;

    for (i = 0; i < limit; i++) {
        __builtin_prefetch(&x[p[i + saved]]);
        y[i] += x[p[i]];
    }
    /* tail: nothing left to prefetch, just finish the accumulation */
    for (; i < n; i++) {
        y[i] += x[p[i]];
    }
}
/*
 * Benchmark driver: times cracker() for prefetch distances 0..k-1 in
 * ascending order and then k..0 in descending order, printing one line
 * per run with the CPU time reported by clock().
 *
 * Returns 0 on success, EXIT_FAILURE if the working arrays cannot be
 * allocated.
 */
int main(void) {
    int n = 50000000;
    int *x, *y, *p, i, pf, k;
    clock_t start, stop;
    double elapsed;

    /* set up arrays */
    x = malloc(sizeof *x * n);
    y = malloc(sizeof *y * n);
    p = malloc(sizeof *p * n);
    if (x == NULL || y == NULL || p == NULL) {
        fprintf(stderr, "failed to allocate benchmark arrays\n");
        free(x);
        free(y);
        free(p);
        return EXIT_FAILURE;
    }
    for (i = 0; i < n; i++) {
        /*
         * Give x and y defined contents: reading malloc'd memory that was
         * never written is undefined. Values are kept small so that the
         * repeated y[i] += x[...] across all runs cannot overflow an int.
         */
        x[i] = i & 0xff;
        y[i] = 0;
        p[i] = rand() % n;
    }

    /* warm-up exercise (explicit distance 0; pf is not yet initialized) */
    cracker(y, x, p, n, 0);

    k = 20;
    for (pf = 0; pf < k; pf++) {
        start = clock();
        cracker(y, x, p, n, pf);
        stop = clock();
        elapsed = ((double)(stop - start)) / CLOCKS_PER_SEC;
        printf("prefetch = %3d, time = %.5lf\n", pf, elapsed);
    }
    for (pf = k; pf >= 0; pf--) {
        start = clock();
        cracker(y, x, p, n, pf);
        stop = clock();
        elapsed = ((double)(stop - start)) / CLOCKS_PER_SEC;
        printf("prefetch = %3d, time = %.5lf\n", pf, elapsed);
    }

    free(p);
    free(y);
    free(x);
    return 0;
}