指令预取: 是指提前将所需要的数据取出来,在使用时可用。
具体方法就是在不命中时,当数据从主存储器中取出送往CPU的同时,把主存储器相邻几个单元中的数据(称为一个数据块)都取出来送入Cache中。
CPU在cache不命中的情况下,将从内存读取一个连续的cacheline大小数据。
- 如果访问数据地址连续,CPU产生Cache不命中的情况少,省时
- 如果访问数据地址不连续,CPU产生的Cache不命中的情况多,耗时
示例–矩阵乘法
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78
| #include <stdio.h> #include <stdlib.h>
#include <time.h>
#if __aarch64__ #define nop asm("nop") #else #define nop #endif
int main(int argc, const char *argv[]) { unsigned long i, j, k; int N = 700; int res[N][N], mul1[N][N], mul2[N][N]; clock_t start, end; long time1 = 0, time2 = 0;
for (i = 0; i < N; ++i) { for (j = 0; j < N; ++j) { mul1[i][j] = (i + 1) * j; mul2[i][j] = i * j; } }
start = clock();
nop;nop;nop; for (i = 0; i < N; ++i) { for (j = 0; j < N; ++j) { for (k = 0; k < N; ++k) { res[i][j] += mul1[i][k] * mul2[k][j]; } } } nop;nop;nop; end = clock(); time1 = end - start; printf("Run Time1 %f s\n", (double)time1 / CLOCKS_PER_SEC);
int tmp[N][N]; for (i = 0; i < N; ++i) { for (j = 0; j < N; ++j) { mul1[i][j] = (i + 1) * j; mul2[i][j] = i * j; } }
start = clock(); for (i = 0; i < N; ++i) { for (j = 0; j < N; ++j) { tmp[i][j] = mul2[j][i]; } }
nop;nop;nop; for (i = 0; i < N; ++i) { for (j = 0; j < N; ++j) { for (k = 0; k < N; ++k) { res[i][j] += mul1[i][k] * tmp[j][k]; } } } nop;nop;nop; end = clock(); time2 = end - start; printf("Run Time2 %f s\n", (double)time2 / CLOCKS_PER_SEC);
printf("Time2 and Time1 upgrade %f %\n", (double)(time1 - time2) / time1 * 100);
return 0; }
|
1 2 3 4
| [shell@localhost:/]$ ./prefetch Run Time1 5.966075 s Run Time2 4.530201 s Time2 and Time1 upgrade 24.067314 %
|
代码运行速度提升24%
结论