L2 HW预取器真的有用吗？

我在Whiskey Lake i7-8565U上分析性能计数器以及复制512 KiB数据（L2缓存大小的两倍）所花的时间，对L2硬件预取器的工作方式产生了一些困惑。

在英特尔手册第4卷（MSR）中，MSR 0x1A4的位0用于控制L2硬件预取器（置1表示禁用）。

考虑以下基准：

memcopy.h：

#include <stddef.h> /* size_t: keeps this header usable without a prior <string.h> */

/*
 * Forward copy built from 32-byte AVX load/store pairs; implemented in
 * memcopy.S.
 *
 * dest, src  non-overlapping buffers. The implementation uses aligned
 *            32-byte moves (vmovdqa), so both must be 32-byte aligned.
 * sz         number of bytes to copy. The assembly loop moves 64 bytes per
 *            pass and always runs at least once, so pass a non-zero
 *            multiple of 64.
 *
 * The callers in main.c ignore the return value.
 */
void *avx_memcpy_forward_lsls(void *restrict dest, const void *restrict src, size_t sz);

memcopy.S：

/*
 * void *avx_memcpy_forward_lsls(void *restrict dest, const void *restrict src, size_t sz)
 *
 * In:  rdi = dest, rsi = src, rdx = sz in bytes.
 * Out: rax = dest.
 *
 * Copies 64 bytes per loop pass as load/store/load/store of two 32-byte
 * halves. The loop is bottom-tested, so one pass always runs: sz must be a
 * non-zero multiple of 64, otherwise bytes past sz are read and written.
 * vmovdqa requires 32-byte aligned addresses for both buffers.
 * Clobbers rcx, rdx, ymm0, ymm1 and flags.
 */
avx_memcpy_forward_lsls:
    mov rax, rdi                        /* return dest, as the void * prototype promises */
    shr rdx, 0x3                        /* rdx = sz / 8: loop limit in 8-byte units */
    xor rcx, rcx                        /* rcx = current offset in 8-byte units */
avx_memcpy_forward_loop_lsls:
    vmovdqa ymm0, [rsi + 8*rcx]         /* load bytes 0..31 of this 64-byte chunk */
    vmovdqa [rdi + rcx*8], ymm0         /* store bytes 0..31 */
    vmovdqa ymm1, [rsi + 8*rcx + 0x20]  /* load bytes 32..63 */
    vmovdqa [rdi + rcx*8 + 0x20], ymm1  /* store bytes 32..63 */
    add rcx, 0x08                       /* advance 8 units = 64 bytes */
    cmp rdx, rcx
    ja avx_memcpy_forward_loop_lsls     /* unsigned: continue while rcx < rdx */
    ret

main.c：

#include <string.h>
#include <stdlib.h>
#include <inttypes.h>
#include <x86intrin.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdio.h>
#include "memcopy.h"

/* Value passed as run_iterations to run_benchmark() in main. */
#define ITERATIONS 1000
/*
 * Size of each copy buffer in bytes (512 KiB). Parenthesised so that the
 * macro expands as a single value inside larger expressions such as
 * `total / BUF_SIZE`.
 */
#define BUF_SIZE (512 * 1024)

/* Copy source and destination, each BUF_SIZE bytes and 64-byte aligned. */
_Alignas(64) char src[BUF_SIZE];
_Alignas(64) char dest[BUF_SIZE];

/*
 * Forward declaration; the definition follows main in this file.
 * Runs `runs` benchmark rounds of `run_iterations` iterations each, calling
 * fn(dest, src, sz) repeatedly, and prints a line after every round.
 * NOTE(review): identifiers containing a double underscore are reserved for
 * the implementation in C; consider renaming together with all users.
 */
static void __run_benchmark(unsigned runs, unsigned run_iterations,
                    void *(*fn)(void *, const void*, size_t), void *dest, const void* src, size_t sz);

/*
 * Announce the benchmarked function by name (stringified from `fn`), then
 * hand all arguments unchanged to __run_benchmark. Written as a single
 * comma expression so it is used like an ordinary statement:
 *     run_benchmark(...);
 */
#define run_benchmark(runs, run_iterations, fn, dest, src, sz) \
    ((void)printf("Benchmarking " #fn "\n"), \
     __run_benchmark(runs, run_iterations, fn, dest, src, sz))

/*
 * Fill src with random bytes from /dev/urandom, then benchmark
 * avx_memcpy_forward_lsls copying BUF_SIZE bytes from src to dest
 * (20 runs of ITERATIONS iterations).
 *
 * Returns EXIT_FAILURE if the random source cannot be opened or read in
 * full, EXIT_SUCCESS otherwise.
 */
int main(void){
    int fd = open("/dev/urandom", O_RDONLY);
    if(fd < 0){
        perror("open /dev/urandom");
        return EXIT_FAILURE;
    }

    /* read() may return fewer bytes than requested; loop until src is full. */
    size_t filled = 0;
    while(filled < sizeof src){
        ssize_t got = read(fd, src + filled, sizeof src - filled);
        if(got < 0){
            perror("read /dev/urandom");
            close(fd);
            return EXIT_FAILURE;
        }
        if(got == 0){
            fputs("read /dev/urandom: unexpected end of file\n", stderr);
            close(fd);
            return EXIT_FAILURE;
        }
        filled += (size_t)got;
    }
    close(fd);

    run_benchmark(20, ITERATIONS, avx_memcpy_forward_lsls, dest, src, BUF_SIZE);
    return EXIT_SUCCESS;
}

/*
 * Call fn(dest, src, sz) 32 times for each of `iterations` passes, i.e.
 * 32 * iterations calls in total. The value returned by fn is discarded.
 * With iterations == 0 nothing is called.
 */
static inline void benchmark_copy_function(unsigned iterations, void *(*fn)(void *, const void *, size_t),
                                               void *restrict dest, const void *restrict src, size_t sz){
    enum { CALLS_PER_PASS = 32 };

    for(unsigned pass = 0; pass != iterations; ++pass){
        for(int call = 0; call < CALLS_PER_PASS; ++call){
            (void)fn(dest, src, sz);
        }
    }
}

/*
 * Execute `runs` benchmark rounds. Each round calls
 * benchmark_copy_function(run_iterations, fn, dest, src, sz) and then
 * prints "Run <k> finished", with k counting from 1.
 *
 * runs            number of rounds; 0 means nothing is run or printed
 * run_iterations  iteration count forwarded to benchmark_copy_function
 * fn              copy routine under test
 * dest, src, sz   arguments forwarded unchanged to fn
 */
static void __run_benchmark(unsigned runs, unsigned run_iterations,
                    void *(*fn)(void *, const void*, size_t), void *dest, const void* src, size_t sz){
    for(unsigned current_run = 1; current_run <= runs; current_run++){
        benchmark_copy_function(run_iterations, fn, dest, src, sz);
        /* %u, not %d: current_run is unsigned. */
        printf("Run %u finished\n", current_run);
    }
}

考虑对编译后的main.c进行的两次运行：

一。

MSR:

$ sudo rdmsr -p 0 0x1A4
0

Run:

$ taskset -c 0 sudo ../profile.sh ./bin 

 Performance counter stats for './bin':

    10 486 164 071      L1-dcache-loads                                               (12,13%)
    10 461 354 384      L1-dcache-load-misses     #   99,76% of all L1-dcache hits    (12,05%)
    10 481 930 413      L1-dcache-stores                                              (12,05%)
    10 461 136 686      l1d.replacement                                               (12,12%)
    31 466 394 422      l1d_pend_miss.fb_full                                         (12,11%)
   211 853 643 294      l1d_pend_miss.pending                                         (12,09%)
     1 759 204 317      LLC-loads                                                     (12,16%)
            31 007      LLC-load-misses           #    0,00% of all LL-cache hits     (12,16%)
     3 154 901 630      LLC-stores                                                    (6,19%)
    15 867 315 545      l2_rqsts.all_pf                                               (9,22%)
                 0      sw_prefetch_access.t1_t2                                      (12,22%)
         1 393 306      l2_lines_out.useless_hwpf                                     (12,16%)
     3 549 170 919      l2_rqsts.pf_hit                                               (12,09%)
    12 356 247 643      l2_rqsts.pf_miss                                              (12,06%)
                 0      load_hit_pre.sw_pf                                            (12,09%)
     3 159 712 695      l2_rqsts.rfo_hit                                              (12,06%)
     1 207 642 335      l2_rqsts.rfo_miss                                             (12,02%)
     4 366 526 618      l2_rqsts.all_rfo                                              (12,06%)
     5 240 013 774      offcore_requests.all_data_rd                                     (12,06%)
    19 936 657 118      offcore_requests.all_requests                                     (12,09%)
     1 761 660 763      offcore_response.demand_data_rd.any_response                                     (12,12%)
       287 044 397      bus-cycles                                                    (12,15%)
    36 816 767 779      resource_stalls.any                                           (12,15%)
    36 553 997 653      resource_stalls.sb                                            (12,15%)
    38 035 066 210      uops_retired.stall_cycles                                     (12,12%)
    24 766 225 119      uops_executed.stall_cycles                                     (12,09%)
    40 478 455 041      uops_issued.stall_cycles                                      (12,05%)
    24 497 256 548      cycle_activity.stalls_l1d_miss                                     (12,02%)
    12 611 038 018      cycle_activity.stalls_l2_miss                                     (12,09%)
        10 228 869      cycle_activity.stalls_l3_miss                                     (12,12%)
    24 707 614 483      cycle_activity.stalls_mem_any                                     (12,22%)
    24 776 110 104      cycle_activity.stalls_total                                     (12,22%)
    48 914 478 241      cycles                                                        (12,19%)

      12,155774555 seconds time elapsed

      11,984577000 seconds user
       0,015984000 seconds sys

二。

MSR:

$ sudo rdmsr -p 0 0x1A4
1

Run:

$ taskset -c 0 sudo ../profile.sh ./bin

 Performance counter stats for './bin':

    10 508 027 832      L1-dcache-loads                                               (12,05%)
    10 463 643 206      L1-dcache-load-misses     #   99,58% of all L1-dcache hits    (12,09%)
    10 481 296 605      L1-dcache-stores                                              (12,12%)
    10 444 854 468      l1d.replacement                                               (12,15%)
    29 287 445 744      l1d_pend_miss.fb_full                                         (12,17%)
   205 569 630 707      l1d_pend_miss.pending                                         (12,17%)
     5 103 444 329      LLC-loads                                                     (12,17%)
            33 406      LLC-load-misses           #    0,00% of all LL-cache hits     (12,17%)
     9 567 917 742      LLC-stores                                                    (6,08%)
     1 157 237 980      l2_rqsts.all_pf                                               (9,12%)
                 0      sw_prefetch_access.t1_t2                                      (12,17%)
           301 471      l2_lines_out.useless_hwpf                                     (12,17%)
       218 528 985      l2_rqsts.pf_hit                                               (12,17%)
       938 735 722      l2_rqsts.pf_miss                                              (12,17%)
                 0      load_hit_pre.sw_pf                                            (12,17%)
         4 096 281      l2_rqsts.rfo_hit                                              (12,17%)
     4 972 640 931      l2_rqsts.rfo_miss                                             (12,17%)
     4 976 006 805      l2_rqsts.all_rfo                                              (12,17%)
     5 175 544 191      offcore_requests.all_data_rd                                     (12,17%)
    15 772 124 082      offcore_requests.all_requests                                     (12,17%)
     5 120 635 892      offcore_response.demand_data_rd.any_response                                     (12,17%)
       292 980 395      bus-cycles                                                    (12,17%)
    37 592 020 151      resource_stalls.any                                           (12,14%)
    37 317 091 982      resource_stalls.sb                                            (12,11%)
    38 121 826 730      uops_retired.stall_cycles                                     (12,08%)
    25 430 699 605      uops_executed.stall_cycles                                     (12,04%)
    41 416 190 037      uops_issued.stall_cycles                                      (12,04%)
    25 326 579 070      cycle_activity.stalls_l1d_miss                                     (12,04%)
    25 019 148 253      cycle_activity.stalls_l2_miss                                     (12,03%)
         7 384 770      cycle_activity.stalls_l3_miss                                     (12,03%)
    25 442 709 033      cycle_activity.stalls_mem_any                                     (12,03%)
    25 406 897 956      cycle_activity.stalls_total                                     (12,03%)
    49 877 044 086      cycles                                                        (12,03%)

      12,231406658 seconds time elapsed

      12,226386000 seconds user
       0,004000000 seconds sys

我注意到这个计数器：

12 611 038 018 cycle_activity.stalls_l2_miss vs
25 019 148 253 cycle_activity.stalls_l2_miss

这表明禁用L2硬件预取器的MSR设置确实生效了。其他与L2/LLC相关的计数器也存在显著差异，而且这些差异在多次运行中都能重现。问题在于总时间和周期数几乎没有区别：

48 914 478 241 cycles vs
49 877 044 086 cycles

12,155774555 seconds time elapsed vs
12,231406658 seconds time elapsed

问题：
L2未命中是否被其他性能限制因素掩盖了？
如果是这样，您能建议看哪些计数器以了解它吗？

— 圣安东尼奥
source

根据经验：任何非朴素实现的内存复制都受内存限制，即使只命中L1缓存也是如此。任何一次内存访问的开销都远高于CPU计算2加2所需的开销。就您的情况而言，您甚至使用了AVX指令来减少每复制一个字节所需的指令数。无论数据位于何处（L1、L2、LLC、内存），相应存储层级的吞吐量都会成为您的瓶颈。

— cmaster-恢复莫妮卡

Answers:

是的，L2流式预取器（streamer）在很多时候确实很有帮助。

memcpy没有任何需要隐藏的计算延迟，所以我猜它可以让乱序执行资源（ROB大小）去承担更多L2未命中带来的额外加载延迟——至少在这种情况下是这样：您使用的是能放进L3的中等大小工作集（1 MiB），所有访问都能在L3命中，不需要预取就能实现L3命中。

而且唯一的指令是加载/存储（和循环开销），因此OoO窗口包含了相当遥远的需求负载。

我不知道L2空间预取器和L1d预取器在这里是否有帮助。

用来检验这一假设的预测：把数组增大到会在L3中未命中，一旦乱序执行不足以掩盖一路访问到DRAM的加载延迟，您可能就会看到总时间出现差异。让硬件预取更提前地触发会有所帮助。

硬件预取的另一个大好处是，它可以跟上您的计算速度，因此您获得了L2命中率。（在具有中等长度计算但没有循环承载的依赖链的循环中。）

当ROB容量没有其他压力时，需求负载和OoO exec可以利用可用（单线程）内存带宽做很多事情。

还要注意，在Intel CPU上，每次缓存未命中都可能导致依赖它的uop发生一次后端重放（来自RS/调度器）：在预期数据到达的时刻，L1d未命中和L2未命中各重放一次。在那之后，核心在等待数据从L3到达的过程中显然会乐观地反复发射这些uop。

（请参阅https://chat.stackoverflow.com/rooms/206639/discussion-on-question-by-beeonrope-are-load-ops-deallocated-from-the-rs-when-th和Load ops是否从RS何时发送，完成或其他时间？）

被重放的是依赖的uop，而不是缓存未命中本身。在这种情况下，就是存储指令，更具体地说，是端口4的存储数据uop。由于使用32字节存储并且瓶颈在L3带宽，我们远达不到每个时钟1个端口4 uop。

— 彼得·科德斯
source

@ St.Antario：嗯？这说不通；您受内存限制，因此没有前端瓶颈，所以LSD无关紧要。（它避免了从uop缓存中重新取出这些uop，从而节省一些电量。）这些uop在能够退役之前仍然占用ROB中的空间。它们不算很重要，但也不可忽略。

— Peter Cordes

将数组变大，这样您可能会错过L3，您可能会发现有所不同，我对16MiB缓冲区和10迭代进行了多次测试，确实得到了14,186868883 secondsvs 43,731360909 seconds和46,76% of all LL-cache hitsvs 99,32% of all LL-cache hits; 1 028 664 372 LLC-loadsVS 1 587 454 298 LLC-loads 。

— 圣安东尼奥

@ St.Antario：通过寄存器重命名！这是OoO执行程序最关键的部分之一，尤其是在x86等寄存器贫乏的ISA上。请参阅为什么mulss在Haswell上仅需要3个周期，而与Agner的指令表不同？（展开具有多个累加器的FP循环）。顺便说一句，顺便说一句，您通常要先执行2次加载再进行2次存储，而不是加载/存储加载/存储。避免或减轻4k混叠停顿的机会更大，因为后面的负载（硬件必须检测到是否与先前的存储重叠）更远。

— Peter Cordes

@ St.Antario：是的，当然。Agner Fog的优化指南还解释了寄存器重命名的OoO执行程序，维基百科也是如此。顺便说一句，寄存器重命名也避免了WAW危害，只保留了真正的依赖关系（RAW）。因此，加载甚至可以无序完成，而无需等待以前的加载完成同一架构寄存器的写入。是的，唯一循环承载的dep链是通过RCX，因此该链可以继续运行。这就是为什么地址可以提早准备好的原因，而加载/存储区仍然是端口2/3吞吐量的瓶颈。

— Peter Cordes

令我惊讶的是，预取对L3中的memcpy没有帮助。在这种情况下，我想10/12 LFB足够了。不过似乎很奇怪：那里的限制因素是什么？核心-> L2时间应该小于L2-> L3时间，因此在我的心理模型中，第二条腿有更多的缓冲空间（更多的总占用量）应该会有所帮助。

— BeeOnRope

是的，L2 HW预取器非常有用！

例如，在运行tinymembench的计算机（i7-6700HQ）上找到以下结果。结果的第一列是所有预取器都处于打开状态，第二结果列是L2流送器处于关闭状态（但所有其他预取器仍处于打开状态）。

该测试使用32个MiB源缓冲区和目标缓冲区，这些缓冲区比我机器上的L3大得多，因此它将测试大多数DRAM丢失的情况。

==========================================================================
== Memory bandwidth tests                                               ==
==                                                                      ==
== Note 1: 1MB = 1000000 bytes                                          ==
== Note 2: Results for 'copy' tests show how many bytes can be          ==
==         copied per second (adding together read and writen           ==
==         bytes would have provided twice higher numbers)              ==
== Note 3: 2-pass copy means that we are using a small temporary buffer ==
==         to first fetch data into it, and only then write it to the   ==
==         destination (source -> L1 cache, L1 cache -> destination)    ==
== Note 4: If sample standard deviation exceeds 0.1%, it is shown in    ==
==         brackets                                                     ==
==========================================================================

                                                       L2 streamer ON            OFF
 C copy backwards                                     :   7962.4 MB/s    4430.5 MB/s
 C copy backwards (32 byte blocks)                    :   7993.5 MB/s    4467.0 MB/s
 C copy backwards (64 byte blocks)                    :   7989.9 MB/s    4438.0 MB/s
 C copy                                               :   8503.1 MB/s    4466.6 MB/s
 C copy prefetched (32 bytes step)                    :   8729.2 MB/s    4958.4 MB/s
 C copy prefetched (64 bytes step)                    :   8730.7 MB/s    4958.4 MB/s
 C 2-pass copy                                        :   6171.2 MB/s    3368.7 MB/s
 C 2-pass copy prefetched (32 bytes step)             :   6193.1 MB/s    4104.2 MB/s
 C 2-pass copy prefetched (64 bytes step)             :   6198.8 MB/s    4101.6 MB/s
 C fill                                               :  13372.4 MB/s   10610.5 MB/s
 C fill (shuffle within 16 byte blocks)               :  13379.4 MB/s   10547.5 MB/s
 C fill (shuffle within 32 byte blocks)               :  13365.8 MB/s   10636.9 MB/s
 C fill (shuffle within 64 byte blocks)               :  13588.7 MB/s   10588.3 MB/s
 -
 standard memcpy                                      :  11550.7 MB/s    8216.3 MB/s
 standard memset                                      :  23188.7 MB/s   22686.8 MB/s
 -
 MOVSB copy                                           :   9458.4 MB/s    6523.7 MB/s
 MOVSD copy                                           :   9474.5 MB/s    6510.7 MB/s
 STOSB fill                                           :  23329.0 MB/s   22901.5 MB/s
 SSE2 copy                                            :   9073.1 MB/s    4970.3 MB/s
 SSE2 nontemporal copy                                :  12647.1 MB/s    7492.5 MB/s
 SSE2 copy prefetched (32 bytes step)                 :   9106.0 MB/s    5069.8 MB/s
 SSE2 copy prefetched (64 bytes step)                 :   9113.5 MB/s    5063.1 MB/s
 SSE2 nontemporal copy prefetched (32 bytes step)     :  11770.8 MB/s    7453.4 MB/s
 SSE2 nontemporal copy prefetched (64 bytes step)     :  11937.1 MB/s    7712.1 MB/s
 SSE2 2-pass copy                                     :   7092.8 MB/s    4355.2 MB/s
 SSE2 2-pass copy prefetched (32 bytes step)          :   7001.4 MB/s    4585.1 MB/s
 SSE2 2-pass copy prefetched (64 bytes step)          :   7055.1 MB/s    4557.9 MB/s
 SSE2 2-pass nontemporal copy                         :   5043.2 MB/s    3263.3 MB/s
 SSE2 fill                                            :  14087.3 MB/s   10947.1 MB/s
 SSE2 nontemporal fill                                :  33134.5 MB/s   32774.3 MB/s

在这些测试中，启用L2流式预取器（streamer）从来不会更慢，并且通常快将近两倍。

通常，您可能会在结果中注意到以下模式：

通常，副本似乎比填充更受影响。
该standard memset和STOSB fill（这些归结到这个平台上同样的事情）是受影响最小，与预取的结果只比没有快几个百分点。
Standard memcpy可能是这里唯一使用32字节AVX指令的副本，它是受影响最小的副本之一-但预取仍比不使用时快40％。

我还尝试打开和关闭其他三个预取器，但是对于此基准，它们通常几乎没有可测量的作用。

— BeeOnRope
source

（有趣的事实：vmovdqa尽管是“整数”，但AVX1还是。）您是否认为OP的循环所提供的带宽低于glibc memcpy？这就是为什么12个LFB足以应付去往L3的需求负载，而又不利用L2流媒体可以继续占用的L2 <-> L3超队列中的额外MLP？大概就是您的测试有所不同。L3应该以与核心相同的速度运行；你们都有四核Skylake客户端等效的微体系结构，所以大概与L3延迟相似吗？

— Peter Cordes

@PeterCordes-抱歉，我本应该说得更清楚：该测试是在32 MiB的缓冲区之间进行的，因此测试的是命中DRAM而不是命中L3的情况。我以为tmb会输出缓冲区大小，但它并没有——糟糕！这是有意为之：我并不是要确切解释OP的512 KiB场景，而只是回答标题中的问题：在一个能体现其作用的场景下，L2流式预取器是否有用。我想如果使用较小的缓冲区，我大致可以重现这些结果（我已经在评论中提到过用uarch-bench得到的类似结果）。

— BeeOnRope

我将缓冲区大小添加到了答案中。

— BeeOnRope

@ St.Antario：不，这不是问题。不知道为什么你认为这可能是一个问题；混合使用AVX1和AVX2指令不会产生任何惩罚。我的评论要点是，此循环仅需要AVX1，但是此答案提到使用AVX2指令。英特尔在引入AVX2的同时，恰巧将L1d的加载/存储数据路径扩展为32个字节，因此，如果您要进行运行时调度，则可以将AVX2的可用性用作选择memcpy实现的一部分...

— Peter Cordes

您是如何关闭预取器的？是software.intel.com/zh-cn/articles/…吗？论坛software.intel.com/en-us/forums/intel-isa-extensions/topic/…表示有些含义不同。

— osgx