riteme的博客

博客

【详细揭秘】vector 存图与单向链表存图性能对比

2021-04-14 22:44:05 By riteme

本文只讨论以下两种存图方式：

单向链表存图

struct Edge {
   // balabalabala...
   int next;
};
Edge e[1000000];
int head[100000];

遍历的时候：for (int i = head[x]; i; i = e[i].next)

vector 存图

struct Edge {
   // balabalabala...
};
Edge e[1000000];
vector<int> es[100000];

遍历的时候：for (int i : es[x])

首先看一个程序 main.cpp：

// usage: ./a.out [n] [T]
//   e.g. ./a.out 100000 1000
// compile it with "-std=c++11" and "-O3"

// #define VECTOR
// #define LIST

#include <vector>
#include <random>
#include <chrono>
#include <iostream>
#include <algorithm>
#include <functional>

#define fence() asm volatile("mfence" ::: "memory")

using u64 = unsigned long long;
using f128 = long double;

struct Edge {
    u64 value;
    Edge *next;
};

int main(int argc, char *argv[]) {
    using clock = std::chrono::high_resolution_clock;

    if (argc < 3) {
        std::cerr << "Usage: " << argv[0] << " [n] [T]" << std::endl;
        return -1;
    }

    int n = atoi(argv[1]);
    int T = atoi(argv[2]);

    constexpr u64 SEED = 0xdeadbeef19260817;
    std::mt19937_64 gen(SEED);

    Edge *mem = new Edge[n];

    std::vector<int> idx;
    idx.resize(n);
    std::iota(idx.begin(), idx.end(), 0);
    std::shuffle(idx.begin(), idx.end(), gen);

    Edge *list = nullptr;
    std::vector<Edge*> vec;
    for (int j = 0; j < n; j++) {
        int i = idx[j];
        mem[i] = Edge{gen(), list};
        list = mem + i;
        vec.push_back(list);
    }
    std::reverse(vec.begin(), vec.end());

#ifdef VECTOR
    const auto name = "vector";
    auto run = [&] {
        u64 ret = 0;
        for (auto e : vec)
            ret += e->value;
        return ret;
    };
#endif

#ifdef LIST
    const auto name = "list";
    auto run = [&] {
        u64 ret = 0;
        for (auto e = list; e; e = e->next)
            ret += e->value;
        return ret;
    };
#endif

    u64 ans = 0;

    auto t_start = clock::now();
    fence();
    for (int t = 0; t < T; t++) {
        std::cout << "#" << t + 1 << std::endl;
        ans += run();
    }
    fence();
    auto t_end = clock::now();

    auto span = std::chrono::duration_cast<std::chrono::microseconds>(t_end - t_start).count();
    auto aver = f128(span) / T;
    std::cout << name << ": " << ans << " " << aver << "μs" << std::endl;

    delete[] mem;
    return 0;
}

这个程序干了这么几件事：

初始化一个 Edge 的内存池 mem；
把 mem 的 Edge 按随机顺序串起来，得到单向链表 list；
把串起来的 Edge 按链表内的顺序把它们的指针放到 vec 里面；
反复遍历 list 或者 vec，计时。

然后分开编译：

$ g++ main.cpp -DVECTOR -std=c++17 -O3 -o vector.out
$ g++ main.cpp -DLIST -std=c++17 -O3 -o list.out

运行：

$ ./vector.out 100000 1000
...
vector: 9056018651256176736 126.578μs
$ ./list.out 100000 1000
...
list: 9056018651256176736 1224.01μs

结果发现 vector 遍历比单向链表遍历快了将近 10x :)，大家可能会很惊讶，vector 怎么会这么快呢？事实就是这样，小编也很惊讶。

这是怎么做到的呢？下面小编带大家来了解一下吧！

首先，vector 缓存命中率高？用 perf 检查一下：

$ taskset -c 0 perf stat -ddd ./list.out 100000 1000
...
list: 9056018651256176736 1251.79μs

 Performance counter stats for './list.out 100000 1000':

          1,256.60 msec task-clock                #    1.000 CPUs utilized          
                 3      context-switches          #    0.002 K/sec                  
                 0      cpu-migrations            #    0.000 K/sec                  
             1,074      page-faults               #    0.855 K/sec                  
     4,928,477,544      cycles                    #    3.922 GHz                      (38.37%)
       444,538,195      instructions              #    0.09  insn per cycle           (46.25%)
       106,509,024      branches                  #   84.760 M/sec                    (46.49%)
           260,219      branch-misses             #    0.24% of all branches          (46.52%)
       210,985,531      L1-dcache-loads           #  167.902 M/sec                    (46.52%)
       106,515,848      L1-dcache-load-misses     #   50.48% of all L1-dcache accesses  (46.52%)
        84,942,108      LLC-loads                 #   67.597 M/sec                    (30.59%)
           390,735      LLC-load-misses           #    0.46% of all LL-cache accesses  (30.56%)
   <not supported>      L1-icache-loads                                             
         1,166,528      L1-icache-load-misses                                         (30.56%)
       203,014,349      dTLB-loads                #  161.559 M/sec                    (30.56%)
             4,116      dTLB-load-misses          #    0.00% of all dTLB cache accesses  (30.56%)
            19,835      iTLB-loads                #    0.016 M/sec                    (30.56%)
             1,367      iTLB-load-misses          #    6.89% of all iTLB cache accesses  (30.56%)
   <not supported>      L1-dcache-prefetches                                        
   <not supported>      L1-dcache-prefetch-misses                                   

       1.256954906 seconds time elapsed

       1.249223000 seconds user
       0.003324000 seconds sys

$ taskset -c 0 perf stat -ddd ./vector.out 100000 1000
...
vector: 9056018651256176736 132.619μs

 Performance counter stats for './vector.out 100000 1000':

            137.31 msec task-clock                #    0.997 CPUs utilized          
                 0      context-switches          #    0.000 K/sec                  
                 0      cpu-migrations            #    0.000 K/sec                  
             1,073      page-faults               #    0.008 M/sec                  
       525,222,528      cycles                    #    3.825 GHz                      (34.28%)
       527,824,716      instructions              #    1.00  insn per cycle           (43.02%)
       103,747,402      branches                  #  755.561 M/sec                    (45.20%)
           231,195      branch-misses             #    0.22% of all branches          (47.39%)
       206,014,450      L1-dcache-loads           # 1500.340 M/sec                    (49.57%)
       110,273,989      L1-dcache-load-misses     #   53.53% of all L1-dcache accesses  (51.75%)
        96,467,786      LLC-loads                 #  702.545 M/sec                    (34.96%)
            57,315      LLC-load-misses           #    0.06% of all LL-cache accesses  (32.95%)
   <not supported>      L1-icache-loads                                             
           483,211      L1-icache-load-misses                                         (30.77%)
       206,050,910      dTLB-loads                # 1500.606 M/sec                    (28.58%)
             2,534      dTLB-load-misses          #    0.00% of all dTLB cache accesses  (26.39%)
             6,877      iTLB-loads                #    0.050 M/sec                    (26.22%)
                45      iTLB-load-misses          #    0.65% of all iTLB cache accesses  (26.23%)
   <not supported>      L1-dcache-prefetches                                        
   <not supported>      L1-dcache-prefetch-misses                                   

       0.137661562 seconds time elapsed

       0.130449000 seconds user
       0.006701000 seconds sys

可以看到：

list 的未命中率 50.48% of all L1-dcache accesses, 0.46% of all LL-cache accesses；
vector 的未命中率 53.53% of all L1-dcache accesses, 0.06% of all LL-cache accesses。

两者的缓存命中率不相上下，list 的 L1 命中率反而还高一点。

那是什么原因呢？注意一下 L1-dcache-loads 右边的数字，list 是 167.902 M/sec，而 vector 高达 1500.340 M/sec。这个数字表示的是 L1 的带宽。换句话说，虽然二者命中率差不多，但是因为 vector 每秒钟的吞吐量比 list 大，所以比 list 快很多。

看一下两种循环方式的汇编：https://gcc.godbolt.org/z/3oPznofe9

list 是：

        add     rax, QWORD PTR [rdi]    # load1: add rax, [rdi]
        mov     rdi, QWORD PTR [rdi+8]  # load2: mov rdi, [rdi]
        test    rdi, rdi
        jne     .L9

vector 是：

        mov     rdx, QWORD PTR [rax]  # load1: mov rdx, [rax]
        add     rax, 8                # add rax
        add     r8, QWORD PTR [rdx]   # load2: add r8, [rdx]
        cmp     rax, rcx
        jne     .L3

没啥毛病，都是两次 load。但是对于 list 而言，这些 load 之间的依赖关系是这样的：

load2 -+-> load2 -+-> load2 -+-> ... -+-> load2 -> load1
       |          |          |        |
       +-> load1  +-> load1  +-> ...  +-> load1

因为下一条边只能从上一条边的 next 拿到。如果上一个 Edge 没 load 完（load2），CPU 也很难知道下一个 Edge 在哪里。而 vector 是：

add rax -+-> add rax -+-> add rax -+-> ... -+-> add rax -> load1 -> load2
         |            |            |        |
         |            |            |   ...  +-> load1 -> load2
         |            |            +-> load1 -> load2
         |            +-> load1 -> load2
         +-> load1 -> load2

只要遍历 vector 的下标一直在加（add rax），CPU 就能不断地发新的 load 出去。现代 CPU 大多能指令乱序多发，并且 Intel 的 CPU 有 L1d Fill Buffer，允许同时有很多 cache miss，从而充分利用内存带宽。用 perf 可以验证这个事情：

$ taskset -c 0 perf stat -M ILP,MLP ./vector.out 100000 1000
...
vector: 9056018651256176736 135.844μs

 Performance counter stats for './vector.out 100000 1000':

       297,166,586      uops_executed.core_cycles_ge_1 #     3.62 ILP                    
       537,316,216      uops_executed.thread                                        
       510,676,250      l1d_pend_miss.pending_cycles #     8.40 MLP                    
     4,291,380,208      l1d_pend_miss.pending                                       
...
$ taskset -c 0 perf stat -M ILP,MLP ./list.out 100000 1000
...
list: 9056018651256176736 1261.45μs

 Performance counter stats for './list.out 100000 1000':

       234,580,521      uops_executed.core_cycles_ge_1 #     3.76 ILP                    
       441,502,995      uops_executed.thread                                        
     3,284,071,492      l1d_pend_miss.pending_cycles #     1.00 MLP                    
     3,292,907,879      l1d_pend_miss.pending                                       
...

ILP 是指令并行度，MLP 是访存并行度。可以看到 vector 的 8.40 MLP 比 list 的 1.00 MLP 不知道高到哪里去了。

也可以拿 Intel VTune Profiler 看，我记得可以看到 vector 的 FB Full 是 100%，list 是 0%。说明 vector 把 Fill Buffer 挤炸了。图有空再贴。

有什么用？

主要是针对网络流算法：

struct Edge {
    int u, v, c, w, rev;
};
Edge e[1000000];
vector<int> es[100000];

因为网络流算法需要反复多次遍历同一张图。如果图非常大，或者非常稠密，或者需要跑很多次增广，这个时候可能图的遍历会占用很多时间。那么此时用 vector 存图比用单向链表好。

举个例子：【ULR #1】光伏元件

修改前：https://uoj.ac/submission/468668
修改后：https://uoj.ac/submission/468667

某个点 1490ms -> 961ms。当然优化不可能有前面那份代码那么夸张，取决于每个点的邻接表有多长。

哇那是不是都用 vector 比较吼？

也不是。众所周知 vector 的 push_back 比较慢。我试过几个原本就跑得比较快的题，换成 vector 反而可能有负优化 :)

其它

https://gcc.godbolt.org/z/3oPznofe9：第一行加个 #pragma GCC optimize("unroll-loops")，GCC 会帮 vector 做循环展开。
众所周知 #pragma GCC optimize("O3") 只对当前编译单元有效。如果评测机不开 O2，那还是不要用 vector 为好。

这就是关于 vector 存图的事情了，大家有什么想法呢，欢迎在评论区告诉小编一起讨论哦！

交互题中如何防止选手程序在stdin/stdout上搞事情

2017-01-11 09:28:49 By riteme

最近有人到我们学校的OJ来给我们传授人生的经验，告诉我们如何使用C那一套强大的函数和GCC的一些黑科技来实现从stdin直接读入数据。就目前为止，我所了解的Hack手段和防治措施是这样的：

定义类或结构体作为Reader，在Reader的构造函数中写代码，从而在main函数前动手。

交互库也使用一个类或结构体的Reader来读入即可。GCC默认的构造顺序是编译单元的编译顺序，因此需要在编译命令里面，交互库的文件要在选手文件之前。C++11中static全局变量保证在main函数之前构造，在C++11之前并没有定义，但是一般的编译器基本都保证了这一点。

利用GCC扩展init_priority来修改构造顺序，使得选手的Reader比交互库的Reader先构造。

init_priority在GCC和Clang上都有用......

init_priority的声明方法是这样的：

TYPE NAME __attribute__ ((init_priority(n)));

其中$n$必须是正整数，且在$[101, \;65535]$范围内。权值越小优先级越高，也就意味着会在优先级低的变量之前构造。

因此，我们需要将交互库的Reader的优先级设为$101$。

需要注意一个事情，假如你在使用cin来读入，那么在读入之前，cin/cout是还没有被构造的，所以需要在Reader前面使用下面的语句来提前初始化cin/cout：

static std::ios_base::Init iosinit __attribute__ ((init_priority(101)));

直接使用C的I/O的不会出事。另外一点就是non-POD类型也需要提前构造（如std::string、std::map和你的其他类和结构体等），在Reader之前定义并且设置优先级。

使用rewind/fseek将stdin重新移至开头并读取。

对于，我们所要做的，就是在交互库读取完数据之后，将stdin关掉。如果想更好玩一些，可以打开个/dev/urandom给它读：

freopen("/dev/urandom", "r", stdin);

暴力试出token长度从而利用fseek来修改交互库输出。

~~类似的，交互库其实并不需要token，~~交互库的token似乎并没有任何卵用，因为利用fseek能够绕过，setbuf也可以偷到......

为此我们可以在Reader中将stdout也给换掉（换成NULL或者/dev/null给它玩），然后交互库输出时又换回来即可。也可以使用Unix中的dup和dup2来保存stdout。交互库输出完毕后需要将stdout干掉。

使用dup复制输入输出句柄，用fdopen重新打开stdin/stdout。

充分利用了Unix那套文件操作的强大。对于此，我们需要自己使用dup将stdin/stdout复制一份，然后使用close关闭原来的输入输出。大致的代码如下：

#include <unistd.h>  // dup, close, fdopen, STDIN_FILENO, STDOUT_FILENO

#define MAGIC 103  // 随便选个不是很大的数字

class IO {
 public:
    IO() {
        // 读取数据...

        fclose(stdin);
        close(STDIN_FILENO);

        for (int i = 0; i < MAGIC; i++) {
            dup(STDERR_FILENO);  // 防止stdout_copy变为3
        }

        stdouot_copy = dup(STDOUT_FILENO);
        fclose(stdout);
        close(STDOUT_FILENO);
    }

    ~IO() {
        stdout = fdopen(stdout_copy, "w");

        // 输出结果...

        fclose(stdout);
        close(stdout_copy);
    }

 private:
     int stdout_copy;
};

static IO io __attribute__ ((init_priority(101)));

总结一下就是，在没有加密的输入输出中，使用全局变量的构造函数来抢先获得stdin/stdout的控制权，避免选手程序搞事。

2017.6.8: 写了一个输入输出管理类：

#include <ctime>
#include <cstdio>
#include <cstdlib>

#include <unistd.h>

class IOLocker {
 public:
    const int MAX_MOGIC_DUP = 100;

    IOLocker() : in(STDIN_FILENO), out(STDOUT_FILENO) {
        srand(time(0));
        lock();
    }

    void lock() {
        int cnt = rand() % MAX_MOGIC_DUP;

        for (int i = 0; i < cnt; i++) {
            dup(STDERR_FILENO);
        }  // for

        int nin = dup(in);
        fclose(stdin);
        close(in);
        in = nin;

        int nout = dup(out);
        fclose(stdout);
        close(out);
        out = nout;

        stdin = stdout = NULL;
    }

    void unlock() {
        stdin = fdopen(in, "r");
        stdout = fdopen(out, "w");
    }

 private:
    int in, out;
};  // struct IOLocker

食用方法如下：

static IOLocker io __attribute__((init_priority(101)));  // 声明时

// ...
// 需要使用输入输出的时候
io.unlock();

// ...

io.lock();

这样就没有使用token的必要了。

此文可能会不定期更新。

话说是不是数据加密就可以一劳永逸了？但数据加密后如何清真地支持Hack？求dalao帮助。

Hacker Problem Setter
阅读全文

介绍一个速度相当快的线性时间构造后缀数组的算法

2016-06-19 21:34:01 By riteme

恩......我的真实博客并不在UOJ上，因此无耻的丢个链接吧
很多部分都是翻译原论文而来，因此全文比较长......
https://riteme.github.io/blog/2016-6-19/sais.html

阅读全文