C ++(la Knuth)
我很好奇Knuth的程序会如何发展,所以我将他(最初是Pascal)的程序翻译成C ++。
尽管Knuth的主要目标不是提高速度,而是说明他的WEB素养编程系统,但是该程序具有令人惊讶的竞争力,并且比迄今为止的任何答案都能提供更快的解决方案。这是我对他程序的翻译(WEB程序的相应“部分”编号在“ {§24}
”之类的注释中提到):
#include <iostream>
#include <cassert>
// Adjust these parameters based on input size.
const int TRIE_SIZE = 800 * 1000; // Size of the hash table used for the trie.
const int ALPHA = 494441; // An integer that's approximately (0.61803 * TRIE_SIZE), and relatively prime to T = TRIE_SIZE - 52.
const int kTolerance = TRIE_SIZE / 100; // How many places to try, to find a new place for a "family" (=bunch of children).
typedef int32_t Pointer; // [0..TRIE_SIZE), an index into the array of Nodes
typedef int8_t Char; // We only care about 1..26 (plus two values), but there's no "int5_t".
typedef int32_t Count; // The number of times a word has been encountered.
// These are 4 separate arrays in Knuth's implementation.
struct Node {
Pointer link; // From a parent node to its children's "header", or from a header back to parent.
Pointer sibling; // Previous sibling, cyclically. (From smallest child to header, and header to largest child.)
Count count; // The number of times this word has been encountered.
Char ch; // EMPTY, or 1..26, or HEADER. (For nodes with ch=EMPTY, the link/sibling/count fields mean nothing.)
} node[TRIE_SIZE + 1];
// Special values for `ch`: EMPTY (free, can insert child there) and HEADER (start of family).
const Char EMPTY = 0, HEADER = 27;
const Pointer T = TRIE_SIZE - 52;
Pointer x; // The `n`th time we need a node, we'll start trying at x_n = (alpha * n) mod T. This holds current `x_n`.
// A header can only be in T (=TRIE_SIZE-52) positions namely [27..TRIE_SIZE-26].
// This transforms a "h" from range [0..T) to the above range namely [27..T+27).
Pointer rerange(Pointer n) {
n = (n % T) + 27;
// assert(27 <= n && n <= TRIE_SIZE - 26);
return n;
}
// Convert trie node to string, by walking up the trie.
std::string word_for(Pointer p) {
std::string word;
while (p != 0) {
Char c = node[p].ch; // assert(1 <= c && c <= 26);
word = static_cast<char>('a' - 1 + c) + word;
// assert(node[p - c].ch == HEADER);
p = (p - c) ? node[p - c].link : 0;
}
return word;
}
// Increment `x`, and declare `h` (the first position to try) and `last_h` (the last position to try). {§24}
#define PREPARE_X_H_LAST_H x = (x + ALPHA) % T; Pointer h = rerange(x); Pointer last_h = rerange(x + kTolerance);
// Increment `h`, being careful to account for `last_h` and wraparound. {§25}
#define INCR_H { if (h == last_h) { std::cerr << "Hit tolerance limit unfortunately" << std::endl; exit(1); } h = (h == TRIE_SIZE - 26) ? 27 : h + 1; }
// `p` has no children. Create `p`s family of children, with only child `c`. {§27}
Pointer create_child(Pointer p, int8_t c) {
// Find `h` such that there's room for both header and child c.
PREPARE_X_H_LAST_H;
while (!(node[h].ch == EMPTY and node[h + c].ch == EMPTY)) INCR_H;
// Now create the family, with header at h and child at h + c.
node[h] = {.link = p, .sibling = h + c, .count = 0, .ch = HEADER};
node[h + c] = {.link = 0, .sibling = h, .count = 0, .ch = c};
node[p].link = h;
return h + c;
}
// Move `p`'s family of children to a place where child `c` will also fit. {§29}
void move_family_for(const Pointer p, Char c) {
// Part 1: Find such a place: need room for `c` and also all existing children. {§31}
PREPARE_X_H_LAST_H;
while (true) {
INCR_H;
if (node[h + c].ch != EMPTY) continue;
Pointer r = node[p].link;
int delta = h - r; // We'd like to move each child by `delta`
while (node[r + delta].ch == EMPTY and node[r].sibling != node[p].link) {
r = node[r].sibling;
}
if (node[r + delta].ch == EMPTY) break; // There's now space for everyone.
}
// Part 2: Now actually move the whole family to start at the new `h`.
Pointer r = node[p].link;
int delta = h - r;
do {
Pointer sibling = node[r].sibling;
// Move node from current position (r) to new position (r + delta), and free up old position (r).
node[r + delta] = {.ch = node[r].ch, .count = node[r].count, .link = node[r].link, .sibling = node[r].sibling + delta};
if (node[r].link != 0) node[node[r].link].link = r + delta;
node[r].ch = EMPTY;
r = sibling;
} while (node[r].ch != EMPTY);
}
// Advance `p` to its `c`th child. If necessary, add the child, or even move `p`'s family. {§21}
Pointer find_child(Pointer p, Char c) {
// assert(1 <= c && c <= 26);
if (p == 0) return c; // Special case for first char.
if (node[p].link == 0) return create_child(p, c); // If `p` currently has *no* children.
Pointer q = node[p].link + c;
if (node[q].ch == c) return q; // Easiest case: `p` already has a `c`th child.
// Make sure we have room to insert a `c`th child for `p`, by moving its family if necessary.
if (node[q].ch != EMPTY) {
move_family_for(p, c);
q = node[p].link + c;
}
// Insert child `c` into `p`'s family of children (at `q`), with correct siblings. {§28}
Pointer h = node[p].link;
while (node[h].sibling > q) h = node[h].sibling;
node[q] = {.ch = c, .count = 0, .link = 0, .sibling = node[h].sibling};
node[h].sibling = q;
return q;
}
// Largest descendant. {§18}
Pointer last_suffix(Pointer p) {
while (node[p].link != 0) p = node[node[p].link].sibling;
return p;
}
// The largest count beyond which we'll put all words in the same (last) bucket.
// We do an insertion sort (potentially slow) in last bucket, so increase this if the program takes a long time to walk trie.
const int MAX_BUCKET = 10000;
Pointer sorted[MAX_BUCKET + 1]; // The head of each list.
// Records the count `n` of `p`, by inserting `p` in the list that starts at `sorted[n]`.
// Overwrites the value of node[p].sibling (uses the field to mean its successor in the `sorted` list).
void record_count(Pointer p) {
// assert(node[p].ch != HEADER);
// assert(node[p].ch != EMPTY);
Count f = node[p].count;
if (f == 0) return;
if (f < MAX_BUCKET) {
// Insert at head of list.
node[p].sibling = sorted[f];
sorted[f] = p;
} else {
Pointer r = sorted[MAX_BUCKET];
if (node[p].count >= node[r].count) {
// Insert at head of list
node[p].sibling = r;
sorted[MAX_BUCKET] = p;
} else {
// Find right place by count. This step can be SLOW if there are too many words with count >= MAX_BUCKET
while (node[p].count < node[node[r].sibling].count) r = node[r].sibling;
node[p].sibling = node[r].sibling;
node[r].sibling = p;
}
}
}
// Walk the trie, going over all words in reverse-alphabetical order. {§37}
// Calls "record_count" for each word found.
void walk_trie() {
// assert(node[0].ch == HEADER);
Pointer p = node[0].sibling;
while (p != 0) {
Pointer q = node[p].sibling; // Saving this, as `record_count(p)` will overwrite it.
record_count(p);
// Move down to last descendant of `q` if any, else up to parent of `q`.
p = (node[q].ch == HEADER) ? node[q].link : last_suffix(q);
}
}
int main(int, char** argv) {
// Program startup
std::ios::sync_with_stdio(false);
// Set initial values {§19}
for (Char i = 1; i <= 26; ++i) node[i] = {.ch = i, .count = 0, .link = 0, .sibling = i - 1};
node[0] = {.ch = HEADER, .count = 0, .link = 0, .sibling = 26};
// read in file contents
FILE *fptr = fopen(argv[1], "rb");
fseek(fptr, 0L, SEEK_END);
long dataLength = ftell(fptr);
rewind(fptr);
char* data = (char*)malloc(dataLength);
fread(data, 1, dataLength, fptr);
if (fptr) fclose(fptr);
// Loop over file contents: the bulk of the time is spent here.
Pointer p = 0;
for (int i = 0; i < dataLength; ++i) {
Char c = (data[i] | 32) - 'a' + 1; // 1 to 26, for 'a' to 'z' or 'A' to 'Z'
if (1 <= c && c <= 26) {
p = find_child(p, c);
} else {
++node[p].count;
p = 0;
}
}
node[0].count = 0;
walk_trie();
const int max_words_to_print = atoi(argv[2]);
int num_printed = 0;
for (Count f = MAX_BUCKET; f >= 0 && num_printed <= max_words_to_print; --f) {
for (Pointer p = sorted[f]; p != 0 && num_printed < max_words_to_print; p = node[p].sibling) {
std::cout << word_for(p) << " " << node[p].count << std::endl;
++num_printed;
}
}
return 0;
}
与Knuth计划的差异:
- 我结合Knuth的4个阵列
link
,sibling
,count
和ch
成的阵列struct Node
(发现很容易理解这种方式)。
- 我将部分的读写编程(WEB风格)文本包含转换为更常规的函数调用(和几个宏)。
- 我们不需要使用标准的Pascal怪异的I / O约定/限制,因此可以使用
fread
和data[i] | 32 - 'a'
其他答案一样,而不是Pascal解决方法。
- 如果程序运行时超出限制(空间不足),则Knuth的原始程序会通过丢弃后面的单词并在末尾打印一条消息来优雅地处理它。(说麦克罗伊“批评克努斯的解决方案甚至不能处理圣经的全部内容,这是不正确的;他只是指出有时在文本中可能会出现很晚的单词,例如”耶稣”一词。 “在圣经中,所以错误情况并非无害。”我采用了更嘈杂的方法(并且反而更容易)来简单地终止程序。
- 该程序声明一个常量TRIE_SIZE来控制内存使用率,我对此大加赞赏。(已为原始要求选择了32767的常数-“用户应该能够在20页的技术论文(大约50K字节的文件)中找到100个最常用的单词”,并且因为Pascal能够很好地处理范围整数进行分类和优化包装。由于测试输入现在增加了2000万倍,我们不得不将其增加25倍,达到80万。)
- 对于字符串的最终打印,我们只需走一下Trie并执行一个哑(可能甚至是二次)字符串附加即可。
除此之外,这几乎完全是Knuth的程序(使用他的哈希Trie /打包Trie数据结构和存储桶排序),并且在循环输入中的所有字符时执行几乎相同的操作(与Knuth的Pascal程序相同)。请注意,它不使用外部算法或数据结构库,并且将按字母顺序打印等频率的单词。
定时
编译与
clang++ -std=c++17 -O2 ptrie-walktrie.cc
当在这里最大的测试用例(giganovel
要求100,000个单词)上运行时,与到目前为止发布的最快的程序相比,我发现它略有提高,但始终更快:
target/release/frequent: 4.809 ± 0.263 [ 4.45.. 5.62] [... 4.63 ... 4.75 ... 4.88...]
ptrie-walktrie: 4.547 ± 0.164 [ 4.35.. 4.99] [... 4.42 ... 4.5 ... 4.68...]
(最上面的一行是Anders Kaseorg的Rust解决方案;最下面的是上面的程序。这些是从100次运行开始的时间,包括平均值,最小值,最大值,中位数和四分位数。)
分析
为什么这样更快?不是C ++比Rust快,也不是Knuth的程序可能是最快的-实际上,由于Trie打包(以节省内存),Knuth的程序在插入时(如他提到的)较慢。我怀疑,原因与Knuth 在2008年抱怨的事情有关:
关于64位指针的火焰
当我编译使用少于4 GB RAM的程序时,拥有64位指针绝对是愚蠢的。当此类指针值出现在结构内部时,它们不仅浪费了一半的内存,而且还浪费掉了一半的缓存。
上面的程序使用32位数组索引(而不是64位指针),因此“节点”结构占用的内存更少,因此堆栈上的节点更多,缓存未命中的次数也更少。(实际上,在x32 ABI上已经做了一些工作,但是即使这个想法显然有用,它似乎也不处于良好状态,例如,参见最近在V8中发布的指针压缩。哦,很好。),此程序将12.8 MB用于(打包的)特里,而Rust程序的32.18MB用于其特里(在giganovel
giganovel
)。我们可以扩大1000倍(从“ giganovel”到“ teranovel”),但仍不超过32位索引,因此这似乎是一个合理的选择。
更快的变体
我们可以优化速度并放弃打包,因此我们可以像在Rust解决方案中一样实际使用(非打包的)特里,使用索引而不是指针。这样可以提供更快的速度,并且对不同的单词,字符等没有预先设置的限制:
#include <iostream>
#include <cassert>
#include <vector>
#include <algorithm>
typedef int32_t Pointer; // [0..node.size()), an index into the array of Nodes
typedef int32_t Count;
typedef int8_t Char; // We'll usually just have 1 to 26.
struct Node {
Pointer link; // From a parent node to its children's "header", or from a header back to parent.
Count count; // The number of times this word has been encountered. Undefined for header nodes.
};
std::vector<Node> node; // Our "arena" for Node allocation.
std::string word_for(Pointer p) {
std::vector<char> drow; // The word backwards
while (p != 0) {
Char c = p % 27;
drow.push_back('a' - 1 + c);
p = (p - c) ? node[p - c].link : 0;
}
return std::string(drow.rbegin(), drow.rend());
}
// `p` has no children. Create `p`s family of children, with only child `c`.
Pointer create_child(Pointer p, Char c) {
Pointer h = node.size();
node.resize(node.size() + 27);
node[h] = {.link = p, .count = -1};
node[p].link = h;
return h + c;
}
// Advance `p` to its `c`th child. If necessary, add the child.
Pointer find_child(Pointer p, Char c) {
assert(1 <= c && c <= 26);
if (p == 0) return c; // Special case for first char.
if (node[p].link == 0) return create_child(p, c); // Case 1: `p` currently has *no* children.
return node[p].link + c; // Case 2 (easiest case): Already have the child c.
}
int main(int, char** argv) {
auto start_c = std::clock();
// Program startup
std::ios::sync_with_stdio(false);
// read in file contents
FILE *fptr = fopen(argv[1], "rb");
fseek(fptr, 0, SEEK_END);
long dataLength = ftell(fptr);
rewind(fptr);
char* data = (char*)malloc(dataLength);
fread(data, 1, dataLength, fptr);
fclose(fptr);
node.reserve(dataLength / 600); // Heuristic based on test data. OK to be wrong.
node.push_back({0, 0});
for (Char i = 1; i <= 26; ++i) node.push_back({0, 0});
// Loop over file contents: the bulk of the time is spent here.
Pointer p = 0;
for (long i = 0; i < dataLength; ++i) {
Char c = (data[i] | 32) - 'a' + 1; // 1 to 26, for 'a' to 'z' or 'A' to 'Z'
if (1 <= c && c <= 26) {
p = find_child(p, c);
} else {
++node[p].count;
p = 0;
}
}
++node[p].count;
node[0].count = 0;
// Brute-force: Accumulate all words and their counts, then sort by frequency and print.
std::vector<std::pair<int, std::string>> counts_words;
for (Pointer i = 1; i < static_cast<Pointer>(node.size()); ++i) {
int count = node[i].count;
if (count == 0 || i % 27 == 0) continue;
counts_words.push_back({count, word_for(i)});
}
auto cmp = [](auto x, auto y) {
if (x.first != y.first) return x.first > y.first;
return x.second < y.second;
};
std::sort(counts_words.begin(), counts_words.end(), cmp);
const int max_words_to_print = std::min<int>(counts_words.size(), atoi(argv[2]));
for (int i = 0; i < max_words_to_print; ++i) {
auto [count, word] = counts_words[i];
std::cout << word << " " << count << std::endl;
}
return 0;
}
该程序尽管比这里的解决方案要费劲很多,但giganovel
它的trie仅使用(for )12.2MB,并且设法更快。该程序的时间(最后一行),与前面提到的时间相比:
target/release/frequent: 4.809 ± 0.263 [ 4.45.. 5.62] [... 4.63 ... 4.75 ... 4.88...]
ptrie-walktrie: 4.547 ± 0.164 [ 4.35.. 4.99] [... 4.42 ... 4.5 ... 4.68...]
itrie-nolimit: 3.907 ± 0.127 [ 3.69.. 4.23] [... 3.81 ... 3.9 ... 4.0...]
我很想看看如果翻译成Rust的话(或hash-trie程序)想要什么。:-)
更多细节
关于此处使用的数据结构:在TAOCP第3卷6.3节(数字搜索,即尝试)的练习4中,以及在Knuth的学生Frank Liang关于TeX中的连字的论文中,简要地解释了“打包”尝试。 :字词连字符计算机。
鉴于以前和以后的专栏以及Knuth以前的经验(包括编译器,TAOCP和TeX ),Bentley专栏,Knuth的程序和McIlroy的评论(其中只有一小部分是关于Unix哲学)的上下文更加清晰。
整本书中有《 练习编程风格》,展示了此特定程序的不同方法,等等。
关于以上几点,我有一篇未完成的博客文章;完成后可能会编辑此答案。同时,无论如何,要在Knuth生日那天(1月10日)在这里发布答案。:-)