# 电脑：数学

13

您的得分是：在我的机器上，您的代码能在10分钟内给出正确输出的最大 `n`。

``````
1: 4/6
2: 18/36
3: 88/216
4: 454/1296
6: 13236/46656
7: 73392/279936
8: 411462/1679616
9: 2325976/10077696
10: 13233628/60466176
11: 75682512/362797056
12: 434662684/2176782336
13: 2505229744/13060694016
``````

• n = 15。Python+并行python + pypy在1分49秒内由Jakube撰写
• n = 17。C++，由 Keith Randall 在3分37秒内完成
• n = 16。C ++在2分38秒内由kuroi neko撰写

1
@Knerd我怎么说呢。我将尝试弄清楚如何在linux中运行您的代码，但对您的帮助非常感谢。

Knerd 2014年

Knerd 2014年

Michael M.

1
@Knerd我改为在问题中添加了一个概率表。希望对您有所帮助。

5

## C++，8个线程，9分钟内 n = 18

（让我知道它是否在您的计算机上运行不到10分钟。）

``````
> time ./a.out 18
1: 16547996212044 / 101559956668416
2:  3120508430672 / 101559956668416
3:   620923097438 / 101559956668416
4:   129930911672 / 101559956668416
5:    28197139994 / 101559956668416
6:     6609438092 / 101559956668416
7:     1873841888 / 101559956668416
8:      813806426 / 101559956668416
9:      569051084 / 101559956668416
10:      510821156 / 101559956668416
11:      496652384 / 101559956668416
12:      493092812 / 101559956668416
13:      492186008 / 101559956668416
14:      491947940 / 101559956668416
15:      491889008 / 101559956668416
16:      449710584 / 101559956668416
17:      418254922 / 101559956668416
18:      409373626 / 101559956668416

real    8m55.854s
user    67m58.336s
sys 0m5.607s
``````

``````#include <stdio.h>
#include <stdlib.h>
#include <vector>
#include <mutex>
#include <chrono>

using namespace std;

typedef long long word;

// Length of the A and B vectors (set from the command line in main()).
word n;

// Tallies, for one B vector, how many A vectors produce a run of zero inner
// products.
//
//   bpos, bneg  bit masks of the +1 and -1 entries of B. work() passes each
//               n-bit mask duplicated at bit offset n, so the masks can use
//               up to 2n bits.
//   w           amount added to a counter for every hit.
//   cnt         array of n counters. cnt[n-1] counts A vectors whose unshifted
//               product with B is zero, cnt[n-2] those whose first two
//               products are zero, and so on down to cnt[0].
//
// Every n-bit A is tried; A is shifted left by one bit per step and the run
// stops at the first shift where the number of +1 hits differs from the
// number of -1 hits.
void inner(word bpos, word bneg, word w, word *cnt) {
    const word maxi = n - 1;
    // (word)1 keeps the shift in 64-bit arithmetic instead of int.
    for (word a = ((word)1 << n) - 1; a >= 0; a--) {
        word m = a;
        for (word i = maxi; i >= 0; i--, m <<= 1) {
            // The shifted A reaches bit 2n-2, which exceeds 32 bits for n >= 17.
            // __builtin_popcount takes an unsigned int and would silently drop
            // those high bits, so the 64-bit variant is required here.
            if (__builtin_popcountll(m & bpos) != __builtin_popcountll(m & bneg))
                break;
            cnt[i] += w;
        }
    }
}

// Integer power: multiplies 1 by n, e times. Returns 1 when e <= 0.
// The parameter n shadows the global vector length inside this function.
word pow(word n, word e) {
    word product = 1;
    word remaining = e;
    while (remaining > 0) {
        product *= n;
        --remaining;
    }
    return product;
}

// One entry of the work queue consumed by work().
typedef struct {
// B vector encoded as a base-3 number; work() decodes it digit by digit
// (digit 1 -> +1 mask, digit 2 -> -1 mask).
word b;
// Increment handed to inner() for every hit found with this B.
// NOTE(review): presumably the number of symmetric variants counted in main() - confirm.
word weight;
} Bentry;

// Mutex locked around the reads/updates of btail and done in work() and main().
mutex block;
// Work queue; main() allocates maxb entries for it.
Bentry *bqueue;
// Index of the next queue entry to hand out; work() post-increments it.
word btail;
// work() returns once btail equals this value. It starts at -1, which no
// queue index equals.
word done = -1;

// Set to 3^n in main(); brot() uses it as the modulus of the base-3 encoding.
word maxb;

// Negates a base-3 encoded vector: at each of the n digit positions a digit 1
// becomes 2 and a digit 2 becomes 1, while 0 stays 0. (work() reads digit 1 as
// a +1 entry and digit 2 as a -1 entry.)
word bneg(word b) {
    word place = 1;  // 3^pos, the weight of the digit being examined
    word pos = 0;
    while (pos < n) {
        switch (b / place % 3) {
        case 1:
            b += place;
            break;
        case 2:
            b -= place;
            break;
        default:
            break;
        }
        place *= 3;
        ++pos;
    }
    return b;
}

// Rotates the base-3 digits of b by one position: every digit moves one place
// up and the digit pushed past maxb wraps around into the lowest place.
word brot(word b) {
    const word shifted = b * 3;
    const word wrapped = shifted + shifted / maxb;  // re-insert the overflowed top digit
    return wrapped % maxb;
}

// Reverses the order of the n base-3 digits of b.
word brev(word b) {
    word mirrored = 0;
    word remaining = n;
    while (remaining-- > 0) {
        // peel the lowest digit off b and append it to the result
        mirrored = mirrored * 3 + b % 3;
        b /= 3;
    }
    return mirrored;
}

// Worker routine: takes entries from bqueue, decodes each B vector into bit
// masks and calls inner() to update the counters.
//   cnt  counter array forwarded unchanged to inner().
// NOTE(review): this listing appears to be missing lines. The braces do not
// balance (the final '}' has no visible opening brace) and the code after the
// while loop cannot be reached as written. Confirm against the original source
// before building.
void work(word *cnt) {
while(true) {
// get a queue entry to work on
block.lock();
if(btail == done) {
// btail has reached the end marker: release the lock and stop this worker
block.unlock();
return;
}
// NOTE(review): as shown, this unlock/continue is unconditional, so the loop
// can only leave through the return above. Presumably a guard (for example
// "no entry available yet") that enclosed these two lines was lost - confirm.
block.unlock();
continue;
}
// claim the next entry by post-incrementing the shared cursor
word i = btail++;
block.unlock();

// thread now owns bqueue[i], work on it
word b = bqueue[i].b;
// not used in the rest of the visible function
word w = 1;
// bit j set <=> base-3 digit j of b is 1
word bpos = 0;
// bit j set <=> base-3 digit j of b is 2 (this local hides the function bneg())
word bneg = 0;
for(word j = 0; j < n; j++, b /= 3) {
// j-th base-3 digit of the entry
word d = b % 3;
if(d == 1)
bpos |= 1 << j;
if(d == 2)
bneg |= 1 << j;
}
// repeat each n-bit mask at bit offset n; inner() shifts A left up to n-1 bits
// across these doubled masks
bpos |= bpos << n;
bneg |= bneg << n;
inner(bpos, bneg, bqueue[i].weight, cnt);
}
}

// Entry point: reads n from argv[1], prepares the work queue and per-worker
// counters, groups the B vectors by symmetry, then sums and prints the counters.
// NOTE(review): this listing appears to be missing lines. `procs` is never
// declared, nothing visible starts or joins worker threads, and the computed
// weights are never stored in bqueue. Confirm against the original source.
int main(int argc, char *argv[]) {
// vector length; argv[1] is read without checking argc
n = atoi(argv[1]);

// allocate work queue
maxb = pow(3, n);
bqueue = (Bentry*)(malloc(maxb*sizeof(Bentry)));

// one zero-initialised counter array per worker
vector<word*> counts;
for(word p = 0; p < procs; p++) {
// n counters plus 64 extra bytes
// NOTE(review): the extra bytes are presumably padding between per-thread arrays - confirm.
word *cnt = (word*)calloc(64+n*sizeof(word), 1);
counts.push_back(cnt);
}

// figure out which Bs we actually want to test, and with which weights
// bmark[b] is set once b has been counted as part of some group
bool *bmark = (bool*)calloc(maxb, 1);
for(word i = 0; i < maxb; i++) {
if(bmark[i]) continue;
word b = i;
// number of not-yet-marked vectors reached from i by the transforms below
word w = 0;
// visit i under n rotations, combined with negation (k) and reversal (j)
for(word j = 0; j < 2; j++) {
for(word k = 0; k < 2; k++) {
for(word l = 0; l < n; l++) {
if(!bmark[b]) {
bmark[b] = true;
w++;
}
b = brot(b);
}
b = bneg(b);
}
b = brev(b);
}
// NOTE(review): the critical section is empty as shown, and w is not stored
// anywhere. Presumably the statements that enqueue (i, w) were lost - confirm.
block.lock();
block.unlock();
}
// NOTE(review): empty critical section as shown; `done` is never updated in
// the visible code - confirm.
block.lock();
block.unlock();

// add the per-worker counters together
word *cnt = (word*)calloc(n,sizeof(word));
for(word p = 0; p < procs; p++) {
for(int i = 0; i < n; i++) cnt[i] += counts[p][i];
}
// inner() fills cnt[n-1] for the unshifted product, so rows are printed in
// reverse index order; the denominator maxb<<n equals 3^n * 2^n
for(word i = 0; i < n; i++)
printf("%2lld: %14lld / %14lld\n", i+1, cnt[n-1-i], maxb<<n);
return 0;
}
``````

@Lembik：没问题。
Keith Randall 2014年

6

# 使用pypy和pp的Python 2：3分钟内n = 15

## 码：

``````count = [0] * n
# NOTE(review): this listing has lost its indentation and apparently some
# lines, so it is not runnable as shown. The notes below only describe what
# the visible statements do.
# The first entry comes from a helper named after OEIS sequence A081671
# (the helper itself is not part of this listing).
count[0] = oeis_A081671(n)

#generating all important vector A
visited = set(); todo = dict()
for A in product((0, 1), repeat=n):
if A not in visited:
# generate all vectors, which have the same probability
# mirrored and cycled vectors
same_probability_set = set()
for i in range(n):
# rotation of A by i positions
tmp = [A[(i+j) % n] for j in range(n)]
# NOTE(review): nothing visible adds to same_probability_set (tmp is built but
# never used), so as shown the set is still empty here. Presumably the lines
# inserting the rotations and mirrors were dropped - confirm.
visited.update(same_probability_set)
todo[A] = len(same_probability_set)

# for each vector A, create all possible vectors B
stack = []
# NOTE(review): dict_A is not defined in the visible code; the dict filled
# above is named todo - confirm which name is intended.
for A, cycled_count in dict_A.iteritems():
# ones[i] is the number of ones in A[i:]
ones = [sum(A[i:]) for i in range(n)] + [0]
# + [0], so that later ones[n] doesn't throw a exception
# stack entries are (B, index, sum1, sum2, used_negative), matching the unpack below
stack.append(([0] * n, 0, 0, 0, False))

while stack:
B, index, sum1, sum2, used_negative = stack.pop()
if index < n:
# fill vector B[index] in all possible ways,
# so that it's still possible to reach 0.
# the flag only becomes True after a 1 has been placed (see v == 1 below);
# until then -1 is not offered
if used_negative:
for v in (-1, 0, 1):
# sum1 pairs B[index] with A[index]; sum2 pairs it with the cyclically previous entry of A
sum1_new = sum1 + v * A[index]
sum2_new = sum2 + v * A[index - 1 if index else n - 1]
if abs(sum1_new) <= ones[index+1]:
if abs(sum2_new) <= ones[index] - A[n-1]:
C = B[:]
C[index] = v
stack.append((C, index + 1, sum1_new, sum2_new, True))
else:
for v in (0, 1):
sum1_new = sum1 + v * A[index]
sum2_new = sum2 + v * A[index - 1 if index else n - 1]
if abs(sum1_new) <= ones[index+1]:
if abs(sum2_new) <= ones[index] - A[n-1]:
C = B[:]
C[index] = v
stack.append((C, index + 1, sum1_new, sum2_new, v == 1))
else:
# B is complete, calculate the sums
count[1] += cycled_count  # we know that the sum = 0 for i = 1
for i in range(2, n):
# product of A[0:n-i] with B[i:n] plus A[n-i:n] with B[0:i]
sum_prod = 0
for j in range(n-i):
sum_prod += A[j] * B[i+j]
for j in range(i):
sum_prod += A[n-i+j] * B[j]
if sum_prod:
break
else:
# a B containing a non-zero entry is counted twice
# NOTE(review): presumably the second count stands for the vector -B - confirm.
if used_negative:
count[i] += 2*cycled_count
else:
count[i] += cycled_count
``````

## 用法：

`pypy you-do-the-math.py 15`

## 输出：

``````
Calculation for n = 15 took 2:50 minutes

1  83940771168 / 470184984576  17.85%
2  17379109692 / 470184984576   3.70%
3   3805906050 / 470184984576   0.81%
4    887959110 / 470184984576   0.19%
5    223260870 / 470184984576   0.05%
6     67664580 / 470184984576   0.01%
7     30019950 / 470184984576   0.01%
8     20720730 / 470184984576   0.00%
9     18352740 / 470184984576   0.00%
10     17730480 / 470184984576   0.00%
11     17566920 / 470184984576   0.00%
12     17521470 / 470184984576   0.00%
13     17510280 / 470184984576   0.00%
14     17507100 / 470184984576   0.00%
15     17506680 / 470184984576   0.00%
``````

## 注意事项和想法：

• 我有一个i7-4600m处理器，带有2个核心和4个线程。我使用2个线程还是4个线程都没有关系。2个线程的cpu使用率是50％，4个线程的cpu使用率是100％，但是仍然需要相同的时间。我不知道为什么。我检查了一下，每个线程只有一半的数据，当有4个线程时，检查了结果，...
• 我使用很多清单。Python的存储效率不是很高，我必须复制很多列表，所以我想到了使用整数代替。我可以在向量A中使用位00（对于0）和11（对于1），以及向量B中的位10（对于-1），00（对于0）和01（对于1）。对于乘积对于A和B，我只需要计算`A & B`和计算01和10块。循环可以通过移动向量和使用遮罩来完成，...我实际上实现了所有这些，您可以在我在Github上的一些较早的提交中找到它。但是事实证明，它比列表要慢。我猜，pypy确实优化了列表操作。

kennytm 2014年

nbviewer.ipython.org/gist/minrk/5500077对此有所提及，尽管使用了不同的并行工具。

5

# 毛茸茸的恶霸-C ++-太慢了

``````#include <cstdlib>
#include <cmath>
#include <vector>
#include <bitset>
#include <future>
#include <thread>
#include <iostream>
#include <iomanip>

using namespace std;

/*
6^n events are generated in total, so the largest n whose tally fits in a
b-bit integer is the integer part of b*log(2)/log(6), i.e. n=24 for a 64-bit counter.

Enumerating the 3 possible values of each entry of a length-n vector needs
(integer part of n*log(3)/log(2)) + 1 bits, i.e. 39 bits for n=24.
*/
typedef unsigned long long Counter; // counts up to 6^24

typedef unsigned long long Benumerator; // 39 bits
typedef unsigned long      Aenumerator; // 24 bits

// log(2)/log(6), truncated to four decimals
#define log2_over_log6 0.3869

// Largest supported vector length: the bit width of Counter scaled by
// log(2)/log(6) and truncated (24 when Counter is 64 bits wide).
#define A_LENGTH ((size_t)(8*sizeof(Counter)*log2_over_log6))
// Room for two back-to-back copies of an A_LENGTH-bit pattern.
#define B_LENGTH (2*A_LENGTH)

// Bit set wide enough for a doubled vector.
typedef bitset<B_LENGTH> vectorB;

// One counter per position, as returned by solve().
typedef vector<Counter> OccurenceCounters;

// -----------------------------------------------------------------
// multithreading junk for CPUs detection and allocation
// -----------------------------------------------------------------
// Returns the number of worker threads to start: the hardware thread count
// reported by the standard library, or 8 when that count is unavailable.
int number_of_CPUs(void)
{
    // hardware_concurrency() yields 0 when the value cannot be determined,
    // which is what the fallback below tests for.
    const int res = static_cast<int>(std::thread::hardware_concurrency());
    return res == 0 ? 8 : res;
}

// lock_on_CPU(cpu) is called by solve() with the worker's index; each platform
// branch below is expected to provide it.
#ifdef __linux__
#include <sched.h>
// NOTE(review): the body is empty in this listing, so on Linux the call does
// nothing, and nothing from <sched.h> is used. Presumably the affinity call
// was lost - confirm against the original source.
void lock_on_CPU(int cpu)
{
}
#elif defined (_WIN32)
#include <Windows.h>
// NOTE(review): no lock_on_CPU is defined in this branch although solve()
// calls it, so the listing as shown would not compile on Windows - confirm.
#else
// #warning is not really standard, so this might still cause compiler errors on some platforms. Sorry about that.
#warning "Thread processor affinity settings not supported. Performances might be improved by providing a suitable alternative for your platform"
// fallback: the call expands to nothing
#define lock_on_CPU(cpu)
#endif

// -----------------------------------------------------------------
// B values generator
// -----------------------------------------------------------------
// A B vector as two bit masks: p1 marks the +1 entries, m1 the -1 entries.
// Bgenerator::value() stores each n-bit mask twice, at bit offsets 0 and n.
struct Bvalue {
    vectorB p1;
    vectorB m1;
};

// Enumerates B vectors whose first non-zero entry is +1.
//
// A vector is described by two counters:
//   zeroes     mask of the non-zero positions (bit i set <=> entry i is non-zero)
//   plusminus  signs of the non-zero entries after the first one
//              (bit set <=> -1), counted from 0 to pm_limit - 1
// next() advances plusminus first and moves on to the next zeroes mask when
// the sign patterns are exhausted; value() converts the current state into
// bit masks.
struct Bgenerator {
    int n;                 // A length
    Aenumerator stop;      // computation limit (first zeroes mask NOT generated)
    Aenumerator zeroes;    // current zeroes pattern
    Aenumerator plusminus; // current +1/-1 pattern
    Aenumerator pm_limit;  // upper bound of +1/-1 pattern

    // Generates the vectors whose zeroes mask lies in [start, stop).
    Bgenerator(int n, Aenumerator start=0, Aenumerator stop=0) : n(n), stop(stop)
    {
        // initialize generator so that first call to next() will generate first value:
        // plusminus wraps to 0 == pm_limit on the first increment, which in turn
        // bumps zeroes from start-1 to start
        zeroes    = start - 1;
        plusminus = -1;
        pm_limit  = 0;
    }

    // compute current B value
    Bvalue value(void)
    {
        Bvalue res;
        Aenumerator pm = plusminus;
        Aenumerator position = 1;
        int i_pm = 0; // number of non-zero entries seen so far
        for (int i = 0; i != n; i++)
        {
            if (zeroes & position)
            {
                if (i_pm == 0)  res.p1 |= position; // first non-zero value fixed to +1
                else
                {
                    if (pm & 1) res.m1 |= position; // next non-zero values
                    else        res.p1 |= position;
                    pm >>= 1;
                }
                i_pm++;
            }
            position <<= 1;
        }
        res.p1 |= (res.p1 << n); // concatenate 2 Bpre instances
        res.m1 |= (res.m1 << n);
        return res;
    }

    // Advances to the next value; returns false once the zeroes mask reaches stop.
    bool next(void)
    {
        if (++plusminus == pm_limit)
        {
            if (++zeroes == stop) return false;
            plusminus = 0;
            // k non-zero entries leave k-1 free signs, hence 2^(k-1) sign patterns
            pm_limit = (1 << vectorB(zeroes).count()) >> 1;
        }
        return true;
    }

    // calibration: produces ranges that will yield the approximate same number of B values
    //
    // Returns segments+1 zeroes-mask bounds; worker i is meant to process the
    // masks in [res[i], res[i+1]). The first bound is 1 (the all-zero vector is
    // skipped) and the last one is 2^n. When there are fewer values than
    // segments, the surplus bounds all equal 2^n, giving empty ranges.
    vector<Aenumerator> calibrate(int segments)
    {
        if (segments < 1) segments = 1; // avoid dividing by zero below

        // setup generator for the whole B range
        zeroes = 0;
        stop = (Aenumerator)1 << n;
        plusminus = -1;
        pm_limit = 0;

        // (3^n - 1) / 2 values will be generated; computed with integers so the
        // result does not depend on floating-point rounding of pow()
        Aenumerator total = 1;
        for (int i = 0; i != n; i++) total *= 3;
        total = (total - 1) / 2;

        // divide range into (nearly) equal chunks; a chunk of 0 values (small n,
        // many segments) would make the modulo below divide by zero
        Aenumerator chunk_size = total / segments;
        if (chunk_size == 0) chunk_size = 1;

        // generate bounds for zeroes values
        vector<Aenumerator> res(segments + 1);
        int bound = 0;
        res[bound] = 1;
        Aenumerator count = 0;
        while (next())
        {
            // never record more than `segments` bounds: the remainder of the
            // division can produce extra chunk boundaries that would otherwise
            // be written past the end of res
            if (++count % chunk_size == 0 && bound < segments) res[++bound] = zeroes;
        }

        // the last recorded bound, and any bound never reached, closes the range
        for (int i = bound; i <= segments; i++) res[i] = stop;
        return res;
    }
};

// -----------------------------------------------------------------
// equiprobable A values merging
// -----------------------------------------------------------------
// A set of A vectors treated as equivalent, represented by one member.
struct Agroup {
    vectorB value; // bit pattern of the representative A
    int     count; // how many A vectors the representative stands for

    Agroup(Aenumerator a = 0, int length = 0)
        : value(a)
        , count(length)
    {
    }
};

// Scratch table filled by generate_A_groups(): weight of the group represented
// by each A pattern, 0 for patterns that do not represent a group.
static char A_weight[1 << A_LENGTH];

// Final list of groups, filled by generate_A_groups() and read by solve().
static vector<Agroup> A_groups;

// Mirrors the bit pattern of n and drops the trailing zeroes of the result, so
// the returned value always has its lowest bit set (0 is returned for 0).
// Only the low N-1 bits of an N-bit word are mirrored; the top bit is ignored.
Aenumerator reverse(Aenumerator n)
{
    // the trailing-zero strip below needs at least one bit set
    if (n == 0) return 0;

    Aenumerator mirrored = 0;
    for (int remaining = 8 * sizeof(n) - 1; remaining > 0; --remaining)
    {
        // append the lowest bit of n, then make room for the next one
        mirrored = (mirrored | (n & 1)) << 1;
        n >>= 1;
    }

    // shift right to eliminate trailing zeroes
    while ((mirrored & 1) == 0) mirrored >>= 1;
    return mirrored;
}

void generate_A_groups(int n)
{
static bitset<1 << A_LENGTH> lookup(0);
Aenumerator limit_A = (Aenumerator)pow(2, n);
Aenumerator overflow = 1 << n;
for (char & w : A_weight) w = 0;

// gather rotation cycles
for (Aenumerator a = 0; a != limit_A; a++)
{
Aenumerator rotated = a;
int cycle_length = 0;
for (int i = 0; i != n; i++)
{
// check for new cycles
if (!lookup[rotated])
{
cycle_length++;
lookup[rotated] = 1;
}

// rotate current value
rotated <<= 1;
if (rotated & overflow) rotated |= 1;
rotated &= (overflow - 1);
}

// store new cycle
if (cycle_length > 0) A_weight[a] = cycle_length;
}

// merge symetric groups
for (Aenumerator a = 0; a != limit_A; a++)
{
if (A_weight[a] == 0) continue;

// regroup a symetric pair
Aenumerator r = reverse(a);
if (r != a)
{
A_weight[a] += A_weight[r];
A_weight[r] = 0;
}
}

// generate groups
for (Aenumerator a = 0; a != limit_A; a++)
{
if (A_weight[a] != 0) A_groups.push_back(Agroup(a, A_weight[a]));
}
}

// -----------------------------------------------------------------
// -----------------------------------------------------------------
// Worker body: counts, over the B vectors whose zeroes mask lies in
// [Bstart, Bstop) and over every A group, how often the first i+1 inner
// products are all zero.
//   n       vector length
//   index   worker number, passed to lock_on_CPU()
// Returns n counters; entry i is increased by the group's weight for each
// (group, B) pair whose products 0..i are all zero.
OccurenceCounters solve(int n, int index, Aenumerator Bstart, Aenumerator Bstop)
{
    OccurenceCounters zero_runs(n, 0);

    // lock on assigned CPU
    lock_on_CPU(index);

    // enumerate B vectors
    for (Bgenerator generator(n, Bstart, Bstop); generator.next(); )
    {
        const Bvalue current = generator.value();

        // enumerate A vector groups
        for (size_t g = 0; g != A_groups.size(); g++)
        {
            // Slide A one bit at a time across the doubled B masks. The product
            // is zero exactly when A hits as many +1 entries as -1 entries.
            vectorB window = A_groups[g].value;
            int shift = 0;
            while (shift != n
                && (window & current.p1).count() == (window & current.m1).count())
            {
                zero_runs[shift] += A_groups[g].count;
                window <<= 1;
                shift++;
            }
        }
    }
    return zero_runs;
}

// -----------------------------------------------------------------
// main
// -----------------------------------------------------------------
// Prints msg on standard output and terminates the program with exit code -1.
#define die(msg) { cout << msg << endl; exit (-1); }

// Entry point. Takes the vector length n as the single optional argument
// (16 when not exactly one argument is given), spreads the B range over one
// worker per CPU, then merges and prints the counters.
int main(int argc, char * argv[])
{
int n = argc == 2 ? atoi(argv[1]) : 16; // arbitray value for debugging
if (n < 1 || n > 24) die("vectors of lenght between 1 and 24 is all I can (try to) compute, guv");

// wall-clock start, in seconds
auto begin = time(NULL);

// one worker thread per CPU
int num_workers = number_of_CPUs();

// regroup equiprobable A values
generate_A_groups(n);

// compute B generation ranges for proper load balancing
vector<Aenumerator> ranges = Bgenerator(n).calibrate(num_workers);

// set workers to work
// worker i receives the zeroes-mask range [ranges[i], ranges[i+1])
vector<future<OccurenceCounters>> workers(num_workers);
for (int i = 0; i != num_workers; i++)
{
workers[i] = async(
launch::async, // without this parameter, C++ will decide whether execution shall be sequential or asynchronous (isn't C++ fun?).
solve, n, i, ranges[i], ranges[i+1]);
}

// collect results
// result has n+1 slots while each partial result has n; the extra slot is filled below
OccurenceCounters result(n + 1, 0);
for (auto& worker : workers)
{
// get() blocks until that worker has finished
OccurenceCounters partial = worker.get();
for (size_t i = 0; i != partial.size(); i++) result[i] += partial[i]*2; // each result counts for a symetric B pair
}
for (Counter & res : result) res += (Counter)1 << n; // add null B vector contribution
result[n] = result[n - 1];                           // the last two probabilities are equal by construction

// elapsed wall-clock seconds
auto duration = time(NULL) - begin;

// output
cout << "done in " << duration / 60 << ":" << setw(2) << setfill('0') << duration % 60 << setfill(' ')
<< " by " << num_workers << " worker thread" << ((num_workers > 1) ? "s" : "") << endl;
// total number of (A, B) pairs, used as the denominator
Counter events = (Counter)pow(6, n);
// column width derived from the number of decimal digits of events
int width = (int)log10(events) + 2;
cout.precision(5);
// one row per index 0..n: counter, denominator, and their ratio in fixed notation
for (int i = 0; i <= n; i++) cout << setw(2) << i << setw(width) << result[i] << " / " << events << " " << fixed << (float)result[i] / events << endl;

return 0;
}``````

## 生成可执行文件

• Win7和MSVC2013
• Win7和MinGW-g ++ 4.7
• Ubuntu＆g ++ 4.8（在分配了2个CPU的VirtualBox VM中）

## 最佳化

1. 最后一个Z项等于第一个Z项（在两种情况下均为Bpre x A），因此最后两个结果始终相等，从而无需计算最后一个Z值。
增益可忽略不计，但是对其进行编码不会花费任何代价，因此您不妨使用它。

2. 正如Jakube所发现的，给定A向量的所有循环值都产生相同的概率。
您可以使用A的单个实例来计算这些值，并将结果乘以其可能的转数。旋转组可以很容易地在很短的时间内预先计算出来，因此这是一个巨大的净速度增益。
由于长度为 n 的向量的排列数为 n-1，因此复杂度从 $o(6^n)$ 降低到 $o(6^n/(n-1))$，在相同的计算时间内基本上可以多走一步。

3. 似乎成对的对称模式也产生相同的概率。例如100101和101001。
我对此没有任何数学证明，但是直观地讲，当显示所有可能的B模式时，对于相同的全局结果，每个对称A值将与对应的对称B值进行卷积。
这样可以重组更多的A向量，从而使A组数目减少约30％。

4. 错误 出于某种半神秘的原因，仅设置了一个或两个位的所有模式都会产生相同的结果。这并不代表许多不同的组，但实际上它们可以合并而无需花费任何费用。

5. 向量B和-B（所有分量均乘以-1的B）产生相同的概率。
（例如[1，0，-1，1]和[-1，0，1，-1]）。
除了空向量（所有分量等于0）之外，B和-B形成一对不同的向量。
通过仅考虑每对中的一个，并将其贡献乘以2，可以将B值的数量减少一半，从而将已知B的全局贡献仅添加到每个概率一次。

## 怎么运行的

B值的数量巨大（$3^n$），因此对其进行预先计算将需要不适当的内存量，这将减慢计算速度并最终耗尽可用的RAM。

## 结果示例

``````
C:\Dev\PHP\_StackOverflow\C++\VectorCrunch>release\VectorCrunch.exe 16
done in 8:19 by 4 worker threads
0  487610895942 / 2821109907456 0.17284
1   97652126058 / 2821109907456 0.03461
2   20659337010 / 2821109907456 0.00732
3    4631534490 / 2821109907456 0.00164
4    1099762394 / 2821109907456 0.00039
5     302001914 / 2821109907456 0.00011
6     115084858 / 2821109907456 0.00004
7      70235786 / 2821109907456 0.00002
8      59121706 / 2821109907456 0.00002
9      56384426 / 2821109907456 0.00002
10      55686922 / 2821109907456 0.00002
11      55508202 / 2821109907456 0.00002
12      55461994 / 2821109907456 0.00002
13      55451146 / 2821109907456 0.00002
14      55449098 / 2821109907456 0.00002
15      55449002 / 2821109907456 0.00002
16      55449002 / 2821109907456 0.00002
``````

## 进一步完善

``````
  10001011 and 10001101
100101011 and 100110101
100101111 and 100111101
100110111 and 100111011
101001011 and 101001101
101011011 and 101101011
101100111 and 110100111
1010110111 and 1010111011
1011011111 and 1011111011
1011101111 and 1011110111
``````