# 竞赛：排序大量高斯分布数据的最快方法

71

``````#!/usr/bin/env python
import random
from array import array
from sys import argv
# Number of samples to generate, taken from the first command-line argument.
count = int(argv[1])
# Draw `count` samples with random.gauss(0, 1) and pack them into an array of
# C doubles (type code 'd').
a = array('d', (random.gauss(0, 1) for x in xrange(count)))
# Dump the array's raw machine representation. The `with` block makes sure the
# file is flushed and closed before the script ends.
with open("gaussian.dat", "wb") as f:
    a.tofile(f)
``````

``````#!/usr/bin/env python
from array import array
from sys import argv
# Number of doubles to load, taken from the first command-line argument.
count = int(argv[1])
a = array('d')
# The file holds raw machine doubles, so it must be opened in binary mode;
# the `with` block closes the handle as soon as the data is loaded.
with open("gaussian.dat", "rb") as f:
    a.fromfile(f, count)
print("sorting...")
# Reference timing target: the built-in sort on the loaded values.
b = sorted(a)
``````

1

1

1
@static_rtti——作为重度CG用户，这正是“我们”喜欢在CG.SE上尝试的事情。各位正在阅读的版主，请将其移至CG，不要关闭它。
arrdem 2011年

1

dmckee 2011年

2

dmckee 2011年

13

``````#include <cstdlib>
#include <math.h>
#include <stdio.h>
#include <algorithm>
#include <cstring>
#include <vector>

#include <tbb/parallel_for.h>

using namespace std;

typedef unsigned long long ull;

// Sign of x as a double: -1 for negative input, 1 for positive input and 0
// otherwise (this includes NaN, for which both comparisons are false).
double signum(double x) {
    if (x < 0) {
        return -1;
    }
    if (x > 0) {
        return 1;
    }
    return 0;
}

const double fourOverPI = 4 / M_PI;

// Closed-form approximation of the error function:
//   sign(x) * sqrt(1 - exp(-x^2 * (4/pi + a*x^2) / (1 + a*x^2))),  a = 0.147.
// NOTE(review): this has the same name and signature as the erf declared by
// <math.h>; confirm the toolchain accepts the redefinition.
double erf(double x) {
    const double coefficient = 0.147;
    const double squared = x * x;
    const double scaled = coefficient * squared;
    const double exponent = -squared * (fourOverPI + scaled) / (1 + scaled);
    const double magnitude = sqrt(1 - exp(exponent));
    return signum(x) * magnitude;
}

const double sqrt2 = sqrt(2);

// Cumulative distribution function of the standard normal distribution,
// expressed through erf: 0.5 + erf(x / sqrt(2)) / 2.
double cdf(double x) {
    const double erfValue = erf(x / sqrt2);
    return 0.5 + erfValue / 2;
}

const int cdfTableSize = 200;
const double cdfTableLimit = 5;
// Builds a lookup table of `size` CDF samples taken at evenly spaced points
// from 0 to cdfTableLimit inclusive. The table is allocated with new[] and
// handed to the caller.
double* computeCdfTable(int size) {
    double* const table = new double[size];
    const int lastIndex = size - 1;
    int i = 0;
    while (i < size) {
        // Sample position for entry i; entry lastIndex sits on cdfTableLimit.
        const double x = cdfTableLimit * i / lastIndex;
        table[i] = cdf(x);
        ++i;
    }
    return table;
}
const double* const cdfTable = computeCdfTable(cdfTableSize);

// Fast approximation of cdf(x) using linear interpolation in cdfTable.
//
// The table only covers [0, cdfTableLimit], so negative inputs are mirrored
// using cdf(-x) == 1 - cdf(x). Inputs beyond the table fall back to cdf().
//
// Fix: when a negative input landed exactly on a table node the old code
// returned -cdfTable[below] (a negative "probability") instead of
// 1 - cdfTable[below]; both paths now share the same mirroring step.
double cdfApprox(double x) {
    const bool negative = (x < 0);
    if (negative) x = -x;

    // x is now |x|; outside the table, evaluate cdf() on the original value.
    if (x > cdfTableLimit) return negative ? cdf(-x) : cdf(x);

    // Fractional table position of |x|.
    const double p = (cdfTableSize - 1) * x / cdfTableLimit;
    const int below = (int) p;

    double ret;
    if (p == below) {
        // Exactly on a node: no interpolation, and no read of cdfTable[below + 1]
        // (which would be out of range for the last node).
        ret = cdfTable[below];
    } else {
        const int above = below + 1;
        ret = cdfTable[below] +
              (cdfTable[above] - cdfTable[below]) * (p - below);
    }
    return negative ? 1 - ret : ret;
}

// Debug helper: writes the first `len` doubles of `arr` to stdout in "%e; "
// format, followed by a newline.
void print(const double* arr, int len) {
    for (int remaining = len; remaining > 0; --remaining, ++arr) {
        printf("%e; ", *arr);
    }
    puts("");
}

// Debug helper: writes the first `len` ints of `arr` to stdout in "%d; "
// format, followed by a newline.
void print(const int* arr, int len) {
    for (int remaining = len; remaining > 0; --remaining, ++arr) {
        printf("%d; ", *arr);
    }
    puts("");
}

// Scatters data[0..N) into `buckets` so that all values assigned to bucket 0
// come first, then bucket 1, and so on (a counting-sort style distribution).
//
//   N            number of elements in data / partitions / buckets
//   bucketCount  number of buckets; every partitions[i] must be in [0, bucketCount)
//   data         input values
//   partitions   partitions[i] is the bucket index chosen for data[i]
//   buckets      output array of N values, grouped by bucket
//   offsets      array of bucketCount + 1 ints; must be all zero on entry.
//                On return bucket b occupies [offsets[b], offsets[b+1]).
//
// Elements keep their relative input order inside each bucket.
void fillBuckets(int N, int bucketCount,
                 double* data, int* partitions,
                 double* buckets, int* offsets) {
    // Pass 1: histogram of bucket sizes.
    for (int i = 0; i < N; ++i) {
        ++offsets[partitions[i]];
    }

    // Pass 2: exclusive prefix sum turns sizes into start positions.
    int offset = 0;
    for (int i = 0; i < bucketCount; ++i) {
        const int count = offsets[i];
        offsets[i] = offset;
        offset += count;
    }
    offsets[bucketCount] = N;

    // Pass 3: scatter. `next[b]` is the next free slot of bucket b. A vector
    // replaces the former variable-length array, which is not standard C++.
    std::vector<int> next(offsets, offsets + bucketCount);
    for (int i = 0; i < N; ++i) {
        buckets[next[partitions[i]]++] = data[i];
    }
}

class Sorter {
public:
Sorter(double* data, int* offsets) {
this->data = data;
this->offsets = offsets;
}

static void radixSort(double* arr, int len) {
ull* encoded = (ull*)arr;
for (int i = 0; i < len; ++i) {
ull n = encoded[i];
if (n & signBit) {
n ^= allBits;
} else {
n ^= signBit;
}
encoded[i] = n;
}

const int step = 11;
const ull mask = (1ull << step) - 1;
int offsets[8][1ull << step];
memset(offsets, 0, sizeof(offsets));

for (int i = 0; i < len; ++i) {
for (int b = 0, j = 0; b < 64; b += step, ++j) {
int p = (encoded[i] >> b) & mask;
++offsets[j][p];
}
}

int sum[8] = {0};
for (int i = 0; i <= mask; i++) {
for (int b = 0, j = 0; b < 64; b += step, ++j) {
int t = sum[j] + offsets[j][i];
offsets[j][i] = sum[j];
sum[j] = t;
}
}

ull* copy = new ull[len];
ull* current = encoded;
for (int b = 0, j = 0; b < 64; b += step, ++j) {
for (int i = 0; i < len; ++i) {
int p = (current[i] >> b) & mask;
copy[offsets[j][p]] = current[i];
++offsets[j][p];
}

ull* t = copy;
copy = current;
current = t;
}

if (current != encoded) {
for (int i = 0; i < len; ++i) {
encoded[i] = current[i];
}
}

for (int i = 0; i < len; ++i) {
ull n = encoded[i];
if (n & signBit) {
n ^= signBit;
} else {
n ^= allBits;
}
encoded[i] = n;
}
}

void operator() (tbb::blocked_range<int>& range) const {
for (int i = range.begin(); i < range.end(); ++i) {
double* begin = &data[offsets[i]];
double* end = &data[offsets[i+1]];
//std::sort(begin, end);
}
}

private:
double* data;
int* offsets;
static const ull signBit = 1ull << 63;
static const ull allBits = ~0ull;
};

void sortBuckets(int bucketCount, double* data, int* offsets) {
Sorter sorter(data, offsets);
tbb::blocked_range<int> range(0, bucketCount);
tbb::parallel_for(range, sorter);
//sorter(range);
}

// Body object for tbb::parallel_for: assigns each input value a bucket index
// by scaling its approximate CDF value to [0, bucketCount).
class Partitioner {
public:
    Partitioner(int bucketCount, double* data, int* partitions) {
        this->data = data;
        this->partitions = partitions;
        this->bucketCount = bucketCount;
    }

    // Writes partitions[i] for every index i in `range`.
    void operator() (tbb::blocked_range<int>& range) const {
        for (int i = range.begin(); i < range.end(); ++i) {
            int p = (int) (cdfApprox(data[i]) * bucketCount);
            // A CDF estimate of exactly 1.0 (or one slightly outside [0, 1))
            // would yield an index outside [0, bucketCount) and corrupt the
            // bucket bookkeeping, so clamp to the valid range.
            if (p < 0) {
                p = 0;
            } else if (p >= bucketCount) {
                p = bucketCount - 1;
            }
            partitions[i] = p;
        }
    }

private:
    double* data;
    int* partitions;
    int bucketCount;
};

const int bucketCount = 512;
int offsets[bucketCount + 1];

// Entry point. Reads N doubles from gaussian.dat, assigns each one to a
// bucket, groups the values by bucket and sorts the buckets.
// Usage: <program> N
// Returns 0 on success and 1 on a usage, argument or input error.
int main(int argc, char** argv) {
    if (argc != 2) {
        printf("Usage: %s N\n N = the size of the input\n", argv[0]);
        return 1;
    }

    puts("initializing...");
    const int N = atoi(argv[1]);
    if (N <= 0) {
        // atoi yields 0 for non-numeric text; a non-positive size is unusable.
        fprintf(stderr, "N must be a positive integer\n");
        return 1;
    }

    // Load the input first so that a failure needs only one cleanup.
    double* input = new double[N];
    FILE* fp = fopen("gaussian.dat", "rb");
    size_t itemsRead = 0;
    if (fp != 0) {
        itemsRead = fread(input, sizeof(*input), N, fp);
        fclose(fp);
    }
    if (itemsRead != (size_t) N) {
        fprintf(stderr, "could not read %d doubles from gaussian.dat\n", N);
        delete[] input;
        return 1;
    }
    //print(input, N);

    double* buckets = new double[N];
    int* partitions = new int[N];
    memset(offsets, 0, sizeof(offsets));  // fillBuckets expects zeroed counters

    puts("assigning partitions...");
    tbb::parallel_for(tbb::blocked_range<int>(0, N),
                      Partitioner(bucketCount, input, partitions));

    puts("filling buckets...");
    fillBuckets(N, bucketCount, input, partitions, buckets, offsets);
    delete[] partitions;
    delete[] input;
    double* data = buckets;  // the bucketed copy is the working array from here on

    puts("sorting buckets...");
    sortBuckets(bucketCount, data, offsets);

    puts("done.");

    /*
    for (int i = 0; i < N-1; ++i) {
        if (data[i] > data[i+1]) {
            printf("error at %d: %e > %e\n", i, data[i], data[i+1]);
        }
    }
    */

    //print(data, N);

    delete[] buckets;
    return 0;
}
``````

``````g++ -O3 -ltbb -o gsort gsort.cpp && time ./gsort 50000000
``````

Scott

k21

static_rtti 2011年

2

3

Alexandru

13

``````#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* qsort comparator for doubles: returns -1, 1 or 0 when the first value is
 * smaller than, greater than, or neither smaller nor greater than the
 * second. */
int cmp(const void* av, const void* bv) {
    const double lhs = *(const double*)av;
    const double rhs = *(const double*)bv;
    if (lhs < rhs)
        return -1;
    if (lhs > rhs)
        return 1;
    return 0;
}
/* Reads `count` doubles (count = argv[1]) from gaussian.dat, copies them and
 * sorts the copy with qsort.
 *
 * Fixes: the file used to be closed only when the read FAILED, and execution
 * then continued with partial data; fopen and malloc results were never
 * checked; the buffers were never released. */
int main(int argc, char** argv) {
    if (argc <= 1)
        return puts("No argument!");
    unsigned count = atoi(argv[1]);

    double *a = malloc(count * sizeof *a);
    double *b = malloc(count * sizeof *b);
    /* malloc(0) may legitimately return NULL, so only treat NULL as an error
     * when something was requested. */
    if (count > 0 && (a == NULL || b == NULL)) {
        fputs("Out of memory!\n", stderr);
        free(a);
        free(b);
        return 1;
    }

    FILE *f = fopen("gaussian.dat", "rb");
    if (f == NULL) {
        fputs("Cannot open gaussian.dat!\n", stderr);
        free(a);
        free(b);
        return 1;
    }
    size_t got = fread(a, sizeof *a, count, f);
    fclose(f);
    if (got != count) {
        fputs("Reading failed!\n", stderr);
        free(a);
        free(b);
        return 1;
    }

    puts("sorting...");
    memcpy(b, a, count * sizeof *b);
    qsort(b, count, sizeof *b, cmp);

    free(b);
    free(a);
    return 0;
}
``````

1

1

Codism 2011年

@Codism：我要补充一点，我们不在乎交换等效数据的位置，因此即使获得等效值也将是一个适当的简化。

10

http://www.wolframalpha.com/input/?i=percentages+by++normal+distribution

``````import java.io.FileInputStream;
import java.nio.ByteBuffer
import java.nio.ByteOrder
import scala.collection.mutable.ArrayBuilder

object SortFile {

//used partition numbers from Damascus' solution
val partList = List(0, 0.15731, 0.31864, 0.48878, 0.67449, 0.88715, 1.1503, 1.5341)

val listSize = partList.size * 2;
val posZero = partList.size;
val neg = partList.map( _ * -1).reverse.zipWithIndex
val pos = partList.map( _ * 1).zipWithIndex.reverse

def partition(dbl:Double): Int = {

//for each partition, i am running through the vals in order
//could make this a binary search to be more performant... but our list size is 4 (per side)

if(dbl < 0) { return neg.find( dbl < _._1).get._2  }
if(dbl > 0) { return posZero  + pos.find( dbl > _._1).get._2  }
return posZero;

}

def main(args: Array[String])
{

var l = 0
val dbls = new Array[Double](50000000)
val partList = new Array[Int](50000000)
val pa = Array.fill(listSize){Array.newBuilder[Double]}
val channel = new FileInputStream("../../gaussian.dat").getChannel()
val bb = ByteBuffer.allocate(50000000 * 8)
bb.order(ByteOrder.LITTLE_ENDIAN)
bb.rewind
var dbl = 0.0
while(bb.hasRemaining)
{
dbl = bb.getDouble
dbls.update(l,dbl)

l+=1
}

for( i <- (0 to 49999999).par) { partList.update(i, partition(dbls(i)))}

println("Partition computed" + System.currentTimeMillis() )
for(i <- (0 to 49999999)) { pa(partList(i)) += dbls(i) }
println("Partition completed " + System.currentTimeMillis())
val toSort = for( i <- pa) yield i.result()
println("Arrays Built" + System.currentTimeMillis());
toSort.par.foreach{i:Array[Double] =>scala.util.Sorting.quickSort(i)};

}
}
``````

1
8.185秒！我猜这是一个理想的Scala解决方案...另外，勇敢地提供了第一个实际上以某种方式使用高斯分布的解决方案！

1

Scott

9

``````using System;
using System.IO;

namespace Sort
{
class Program
{
const int count = 50000000;
static double[][] doubles;
static WaitHandle[] waiting = new WaitHandle[4];
static AutoResetEvent[] events = new AutoResetEvent[4];

static double[] Merge(double[] left, double[] right)
{
double[] result = new double[left.Length + right.Length];
int l = 0, r = 0, spot = 0;
while (l < left.Length && r < right.Length)
{
if (right[r] < left[l])
result[spot++] = right[r++];
else
result[spot++] = left[l++];
}
while (l < left.Length) result[spot++] = left[l++];
while (r < right.Length) result[spot++] = right[r++];
return result;
}

{
int index = (int)data;
Array.Sort(doubles[index]);
events[index].Set();
}

static void Main(string[] args)
{
System.Diagnostics.Stopwatch watch = new System.Diagnostics.Stopwatch();
watch.Start();
doubles = new double[][] { new double[count / 4], new double[count / 4], new double[count / 4], new double[count / 4] };
for (int i = 0; i < 4; i++)
{
for (int j = 0; j < count / 4; j++)
{
doubles[i][j] = BitConverter.ToDouble(bytes, i * count/4 + j * 8);
}
}
for (int i = 0; i < 4; i++)
{
waiting[i] = events[i] = new AutoResetEvent(false);
}
WaitHandle.WaitAll(waiting);
double[] left = Merge(doubles[0], doubles[1]);
double[] right = Merge(doubles[2], doubles[3]);
double[] result = Merge(left, right);
watch.Stop();
Console.WriteLine(watch.Elapsed.ToString());
}
}
}
``````

1

1

1

8

`phi`只是高斯累积分布函数。它将+/-无穷大之间的高斯分布数转换为0到1之间的均匀分布数。一种简单的计算方法是使用表查找和内插法。

3

@static_rtti：在这种情况下，phi的必要近似值会比数据集IMO中的任何不规则性产生更大的麻烦。

1
@static_rtti：不一定是准确的。它只需要散布数据就可以使数据大致均匀，因此在某些地方不会造成太多麻烦。

8

``````#include <stdio.h>
#include <stdlib.h>
#include <algorithm>
#include <ctime>

typedef unsigned long long ull;

int size;
double *dbuf, *copy;
int cnt[8][1 << 16];

// Sorts the global buffer `dbuf` (`size` doubles) into ascending order, using
// the global buffer `copy` as scratch space and the global table `cnt` for
// digit counts (`cnt` is expected to be zero on entry).
//
// Strategy: a radix sort on the bitwise-complemented upper 40 bits of each
// double (four 10-bit digits, bits 24..63), a reversal of the non-negative
// tail, and a final insertion sort that fixes whatever order the ignored low
// 24 bits leave behind.
// NOTE(review): the ordering argument assumes 64-bit IEEE-754 doubles — confirm
// for the target platform.
void sort()
{
const int step = 10;
const int start = 24;
ull mask = (1ULL << step) - 1;

// Histogram every digit of every (complemented) key in a single sweep.
ull *ibuf = (ull *) dbuf;
for (int i = 0; i < size; i++) {
for (int w = start, v = 0; w < 64; w += step, v++) {
int p = (~ibuf[i] >> w) & mask;
cnt[v][p]++;
}
}

// Exclusive prefix sums: cnt[v][d] becomes the first output slot for digit d
// in pass v.
int sum[8] = { 0 };
for (int i = 0; i <= mask; i++) {
for (int w = start, v = 0; w < 64; w += step, v++) {
int tmp = sum[v] + cnt[v][i];
cnt[v][i] = sum[v];
sum[v] = tmp;
}
}

// One stable scatter pass per digit, least significant first. The two buffers
// swap roles after every pass; with four passes the result ends up in the
// buffer `dbuf` pointed to originally.
for (int w = start, v = 0; w < 64; w += step, v++) {
ull *ibuf = (ull *) dbuf;
for (int i = 0; i < size; i++) {
int p = (~ibuf[i] >> w) & mask;
copy[cnt[v][p]++] = dbuf[i];
}

double *tmp = copy;
copy = dbuf;
dbuf = tmp;
}

// Sorting by complemented bits leaves negatives first in ascending order and
// non-negatives after them in descending order; reverse that tail.
for (int p = 0; p < size; p++)
if (dbuf[p] >= 0.) {
std::reverse(dbuf + p, dbuf + size);
break;
}

// Insertion sort: elements are at most slightly out of place now (only the
// low 24 bits were ignored), so each one moves a short distance.
for (int i = 1; i < size; i++) {
double value = dbuf[i];
if (value < dbuf[i - 1]) {
dbuf[i] = dbuf[i - 1];
int p = i - 1;
for (; p > 0 && value < dbuf[p - 1]; p--)
dbuf[p] = dbuf[p - 1];
dbuf[p] = value;
}
}
}

// Entry point: loads `size` doubles (size = argv[1]) from gaussian.dat into
// the global buffer, sorts them with sort() and prints the elapsed CPU time.
//
// Fix: the file used to be opened and closed without being read, so sort()
// ran on uninitialised memory. The argument and the read are now validated.
int main(int argc, char **argv) {
    if (argc < 2) {
        fprintf(stderr, "Usage: %s N\n", argv[0]);
        return 1;
    }
    size = atoi(argv[1]);
    if (size <= 0) {
        fprintf(stderr, "N must be a positive integer\n");
        return 1;
    }
    dbuf = new double[size];
    copy = new double[size];

    // Binary mode: the file contains raw doubles, not text.
    FILE *f = fopen("gaussian.dat", "rb");
    if (f == NULL) {
        fprintf(stderr, "Cannot open gaussian.dat\n");
        return 1;
    }
    const size_t got = fread(dbuf, sizeof(double), size, f);
    fclose(f);
    if (got != (size_t) size) {
        fprintf(stderr, "Could not read %d doubles from gaussian.dat\n", size);
        return 1;
    }

    clock_t c0 = clock();
    sort();
    printf("Finished after %.3f\n", (double) ((clock() - c0)) / CLOCKS_PER_SEC);
    return 0;
}
``````

``````\$ g++ -O3 mysort.cpp -o mysort && ./mysort 50000000
Finished after 2.10
\$ g++ -O3 stdsort.cpp -o stdsort && ./stdsort
Finished after 7.12
``````

static_rtti 2011年

2

Alexandru

static_rtti 2011年

1
@static_rtti：我看到Damascus Steel已经发布了该实现的多线程版本。我改进了该算法的缓存行为，因此您现在应该获得更好的计时。请测试此新版本。
Alexandru

2

static_rtti 2011年

6

``````#include <iostream>
#include <fstream>
#include <algorithm>
#include <vector>
#include <ctime>
#include <tbb/parallel_sort.h>

// Reads every double stored in gaussian.dat, sorts them with
// tbb::parallel_sort and prints the elapsed CPU time in seconds.
//
// Fix: the read loop was missing, so exactly one uninitialised double was
// pushed and "sorted". The file is now read until the stream fails.
int main(void)
{
    std::ifstream ifs("gaussian.dat", std::ios::binary | std::ios::in);
    if (!ifs) {
        std::cerr << "Cannot open gaussian.dat" << std::endl;
        return 1;
    }
    std::vector<double> values;
    values.reserve(50000000);
    double d;
    // Each successful read yields one raw 8-byte value.
    while (ifs.read(reinterpret_cast<char*>(&d), sizeof(d)))
        values.push_back(d);
    clock_t c0 = clock();
    tbb::parallel_sort(values.begin(), values.end());
    std::cout << "Finished after "
              << static_cast<double>((clock() - c0)) / CLOCKS_PER_SEC
              << std::endl;
    return 0;
}
``````

``````#include <tbb/parallel_sort.h>
``````

``````#include "ipps.h"
``````

``````tbb::parallel_sort(values.begin(), values.end());
``````

``````std::vector<double> copy(values.size());
``````

``````C               16.4 s
C#              20 s
C++ std::sort   7.2 s
C++ tbb         5 s
C++ ipp         4.5 s
python          too long
``````

1
2.958秒！TBB看起来很酷，易于使用！

2
TBB真是太棒了。这正是算法工作的正确抽象层次。
drxzcl 2011年

5

5

``````#include <cstdio>
#include <iostream>
#include <algorithm>
using namespace std;
const size_t size=50000000;

// Partitions [start, end) in place around the value `middle`: elements
// greater than `middle` are moved to the back. On return `koniec` is the
// length of the leading part, i.e. the offset from `start` at which the
// trailing part begins.
//
// Fix: an empty range used to decrement `end` below `start` and then walk
// through memory outside the range; it now reports a split point of 0.
void pivot(double* start,double * end, double middle,size_t& koniec){
    if (start == end) {
        koniec = 0;
        return;
    }
    double* const beg = start;
    --end;  // `end` now addresses the last unclassified element
    while (start != end) {
        if (*start > middle) {
            // Too large for the front: trade it for the last unclassified
            // element and shrink the window from the right.
            std::swap(*start, *end);
            --end;
        } else {
            ++start;
        }
    }
    // Exactly one element is still unclassified; count it in the front part
    // when it is below the pivot value.
    if (*end < middle) {
        ++start;
    }
    koniec = start - beg;
}
void s(double * a, double* b){
sort(a,b);
}
int main(){
double *data=new double[size];
FILE *f = fopen("gaussian.dat", "rb");
size_t end1,end2,end3,temp;
pivot(data, data+size,0,end2);
pivot(data, data+end2,-0.6745,end1);
pivot(data+end2,data+size,0.6745,end3);
end3+=end2;
ts1.join(),ts2.join(),ts3.join(),ts4.join();
//for (int i=0; i<size-1; i++){
//}
fclose(f);
//fwrite(data,8,size,stdout);
}
``````

//编辑更改为读取gaussian.dat文件。

static_rtti 2011年

static_rtti 2011年

4

``````#include <iostream>
#include <fstream>
#include <algorithm>
#include <vector>
#include <ctime>

// Reads every double stored in C:\Temp\gaussian.dat, sorts them with
// std::sort and prints the elapsed CPU time in seconds.
//
// Fix: the read loop was missing, so exactly one uninitialised double was
// pushed and "sorted". The file is now read until the stream fails.
int main(void)
{
    std::ifstream ifs("C:\\Temp\\gaussian.dat", std::ios::binary | std::ios::in);
    if (!ifs) {
        std::cerr << "Cannot open gaussian.dat" << std::endl;
        return 1;
    }
    std::vector<double> values;
    values.reserve(50000000);
    double d;
    // Each successful read yields one raw 8-byte value.
    while (ifs.read(reinterpret_cast<char*>(&d), sizeof(d)))
        values.push_back(d);
    clock_t c0 = clock();
    std::sort(values.begin(), values.end());
    std::cout << "Finished after "
              << static_cast<double>((clock() - c0)) / CLOCKS_PER_SEC
              << std::endl;
    return 0;
}
``````

6.425秒！不出所料，C ++

@static_rtti：我尝试了swensons Timsort算法（正如Matthieu M.在您的第一个问题中所建议的那样）。我必须对`sort.h`文件进行一些更改才能使用C ++进行编译。它慢了大约两倍`std::sort`。不知道为什么，也许是由于编译器优化？
Christian Ammer

4

``````g++ -std=c++0x -pthread -O3 -march=native sorter_gaussian_radix.cxx -o sorter_gaussian_radix
``````

``````sorter_gaussian_radix 50000000 1
``````

``````sorter_gaussian_radix 50000000 4
``````

``````#include <stdlib.h>
#include <string.h>
#include <algorithm>
#include <ctime>
#include <iostream>
#include <vector>
#include <boost/cstdint.hpp>
// #include "ipps.h"

#ifndef STEP
#define STEP 8
#endif

const int step = STEP;
const int start_step=24;
const int num_steps=(64-start_step+step-1)/step;
int size;
double *dbuf, *copy;

clock_t c1, c2, c3, c4, c5;

const double distrib[]={-2.15387,
-1.86273,
-1.67594,
-1.53412,
-1.4178,
-1.31801,
-1.22986,
-1.15035,
-1.07752,
-1.00999,
-0.946782,
-0.887147,
-0.830511,
-0.776422,
-0.724514,
-0.67449,
-0.626099,
-0.579132,
-0.53341,
-0.488776,
-0.445096,
-0.40225,
-0.36013,
-0.318639,
-0.27769,
-0.237202,
-0.197099,
-0.157311,
-0.11777,
-0.0784124,
-0.0391761,
0,
0.0391761,
0.0784124,
0.11777,
0.157311,
0.197099,
0.237202,
0.27769,
0.318639,
0.36013,
0.40225,
0.445097,
0.488776,
0.53341,
0.579132,
0.626099,
0.67449,
0.724514,
0.776422,
0.830511,
0.887147,
0.946782,
1.00999,
1.07752,
1.15035,
1.22986,
1.31801,
1.4178,
1.53412,
1.67594,
1.86273,
2.15387};

// Unary predicate for std::partition: true for values strictly below the
// pivot passed to the constructor.
//
// Fix: the pivot used to be stored in a `const int`, which truncated every
// fractional quantile (e.g. -0.67449 became 0) and skewed the partitions.
class Distrib
{
    const double value;
public:
    Distrib(const double &v): value(v) {}

    bool operator()(double a) const
    {
        return a<value;
    }
};

void recursive_sort(const int start, const int end,
const int index, const int offset,
const int depth, const int max_depth)
{
if(depth<max_depth)
{
Distrib dist(distrib[index]);
const int middle=std::partition(dbuf+start,dbuf+end,dist) - dbuf;

// const int middle=
//   std::partition(dbuf+start,dbuf+end,[&](double a)
//                  {return a<distrib[index];})
//   - dbuf;

depth+1,max_depth);
depth+1,max_depth);
lower.join(), upper.join();
}
else
{

c1=clock();

double *dbuf_local(dbuf), *copy_local(copy);
boost::uint64_t mask = (1 << step) - 1;

boost::uint64_t *ibuf = reinterpret_cast<boost::uint64_t *> (dbuf_local);

for(int i=0;i<num_steps;++i)
cnt[i][j]=0;

for (int i = start; i < end; i++)
{
for (int w = start_step, v = 0; w < 64; w += step, v++)
{
int p = (~ibuf[i] >> w) & mask;
(cnt[v][p])++;
}
}

c2=clock();

std::vector<int> sum(num_steps,0);
for (uint i = 0; i <= mask; i++)
{
for (int w = start_step, v = 0; w < 64; w += step, v++)
{
int tmp = sum[v] + cnt[v][i];
cnt[v][i] = sum[v];
sum[v] = tmp;
}
}

c3=clock();

for (int w = start_step, v = 0; w < 64; w += step, v++)
{
ibuf = reinterpret_cast<boost::uint64_t *>(dbuf_local);

for (int i = start; i < end; i++)
{
int p = (~ibuf[i] >> w) & mask;
copy_local[start+((cnt[v][p])++)] = dbuf_local[i];
}
std::swap(copy_local,dbuf_local);
}

// Do the last set of reversals
for (int p = start; p < end; p++)
if (dbuf_local[p] >= 0.)
{
std::reverse(dbuf_local+p, dbuf_local + end);
break;
}

c4=clock();

// Insertion sort
for (int i = start+1; i < end; i++) {
double value = dbuf_local[i];
if (value < dbuf_local[i - 1]) {
dbuf_local[i] = dbuf_local[i - 1];
int p = i - 1;
for (; p > 0 && value < dbuf_local[p - 1]; p--)
dbuf_local[p] = dbuf_local[p - 1];
dbuf_local[p] = value;
}
}
c5=clock();

}
}

int main(int argc, char **argv) {
size = atoi(argv[1]);
copy = new double[size];

dbuf = new double[size];
FILE *f = fopen("gaussian.dat", "r");
fclose(f);

clock_t c0 = clock();

const int max_depth= (argc > 2) ? atoi(argv[2]) : 2;

recursive_sort(0,size,31,16,0,max_depth);

if(num_steps%2==1)
std::swap(dbuf,copy);

// for (int i=0; i<size-1; i++){
//   if (dbuf[i]>dbuf[i+1])
//               << i << " "
//               << dbuf[i] << " "
//               << dbuf[i+1] << " "
//               << "\n";
// }

std::cout << "Finished after "
<< (double) (c1 - c0) / CLOCKS_PER_SEC << " "
<< (double) (c2 - c1) / CLOCKS_PER_SEC << " "
<< (double) (c3 - c2) / CLOCKS_PER_SEC << " "
<< (double) (c4 - c3) / CLOCKS_PER_SEC << " "
<< (double) (c5 - c4) / CLOCKS_PER_SEC << " "
<< "\n";

// delete [] dbuf;
// delete [] copy;
return 0;
}
``````

Alexandru

Alexandru

static_rtti 2011年

1.534秒！我认为我们有一个领导者：-D
static_rtti 2011年

@static_rtti：您能再试一次吗？它的速度比上次尝试的速度快得多。在我的机器上，它比任何其他解决方案都快得多。

2

``````import scipy.stats
import random

# slightly modified from linked stackoverflow post
def n_random_numbers_increasing(n):
    """Yield n values from [0, 1] in non-decreasing order.

    Intended as a faster stand-in for sorted(random() for i in range(n)):
    the values are produced already ordered, so no sort is needed.
    Yields nothing when n is zero or negative.
    """
    v = 1.0
    # `n > 0` rather than bare `n`: a negative count must not loop forever.
    while n > 0:
        # Shrink v by a random factor in [0, 1); 1 - v therefore never decreases.
        v *= random.random() ** (1.0 / n)
        yield 1 - v
        n -= 1

def n_normal_samples_increasing(n):
    """Map the ordered values from n_random_numbers_increasing(n) through
    scipy.stats.norm.ppf (the normal distribution's inverse CDF).

    Returns whatever `map` returns on the running interpreter.
    """
    return map(scipy.stats.norm.ppf, n_random_numbers_increasing(n))
``````

2

2

``````    static void Main(string[] args)
{
FileStream filestream = new FileStream(@"..\..\..\gaussian.dat", FileMode.Open, FileAccess.Read);
doubles = new double[][] { new double[count / 4], new double[count / 4], new double[count / 4], new double[count / 4] };

for (int i = 0; i < 4; i++)
{
byte[] bytes = new byte[count * 4];

for (int j = 0; j < count / 4; j++)
{
doubles[i][j] = BitConverter.ToDouble(bytes, i * count/4 + j * 8);
}

waiting[i] = events[i] = new AutoResetEvent(false);
}

WaitHandle.WaitAll(waiting);
double[] left = Merge(doubles[0], doubles[1]);
double[] right = Merge(doubles[2], doubles[3]);
double[] result = Merge(left, right);
}
}
``````

8.933秒。稍微快一点：)

2

poulejapon 2011年

@poulejapon：你是对的。

Sven Marnach 2011年

2

2

``````#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <algorithm>
#include <ctime>

using std::fill;

const double q[] = {
0.0,
9.865E-10,
2.8665150000000003E-7,
3.167E-5,
0.001349898,
0.022750132,
0.158655254,
0.5,
0.8413447460000001,
0.9772498679999999,
0.998650102,
0.99996833,
0.9999997133485,
0.9999999990134999,
1.0,
};
int main(int argc, char** argv) {
if (argc <= 1)
return puts("No argument!");
unsigned count = atoi(argv[1]);
unsigned count2 = 3 * count;

bool *ba = new bool[count2 + 1000];
fill(ba, ba + count2 + 1000, false);
double *a = new double[count];
double *c = new double[count2 + 1000];

FILE *f = fopen("gaussian.dat", "rb");
if (fread(a, 8, count, f) != count)
fclose(f);

int i;
int j;
bool s;
int t;
double z;
double p;
double d1;
double d2;
for (i = 0; i < count; i++) {
s = a[i] < 0;
t = a[i];
if (s) t--;
z = a[i] - t;
t += 7;
if (t < 0) {
t = 0;
z = 0;
} else if (t >= 14) {
t = 13;
z = 1;
}
p = q[t] * (1 - z) + q[t + 1] * z;
j = count2 * p;
while (ba[j] && c[j] < a[i]) {
j++;
}
if (!ba[j]) {
ba[j] = true;
c[j] = a[i];
} else {
d1 = c[j];
c[j] = a[i];
j++;
while (ba[j]) {
d2 = c[j];
c[j] = d1;
d1 = d2;
j++;
}
c[j] = d1;
ba[j] = true;
}
}
i = 0;
int max = count2 + 1000;
for (j = 0; j < max; j++) {
if (ba[j]) {
a[i++] = c[j];
}
}
// for (i = 0; i < count; i += 1) {
//   printf("here %f\n", a[i]);
// }
return 0;
}
``````

1

static_rtti 2011年

3.071秒！对于单线程解决方案来说还不错！
static_rtti 2011年

2

`g++ -std=c++0x -O3 -march=native -pthread` => http://pastebin.com/T3yzViZP 主排序程序

1.621秒！我认为您是领导者，但对于所有这些答案，我都迅速失去了方向：)
static_rtti 2011年

2

• 近似计算CDF（请参阅实现中的`phi()`函数）
• 对于所有元素，请计算已排序数组中的近似位置： `size * phi(x)`
• 将元素放置在靠近最终位置的新数组中
• 在我的实现中，目标数组中有一些空白，因此插入时不必移动太多元素。
• 使用insertsort对最终元素进行排序（如果到最终位置的距离小于常数，则insertsort是线性的）。

1
2.470秒！很好的主意。没关系，如果想法很有趣，解决方案就不是最快的了：)
static_rtti 2011年

1

jonderry 2011年

@jonderry：我了解您的解决方案后，我投票支持您的解决方案。并不是要窃取您的想法。我将您的实现包含在我的（非正式）测试集中
Alexandru

2

``````import java.io.FileInputStream;
import java.nio.channels.FileChannel;
import java.util.Arrays;
import java.util.concurrent.*;
import static java.nio.ByteOrder.LITTLE_ENDIAN;

/**
*
* Original Quicksort: https://github.com/pmbauer/parallel/tree/master/src/main/java/pmbauer/parallel
*
*/
public class ForkJoinQuicksortTask extends RecursiveAction {

public static void main(String[] args) throws Exception {

double[] array = new double[Integer.valueOf(args[0])];

FileChannel fileChannel = new FileInputStream("gaussian.dat").getChannel();

ForkJoinPool mainPool = new ForkJoinPool();

System.out.println("Starting parallel computation");

}

private static final long serialVersionUID = -642903763239072866L;
private static final int SERIAL_THRESHOLD = 0x1000;

private final double a[];
private final int left, right;

public ForkJoinQuicksortTask(double[] a) {this(a, 0, a.length - 1);}

private ForkJoinQuicksortTask(double[] a, int left, int right) {
this.a = a;
this.left = left;
this.right = right;
}

@Override
protected void compute() {
if (right - left < SERIAL_THRESHOLD) {
Arrays.sort(a, left, right + 1);
} else {
int pivotIndex = partition(a, left, right);

if (left < pivotIndex)
t1 = new ForkJoinQuicksortTask(a, left, pivotIndex).fork();
if (pivotIndex + 1 < right)
new ForkJoinQuicksortTask(a, pivotIndex + 1, right).invoke();

if (t1 != null)
t1.join();
}
}

public static int partition(double[] a, int left, int right) {
// chose middle value of range for our pivot
double pivotValue = a[left + (right - left) / 2];

--left;
++right;

while (true) {
do
++left;
while (a[left] < pivotValue);

do
--right;
while (a[right] > pivotValue);

if (left < right) {
double tmp = a[left];
a[left] = a[right];
a[right] = tmp;
} else {
return right;
}
}
}
}
``````

Python参考

``````time python sort.py 50000000
sorting...

real    1m13.885s
user    1m11.942s
sys     0m1.935s
``````

Java JDK 7分叉/联接

``````time java ForkJoinQuicksortTask 50000000
Starting parallel computation

real    0m2.404s
user    0m10.195s
sys     0m0.347s
``````

``````import static java.nio.ByteOrder.LITTLE_ENDIAN;

import java.io.FileInputStream;
import java.nio.DoubleBuffer;
import java.nio.channels.FileChannel;
import java.util.Arrays;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.RecursiveAction;

/**
*
* Original Quicksort: https://github.com/pmbauer/parallel/tree/master/src/main/java/pmbauer/parallel
*
*/
public class ForkJoinQuicksortTask extends RecursiveAction {

public static void main(String[] args) throws Exception {

ForkJoinPool mainPool = new ForkJoinPool();

double[] array = new double[Integer.valueOf(args[0])];
FileChannel fileChannel = new FileInputStream("gaussian.dat").getChannel();
DoubleBuffer buffer = fileChannel.map(READ_ONLY, 0, fileChannel.size()).order(LITTLE_ENDIAN).asDoubleBuffer();

}

private static final long serialVersionUID = -642903763239072866L;
private static final int SERIAL_THRESHOLD = 0x1000;

private final double a[];
private final int left, right;

public ForkJoinQuicksortTask(double[] a) {this(a, 0, a.length - 1);}

private ForkJoinQuicksortTask(double[] a, int left, int right) {
this.a = a;
this.left = left;
this.right = right;
}

@Override
protected void compute() {
if (right - left < SERIAL_THRESHOLD) {
Arrays.sort(a, left, right + 1);
} else {
int pivotIndex = partition(a, left, right);

if (left < pivotIndex)
t1 = new ForkJoinQuicksortTask(a, left, pivotIndex).fork();
if (pivotIndex + 1 < right)
new ForkJoinQuicksortTask(a, pivotIndex + 1, right).invoke();

if (t1 != null)
t1.join();
}
}

public static int partition(double[] a, int left, int right) {
// chose middle value of range for our pivot
double pivotValue = a[left + (right - left) / 2];

--left;
++right;

while (true) {
do
++left;
while (a[left] < pivotValue);

do
--right;
while (a[right] > pivotValue);

if (left < right) {
double tmp = a[left];
a[left] = a[right];
a[right] = tmp;
} else {
return right;
}
}
}

}

private static final long serialVersionUID = -3498527500076085483L;

private final DoubleBuffer buffer;
private final double[] array;
private final int low, high;

public ReadAction(DoubleBuffer buffer, double[] array, int low, int high) {
this.buffer = buffer;
this.array = array;
this.low = low;
this.high = high;
}

@Override
protected void compute() {
if (high - low < 100000) {
buffer.position(low);
buffer.get(array, low, high-low);
} else {
int middle = (low + high) >>> 1;

}
}
}
``````

``````Cores  Time
1      7.568s
2      3.903s
3      3.325s
4      2.388s
5      2.227s
6      1.956s
7      1.856s
8      1.827s
9      1.682s
10     1.698s
11     1.620s
12     1.503s
``````

``````Cores  Time
1      15.056s
2      8.116s
3      5.925s
4      4.802s
5      4.430s
6      3.733s
7      3.540s
8      3.228s
9      3.103s
10     2.827s
11     2.784s
12     2.689s
``````

150,000,000个double：

``````Cores  Time
1      23.295s
2      12.391s
3      8.944s
4      6.990s
5      6.216s
6      6.211s
7      5.446s
8      5.155s
9      4.840s
10     4.435s
11     4.248s
12     4.174s
``````

C ++版本与其结果非常一致，其中Java稍有波动。首先，当问题变大时，它的效率会提高一点，但随后效率会降低。

1

jonderry 2011年

1

arjan 2011年

static_rtti 2011年

1

arjan 2011年

1

``````#include <stdio.h>
#include <stdlib.h>
#include <algorithm>

static unsigned int nthreads = 4;
static unsigned int size = 50000000;

typedef struct {
double *array;
int size;
} array_t;

// Merges two ascending runs, left[0..leftsize) and right[0..rightsize), into
// `result`, which must have room for leftsize + rightsize values. When the
// two front elements compare equal, the one from `right` is emitted first.
void
merge(double *left, int leftsize,
      double *right, int rightsize,
      double *result)
{
    double *const leftEnd = left + leftsize;
    double *const rightEnd = right + rightsize;

    // Emit the smaller front element until one run is exhausted.
    while (left < leftEnd && right < rightEnd) {
        if (*left < *right) {
            *result++ = *left++;
        } else {
            *result++ = *right++;
        }
    }

    // At most one of these loops has anything left to copy.
    while (left < leftEnd) {
        *result++ = *left++;
    }
    while (right < rightEnd) {
        *result++ = *right++;
    }
}

void *
{
array_t numbers = *(array_t *)input;
std::sort(numbers.array, numbers.array+numbers.size);
}

int
main(int argc, char **argv)
{
double *numbers = (double *) malloc(size * sizeof(double));

FILE *f = fopen("gaussian.dat", "rb");
if (fread(numbers, sizeof(double), size, f) != size)
fclose(f);

int worksetsize = size / nthreads;
for (int i = 0; i < nthreads; i++) {
worksets[i].array=numbers+(i*worksetsize);
worksets[i].size=worksetsize;
}

for (int i = 0; i < nthreads; i++) {
}

for (int i = 0; i < nthreads; i++) {
}

double *tmp = (double *) malloc(size * sizeof(double));
merge(numbers, worksetsize, numbers+worksetsize, worksetsize, tmp);
merge(numbers+(worksetsize*2), worksetsize, numbers+(worksetsize*3), worksetsize, tmp+(size/2));
merge(tmp, worksetsize*2, tmp+(size/2), worksetsize*2, numbers);

/*
printf("Verifying result..\n");
for (int i = 0; i < size - 1; i++) {
if (numbers[i] > numbers[i+1])
printf("Result is not correct\n");
}
*/

return 0;
}
``````

``````real    0m6.660s
user    0m9.449s
sys     0m1.160s
``````

1

``````gcc -std=c99 -msse3 -O3 -ffinite-math-only
``````

``````#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <math.h>

#define N 50000000
#define BINSIZE 720
#define MAXBINSIZE 880
#define BINCOUNT (N / BINSIZE)
#define SPLITS 64
#define PHI_VALS 513

double phi_vals[PHI_VALS];

int bin_index(double x)
{
double y = (x + 8.0) * ((PHI_VALS - 1) / 16.0);
int interval = y;
y -= interval;
return (1.0 - y) * phi_vals[interval] + y * phi_vals[interval + 1];
}

double bin_value(int bin)
{
int left = 0;
int right = PHI_VALS - 1;
do
{
int centre = (left + right) / 2;
if (bin < phi_vals[centre])
right = centre;
else
left = centre;
} while (right - left > 1);
double frac = (bin - phi_vals[left]) / (phi_vals[right] - phi_vals[left]);
return (left + frac) * (16.0 / (PHI_VALS - 1)) - 8.0;
}

void gaussian_sort(double *restrict a)
{
double *b = malloc(BINCOUNT * MAXBINSIZE * sizeof(double));
double **pos = malloc(BINCOUNT * sizeof(double*));
for (size_t i = 0; i < BINCOUNT; ++i)
pos[i] = b + MAXBINSIZE * i;
for (size_t i = 0; i < N; ++i)
*pos[bin_index(a[i])]++ = a[i];
double left_val, right_val = bin_value(0);
for (size_t bin = 0, i = 0; bin < BINCOUNT; ++bin)
{
left_val = right_val;
right_val = bin_value(bin + 1);
double *splits[SPLITS + 1];
splits[0] = b + bin * MAXBINSIZE;
splits[SPLITS] = pos[bin];
for (int step = SPLITS; step > 1; step >>= 1)
for (int left_split = 0; left_split < SPLITS; left_split += step)
{
double *left = splits[left_split];
double *right = splits[left_split + step] - 1;
double frac = (double)(left_split + (step >> 1)) / SPLITS;
double pivot = (1.0 - frac) * left_val + frac * right_val;
while (1)
{
while (*left < pivot && left <= right)
++left;
while (*right >= pivot && left < right)
--right;
if (left >= right)
break;
double tmp = *left;
*left = *right;
*right = tmp;
++left;
--right;
}
splits[left_split + (step >> 1)] = left;
}
for (int left_split = 0; left_split < SPLITS; ++left_split)
{
double *left = splits[left_split];
double *right = splits[left_split + 1] - 1;
while (left <= right)
{
double *min = left;
for (double *tmp = left + 1; tmp <= right; ++tmp)
if (*tmp < *min)
min = tmp;
a[i++] = *min;
*min = *right--;
}
}
}
free(b);
free(pos);
}

/* Loads N doubles from gaussian.dat, fills the phi_vals lookup table and
 * sorts the data with gaussian_sort.
 *
 * Fix: the fread call used to live inside assert(), so building with NDEBUG
 * removed the read entirely; fopen and malloc results were unchecked. */
int main()
{
    double *a = malloc(N * sizeof(double));
    if (a == NULL)
    {
        fprintf(stderr, "out of memory\n");
        return 1;
    }
    FILE *f = fopen("gaussian.dat", "rb");
    if (f == NULL)
    {
        fprintf(stderr, "cannot open gaussian.dat\n");
        free(a);
        return 1;
    }
    size_t got = fread(a, sizeof(double), N, f);
    fclose(f);
    if (got != (size_t)N)
    {
        fprintf(stderr, "gaussian.dat holds fewer than %d doubles\n", N);
        free(a);
        return 1;
    }
    /* phi_vals[i]: normal CDF at the i-th sample point, scaled to a
     * (fractional) bin index in [0, BINCOUNT]. */
    for (int i = 0; i < PHI_VALS; ++i)
    {
        double x = (i * (16.0 / PHI_VALS) - 8.0) / sqrt(2.0);
        phi_vals[i] =  (erf(x) + 1.0) * 0.5 * BINCOUNT;
    }
    gaussian_sort(a);
    free(a);
    return 0;
}
``````

4.098秒！我必须添加-lm才能对其进行编译（用于erf）。
static_rtti 2011年

1
``````#include <stdio.h>
#include <math.h>
#include <stdlib.h>
#include <memory.h>
#include <algorithm>

// Standard normal CDF. Maps [-inf,+inf] to [0,1]: in floating point the
// result reaches exactly 0.0 and 1.0 for inputs far out in the tails.
double normcdf(double x) {
    return 0.5 * (1 + erf(x * M_SQRT1_2));
}

// Returns the index in [0, bins) of the bin that x belongs to when the value
// range is split into `bins` bins of equal probability mass.
//
// Fix: normcdf(x) == 1.0 used to produce the index `bins`, one past the last
// valid bin, which made callers write outside their arrays.
int calcbin(double x, int bins) {
    int bin = (int)floor(normcdf(x) * bins);
    if (bin >= bins) {
        bin = bins - 1;
    }
    if (bin < 0) {
        bin = 0;
    }
    return bin;
}

// Counts how many of arr[0..n) fall into each of the `bins` bins chosen by
// calcbin. Returns a calloc'ed array of `bins` counters that the caller owns.
//
// Fix: the calloc result needs an explicit cast because this file is compiled
// as C++ (it uses std::sort); allocation failure is now reported.
int *docensus(int bins, int n, double *arr) {
    int *hist = (int *) calloc(bins, sizeof(int));
    if (hist == NULL) {
        printf("Out of memory\n");
        exit(1);
    }
    for (int i = 0; i < n; i++) {
        hist[calcbin(arr[i], bins)]++;
    }
    return hist;
}

// Rearranges `arr` in place so that all values of bin 0 come first, then bin
// 1, and so on. orig_counts[b] is the number of values in bin b (as produced
// by docensus) and is left unmodified.
//
// Fix: malloc results need an explicit cast because this file is compiled as
// C++ (it uses std::sort); allocation failure is now reported.
void partition(int bins, int *orig_counts, double *arr) {
    // counts[b]: values of bin b not yet placed; starts[b]: next free slot of bin b.
    int *counts = (int *) malloc(bins * sizeof(int));
    int *starts = (int *) malloc(bins * sizeof(int));
    if (counts == NULL || starts == NULL) {
        printf("Out of memory\n");
        exit(1);
    }
    memcpy(counts, orig_counts, bins*sizeof(int));
    int b, i;
    starts[0] = 0;
    for(i = 1; i < bins; i++) {
        starts[i] = starts[i-1] + counts[i-1];
    }
    for(b = 0; b < bins; b++) {
        while (counts[b] > 0) {
            // Pick up the value sitting in bin b's next free slot and follow the
            // displacement chain: drop the value into its own bin's next slot,
            // pick up what was there, and repeat until a value lands in bin b.
            double v = arr[starts[b]];
            int correctbin;
            do {
                correctbin = calcbin(v, bins);
                int swappos = starts[correctbin];
                double tmp = arr[swappos];
                arr[swappos] = v;
                v = tmp;
                starts[correctbin]++;
                counts[correctbin]--;
            } while (correctbin != b);
        }
    }
    free(counts);
    free(starts);
}

// Sorts each bin of `arr` independently with std::sort. The bins are laid out
// back to back and bin b holds counts[b] values.
void sortbins(int bins, int *counts, double *arr) {
    double *binBegin = arr;
    for (int b = 0; b < bins; ++b) {
        double *const binEnd = binBegin + counts[b];
        std::sort(binBegin, binEnd);
        binBegin = binEnd;
    }
}

// Verifies that arr[0..n) is in non-decreasing order. On the first adjacent
// pair that is out of order it prints the position and both values, then
// terminates the process with exit status 1.
void checksorted(double *arr, int n) {
    for (int pos = 1; pos < n; ++pos) {
        const double previous = arr[pos - 1];
        const double current = arr[pos];
        if (previous > current) {
            printf("out of order at %d: %lf %lf\n", pos, previous, current);
            exit(1);
        }
    }
}

// Entry point: reads n doubles (n = argv[1]) from gaussian.dat, distributes
// them into bins of equal probability mass, sorts every bin and verifies the
// result. An optional second argument overrides the number of bins.
//
// Fixes: the data was never read from the file (the buffer stayed
// uninitialised); malloc needed a cast for C++; a small n produced zero bins.
int main(int argc, char *argv[]) {
    if (argc == 1 || argv[1] == NULL) {
        printf("Expected data size as argument\n");
        exit(1);
    }
    int n = atoi(argv[1]);
    if (n <= 0) {
        printf("Data size must be a positive integer\n");
        exit(1);
    }
    const int cachesize = 128 * 1024; // a guess
    int bins = (int) (1.1 * n * sizeof(double) / cachesize);
    if (argc > 2) {
        bins = atoi(argv[2]);
    }
    if (bins < 1) {
        bins = 1; // the formula yields 0 for small n; one bin is the minimum
    }
    printf("Using %d bins\n", bins);
    FILE *f = fopen("gaussian.dat", "rb");
    if (f == NULL) {
        printf("Couldn't open gaussian.dat\n");
        exit(1);
    }
    double *arr = (double *) malloc(n * sizeof(double));
    if (arr == NULL) {
        printf("Out of memory\n");
        fclose(f);
        exit(1);
    }
    const size_t got = fread(arr, sizeof(double), n, f);
    fclose(f);
    if (got != (size_t) n) {
        printf("Couldn't read %d values from gaussian.dat\n", n);
        free(arr);
        exit(1);
    }

    int *counts = docensus(bins, n, arr);
    partition(bins, counts, arr);
    sortbins(bins, counts, arr);
    checksorted(arr, n);

    free(counts);
    free(arr);
    return 0;
}
``````

Alexandru

static_rtti 2011年

@Alexandru：我向normcdf添加了分段线性逼近，但速度仅提高了5％。

@static_rtti：您不必放任何东西。默认情况下，代码会自行选择bin（分箱）数量，使平均bin大小为128KB的10/11。bin太少，您将无法获得分区的好处；bin太多，分区阶段会由于缓存溢出而陷入困境。

10.6秒！我尝试了几种bin数量，使用5000获得了最好的结果（略高于默认值3356）。我必须说，我本以为您的解决方案会有更好的性能……也许是因为您使用了qsort，而不是可能更快的C++方案std::sort？
static_rtti 2011年

1

``````int main(void)
{
std::ifstream ifs("C:\\Temp\\gaussian.dat", std::ios::binary | std::ios::in);
std::vector<float> v;
v.reserve(50000000);
double d;