这是Alexandru的基数排序与Zjarek的多线程智能选取枢轴(pivot)方法的混合。使用以下命令编译:
g++ -std=c++0x -pthread -O3 -march=native sorter_gaussian_radix.cxx -o sorter_gaussian_radix
您可以通过定义STEP来更改基数大小(例如,添加-DSTEP=11)。我发现在我的笔记本电脑上最佳值是8(默认值)。
默认情况下,它将问题分成4部分并在多个线程上运行。您可以通过在命令行上传递depth参数来更改它。因此,如果您有两个核心,请这样运行:
sorter_gaussian_radix 50000000 1
如果您有16个核心,请这样运行:
sorter_gaussian_radix 50000000 4
现在的最大深度为6(64个线程)。如果设置的层数过多,只会降低代码速度。
我还尝试过Intel Performance Primitives(IPP)库中的基数排序。Alexandru的实现明显优于IPP,IPP的速度要慢30%。这个变体也包含在这里(已注释掉)。
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <algorithm>
#include <ctime>
#include <iostream>
#include <thread>
#include <vector>
#include <boost/cstdint.hpp>
// #include "ipps.h"
// Radix digit width in bits; override at compile time with -DSTEP=<n>.
#ifndef STEP
#define STEP 8
#endif
const int step = STEP;
// Lowest bit position examined by the radix passes; bits below it are left
// to the final insertion sort in recursive_sort.
const int start_step=24;
// Number of radix passes needed to cover bits [start_step, 64) in digits of
// `step` bits, rounded up.
const int num_steps=(64-start_step+step-1)/step;
// Number of doubles to sort; set from argv[1] in main.
int size;
// dbuf receives the input data; copy is the second buffer of the same size.
// recursive_sort alternates between the two on every radix pass.
double *dbuf, *copy;
// Timing checkpoints written inside the leaf branch of recursive_sort and
// printed by main. NOTE(review): every leaf thread writes these without
// synchronization, so the printed values come from whichever thread wrote last.
clock_t c1, c2, c3, c4, c5;
// Table of 63 ascending pivot values, (nearly) symmetric around distrib[31] == 0.
// recursive_sort uses distrib[index] as the partition pivot at each level,
// starting from index 31 and stepping by +/-offset.
// NOTE(review): the values look like standard-normal quantiles at k/64
// (k = 1..63), which would split Gaussian input into equal parts — confirm.
const double distrib[]={-2.15387,
-1.86273,
-1.67594,
-1.53412,
-1.4178,
-1.31801,
-1.22986,
-1.15035,
-1.07752,
-1.00999,
-0.946782,
-0.887147,
-0.830511,
-0.776422,
-0.724514,
-0.67449,
-0.626099,
-0.579132,
-0.53341,
-0.488776,
-0.445096,
-0.40225,
-0.36013,
-0.318639,
-0.27769,
-0.237202,
-0.197099,
-0.157311,
-0.11777,
-0.0784124,
-0.0391761,
0,
0.0391761,
0.0784124,
0.11777,
0.157311,
0.197099,
0.237202,
0.27769,
0.318639,
0.36013,
0.40225,
0.445097,
0.488776,
0.53341,
0.579132,
0.626099,
0.67449,
0.724514,
0.776422,
0.830511,
0.887147,
0.946782,
1.00999,
1.07752,
1.15035,
1.22986,
1.31801,
1.4178,
1.53412,
1.67594,
1.86273,
2.15387};
// Unary predicate "a < pivot" for use with std::partition.
//
// The pivot is stored as a double. Storing it as an int truncated every
// fractional pivot toward zero (e.g. -0.67449 became 0), which made the
// partitions badly unbalanced.
class Distrib
{
    const double value;  // pivot; elements strictly below it satisfy the predicate
public:
    explicit Distrib(const double &v): value(v) {}

    // Returns true when `a` is strictly less than the pivot.
    bool operator()(double a) const
    {
        return a<value;
    }
};
void recursive_sort(const int start, const int end,
const int index, const int offset,
const int depth, const int max_depth)
{
if(depth<max_depth)
{
Distrib dist(distrib[index]);
const int middle=std::partition(dbuf+start,dbuf+end,dist) - dbuf;
// const int middle=
// std::partition(dbuf+start,dbuf+end,[&](double a)
// {return a<distrib[index];})
// - dbuf;
std::thread lower(recursive_sort,start,middle,index-offset,offset/2,
depth+1,max_depth);
std::thread upper(recursive_sort,middle,end,index+offset,offset/2,
depth+1,max_depth);
lower.join(), upper.join();
}
else
{
// ippsSortRadixAscend_64f_I(dbuf+start,copy+start,end-start);
c1=clock();
double *dbuf_local(dbuf), *copy_local(copy);
boost::uint64_t mask = (1 << step) - 1;
int cnt[num_steps][mask+1];
boost::uint64_t *ibuf = reinterpret_cast<boost::uint64_t *> (dbuf_local);
for(int i=0;i<num_steps;++i)
for(uint j=0;j<mask+1;++j)
cnt[i][j]=0;
for (int i = start; i < end; i++)
{
for (int w = start_step, v = 0; w < 64; w += step, v++)
{
int p = (~ibuf[i] >> w) & mask;
(cnt[v][p])++;
}
}
c2=clock();
std::vector<int> sum(num_steps,0);
for (uint i = 0; i <= mask; i++)
{
for (int w = start_step, v = 0; w < 64; w += step, v++)
{
int tmp = sum[v] + cnt[v][i];
cnt[v][i] = sum[v];
sum[v] = tmp;
}
}
c3=clock();
for (int w = start_step, v = 0; w < 64; w += step, v++)
{
ibuf = reinterpret_cast<boost::uint64_t *>(dbuf_local);
for (int i = start; i < end; i++)
{
int p = (~ibuf[i] >> w) & mask;
copy_local[start+((cnt[v][p])++)] = dbuf_local[i];
}
std::swap(copy_local,dbuf_local);
}
// Do the last set of reversals
for (int p = start; p < end; p++)
if (dbuf_local[p] >= 0.)
{
std::reverse(dbuf_local+p, dbuf_local + end);
break;
}
c4=clock();
// Insertion sort
for (int i = start+1; i < end; i++) {
double value = dbuf_local[i];
if (value < dbuf_local[i - 1]) {
dbuf_local[i] = dbuf_local[i - 1];
int p = i - 1;
for (; p > 0 && value < dbuf_local[p - 1]; p--)
dbuf_local[p] = dbuf_local[p - 1];
dbuf_local[p] = value;
}
}
c5=clock();
}
}
int main(int argc, char **argv) {
size = atoi(argv[1]);
copy = new double[size];
dbuf = new double[size];
FILE *f = fopen("gaussian.dat", "r");
fread(dbuf, size, sizeof(double), f);
fclose(f);
clock_t c0 = clock();
const int max_depth= (argc > 2) ? atoi(argv[2]) : 2;
// ippsSortRadixAscend_64f_I(dbuf,copy,size);
recursive_sort(0,size,31,16,0,max_depth);
if(num_steps%2==1)
std::swap(dbuf,copy);
// for (int i=0; i<size-1; i++){
// if (dbuf[i]>dbuf[i+1])
// std::cout << "BAD "
// << i << " "
// << dbuf[i] << " "
// << dbuf[i+1] << " "
// << "\n";
// }
std::cout << "Finished after "
<< (double) (c1 - c0) / CLOCKS_PER_SEC << " "
<< (double) (c2 - c1) / CLOCKS_PER_SEC << " "
<< (double) (c3 - c2) / CLOCKS_PER_SEC << " "
<< (double) (c4 - c3) / CLOCKS_PER_SEC << " "
<< (double) (c5 - c4) / CLOCKS_PER_SEC << " "
<< "\n";
// delete [] dbuf;
// delete [] copy;
return 0;
}
编辑:我实现了Alexandru的缓存改进,在我的机器上节省了大约30%的时间。
编辑:这实现了递归排序,因此它应该能在Alexandru的16核心计算机上很好地工作。它还使用了Alexandru的最新改进,并去掉了其中一次反转(reverse)操作。对我来说,这带来了20%的提升。
编辑:修复了一个符号错误,当存在两个以上内核时,该错误会导致效率低下。
编辑:删除了lambda,因此它可以用旧版本的gcc编译。它包括注释掉的IPP代码变体。我还修正了关于在16个核心上运行的说明。据我所知,这是最快的实现。
编辑:修复了STEP不为8时的错误。将最大线程数增加到64。添加了一些计时信息。