


  • 输入:每行具有三个以空格分隔的正整数。
  • 输出:所有满足以下任一条件的输入行 A B T。

    1. 存在另一输入行 C D U,其中 D = A 且 0 <= T - U < 100
    2. 存在另一输入行 C D U,其中 B = C 且 0 <= U - T < 100


import random    
nolines = 50000000 # 50 million
for i in xrange(nolines):
    print random.randint(0,nolines-1), random.randint(0,nolines-1), random.randint(0,nolines-1)


计时将在我的机器上进行。这是一台装有 AMD FX-8350 八核处理器和 8GB 内存的标准 Ubuntu 系统。这也意味着我需要能够运行您的代码。



sync && sudo bash -c 'echo  3 > /proc/sys/vm/drop_caches'

time wc test.file

real    0m26.835s
user    0m18.363s
sys     0m0.495s

time sort -n largefile.file  > /dev/null

real    1m32.344s
user    2m9.530s
sys     0m6.543s



sync && sudo bash -c 'echo  3 > /proc/sys/vm/drop_caches'
  • Perl(正在等待错误修复。)
  • Scala 1分钟37秒,@James_pic。(使用 scala -J-Xmx6g Filterer largefile.file output.txt)
  • Java。@Geobits 1分23秒。(使用java -Xmx6g Filter_26643)
  • C。@ScottLeadley 用时 2分21秒。
  • C。@James_pic 28秒。
  • Python + pandas。也许有一个简单的“ groupby”解决方案?
  • C。@KeithRandall 28秒。

获胜者是Keith Randall和James_pic。



请定义正整数。1 < n < 2147483647
durron597 2014年


IchBinKeinBaum 2014年



C,约 4.1 秒(此前约 7 秒)





#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <fcntl.h>

// B = # of bits per radix pass
// R = # of radix passes
#define B 9
#define R 3
#define M ((1<<B)-1)
#define MAXN 50000000

int count[R][1<<B];

// One parsed input line. a, b and t hold the line's three integers in input
// order; print is set to 1 once the entry is found to match another entry.
typedef struct {
  int a,b,t,print;
} entry;

entry A[MAXN];
entry C[MAXN];

// Sized to fit well in L1 cache
unsigned char bcount[16384];

int main(int argc, char *argv[]) {
  FILE *f = fopen(argv[1], "r");
  fseek(f, 0, SEEK_END);
  int size = ftell(f);

  int fd = open(argv[1], O_RDONLY);
  const char *p = (const char*)mmap(0, size, PROT_READ, MAP_SHARED, fd, 0);
  const char *endp = p + size;

  // parse, insert into array
  int n = 0;
  while(p < endp) {

    // parse line
    int a = 0;
    while(*p != ' ') {
      a *= 10;
      a += *p - '0';
    int b = 0;
    while(*p != ' ') {
      b *= 10;
      b += *p - '0';
    int t = 0;
    while(*p != '\n') {
      t *= 10;
      t += *p - '0';

    // insert it
    if(n == MAXN) {
      printf("too many elements\n");
    A[n].a = a;
    A[n].b = b;
    A[n].t = t;

    // compute counts for radix sort

  // accumulate count entries
  for(int r = 0; r < R; r++) {
    for(int i = 0; i < M; i++) {

  // radix sort, 3 rounds
  for(int i = n-1; i >= 0; i--) {
    C[--count[0][A[i].t&M]] = A[i];
  for(int i = n-1; i >= 0; i--) {
    A[--count[1][(C[i].t>>B)&M]] = C[i];
  for(int i = n-1; i >= 0; i--) {
    C[--count[2][A[i].t>>2*B]] = A[i];

  // Walk through array (now sorted by T) and find matches.
  // We maintain a window of T values that might match.
  // To facilitate finding matches within that window, bcount
  // keeps track of a count of how many b's in that window
  // have the given low 14 bits.
  int j = 0;
  for(int i = 0; i < n; i++) {
    int a = C[i].a;
    int t = C[i].t;
    while(C[j].t <= t - 100) {
      int x = C[j].b & 16383;
      if(bcount[x] != 255) bcount[x]--;
    if(bcount[a & 16383] > 0) {
      // somewhere in the window is a b that matches the
      // low 14 bits of a.  Find out if there is a full match.
      for(int k = j; k < i; k++) {
        if(a == C[k].b)
          C[k].print = C[i].print = 1;
    int x = C[i].b & 16383;
    if(bcount[x] != 255) bcount[x]++;
  for(int i = 0; i < n; i++) {
      printf("%d %d %d\n", C[i].a, C[i].b, C[i].t);


是!我喜欢它。我有种感觉,缓存局部性可以使加入T的速度更快,但是我一直认为排序阶段可以抵消任何收益。使用Radix sort几乎可以消除这种情况。
James_pic 2014年

基数排序在缓存中效果很好,因为有一个读流和N个写流(在我的代码中,N = 512)。只要您的缓存具有N + 1个缓存行,所有内容都可以保留在缓存中。

该死!我其实已经写了一个 filter.c 来做同样的事情,结果来到这个问题下就发现了这个答案。+1

@Lembik:代码目前只能对 B * R = 27 位的数字排序。现在您的数字有 29 位——需要再增加一趟(R++),或者让每趟多处理一位(B++)。B++ 可能更容易,因为 R 被硬编码在一些展开的循环中。


Scala 2.10-0:41


-- Self-join of the input: pair row x with every row y whose b equals x.a and
-- whose t is at most 99 below x.t (0 <= x.t - y.t < 100).
select * from data x, data y where x.a = y.b and 0 <= x.t - y.t and x.t - y.t < 100



import scala.io.Source
import scala.reflect.ClassTag
import java.io._

object Filterer {
  /**
   * Rounds `x` up to the nearest power of two; a value that already is a
   * power of two is returned unchanged.
   *
   * The bit trick smears the highest set bit of `x - 1` into every lower
   * position and then adds one. Note that `x = 0` yields 0, and inputs above
   * 2^30 overflow `Int` because the next power of two is not representable.
   *
   * @param x the value to round up
   * @return the smallest power of two that is `>= x` (for 1 <= x <= 2^30)
   */
  def roundUpToNextPowerOfTwo(x: Int): Int = {
    // blatantly stolen from http://bits.stephan-brumme.com/roundUpToNextPowerOfTwo.html
    var y = x - 1
    y |= y >> 1
    y |= y >> 2
    y |= y >> 4
    y |= y >> 8
    y |= y >> 16
    y + 1
  }

  // We hash join the array with itself, a to b, and emit both rows if t is within 100. 50m records should fit into 8GB OK.
  def main(args: Array[String]): Unit = {
    val input = Source.fromFile(args(0), "ASCII").getLines()
    val output = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(args(1)), "US-ASCII"))
    try {
      val data1: Array[Row] = input.map{line =>

       * In theory, data1 and data2 could be created in parallel, but OpenHashMultiMap needs
       * to know its size at creation time, to sidestep the need for rehashing. I could just
       * hard-code 50 million (the size of the data in the challenge), but that seems dishonest.
      val data2 = new OpenHashMultiMap[Int, Row](roundUpToNextPowerOfTwo(data1.size) * 2, -1)
      for (r <- data1) data2.insert(r.a, r) // data2 is hashed by a

      for (row1 <- data1.par) {
        val Row(a, b, t) = row1
        for (Row(c, d, u) <- data2.get(b) if (0 <= u - t) && (u - t < 100)) {
          // The conditions are symmetric, so if row1 matches, so does row2
          output.write(s"$a $b $t\n$c $d $u\n")
    } finally {

object Row {
  def apply(data: String): Row = {
    val l = data.length
    var i = 0
    var a = 0
    var b = 0
    var c = 0
    while (data.charAt(i) != ' ') {
      a = a * 10 + (data.charAt(i) - '0')
      i += 1
    i += 1
    while (data.charAt(i) != ' ') {
      b = b * 10 + (data.charAt(i) - '0')
      i += 1
    i += 1
    while (i < l) {
      c = c * 10 + (data.charAt(i) - '0')
      i += 1
    Row(a, b, c)

/** One input record: the three integers of a line, in order (`a`, `b`, then `t`). */
final case class Row(a: Int, b: Int, t: Int)

 * None of the standard Java or Scala collections are particularly efficient as large MultiMaps,
 * so we write our own. We use open hashing with quadratic probing.
class OpenHashMultiMap[@specialized(Int) K: ClassTag, V: ClassTag](capacity: Int, default: K) {
  require((capacity & (capacity - 1)) == 0) // Power of 2 capacity
  private val keys = Array.fill(capacity)(default)
  private val values = new Array[V](capacity)
  private val mask = capacity - 1

  private def hash(k: K) = {
    // Hash mingling - Int has a particularly poor hash
    k.hashCode * 428916315

  def insert(k: K, v: V) = {
    var found = false
    var loc = hash(k) & mask
    var inc = 0
    while (inc <= capacity && !found) {
      loc = (loc + inc) & mask
      inc += 1
      found = keys(loc) == default
    keys(loc) = k
    values(loc) = v

  def get(key: K) = new Traversable[V] {
    override def foreach[U](f: V => U) = {
      var break = false
      var loc = hash(key) & mask
      var inc = 0
      while (inc <= capacity && !break) {
        loc = (loc + inc) & mask
        inc += 1
        val k = keys(loc)
        if (key == k) f(values(loc))
        else if (k == default) break = true


scalac Filterer.scala


scala -J-server -J-XX:+AggressiveOpts -J-Xms6g -J-Xmx6g Filterer input_file.dat output_file.txt






我很好奇为什么我的数据读取代码比@Geobits慢得多。Thread.start错误修复后,我的代码需要70秒才能读取数据-比@Geobits整个程序更长。我很想窃取@Geobits方法来读取数据,但是我不确定Stack Exchange众神对此有何看法。








我得到 James_pic.scala:42: 错误:应为 ')',但找到了字符串字面量。output.write(s"$a $b $t\n$c $d $u\n") ^ 发现一个错误。这是在 Scala 编译器 2.9.2 版本上。

我让它与2.10.3一起工作。这是一个非常好的解决方案,尽管之后我的可怜的计算机在尝试分配6GB RAM时或多或少地无法使用一分钟左右。

是的对不起 我想你可能有这个问题。Ubuntu仍然随附Scala 2.9,并且字符串插值需要2.10或更高版本。我怀疑在Java 8下它仍会更快,但是Ubuntu仅随附7,这是您不需要的痛苦世界!






由于每个匹配项的 t 都在其配对对象的 100 以内,因此我决定按 t 对输入进行分桶。每 100 个 t 值对应一个桶,因此要检查一个数字,只需要检查相邻的 +/- 1 个桶。



import java.io.BufferedReader;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.StringTokenizer;

public class Filter_26643 {

    final static int numThreads = 8; 
    final static int numInputs = 50000000;
    final static int bucketSize = 100;
    final static int numBuckets = numInputs/bucketSize;
    ArrayList<ArrayList<int[]>> buckets;

    public static void main(String[] args) {
        new Filter_26643().run();

    void run(){
            buckets = new ArrayList<ArrayList<int[]>>(numBuckets);
            for(int i=0;i<numBuckets;i++)
                buckets.add(new ArrayList<int[]>(bucketSize*2));

            BufferedReader reader = new BufferedReader(new FileReader("test.file"));
            int c=0,e[];
                StringTokenizer tokenizer = new StringTokenizer(reader.readLine());
                e = new int[] {

            MatchThread[] threads = new MatchThread[numThreads];
            for(int i=0;i<numThreads;i++){
                threads[i] = new MatchThread(i);
            for(int i=0;i<numThreads;i++)

        } catch(Exception e){

    class MatchThread extends Thread{
        int index;

        public MatchThread(int index){
            this.index = index;

        public void run() {
            for(int i=index;i<numBuckets;i+=numThreads){
                int max = i+2 >= numBuckets ? numBuckets : i+2;
                int min = i-1 < 0 ? i : i-1;
                for(int[] entry : buckets.get(i)){
                    for(int j=min;j<max;j++){
                        ArrayList<int[]> bucket = buckets.get(j);
                        for(int[] other : bucket){
                            if(((entry[0]==other[1] && entry[2]-other[2]<100 && entry[2]>=other[2]) || 
                                (entry[1]==other[0] && other[2]-entry[2]<100 && other[2]>=entry[2]))
                                && entry != other){
                                 System.out.println(entry[0] + " " + entry[1] + " " + entry[2]);
                                 break outer;


五分半钟后,我在线程“ main” java.lang.OutOfMemoryError中得到异常:超出了您建议的GC开销限制。我必须将堆大小增加到多少?


@James_pic是否增加了堆大小?另外,0:46与计算机上排序-n test.file的时间相比如何(如果可以安排它不在RAM中)?


顺便说一句,@ Geobits,我喜欢这个算法。您可以获得合并联接的大多数好处,而无需进行排序的开销-有点像信鸽排序的合并联接。
James_pic 2014年







#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>

// Should be 37% occupied with 50m entries
#define TABLE_SIZE 0x8000000
#define MASK (TABLE_SIZE - 1)
#define BUFFER_SIZE 16384
#define END_OF_FILE (-1)
#define DEFAULT_VALUE (-1)

// One input line: the three integers a, b and t. Rows also serve as the
// hash-table slots, keyed on a; a slot whose a equals DEFAULT_VALUE is empty.
typedef struct Row {
  int32_t a;
  int32_t b;
  int32_t t;
} Row;

// Scrambles a key by multiplying it with a fixed odd constant so that nearby
// keys land in different table slots once the caller masks the result.
//
// a: the key to hash.
// Returns the low 32 bits of a * 428916315, reinterpreted as int32_t.
int32_t hash(int32_t a) {
  // Multiply in unsigned arithmetic: signed overflow is undefined behaviour
  // in C, whereas unsigned multiplication wraps modulo 2^32. Converting the
  // result back to int32_t is implementation-defined (two's-complement wrap
  // on gcc/clang).
  return (int32_t) ((uint32_t) a * 428916315u);
}

void insert(Row * table, Row row) {
  long loc = hash(row.a) & MASK; // Entries are hashed on a
  long inc = 0;
  while (inc <= TABLE_SIZE) {
    loc = (loc + inc) & MASK;
    if (table[loc].a == DEFAULT_VALUE) {
      table[loc] = row;

// Returns the next byte of `input` through a caller-owned buffer, refilling
// the buffer with fread() whenever it has been fully consumed.
//
// input:  stream to read from.
// buffer: scratch buffer of at least BUFFER_SIZE bytes.
// pos:    number of valid bytes currently held in buffer (updated on refill).
// limit:  index of the next unread byte in buffer (advanced on every read).
//         Note the names are the reverse of what they suggest: *limit is the
//         read cursor and *pos is the fill level.
// Returns the byte as a value in 0..255, or END_OF_FILE when no data is left.
int readChar(FILE * input, char * buffer, int * pos, int * limit) {
  if (*limit >= *pos) {
    // Buffer exhausted: restart the cursor and try to load another chunk.
    *limit = 0;
    *pos = (int) fread(buffer, sizeof(char), BUFFER_SIZE, input);
    if (*limit >= *pos) return END_OF_FILE;
  }
  // Go through unsigned char so that byte 0xFF cannot be mistaken for
  // END_OF_FILE (-1) on platforms where plain char is signed.
  return (unsigned char) buffer[(*limit)++];
}

void readAll(char * fileName, Row * table) {
  char* buffer = (char*) malloc(sizeof(char) * BUFFER_SIZE);
  int limit = 0;
  int pos = 0;

  FILE * input = fopen(fileName, "rb");

  int lastRead;
  Row currentRow;
  uint32_t * currentElement = &(currentRow.a);

  // As with the Scala version, we read rows with an FSM. We can
  // roll up some of the code using the `currentElement` pointer
  while (1) {
    switch(lastRead = readChar(input, buffer, &pos, &limit)) {
      case END_OF_FILE:
      case ' ':
        if (currentElement == &(currentRow.a)) currentElement = &(currentRow.b);
        else currentElement = &(currentRow.t);
      case '\n':
        insert(table, currentRow);
        currentRow.a = 0;
        currentRow.b = 0;
        currentRow.t = 0;
        currentElement = &(currentRow.a);
        *currentElement = *currentElement * 10 + (lastRead - '0');
  //printf("Read %d", lastRead);

int main() {
  Row* table = (Row*) malloc(sizeof(Row) * TABLE_SIZE);
  memset(table, 255, sizeof(Row) * TABLE_SIZE);

  readAll("test.file", table);

  // We'll iterate through our hash table inline - passing a callback
  // is trickier in C than in Scala, so we just don't bother
  for (size_t i = 0; i < TABLE_SIZE; i++) {
    Row * this = table + i;
    if (this->a != DEFAULT_VALUE) {
      // Lookup entries `that`, where `that.a == this.b`
      long loc = hash(this->b) & MASK;
      long inc = 0;
      while (inc <= TABLE_SIZE) {
        loc = (loc + inc) & MASK;
        Row * that = table + loc;
        if ((this->b == that->a) && (0 <= that->t - this->t) && (that->t - this->t < 100)) {
          // Conditions are symmetric, so we output both rows
          printf("%d %d %d\n", this->a, this->b, this->t);
          printf("%d %d %d\n", that->a, that->b, that->t);
        else if (that->b == DEFAULT_VALUE) break;

  return 0;


gcc -std=c99 -O3 -m64 filter.c



测试文件的位置被硬编码为“ test.file”。


同样,很有趣的是看到这与斯科特·利德利(Scott Leadley)的答案相去甚远。Scott正在加入T,这原则上意味着他将有更多加入的机会,但是再次,加入T可提供更好的缓存位置。

我得到James_pic.c:在函数'readAll'中:James_pic.c:67:28:警告:如果(currentElement ==&(currentRow.a))currentElement =&,则不同指针类型的比较缺少强制转换[默认启用] (currentRow.b);

我从您的scala和C代码得到的输出略有不同。实际上只有一行是不同的。我刚刚做过diff <(sort -n James_pic-c.out) <(sort -n James_pic-scala.out)

如果输入中某个给定的 a 值出现 n 次,且 n >= BUFFER_SIZE + 2,这段代码将会失败。

我认为这只是因为您的代码中用的是 <= 100,而 Scala 代码中用的是 < 100。




首先,我们利用sort -n -k3顺序获取最重要的字段,以利用sort(1)现代版本上的内置并行性。然后,由于perl受到一个简单的标量每个占用80个字节(5000万* 3 * 80太多-至少12GB)的事实的极大阻碍,因此我们将输出限制为5000万* 12字节数组(每行12个字节,每行包含3个可以表示为32位整数的整数)。然后,我们触发8个线程,每个线程覆盖(大约)数据的1/8(+重叠)。


use strict;
use warnings;

# find lines s.t. $lines[$M]->{a} == $lines[$N]->{b} and
#                 0 <= $lines[$M]->{t} - $lines[$N]->{t} < 100
# OR              $lines[$M]->{b} == $lines[$N]->{a} and
#                 0 <= $lines[$N]->{t} - $lines[$M]->{t} < 100

my $infile = shift;
open(my $fh, "sort -n -k3 $infile |") || die "open sort pipe: $@";

my @lines;
my $bytes_per_int = 4;
my $bytes_per_line = $bytes_per_int * 3;
my $nlines = 50_000_000;
my $buf = "\0" x ($nlines * $bytes_per_line);
my $ln = 0;
my $nprocs = 8;
my $last_group_start = 0;
my $this_group_start;
my $group = $nlines / $nprocs;
my @pids;
while(<$fh>) {
  my ($A, $B, $T) = split/\s+/;
  substr($buf, $ln * $bytes_per_line, $bytes_per_line, pack "L3", ($A, $B, $T));
  if( defined $this_group_start ) {
    if( $T - $last_group_start >= $group + 100 ) {
      if(my $pid = fork()) {
        push @pids, $pid;
        $last_group_start = $this_group_start;
        undef $this_group_start;
      } else {
#warn "checking $last_group_start - $ln...\n";
        for(my $l=$last_group_start; $l<=$ln; ++$l) {
          my $lpos = $l * $bytes_per_line;
          my ($A, $B, $T) = unpack "L3", substr($buf, $lpos, $bytes_per_line);
          my ($lA, $lB);
          my $lT = $T;
          for(my $lb=$l; $lb>=$last_group_start && $T - $lT <= 100; $lb--, $lpos -= $bytes_per_line) {
            ($lA, $lB, $lT) = unpack "L3", substr($buf, $lpos, $bytes_per_line);
            if($A == $lB || $B == $lA) {
              #print "($last_group_start) $A $B $T matches $lA $lB $lT\n";
              print "$lA $lB $lT\n$A $B $T\n";
  } elsif( !defined $this_group_start && $T - $last_group_start >= $group ) {
    $this_group_start = $ln;

waitpid $_, 0 for @pids;


我不确定您的输出是否正确。看一下前两行:A = D = 8455767,但是 U = 50175,T = 50130,因此 T - U = -45。
James_pic 2014年




我倾向于没有结果,不确定这是统计异常还是我的推理错误。 已修复,二进制排序的比较存在缺陷。

using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;

namespace FilterFile
    class Program
        const int COUNT = 50000000;

        static string inputFile = "data" + COUNT + ".txt";
        static string outputFile = "results.txt";

        static void Main(string[] args)
            Console.WriteLine("Prepping Test");
            if (args.Length > 0) inputFile = args[0];
            if (args.Length > 1) outputFile = args[1];

            if (!File.Exists(inputFile))

                                     .Select(r => string.Format("{0} {1} {2}", r.A, r.B, r.C)));


            Console.WriteLine("Starting Test \n\n");

            using (Timer.Create("Total Time"))
                Row[] sortedA, sortedB;
                using (Timer.Create("Reading Data"))
                    FillData(out sortedA, out sortedB);

                using (Timer.Create("Parallel Sort A"))
                using (Timer.Create("Parallel Sort B"))
                    ParallelSort.QuicksortParallel(sortedB, (x, y) => x.B - y.B);

                object rLock = new object();
                List<Row> results = new List<Row>();

                var comparison = Comparer<Row>.Create((B, A) => B.B - A.A);
                using (Timer.Create("Compute Results"))
                    Parallel.ForEach(sortedA, row =>
                    //foreach (var row in sortedA)
                        var i = Array.BinarySearch(sortedB, row, comparison);
                        if (i < 0) return;

                        Row other;
                        bool solved = false;
                        for (var tempI = i; tempI < sortedB.Length && row.A == (other = sortedB[tempI]).B; tempI++)
                            var diff = row.C - other.C;
                            if (diff >= 0 && diff < 100)
                                lock (rLock) results.Add(row);

                        for (var tempI = i - 1; tempI >= 0 && row.A == (other = sortedB[tempI]).B; tempI--)
                            var diff = row.C - other.C;
                            if (diff >= 0 && diff < 100)
                                lock (rLock) results.Add(row);

                using (Timer.Create("Save Results"))
                    File.WriteAllLines(outputFile, results.Select(r => r.ToString()));

        private static void FillData(out Row[] sortedA, out Row[] sortedB)
            var tempA = new Row[COUNT];
            var tempB = tempA;//new Row[COUNT];

            const int PARTITION_SIZE = 1 << 22;

            ReadAndSort(tempA, tempB, PARTITION_SIZE);

            sortedA = tempA;
            sortedB = new Row[COUNT];
            Array.Copy(sortedA, sortedB, COUNT);
            /*using (Timer.Create("MergeA"))
                int destIndex = 0;
                int[][] partitions = Enumerable.Range(0, COUNT / PARTITION_SIZE + 1)
                    .Select(i => new[] { i * PARTITION_SIZE, Math.Min(i * PARTITION_SIZE + PARTITION_SIZE, COUNT) - 1 })

                for (int i = 0; i < COUNT; i++)
                    foreach (var partition in partitions)
                        while (partition[0] <= partition[1] && tempA[partition[0]].A == i)
                            sortedA[destIndex++] = tempA[partition[0]++];

            /*//Verify Paritioning Works
            var results = new List<Tuple<Row, int>> { Tuple.Create(tempA[0], 0) };
            for (int i = 1; i < tempA.Length; i++)
                var r = tempA[i];
                if (r.A < tempA[i-1].A)
                    results.Add(Tuple.Create(r, i % PARTITION_SIZE));
            results.ForEach(t => Console.WriteLine(t.Item1 + " " + t.Item2));*/

        private static void ReadAndSort(Row[] tempA, Row[] tempB, int PARTITION_SIZE)
            List<Task> tasks = new List<Task>();

            using (var stream = File.OpenRead(inputFile))
                int b;
                int tempMember = 0;
                int memberIndex = 0;
                int elementIndex = 0;

                using (Timer.Create("Read From Disk"))
                    while ((b = stream.ReadByte()) >= 0)
                        switch (b)
                            case (byte)'\r':
                            case (byte)' ':
                                switch (memberIndex)
                                    case 0: tempA[elementIndex].A = tempMember; memberIndex = 1; break;
                                    case 1: tempA[elementIndex].B = tempMember; memberIndex = 2; break;
                                    case 2: tempA[elementIndex].C = tempMember; memberIndex = 0; break;
                                tempMember = 0;
                            case (byte)'\n':
                                /*if (elementIndex % PARTITION_SIZE == 0 && elementIndex > 0)
                                    var copiedIndex = elementIndex;
                                    tasks.Add(Task.Run(() =>
                                        var startIndex = copiedIndex - PARTITION_SIZE;
                                        Array.Copy(tempA, startIndex, tempB, startIndex, PARTITION_SIZE);
                                        ParallelSort.QuicksortSequentialInPlace(tempA, startIndex, copiedIndex - 1);
                                        ParallelSort.QuicksortSequentialInPlace(tempB, startIndex, copiedIndex - 1, (x, y) => x.B - y.B);
                                tempMember = tempMember * 10 + b - '0';

                /* tasks.Add(Task.Run(() =>
                     elementIndex--;  //forget about the last \n
                     var startIndex = (elementIndex / PARTITION_SIZE) * PARTITION_SIZE;
                     Array.Copy(tempA, startIndex, tempB, startIndex, elementIndex - startIndex + 1);
                     ParallelSort.QuicksortParallelInPlace(tempA, startIndex, elementIndex);
                     ParallelSort.QuicksortSequentialInPlace(tempB, startIndex, elementIndex, (x, y) => x.B - y.B);

                 using (Timer.Create("WaitForSortingToFinish"))

        static Random rand = new Random();

        public struct Row : IComparable<Row>
            public int A;
            public int B;
            public int C;
            public static Row RandomRow(int count)
                return new Row { A = rand.Next(count), B = rand.Next(count), C = rand.Next(count) };

            public int CompareTo(Row other)
                return A - other.A;

            public override string ToString()
                return string.Format("{0} {1} {2}", A, B, C);

        public static Row[] GenerateData(int count)
            var data = new Row[count];
            for (int i = 0; i < count; i++)
                data[i] = Row.RandomRow(count);
            return data;

        public static Row[] GenerateSplitData(int count)
            var data = new Row[count];
            for (int i = 0; i < count; i++)
                data[i] = Row.RandomRow(count);
            return data;

        public class Timer : IDisposable
            string message;
            Stopwatch sw;
            public static Timer Create(string message)
                Console.WriteLine("Started: " + message);
                var t = new Timer();
                t.message = message;
                t.sw = Stopwatch.StartNew();
                return t;
            public void Dispose()
                Console.WriteLine("Finished: " + message + " in " + sw.ElapsedMilliseconds + "ms");

        // <summary> 
        /// Parallel quicksort algorithm. 
        /// </summary> 
        public class ParallelSort
            const int SEQUENTIAL_THRESHOLD = 4096;
            #region Public Static Methods

            /// <summary> 
            /// Sequential quicksort. 
            /// </summary> 
            /// <typeparam name="T"></typeparam> 
            /// <param name="arr"></param> 
            public static void QuicksortSequential<T>(T[] arr) where T : IComparable<T>
                QuicksortSequentialInPlace(arr, 0, arr.Length - 1);

            /// <summary> 
            /// Parallel quicksort 
            /// </summary> 
            /// <typeparam name="T"></typeparam> 
            /// <param name="arr"></param> 
            public static void QuicksortParallel<T>(T[] arr) where T : IComparable<T>
                QuicksortParallelInPlace(arr, 0, arr.Length - 1);


            #region Private Static Methods

            public static void QuicksortSequentialInPlace<T>(T[] arr, int left, int right)
                where T : IComparable<T>
                if (right > left)
                    int pivot = Partition(arr, left, right);
                    QuicksortSequentialInPlace(arr, left, pivot - 1);
                    QuicksortSequentialInPlace(arr, pivot + 1, right);

            public static void QuicksortParallelInPlace<T>(T[] arr, int left, int right)
                where T : IComparable<T>
                if (right > left)
                    if (right - left < SEQUENTIAL_THRESHOLD)
                        QuicksortSequentialInPlace(arr, left, right);
                        int pivot = Partition(arr, left, right);
                        Parallel.Invoke(() => QuicksortParallelInPlace(arr, left, pivot - 1),
                                        () => QuicksortParallelInPlace(arr, pivot + 1, right));

            private static void Swap<T>(T[] arr, int i, int j)
                T tmp = arr[i];
                arr[i] = arr[j];
                arr[j] = tmp;

            private static int Partition<T>(T[] arr, int low, int high)
                where T : IComparable<T>
                // Simple partitioning implementation 
                int pivotPos = (high + low) / 2;
                T pivot = arr[pivotPos];
                Swap(arr, low, pivotPos);

                int left = low;
                for (int i = low + 1; i <= high; i++)
                    if (arr[i].CompareTo(pivot) < 0)
                        Swap(arr, i, left);

                Swap(arr, low, left);
                return left;


            #region Public Static Methods

            /// <summary> 
            /// Sequential quicksort. 
            /// </summary> 
            /// <typeparam name="T"></typeparam> 
            /// <param name="arr"></param> 
            public static void QuicksortSequential<T>(T[] arr, Func<T, T, int> comparer)
                QuicksortSequentialInPlace(arr, 0, arr.Length - 1, comparer);

            /// <summary> 
            /// Parallel quicksort 
            /// </summary> 
            /// <typeparam name="T"></typeparam> 
            /// <param name="arr"></param> 
            public static void QuicksortParallel<T>(T[] arr, Func<T, T, int> comparer)
                QuicksortParallelInPlace(arr, 0, arr.Length - 1, comparer);


            #region Private Static Methods

            public static void QuicksortSequentialInPlace<T>(T[] arr, int left, int right, Func<T, T, int> comparer)
                if (right > left)
                    int pivot = Partition(arr, left, right, comparer);
                    QuicksortSequentialInPlace(arr, left, pivot - 1, comparer);
                    QuicksortSequentialInPlace(arr, pivot + 1, right, comparer);

            public static void QuicksortParallelInPlace<T>(T[] arr, int left, int right, Func<T, T, int> comparer)
                if (right > left)
                    if (right - left < SEQUENTIAL_THRESHOLD)
                        QuicksortSequentialInPlace(arr, left, right, comparer);
                        int pivot = Partition(arr, left, right, comparer);
                        Parallel.Invoke(() => QuicksortParallelInPlace(arr, left, pivot - 1, comparer),
                                        () => QuicksortParallelInPlace(arr, pivot + 1, right, comparer));

            private static int Partition<T>(T[] arr, int low, int high, Func<T, T, int> comparer)
                // Simple partitioning implementation 
                int pivotPos = (high + low) / 2;
                T pivot = arr[pivotPos];
                Swap(arr, low, pivotPos);

                int left = low;
                for (int i = low + 1; i <= high; i++)
                    if (comparer(arr[i], pivot) < 0)
                        Swap(arr, i, left);

                Swap(arr, low, left);
                return left;

James_pic 2014年

更一般地,如果您已按 A 和 B 排序,那么有一个比“迭代 A 并在 B 中二分查找”(即 O(n log(n)),实际上相当于一个简陋的哈希表)更快的算法:您可以改为对两个列表做合并连接,复杂度为 O(n)。
James_pic 2014年

James_pic 2014年






Filter a file based on these rules:

    - each item is an ordered list of three integers ( A B T )
    - each line represents an item
    - each line is formated as <number> <w> <number> <w> <number>
    - <w> is whitespace (a single blank in the challenge)
    - <number> is an integer in the range 0..49_999_999
    - the first number on a line is A, second B, third T

Output a given item ( A B T ) if:
    1 - there exists an item ( C D U ) such that 0 <= T-U < 100 and D == A 
    2 - there exists an item ( C D U ) such that 0 <= U-T < 100 and B == C 

An item should be output only once, even if there is more than one match.

We're sorting on T, we know the number of Ts to be sorted and the Ts are random.
Trade space for speed and create a lookup table that can handle collisions
(AKA hash table).

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdbool.h>
#include <pthread.h>
#include <assert.h>

#define NTHREADS    (16)
#define BINSPERTHREAD   (1*1000*1000)
bool    oneThread = false;

typedef struct {
    pthread_t   tid;
    long        begin;
    long        end;
} threadState;

void *initTID() {
    return NULL;

#define MAXITEMS    (50*1000*1000)
//  items on the boundary are not included in the search
#define SEARCHBOUNDARY  (100)

void usage(char *name) {
    fprintf(stderr, "usage: %s [-n 1..%d]\n", name, MAXITEMS);

typedef struct item {
    long    A;
    long    B;
    long    T;
    bool    unprinted;
    struct item *b;         // b(ackward to previous item)
    struct item *f;         // f(orward to next item)
    struct item *BINb;          // backward to previous bin
    struct item *BINf;          // forward to next bin
#ifdef DEVTEST
    long    lineNumber;
} item;
#ifdef DEVTEST
bool    printVerbose = false;

//  Why global variables? Because large MAXITEMS overflow the stack.
long    maxItems;           // entries allocated in list & lookup
item    *list;
long    listN;              // number of entries (max index + 1)
item    **lookup;
long    lookupN;            // number of entries (max index + 1)

input -
    n       - index of current bin
    list        - global
    lookup      - global
    lookupN     - global
side-effects -
    list[] (.unprinted)
static inline void *walkThisBin(long n) {
    item    *p;
    item    *searchHead;
    item    *searchTail;
    item    *currentItem;
    long    i;

    //  for all items in bin
    for ( currentItem = lookup[n]; currentItem != lookup[n]->BINf;
        currentItem = currentItem->f)
        merged forward&backward search
    searchHead = currentItem;
    //  step to index min((T+100-1),lookupN-1), find largest U<T+100
    i = ((n+SEARCHBOUNDARY-1) < lookupN) ?
        n+SEARCHBOUNDARY-1 :
    //  find largest i such that U-T<100 (U is lookup[i]->T)
    //  degenerate case is i == n
        p=lookup[i]) {
    searchTail = p->BINf;       // boundary, not included in search
    p = currentItem;
    do {
        if (searchHead->B == p->A) {
        //  matches are symmetric
        if (searchHead->unprinted) {
            printf("%ld %ld %ld\n", searchHead->A, searchHead->B,
            searchHead->unprinted = false;
        if (p->unprinted) {
            printf("%ld %ld %ld\n", p->A, p->B, p->T);
            p->unprinted = false;
        p = p->f;
    } while (p!=searchTail);
    return NULL;

Must handle out-of-range indexes for lookup.

input -
    n       - index of current bin
    list        - global
    lookup      - global
    lookupN     - global
side-effects -
    list (.unprinted)

static inline void *walkTheseBins(void *tState) {
    long    startIndex = ((threadState *)tState)->begin;
    long    finishIndex = ((threadState *)tState)->end;
    long    n;

    startIndex = (startIndex<0) ? 0 : startIndex;
    finishIndex = (finishIndex>lookupN-1) ? lookupN-1 : finishIndex;
    for (n=startIndex; n<=finishIndex; n++) {
    return NULL;

int main(int argc, char *argv[]) {
#ifdef DEVTEST
item    *head;
item    *tail;
long    count = 0;
    //  subroutines? subroutines? we don't need no stinkin' subroutines
    //  this is all the scoping you're going to need
    //                      ... truuuuust me
    Allocate list[] and lookup[]. Set maxItems.

    input -
    side-effects -
        DEVTEST stuff
    int c;          // option character

    maxItems = MAXITEMS;
    while ((c = getopt(argc, argv, ":n:sv")) != -1) {
        switch(c) {
#ifdef DEVTEST
        case 'v':
        //  print some reassuring messages
        printVerbose = true;
        case 'v':
        fprintf(stderr, "unknown option -%c\n", optopt);
        case 'n':
        if (sscanf(optarg, "%ld", &maxItems) != 1) {
            fprintf(stderr, "-n argument \"%s\" unscannable\n", optarg);
        case 's':
        //  use only one thread?
        oneThread = true;
        case ':':           // -s needs an argument
        case '?':           // not a valid option
        fprintf(stderr, "unknown option -%c\n", optopt);
    if ((maxItems<1) || (maxItems>MAXITEMS)) {
        fprintf(stderr, "-s argument \"%ld\" out of range\n", maxItems);
    list = (item *) malloc(sizeof(item) * maxItems);
    if (list == NULL) {
        fprintf(stderr, "ERROR: list = malloc() failure\n");
    lookup = (item **) malloc(sizeof(item *) * maxItems);
    if (lookup == NULL) {
        fprintf(stderr, "ERROR: lookup = malloc() failure\n");

    Convert STDIN into an array of items.

    input -
    side-effects -
    long    largestT = 0;
    item    x;

    for (listN=0; scanf("%ld%ld%ld", &x.A, &x.B, &x.T)==3; listN++) {
        if (listN == maxItems) {
        fprintf(stderr, "ERROR: > %ld input items read\n", maxItems);
        x.b = x.f = NULL;
        x.unprinted = true;
        x.BINb = x.BINf = NULL;
        largestT = (x.T>largestT) ? x.T : largestT;
#ifdef DEVTEST
        x.lineNumber = listN + 1;
        list[listN] = x;
    if (!feof(stdin)) {
        fprintf(stderr, "ERROR: ferror() = %d\n", ferror(stdin));
    //  Be paranoid. Because cores are obnoxious.
    if (largestT>=maxItems) {
        fprintf(stderr, "ERROR: T:%ld > %ld \n", largestT, maxItems-1);
#ifdef DEVTEST
(printVerbose) && printf("in: %ld\n", listN);
    //  Short-circuit on 0 items. Simplifies things like finding the head.
    if  (listN == 0) {

    Populate the lookup table. Build a doubly linked list through it.

    input -
    side-effects -
        DEVTEST stuff
    long    n;

        Populate the lookup table. The lookup table is an array-of-lists.
    The lists are LIFO. This is the most primitive of hashes, where the
    key, item.T, is used as the index into the lookup table.
    for (n=0; n<maxItems; n++) {
        lookup[n] = NULL;
    for (n=0; n<listN; n++) {
        long    t = list[n].T;

        if (lookup[t] == NULL) {
        lookup[t] = &(list[n]);
        } else {
        // collision
        list[n].f = lookup[t];  // forward pointer assigned
        lookup[t] = &(list[n]);
        Collapse lookup to squeeze out NULL references. This breaks
    the linear mapping between T value & lookup index, but worth it for
    simpler search logic. Build a doubly linked list of bins.
    item    *previousBin = NULL;    // last non-NULL lookup entry
    lookupN = 0;
    for (n=0; n<maxItems; n++) {
        if (lookup[n] != NULL) {
        lookup[lookupN] = lookup[n];
        lookup[lookupN]->BINb = previousBin;
        if (previousBin) {
            previousBin->BINf = lookup[lookupN];
        previousBin = lookup[lookupN];
    previousBin->BINf = NULL;

        Build a doubly linked list. The forward pointers already exist
    within each lookup table bin.
    item    *p;
    item    *binHead;
    item    *previous;

    //  create a loop in each bin
    for (n=0; n<lookupN; n++) {
#ifdef DEVTEST
        binHead = lookup[n];
        for (p=binHead; p->f; p=p->f) {
        p->f->b = p;
#ifdef DEVTEST
        p->f = binHead;
        binHead->b = p;
    //  break the loops and connect them tail-to-head
#ifdef DEVTEST
head = lookup[0];
    previous = NULL;
    for (n=0; n<lookupN; n++) {
        binHead = lookup[n];
        p = binHead->b;     // p => tail of this bin list
        binHead->b = previous;  // connect bin head to list
        if (previous) {     // connect list to bin head
        previous->f = binHead;
        previous = p;
    previous->f = NULL;
#ifdef DEVTEST
tail = previous;

#ifdef DEVTEST
if (printVerbose) {
    printf("out: %ld\n", count);

    //  run through the list forwards
    item    *p;
    count = 0;
    for (p=head; p; p=p->f) {
    printf("forwards: %ld\n", count);
    //  run through the list backwards
    count = 0;
    for (p=tail; p; p=p->b) {
    printf("backwards: %ld\n", count);
        //  print the list
        for (p=head; p; p=p->f) {
        printf("%ld %ld %ld\n", p->A, p->B, p->T);

    Find matches & print.

    (authoritative statement)
    Print item ( A B T ) if:
    1 - there exists an item ( C D U ) such that 0 <= T-U < 100 and D == A 
    2 - there exists an item ( C D U ) such that 0 <= U-T < 100 and B == C 

    - threading

    input -
    side-effects -
        lots hidden in walkTheseBins(), all thread-local or thread-safe
    volatile threadState    tState[NTHREADS]; // use as cicular buffer
    long                h;  // cicular buffer head
    long                n;

    if (oneThread) {
        tState[0].begin = 0;
        tState[0].end = lookupN-1;
        walkTheseBins((void *)tState);
    } else {
        //  every slot has a thread to wait for
        for (h=0; h<NTHREADS; h++) {
        assert( pthread_create(&(tState[h].tid), NULL, initTID, NULL) == 0);
        h = 0;
        for (n=0; n<lookupN+BINSPERTHREAD; n+=BINSPERTHREAD) {
        pthread_join(tState[h].tid, NULL);
        tState[h].begin = n;
        tState[h].end = n + BINSPERTHREAD - 1;
        assert( pthread_create(&(tState[h].tid), NULL, walkTheseBins, (void *)(tState+h)) == 0);
        h = (h + 1) % NTHREADS;
        //  wait for any remaining threads
        for (h=0; h<NTHREADS; h++) {
        pthread_join(tState[h].tid, NULL); // may have already join'ed some

    return 0;

用“ gcc -m64 -pthreads -O”编译。期望在stdin上输入。默认情况下运行多线程。使用“ -s”选项仅使用一个线程。

我得到警告:格式'%d'期望类型为'int'的参数,但是参数3的类型为'long int' [-Wformat=] fprintf(stderr, "ERROR: T:%d > %d \n", largestT, listN-1);

Scott Leadley 2014年


我认为您的代码不允许 T - U = 0。我用一个仅包含以下两行的文件进行了测试:18662170 45121353 3365641(换行符)44329255 18662170 3365641,但它返回了错误。

@Lembik Ahh,T必须小于50M,而不是输入的行数。我更正了这一点并添加了线程。
Scott Leadley


我终于有机会构建一个类似于Lembik的物理Ubuntu 14.04系统,并对我的难题进行了验尸。在我选择的重要性中:

  1. 真正的大师是James_pic,因为他没有过早优化。
    • 他有一个计划
    • 他以较高的抽象水平(Scala)执行了计划,并在那里进行了完善
    • 他用C语言进一步完善了它
    • 他没有对它进行过度优化(请参阅下一点)
  2. 文件系统I / O时间可能是目标系统经过时间的下限。
    • Lembik暗示了这一点,即“获胜者……两者几乎都和wc一样快!”
  3. 我最初的解决方案失败的一些原因是:
    • 参考位置是目标系统上的主导因素。
    • 在进行哈希排序时,对A或B进行排序是个好主意。在T上排序至少会增加哈希排序的复杂度(以及对缓存不利的间接寻址),至少是我这样做的方式。
    • Scanf()是头猪。
    • 大量带宽(磁盘->内存->高速缓存)会改变瓶颈所在的位置。目标系统没有大量带宽。(请参阅下一点)
  4. 快速开发最好在目标环境中完成。
    • 唉!但是,我当初只有Solaris / SPARC可用,没有别的环境可以尝试。
    • 消除虚拟化和SAN环境中的缓存影响非常困难。
    • Linux VM通常存在相同的问题。
  5. 一点数学会有所帮助。
    • 直接从哈希表中获取一个元组可以将间接引用的概率降低到〜37%(〜1 / e)。
    • 直接从哈希表中获取两个元组会将对溢出表的引用减少到〜10%。没必要
  6. 32位内存模型(gcc -m32)令人分心。
    • 对于无线程程序而言,有时是一个小小的胜利,有时则是一个小小的损失。
    • 有时,线程程序会遭受重大损失。
    • 如果32位是一个重大胜利(并且目标不是嵌入式控制器),则刷新硬件可能会更便宜。
    • 占用额外的寄存器和更大的地址空间,不要回头。
  7. Scanf()是头猪,但使用stdio并非没有希望。
    • scanf()的大部分开销似乎都在格式驱动的解析和字符串到整数的转换中。
    • 将sscanf()替换为:
      • strtok()+ atoi()快约2倍(请参见下表)
      • strtol()快约3倍
      • 自定义本地strtol()快约6.5倍
      • 用本地解决方案替换strtol()使其与“ wc”处于同等水平
      • 使用getc_unlocked()的FSM几乎与Keith Randall的极简mmap()解决方案一样快
      • 我在C中重新实现时的实验结果[使用CSV,因为Stack Exchange显然不做表]:
        "solution (64-bit unless noted)","disposition of input","user","system","elapsed"
        "dd if=? of=/dev/null bs=1024k","","0.010","1.107","26.47"
        "wc {LANG=C}","","4.921","0.752","26.38"
        "fgets(), no integer conversion","discard","1.636","0.468","26.42"
        "fgets() + sscanf()","discard","16.173","0.498","26.48"
        "fgets() + strtok(), no integer conversion","discard","4.659","0.481","26.48"
        "fgets() + strtok() + atoi()","discard","8.929","0.490","26.49"
        "fgets() + strtol()","discard","6.009","0.483","26.50"
        "fgets() + custom-strtol()","discard","3.842","0.474","26.43"
        "fgets() + custom-strtol()","sort (load hash) while reading","7.118","1.207","26.70"
        "fgets() + custom-strtol()","sort, match & print","10.096","1.357","28.40"
        "fgets() + custom-strtol(), 32-bit","sort, match & print","10.065","1.159","28.38"
        "james_pic's solution","sort, match & print","9.764","1.113","28.21"



# only tested against ruby v1.9 & v2.0
=begin
Filter a file based on these rules:
  - each line is a set of three integers
  - each line is formatted as <number> <w> <number> <w> <number>
    - <w> is whitespace (a single blank in the challenge)
    - <number> is an integer in the range 1..50_000_000
Output a given tuple ( A B T ) if:
  - there exists a tuple ( C D U ) 0 <= T - U < 100 and D == A
  - there exists a tuple ( C D U ) 0 <= U - T < 100 and B == C

Typical use:
  filter.rb test.input | sort | uniq > test.output
=end

# Every input tuple [A, B, T], in input order.
list = Array.new
# Maps a B value to the indexes (into list) of all tuples carrying that B.
lookupB = Hash.new { |hash, key| hash[key] = Array.new }

ARGF.each_with_index do |line, index|
  abt = line.split.map { |s| s.to_i }
  list << abt
  lookupB[abt[1]] << index
end

# A match is symmetric: whenever ( A B T ) qualifies through rule 1 with
# ( C D U ), then ( C D U ) qualifies through rule 2, so both are printed.
# Duplicates are possible; the caller removes them with sort | uniq.
list.each do |abt|
  # fetch instead of [] so that a miss does not trigger the default block and
  # insert an empty bucket into lookupB for every unmatched A.
  lookupB.fetch(abt[0], []).each do |i|
    delta = abt[2] - list[i][2]     # T - U
    if (0<=delta) && (delta<100)
      puts "#{abt.join(' ')}"
      puts "#{list[i].join(' ')}"
    end
  end
end



#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
//      Throw caution, and error checking, to the winds.
// #include <assert.h>

#define RANGEMIN        (1)
#define RANGEMAX        (50*1000*1000)
#define SEARCHBOUNDARY  (100)

/* One input line: the three integers A, B and T. */
typedef struct {
    int             A;
    int             B;
    int             T;
} tuple_t;

/*
 * One hash bin. The first tuple that hashes to a bin is stored in the bin
 * itself; later ones are chained through overflow[] and the chain ends by
 * pointing back at the bin.
 */
typedef struct bin {
    tuple_t         slot;
    struct bin     *next;       // NULL=>0 items, self=>1 item, other=>overflow
} bin_t;

/* Every tuple read, in input order. */
#define LISTSIZE        (RANGEMAX)
tuple_t         list[LISTSIZE];

/*
 * HASH() maps a value in RANGEMIN..RANGEMAX onto 0..RANGEMAX-1, so the
 * lookup table needs exactly RANGEMAX bins. The argument is parenthesized
 * so that any expression can be passed safely.
 */
#define LOOKUPSIZE      (RANGEMAX)
#define HASH(x)         ((x)-1)
bin_t           lookup[LOOKUPSIZE];

/* Storage for tuples whose bin was already occupied. */
bin_t           overflow[LISTSIZE];
/* Index of the next unused entry in overflow[]. */
int             overflowNext = 0;

// based on strtol()
/*
 * Parse the next unsigned decimal integer in s.
 *
 * Skips any leading non-digit characters, then accumulates digits.
 *   s - NUL-terminated text to scan
 *   r - out: set to the first character after the digits (or to the
 *       terminating NUL when s contains no digit at all)
 * Returns the parsed value, or 0 when no digit was found. No overflow
 * checking is done.
 */
static inline int s2i(char *s, char **r)
{
    int             value = 0;

    /* Stop at the NUL so a short or empty line cannot run off the buffer. */
    while (*s != '\0' && !isdigit((unsigned char) *s)) {
        s++;
    }
    /* isdigit() needs an unsigned char value; plain char may be negative. */
    while (isdigit((unsigned char) *s)) {
        value = value * 10 + (*s - '0');
        s++;
    }
    *r = s;
    return value;
}

/*
 * Insert tuple x into the lookup table, keyed on x.B.
 *
 * An empty bin stores x in place and points at itself. An occupied bin gets
 * x pushed onto the front of its overflow chain; the chain always ends by
 * pointing back at the bin.
 * Side effects: lookup[], overflow[], overflowNext.
 */
static inline void lookupInsert(tuple_t x)
{
    bin_t          *p = lookup + HASH(x.B);

    if (p->next) {
        bin_t          *o = overflow + overflowNext;

        o->slot = x;
        o->next = p->next;      /* is p itself when the bin held one item */
        p->next = o;
        /* Claim the slot; otherwise every collision reuses the same entry. */
        overflowNext++;
    } else {
        p->slot = x;
        p->next = p;
    }
}

/*
 * Print the tuples of an overflow chain starting at head and ending just
 * before tail. The recursion descends first and prints on the way back, so
 * the chain is printed last-link-first.
 */
static void printOverflow(bin_t * head, bin_t * tail)
{
    if (head->next != tail) {
        printOverflow(head->next, tail);
    }
    printf("%d %d %d\n", head->slot.A, head->slot.B, head->slot.T);
}

static inline void dumpLookupSortedOnB()
    bin_t          *p;

    for (p = lookup; p < (lookup + LOOKUPSIZE); p++) {
        if (p->next) {
            printf("%d %d %d\n", p->slot.A, p->slot.B, p->slot.T);
            if (p != p->next) {
                printOverflow(p->next, p);

/*
 * Print both tuples when ( A B T ) = abt and ( C D U ) = cdu satisfy
 * D == A and 0 <= T - U < SEARCHBOUNDARY.
 *
 * The match is symmetric (abt qualifies through rule 1, cdu through rule 2),
 * which is why both lines are printed. Duplicates are not suppressed.
 */
static inline void printIfMatch(tuple_t abt, tuple_t cdu)
{
    int             delta;

    if (cdu.B != abt.A) {
        return;
    }
    delta = abt.T - cdu.T;      /* T - U */
    if ((0 <= delta) && (delta < SEARCHBOUNDARY)) {
        printf("%d %d %d\n", abt.A, abt.B, abt.T);
        printf("%d %d %d\n", cdu.A, cdu.B, cdu.T);
    }
}

static inline void printMatches(int n)
    tuple_t        *p;

    for (p = list; p < (list + n); p++) {
        bin_t          *b = lookup + HASH(p->A);

        if (b->next) {
            bin_t          *q;

            printIfMatch(*p, b->slot);
            for (q = b->next; q != b; q = q->next) {
                printIfMatch(*p, q->slot);

static inline void overflowTattle(int n)
    fprintf(stderr, "%d/%d items in overflow\n", overflowNext, n);

int main(int argc, char *argv[])
    int             n;

    // initialize lookup[]
        bin_t          *p = lookup;

        for (n = 0; n < LOOKUPSIZE; n++) {
            p->next = NULL;
    // read all tuples into list[] and insert into lookup[] & overflow[]
        char            line[64];
        char           *lp;
        tuple_t        *p = list;

        for (n = 0; fgets(line, sizeof(line), stdin); n++) {
            p->A = s2i(line, &lp);
            p->B = s2i(lp, &lp);
            p->T = s2i(lp, &lp);

用“ gcc -O3 -std = c99 -Wall -m64”编译。

