由于问题的标题,这部分是用舌头回答。
当您寻找“最快的方式...”时,答案几乎总是一些专门的工具。此“答案”显示了这样一种工具,以便您可以进行实验。
这不是一个认真的答案,因为您不应该针对只能执行一次或很少执行的工作使用专门的工具。您会发现,与实际工作相比,最终将花费更多的时间在寻找工具和了解工具上。贝壳像公用事业bash
和awk
是不是最快的,但你通常可以写一行代码实现的工作,只花费秒。perl
也可以使用更好的脚本语言,例如,尽管学习曲线perl
很困难,但出于这种目的,我犹豫不决地推荐它,因为Perl项目让我感到不满。python
另一方面,它的I / O速度较慢,因此有点受阻;但是,仅当您过滤或生成千兆字节的数据时,这才是问题。
无论如何,下面的C89示例程序(仅在可用时才使用POSIX.1以获得更高的时钟精度)应达到约100 MB / s的生成速率(在Linux上使用Intel i5-4200U处理器的笔记本电脑上进行测试,将输出进行管道传输)到/dev/null
),采用了相当不错的伪随机数发生器。(输出应通过除MatrixRank测试之外的所有BigCrunch测试,因为代码使用xorshift64 *和排除方法以避免对数字产生偏见。)
十进制数字:
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <locale.h>
#include <ctype.h>
#include <stdio.h>
#include <errno.h>
#include <time.h>
/* This program is licensed under the CC0 license,
https://creativecommons.org/publicdomain/zero/1.0/
In other words, this is dedicated to the public domain.
There are no warranties either, so if something breaks,
you only have yourself to blame.
*/
#if _POSIX_C_SOURCE-199309 >= 0
static uint64_t time_seed(void)
{
struct timespec ts;
if (clock_gettime(CLOCK_REALTIME, &ts))
return (uint64_t)time(NULL);
return (uint64_t)ts.tv_sec
^ (((uint64_t)ts.tv_nsec) << 32);
}
#else
static uint64_t time_seed(void)
{
return (uint64_t)time(NULL);
}
#endif
/* Preferred output I/O block size.
* Currently, about 128k blocks yield
* maximum I/O throughput on most devices.
* Note that this is a heuristic value,
* and may be increased in the future.
*/
#ifndef IO_BLOCK_SIZE
#define IO_BLOCK_SIZE 262144
#endif
/* This is the Xorshift* pseudo-random number generator.
* See https://en.wikipedia.org/wiki/Xorshift#xorshift.2A
* for details. This is an incredibly fast generator that
* passes all but the MatrixRank test of the BigCrush
* randomness test suite, with a period of 2^64-1.
* Note that neither xorshift_state, nor the result of
* this function, will ever be zero.
*/
static uint64_t xorshift_state;
static uint64_t xorshift_u64(void)
{
xorshift_state ^= xorshift_state >> 12;
xorshift_state ^= xorshift_state << 25;
xorshift_state ^= xorshift_state >> 27;
return xorshift_state * UINT64_C(2685821657736338717);
}
/* This function returns a number between (inclusive)
* 0 and 999,999,999,999,999,999 using xorshift_u64()
* above, using the exclusion method. Thus, there is
* no bias in the results, and each digit should be
* uniformly distributed in 0-9.
*/
static uint64_t quintillion(void)
{
uint64_t result;
do {
result = xorshift_u64() & UINT64_C(1152921504606846975);
} while (!result || result > UINT64_C(1000000000000000000));
return result - UINT64_C(1);
}
/* This function returns a single uniformly random digit.
*/
static unsigned char digit(void)
{
static uint64_t digits_cache = 0;
static unsigned char digits_cached = 0;
unsigned char retval;
if (!digits_cached) {
digits_cache = quintillion();
digits_cached = 17; /* We steal the first one! */
} else
digits_cached--;
retval = digits_cache % (uint64_t)(10);
digits_cache /= (uint64_t)(10);
return retval;
}
static int parse_ulong(const char *src, unsigned long *to)
{
const char *end = src;
unsigned long value;
if (!src)
return errno = EINVAL;
errno = 0;
value = strtoul(src, (char **)&end, 0);
if (errno)
return errno;
if (end == src)
return errno = EINVAL;
while (*end)
if (isspace(*end))
end++;
else
return errno = EINVAL;
if (to)
*to = value;
return 0;
}
int main(int argc, char *argv[])
{
unsigned long lines, cols, line, col, seed;
/* When parsing the command-line parameters,
* use locale conventions. */
setlocale(LC_ALL, "");
/* Standard output should be fully buffered, if possible.
* This only affects output speed, so we're not too worried
* if this happens to fail. */
(void)setvbuf(stdout, NULL, _IOFBF, (size_t)IO_BLOCK_SIZE);
if (argc < 3 || argc > 4 || !strcmp(argv[1], "-h") || !strcmp(argv[1], "--help")) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: %s [ -h | --help ]\n", argv[0]);
fprintf(stderr, " %s COLS LINES [ SEED ]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "This program generates random decimal digits\n");
fprintf(stderr, "0 - 9, separated by spaces, COLS per line,\n");
fprintf(stderr, "LINES lines. In total, COLS*LINES*2 bytes\n");
fprintf(stderr, "will be used.\n");
fprintf(stderr, "\n");
fprintf(stderr, "SEED is the optional seed for the Xorshift64*\n");
fprintf(stderr, "pseudo-random number generator used in this program.\n");
fprintf(stderr, "If omitted, current time is used as the seed.\n");
fprintf(stderr, "\n");
return EXIT_SUCCESS;
}
if (parse_ulong(argv[1], &cols) || cols < 1UL) {
fprintf(stderr, "%s: Invalid number of digits per line.\n", argv[1]);
return EXIT_FAILURE;
}
if (parse_ulong(argv[2], &lines) || lines < 1UL) {
fprintf(stderr, "%s: Invalid number of lines.\n", argv[2]);
return EXIT_FAILURE;
}
if (argc > 3) {
if (parse_ulong(argv[3], &seed)) {
fprintf(stderr, "%s: Invalid Xorshift64* seed.\n", argv[3]);
return EXIT_FAILURE;
}
} else
seed = time_seed();
/* Since zero seed is invalid, we map it to ~0. */
xorshift_state = seed;
if (!xorshift_state)
xorshift_state = ~(uint64_t)0;
/* Discard first 1000 values to make the initial values unpredictable. */
for (col = 0; col < 1000; col++)
xorshift_u64();
for (line = 0UL; line < lines; line++) {
fputc('0' + digit(), stdout);
for (col = 1UL; col < cols; col++) {
fputc(' ', stdout);
fputc('0' + digit(), stdout);
}
fputc('\n', stdout);
/* Check for write errors. */
if (ferror(stdout))
return EXIT_FAILURE;
}
return EXIT_SUCCESS;
}
如果我们切换到行缓冲区,则可以使其速度更快,并且只需fwrite()
一次而不是一次输出每个数字。请注意,如果输出是块设备,我们仍然保持流完全缓冲,以避免部分写入(非二次幂)。
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <locale.h>
#include <ctype.h>
#include <stdio.h>
#include <errno.h>
#include <time.h>
#if _POSIX_C_SOURCE-199309 >= 0
static uint64_t time_seed(void)
{
struct timespec ts;
if (clock_gettime(CLOCK_REALTIME, &ts))
return (uint64_t)time(NULL);
return (uint64_t)ts.tv_sec
^ (((uint64_t)ts.tv_nsec) << 32);
}
#else
static uint64_t time_seed(void)
{
return (uint64_t)time(NULL);
}
#endif
/* Preferred output I/O block size.
* Currently, about 128k blocks yield
* maximum I/O throughput on most devices.
* Note that this is a heuristic value,
* and may be increased in the future.
*/
#ifndef IO_BLOCK_SIZE
#define IO_BLOCK_SIZE 262144
#endif
/* This is the Xorshift* pseudo-random number generator.
* See https://en.wikipedia.org/wiki/Xorshift#xorshift.2A
* for details. This is an incredibly fast generator that
* passes all but the MatrixRank test of the BigCrush
* randomness test suite, with a period of 2^64-1.
* Note that neither xorshift_state, nor the result of
* this function, will ever be zero.
*/
static uint64_t xorshift_state;
static uint64_t xorshift_u64(void)
{
xorshift_state ^= xorshift_state >> 12;
xorshift_state ^= xorshift_state << 25;
xorshift_state ^= xorshift_state >> 27;
return xorshift_state * UINT64_C(2685821657736338717);
}
/* This function returns a number between (inclusive)
* 0 and 999,999,999,999,999,999 using xorshift_u64()
* above, using the exclusion method. Thus, there is
* no bias in the results, and each digit should be
* uniformly distributed in 0-9.
*/
static uint64_t quintillion(void)
{
uint64_t result;
do {
result = xorshift_u64() & UINT64_C(1152921504606846975);
} while (!result || result > UINT64_C(1000000000000000000));
return result - UINT64_C(1);
}
/* This function returns a single uniformly random digit.
*/
static unsigned char digit(void)
{
static uint64_t digits_cache = 0;
static unsigned char digits_cached = 0;
unsigned char retval;
if (!digits_cached) {
digits_cache = quintillion();
digits_cached = 17; /* We steal the first one! */
} else
digits_cached--;
retval = digits_cache % (uint64_t)(10);
digits_cache /= (uint64_t)(10);
return retval;
}
static int parse_ulong(const char *src, unsigned long *to)
{
const char *end = src;
unsigned long value;
if (!src)
return errno = EINVAL;
errno = 0;
value = strtoul(src, (char **)&end, 0);
if (errno)
return errno;
if (end == src)
return errno = EINVAL;
while (*end)
if (isspace(*end))
end++;
else
return errno = EINVAL;
if (to)
*to = value;
return 0;
}
int main(int argc, char *argv[])
{
unsigned long lines, cols, line, col, seed;
char *oneline;
/* When parsing the command-line parameters,
* use locale conventions. */
setlocale(LC_ALL, "");
/* Standard output should be fully buffered, if possible.
* This only affects output speed, so we're not too worried
* if this happens to fail. */
(void)setvbuf(stdout, NULL, _IOFBF, (size_t)IO_BLOCK_SIZE);
if (argc < 3 || argc > 4 || !strcmp(argv[1], "-h") || !strcmp(argv[1], "--help")) {
fprintf(stderr, "\n");
fprintf(stderr, "Usage: %s [ -h | --help ]\n", argv[0]);
fprintf(stderr, " %s COLS LINES [ SEED ]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "This program generates random decimal digits\n");
fprintf(stderr, "0 - 9, separated by spaces, COLS per line,\n");
fprintf(stderr, "LINES lines. In total, COLS*LINES*2 bytes\n");
fprintf(stderr, "will be used.\n");
fprintf(stderr, "\n");
fprintf(stderr, "SEED is the optional seed for the Xorshift64*\n");
fprintf(stderr, "pseudo-random number generator used in this program.\n");
fprintf(stderr, "If omitted, current time is used as the seed.\n");
fprintf(stderr, "\n");
return EXIT_SUCCESS;
}
if (parse_ulong(argv[1], &cols) || cols < 1UL) {
fprintf(stderr, "%s: Invalid number of digits per line.\n", argv[1]);
return EXIT_FAILURE;
}
if (parse_ulong(argv[2], &lines) || lines < 1UL) {
fprintf(stderr, "%s: Invalid number of lines.\n", argv[2]);
return EXIT_FAILURE;
}
if (argc > 3) {
if (parse_ulong(argv[3], &seed)) {
fprintf(stderr, "%s: Invalid Xorshift64* seed.\n", argv[3]);
return EXIT_FAILURE;
}
} else
seed = time_seed();
/* Since zero seed is invalid, we map it to ~0. */
xorshift_state = seed;
if (!xorshift_state)
xorshift_state = ~(uint64_t)0;
/* Discard first 1000 values to make the initial values unpredictable. */
for (col = 0; col < 1000; col++)
xorshift_u64();
/* Allocate memory for a full line. */
oneline = malloc((size_t)(2 * cols + 1));
if (!oneline) {
fprintf(stderr, "Not enough memory for %lu column buffer.\n", cols);
return EXIT_FAILURE;
}
/* Set spaces and terminating newline. */
for (col = 0; col < cols; col++)
oneline[2*col + 1] = ' ';
oneline[2*cols-1] = '\n';
/* Not needed, but in case a code modification treats it as a string. */
oneline[2*cols] = '\0';
for (line = 0UL; line < lines; line++) {
for (col = 0UL; col < cols; col++)
oneline[2*col] = digit();
if (fwrite(oneline, 2*cols, 1, stdout) != 1)
return EXIT_FAILURE;
}
/* Check for write errors. */
if (ferror(stdout))
return EXIT_FAILURE;
return EXIT_SUCCESS;
}
注意:两个示例都在2016-11-18上进行了编辑,以确保数字的均匀分布(不包括零;有关各种伪随机数生成器的比较和详细信息,请参见此处)。
使用例如编译
gcc -Wall -O2 decimal-digits.c -o decimal-digits
并选择性地在系统范围内安装以/usr/bin
使用
sudo install -o root -g root -m 0755 decimal-digits /usr/bin
它需要每行的位数和行数。因为1000000000 / 100 / 2 = 5000000
(五百万;总字节数除以列数除以2),您可以使用
./decimal-digits 100 5000000 > digits.txt
生成digits.txt
OP所需的千兆字节大小。
请注意,程序本身的编写更具可读性,而不是效率。我的目的不是展示代码的效率-无论如何我将使用POSIX.1和低级I / O,而不是通用C接口-而是让您轻松地看到花了多少功夫才有平衡与单行,短壳或awk脚本相比,在开发专用工具及其性能方面。
使用GNU C库,fputc()
为每个字符输出调用函数都会产生非常小的开销(间接函数调用或条件调用- FILE
接口实际上非常复杂且用途广泛)。在此特定的Intel Core i5-4200U笔记本电脑上,将输出重定向到/dev/null
,第一个(fputc)版本花费大约11秒,而一次生产线版本仅花费1.3秒。
我碰巧经常写这样的程序和生成器只是因为我喜欢玩巨大的数据集。我很奇怪 例如,我曾经编写过一个程序,可以将所有有限的正IEEE-754浮点正值打印到文本文件中,并具有足够的精度以在解析时产生完全相同的值。该文件的大小为几GB(也许是4G左右);float
人们可能会想到的有限正数并不多。我用它来比较读取和解析此类数据的实现。
对于正常使用案例,例如OP,shell脚本,scriptlet和单行是更好的方法。花费较少的时间来完成总体任务。(除非他们每天左右需要一个不同的文件,或者有很多人需要一个不同的文件,在这种情况下,罕见的情况是,上面提到的专用工具可能需要花费很多精力。)