我觉得这里还缺少几种子字符串计数方式,例如使用 unsafe 代码逐字符进行比较。我把原帖作者的方法和我能想到的其他方法都放在了一起。
下面是我编写的字符串扩展方法。
namespace Example
{
using System;
using System.Text;
public static class StringExtensions
{
public static int CountSubstr(this string str, string substr)
{
return (str.Length - str.Replace(substr, "").Length) / substr.Length;
}
public static int CountSubstr(this string str, char substr)
{
return (str.Length - str.Replace(substr.ToString(), "").Length);
}
public static int CountSubstr2(this string str, string substr)
{
int substrlen = substr.Length;
int lastIndex = str.IndexOf(substr, 0, StringComparison.Ordinal);
int count = 0;
while (lastIndex != -1)
{
++count;
lastIndex = str.IndexOf(substr, lastIndex + substrlen, StringComparison.Ordinal);
}
return count;
}
public static int CountSubstr2(this string str, char substr)
{
int lastIndex = str.IndexOf(substr, 0);
int count = 0;
while (lastIndex != -1)
{
++count;
lastIndex = str.IndexOf(substr, lastIndex + 1);
}
return count;
}
public static int CountChar(this string str, char substr)
{
int length = str.Length;
int count = 0;
for (int i = 0; i < length; ++i)
if (str[i] == substr)
++count;
return count;
}
public static int CountChar2(this string str, char substr)
{
int count = 0;
foreach (var c in str)
if (c == substr)
++count;
return count;
}
public static unsafe int CountChar3(this string str, char substr)
{
int length = str.Length;
int count = 0;
fixed (char* chars = str)
{
for (int i = 0; i < length; ++i)
if (*(chars + i) == substr)
++count;
}
return count;
}
public static unsafe int CountChar4(this string str, char substr)
{
int length = str.Length;
int count = 0;
fixed (char* chars = str)
{
for (int i = length - 1; i >= 0; --i)
if (*(chars + i) == substr)
++count;
}
return count;
}
public static unsafe int CountSubstr3(this string str, string substr)
{
int length = str.Length;
int substrlen = substr.Length;
int count = 0;
fixed (char* strc = str)
{
fixed (char* substrc = substr)
{
int n = 0;
for (int i = 0; i < length; ++i)
{
if (*(strc + i) == *(substrc + n))
{
++n;
if (n == substrlen)
{
++count;
n = 0;
}
}
else
n = 0;
}
}
}
return count;
}
public static int CountSubstr3(this string str, char substr)
{
return CountSubstr3(str, substr.ToString());
}
public static unsafe int CountSubstr4(this string str, string substr)
{
int length = str.Length;
int substrLastIndex = substr.Length - 1;
int count = 0;
fixed (char* strc = str)
{
fixed (char* substrc = substr)
{
int n = substrLastIndex;
for (int i = length - 1; i >= 0; --i)
{
if (*(strc + i) == *(substrc + n))
{
if (--n == -1)
{
++count;
n = substrLastIndex;
}
}
else
n = substrLastIndex;
}
}
}
return count;
}
public static int CountSubstr4(this string str, char substr)
{
return CountSubstr4(str, substr.ToString());
}
}
}
随后是测试代码...
// Benchmark driver for the StringExtensions counting methods.
// First prints the count returned by every implementation for each
// haystack/needle pair so the results can be compared by eye, then times
// testSize calls of each method and prints the elapsed milliseconds.
// Output labels: "CSn" is CountSubstr (n = 1) or CountSubstrN, "CCn" is
// CountChar (n = 1) or CountCharN; "chr", "and" and "mlw" identify the needle
// (matchA, matchB and matchC respectively).
// NOTE(review): Stopwatch is used below but no using directive for it is
// visible in this snippet - presumably System.Diagnostics is imported; confirm.
static void Main()
{
// Needles: a single character, a short word and a longer word.
const char matchA = '_';
const string matchB = "and";
const string matchC = "muchlongerword";
// Haystacks, one per needle; each also contains near-misses of its needle
// (e.g. "ans" in testStrB, "muchlongerwor" in testStrC).
const string testStrA = "_and_d_e_banna_i_o___pfasd__and_d_e_banna_i_o___pfasd_";
const string testStrB = "and sdf and ans andeians andano ip and and sdf and ans andeians andano ip and";
const string testStrC =
"muchlongerword amuchlongerworsdfmuchlongerwordsdf jmuchlongerworijv muchlongerword sdmuchlongerword dsmuchlongerword";
// Number of calls made inside each timed loop.
const int testSize = 1000000;
// Print each implementation's count for the single-character needle.
Console.WriteLine(testStrA.CountSubstr('_'));
Console.WriteLine(testStrA.CountSubstr2('_'));
Console.WriteLine(testStrA.CountSubstr3('_'));
Console.WriteLine(testStrA.CountSubstr4('_'));
Console.WriteLine(testStrA.CountChar('_'));
Console.WriteLine(testStrA.CountChar2('_'));
Console.WriteLine(testStrA.CountChar3('_'));
Console.WriteLine(testStrA.CountChar4('_'));
// Print each substring implementation's count for "and".
Console.WriteLine(testStrB.CountSubstr("and"));
Console.WriteLine(testStrB.CountSubstr2("and"));
Console.WriteLine(testStrB.CountSubstr3("and"));
Console.WriteLine(testStrB.CountSubstr4("and"));
// Print each substring implementation's count for "muchlongerword".
Console.WriteLine(testStrC.CountSubstr("muchlongerword"));
Console.WriteLine(testStrC.CountSubstr2("muchlongerword"));
Console.WriteLine(testStrC.CountSubstr3("muchlongerword"));
Console.WriteLine(testStrC.CountSubstr4("muchlongerword"));
// Timed runs. One Stopwatch is reused: Start() for the first measurement,
// Restart() (reset + start) for every later one. Return values are discarded.
// --- CountSubstr (CS1) ---
var timer = new Stopwatch();
timer.Start();
for (int i = 0; i < testSize; ++i)
testStrA.CountSubstr(matchA);
timer.Stop();
Console.WriteLine("CS1 chr: " + timer.Elapsed.TotalMilliseconds + "ms");
timer.Restart();
for (int i = 0; i < testSize; ++i)
testStrB.CountSubstr(matchB);
timer.Stop();
Console.WriteLine("CS1 and: " + timer.Elapsed.TotalMilliseconds + "ms");
timer.Restart();
for (int i = 0; i < testSize; ++i)
testStrC.CountSubstr(matchC);
timer.Stop();
Console.WriteLine("CS1 mlw: " + timer.Elapsed.TotalMilliseconds + "ms");
// --- CountSubstr2 (CS2) ---
timer.Restart();
for (int i = 0; i < testSize; ++i)
testStrA.CountSubstr2(matchA);
timer.Stop();
Console.WriteLine("CS2 chr: " + timer.Elapsed.TotalMilliseconds + "ms");
timer.Restart();
for (int i = 0; i < testSize; ++i)
testStrB.CountSubstr2(matchB);
timer.Stop();
Console.WriteLine("CS2 and: " + timer.Elapsed.TotalMilliseconds + "ms");
timer.Restart();
for (int i = 0; i < testSize; ++i)
testStrC.CountSubstr2(matchC);
timer.Stop();
Console.WriteLine("CS2 mlw: " + timer.Elapsed.TotalMilliseconds + "ms");
// --- CountSubstr3 (CS3) ---
timer.Restart();
for (int i = 0; i < testSize; ++i)
testStrA.CountSubstr3(matchA);
timer.Stop();
Console.WriteLine("CS3 chr: " + timer.Elapsed.TotalMilliseconds + "ms");
timer.Restart();
for (int i = 0; i < testSize; ++i)
testStrB.CountSubstr3(matchB);
timer.Stop();
Console.WriteLine("CS3 and: " + timer.Elapsed.TotalMilliseconds + "ms");
timer.Restart();
for (int i = 0; i < testSize; ++i)
testStrC.CountSubstr3(matchC);
timer.Stop();
Console.WriteLine("CS3 mlw: " + timer.Elapsed.TotalMilliseconds + "ms");
// --- CountSubstr4 (CS4) ---
timer.Restart();
for (int i = 0; i < testSize; ++i)
testStrA.CountSubstr4(matchA);
timer.Stop();
Console.WriteLine("CS4 chr: " + timer.Elapsed.TotalMilliseconds + "ms");
timer.Restart();
for (int i = 0; i < testSize; ++i)
testStrB.CountSubstr4(matchB);
timer.Stop();
Console.WriteLine("CS4 and: " + timer.Elapsed.TotalMilliseconds + "ms");
timer.Restart();
for (int i = 0; i < testSize; ++i)
testStrC.CountSubstr4(matchC);
timer.Stop();
Console.WriteLine("CS4 mlw: " + timer.Elapsed.TotalMilliseconds + "ms");
// --- CountChar .. CountChar4 (CC1-CC4), single-character needle only ---
timer.Restart();
for (int i = 0; i < testSize; ++i)
testStrA.CountChar(matchA);
timer.Stop();
Console.WriteLine("CC1 chr: " + timer.Elapsed.TotalMilliseconds + "ms");
timer.Restart();
for (int i = 0; i < testSize; ++i)
testStrA.CountChar2(matchA);
timer.Stop();
Console.WriteLine("CC2 chr: " + timer.Elapsed.TotalMilliseconds + "ms");
timer.Restart();
for (int i = 0; i < testSize; ++i)
testStrA.CountChar3(matchA);
timer.Stop();
Console.WriteLine("CC3 chr: " + timer.Elapsed.TotalMilliseconds + "ms");
timer.Restart();
for (int i = 0; i < testSize; ++i)
testStrA.CountChar4(matchA);
timer.Stop();
Console.WriteLine("CC4 chr: " + timer.Elapsed.TotalMilliseconds + "ms");
}
结果:CSX 对应 CountSubstrX,CCX 对应 CountCharX。“chr”表示在字符串中搜索“_”,“and”表示搜索“and”,“mlw”表示搜索“muchlongerword”。
CS1 chr: 824.123ms
CS1 and: 586.1893ms
CS1 mlw: 486.5414ms
CS2 chr: 127.8941ms
CS2 and: 806.3918ms
CS2 mlw: 497.318ms
CS3 chr: 201.8896ms
CS3 and: 124.0675ms
CS3 mlw: 212.8341ms
CS4 chr: 81.5183ms
CS4 and: 92.0615ms
CS4 mlw: 116.2197ms
CC1 chr: 66.4078ms
CC2 chr: 64.0161ms
CC3 chr: 65.9013ms
CC4 chr: 65.8206ms
最后,我准备了一个包含 360 万个字符的文件,其内容是“derp adfderdserp dfaerpderp deasderp”重复 100,000 次。我用上述各方法在该文件中搜索“derp”,每种方法执行 100 次,结果如下。
CS1Derp: 1501.3444ms
CS2Derp: 1585.797ms
CS3Derp: 376.0937ms
CS4Derp: 271.1663ms
因此,我的第四种方法无疑是最快的;但实际上,既然对一个 360 万字符的文件搜索 100 次,最慢的情况也只花了 1586 毫秒,这些差异基本可以忽略不计。
顺便说一句,我还用 CountSubstr 和 CountChar 系列方法在这个 360 万字符的文件中搜索字符 'd',每种方法同样执行 100 次。结果如下:
CS1 d : 2606.9513ms
CS2 d : 339.7942ms
CS3 d : 960.281ms
CS4 d : 233.3442ms
CC1 d : 302.4122ms
CC2 d : 280.7719ms
CC3 d : 299.1125ms
CC4 d : 292.9365ms
由此可见,原帖作者的方法在大段文本(haystack)中查找单个字符(needle)时表现很差。
注意:所有数值均已更新为 Release 版本的输出。我第一次发布此内容时,不小心忘了以 Release 模式编译。我的部分结论也已相应修改。