这是我使用大文件(50 GB)的方法:
我尝试了两种不同的方法:第一种,将文件读取到内存中,并使用Regex Replace或String Replace。然后,我将整个字符串附加到一个临时文件中。
第一种方法对某些Regex替换效果很好,但是如果在一个大文件中进行多次替换,Regex.Replace或String.Replace可能会导致内存不足错误。
第二种方法是逐行读取临时文件,并使用StringBuilder手动构建每行并将每条处理过的行附加到结果文件中。这种方法非常快。
static void ProcessLargeFile()
{
if (File.Exists(outFileName)) File.Delete(outFileName);
string text = File.ReadAllText(inputFileName, Encoding.UTF8);
// EX 1 This opens entire file in memory and uses Replace and Regex Replace --> might cause out of memory error
text = text.Replace("</text>", "");
text = Regex.Replace(text, @"\<ref.*?\</ref\>", "");
File.WriteAllText(outFileName, text);
// EX 2 This reads file line by line
if (File.Exists(outFileName)) File.Delete(outFileName);
using (var sw = new StreamWriter(outFileName))
using (var fs = File.OpenRead(inFileName))
using (var sr = new StreamReader(fs, Encoding.UTF8)) //use UTF8 encoding or whatever encoding your file uses
{
string line, newLine;
while ((line = sr.ReadLine()) != null)
{
//note: call your own replace function or use String.Replace here
newLine = Util.ReplaceDoubleBrackets(line);
sw.WriteLine(newLine);
}
}
}
public static string ReplaceDoubleBrackets(string str)
{
//note: this replaces the first occurrence of a word delimited by [[ ]]
//replace [[ with your own delimiter
if (str.IndexOf("[[") < 0)
return str;
StringBuilder sb = new StringBuilder();
//this part gets the string to replace, put this in a loop if more than one occurrence per line.
int posStart = str.IndexOf("[[");
int posEnd = str.IndexOf("]]");
int length = posEnd - posStart;
// ... code to replace with newstr
sb.Append(newstr);
return sb.ToString();
}