如果您想追求“简单”的解决方案,您可能会发现我整理的此类非常有用:
http://www.architectshack.com/TextFileEncodingDetector.ashx
它会先自动进行BOM检测,然后尝试区分没有BOM的Unicode编码和其他默认编码(通常为Windows-1252,在.Net中错误地标记为Encoding.ASCII)。
如上所述,涉及NCharDet或MLang的“较重”解决方案可能更合适,并且正如我在此类概述页面上所指出的那样,最好是尽可能与用户提供某种形式的交互性,因为没有100%的检测率!
网站离线时的摘要:
using System;
using System.Text;
using System.Text.RegularExpressions;
using System.IO;
namespace KlerksSoft
{
public static class TextFileEncodingDetector
{
const long _defaultHeuristicSampleSize = 0x10000;
public static Encoding DetectTextFileEncoding(string InputFilename)
{
using (FileStream textfileStream = File.OpenRead(InputFilename))
{
return DetectTextFileEncoding(textfileStream, _defaultHeuristicSampleSize);
}
}
public static Encoding DetectTextFileEncoding(FileStream InputFileStream, long HeuristicSampleSize)
{
bool uselessBool = false;
return DetectTextFileEncoding(InputFileStream, _defaultHeuristicSampleSize, out uselessBool);
}
public static Encoding DetectTextFileEncoding(FileStream InputFileStream, long HeuristicSampleSize, out bool HasBOM)
{
if (InputFileStream == null)
throw new ArgumentNullException("Must provide a valid Filestream!", "InputFileStream");
if (!InputFileStream.CanRead)
throw new ArgumentException("Provided file stream is not readable!", "InputFileStream");
if (!InputFileStream.CanSeek)
throw new ArgumentException("Provided file stream cannot seek!", "InputFileStream");
Encoding encodingFound = null;
long originalPos = InputFileStream.Position;
InputFileStream.Position = 0;
byte[] bomBytes = new byte[InputFileStream.Length > 4 ? 4 : InputFileStream.Length];
InputFileStream.Read(bomBytes, 0, bomBytes.Length);
encodingFound = DetectBOMBytes(bomBytes);
if (encodingFound != null)
{
InputFileStream.Position = originalPos;
HasBOM = true;
return encodingFound;
}
byte[] sampleBytes = new byte[HeuristicSampleSize > InputFileStream.Length ? InputFileStream.Length : HeuristicSampleSize];
Array.Copy(bomBytes, sampleBytes, bomBytes.Length);
if (InputFileStream.Length > bomBytes.Length)
InputFileStream.Read(sampleBytes, bomBytes.Length, sampleBytes.Length - bomBytes.Length);
InputFileStream.Position = originalPos;
encodingFound = DetectUnicodeInByteSampleByHeuristics(sampleBytes);
HasBOM = false;
return encodingFound;
}
public static Encoding DetectTextByteArrayEncoding(byte[] TextData)
{
bool uselessBool = false;
return DetectTextByteArrayEncoding(TextData, out uselessBool);
}
public static Encoding DetectTextByteArrayEncoding(byte[] TextData, out bool HasBOM)
{
if (TextData == null)
throw new ArgumentNullException("Must provide a valid text data byte array!", "TextData");
Encoding encodingFound = null;
encodingFound = DetectBOMBytes(TextData);
if (encodingFound != null)
{
HasBOM = true;
return encodingFound;
}
else
{
encodingFound = DetectUnicodeInByteSampleByHeuristics(TextData);
HasBOM = false;
return encodingFound;
}
}
public static string GetStringFromByteArray(byte[] TextData, Encoding DefaultEncoding)
{
return GetStringFromByteArray(TextData, DefaultEncoding, _defaultHeuristicSampleSize);
}
public static string GetStringFromByteArray(byte[] TextData, Encoding DefaultEncoding, long MaxHeuristicSampleSize)
{
if (TextData == null)
throw new ArgumentNullException("Must provide a valid text data byte array!", "TextData");
Encoding encodingFound = null;
encodingFound = DetectBOMBytes(TextData);
if (encodingFound != null)
{
return encodingFound.GetString(TextData, encodingFound.GetPreamble().Length, TextData.Length - encodingFound.GetPreamble().Length);
}
else
{
byte[] heuristicSample = null;
if (TextData.Length > MaxHeuristicSampleSize)
{
heuristicSample = new byte[MaxHeuristicSampleSize];
Array.Copy(TextData, heuristicSample, MaxHeuristicSampleSize);
}
else
{
heuristicSample = TextData;
}
encodingFound = DetectUnicodeInByteSampleByHeuristics(TextData) ?? DefaultEncoding;
return encodingFound.GetString(TextData);
}
}
public static Encoding DetectBOMBytes(byte[] BOMBytes)
{
if (BOMBytes == null)
throw new ArgumentNullException("Must provide a valid BOM byte array!", "BOMBytes");
if (BOMBytes.Length < 2)
return null;
if (BOMBytes[0] == 0xff
&& BOMBytes[1] == 0xfe
&& (BOMBytes.Length < 4
|| BOMBytes[2] != 0
|| BOMBytes[3] != 0
)
)
return Encoding.Unicode;
if (BOMBytes[0] == 0xfe
&& BOMBytes[1] == 0xff
)
return Encoding.BigEndianUnicode;
if (BOMBytes.Length < 3)
return null;
if (BOMBytes[0] == 0xef && BOMBytes[1] == 0xbb && BOMBytes[2] == 0xbf)
return Encoding.UTF8;
if (BOMBytes[0] == 0x2b && BOMBytes[1] == 0x2f && BOMBytes[2] == 0x76)
return Encoding.UTF7;
if (BOMBytes.Length < 4)
return null;
if (BOMBytes[0] == 0xff && BOMBytes[1] == 0xfe && BOMBytes[2] == 0 && BOMBytes[3] == 0)
return Encoding.UTF32;
if (BOMBytes[0] == 0 && BOMBytes[1] == 0 && BOMBytes[2] == 0xfe && BOMBytes[3] == 0xff)
return Encoding.GetEncoding(12001);
return null;
}
public static Encoding DetectUnicodeInByteSampleByHeuristics(byte[] SampleBytes)
{
long oddBinaryNullsInSample = 0;
long evenBinaryNullsInSample = 0;
long suspiciousUTF8SequenceCount = 0;
long suspiciousUTF8BytesTotal = 0;
long likelyUSASCIIBytesInSample = 0;
long currentPos = 0;
int skipUTF8Bytes = 0;
while (currentPos < SampleBytes.Length)
{
if (SampleBytes[currentPos] == 0)
{
if (currentPos % 2 == 0)
evenBinaryNullsInSample++;
else
oddBinaryNullsInSample++;
}
if (IsCommonUSASCIIByte(SampleBytes[currentPos]))
likelyUSASCIIBytesInSample++;
if (skipUTF8Bytes == 0)
{
int lengthFound = DetectSuspiciousUTF8SequenceLength(SampleBytes, currentPos);
if (lengthFound > 0)
{
suspiciousUTF8SequenceCount++;
suspiciousUTF8BytesTotal += lengthFound;
skipUTF8Bytes = lengthFound - 1;
}
}
else
{
skipUTF8Bytes--;
}
currentPos++;
}
if (((evenBinaryNullsInSample * 2.0) / SampleBytes.Length) < 0.2
&& ((oddBinaryNullsInSample * 2.0) / SampleBytes.Length) > 0.6
)
return Encoding.Unicode;
if (((oddBinaryNullsInSample * 2.0) / SampleBytes.Length) < 0.2
&& ((evenBinaryNullsInSample * 2.0) / SampleBytes.Length) > 0.6
)
return Encoding.BigEndianUnicode;
string potentiallyMangledString = Encoding.ASCII.GetString(SampleBytes);
Regex UTF8Validator = new Regex(@"\A("
+ @"[\x09\x0A\x0D\x20-\x7E]"
+ @"|[\xC2-\xDF][\x80-\xBF]"
+ @"|\xE0[\xA0-\xBF][\x80-\xBF]"
+ @"|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}"
+ @"|\xED[\x80-\x9F][\x80-\xBF]"
+ @"|\xF0[\x90-\xBF][\x80-\xBF]{2}"
+ @"|[\xF1-\xF3][\x80-\xBF]{3}"
+ @"|\xF4[\x80-\x8F][\x80-\xBF]{2}"
+ @")*\z");
if (UTF8Validator.IsMatch(potentiallyMangledString))
{
if ((suspiciousUTF8SequenceCount * 500000.0 / SampleBytes.Length >= 1)
&& (
SampleBytes.Length - suspiciousUTF8BytesTotal == 0
||
likelyUSASCIIBytesInSample * 1.0 / (SampleBytes.Length - suspiciousUTF8BytesTotal) >= 0.8
)
)
return Encoding.UTF8;
}
return null;
}
private static bool IsCommonUSASCIIByte(byte testByte)
{
if (testByte == 0x0A
|| testByte == 0x0D
|| testByte == 0x09
|| (testByte >= 0x20 && testByte <= 0x2F)
|| (testByte >= 0x30 && testByte <= 0x39)
|| (testByte >= 0x3A && testByte <= 0x40)
|| (testByte >= 0x41 && testByte <= 0x5A)
|| (testByte >= 0x5B && testByte <= 0x60)
|| (testByte >= 0x61 && testByte <= 0x7A)
|| (testByte >= 0x7B && testByte <= 0x7E)
)
return true;
else
return false;
}
private static int DetectSuspiciousUTF8SequenceLength(byte[] SampleBytes, long currentPos)
{
int lengthFound = 0;
if (SampleBytes.Length >= currentPos + 1
&& SampleBytes[currentPos] == 0xC2
)
{
if (SampleBytes[currentPos + 1] == 0x81
|| SampleBytes[currentPos + 1] == 0x8D
|| SampleBytes[currentPos + 1] == 0x8F
)
lengthFound = 2;
else if (SampleBytes[currentPos + 1] == 0x90
|| SampleBytes[currentPos + 1] == 0x9D
)
lengthFound = 2;
else if (SampleBytes[currentPos + 1] >= 0xA0
&& SampleBytes[currentPos + 1] <= 0xBF
)
lengthFound = 2;
}
else if (SampleBytes.Length >= currentPos + 1
&& SampleBytes[currentPos] == 0xC3
)
{
if (SampleBytes[currentPos + 1] >= 0x80
&& SampleBytes[currentPos + 1] <= 0xBF
)
lengthFound = 2;
}
else if (SampleBytes.Length >= currentPos + 1
&& SampleBytes[currentPos] == 0xC5
)
{
if (SampleBytes[currentPos + 1] == 0x92
|| SampleBytes[currentPos + 1] == 0x93
)
lengthFound = 2;
else if (SampleBytes[currentPos + 1] == 0xA0
|| SampleBytes[currentPos + 1] == 0xA1
)
lengthFound = 2;
else if (SampleBytes[currentPos + 1] == 0xB8
|| SampleBytes[currentPos + 1] == 0xBD
|| SampleBytes[currentPos + 1] == 0xBE
)
lengthFound = 2;
}
else if (SampleBytes.Length >= currentPos + 1
&& SampleBytes[currentPos] == 0xC6
)
{
if (SampleBytes[currentPos + 1] == 0x92)
lengthFound = 2;
}
else if (SampleBytes.Length >= currentPos + 1
&& SampleBytes[currentPos] == 0xCB
)
{
if (SampleBytes[currentPos + 1] == 0x86
|| SampleBytes[currentPos + 1] == 0x9C
)
lengthFound = 2;
}
else if (SampleBytes.Length >= currentPos + 2
&& SampleBytes[currentPos] == 0xE2
)
{
if (SampleBytes[currentPos + 1] == 0x80)
{
if (SampleBytes[currentPos + 2] == 0x93
|| SampleBytes[currentPos + 2] == 0x94
)
lengthFound = 3;
if (SampleBytes[currentPos + 2] == 0x98
|| SampleBytes[currentPos + 2] == 0x99
|| SampleBytes[currentPos + 2] == 0x9A
)
lengthFound = 3;
if (SampleBytes[currentPos + 2] == 0x9C
|| SampleBytes[currentPos + 2] == 0x9D
|| SampleBytes[currentPos + 2] == 0x9E
)
lengthFound = 3;
if (SampleBytes[currentPos + 2] == 0xA0
|| SampleBytes[currentPos + 2] == 0xA1
|| SampleBytes[currentPos + 2] == 0xA2
)
lengthFound = 3;
if (SampleBytes[currentPos + 2] == 0xA6)
lengthFound = 3;
if (SampleBytes[currentPos + 2] == 0xB0)
lengthFound = 3;
if (SampleBytes[currentPos + 2] == 0xB9
|| SampleBytes[currentPos + 2] == 0xBA
)
lengthFound = 3;
}
else if (SampleBytes[currentPos + 1] == 0x82
&& SampleBytes[currentPos + 2] == 0xAC
)
lengthFound = 3;
else if (SampleBytes[currentPos + 1] == 0x84
&& SampleBytes[currentPos + 2] == 0xA2
)
lengthFound = 3;
}
return lengthFound;
}
}
}