我正在尝试转换一些法语加拿大的字符串,基本上,我希望能够在保留字母的同时去除字母中的法语重音符号。(例如,转换é
为e
,crème brûlée
则将变为creme brulee
)
实现此目标的最佳方法是什么?
我正在尝试转换一些法语加拿大的字符串,基本上,我希望能够在保留字母的同时去除字母中的法语重音符号。(例如,转换é
为e
,crème brûlée
则将变为creme brulee
)
实现此目标的最佳方法是什么?
Answers:
我没有使用过这种方法,但是迈克尔·卡普兰(Michael Kaplan)在他的博客文章(标题令人迷惑)中描述了这样做的方法,该文章讨论剥离变音符号:剥离是一项有趣的工作(又名“无意义,又名全锰字符”是非间隔的,但有些间隔比其他间隔更大)
static string RemoveDiacritics(string text)
{
var normalizedString = text.Normalize(NormalizationForm.FormD);
var stringBuilder = new StringBuilder();
foreach (var c in normalizedString)
{
var unicodeCategory = CharUnicodeInfo.GetUnicodeCategory(c);
if (unicodeCategory != UnicodeCategory.NonSpacingMark)
{
stringBuilder.Append(c);
}
}
return stringBuilder.ToString().Normalize(NormalizationForm.FormC);
}
请注意,这是他之前的文章的后续内容:剥离变音符号...。
该方法使用String.Normalize将输入字符串拆分为组成的字形(基本上从变音符中分离出“基本”字符),然后扫描结果并仅保留基本字符。只是有点复杂,但实际上您正在寻找一个复杂的问题。
当然,如果您将自己限制为法语,则可以使用@David Dibben建议的如何在C ++ std :: string中消除重音和波浪号的简单基于表的方法。
这对我有用
string accentedStr;
byte[] tempBytes;
tempBytes = System.Text.Encoding.GetEncoding("ISO-8859-8").GetBytes(accentedStr);
string asciiStr = System.Text.Encoding.UTF8.GetString(tempBytes);
快速和简短!
«
»
和)…
(作为单个字符)将在此过程中发生更改,而公认的解决方案则不会如此。
System.ArgumentException: 'ISO-8859-8' is not a supported encoding name.
System.Text.Encoding.CodePages
从nuget 安装,然后调用此方法注册提供程序:Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
-完成此操作后,您可以使用ISO-8859-8
如果有人感兴趣,我正在寻找类似的东西,并结束了以下内容的编写:
public static string NormalizeStringForUrl(string name)
{
String normalizedString = name.Normalize(NormalizationForm.FormD);
StringBuilder stringBuilder = new StringBuilder();
foreach (char c in normalizedString)
{
switch (CharUnicodeInfo.GetUnicodeCategory(c))
{
case UnicodeCategory.LowercaseLetter:
case UnicodeCategory.UppercaseLetter:
case UnicodeCategory.DecimalDigitNumber:
stringBuilder.Append(c);
break;
case UnicodeCategory.SpaceSeparator:
case UnicodeCategory.ConnectorPunctuation:
case UnicodeCategory.DashPunctuation:
stringBuilder.Append('_');
break;
}
}
string result = stringBuilder.ToString();
return String.Join("_", result.Split(new char[] { '_' }
, StringSplitOptions.RemoveEmptyEntries)); // remove duplicate underscores
}
c < 128
,以确保我们不提取任何UTF字符,请参见此处。
c < 123
。看到ASCI
我需要一些可以转换所有主要unicode字符的东西,并且投票的答案省略了一些,所以我创建了一个convert_accented_characters($str)
易于定义的CodeIgniter版本到C#:
using System;
using System.Text;
using System.Collections.Generic;
public static class Strings
{
static Dictionary<string, string> foreign_characters = new Dictionary<string, string>
{
{ "äæǽ", "ae" },
{ "öœ", "oe" },
{ "ü", "ue" },
{ "Ä", "Ae" },
{ "Ü", "Ue" },
{ "Ö", "Oe" },
{ "ÀÁÂÃÄÅǺĀĂĄǍΑΆẢẠẦẪẨẬẰẮẴẲẶА", "A" },
{ "àáâãåǻāăąǎªαάảạầấẫẩậằắẵẳặа", "a" },
{ "Б", "B" },
{ "б", "b" },
{ "ÇĆĈĊČ", "C" },
{ "çćĉċč", "c" },
{ "Д", "D" },
{ "д", "d" },
{ "ÐĎĐΔ", "Dj" },
{ "ðďđδ", "dj" },
{ "ÈÉÊËĒĔĖĘĚΕΈẼẺẸỀẾỄỂỆЕЭ", "E" },
{ "èéêëēĕėęěέεẽẻẹềếễểệеэ", "e" },
{ "Ф", "F" },
{ "ф", "f" },
{ "ĜĞĠĢΓГҐ", "G" },
{ "ĝğġģγгґ", "g" },
{ "ĤĦ", "H" },
{ "ĥħ", "h" },
{ "ÌÍÎÏĨĪĬǏĮİΗΉΊΙΪỈỊИЫ", "I" },
{ "ìíîïĩīĭǐįıηήίιϊỉịиыї", "i" },
{ "Ĵ", "J" },
{ "ĵ", "j" },
{ "ĶΚК", "K" },
{ "ķκк", "k" },
{ "ĹĻĽĿŁΛЛ", "L" },
{ "ĺļľŀłλл", "l" },
{ "М", "M" },
{ "м", "m" },
{ "ÑŃŅŇΝН", "N" },
{ "ñńņňʼnνн", "n" },
{ "ÒÓÔÕŌŎǑŐƠØǾΟΌΩΏỎỌỒỐỖỔỘỜỚỠỞỢО", "O" },
{ "òóôõōŏǒőơøǿºοόωώỏọồốỗổộờớỡởợо", "o" },
{ "П", "P" },
{ "п", "p" },
{ "ŔŖŘΡР", "R" },
{ "ŕŗřρр", "r" },
{ "ŚŜŞȘŠΣС", "S" },
{ "śŝşșšſσςс", "s" },
{ "ȚŢŤŦτТ", "T" },
{ "țţťŧт", "t" },
{ "ÙÚÛŨŪŬŮŰŲƯǓǕǗǙǛŨỦỤỪỨỮỬỰУ", "U" },
{ "ùúûũūŭůűųưǔǖǘǚǜυύϋủụừứữửựу", "u" },
{ "ÝŸŶΥΎΫỲỸỶỴЙ", "Y" },
{ "ýÿŷỳỹỷỵй", "y" },
{ "В", "V" },
{ "в", "v" },
{ "Ŵ", "W" },
{ "ŵ", "w" },
{ "ŹŻŽΖЗ", "Z" },
{ "źżžζз", "z" },
{ "ÆǼ", "AE" },
{ "ß", "ss" },
{ "IJ", "IJ" },
{ "ij", "ij" },
{ "Œ", "OE" },
{ "ƒ", "f" },
{ "ξ", "ks" },
{ "π", "p" },
{ "β", "v" },
{ "μ", "m" },
{ "ψ", "ps" },
{ "Ё", "Yo" },
{ "ё", "yo" },
{ "Є", "Ye" },
{ "є", "ye" },
{ "Ї", "Yi" },
{ "Ж", "Zh" },
{ "ж", "zh" },
{ "Х", "Kh" },
{ "х", "kh" },
{ "Ц", "Ts" },
{ "ц", "ts" },
{ "Ч", "Ch" },
{ "ч", "ch" },
{ "Ш", "Sh" },
{ "ш", "sh" },
{ "Щ", "Shch" },
{ "щ", "shch" },
{ "ЪъЬь", "" },
{ "Ю", "Yu" },
{ "ю", "yu" },
{ "Я", "Ya" },
{ "я", "ya" },
};
public static char RemoveDiacritics(this char c){
foreach(KeyValuePair<string, string> entry in foreign_characters)
{
if(entry.Key.IndexOf (c) != -1)
{
return entry.Value[0];
}
}
return c;
}
public static string RemoveDiacritics(this string s)
{
//StringBuilder sb = new StringBuilder ();
string text = "";
foreach (char c in s)
{
int len = text.Length;
foreach(KeyValuePair<string, string> entry in foreign_characters)
{
if(entry.Key.IndexOf (c) != -1)
{
text += entry.Value;
break;
}
}
if (len == text.Length) {
text += c;
}
}
return text;
}
}
用法
// for strings
"crème brûlée".RemoveDiacritics (); // creme brulee
// for chars
"Ã"[0].RemoveDiacritics (); // A
if (entry.Key.IndexOf(c) != -1)
为if (entry.Key.Contains(c))
{ "äæǽ", "ae" }
代替{ "ä", "ae" }, { "æ", "ae" }, { "ǽ", "ae" }
呼叫而已if (foreign_characters.TryGetValue(...)) ...
。您已经完全击败了字典已具有的索引的目的。
如果有人感兴趣,这里是Java等效项:
import java.text.Normalizer;
public class MyClass
{
public static String removeDiacritics(String input)
{
String nrml = Normalizer.normalize(input, Normalizer.Form.NFD);
StringBuilder stripped = new StringBuilder();
for (int i=0;i<nrml.length();++i)
{
if (Character.getType(nrml.charAt(i)) != Character.NON_SPACING_MARK)
{
stripped.append(nrml.charAt(i));
}
}
return stripped.toString();
}
}
我经常使用基于在此找到的另一个版本的扩展方法(请参阅替换C#(ascii)中的字符)快速说明:
码:
using System.Linq;
using System.Text;
using System.Globalization;
// namespace here
public static class Utility
{
public static string RemoveDiacritics(this string str)
{
if (null == str) return null;
var chars =
from c in str.Normalize(NormalizationForm.FormD).ToCharArray()
let uc = CharUnicodeInfo.GetUnicodeCategory(c)
where uc != UnicodeCategory.NonSpacingMark
select c;
var cleanStr = new string(chars.ToArray()).Normalize(NormalizationForm.FormC);
return cleanStr;
}
// or, alternatively
public static string RemoveDiacritics2(this string str)
{
if (null == str) return null;
var chars = str
.Normalize(NormalizationForm.FormD)
.ToCharArray()
.Where(c=> CharUnicodeInfo.GetUnicodeCategory(c) != UnicodeCategory.NonSpacingMark)
.ToArray();
return new string(chars).Normalize(NormalizationForm.FormC);
}
}
希腊语(ISO)的CodePage 可以做到
有关此代码页的信息已放入System.Text.Encoding.GetEncodings()
。有关详细信息,请访问:https : //msdn.microsoft.com/pt-br/library/system.text.encodinginfo.getencoding(v=vs.110).aspx
希腊文(ISO)的代码页为28597,名称为iso-8859-7。
转到代码... \ o /
string text = "Você está numa situação lamentável";
string textEncode = System.Web.HttpUtility.UrlEncode(text, Encoding.GetEncoding("iso-8859-7"));
//result: "Voce+esta+numa+situacao+lamentavel"
string textDecode = System.Web.HttpUtility.UrlDecode(textEncode);
//result: "Voce esta numa situacao lamentavel"
因此,编写此函数...
public string RemoveAcentuation(string text)
{
return
System.Web.HttpUtility.UrlDecode(
System.Web.HttpUtility.UrlEncode(
text, Encoding.GetEncoding("iso-8859-7")));
}
请注意,... Encoding.GetEncoding("iso-8859-7")
等同于,Encoding.GetEncoding(28597)
因为第一个是名称,第二个是Encoding的代码页。
äáčďěéíľľňôóřŕšťúůýž ÄÁČĎĚÉÍĽĽŇÔÓŘŔŠŤÚŮÝŽ ÖÜË łŁđĐ ţŢşŞçÇ øı
)。仅使用ßə
将转换为时发现了问题?
,但始终可以以单独的方式处理此类异常。在投入生产之前,应该对所有带有变音符号的Unicode区域进行更好的测试。
有趣的是,这样一个问题可以得到这么多答案,但又没有一个适合我的要求:)周围有太多语言,一种完全与语言无关的解决方案真的不可能AFAIK,因为其他人提到FormC或FormD正在发出问题。
由于原始问题与法语有关,因此最简单的工作答案确实是
public static string ConvertWesternEuropeanToASCII(this string str)
{
return Encoding.ASCII.GetString(Encoding.GetEncoding(1251).GetBytes(str));
}
1251应该替换为输入语言的编码代码。
但是,这只能用一个字符替换一个字符。由于我也使用德语作为输入,因此我进行了手动转换
public static string LatinizeGermanCharacters(this string str)
{
StringBuilder sb = new StringBuilder(str.Length);
foreach (char c in str)
{
switch (c)
{
case 'ä':
sb.Append("ae");
break;
case 'ö':
sb.Append("oe");
break;
case 'ü':
sb.Append("ue");
break;
case 'Ä':
sb.Append("Ae");
break;
case 'Ö':
sb.Append("Oe");
break;
case 'Ü':
sb.Append("Ue");
break;
case 'ß':
sb.Append("ss");
break;
default:
sb.Append(c);
break;
}
}
return sb.ToString();
}
它可能无法提供最佳性能,但至少它很容易阅读和扩展。正则表达式是行不通的,比任何char / string东西都要慢得多。
我也有一个非常简单的方法来删除空间:
public static string RemoveSpace(this string str)
{
return str.Replace(" ", string.Empty);
}
最终,我使用了以上所有三个扩展的组合:
public static string LatinizeAndConvertToASCII(this string str, bool keepSpace = false)
{
str = str.LatinizeGermanCharacters().ConvertWesternEuropeanToASCII();
return keepSpace ? str : str.RemoveSpace();
}
并通过了一次小型单元测试(并非详尽无遗)。
[TestMethod()]
public void LatinizeAndConvertToASCIITest()
{
string europeanStr = "Bonjour ça va? C'est l'été! Ich möchte ä Ä á à â ê é è ë Ë É ï Ï î í ì ó ò ô ö Ö Ü ü ù ú û Û ý Ý ç Ç ñ Ñ";
string expected = "Bonjourcava?C'estl'ete!IchmoechteaeAeaaaeeeeEEiIiiiooooeOeUeueuuuUyYcCnN";
string actual = europeanStr.LatinizeAndConvertToASCII();
Assert.AreEqual(expected, actual);
}
这在Java中工作正常。
基本上,它将所有带重音符号的字符转换为它们的不带重音符号的字符,然后将其合并变音符号。现在,您可以使用正则表达式删除变音符号。
import java.text.Normalizer;
import java.util.regex.Pattern;
public String deAccent(String str) {
String nfdNormalizedString = Normalizer.normalize(str, Normalizer.Form.NFD);
Pattern pattern = Pattern.compile("\\p{InCombiningDiacriticalMarks}+");
return pattern.matcher(nfdNormalizedString).replaceAll("");
}
"\\p{Block=CombiningDiacriticalMarks}"
TL; DR- C#字符串扩展方法
我想保存字符串的含义最好的解决办法是将字符,而不是转化剥夺他们,这是在本例中很好的说明中crème brûlée
,以crme brle
对creme brulee
。
我在上面检查了Alexander的注释,看到Lucene.Net代码是Apache 2.0许可的,因此我已将该类修改为简单的字符串扩展方法。您可以像这样使用它:
var originalString = "crème brûlée";
var maxLength = originalString.Length; // limit output length as necessary
var foldedString = originalString.FoldToASCII(maxLength);
// "creme brulee"
该函数太长,无法发布到StackOverflow答案中(允许约13.9万个字符,不得超过3万个字符),因此我提出了要点并归因于作者:
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// This class converts alphabetic, numeric, and symbolic Unicode characters
/// which are not in the first 127 ASCII characters (the "Basic Latin" Unicode
/// block) into their ASCII equivalents, if one exists.
/// <para/>
/// Characters from the following Unicode blocks are converted; however, only
/// those characters with reasonable ASCII alternatives are converted:
///
/// <ul>
/// <item><description>C1 Controls and Latin-1 Supplement: <a href="http://www.unicode.org/charts/PDF/U0080.pdf">http://www.unicode.org/charts/PDF/U0080.pdf</a></description></item>
/// <item><description>Latin Extended-A: <a href="http://www.unicode.org/charts/PDF/U0100.pdf">http://www.unicode.org/charts/PDF/U0100.pdf</a></description></item>
/// <item><description>Latin Extended-B: <a href="http://www.unicode.org/charts/PDF/U0180.pdf">http://www.unicode.org/charts/PDF/U0180.pdf</a></description></item>
/// <item><description>Latin Extended Additional: <a href="http://www.unicode.org/charts/PDF/U1E00.pdf">http://www.unicode.org/charts/PDF/U1E00.pdf</a></description></item>
/// <item><description>Latin Extended-C: <a href="http://www.unicode.org/charts/PDF/U2C60.pdf">http://www.unicode.org/charts/PDF/U2C60.pdf</a></description></item>
/// <item><description>Latin Extended-D: <a href="http://www.unicode.org/charts/PDF/UA720.pdf">http://www.unicode.org/charts/PDF/UA720.pdf</a></description></item>
/// <item><description>IPA Extensions: <a href="http://www.unicode.org/charts/PDF/U0250.pdf">http://www.unicode.org/charts/PDF/U0250.pdf</a></description></item>
/// <item><description>Phonetic Extensions: <a href="http://www.unicode.org/charts/PDF/U1D00.pdf">http://www.unicode.org/charts/PDF/U1D00.pdf</a></description></item>
/// <item><description>Phonetic Extensions Supplement: <a href="http://www.unicode.org/charts/PDF/U1D80.pdf">http://www.unicode.org/charts/PDF/U1D80.pdf</a></description></item>
/// <item><description>General Punctuation: <a href="http://www.unicode.org/charts/PDF/U2000.pdf">http://www.unicode.org/charts/PDF/U2000.pdf</a></description></item>
/// <item><description>Superscripts and Subscripts: <a href="http://www.unicode.org/charts/PDF/U2070.pdf">http://www.unicode.org/charts/PDF/U2070.pdf</a></description></item>
/// <item><description>Enclosed Alphanumerics: <a href="http://www.unicode.org/charts/PDF/U2460.pdf">http://www.unicode.org/charts/PDF/U2460.pdf</a></description></item>
/// <item><description>Dingbats: <a href="http://www.unicode.org/charts/PDF/U2700.pdf">http://www.unicode.org/charts/PDF/U2700.pdf</a></description></item>
/// <item><description>Supplemental Punctuation: <a href="http://www.unicode.org/charts/PDF/U2E00.pdf">http://www.unicode.org/charts/PDF/U2E00.pdf</a></description></item>
/// <item><description>Alphabetic Presentation Forms: <a href="http://www.unicode.org/charts/PDF/UFB00.pdf">http://www.unicode.org/charts/PDF/UFB00.pdf</a></description></item>
/// <item><description>Halfwidth and Fullwidth Forms: <a href="http://www.unicode.org/charts/PDF/UFF00.pdf">http://www.unicode.org/charts/PDF/UFF00.pdf</a></description></item>
/// </ul>
/// <para/>
/// See: <a href="http://en.wikipedia.org/wiki/Latin_characters_in_Unicode">http://en.wikipedia.org/wiki/Latin_characters_in_Unicode</a>
/// <para/>
/// For example, '&agrave;' will be replaced by 'a'.
/// </summary>
public static partial class StringExtensions
{
/// <summary>
/// Converts characters above ASCII to their ASCII equivalents. For example,
/// accents are removed from accented characters.
/// </summary>
/// <param name="input"> The string of characters to fold </param>
/// <param name="length"> The length of the folded return string </param>
/// <returns> length of output </returns>
public static string FoldToASCII(this string input, int? length = null)
{
// See https://gist.github.com/andyraddatz/e6a396fb91856174d4e3f1bf2e10951c
}
}
希望对其他人有所帮助,这是我找到的最可靠的解决方案!
这是VB版本(与GREEK兼容):
导入System.Text
导入系统全球化
Public Function RemoveDiacritics(ByVal s As String)
Dim normalizedString As String
Dim stringBuilder As New StringBuilder
normalizedString = s.Normalize(NormalizationForm.FormD)
Dim i As Integer
Dim c As Char
For i = 0 To normalizedString.Length - 1
c = normalizedString(i)
If CharUnicodeInfo.GetUnicodeCategory(c) <> UnicodeCategory.NonSpacingMark Then
stringBuilder.Append(c)
End If
Next
Return stringBuilder.ToString()
End Function
尝试使用HelperSharp软件包。
有一个方法RemoveAccents:
public static string RemoveAccents(this string source)
{
//8 bit characters
byte[] b = Encoding.GetEncoding(1251).GetBytes(source);
// 7 bit characters
string t = Encoding.ASCII.GetString(b);
Regex re = new Regex("[^a-zA-Z0-9]=-_/");
string c = re.Replace(t, " ");
return c;
}
这就是我在所有.NET程序中将变音符号替换为非变音符号的方式
C#:
//Transforms the culture of a letter to its equivalent representation in the 0-127 ascii table, such as the letter 'é' is substituted by an 'e'
public string RemoveDiacritics(string s)
{
string normalizedString = null;
StringBuilder stringBuilder = new StringBuilder();
normalizedString = s.Normalize(NormalizationForm.FormD);
int i = 0;
char c = '\0';
for (i = 0; i <= normalizedString.Length - 1; i++)
{
c = normalizedString[i];
if (CharUnicodeInfo.GetUnicodeCategory(c) != UnicodeCategory.NonSpacingMark)
{
stringBuilder.Append(c);
}
}
return stringBuilder.ToString().ToLower();
}
VB .NET:
'Transforms the culture of a letter to its equivalent representation in the 0-127 ascii table, such as the letter "é" is substituted by an "e"'
Public Function RemoveDiacritics(ByVal s As String) As String
Dim normalizedString As String
Dim stringBuilder As New StringBuilder
normalizedString = s.Normalize(NormalizationForm.FormD)
Dim i As Integer
Dim c As Char
For i = 0 To normalizedString.Length - 1
c = normalizedString(i)
If CharUnicodeInfo.GetUnicodeCategory(c) <> UnicodeCategory.NonSpacingMark Then
stringBuilder.Append(c)
End If
Next
Return stringBuilder.ToString().ToLower()
End Function
您可以使用MMLib.Extensions nuget包中的字符串扩展名:
using MMLib.RapidPrototyping.Generators;
public void ExtensionsExample()
{
string target = "aácčeéií";
Assert.AreEqual("aacceeii", target.RemoveDiacritics());
}
Nuget页面:https ://www.nuget.org/packages/MMLib.Extensions/ Codeplex项目站点https://mmlib.codeplex.com/
如果尚未考虑,请在此处弹出此库。看起来有一个完整的单元测试。
Imports System.Text
Imports System.Globalization
Public Function DECODE(ByVal x As String) As String
Dim sb As New StringBuilder
For Each c As Char In x.Normalize(NormalizationForm.FormD).Where(Function(a) CharUnicodeInfo.GetUnicodeCategory(a) <> UnicodeCategory.NonSpacingMark)
sb.Append(c)
Next
Return sb.ToString()
End Function
我真的很喜欢azrafe7提供的简洁实用的代码。因此,我做了一些更改,将其转换为扩展方法:
public static class StringExtensions
{
public static string RemoveDiacritics(this string text)
{
const string SINGLEBYTE_LATIN_ASCII_ENCODING = "ISO-8859-8";
if (string.IsNullOrEmpty(text))
{
return string.Empty;
}
return Encoding.ASCII.GetString(
Encoding.GetEncoding(SINGLEBYTE_LATIN_ASCII_ENCODING).GetBytes(text));
}
}
没有足够的声誉,显然我无法评论亚历山大的出色联系。-Lucene似乎是在一般情况下适用的唯一解决方案。
对于那些想要简单复制粘贴解决方案的人,这里就是利用Lucene中的代码:
字符串测试床=“
Console.WriteLine(Lucene.latinizeLucene(testbed));
AAAACEIIOOOUUTHaaaaaaaeeeeeeiiiidnoooouuaacDegiLlnOorSsszzsteu
//////////
public static class Lucene
{
// source: https://raw.githubusercontent.com/apache/lucenenet/master/src/Lucene.Net.Analysis.Common/Analysis/Miscellaneous/ASCIIFoldingFilter.cs
// idea: /programming/249087/how-do-i-remove-diacritics-accents-from-a-string-in-net (scroll down, search for lucene by Alexander)
public static string latinizeLucene(string arg)
{
char[] argChar = arg.ToCharArray();
// latinizeLuceneImpl can expand one char up to four chars - e.g. Þ to TH, or æ to ae, or in fact ⑽ to (10)
char[] resultChar = new String(' ', arg.Length * 4).ToCharArray();
int outputPos = Lucene.latinizeLuceneImpl(argChar, 0, ref resultChar, 0, arg.Length);
string ret = new string(resultChar);
ret = ret.Substring(0, outputPos);
return ret;
}
/// <summary>
/// Converts characters above ASCII to their ASCII equivalents. For example,
/// accents are removed from accented characters.
/// <para/>
/// @lucene.internal
/// </summary>
/// <param name="input"> The characters to fold </param>
/// <param name="inputPos"> Index of the first character to fold </param>
/// <param name="output"> The result of the folding. Should be of size >= <c>length * 4</c>. </param>
/// <param name="outputPos"> Index of output where to put the result of the folding </param>
/// <param name="length"> The number of characters to fold </param>
/// <returns> length of output </returns>
private static int latinizeLuceneImpl(char[] input, int inputPos, ref char[] output, int outputPos, int length)
{
int end = inputPos + length;
for (int pos = inputPos; pos < end; ++pos)
{
char c = input[pos];
// Quick test: if it's not in range then just keep current character
if (c < '\u0080')
{
output[outputPos++] = c;
}
else
{
switch (c)
{
case '\u00C0': // À [LATIN CAPITAL LETTER A WITH GRAVE]
case '\u00C1': // Á [LATIN CAPITAL LETTER A WITH ACUTE]
case '\u00C2': // Â [LATIN CAPITAL LETTER A WITH CIRCUMFLEX]
case '\u00C3': // Ã [LATIN CAPITAL LETTER A WITH TILDE]
case '\u00C4': // Ä [LATIN CAPITAL LETTER A WITH DIAERESIS]
case '\u00C5': // Å [LATIN CAPITAL LETTER A WITH RING ABOVE]
case '\u0100': // Ā [LATIN CAPITAL LETTER A WITH MACRON]
case '\u0102': // Ă [LATIN CAPITAL LETTER A WITH BREVE]
case '\u0104': // Ą [LATIN CAPITAL LETTER A WITH OGONEK]
case '\u018F': // Ə http://en.wikipedia.org/wiki/Schwa [LATIN CAPITAL LETTER SCHWA]
case '\u01CD': // Ǎ [LATIN CAPITAL LETTER A WITH CARON]
case '\u01DE': // Ǟ [LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON]
case '\u01E0': // Ǡ [LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON]
case '\u01FA': // Ǻ [LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE]
case '\u0200': // Ȁ [LATIN CAPITAL LETTER A WITH DOUBLE GRAVE]
case '\u0202': // Ȃ [LATIN CAPITAL LETTER A WITH INVERTED BREVE]
case '\u0226': // Ȧ [LATIN CAPITAL LETTER A WITH DOT ABOVE]
case '\u023A': // Ⱥ [LATIN CAPITAL LETTER A WITH STROKE]
case '\u1D00': // ᴀ [LATIN LETTER SMALL CAPITAL A]
case '\u1E00': // Ḁ [LATIN CAPITAL LETTER A WITH RING BELOW]
case '\u1EA0': // Ạ [LATIN CAPITAL LETTER A WITH DOT BELOW]
case '\u1EA2': // Ả [LATIN CAPITAL LETTER A WITH HOOK ABOVE]
case '\u1EA4': // Ấ [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND ACUTE]
case '\u1EA6': // Ầ [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND GRAVE]
case '\u1EA8': // Ẩ [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE]
case '\u1EAA': // Ẫ [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND TILDE]
case '\u1EAC': // Ậ [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW]
case '\u1EAE': // Ắ [LATIN CAPITAL LETTER A WITH BREVE AND ACUTE]
case '\u1EB0': // Ằ [LATIN CAPITAL LETTER A WITH BREVE AND GRAVE]
case '\u1EB2': // Ẳ [LATIN CAPITAL LETTER A WITH BREVE AND HOOK ABOVE]
case '\u1EB4': // Ẵ [LATIN CAPITAL LETTER A WITH BREVE AND TILDE]
case '\u1EB6': // Ặ [LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW]
case '\u24B6': // Ⓐ [CIRCLED LATIN CAPITAL LETTER A]
case '\uFF21': // A [FULLWIDTH LATIN CAPITAL LETTER A]
output[outputPos++] = 'A';
break;
case '\u00E0': // à [LATIN SMALL LETTER A WITH GRAVE]
case '\u00E1': // á [LATIN SMALL LETTER A WITH ACUTE]
case '\u00E2': // â [LATIN SMALL LETTER A WITH CIRCUMFLEX]
case '\u00E3': // ã [LATIN SMALL LETTER A WITH TILDE]
case '\u00E4': // ä [LATIN SMALL LETTER A WITH DIAERESIS]
case '\u00E5': // å [LATIN SMALL LETTER A WITH RING ABOVE]
case '\u0101': // ā [LATIN SMALL LETTER A WITH MACRON]
case '\u0103': // ă [LATIN SMALL LETTER A WITH BREVE]
case '\u0105': // ą [LATIN SMALL LETTER A WITH OGONEK]
case '\u01CE': // ǎ [LATIN SMALL LETTER A WITH CARON]
case '\u01DF': // ǟ [LATIN SMALL LETTER A WITH DIAERESIS AND MACRON]
case '\u01E1': // ǡ [LATIN SMALL LETTER A WITH DOT ABOVE AND MACRON]
case '\u01FB': // ǻ [LATIN SMALL LETTER A WITH RING ABOVE AND ACUTE]
case '\u0201': // ȁ [LATIN SMALL LETTER A WITH DOUBLE GRAVE]
case '\u0203': // ȃ [LATIN SMALL LETTER A WITH INVERTED BREVE]
case '\u0227': // ȧ [LATIN SMALL LETTER A WITH DOT ABOVE]
case '\u0250': // ɐ [LATIN SMALL LETTER TURNED A]
case '\u0259': // ə [LATIN SMALL LETTER SCHWA]
case '\u025A': // ɚ [LATIN SMALL LETTER SCHWA WITH HOOK]
case '\u1D8F': // ᶏ [LATIN SMALL LETTER A WITH RETROFLEX HOOK]
case '\u1D95': // ᶕ [LATIN SMALL LETTER SCHWA WITH RETROFLEX HOOK]
case '\u1E01': // ạ [LATIN SMALL LETTER A WITH RING BELOW]
case '\u1E9A': // ả [LATIN SMALL LETTER A WITH RIGHT HALF RING]
case '\u1EA1': // ạ [LATIN SMALL LETTER A WITH DOT BELOW]
case '\u1EA3': // ả [LATIN SMALL LETTER A WITH HOOK ABOVE]
case '\u1EA5': // ấ [LATIN SMALL LETTER A WITH CIRCUMFLEX AND ACUTE]
case '\u1EA7': // ầ [LATIN SMALL LETTER A WITH CIRCUMFLEX AND GRAVE]
case '\u1EA9': // ẩ [LATIN SMALL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE]
case '\u1EAB': // ẫ [LATIN SMALL LETTER A WITH CIRCUMFLEX AND TILDE]
case '\u1EAD': // ậ [LATIN SMALL LETTER A WITH CIRCUMFLEX AND DOT BELOW]
case '\u1EAF': // ắ [LATIN SMALL LETTER A WITH BREVE AND ACUTE]
case '\u1EB1': // ằ [LATIN SMALL LETTER A WITH BREVE AND GRAVE]
case '\u1EB3': // ẳ [LATIN SMALL LETTER A WITH BREVE AND HOOK ABOVE]
case '\u1EB5': // ẵ [LATIN SMALL LETTER A WITH BREVE AND TILDE]
case '\u1EB7': // ặ [LATIN SMALL LETTER A WITH BREVE AND DOT BELOW]
case '\u2090': // ₐ [LATIN SUBSCRIPT SMALL LETTER A]
case '\u2094': // ₔ [LATIN SUBSCRIPT SMALL LETTER SCHWA]
case '\u24D0': // ⓐ [CIRCLED LATIN SMALL LETTER A]
case '\u2C65': // ⱥ [LATIN SMALL LETTER A WITH STROKE]
case '\u2C6F': // Ɐ [LATIN CAPITAL LETTER TURNED A]
case '\uFF41': // a [FULLWIDTH LATIN SMALL LETTER A]
output[outputPos++] = 'a';
break;
case '\uA732': // Ꜳ [LATIN CAPITAL LETTER AA]
output[outputPos++] = 'A';
output[outputPos++] = 'A';
break;
case '\u00C6': // Æ [LATIN CAPITAL LETTER AE]
case '\u01E2': // Ǣ [LATIN CAPITAL LETTER AE WITH MACRON]
case '\u01FC': // Ǽ [LATIN CAPITAL LETTER AE WITH ACUTE]
case '\u1D01': // ᴁ [LATIN LETTER SMALL CAPITAL AE]
output[outputPos++] = 'A';
output[outputPos++] = 'E';
break;
case '\uA734': // Ꜵ [LATIN CAPITAL LETTER AO]
output[outputPos++] = 'A';
output[outputPos++] = 'O';
break;
case '\uA736': // Ꜷ [LATIN CAPITAL LETTER AU]
output[outputPos++] = 'A';
output[outputPos++] = 'U';
break;
// etc. etc. etc.
// see link above for complete source code
//
// unfortunately, postings are limited, as in
// "Body is limited to 30000 characters; you entered 136098."
[...]
case '\u2053': // ⁓ [SWUNG DASH]
case '\uFF5E': // ~ [FULLWIDTH TILDE]
output[outputPos++] = '~';
break;
default:
output[outputPos++] = c;
break;
}
}
}
return outputPos;
}
}