有没有人有一个值得信赖的Proper Case或PCase算法(类似于UCase或Upper)?我在寻找的东西取一个值,如"GEORGE BURDELL"或"george burdell"并将其转化为"George Burdell".
我已经有一个能处理简单情况的简单实现.理想情况下,它还应该能处理诸如"O'REILLY"这样的输入并将其转换为"O'Reilly",但我知道这更难.
如果这简化了事情,我主要关注英语.
更新:我使用C#作为语言,但我可以从几乎任何东西转换(假设存在功能).
我同意McDonald's这种情况很棘手.我本想把它和我的O'Reilly例子一起提出来,但在原帖中没有提到.
Mar*_*son 18
除非我误解了你的问题,否则我认为你不需要自己动手,TextInfo类可以为你做.
using System.Globalization;
// Title-cases the sample text with the invariant culture's TextInfo; per the accompanying answer
// this yields "George Burdell". The answer also notes that all-upper-case input is treated as an
// acronym and left alone, so lower-case such input before passing it in.
CultureInfo.InvariantCulture.TextInfo.ToTitleCase("GeOrGE bUrdEll")
Run Code Online (Sandbox Code Playgroud)
将返回"George Burdell".如果涉及一些特殊规则,您可以使用自己的文化(CultureInfo).
更新: 迈克尔(在对此答案的评论中)指出,如果输入全部是大写,这将不起作用,因为该方法会假定它是首字母缩略词.一个简单的解决方法是在把文本传给ToTitleCase之前先对其调用.ToLower().
@Zack:我会将其作为单独的回复发布.
这是一个基于kronoz帖子的例子.
// Demo entry point: runs a set of awkward sample names through ToProperCase and dumps the results.
void Main()
{
    var sampleNames = new List<string>
    {
        "bill o'reilly",
        "johannes diderik van der waals",
        "mr. moseley-williams",
        "Joe VanWyck",
        "mcdonald's",
        "william the third",
        "hrh prince charles",
        "h.r.m. queen elizabeth the third",
        "william gates, iii",
        "pope leo xii",
        "a.k. jennings"
    };

    // Dump() is not defined in this snippet - presumably LINQPad's output extension; confirm in the host.
    IEnumerable<string> properCased = sampleNames.Select(rawName => rawName.ToProperCase());
    properCased.Dump();
}
// http://stackoverflow.com/questions/32149/does-anyone-have-a-good-proper-case-algorithm
/// <summary>
/// Heuristic "proper case" conversion for personal names, aimed mainly at English input.
/// Casing uses invariant-culture rules so results do not depend on the current thread culture.
/// </summary>
public static class ProperCaseHelper
{
    // Characters after which the next letter starts a new capitalised segment
    // (D'Artagnon, A.K., Oscar-Meyer-Weiner).
    private static readonly char[] SegmentSeparators = { '\'', '.', '-' };

    // Words that always take exactly this casing; compared case-insensitively against a whole word.
    private static readonly string[] FixedCaseWords =
    {
        "van",    // Dick van Dyke
        "von",    // Baron von Bruin-Valt
        "de",
        "di",
        "da",     // Leonardo da Vinci, Eduardo da Silva
        "of",     // The Grand Old Duke of York
        "the",    // William the Conqueror
        "HRH",    // His/Her Royal Highness
        "HRM",    // His/Her Royal Majesty
        "H.R.H.", // His/Her Royal Highness
        "H.R.M."  // His/Her Royal Majesty
    };

    // Roman Numeral parser thanks to [Hannobo](https://stackoverflow.com/users/785111/hannobo)
    // Note that it excludes the Chinese last name Xi.
    // Built once instead of once per word.
    private static readonly Regex RomanNumeralPattern = new Regex(
        @"\b(?!Xi\b)(X|XX|XXX|XL|L|LX|LXX|LXXX|XC|C)?(I|II|III|IV|V|VI|VII|VIII|IX)?\b",
        RegexOptions.IgnoreCase);

    /// <summary>
    /// Converts an ALL UPPERCASE or all lowercase name to proper case, e.g.
    /// "GEORGE BURDELL" becomes "George Burdell" and "bill o'reilly" becomes "Bill O'Reilly".
    /// </summary>
    /// <param name="input">The name to convert; words are separated by single spaces.</param>
    /// <returns>
    /// The proper-cased name. Null, empty and mixed-case input (e.g. "Joe VanWyck") is returned unchanged.
    /// </returns>
    public static string ToProperCase(this string input)
    {
        // Null/empty has nothing to case; previously null caused a NullReferenceException.
        if (string.IsNullOrEmpty(input))
        {
            return input;
        }

        // Leave the CamelCase or Propercase names alone - the author already chose a casing.
        if (!IsAllUpperOrAllLower(input))
        {
            return input;
        }

        // Fix the ALL UPPERCASE or all lowercase names word by word.
        return string.Join(" ", input.Split(' ').Select(word => WordToProperCase(word)));
    }

    /// <summary>
    /// Tells whether the text contains no lower-case letters or no upper-case letters.
    /// </summary>
    /// <param name="input">The text to inspect; must not be null.</param>
    /// <returns>True when lower-casing or upper-casing the text leaves it unchanged.</returns>
    public static bool IsAllUpperOrAllLower(this string input)
    {
        return input.ToLowerInvariant().Equals(input) || input.ToUpperInvariant().Equals(input);
    }

    // Proper-cases a single space-delimited word.
    private static string WordToProperCase(string word)
    {
        if (string.IsNullOrEmpty(word))
        {
            return word;
        }

        string result = CapitaliseSegments(word);

        foreach (string fixedCaseWord in FixedCaseWords)
        {
            if (result.Equals(fixedCaseWord, StringComparison.InvariantCultureIgnoreCase))
            {
                result = fixedCaseWord;
            }
        }

        // William Gates, III
        return RomanNumeralPattern.Replace(result, match => match.Value.ToUpperInvariant());
    }

    // Lower-cases the word, then capitalises every segment delimited by ', . or -.
    // Handling all separators in one pass fixes two defects of the per-separator approach:
    // only the first occurrence was processed, and a later pass re-lowercased earlier results.
    private static string CapitaliseSegments(string word)
    {
        char[] chars = word.ToLowerInvariant().ToCharArray();
        int segmentStart = 0;

        // Run one index past the end so the final segment is flushed too.
        for (int i = 0; i <= chars.Length; i++)
        {
            bool atSegmentEnd = i == chars.Length || Array.IndexOf(SegmentSeparators, chars[i]) >= 0;
            if (!atSegmentEnd)
            {
                continue;
            }

            CapitaliseSegment(chars, segmentStart, i - segmentStart);
            segmentStart = i + 1;
        }

        return new string(chars);
    }

    // Capitalises one already lower-cased segment in place, including the Scots Mc/Mac prefixes.
    private static void CapitaliseSegment(char[] chars, int start, int length)
    {
        if (length == 0)
        {
            return;
        }

        // A single letter after an apostrophe at the very end of the word is a suffix such as
        // the possessive 's (Yuri's, McDonald's) and stays lower case.
        bool isTrailingSuffixLetter = length == 1
            && start > 0
            && chars[start - 1] == '\''
            && start + 1 == chars.Length;
        if (isTrailingSuffixLetter)
        {
            return;
        }

        chars[start] = char.ToUpperInvariant(chars[start]);

        // The prefixes are honoured only at the start of a segment (so "tomcat" is untouched), and
        // only for segments long enough to exclude short names: Mac needs more than 5 letters
        // (excludes Macey), Mc more than 4.
        bool hasMacPrefix = length > 5
            && chars[start] == 'M' && chars[start + 1] == 'a' && chars[start + 2] == 'c';
        bool hasMcPrefix = length > 4
            && chars[start] == 'M' && chars[start + 1] == 'c';

        if (hasMacPrefix)
        {
            chars[start + 3] = char.ToUpperInvariant(chars[start + 3]);
        }
        else if (hasMcPrefix)
        {
            chars[start + 2] = char.ToUpperInvariant(chars[start + 2]);
        }
    }
}
Run Code Online (Sandbox Code Playgroud)
我做了一个https://github.com/tamtamchik/namecase的快速 C# 移植,它基于 Lingua::EN::NameCase。
/// <summary>
/// Name-casing helper: a C# port of the namecase library (itself based on Lingua::EN::NameCase).
/// </summary>
public static class CIQNameCase
{
    // Names that begin with "Mac" but are not "Mac + Name"; applied after the Mac/Mc pass
    // to undo the inner capital it introduces.
    private static readonly Dictionary<string, string> _exceptions = new Dictionary<string, string>
    {
        {@"\bMacEdo"     ,"Macedo"},
        {@"\bMacEvicius" ,"Macevicius"},
        {@"\bMacHado"    ,"Machado"},
        {@"\bMacHar"     ,"Machar"},
        {@"\bMacHin"     ,"Machin"},
        {@"\bMacHlin"    ,"Machlin"},
        {@"\bMacIas"     ,"Macias"},
        {@"\bMacIulis"   ,"Maciulis"},
        {@"\bMacKie"     ,"Mackie"},
        {@"\bMacKle"     ,"Mackle"},
        {@"\bMacKlin"    ,"Macklin"},
        {@"\bMacKmin"    ,"Mackmin"},
        {@"\bMacQuarie"  ,"Macquarie"}
    };

    // Regex pattern -> replacement for particles that stay lower case.
    private static readonly Dictionary<string, string> _replacements = new Dictionary<string, string>
    {
        {@"\bAl(?=\s+\w)"         , @"al"},     // al Arabic or forename Al.
        {@"\b(Bin|Binti|Binte)\b" , @"bin"},    // bin, binti, binte Arabic
        {@"\bAp\b"                , @"ap"},     // ap Welsh.
        {@"\bBen(?=\s+\w)"        , @"ben"},    // ben Hebrew or forename Ben.
        {@"\bDell([ae])\b"        , @"dell$1"}, // della and delle Italian.
        {@"\bD([aeiou])\b"        , @"d$1"},    // da, de, di Italian; du French; do Brasil
        {@"\bD([ao]s)\b"          , @"d$1"},    // das, dos Brasileiros
        {@"\bDe([lrn])\b"         , @"de$1"},   // del Italian; der/den Dutch/Flemish.
        {@"\bEl\b"                , @"el"},     // el Greek or El Spanish.
        {@"\bLa\b"                , @"la"},     // la French or La Spanish.
        {@"\bL([eo])\b"           , @"l$1"},    // lo Italian; le French.
        {@"\bVan(?=\s+\w)"        , @"van"},    // van German or forename Van.
        {@"\bVon\b"               , @"von"}     // von Dutch/Flemish
    };

    private static readonly string[] _conjunctions = { "Y", "E", "I" };

    private static readonly string _romanRegex = @"\b((?:[Xx]{1,3}|[Xx][Ll]|[Ll][Xx]{0,3})?(?:[Ii]{1,3}|[Ii][VvXx]|[Vv][Ii]{0,3})?)\b";

    // "Mc" followed by letters, or "Mac" followed by at least two letters that run up to a word
    // boundary (so "Mack" is not split). Group 1 is the prefix, group 2 the rest of the name.
    private static readonly Regex _macPrefixRegex = new Regex(@"\b(Mc|Mac(?=[A-Za-z]{2,}\b))([A-Za-z]+)");

    /// <summary>
    /// Case a name field into its appropriate case format
    /// e.g. Smith, de la Cruz, Mary-Jane, O'Brien, McTaggart
    /// </summary>
    /// <param name="nameString">The name in any casing.</param>
    /// <returns>The re-cased name; null or empty input is returned unchanged.</returns>
    public static string NameCase(string nameString)
    {
        // Previously null caused a NullReferenceException in Capitalize.
        if (string.IsNullOrEmpty(nameString))
        {
            return nameString;
        }

        // Capitalize
        nameString = Capitalize(nameString);
        nameString = UpdateIrish(nameString);

        // Fixes for "son (daughter) of" etc. Regex.Replace is a no-op when nothing matches,
        // so no separate IsMatch probe is needed.
        foreach (var replacement in _replacements)
        {
            nameString = Regex.Replace(nameString, replacement.Key, replacement.Value);
        }

        nameString = UpdateRoman(nameString);
        nameString = FixConjunction(nameString);

        return nameString;
    }

    /// <summary>
    /// Capitalize first letters.
    /// </summary>
    /// <param name="nameString">The name in any casing.</param>
    /// <returns>The name lower-cased, with the first character of every word upper-cased.</returns>
    private static string Capitalize(string nameString)
    {
        nameString = nameString.ToLowerInvariant();
        nameString = Regex.Replace(nameString, @"\b\w", x => x.Value.ToUpperInvariant());
        nameString = Regex.Replace(nameString, @"'\w\b", x => x.Value.ToLowerInvariant()); // Lowercase 's
        return nameString;
    }

    /// <summary>
    /// Update for Irish names: capitalises the letter following every Mac/Mc prefix.
    /// </summary>
    /// <param name="nameString">A name already processed by Capitalize.</param>
    /// <returns>The name with Mac/Mc names fixed and the "Mac" exceptions restored.</returns>
    private static string UpdateIrish(string nameString)
    {
        // Each match is handled on its own, so names containing several Mac/Mc words are fixed
        // too (the earlier version only acted when the whole string had exactly one match).
        string updated = _macPrefixRegex.Replace(nameString, match =>
        {
            string rest = match.Groups[2].Value;
            return match.Groups[1].Value + char.ToUpperInvariant(rest[0]) + rest.Substring(1);
        });

        if (updated == nameString)
        {
            return nameString;
        }

        // Now fix "Mac" exceptions
        foreach (var exception in _exceptions)
        {
            updated = Regex.Replace(updated, exception.Key, exception.Value);
        }

        return updated;
    }

    /// <summary>
    /// Fix roman numeral names.
    /// </summary>
    /// <param name="nameString">A name already processed by Capitalize.</param>
    /// <returns>The name with whole-word roman numerals upper-cased.</returns>
    private static string UpdateRoman(string nameString)
    {
        // Upper-case each match where it was found. Re-using the matched text as an unanchored
        // pattern also rewrote the same letters inside other words ("Ivan Iv" became "IVan IV").
        return Regex.Replace(nameString, _romanRegex, x => x.Value.ToUpperInvariant());
    }

    /// <summary>
    /// Fix Spanish conjunctions.
    /// </summary>
    /// <param name="nameString">A name already processed by Capitalize.</param>
    /// <returns>The name with stand-alone Y, E and I lower-cased.</returns>
    private static string FixConjunction(string nameString)
    {
        foreach (var conjunction in _conjunctions)
        {
            nameString = Regex.Replace(nameString, @"\b" + conjunction + @"\b", x => x.Value.ToLowerInvariant());
        }
        return nameString;
    }
}
Run Code Online (Sandbox Code Playgroud)

用法

// Example call: case a single surname.
var casedName = CIQNameCase.NameCase("McCarthy");
Run Code Online (Sandbox Code Playgroud)

这是我的测试方法,一切似乎都通过了:

/// <summary>
/// Upper-cases each expected name, runs it through CIQNameCase.NameCase and checks that the
/// original casing comes back.
/// </summary>
[TestMethod]
public void Test_NameCase_1()
{
    string[] names = {
        "Keith", "Yuri's", "Leigh-Williams", "McCarthy",
        // Mac exceptions
        "Machin", "Machlin", "Machar",
        "Mackle", "Macklin", "Mackie",
        "Macquarie", "Machado", "Macevicius",
        "Maciulis", "Macias", "MacMurdo",
        // General
        "O'Callaghan", "St. John", "von Streit",
        "van Dyke", "Van", "ap Llwyd Dafydd",
        "al Fahd", "Al",
        "el Grecco",
        "ben Gurion", "Ben",
        "da Vinci",
        "di Caprio", "du Pont", "de Legate",
        "del Crond", "der Sind", "van der Post", "van den Thillart",
        "von Trapp", "la Poisson", "le Figaro",
        "Mack Knife", "Dougal MacDonald",
        // The last name was stored as the raw UTF-8 byte escapes \xc3\xb3, which C# reads as
        // two separate characters; it is spelled with the real character here.
        "Ruiz y Picasso", "Dato e Iradier", "Mas i Gavarró",
        // Roman numerals
        "Henry VIII", "Louis III", "Louis XIV",
        "Charles II", "Fred XLIX", "Yusof bin Ishak",
    };

    foreach (string name in names)
    {
        string name_upper = name.ToUpperInvariant();
        string name_cased = CIQNameCase.NameCase(name_upper);
        Console.WriteLine("name: {0} -> {1} -> {2}", name, name_upper, name_cased);

        // AreEqual reports both values on failure, unlike IsTrue(name == name_cased).
        Assert.AreEqual(name, name_cased);
    }
}
Run Code Online (Sandbox Code Playgroud)
| 归档时间: |
|
| 查看次数: |
10916 次 |
| 最近记录: |