Zer*_*ama 6 c# search levenshtein-distance
我的申请中有员工名单。每个员工都有名字和姓氏,所以我有一个元素列表,例如:
["Jim Carry", "Uma Turman", "Bill Gates", "John Skeet"]
Run Code Online (Sandbox Code Playgroud)
我希望我的客户具有使用模糊搜索算法按名称搜索员工的功能。例如,如果用户输入“ Yuma Turmon”,则将返回最接近的元素“ Uma Turman”。我在这里找到Levenshtein距离算法。
static class LevenshteinDistance
{
/// <summary>
/// Compute the distance between two strings.
/// </summary>
public static int Compute(string s, string t)
{
int n = s.Length;
int m = t.Length;
int[,] d = new int[n + 1, m + 1];
// Step 1
if (n == 0)
{
return m;
}
if (m == 0)
{
return n;
}
// Step 2
for (int i = 0; i <= n; d[i, 0] = i++)
{
}
for (int j = 0; j <= m; d[0, j] = j++)
{
}
// Step 3
for (int i = 1; i <= n; i++)
{
//Step 4
for (int j = 1; j <= m; j++)
{
// Step 5
int cost = (t[j - 1] == s[i - 1]) ? 0 : 1;
// Step 6
d[i, j] = Math.Min(
Math.Min(d[i - 1, j] + 1, d[i, j - 1] + 1),
d[i - 1, j - 1] + cost);
}
}
// Step 7
return d[n, m];
}
}
Run Code Online (Sandbox Code Playgroud)
我在员工姓名列表上迭代用户的输入(全名)并比较距离。例如,如果它小于3,我将返回找到的雇员。
现在,我希望允许用户按反向名称进行搜索-例如,如果用户输入“ Turmon Uma”,则它将返回“ Uma Turman”,因为实际的实际距离是1,因为“名字”和“姓氏”与“姓氏和名字”相同名称。我的算法现在把它算作很远的不同字符串。我该如何修改它以便找到名称而不管顺序如何?
一些想法,因为这是一个可能需要解决的复杂问题:
\n\nJohn Smith,则找到 的最佳单个单词名称匹配John,然后匹配 上那些“最佳匹配”员工的剩余名称Smith,并获得距离总和。然后找到 的最佳匹配项Smith并匹配 上的剩余名称John,并对距离求和。最佳匹配是总距离最短的匹配。您可以通过返回前 10 个匹配项(例如按总距离排序)来提供最佳匹配项列表。数据库中的名称或搜索词的位置并不重要。事实上,它们可能完全失序,但这并不重要。\xc3\xa1。您的算法将无法与它们一起正常工作。如果您希望使用非字母双字节字符,请更加小心,例如。中文、日语、阿拉伯语等拆分每位员工的姓名还有两个好处:
\n\nWells-Harvey)、复合姓名(WellsHarvey)和个人姓名(Wells和Harvey单独的姓名),所有这些都针对同一员工。任何一个姓名的低距离匹配就是该员工的低距离匹配,同样,额外的姓名不计入总数。下面是一些似乎有效的基本代码,但它只真正考虑了第 1、2 和 4 点:
\n\nusing System;\nusing System.Collections.Generic;\nusing System.Linq;\n\nnamespace EmployeeSearch\n{\n static class Program\n {\n static List<string> EmployeesList = new List<string>() { "Jim Carrey", "Uma Thurman", "Bill Gates", "Jon Skeet" };\n static Dictionary<int, List<string>> employeesById = new Dictionary<int, List<string>>();\n static Dictionary<string, List<int>> employeeIdsByName = new Dictionary<string, List<int>>();\n\n static void Main()\n {\n Init();\n var results = FindEmployeeByNameFuzzy("Umaa Thurrmin");\n // Returns:\n // (1) Uma Thurman Distance: 3\n // (0) Jim Carrey Distance: 10\n // (3) Jon Skeet Distance: 11\n // (2) Bill Gates Distance: 12\n Console.WriteLine(string.Join("\\r\\n", results.Select(r => $"({r.Id}) {r.Name} Distance: {r.Distance}")));\n\n var results = FindEmployeeByNameFuzzy("Tormin Oma");\n // Returns:\n // (1) Uma Thurman Distance: 4\n // (3) Jon Skeet Distance: 7\n // (0) Jim Carrey Distance: 8\n // (2) Bill Gates Distance: 9\n Console.WriteLine(string.Join("\\r\\n", results.Select(r => $"({r.Id}) {r.Name} Distance: {r.Distance}")));\n\n\n Console.Read();\n }\n\n private static void Init() // prepare our lists\n {\n for (int i = 0; i < EmployeesList.Count; i++)\n {\n // Preparing the list of names for each employee - add special cases such as hyphenation here as well\n var names = EmployeesList[i].ToLower().Split(new char[] { \' \' }).ToList();\n employeesById.Add(i, names);\n\n // This is not used here, but could come in handy if you want a unique index of names pointing to employee ids for optimisation:\n foreach (var name in names)\n {\n if (employeeIdsByName.ContainsKey(name))\n {\n employeeIdsByName[name].Add(i);\n }\n else\n {\n employeeIdsByName.Add(name, new List<int>() { i });\n }\n }\n }\n\n }\n\n private static List<SearchResult> FindEmployeeByNameFuzzy(string query)\n {\n var results = new List<SearchResult>();\n\n // Notice we\'re splitting the search terms the same way as we split the employee names above (could be refactored out into a helper method)\n var searchterms = query.ToLower().Split(new char[] { \' \' });\n\n // Comparison with each employee\n for (int i = 0; i < employeesById.Count; i++)\n {\n var r = new SearchResult() { Id = i, Name = EmployeesList[i] };\n var employeenames = employeesById[i];\n foreach (var searchterm in searchterms)\n {\n int min = searchterm.Length;\n\n // for each search term get the min distance for all names for this employee\n foreach (var name in employeenames)\n {\n var distance = LevenshteinDistance.Compute(searchterm, name);\n min = Math.Min(min, distance);\n }\n\n // Sum the minimums for all search terms\n r.Distance += min;\n }\n results.Add(r);\n }\n\n // Order by lowest distance first\n return results.OrderBy(e => e.Distance).ToList();\n }\n }\n\n public class SearchResult\n {\n public int Distance { get; set; }\n public int Id { get; set; }\n public string Name { get; set; }\n }\n\n public static class LevenshteinDistance\n {\n /// <summary>\n /// Compute the distance between two strings.\n /// </summary>\n public static int Compute(string s, string t)\n {\n int n = s.Length;\n int m = t.Length;\n int[,] d = new int[n + 1, m + 1];\n\n // Step 1\n if (n == 0)\n {\n return m;\n }\n\n if (m == 0)\n {\n return n;\n }\n\n // Step 2\n for (int i = 0; i <= n; d[i, 0] = i++)\n {\n }\n\n for (int j = 0; j <= m; d[0, j] = j++)\n {\n }\n\n // Step 3\n for (int i = 1; i <= n; i++)\n {\n //Step 4\n for (int j = 1; j <= m; j++)\n {\n // Step 5\n int cost = (t[j - 1] == s[i - 1]) ? 0 : 1;\n\n // Step 6\n d[i, j] = Math.Min(\n Math.Min(d[i - 1, j] + 1, d[i, j - 1] + 1),\n d[i - 1, j - 1] + cost);\n }\n }\n // Step 7\n return d[n, m];\n }\n }\n}\nRun Code Online (Sandbox Code Playgroud)\n\n只需Init()在开始时致电,然后致电
var results = FindEmployeeByNameFuzzy(userquery);\nRun Code Online (Sandbox Code Playgroud)\n\n返回最佳匹配的有序列表。
\n\n免责声明:此代码不是最佳的,仅经过简单测试,不检查空值,可能会爆炸并杀死小猫等。如果您有大量员工,那么这可能会非常慢。可以进行一些改进,例如,在循环 Levenshtein 算法时,如果距离超过当前的最小距离,您可能会退出。
\n