pan*_*ang 1 mysql elasticsearch
我想通过Elasticsearch对电子邮件或电话号码进行模糊匹配.例如:
匹配所有以 @gmail.com 结尾的电子邮件
或者
匹配所有以136开头的电话号码.
我知道我可以使用通配符,
{
  "query": {
    "wildcard": {
      "email": "*gmail.com"
    }
  }
}
Run Code Online (Sandbox Code Playgroud)
但性能很差.我尝试使用regexp:
{
  "query": {
    "regexp": {
      "email": {
        "value": "*163\.com*"
      }
    }
  }
}
Run Code Online (Sandbox Code Playgroud)
但这不起作用.
是否有更好的方法来实现这一点?
curl -XGET localhost:9200/user_data
{
  "user_data": {
    "aliases": {},
    "mappings": {
      "user_data": {
        "properties": {
          "address": {
            "type": "string"
          },
          "age": {
            "type": "long"
          },
          "comment": {
            "type": "string"
          },
          "created_on": {
            "type": "date",
            "format": "dateOptionalTime"
          },
          "custom": {
            "properties": {
              "key": {
                "type": "string"
              },
              "value": {
                "type": "string"
              }
            }
          },
          "gender": {
            "type": "string"
          },
          "name": {
            "type": "string"
          },
          "qq": {
            "type": "string"
          },
          "tel": {
            "type": "string"
          },
          "updated_on": {
            "type": "date",
            "format": "dateOptionalTime"
          }
        }
      }
    },
    "settings": {
      "index": {
        "creation_date": "1458832279465",
        "uuid": "Fbmthc3lR0ya51zCnWidYg",
        "number_of_replicas": "1",
        "number_of_shards": "5",
        "version": {
          "created": "1070299"
        }
      }
    },
    "warmers": {}
  }
}
Run Code Online (Sandbox Code Playgroud)
映射:
{
  "settings": {
    "analysis": {
      "analyzer": {
        "index_phone_analyzer": {
          "type": "custom",
          "char_filter": ["digit_only"],
          "tokenizer": "digit_edge_ngram_tokenizer",
          "filter": ["trim"]
        },
        "search_phone_analyzer": {
          "type": "custom",
          "char_filter": ["digit_only"],
          "tokenizer": "keyword",
          "filter": ["trim"]
        },
        "index_email_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": ["lowercase", "name_ngram_filter", "trim"]
        },
        "search_email_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": ["lowercase", "trim"]
        }
      },
      "char_filter": {
        "digit_only": {
          "type": "pattern_replace",
          "pattern": "\\D+",
          "replacement": ""
        }
      },
      "tokenizer": {
        "digit_edge_ngram_tokenizer": {
          "type": "edgeNGram",
          "min_gram": "3",
          "max_gram": "15",
          "token_chars": ["digit"]
        }
      },
      "filter": {
        "name_ngram_filter": {
          "type": "ngram",
          "min_gram": "3",
          "max_gram": "20"
        }
      }
    }
  },
  "mappings": {
    "user_data": {
      "properties": {
        "name": {
          "type": "string",
          "analyzer": "ik"
        },
        "age": {
          "type": "integer"
        },
        "gender": {
          "type": "string"
        },
        "qq": {
          "type": "string"
        },
        "email": {
          "type": "string",
          "analyzer": "index_email_analyzer",
          "search_analyzer": "search_email_analyzer"
        },
        "tel": {
          "type": "string",
          "analyzer": "index_phone_analyzer",
          "search_analyzer": "search_phone_analyzer"
        },
        "address": {
          "type": "string",
          "analyzer": "ik"
        },
        "comment": {
          "type": "string",
          "analyzer": "ik"
        },
        "created_on": {
          "type": "date",
          "format": "dateOptionalTime"
        },
        "updated_on": {
          "type": "date",
          "format": "dateOptionalTime"
        },
        "custom": {
          "type": "nested",
          "properties": {
            "key": {
              "type": "string"
            },
            "value": {
              "type": "string"
            }
          }
        }
      }
    }
  }
}
Run Code Online (Sandbox Code Playgroud)
一种简单的方法是创建自定义分析器:对电子邮件使用n-gram标记过滤器(=>见下面的index_email_analyzer和search_email_analyzer,以及用于精确电子邮件匹配的email_url_analyzer),对电话使用edge-ngram标记器(=>见下面的index_phone_analyzer和search_phone_analyzer).
完整的索引定义如下.
PUT myindex
{
  "settings": {
    "analysis": {
      "analyzer": {
        "email_url_analyzer": {
          "type": "custom",
          "tokenizer": "uax_url_email",
          "filter": ["trim"]
        },
        "index_phone_analyzer": {
          "type": "custom",
          "char_filter": ["digit_only"],
          "tokenizer": "digit_edge_ngram_tokenizer",
          "filter": ["trim"]
        },
        "search_phone_analyzer": {
          "type": "custom",
          "char_filter": ["digit_only"],
          "tokenizer": "keyword",
          "filter": ["trim"]
        },
        "index_email_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": ["lowercase", "name_ngram_filter", "trim"]
        },
        "search_email_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": ["lowercase", "trim"]
        }
      },
      "char_filter": {
        "digit_only": {
          "type": "pattern_replace",
          "pattern": "\\D+",
          "replacement": ""
        }
      },
      "tokenizer": {
        "digit_edge_ngram_tokenizer": {
          "type": "edgeNGram",
          "min_gram": "1",
          "max_gram": "15",
          "token_chars": ["digit"]
        }
      },
      "filter": {
        "name_ngram_filter": {
          "type": "ngram",
          "min_gram": "1",
          "max_gram": "20"
        }
      }
    }
  },
  "mappings": {
    "your_type": {
      "properties": {
        "email": {
          "type": "string",
          "analyzer": "index_email_analyzer",
          "search_analyzer": "search_email_analyzer"
        },
        "phone": {
          "type": "string",
          "analyzer": "index_phone_analyzer",
          "search_analyzer": "search_phone_analyzer"
        }
      }
    }
  }
}
Run Code Online (Sandbox Code Playgroud)
现在,让我们一点一点地剖析它.
对于phone字段,思路是使用index_phone_analyzer,它通过edge-ngram标记器对电话值建立索引,从而索引电话号码的所有前缀.因此,如果您的电话号码是1362435647,将产生以下标记:1,13,136,1362,13624,136243,1362435,13624356,136243564,1362435647.
然后在搜索时,我们使用另一个分析器search_phone_analyzer,它只是接收输入的数字(例如136),并使用简单的match或term查询将其与phone字段匹配:
POST myindex/_search
{
  "query": {
    "term": {
      "phone": "136"
    }
  }
}
Run Code Online (Sandbox Code Playgroud)
对于email字段,我们以类似的方式处理:index_email_analyzer使用ngram标记过滤器对电子邮件值建立索引,该过滤器会生成可以从电子邮件值中取出的所有可能长度(1到20个字符之间)的标记.例如:john@gmail.com将被切分为j,jo,joh,... gmail.com,... john@gmail.com.
然后在搜索时,我们使用另一个名为search_email_analyzer的分析器来处理输入,并尝试将其与已索引的标记匹配.
POST myindex/_search
{
  "query": {
    "match": {
      "email": "@gmail.com"
    }
  }
}
Run Code Online (Sandbox Code Playgroud)
本例中并未使用email_url_analyzer分析器,但我仍将其包含在内,以备您需要匹配确切的电子邮件值.