使用 elasticsearch-dsl-py 为文档建立索引并执行 percolate(渗透)查询

Sal*_*Din 7 python elasticsearch elasticsearch-dsl elasticsearch-dsl-py

我正在做一项关于信息检索的研究。我有一个包含文章列表的 JSON 文件,需要为这些文章建立索引,然后使用带高亮显示的 percolator(渗透器)进行查询。

在终端中完成这一过程的步骤如下:
1. 创建带有 percolator 字段的映射(mapping):

curl -XPUT 'localhost:9200/my-index?pretty' -H 'Content-Type: application/json' -d'
{
    "mappings": {
        "_doc": {
            "properties": {
                "title": {
                    "type": "text"
                },
                "query": {
                    "type": "percolator"
                }
            }
        }
    }
}
'
Run Code Online (Sandbox Code Playgroud)
  2. 索引一篇新文章:

    curl -XPUT 'localhost:9200/my-index/_doc/1?refresh&pretty' -H 'Content-Type: application/json' -d'
    {
        "CourseId":35,
          "UnitId":12390,
          "id":"16069",
          "CourseName":"ARK102U_ARKEOLOJİK ALAN YÖNETİMİ",
          "FieldId":8,
          "field":"TARİH",
        "query": {
            "span_near" : {
                "clauses" : [
                    { "span_term" : { "title" : "dünya" } },
                    { "span_term" : { "title" : "mirası" } },
                    { "span_term" : { "title" : "sözleşmesi" } }
                ],
                "slop" : 0,
                "in_order" : true
            }
        }
    }
    '
    
    Run Code Online (Sandbox Code Playgroud)
  3. 对文档执行 percolate(渗透)查询:

    curl -XGET 'localhost:9200/my-index/_search?pretty' -H 'Content-Type: application/json' -d'
    {
        "query" : {
            "percolate" : {
                "field" : "query",
                "document" : {
                    "title" : "Arkeoloji, arkeolojik yöntemlerle ortaya çıkarılmış kültürleri, dünya mirası sözleşmesi sosyoloji, coğrafya, tarih, etnoloji gibi birçok bilim dalından yararlanarak araştıran ve inceleyen bilim dalıdır. Türkçeye yanlış bir şekilde> \"kazıbilim\" olarak çevrilmiş olsa da kazı, arkeolojik araştırma yöntemlerinden sadece bir tanesidir."
                }
            }
        },
    
        "highlight": {
          "fields": {
            "title": {}
          }
        }
    }
    '
    
    Run Code Online (Sandbox Code Playgroud)

到目前为止,我写了这些代码:

import json
from elasticsearch_dsl import (
DocType,
Integer,
Percolator,
Text,
)

# Read the JSON file. The with-block closes the file handle as soon as the
# text has been read; UTF-8 is given explicitly so the non-ASCII titles do
# not depend on the platform's default encoding.
with open('titles.json', encoding='utf-8') as json_file:
    json_data = json_file.read()
data = json.loads(json_data)

# The articles to index are read from response -> docs.
docs = data['response']['docs']

# Creating a elasticsearch connection
# connections.create_connection(hosts=['localhost'], port=['9200'], timeout=20)
"""
curl -XPUT 'localhost:9200/my-index?pretty' -H 'Content-Type: application/json' -d'
{
    "mappings": {
        "_doc": {
            "properties": {
                "title": {
                    "type": "text"
                },
                "query": {
                    "type": "percolator"
                }
            }
        }
    }
}
'

"""

# NOTE(review): the class name looks like a typo for "Document"; it is kept
# unchanged so that any existing reference to it keeps working.
class Documment(DocType):
    """Article metadata together with a percolator query.

    ``title`` and ``query`` are declared as regular fields so that they end
    up in the generated mapping as ``text`` and ``percolator``.
    """

    course_id = Integer()
    unit_id = Integer()
    id = Integer()
    course_name = Text()
    field_id = Integer()
    field = Text()
    # Text field that the stored span queries refer to.
    title = Text()
    # The stored query itself.
    query = Percolator()

    class Meta:
        index = 'titles_index'

"""
    "query": {
        "span_near" : {
            "clauses" : [
                { "span_term" : { "title" : "dünya" } },
                { "span_term" : { "title" : "mirası" } },
                { "span_term" : { "title" : "sözleşmesi" } }
            ],
            "slop" : 0,
            "in_order" : true
        }
    }

"""

for doc in docs:
    # Every value is read from the current item ``doc``; ``docs`` is the
    # collection being iterated and must not be indexed by field name.
    terms = doc['title'].split(" ")  # whitespace-separated title tokens
    course_id = doc['CourseId']
    unit_id = doc['UnitId']
    id = doc['id']
    course_name = doc['CourseName']
    field_id = doc['FieldId']
    field = doc['field']
Run Code Online (Sandbox Code Playgroud)

更新: 谢谢你的回答,我现在有这个:

import json

from elasticsearch_dsl import (
    connections,
    DocType,
    Mapping,
    Percolator,
    Text
)
from elasticsearch_dsl.query import (
    SpanNear,
    SpanTerm
)
from elasticsearch import Elasticsearch

# Read the JSON file. The with-block closes the file handle as soon as the
# text has been read; UTF-8 is given explicitly so the non-ASCII titles do
# not depend on the platform's default encoding.
with open('titles.json', encoding='utf-8') as json_file:
    json_data = json_file.read()
data = json.loads(json_data)

# The articles to index are read from response -> docs.
docs = data['response']['docs']


# creating a new default elasticsearch connection
connections.configure(
    default={'hosts': 'localhost:9200'},
)


class Document(DocType):
    """Percolator entry kept in ``title-index`` under the ``_doc`` type."""

    # Text field referenced by name in the span queries.
    title = Text()
    # Holds the query registered for percolation.
    query = Percolator()

    class Meta:
        doc_type = '_doc'
        index = 'title-index'

    def save(self, **kwargs):
        """Pass every keyword argument through to the parent ``save``."""
        parent = super(Document, self)
        return parent.save(**kwargs)


# create the mappings in elasticsearch
Document.init()

# Store one SpanNear query per article title.
for doc in docs:
    # One SpanTerm clause for each space-separated token of the title.
    clauses = [SpanTerm(title=term) for term in doc['title'].split(" ")]
    item = Document(title=doc['title'], query=SpanNear(clauses=clauses))
    item.save()
Run Code Online (Sandbox Code Playgroud)

它可以正常运行,但我现在有两个问题需要解决:

  1. 在为字典中随机数量的条目建立索引之后,我收到了下面的错误:
elasticsearch.exceptions.AuthorizationException: TransportError(403, 
'cluster_block_exception', 'blocked by: [FORBIDDEN/12/index read-only 
/ allow delete (api)];')
Run Code Online (Sandbox Code Playgroud)

我知道我可以使用此命令解决此问题: curl -XPUT -H "Content-Type: application/json" http://localhost:9200/_all/_settings -d '{"index.blocks.read_only_allow_delete": null}'

更新:最后,我通过删除数据文件夹解决了这个问题。

但是现在我在索引中进行搜索时,得不到任何结果:

>>> text='Arkeoloji, arkeolojik yöntemlerle ortaya çıkarılmış kültürleri, dünya mirası sözleşmesi sosyoloji, coğrafya, tarih, etnoloji gibi birçok bilim dalından yararlanarak araştıran ve inceleyen bilim dalıdır. Türkçeye yanlış bir şekilde> \"kazıbilim\" olarak çevrilmiş olsa da kazı, arkeolojik araştırma yöntemlerinden sadece bir tanesidir.'
>>> s = Search().using(client).query("percolate", field='query', document={'title': text}).highlight('title')
>>> print(s.to_dict())
{'query': {'percolate': {'field': 'query', 'document': {'title': 'Arkeoloji, arkeolojik yöntemlerle ortaya çıkarılmış kültürleri, dünya mirası sözleşmesi sosyoloji, coğrafya, tarih, etnoloji gibi birçok bilim dalından yararlanarak araştıran ve inceleyen bilim dalıdır. Türkçeye yanlış bir şekilde> "kazıbilim" olarak çevrilmiş olsa da kazı, arkeolojik araştırma yöntemlerinden sadece bir tanesidir.'}}}, 'highlight': {'fields': {'title': {}}}}
>>> response = s.execute()
>>> response
<Response: {}>
Run Code Online (Sandbox Code Playgroud)

这是我用 curl 所做的尝试:

 curl -XGET 'localhost:9200/title-index/_search?pretty' -H 'Content-Type: application/json' -d '{
    "query" : {
        "percolate" : {
            "field" : "query",
            "document" : {
                "title" : "Arkeoloji, arkeolojik yöntemlerle ortaya çıkarılmış kültürleri, dünya mirası sözleşmesi sosyoloji, coğrafya, tarih, etnoloji gibi birçok bilim dalından yararlanarak araştıran ve inceleyen bilim dalıdır. Türkçeye yanlış bir şekilde> \"kazıbilim\" olarak çevrilmiş olsa da kazı, arkeolojik araştırma yöntemlerinden sadece bir tanesidir."
            }
        }
    },            
    "highlight": {
      "fields": {  
        "title": {}
      }
    }
}'
{
  "took" : 3,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "skipped" : 0,
    "failed" : 0
  },
  "hits" : {
    "total" : 0,
    "max_score" : null,
    "hits" : [ ]
  }
}
Run Code Online (Sandbox Code Playgroud)

我能得到每次都不同的统计信息(例如 took),但得不到任何命中结果:

>>> response.to_dict()
{'took': 9, 'timed_out': False, '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0}, 'hits': {'total': 0, 'max_score': None, 'hits': []}}
>>> response
{'took': 12, 'timed_out': False, '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0}, 'hits': {'total': 0, 'max_score': None, 'hits': []}}
>>> response
{'took': 12, 'timed_out': False, '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0}, 'hits': {'total': 0, 'max_score': None, 'hits': []}}
Run Code Online (Sandbox Code Playgroud)

谁能帮我?

Val*_*Val 4

第一步是正确的,即映射是正确的。但是,您需要首先为查询建立索引,这就是渗透的全部意义。那么让我们为您的查询建立索引:

curl -XPUT 'localhost:9200/my-index/_doc/my-span-query?refresh&pretty' -H 'Content-Type: application/json' -d '{
    "query": {
        "span_near" : {
            "clauses" : [
                { "span_term" : { "title" : "dünya" } },
                { "span_term" : { "title" : "mirası" } },
                { "span_term" : { "title" : "sözleşmesi" } }
            ],
            "slop" : 0,
            "in_order" : true
        }
    }
}'
Run Code Online (Sandbox Code Playgroud)

然后的想法是找出哪个查询与您正在渗透的文档相匹配,所以让我们渗透一个文档:

curl -XGET 'localhost:9200/my-index/_search?pretty' -H 'Content-Type: application/json' -d '{
    "query" : {
        "percolate" : {
            "field" : "query",
            "document" : {
                "title" : "Arkeoloji, arkeolojik yöntemlerle ortaya çıkarılmış kültürleri, dünya mirası sözleşmesi sosyoloji, coğrafya, tarih, etnoloji gibi birçok bilim dalından yararlanarak araştıran ve inceleyen bilim dalıdır. Türkçeye yanlış bir şekilde> \"kazıbilim\" olarak çevrilmiş olsa da kazı, arkeolojik araştırma yöntemlerinden sadece bir tanesidir."
            }
        }
    },
    "highlight": {
      "fields": {
        "title": {}
      }
    }
}'
Run Code Online (Sandbox Code Playgroud)

您将得到如下响应,其中的高亮部分显示了 my-span-query 这条查询与给定文档匹配的位置:

{
  "took": 104,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 1,
    "max_score": 0.8630463,
    "hits": [
      {
        "_index": "my-index",
        "_type": "_doc",
        "_id": "my-span-query",
        "_score": 0.8630463,
        "_source": {
          "query": {
            "span_near": {
              "clauses": [
                {
                  "span_term": {
                    "title": "dünya"
                  }
                },
                {
                  "span_term": {
                    "title": "mirası"
                  }
                },
                {
                  "span_term": {
                    "title": "sözleşmesi"
                  }
                }
              ],
              "slop": 0,
              "in_order": true
            }
          }
        },
        "fields": {
          "_percolator_document_slot": [
            0
          ]
        },
        "highlight": {
          "title": [
            "Arkeoloji, arkeolojik yöntemlerle ortaya çıkarılmış kültürleri, <em>dünya</em> <em>mirası</em> <em>sözleşmesi</em> sosyoloji, coğrafya"
          ]
        }
      }
    ]
  }
}
Run Code Online (Sandbox Code Playgroud)

更新

下面用 elasticsearch-dsl-py 完成同样的事情:

from elasticsearch_dsl import DocType, Mapping, Percolator, Text
from elasticsearch import Elasticsearch


class Document(DocType):
    """A stored percolator query plus the ``title`` text field it targets."""

    title = Text()
    query = Percolator()

    class Meta:
        index = 'my-index'
        # Same type name as the explicit mapping in step 1b.
        doc_type = '_doc'

    def save(self, **kwargs):
        return super(Document, self).save(**kwargs)


# 1a. create the mappings in elasticsearch
Document.init()

# 1b. or another alternative way of saving the mapping
query_mapping = Mapping('_doc')
query_mapping.field('title', 'text')
query_mapping.field('query', 'percolator')
query_mapping.save('my-index')

# 2. index the query (the span query from the curl example)
query = Document(query={
    "span_near": {
        "clauses": [
            {"span_term": {"title": "dünya"}},
            {"span_term": {"title": "mirası"}},
            {"span_term": {"title": "sözleşmesi"}},
        ],
        "slop": 0,
        "in_order": True,
    }
})
query.save()

# 3. send the percolate query
client = Elasticsearch()
response = client.search(
    index="my-index",
    body={
      "query" : {
        "percolate" : {
            "field" : "query",
            "document" : {
                "title" : "Arkeoloji, arkeolojik yöntemlerle ortaya çıkarılmış kültürleri, dünya mirası sözleşmesi sosyoloji, coğrafya, tarih, etnoloji gibi birçok bilim dalından yararlanarak araştıran ve inceleyen bilim dalıdır. Türkçeye yanlış bir şekilde> \"kazıbilim\" olarak çevrilmiş olsa da kazı, arkeolojik araştırma yöntemlerinden sadece bir tanesidir."
            }
        }
    },
    "highlight": {
      "fields": {
        "title": {}
      }
    }
  }
)
Run Code Online (Sandbox Code Playgroud)

更新2

没有理由将 title 与查询一起存储,您只需要存储查询本身,因此您的代码应如下所示:

# index the query
for doc in docs:
    terms = doc['title'].split(" ")
    clauses = []
    for term in terms:
        field = SpanTerm(title=term)
        clauses.append(field)
    query = SpanNear(clauses=clauses)
    item = Document(query=query)  # <-- change this line: store only the query
    item.save()
Run Code Online (Sandbox Code Playgroud)