# -*- coding: utf-8 -*-
"""Minimal Elasticsearch demo.

Creates an index mapped for Chinese text (ik_smart analyzer), bulk-inserts a
vocabulary file, and runs a keyword search via the scan helper.
"""
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch()


def deleteInices(my_index):
    """Delete ``my_index`` if it already exists, so it can be recreated cleanly.

    NOTE(review): the original guarded this with ``if True and ...`` as a manual
    confirmation toggle; the toggle was enabled, so deletion runs whenever the
    index exists — behavior is unchanged here.
    """
    if es.indices.exists(my_index):
        print('删除之前存在的')
        es.indices.delete(index=my_index)


def createIndex(my_index, my_doc):
    """Create ``my_index`` with a mapping for doc type ``my_doc``.

    The mapping declares an integer ``my_id`` field and a text ``my_word``
    field tokenized with the ik_smart Chinese "smart" analyzer at both index
    time and search time. Without an explicit analyzer, the default would
    split every single character into its own token.
    """
    settings = {
        "mappings": {
            my_doc: {  # doc type name (roughly a "table" name; types were removed in ES 7+)
                "properties": {
                    "my_id": {"type": "integer"},       # field name -> field type
                    "my_word": {
                        "type": "text",
                        "analyzer": "ik_smart",         # index-time smart segmentation
                        "search_analyzer": "ik_smart",  # query-time smart segmentation
                    },
                },
            },
        },
    }
    # ignore=400 tolerates an "index already exists" error instead of raising.
    es.indices.create(index=my_index, ignore=400, body=settings)
    print('创建索引成功')


def mainCreateIndex():
    """Drop and recreate the demo index."""
    my_index = "word2vec_index"
    my_doc = "my_doc"
    deleteInices(my_index)
    createIndex(my_index, my_doc)


# mainCreateIndex()


def getAllWords(path="vocab.txt"):
    """Read one word per line from ``path``.

    Returns a list of ``(line_number, stripped_word)`` tuples, numbered from 0.
    """
    with open(path, "r", encoding='utf-8') as f:
        # Iterate the file object directly instead of readlines(): same order,
        # no intermediate list of raw lines.
        return [(i, line.strip()) for i, line in enumerate(f)]


def insertData(words, my_index, my_doc, one_bulk):
    """Bulk-index ``words`` into ``my_index``.

    ``one_bulk`` is the number of actions sent per bulk request. The final
    request may carry fewer than ``one_bulk`` actions but is still sent,
    matching the original leftover-flush behavior.
    """
    print("共需要插入%d条" % len(words))
    actions = [
        {
            "_index": my_index,
            "_type": my_doc,
            "_source": {"my_id": word_id, "my_word": word},
        }
        for word_id, word in words
    ]
    # Chunked submission replaces the original manual counter/clear logic;
    # batch boundaries are identical (full batches of one_bulk, then the rest).
    for start in range(0, len(actions), one_bulk):
        helpers.bulk(es, actions[start:start + one_bulk])
    print("插入数据完成")


def mainInset():
    """Load vocab.txt and bulk-insert it into the demo index."""
    my_index = "word2vec_index"
    my_doc = "my_doc"
    words = getAllWords()
    insertData(words, my_index, my_doc, one_bulk=5000)


# mainInset()


def keywordSearch(keywords1, my_index, my_doc):
    """Run a match query on ``my_word`` for ``keywords1`` and print all hits.

    Uses ``helpers.scan`` (scroll-based) rather than a plain ``es.search`` so
    the result set is not capped by the default result window.
    ``my_doc`` is accepted for interface compatibility but unused here.
    """
    my_search1 = {
        "query": {
            "match": {
                "my_word": keywords1
            }
        }
    }
    # helpers.scan yields a generator; materialize it so it can be printed
    # and iterated more than once.
    es_result = list(helpers.scan(
        client=es,
        query=my_search1,
        scroll='10m',
        index=my_index,
        timeout='10m',
    ))
    print(es_result)
    search_res = [(hit['_source']['my_id'], hit['_source']['my_word'])
                  for hit in es_result]
    print("共查询到%d条数据" % len(es_result))
    print(search_res)


def mainSearch():
    """Run a sample search for the demo keyword."""
    my_index = "word2vec_index"
    my_doc = "my_doc"
    keywords1 = "氨基酸"
    keywordSearch(keywords1, my_index, my_doc)


if __name__ == "__main__":
    # Guarded entry point: the original called mainSearch() unconditionally at
    # import time, which fired a network query whenever the module was imported.
    mainSearch()