当用户在搜索框输入字符时,我们应该提示出与该字符有关的搜索项,如图:
这种根据用户输入的字母,提示完整词条的功能,就是自动补全了。
因为需要根据拼音字母来推断,因此要用到拼音分词功能
要实现根据字母做补全,就必须对文档按照拼音分词。在GitHub上恰好有elasticsearch的拼音分词插件。地址:https://github.com/medcl/elasticsearch-analysis-pinyin
安装方式与IK分词器一样,分四步:
①解压
②上传到虚拟机中elasticsearch的plugins目录
③重启elasticsearch
④测试
详细安装步骤可以参考IK分词器的安装过程:https://www.cnblogs.com/yppah/p/15936823.html
{ "tokens" : [ { "token" : "ru", "start_offset" : 0, "end_offset" : 0, "type" : "word", "position" : 0 }, { "token" : "rjjdhbcm", "start_offset" : 0, "end_offset" : 0, "type" : "word", "position" : 0 }, { "token" : "jia", "start_offset" : 0, "end_offset" : 0, "type" : "word", "position" : 1 }, { "token" : "jiu", "start_offset" : 0, "end_offset" : 0, "type" : "word", "position" : 2 }, { "token" : "dian", "start_offset" : 0, "end_offset" : 0, "type" : "word", "position" : 3 }, { "token" : "hai", "start_offset" : 0, "end_offset" : 0, "type" : "word", "position" : 4 }, { "token" : "bu", "start_offset" : 0, "end_offset" : 0, "type" : "word", "position" : 5 }, { "token" : "cuo", "start_offset" : 0, "end_offset" : 0, "type" : "word", "position" : 6 }, { "token" : "ma", "start_offset" : 0, "end_offset" : 0, "type" : "word", "position" : 7 } ] }
# 自定义拼音分词器 PUT /test { "settings": { "analysis": { "analyzer": { "my_analyzer": { "tokenizer": "ik_max_word", "filter": "py" } }, "filter": { "py": { "type": "pinyin", "keep_full_pinyin": false, "keep_joined_full_pinyin": true, "keep_original": true, "limit_first_letter_length": 16, "remove_duplicated_term": true, "none_chinese_pinyin_tokenize": false } } } }, "mappings": { "properties": { "name": { "type": "text", "analyzer": "my_analyzer" } } } } POST /test/_analyze { "text": ["如家酒店还不错"], "analyzer": "my_analyzer" }
{ "tokens" : [ { "token" : "如家", "start_offset" : 0, "end_offset" : 2, "type" : "CN_WORD", "position" : 0 }, { "token" : "rujia", "start_offset" : 0, "end_offset" : 2, "type" : "CN_WORD", "position" : 0 }, { "token" : "rj", "start_offset" : 0, "end_offset" : 2, "type" : "CN_WORD", "position" : 0 }, { "token" : "酒店", "start_offset" : 2, "end_offset" : 4, "type" : "CN_WORD", "position" : 1 }, { "token" : "jiudian", "start_offset" : 2, "end_offset" : 4, "type" : "CN_WORD", "position" : 1 }, { "token" : "jd", "start_offset" : 2, "end_offset" : 4, "type" : "CN_WORD", "position" : 1 }, { "token" : "还不", "start_offset" : 4, "end_offset" : 6, "type" : "CN_WORD", "position" : 2 }, { "token" : "haibu", "start_offset" : 4, "end_offset" : 6, "type" : "CN_WORD", "position" : 2 }, { "token" : "hb", "start_offset" : 4, "end_offset" : 6, "type" : "CN_WORD", "position" : 2 }, { "token" : "不错", "start_offset" : 5, "end_offset" : 7, "type" : "CN_WORD", "position" : 3 }, { "token" : "bucuo", "start_offset" : 5, "end_offset" : 7, "type" : "CN_WORD", "position" : 3 }, { "token" : "bc", "start_offset" : 5, "end_offset" : 7, "type" : "CN_WORD", "position" : 3 } ] }
POST /test/_doc/1 { "id": 1, "name": "狮子" } POST /test/_doc/2 { "id": 2, "name": "虱子" } GET /test/_search { "query": { "match": { "name": "掉入狮子笼咋办" } } }
ok
?
原因分析
解决办法
解决
ok
# 创建索引库 PUT test2 { "mappings": { "properties": { "title":{ "type": "completion" } } } }
# 示例数据 POST test2/_doc { "title": ["Sony", "WH-1000XM3"] } POST test2/_doc { "title": ["SK-II", "PITERA"] } POST test2/_doc { "title": ["Nintendo", "switch"] }
# 自动补全查询 GET /test2/_search { "suggest": { "title_suggest": { "text": "s", # 关键字 "completion": { "field": "title", # 补全查询的字段 "skip_duplicates": true, # 跳过重复的 "size": 10 # 获取前10条结果 } } } }
{ "took" : 565, "timed_out" : false, "_shards" : { "total" : 1, "successful" : 1, "skipped" : 0, "failed" : 0 }, "hits" : { "total" : { "value" : 0, "relation" : "eq" }, "max_score" : null, "hits" : [ ] }, "suggest" : { "title_suggest" : [ { "text" : "s", "offset" : 0, "length" : 1, "options" : [ { "text" : "SK-II", "_index" : "test2", "_type" : "_doc", "_id" : "xuqv2n8BUtPonQDctNZG", "_score" : 1.0, "_source" : { "title" : [ "SK-II", "PITERA" ] } }, { "text" : "Sony", "_index" : "test2", "_type" : "_doc", "_id" : "xeqv2n8BUtPonQDcS9aX", "_score" : 1.0, "_source" : { "title" : [ "Sony", "WH-1000XM3" ] } }, { "text" : "switch", "_index" : "test2", "_type" : "_doc", "_id" : "x-qv2n8BUtPonQDcyNYX", "_score" : 1.0, "_source" : { "title" : [ "Nintendo", "switch" ] } } ] } ] } }
# 酒店数据索引库 PUT /hotel { "settings": { "analysis": { "analyzer": { "text_anlyzer": { "tokenizer": "ik_max_word", "filter": "py" }, "completion_analyzer": { "tokenizer": "keyword", "filter": "py" } }, "filter": { "py": { "type": "pinyin", "keep_full_pinyin": false, "keep_joined_full_pinyin": true, "keep_original": true, "limit_first_letter_length": 16, "remove_duplicated_term": true, "none_chinese_pinyin_tokenize": false } } } }, "mappings": { "properties": { "id":{ "type": "keyword" }, "name":{ "type": "text", "analyzer": "text_anlyzer", "search_analyzer": "ik_smart", "copy_to": "all" }, "address":{ "type": "keyword", "index": false }, "price":{ "type": "integer" }, "score":{ "type": "integer" }, "brand":{ "type": "keyword", "copy_to": "all" }, "city":{ "type": "keyword" }, "starName":{ "type": "keyword" }, "business":{ "type": "keyword", "copy_to": "all" }, "location":{ "type": "geo_point" }, "pic":{ "type": "keyword", "index": false }, "all":{ "type": "text", "analyzer": "text_anlyzer", "search_analyzer": "ik_smart" }, "suggestion":{ "type": "completion", "analyzer": "completion_analyzer" } } } }
text_anlyzer用于全文检索
completion_analyzer用于自动补全,它采用keyword分词器(即不分词)然后转为拼音
在HotelDoc中添加一个用于自动补全的suggestion字段,类型为List<String>
,然后将brand、city、business等信息放到里面。
package com.yppah.hoteldemo.pojo;

import lombok.Data;
import lombok.NoArgsConstructor;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;

/**
 * Document model for a hotel as stored in the {@code hotel} index.
 *
 * <p>Besides the fields copied from {@link Hotel}, it carries a
 * {@code suggestion} list that feeds the completion suggester.
 * {@code distance} and {@code isAD} are not set by the constructor below.
 */
@Data
@NoArgsConstructor
public class HotelDoc {
    private Long id;
    private String name;
    private String address;
    private Integer price;
    private Integer score;
    private String brand;
    private String city;
    private String starName;
    private String business;
    private String location;
    private String pic;
    private Object distance;
    private Boolean isAD;
    /** Terms offered to the user for auto-completion. */
    private List<String> suggestion;

    /**
     * Builds the index document from a {@link Hotel} entity.
     *
     * @param hotel source entity; its latitude and longitude are joined
     *              as {@code "lat, lon"} into {@code location}
     */
    public HotelDoc(Hotel hotel) {
        this.id = hotel.getId();
        this.name = hotel.getName();
        this.address = hotel.getAddress();
        this.price = hotel.getPrice();
        this.score = hotel.getScore();
        this.brand = hotel.getBrand();
        this.city = hotel.getCity();
        this.starName = hotel.getStarName();
        this.business = hotel.getBusiness();
        this.location = hotel.getLatitude() + ", " + hotel.getLongitude();
        this.pic = hotel.getPic();
        this.suggestion = buildSuggestion(this.brand, this.business);
    }

    /**
     * Assembles the completion terms: the brand followed by every business area.
     *
     * <p>A business value may hold several areas separated by "/" (as in
     * "皇岗口岸/福田口岸") or by "、"; each area becomes its own term so that it
     * can be completed independently. Null or empty values are skipped.
     * The result is always a mutable list.
     */
    private static List<String> buildSuggestion(String brand, String business) {
        List<String> terms = new ArrayList<>();
        if (brand != null && !brand.isEmpty()) {
            terms.add(brand);
        }
        if (business != null) {
            for (String area : business.split("[/、]")) {
                String trimmed = area.trim();
                if (!trimmed.isEmpty()) {
                    terms.add(trimmed);
                }
            }
        }
        return terms;
    }
}
测试
GET /hotel/_search { "suggest": { "suggestions": { "text": "h", "completion": { "field": "suggestion", "skip_duplicates": true, "size": 5 } } } }
{ "took" : 4, "timed_out" : false, "_shards" : { "total" : 1, "successful" : 1, "skipped" : 0, "failed" : 0 }, "hits" : { "total" : { "value" : 0, "relation" : "eq" }, "max_score" : null, "hits" : [ ] }, "suggest" : { "suggestions" : [ { "text" : "h", "offset" : 0, "length" : 1, "options" : [ { "text" : "和颐", "_index" : "hotel", "_type" : "_doc", "_id" : "416268", "_score" : 1.0, "_source" : { "address" : "朝阳路高井176号", "brand" : "和颐", "business" : "国贸地区", "city" : "北京", "id" : 416268, "location" : "39.918277, 116.53015", "name" : "和颐酒店(北京传媒大学财满街店)", "pic" : "https://m.tuniucdn.com/fb2/t1/G6/M00/52/13/Cii-TF3eP5GIJIOLAAUwsIVCxdAAAGKXgK5a0IABTDI239_w200_h200_c1_t0.jpg", "price" : 524, "score" : 46, "starName" : "三钻", "suggestion" : [ "和颐", "国贸地区" ] } }, { "text" : "汉庭", "_index" : "hotel", "_type" : "_doc", "_id" : "607915", "_score" : 1.0, "_source" : { "address" : "滨河大道6033号海滨广场国皇大厦3楼", "brand" : "汉庭", "business" : "皇岗口岸/福田口岸", "city" : "深圳", "id" : 607915, "location" : "22.528101, 114.064221", "name" : "汉庭酒店(深圳皇岗店)", "pic" : "https://m.tuniucdn.com/fb3/s1/2n9c/qMyCJVYuW21nsCeEBt8CMfmEhra_w200_h200_c1_t0.jpg", "price" : 313, "score" : 42, "starName" : "二钻", "suggestion" : [ "汉庭", "皇岗口岸/福田口岸" ] } }, { "text" : "海岸城/后海", "_index" : "hotel", "_type" : "_doc", "_id" : "1406627919", "_score" : 1.0, "_source" : { "address" : "海德一道88号中洲控股中心A座", "brand" : "万豪", "business" : "海岸城/后海", "city" : "深圳", "id" : 1406627919, "location" : "22.517293, 113.933785", "name" : "深圳中洲万豪酒店", "pic" : "https://m.tuniucdn.com/fb3/s1/2n9c/3wsinQAcuWtCdmv1yxauVG2PSYpC_w200_h200_c1_t0.jpg", "price" : 204, "score" : 47, "starName" : "五钻", "suggestion" : [ "万豪", "海岸城/后海" ] } }, { "text" : "皇冠假日", "_index" : "hotel", "_type" : "_doc", "_id" : "56392", "_score" : 1.0, "_source" : { "address" : "番禺路400号", "brand" : "皇冠假日", "business" : "徐家汇地区", "city" : "上海", "id" : 56392, "location" : "31.202768, 121.429524", "name" : "上海银星皇冠假日酒店", "pic" : 
"https://m.tuniucdn.com/fb3/s1/2n9c/37ucQ38K3UFdcRqntJ8M5dt884HR_w200_h200_c1_t0.jpg", "price" : 809, "score" : 47, "starName" : "五星级", "suggestion" : [ "皇冠假日", "徐家汇地区" ] } }, { "text" : "豪生", "_index" : "hotel", "_type" : "_doc", "_id" : "45870", "_score" : 1.0, "_source" : { "address" : "新元南路555号", "brand" : "豪生", "business" : "滴水湖临港地区", "city" : "上海", "id" : 45870, "location" : "30.871729, 121.81959", "name" : "上海临港豪生大酒店", "pic" : "https://m.tuniucdn.com/fb3/s1/2n9c/2F5HoQvBgypoDUE46752ppnQaTqs_w200_h200_c1_t0.jpg", "price" : 896, "score" : 45, "starName" : "四星级", "suggestion" : [ "豪生", "滴水湖临港地区" ] } } ] } ] } }
至此,基于DSL的酒店数据自动补全功能已实现(以拼音方式)
@Test void testSuggestion() throws IOException { // 1. 准备request SearchRequest request = new SearchRequest("hotel"); // 2. 准备DSL request.source().suggest(new SuggestBuilder().addSuggestion( "suggestions", SuggestBuilders.completionSuggestion("suggestion") .prefix("h") .skipDuplicates(true) .size(5) )); // 3. 发起请求 SearchResponse response = client.search(request, RequestOptions.DEFAULT); // 4. 解析响应 System.out.println(response); }
而自动补全的结果也比较特殊,解析的代码如下:
测试
@Test void testSuggestion() throws IOException { // 1. 准备request SearchRequest request = new SearchRequest("hotel"); // 2. 准备DSL request.source().suggest(new SuggestBuilder().addSuggestion( "suggestions", SuggestBuilders.completionSuggestion("suggestion") .prefix("h") .skipDuplicates(true) .size(5) )); // 3. 发起请求 SearchResponse response = client.search(request, RequestOptions.DEFAULT); // 4. 解析响应 // System.out.println(response); Suggest suggest = response.getSuggest(); // 4.1 根据补全查询名称获取补全结果 CompletionSuggestion suggentions = suggest.getSuggestion("suggestions"); // 4.2 获取options List<CompletionSuggestion.Entry.Option> options = suggentions.getOptions(); // 4.3 遍历options for (CompletionSuggestion.Entry.Option option: options) { String text = option.getText().toString(); System.out.println(text); } }
List<String>
HotelController
/**
 * Auto-completion endpoint: forwards the typed text to the hotel service
 * and returns whatever completion terms it produces.
 *
 * @param prefix text typed so far, read from the {@code key} request parameter
 * @return the completion terms supplied by the service
 */
@GetMapping("suggestion")
public List<String> getSuggestion(@RequestParam("key") String prefix) {
    List<String> completions = hotelService.getSuggestion(prefix);
    return completions;
}
IHotelService
/**
 * Returns auto-completion terms for the given prefix.
 *
 * @param prefix text typed so far by the user
 * @return the matching completion terms
 */
List<String> getSuggestion(String prefix);
HotelService
// 参考HotelSearchTest的testAggregation() @Override public List<String> getSuggestion(String prefix) { //ctrl+alt+t快捷键,利用trycatch或者其他将目标代码块包含起来 try { // 1. 准备request SearchRequest request = new SearchRequest("hotel"); // 2. 准备DSL request.source().suggest(new SuggestBuilder().addSuggestion( "suggestions", SuggestBuilders.completionSuggestion("suggestion") .prefix(prefix) .skipDuplicates(true) .size(5) )); // 3. 发起请求 SearchResponse response = client.search(request, RequestOptions.DEFAULT); // 4. 解析响应 // System.out.println(response); Suggest suggest = response.getSuggest(); // 4.1 根据补全查询名称获取补全结果 CompletionSuggestion suggentions = suggest.getSuggestion("suggestions"); // 4.2 获取options List<CompletionSuggestion.Entry.Option> options = suggentions.getOptions(); // 4.3 遍历options List<String> resList = new ArrayList<>(options.size()); for (CompletionSuggestion.Entry.Option option: options) { String text = option.getText().toString(); resList.add(text); } return resList; } catch (IOException e) { throw new RuntimeException(e); } }
至此,自动补全和拼音搜索功能均已实现