ES实现自动补全,自定义拼音分词插件

如果希望用户无论输入汉字还是拼音,都能通过 DSL 查询检索到对应的文档,可以这样操作:

下载拼音分词器,并安装到es的plugins中

https://github.com/medcl/elasticsearch-analysis-pinyin

查询时使用“pinyin”作为分词器

但这样操作会有一些问题:如果只用 pinyin 作为分词器,汉字会被逐字转成拼音,分词结果中不保留汉字,而且每个字的拼音各自成为一个词条,并没有按词语进行分词。

很明显不符合我们的要求,所以在创建索引库的时候,要进行自定义分词配置,下面是一个简单的配置例子:

// Create an index whose custom analyzers tokenize with IK and then pass
// every token through a shared pinyin token filter.
PUT index
{
  "settings": {
    "analysis": {
      "filter": {
        // Pinyin filter referenced by both analyzers below.
        "pinyin_first_letter_and_full_pinyin_filter": {
          "type": "pinyin",
          "keep_original": true,
          "keep_full_pinyin": true,
          "keep_separate_first_letter": false,
          "limit_first_letter_length": 16,
          "lowercase": true,
          "remove_duplicated_term": true
        }
      },
      "analyzer": {
        // Fine-grained variant: ik_max_word tokenizer + pinyin filter.
        "ik_max_pinyin": {
          "tokenizer": "ik_max_word",
          "filter": "pinyin_first_letter_and_full_pinyin_filter"
        },
        // Coarse-grained variant: ik_smart tokenizer + pinyin filter.
        "ik_smart_pinyin": {
          "tokenizer": "ik_smart",
          "filter": "pinyin_first_letter_and_full_pinyin_filter"
        }
      }
    }
  }
}

需要注意的是 search_analyzer 的使用:拼音分词器适合在创建倒排索引时使用,搜索时应为字段指定不含拼音过滤器的分词器(例如 ik_smart)。否则用户输入的汉字在搜索时也会被转成拼音,同一个拼音词条对应多个文档编号,就会匹配到同音不同字的文档。

自动补全查询(completion suggester查询)

参与补全查询的字段必须是 completion 类型;字段的内容一般是用于补全的多个词条组成的数组(把词分成数组)。

// Completion-suggester request against the "title" field of index test2.
POST /test2/_search
{
  "suggest": {
    "title_suggest": {
      "completion": {
        "size": 10, // return the top 10 results
        "skip_duplicates": true, // skip duplicate suggestions
        "field": "title" // field used for completion
      },
      "text": "s" // keyword to complete
    }
  }
}

这里给出一个支持自动补全的索引库创建示例:

// Create the students index with two custom pinyin-aware analyzers and a
// completion field for auto-complete suggestions.
// Fix: the analyzer name was misspelled "text_anlyzer"; it is now
// "text_analyzer" in its definition and in both mappings that reference it.
PUT /students
{
  "settings": {
    "analysis": {
      "analyzer": {
        // Index-time analyzer for full-text fields: IK tokens + pinyin filter.
        "text_analyzer": {
          "tokenizer": "ik_max_word",
          "filter": "py"
        },
        // Analyzer for the completion field: the keyword tokenizer keeps each
        // input value as a single token before the pinyin filter is applied.
        "completion_analyzer": {
          "tokenizer": "keyword",
          "filter": "py"
        }
      },
      "filter": {
        // Pinyin token filter shared by both analyzers above.
        "py": {
          "type": "pinyin",
          "keep_full_pinyin": false,
          "keep_joined_full_pinyin": true,
          "keep_original": true,
          "limit_first_letter_length": 16,
          "remove_duplicated_term": true,
          "none_chinese_pinyin_tokenize": false
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "id": {
        "type": "keyword"
      },
      "name": {
        "type": "text",
        // Pinyin is applied at index time only; searches use plain ik_smart.
        "analyzer": "text_analyzer",
        "search_analyzer": "ik_smart",
        "copy_to": "all"
      },
      "address": {
        "type": "keyword",
        "index": false
      },
      "city": {
        "type": "keyword"
      },
      "starName": {
        "type": "keyword"
      },
      // Target of copy_to from "name"; analyzed the same way as "name".
      "all": {
        "type": "text",
        "analyzer": "text_analyzer",
        "search_analyzer": "ik_smart"
      },
      // NOTE(review): no search_analyzer is set on this field, so
      // completion_analyzer (including the pinyin filter) presumably also
      // applies to the suggest text - confirm this is intended.
      "suggestion": {
        "type": "completion",
        "analyzer": "completion_analyzer"
      }
    }
  }
}

查询

// Search the students index with a query that matches every document.
GET /students/_search
{
  "query": {
    "match_all": {}
  }
}

// Completion-suggester request against the "suggestion" field of students.
GET /students/_search
{
  "suggest": {
    "YOUR_SUGGESTION": {
      "completion": {
        "skip_duplicates": true, // skip duplicate suggestions
        "field": "suggestion"
      },
      "text": "s"
    }
  }
}