搜索引擎:ES的评分逻辑

  • Post author:
  • Post category:其他




背景

难道不好奇怎么评分的吗!?



直接先上例子

参考

官网


注意:在有分片的情况下,需要用 routing 把文档限制在同一个 shard 里再做实验;否则 idf、avgFieldLength 等统计量是按各个 shard 分别计算的,得分会很难解释。


# 按照routing来索引
PUT /hjx_test_index/doc/1?routing=text1&refresh=true
{ "text" : "quick brown fox 11" }
PUT /hjx_test_index/doc/2?routing=text1&refresh=true
{ "text" : "bad fox" }
PUT /hjx_test_index/doc/3?routing=text1&refresh=true
{ "text" : "some test more some 35"
}

GET /hjx_test_index/doc/_search?explain=true&routing=text1
{
  "query": {
    "term": {
      "text": "fox"
    }
  }
}

# 上面查询的返回结果(请求带 explain=true 才会返回 _explanation 字段)
GET /hjx_test_index/doc/_search?explain=true&routing=text1

{
  "took": 1,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 2,
    "max_score": 1.2310667,
    "hits": [
      {
        "_shard": "[hjx_test_index][4]",
        "_node": "JUltA86XS-WNJ3baNQWlcg",
        "_index": "hjx_test_index",
        "_type": "doc",
        "_id": "2",
        "_score": 1.2310667,
        "_routing": "text1",
        "_source": {
          "text": "bad fox"
        },
        "_explanation": {
          "value": 1.2310667,
          "description": "weight(text:fox in 0) [PerFieldSimilarity], result of:",
          "details": [
            {
              "value": 1.2310667,
              "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
              "details": [
                {
                  "value": 1.0296195,
                  "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                  "details": [
                    {
                      "value": 2,
                      "description": "docFreq",
                      "details": []
                    },
                    {
                      "value": 6,
                      "description": "docCount",
                      "details": []
                    }
                  ]
                },
                {
                  "value": 1.1956521,
                  "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                  "details": [
                    {
                      "value": 1,
                      "description": "termFreq=1.0",
                      "details": []
                    },
                    {
                      "value": 1.2,
                      "description": "parameter k1",
                      "details": []
                    },
                    {
                      "value": 0.75,
                      "description": "parameter b",
                      "details": []
                    },
                    {
                      "value": 3.3333333,
                      "description": "avgFieldLength",
                      "details": []
                    },
                    {
                      "value": 2,
                      "description": "fieldLength",
                      "details": []
                    }
                  ]
                }
              ]
            }
          ]
        }
      },
      {
        "_shard": "[hjx_test_index][4]",
        "_node": "JUltA86XS-WNJ3baNQWlcg",
        "_index": "hjx_test_index",
        "_type": "doc",
        "_id": "1",
        "_score": 0.9517491,
        "_routing": "text1",
        "_source": {
          "text": "quick brown fox 11"
        },
        "_explanation": {
          "value": 0.9517491,
          "description": "weight(text:fox in 0) [PerFieldSimilarity], result of:",
          "details": [
            {
              "value": 0.9517491,
              "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
              "details": [
                {
                  "value": 1.0296195,
                  "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                  "details": [
                    {
                      "value": 2,
                      "description": "docFreq",
                      "details": []
                    },
                    {
                      "value": 6,
                      "description": "docCount",
                      "details": []
                    }
                  ]
                },
                {
                  "value": 0.92436975,
                  "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                  "details": [
                    {
                      "value": 1,
                      "description": "termFreq=1.0",
                      "details": []
                    },
                    {
                      "value": 1.2,
                      "description": "parameter k1",
                      "details": []
                    },
                    {
                      "value": 0.75,
                      "description": "parameter b",
                      "details": []
                    },
                    {
                      "value": 3.3333333,
                      "description": "avgFieldLength",
                      "details": []
                    },
                    {
                      "value": 4,
                      "description": "fieldLength",
                      "details": []
                    }
                  ]
                }
              ]
            }
          ]
        }
      }
    ]
  }
}



别慌,咱们来看看咋肥事

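先看看官网的计算公式。最新的公式可能有所变化,但我们可以记住不变的一点:

经典 TF-IDF 的最终得分是三个因素的乘积(tf * idf * fieldNorm)。上面的 explain 输出用的是 BM25,它把词频和字段长度归一化合并进了 tfNorm,所以得分 = idf * tfNorm,例如文档 2:1.0296195 * 1.1956521 ≈ 1.2310667。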



上面这个是在搜索词只有一个的情况下的,如果搜索词有多个呢?

看下面这个结果(查询词为 quick 和 fox;此时文档 3 的内容已改为 "quick fox more some 35"):

{
  "took": 1,
  "timed_out": false,
  "_shards": {
    "total": 1,
    "successful": 1,
    "skipped": 0,
    "failed": 0
  },
  "hits": {
    "total": 3,
    "max_score": 1.5924733,
    "hits": [
      {
        "_shard": "[hjx_test_index][4]",
        "_node": "JUltA86XS-WNJ3baNQWlcg",
        "_index": "hjx_test_index",
        "_type": "doc",
        "_id": "1",
        "_score": 1.5924733,
        "_routing": "text1",
        "_source": {
          "text": "quick brown fox 11"
        },
        "_explanation": {
          "value": 1.5924734,
          "description": "sum of:",
          "details": [
            {
              "value": 0.9517491,
              "description": "weight(text:quick in 0) [PerFieldSimilarity], result of:",
              "details": [
                {
                  "value": 0.9517491,
                  "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
                  "details": [
                    {
                      "value": 1.0296195,
                      "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                      "details": [
                        {
                          "value": 2,
                          "description": "docFreq",
                          "details": []
                        },
                        {
                          "value": 6,
                          "description": "docCount",
                          "details": []
                        }
                      ]
                    },
                    {
                      "value": 0.92436975,
                      "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                      "details": [
                        {
                          "value": 1,
                          "description": "termFreq=1.0",
                          "details": []
                        },
                        {
                          "value": 1.2,
                          "description": "parameter k1",
                          "details": []
                        },
                        {
                          "value": 0.75,
                          "description": "parameter b",
                          "details": []
                        },
                        {
                          "value": 3.3333333,
                          "description": "avgFieldLength",
                          "details": []
                        },
                        {
                          "value": 4,
                          "description": "fieldLength",
                          "details": []
                        }
                      ]
                    }
                  ]
                }
              ]
            },
            {
              "value": 0.6407243,
              "description": "weight(text:fox in 0) [PerFieldSimilarity], result of:",
              "details": [
                {
                  "value": 0.6407243,
                  "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
                  "details": [
                    {
                      "value": 0.6931472,
                      "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                      "details": [
                        {
                          "value": 3,
                          "description": "docFreq",
                          "details": []
                        },
                        {
                          "value": 6,
                          "description": "docCount",
                          "details": []
                        }
                      ]
                    },
                    {
                      "value": 0.92436975,
                      "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                      "details": [
                        {
                          "value": 1,
                          "description": "termFreq=1.0",
                          "details": []
                        },
                        {
                          "value": 1.2,
                          "description": "parameter k1",
                          "details": []
                        },
                        {
                          "value": 0.75,
                          "description": "parameter b",
                          "details": []
                        },
                        {
                          "value": 3.3333333,
                          "description": "avgFieldLength",
                          "details": []
                        },
                        {
                          "value": 4,
                          "description": "fieldLength",
                          "details": []
                        }
                      ]
                    }
                  ]
                }
              ]
            }
          ]
        }
      },
      {
        "_shard": "[hjx_test_index][4]",
        "_node": "JUltA86XS-WNJ3baNQWlcg",
        "_index": "hjx_test_index",
        "_type": "doc",
        "_id": "3",
        "_score": 1.4302213,
        "_routing": "text1",
        "_source": {
          "text": "quick fox more some 35"
        },
        "_explanation": {
          "value": 1.4302213,
          "description": "sum of:",
          "details": [
            {
              "value": 0.8547784,
              "description": "weight(text:quick in 0) [PerFieldSimilarity], result of:",
              "details": [
                {
                  "value": 0.8547784,
                  "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
                  "details": [
                    {
                      "value": 1.0296195,
                      "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                      "details": [
                        {
                          "value": 2,
                          "description": "docFreq",
                          "details": []
                        },
                        {
                          "value": 6,
                          "description": "docCount",
                          "details": []
                        }
                      ]
                    },
                    {
                      "value": 0.8301887,
                      "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                      "details": [
                        {
                          "value": 1,
                          "description": "termFreq=1.0",
                          "details": []
                        },
                        {
                          "value": 1.2,
                          "description": "parameter k1",
                          "details": []
                        },
                        {
                          "value": 0.75,
                          "description": "parameter b",
                          "details": []
                        },
                        {
                          "value": 3.3333333,
                          "description": "avgFieldLength",
                          "details": []
                        },
                        {
                          "value": 5,
                          "description": "fieldLength",
                          "details": []
                        }
                      ]
                    }
                  ]
                }
              ]
            },
            {
              "value": 0.57544297,
              "description": "weight(text:fox in 0) [PerFieldSimilarity], result of:",
              "details": [
                {
                  "value": 0.57544297,
                  "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
                  "details": [
                    {
                      "value": 0.6931472,
                      "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                      "details": [
                        {
                          "value": 3,
                          "description": "docFreq",
                          "details": []
                        },
                        {
                          "value": 6,
                          "description": "docCount",
                          "details": []
                        }
                      ]
                    },
                    {
                      "value": 0.8301887,
                      "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                      "details": [
                        {
                          "value": 1,
                          "description": "termFreq=1.0",
                          "details": []
                        },
                        {
                          "value": 1.2,
                          "description": "parameter k1",
                          "details": []
                        },
                        {
                          "value": 0.75,
                          "description": "parameter b",
                          "details": []
                        },
                        {
                          "value": 3.3333333,
                          "description": "avgFieldLength",
                          "details": []
                        },
                        {
                          "value": 5,
                          "description": "fieldLength",
                          "details": []
                        }
                      ]
                    }
                  ]
                }
              ]
            }
          ]
        }
      },
      {
        "_shard": "[hjx_test_index][4]",
        "_node": "JUltA86XS-WNJ3baNQWlcg",
        "_index": "hjx_test_index",
        "_type": "doc",
        "_id": "2",
        "_score": 0.8287629,
        "_routing": "text1",
        "_source": {
          "text": "bad fox"
        },
        "_explanation": {
          "value": 0.8287629,
          "description": "sum of:",
          "details": [
            {
              "value": 0.8287629,
              "description": "weight(text:fox in 0) [PerFieldSimilarity], result of:",
              "details": [
                {
                  "value": 0.8287629,
                  "description": "score(doc=0,freq=1.0 = termFreq=1.0\n), product of:",
                  "details": [
                    {
                      "value": 0.6931472,
                      "description": "idf, computed as log(1 + (docCount - docFreq + 0.5) / (docFreq + 0.5)) from:",
                      "details": [
                        {
                          "value": 3,
                          "description": "docFreq",
                          "details": []
                        },
                        {
                          "value": 6,
                          "description": "docCount",
                          "details": []
                        }
                      ]
                    },
                    {
                      "value": 1.1956521,
                      "description": "tfNorm, computed as (freq * (k1 + 1)) / (freq + k1 * (1 - b + b * fieldLength / avgFieldLength)) from:",
                      "details": [
                        {
                          "value": 1,
                          "description": "termFreq=1.0",
                          "details": []
                        },
                        {
                          "value": 1.2,
                          "description": "parameter k1",
                          "details": []
                        },
                        {
                          "value": 0.75,
                          "description": "parameter b",
                          "details": []
                        },
                        {
                          "value": 3.3333333,
                          "description": "avgFieldLength",
                          "details": []
                        },
                        {
                          "value": 2,
                          "description": "fieldLength",
                          "details": []
                        }
                      ]
                    }
                  ]
                }
              ]
            }
          ]
        }
      }
    ]
  }
}

可见是把两个词的得分加在一起,例如文档 1:0.9517491 + 0.6407243 ≈ 1.5924734。

但是官网写的是

文档向量

😂

有点无法理解,这怎么就文档向量了?!

看下 Lucene 是怎么说的:

Lucene scoring uses a combination of the Vector Space Model (VSM) of Information Retrieval and the Boolean model to determine how relevant a given Document is to a User’s query. In general, the idea behind the VSM is the more times a query term appears in a document relative to the number of times the term appears in all the documents in the collection, the more relevant that document is to the query. It uses the Boolean model to first narrow down the documents that need to be scored based on the use of boolean logic in the Query specification. Lucene also adds some capabilities and refinements onto this model to support boolean and fuzzy searching, but it essentially remains a VSM based system at the heart. For some valuable references on VSM and IR in general refer to the Lucene Wiki IR references.

也就是说,Lucene 结合使用向量空间模型(VSM)和布尔模型来判定文档与查询的相关度:先用布尔模型按查询中的布尔逻辑筛选出需要评分的文档,再用 VSM 对这些文档评分。



版权声明:本文为waltonhuang原创文章,遵循 CC 4.0 BY-SA 版权协议,转载请附上原文出处链接和本声明。