全文查询是怎么实现的

一些名词解释

FTS（Full Text Search）

将要查询的目标文档里面的词提取出来，组成索引。搜索的时候查索引

倒排索引 Inverted Index

forward index：通过网址获取网页，在网页中搜索关键词。

inverted index：根据关键词找到含有这个关键词的文章。

源

dumps.wikimedia.org摘要文件，大概1G。

文件示例：

<title>Wikipedia: Kit-Cat Klock</title>
<url>https://en.wikipedia.org/wiki/Kit-Cat_Klock</url>
<abstract>The Kit-Cat Klock is an art deco novelty wall clock shaped like a grinning cat with cartoon eyes that swivel in time with its pendulum tail.</abstract>

字符串查找

type document struct {
    Title string `xml:"title"`
    URL   string `xml:"url"`
    Text  string `xml:"abstract"`
    ID    int
}

// 加载xml文件
func loadDocuments(path string) ([]document, error) {
    f, err := os.Open(path)
    if err != nil {
        return nil, err
    }
    defer f.Close()

    dec := xml.NewDecoder(f)
    dump := struct {
        Documents []document `xml:"doc"`
    }{}
    if err := dec.Decode(&dump); err != nil {
        return nil, err
    }

    docs := dump.Documents

    for i := range docs {
        docs[i].ID = i
    }
    return docs, nil
}

// 字符串查找
func search(docs []document, term string) []document {
    var r []document
    for _, doc := range docs {
        if strings.Contains(doc.Text, term) {
            r = append(r, doc)
        }
    }
    return r
}

// 正则匹配
func searchRegex(docs []document, term string) []document {
    re := regexp.MustCompile(`(?i)\b` + term + `\b`) // Don't do this in production, it's a security risk. term needs to be sanitized.
    var r []document
    for _, doc := range docs {
        if re.MatchString(doc.Text) {
            r = append(r, doc)
        }
    }
    return r
}

测试：

func TestSearch(t *testing.T) {
    println("测试中")
    now := time.Now()
    documents, _ := loadDocuments("F:\\enwiki-latest-abstract1.xml") // about 33s
    fmt.Printf("加载耗时：%.1fs\n",time.Now().Sub(now).Seconds())
    now = time.Now()
    //ret := search(documents, "cat")           // 0.1s
    ret := searchRegex(documents, "cat") // 1.7s
    fmt.Printf("查询耗时：%.1fs\n",time.Now().Sub(now).Seconds())
    println(len(ret))
}

倒排索引-简

分词、过滤

import (
    snowballeng "github.com/kljensen/snowball/english"
    "strings"
    "unicode"
)


func tokenize(text string) []string {
    return strings.FieldsFunc(text, func(r rune) bool {
        return !unicode.IsLetter(r) && !unicode.IsDigit(r)
    })
}

func lowercaseFilter(tokens []string) []string {
    for i, token := range tokens {
        tokens[i] = strings.ToLower(token)
    }
    return tokens
}

var stopwords = map[string]struct{} {
    "a": {}, "and": {}, "be": {}, "have": {}, "i": {},
    "in": {}, "of": {}, "that": {}, "the": {}, "to": {},
}

func stopwordFilter(tokens []string) []string {
    ret := make([]string,0, len(tokens))
    for _, token := range tokens {
        if _, ok := stopwords[token]; !ok {
            ret = append(ret, token)
        }
    }
    return ret
}

func stemmerFilter(tokens []string) []string {
    for i, token := range tokens {
        tokens[i] = snowballeng.Stem(token, false)
    }
    return tokens
}

func analyze(text string) []string {
    words := tokenize(text)
    words = lowercaseFilter(words)
    words = stopwordFilter(words)
    words = stemmerFilter(words)
    return words
}

加入map

type index map[string][]int

func (idx index) add(docs []document) {
    for _, doc := range docs {
        for _, token := range analyze(doc.Text) {
            idx[token] = append(idx[token], doc.ID)
        }
    }
}

查找

func (idx index) search(text string) [][]int {
    var r [][]int
    for _, token := range analyze(text) {
        if ids, ok := idx[token]; ok {
            r = append(r, ids)
        }
    }
    return r
}

测试

func Test_tokenize(t *testing.T) {
    //words := analyze("A donut on a glass plate. Only the donuts.")
    //fmt.Printf("%+v\n", words)
    println("测试中")
    now := time.Now()
    documents, _ := loadDocuments("F:\\enwiki-latest-abstract1.xml") // about 33s
    idx := make(index)
    idx.add(documents)
    fmt.Printf("加载耗时：%.1fs\n",time.Now().Sub(now).Seconds())
    now = time.Now()
    ret := idx.search("cat") // 0ns, 太快，没计时出来
    fmt.Printf("查询耗时：%dns\n",time.Now().UnixNano() - now.UnixNano())
    println(len(ret[0]))
}

总结

再详细的就不研究了，研究了不用过段时间也还忘，用到再说。

参考文献

聊聊 Elasticsearch 的倒排索引

Go进阶47:从零开始构建全文搜索引擎(译)