全文查询是怎么实现的
一些名词解释
FTS(Full Text Search)
将要查询的目标文档里面的词提取出来,组成索引。搜索的时候查索引
倒排索引 Inverted Index
forward index:通过网址获取网页,在网页中搜索关键词。
inverted index:根据关键词找到含有这个关键词的文章。
源
数据源是 dumps.wikimedia.org 提供的英文维基百科摘要(abstract)文件,XML 格式,大概 1GB。
文件示例:
<title>Wikipedia: Kit-Cat Klock</title>
<url>https://en.wikipedia.org/wiki/Kit-Cat_Klock</url>
<abstract>The Kit-Cat Klock is an art deco novelty wall clock shaped like a grinning cat with cartoon eyes that swivel in time with its pendulum tail.</abstract>
字符串查找
// document is one Wikipedia abstract entry decoded from the dump's <doc>
// element. ID is not part of the XML; it is assigned sequentially after
// decoding (see loadDocuments) and is what the inverted index stores.
type document struct {
	Title string `xml:"title"`
	URL string `xml:"url"`
	Text string `xml:"abstract"` // the abstract body — the only field that gets indexed/searched
	ID int
}
// 加载xml文件
// loadDocuments decodes every <doc> element from the XML dump file at path
// and assigns each document a sequential ID (its position in the file).
func loadDocuments(path string) ([]document, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	// The dump wraps all <doc> elements in a single root; decode them in one go.
	var dump struct {
		Documents []document `xml:"doc"`
	}
	if err := xml.NewDecoder(file).Decode(&dump); err != nil {
		return nil, err
	}

	// IDs are simply the documents' positions in the dump.
	for i := range dump.Documents {
		dump.Documents[i].ID = i
	}
	return dump.Documents, nil
}
// 字符串查找
// search returns every document whose abstract contains term as a raw,
// case-sensitive substring (naive linear scan — O(docs × text)).
func search(docs []document, term string) []document {
	var matched []document
	for i := range docs {
		if strings.Contains(docs[i].Text, term) {
			matched = append(matched, docs[i])
		}
	}
	return matched
}
// 正则匹配
// searchRegex returns every document whose abstract contains term as a whole
// word, case-insensitively.
//
// term is passed through regexp.QuoteMeta so user-supplied metacharacters
// (e.g. "(", "+", ".") are matched literally instead of being interpreted as
// regex syntax — the original concatenated term raw into the pattern, which
// its own comment flagged as a security risk (pattern injection / panic on
// invalid syntax).
func searchRegex(docs []document, term string) []document {
	re := regexp.MustCompile(`(?i)\b` + regexp.QuoteMeta(term) + `\b`)
	var r []document
	for _, doc := range docs {
		if re.MatchString(doc.Text) {
			r = append(r, doc)
		}
	}
	return r
}
测试:
// TestSearch loads the Wikipedia abstract dump and times a linear substring
// scan versus a whole-word regex scan over every document.
// Fixes vs. original: the loadDocuments error is no longer silently discarded
// (a bad path would have fed nil documents onward), and time.Since replaces
// the equivalent but non-idiomatic time.Now().Sub(now).
func TestSearch(t *testing.T) {
	println("测试中")
	now := time.Now()
	documents, err := loadDocuments("F:\\enwiki-latest-abstract1.xml") // about 33s
	if err != nil {
		t.Fatalf("loadDocuments: %v", err)
	}
	fmt.Printf("加载耗时:%.1fs\n", time.Since(now).Seconds())
	now = time.Now()
	//ret := search(documents, "cat") // 0.1s
	ret := searchRegex(documents, "cat") // 1.7s
	fmt.Printf("查询耗时:%.1fs\n", time.Since(now).Seconds())
	println(len(ret))
}
倒排索引——简易实现
分词、过滤
import (
snowballeng "github.com/kljensen/snowball/english"
"strings"
"unicode"
)
// tokenize splits text into maximal runs of letters and digits; every other
// rune acts as a separator and is discarded.
func tokenize(text string) []string {
	isSeparator := func(r rune) bool {
		return !(unicode.IsLetter(r) || unicode.IsDigit(r))
	}
	return strings.FieldsFunc(text, isSeparator)
}
// lowercaseFilter lowercases every token in place and returns the same slice,
// so that indexing and querying are case-insensitive.
func lowercaseFilter(tokens []string) []string {
	for i := range tokens {
		tokens[i] = strings.ToLower(tokens[i])
	}
	return tokens
}
// stopwords are extremely common English words that carry no search value and
// are dropped from the token stream before indexing.
var stopwords = map[string]struct{}{
	"a": {}, "and": {}, "be": {}, "have": {}, "i": {},
	"in": {}, "of": {}, "that": {}, "the": {}, "to": {},
}

// stopwordFilter returns a new slice with all stopwords removed, preserving
// the order of the remaining tokens.
func stopwordFilter(tokens []string) []string {
	kept := make([]string, 0, len(tokens))
	for _, tok := range tokens {
		if _, skip := stopwords[tok]; skip {
			continue
		}
		kept = append(kept, tok)
	}
	return kept
}
// stemmerFilter reduces each token to its English (Snowball) stem in place and
// returns the same slice, e.g. "donuts" -> "donut", so different inflections
// of a word collapse to one index term.
func stemmerFilter(tokens []string) []string {
	for i := range tokens {
		tokens[i] = snowballeng.Stem(tokens[i], false)
	}
	return tokens
}
func analyze(text string) []string {
words := tokenize(text)
words = lowercaseFilter(words)
words = stopwordFilter(words)
words = stemmerFilter(words)
return words
}
加入map
// index is an inverted index: it maps an analyzed token to the ordered list of
// IDs of the documents containing that token.
type index map[string][]int

// add analyzes each document's text and records the document's ID under every
// token it produces. Documents are expected in increasing ID order (as built
// by loadDocuments), so suppressing a consecutive duplicate is enough to keep
// each posting list free of repeats — the original appended one ID per token
// occurrence, so a document mentioning "cat" five times appeared five times in
// the "cat" posting list.
func (idx index) add(docs []document) {
	for _, doc := range docs {
		for _, token := range analyze(doc.Text) {
			ids := idx[token]
			if len(ids) > 0 && ids[len(ids)-1] == doc.ID {
				continue // this document already recorded for this token
			}
			idx[token] = append(idx[token], doc.ID)
		}
	}
}
查找
// search analyzes the query text and returns, for each query token present in
// the index, that token's posting list of document IDs. Tokens absent from
// the index contribute nothing.
func (idx index) search(text string) [][]int {
	var results [][]int
	for _, token := range analyze(text) {
		ids, found := idx[token]
		if !found {
			continue
		}
		results = append(results, ids)
	}
	return results
}
测试
// Test_tokenize builds the inverted index over the Wikipedia dump and times an
// indexed lookup (near-instant, versus seconds for the linear scans above).
// Fixes vs. original: the loadDocuments error is no longer discarded,
// time.Since replaces time.Now().Sub/UnixNano arithmetic, and ret[0] is no
// longer indexed unconditionally — an empty result would have panicked.
func Test_tokenize(t *testing.T) {
	//words := analyze("A donut on a glass plate. Only the donuts.")
	//fmt.Printf("%+v\n", words)
	println("测试中")
	now := time.Now()
	documents, err := loadDocuments("F:\\enwiki-latest-abstract1.xml") // about 33s
	if err != nil {
		t.Fatalf("loadDocuments: %v", err)
	}
	idx := make(index)
	idx.add(documents)
	fmt.Printf("加载耗时:%.1fs\n", time.Since(now).Seconds())
	now = time.Now()
	ret := idx.search("cat") // ~0ns — too fast for a seconds-resolution timer
	fmt.Printf("查询耗时:%dns\n", time.Since(now).Nanoseconds())
	if len(ret) == 0 {
		t.Fatal(`no postings found for "cat"`)
	}
	println(len(ret[0]))
}
总结
再详细的就不研究了,研究了不用过段时间也还忘,用到再说。