diff --git a/app/gateway/routes/search_engine.go b/app/gateway/routes/search_engine.go index f2dd9df..aa82d05 100644 --- a/app/gateway/routes/search_engine.go +++ b/app/gateway/routes/search_engine.go @@ -10,6 +10,6 @@ func SearchRegisterHandlers(rg *gin.RouterGroup) { favoriteGroup := rg.Group("/search_engine") { favoriteGroup.GET("/search", http.SearchEngineSearch) - favoriteGroup.GET("/query", http.WordAssociation) + favoriteGroup.GET("/analyzer", http.WordAssociation) } } diff --git a/app/search_engine/query/init.go b/app/search_engine/analyzer/init.go similarity index 90% rename from app/search_engine/query/init.go rename to app/search_engine/analyzer/init.go index 3a47b9a..dfb1991 100644 --- a/app/search_engine/query/init.go +++ b/app/search_engine/analyzer/init.go @@ -1,4 +1,4 @@ -package query +package analyzer import ( "github.com/go-ego/gse" diff --git a/app/search_engine/query/token.go b/app/search_engine/analyzer/token.go similarity index 99% rename from app/search_engine/query/token.go rename to app/search_engine/analyzer/token.go index a8af65f..4c93293 100644 --- a/app/search_engine/query/token.go +++ b/app/search_engine/analyzer/token.go @@ -1,4 +1,4 @@ -package query +package analyzer import ( "fmt" diff --git a/app/search_engine/query/token_test.go b/app/search_engine/analyzer/token_test.go similarity index 98% rename from app/search_engine/query/token_test.go rename to app/search_engine/analyzer/token_test.go index 485571e..22bd861 100644 --- a/app/search_engine/query/token_test.go +++ b/app/search_engine/analyzer/token_test.go @@ -1,4 +1,4 @@ -package query +package analyzer import ( "fmt" diff --git a/app/search_engine/cmd/main.go b/app/search_engine/cmd/main.go index 838bcbc..b5b4459 100644 --- a/app/search_engine/cmd/main.go +++ b/app/search_engine/cmd/main.go @@ -9,7 +9,7 @@ import ( "google.golang.org/grpc" "github.com/CocaineCong/tangseng/app/gateway/rpc" - "github.com/CocaineCong/tangseng/app/search_engine/query" + 
"github.com/CocaineCong/tangseng/app/search_engine/analyzer" "github.com/CocaineCong/tangseng/app/search_engine/service" "github.com/CocaineCong/tangseng/config" pb "github.com/CocaineCong/tangseng/idl/pb/search_engine" @@ -20,7 +20,7 @@ import ( func main() { loading.Loading() rpc.Init() - query.InitSeg() + analyzer.InitSeg() // etcd 地址 etcdAddress := []string{viper.GetString("etcd.address")} diff --git a/app/search_engine/engine/engine.go b/app/search_engine/engine/engine.go index 23a71a4..bd94a4e 100644 --- a/app/search_engine/engine/engine.go +++ b/app/search_engine/engine/engine.go @@ -4,7 +4,7 @@ import ( "sync" "sync/atomic" - "github.com/CocaineCong/tangseng/app/search_engine/query" + "github.com/CocaineCong/tangseng/app/search_engine/analyzer" "github.com/CocaineCong/tangseng/app/search_engine/segment" "github.com/CocaineCong/tangseng/app/search_engine/types" "github.com/CocaineCong/tangseng/consts" @@ -58,7 +58,7 @@ func (e *Engine) AddForwardIndex(doc *types.Document) error { // Text2PostingsLists 文本 转成 倒排索引记录表 func (e *Engine) Text2PostingsLists(text string, docId int64) (err error) { - tokens, err := query.GseCut(text) + tokens, err := analyzer.GseCut(text) if err != nil { log.LogrusObj.Errorf("text2PostingsLists err:%v", err) return diff --git a/app/search_engine/index/query.go b/app/search_engine/recall/query.go similarity index 85% rename from app/search_engine/index/query.go rename to app/search_engine/recall/query.go index 4076376..e7379df 100644 --- a/app/search_engine/index/query.go +++ b/app/search_engine/recall/query.go @@ -1,23 +1,22 @@ -package index +package recall import ( "time" "github.com/CocaineCong/tangseng/app/search_engine/engine" - "github.com/CocaineCong/tangseng/app/search_engine/recall" "github.com/CocaineCong/tangseng/app/search_engine/types" log "github.com/CocaineCong/tangseng/pkg/logger" ) -// Recall 召回 -type Recall struct { - *recall.Recall +// RecallServ 召回 +type RecallServ struct { + *Recall } // NewRecallServ 创建召回服务 
-func NewRecallServ(meta *engine.Meta) *Recall { - r := recall.NewRecall(meta) - return &Recall{r} +func NewRecallServ(meta *engine.Meta) *RecallServ { + r := NewRecall(meta) + return &RecallServ{r} } // SearchRecall 词条回归 diff --git a/app/search_engine/recall/recall.go b/app/search_engine/recall/recall.go index 1813f47..256611e 100644 --- a/app/search_engine/recall/recall.go +++ b/app/search_engine/recall/recall.go @@ -4,23 +4,23 @@ import ( "errors" "sort" - engine2 "github.com/CocaineCong/tangseng/app/search_engine/engine" - segment2 "github.com/CocaineCong/tangseng/app/search_engine/segment" - types2 "github.com/CocaineCong/tangseng/app/search_engine/types" + "github.com/CocaineCong/tangseng/app/search_engine/engine" + "github.com/CocaineCong/tangseng/app/search_engine/segment" + "github.com/CocaineCong/tangseng/app/search_engine/types" log "github.com/CocaineCong/tangseng/pkg/logger" "github.com/CocaineCong/tangseng/pkg/util/relevant" ) // Recall 查询召回 type Recall struct { - *engine2.Engine + *engine.Engine docCount int64 // 文档总数 ,用于计算相关性 enablePhrase bool } // NewRecall -- -func NewRecall(meta *engine2.Meta) *Recall { - e := engine2.NewEngine(meta, segment2.SearchMode) +func NewRecall(meta *engine.Meta) *Recall { + e := engine.NewEngine(meta, segment.SearchMode) var docCount int64 = 0 for _, seg := range e.Seg { num, err := seg.ForwardCount() @@ -29,11 +29,12 @@ func NewRecall(meta *engine2.Meta) *Recall { } docCount += num } + return &Recall{e, docCount, true} } // Search 入口 -func (r *Recall) Search(query string) ([]*types2.SearchItem, error) { +func (r *Recall) Search(query string) ([]*types.SearchItem, error) { err := r.splitQuery2Tokens(query) if err != nil { log.LogrusObj.Errorf("splitQuery2Tokens err:%v", err) @@ -44,7 +45,7 @@ func (r *Recall) Search(query string) ([]*types2.SearchItem, error) { } // SearchQuery 入口 -func (r *Recall) SearchQuery(query string) ([]*types2.DictTireTree, error) { +func (r *Recall) SearchQuery(query string) 
([]*types.DictTireTree, error) { return r.GetDict(query) } @@ -58,8 +59,8 @@ func (r *Recall) splitQuery2Tokens(query string) (err error) { return } -func (r *Recall) searchDoc() (recalls []*types2.SearchItem, err error) { - recalls = make([]*types2.SearchItem, 0) +func (r *Recall) searchDoc() (recalls []*types.SearchItem, err error) { + recalls = make([]*types.SearchItem, 0) // 为每个token初始化游标 for token, post := range r.PostingsHashBuf { @@ -84,7 +85,7 @@ func (r *Recall) searchDoc() (recalls []*types2.SearchItem, err error) { postings = postings.Next continue } - sItem := &types2.SearchItem{ + sItem := &types.SearchItem{ DocId: docId, Content: "", Score: 0.0, @@ -108,7 +109,7 @@ func (r *Recall) searchDoc() (recalls []*types2.SearchItem, err error) { } // calculateScore 计算相关性 -func (r *Recall) calculateScore(token string, searchItem []*types2.SearchItem) (resp []*types2.SearchItem) { +func (r *Recall) calculateScore(token string, searchItem []*types.SearchItem) (resp []*types.SearchItem) { recallToken := make([]string, 0) for i := range searchItem { @@ -132,15 +133,15 @@ func (r *Recall) calculateScore(token string, searchItem []*types2.SearchItem) ( sort.Slice(searchItem, func(i, j int) bool { return searchItem[i].Score > searchItem[j].Score }) - resp = make([]*types2.SearchItem, 0) + resp = make([]*types.SearchItem, 0) resp = searchItem return } // 获取 token 所有seg的倒排表数据 -func (r *Recall) fetchPostingsBySegs(token string) (postings *types2.PostingsList, docCount int64, err error) { - postings = new(types2.PostingsList) +func (r *Recall) fetchPostingsBySegs(token string) (postings *types.PostingsList, docCount int64, err error) { + postings = new(types.PostingsList) for i, seg := range r.Engine.Seg { p, errx := seg.FetchPostings(token) if errx != nil { @@ -149,7 +150,7 @@ func (r *Recall) fetchPostingsBySegs(token string) (postings *types2.PostingsLis return } log.LogrusObj.Infof("post:%v", p) - postings = segment2.MergePostings(postings, p.PostingsList) + postings 
= segment.MergePostings(postings, p.PostingsList) log.LogrusObj.Infof("pos next:%v", postings.Next) docCount += p.DocCount } @@ -158,7 +159,7 @@ func (r *Recall) fetchPostingsBySegs(token string) (postings *types2.PostingsLis return } -func (r *Recall) getContentByDocId(s *types2.SearchItem) (item *types2.SearchItem, err error) { +func (r *Recall) getContentByDocId(s *types.SearchItem) (item *types.SearchItem, err error) { for i, seg := range r.Engine.Seg { p, errx := seg.GetForward(s.DocId) if errx != nil { @@ -168,7 +169,7 @@ func (r *Recall) getContentByDocId(s *types2.SearchItem) (item *types2.SearchIte } s.Content = string(p) } - item = new(types2.SearchItem) + item = new(types.SearchItem) item = s return diff --git a/app/search_engine/segment/db.go b/app/search_engine/segment/db.go index 0a2a0cb..f212ab4 100644 --- a/app/search_engine/segment/db.go +++ b/app/search_engine/segment/db.go @@ -3,7 +3,7 @@ package segment import ( "fmt" - "github.com/CocaineCong/tangseng/app/search_engine/query" + "github.com/CocaineCong/tangseng/app/search_engine/analyzer" "github.com/CocaineCong/tangseng/app/search_engine/storage" "github.com/CocaineCong/tangseng/app/search_engine/types" "github.com/CocaineCong/tangseng/config" @@ -40,7 +40,7 @@ func InitSegmentDb(segId SegId) (invertedDb *storage.InvertedDB, forwardDb *stor } // CreateNewInvertedIndex 创建倒排索引 -func CreateNewInvertedIndex(token query.Tokenization, docCount int64) *types.InvertedIndexValue { +func CreateNewInvertedIndex(token analyzer.Tokenization, docCount int64) *types.InvertedIndexValue { return &types.InvertedIndexValue{ // TODO:优化一下结构 Token: token.Token, PostingsList: new(types.PostingsList), diff --git a/app/search_engine/segment/postings_test.go b/app/search_engine/segment/postings_test.go index c65c56e..42ed3b4 100644 --- a/app/search_engine/segment/postings_test.go +++ b/app/search_engine/segment/postings_test.go @@ -4,7 +4,7 @@ import ( "fmt" "testing" - 
"github.com/CocaineCong/tangseng/app/search_engine/query" + "github.com/CocaineCong/tangseng/app/search_engine/analyzer" "github.com/CocaineCong/tangseng/app/search_engine/types" ) @@ -40,7 +40,7 @@ func TestMergePostings(t *testing.T) { func TestMergeInvertedIndex(t *testing.T) { base := make(InvertedIndexHash) - token := query.Tokenization{ + token := analyzer.Tokenization{ Token: "测试文本", Position: 10, Offset: 100, @@ -52,7 +52,7 @@ func TestMergeInvertedIndex(t *testing.T) { fmt.Println("base", base) addDoc := make(InvertedIndexHash) - token2 := query.Tokenization{ + token2 := analyzer.Tokenization{ Token: "测试文本2", Position: 101, Offset: 1002, diff --git a/app/search_engine/segment/segment.go b/app/search_engine/segment/segment.go index 3dbeb9c..01bd5d9 100644 --- a/app/search_engine/segment/segment.go +++ b/app/search_engine/segment/segment.go @@ -1,7 +1,7 @@ package segment import ( - "github.com/CocaineCong/tangseng/app/search_engine/query" + "github.com/CocaineCong/tangseng/app/search_engine/analyzer" storage2 "github.com/CocaineCong/tangseng/app/search_engine/storage" "github.com/CocaineCong/tangseng/app/search_engine/types" log "github.com/CocaineCong/tangseng/pkg/logger" @@ -16,7 +16,7 @@ type Segment struct { } // Token2PostingsLists 词条 转化成 倒排索引表 -func Token2PostingsLists(bufInvertHash InvertedIndexHash, token query.Tokenization, docId int64) (err error) { +func Token2PostingsLists(bufInvertHash InvertedIndexHash, token analyzer.Tokenization, docId int64) (err error) { bufInvert := new(types.InvertedIndexValue) if len(bufInvertHash) > 0 { if item, ok := bufInvertHash[token.Token]; ok { diff --git a/app/search_engine/service/search_engine.go b/app/search_engine/service/search_engine.go index 5065efb..2af126d 100644 --- a/app/search_engine/service/search_engine.go +++ b/app/search_engine/service/search_engine.go @@ -5,7 +5,7 @@ import ( "fmt" "sync" - "github.com/CocaineCong/tangseng/app/search_engine/index" + 
"github.com/CocaineCong/tangseng/app/search_engine/recall" "github.com/CocaineCong/tangseng/app/search_engine/types" "github.com/CocaineCong/tangseng/consts/e" pb "github.com/CocaineCong/tangseng/idl/pb/search_engine" @@ -31,11 +31,11 @@ func (s *SearchEngineSrv) SearchEngineSearch(ctx context.Context, req *pb.Search resp = new(pb.SearchEngineResponse) resp.Code = e.SUCCESS query := req.Query - sResult, err := index.SearchRecall(query) + sResult, err := recall.SearchRecall(query) if err != nil { resp.Code = e.ERROR resp.Msg = err.Error() - log.LogrusObj.Error("SearchEngineSearch-index.SearchRecall", err) + log.LogrusObj.Error("SearchEngineSearch-recall.SearchRecall", err) return } @@ -55,7 +55,7 @@ func (s *SearchEngineSrv) WordAssociation(ctx context.Context, req *pb.SearchEng resp = new(pb.WordAssociationResponse) resp.Code = e.SUCCESS query := req.Query - sResult, err := index.SearchQuery(query) + sResult, err := recall.SearchQuery(query) wordAssociationList := make([]string, 0) for _, v := range sResult { if v != nil { diff --git a/app/search_engine/storage/forward_db_test.go b/app/search_engine/storage/forward_db_test.go index 4ad2e54..573c214 100644 --- a/app/search_engine/storage/forward_db_test.go +++ b/app/search_engine/storage/forward_db_test.go @@ -6,7 +6,7 @@ import ( bolt "go.etcd.io/bbolt" - "github.com/CocaineCong/tangseng/app/search_engine/query" + "github.com/CocaineCong/tangseng/app/search_engine/analyzer" "github.com/CocaineCong/tangseng/config" log "github.com/CocaineCong/tangseng/pkg/logger" ) @@ -16,7 +16,7 @@ func TestMain(m *testing.M) { re := config.ConfigReader{FileName: "../../../config/config.yaml"} config.InitConfigForTest(&re) log.InitLog() - query.InitSeg() + analyzer.InitSeg() fmt.Println("Write tests on values: ", config.Conf) m.Run() } diff --git a/app/search_engine/test/recall_test.go b/app/search_engine/test/recall_test.go index 8181c66..f771fae 100644 --- a/app/search_engine/test/recall_test.go +++ 
b/app/search_engine/test/recall_test.go @@ -4,8 +4,8 @@ import ( "fmt" "testing" - "github.com/CocaineCong/tangseng/app/search_engine/index" - "github.com/CocaineCong/tangseng/app/search_engine/query" + "github.com/CocaineCong/tangseng/app/search_engine/analyzer" + "github.com/CocaineCong/tangseng/app/search_engine/recall" "github.com/CocaineCong/tangseng/config" log "github.com/CocaineCong/tangseng/pkg/logger" ) @@ -14,7 +14,7 @@ func TestMain(m *testing.M) { // 这个文件相对于config.yaml的位置 re := config.ConfigReader{FileName: "../../../config/config.yaml"} config.InitConfigForTest(&re) - query.InitSeg() + analyzer.InitSeg() log.InitLog() fmt.Println("Write tests on values: ", config.Conf) m.Run() @@ -22,7 +22,7 @@ func TestMain(m *testing.M) { func TestRecall(t *testing.T) { q := "国家,西游记" - searchItem, err := index.SearchRecall(q) + searchItem, err := recall.SearchRecall(q) if err != nil { fmt.Println(err) } diff --git a/idl/pb/search_engine/search_engine.pb.go b/idl/pb/search_engine/search_engine.pb.go index 8d25127..3198761 100644 --- a/idl/pb/search_engine/search_engine.pb.go +++ b/idl/pb/search_engine/search_engine.pb.go @@ -25,8 +25,8 @@ type SearchEngineRequest struct { sizeCache protoimpl.SizeCache unknownFields protoimpl.UnknownFields - // @inject_tag:form:"query" uri:"query" - Query string `protobuf:"bytes,1,opt,name=query,proto3" json:"query,omitempty" form:"query" uri:"query"` + // @inject_tag:form:"query" uri:"query" + Query string `protobuf:"bytes,1,opt,name=query,proto3" json:"query,omitempty" form:"query" uri:"query"` } func (x *SearchEngineRequest) Reset() { diff --git a/pkg/util/relevant/bm25.go b/pkg/util/relevant/bm25.go index 3606a5e..134a4cd 100644 --- a/pkg/util/relevant/bm25.go +++ b/pkg/util/relevant/bm25.go @@ -5,7 +5,7 @@ import ( "github.com/xtgo/set" - "github.com/CocaineCong/tangseng/app/search_engine/query" + "github.com/CocaineCong/tangseng/app/search_engine/analyzer" ) // DocScore is a tuple of the document ID and a score
@@ -80,7 +80,7 @@ func MakeCorpus(a []string) (map[string]int, []string) { invRetVal := make([]string, 0) var id int for _, s := range a { - tokens, _ := query.GseCut(s) + tokens, _ := analyzer.GseCut(s) for _, f := range tokens { if _, ok := retVal[f.Token]; !ok { retVal[f.Token] = id @@ -96,7 +96,7 @@ func MakeDocuments(a []string, c map[string]int) []Document { retVal := make([]Document, 0, len(a)) for _, s := range a { var ts []int - tokens, _ := query.GseCut(s) + tokens, _ := analyzer.GseCut(s) for _, f := range tokens { id := c[f.Token] ts = append(ts, id) diff --git a/pkg/util/relevant/bm25_test.go b/pkg/util/relevant/bm25_test.go index 0d7b57e..9377678 100644 --- a/pkg/util/relevant/bm25_test.go +++ b/pkg/util/relevant/bm25_test.go @@ -5,7 +5,7 @@ import ( "sort" "testing" - "github.com/CocaineCong/tangseng/app/search_engine/query" + "github.com/CocaineCong/tangseng/app/search_engine/analyzer" "github.com/CocaineCong/tangseng/config" log "github.com/CocaineCong/tangseng/pkg/logger" ) @@ -24,7 +24,7 @@ func TestMain(m *testing.M) { // 这个文件相对于config.yaml的位置 re := config.ConfigReader{FileName: "../../../config/config.yaml"} config.InitConfigForTest(&re) - query.InitSeg() + analyzer.InitSeg() log.InitLog() fmt.Println("Write tests on values: ", config.Conf) m.Run()