From fd61ee76a2852417bdf6b5617585259c57ce59ab Mon Sep 17 00:00:00 2001 From: Kailash Nadh Date: Sat, 11 Dec 2021 15:59:16 +0530 Subject: [PATCH] Remove Go plugin tokenizers and bundle them natively instead. Go plugins, unfortunately, have severe limitations and are not ideal for plugins that may need wide distribution. https://github.com/golang/go/issues/20481 This patch gets rid of the tokenizer plugin system entirely and just bundles the available tokenizers (phonetic: Kannada, Malayalam) into the core. Widely usable tokenizers can henceforth be bundled into the core just like how Postgres comes with bundled TSVECTOR dictionaries. Also, it is possible to write custom tokenizers as Postgres plugins and load them into Postgres dynamically, making the Go tokenizer plugin system superfluous. --- Makefile | 9 +-- cmd/dictmaker/init.go | 87 ++++++++++------------------- cmd/dictmaker/main.go | 17 ++---- go.mod | 6 +- go.sum | 9 +-- internal/data/data.go | 19 ++++--- tokenizers/indicphone/indicphone.go | 79 ++++++++++++++++++++++++++ 7 files changed, 132 insertions(+), 94 deletions(-) create mode 100644 tokenizers/indicphone/indicphone.go diff --git a/Makefile b/Makefile index d28565e..6b0a46e 100644 --- a/Makefile +++ b/Makefile @@ -12,17 +12,12 @@ deps: .PHONY: build build: - go build -o ${BIN} -ldflags="-s -w -X 'main.buildString=${BUILDSTR}'" cmd/${BIN}/*.go + go build -gcflags="-G=3" -o ${BIN} -ldflags="-s -w -X 'main.buildString=${BUILDSTR}'" cmd/${BIN}/*.go .PHONY: run -run: build build-tokenizers +run: ./${BIN} -.PHONY: build-tokenizers -build-tokenizers: - go build -ldflags="-s -w" -buildmode=plugin -o kannada.tk tokenizers/kannada/kannada.go - go build -ldflags="-s -w" -buildmode=plugin -o malayalam.tk tokenizers/malayalam/malayalam.go - # Compile bin and bundle static assets. 
.PHONY: dist dist: build build-tokenizers diff --git a/cmd/dictmaker/init.go b/cmd/dictmaker/init.go index 8266561..fa156c9 100644 --- a/cmd/dictmaker/init.go +++ b/cmd/dictmaker/init.go @@ -5,44 +5,43 @@ import ( "fmt" "html/template" "io/ioutil" - "log" "net/http" "net/url" "os" "path/filepath" - "plugin" "strings" "github.com/go-chi/chi" "github.com/go-chi/chi/middleware" "github.com/jmoiron/sqlx" "github.com/knadh/dictmaker/internal/data" + "github.com/knadh/dictmaker/tokenizers/indicphone" "github.com/knadh/koanf" "github.com/knadh/stuffbin" ) -// connectDB initializes a database connection. -func connectDB(host string, port int, user, pwd, dbName string) (*sqlx.DB, error) { +// initDB initializes a database connection. +func initDB(host string, port int, user, pwd, dbName string) *sqlx.DB { db, err := sqlx.Connect("postgres", fmt.Sprintf("host=%s port=%d user=%s password=%s dbname=%s sslmode=disable", host, port, user, pwd, dbName)) if err != nil { - return nil, err + logger.Fatalf("error intiializing DB: %v", err) } - return db, nil + return db } -// initFileSystem initializes the stuffbin FileSystem to provide +// initFS initializes the stuffbin FileSystem to provide // access to bunded static assets to the app. -func initFileSystem() (stuffbin.FileSystem, error) { +func initFS() stuffbin.FileSystem { path, err := os.Executable() if err != nil { - return nil, err + logger.Fatalf("error getting executable path: %v", err) } fs, err := stuffbin.UnStuff(path) if err == nil { - return fs, nil + return fs } // Running in local mode. Load the required static assets into @@ -58,10 +57,10 @@ func initFileSystem() (stuffbin.FileSystem, error) { fs, err = stuffbin.NewLocalFS("/", files...) if err != nil { - return nil, fmt.Errorf("failed to initialize local file for assets: %v", err) + logger.Fatalf("failed to initialize local file for assets: %v", err) } - return fs, nil + return fs } // loadSiteTheme loads a theme from a directory. 
@@ -104,36 +103,16 @@ func loadSiteTheme(path string, loadPages bool) (*template.Template, error) { func initAdminTemplates(path string) *template.Template { t, err := template.New("admin").ParseGlob(path + "/*.html") if err != nil { - log.Fatalf("error loading admin templates: %v", err) + logger.Fatalf("error loading admin templates: %v", err) } return t } -// loadTokenizerPlugin loads a tokenizer plugin that implements data.Tokenizer -// from the given path. -func loadTokenizerPlugin(path string) (data.Tokenizer, error) { - plg, err := plugin.Open(path) - if err != nil { - return nil, fmt.Errorf("error loading tokenizer plugin '%s': %v", path, err) - } - - newFunc, err := plg.Lookup("New") - if err != nil { - return nil, fmt.Errorf("New() function not found in plugin '%s': %v", path, err) - } - - f, ok := newFunc.(func() (data.Tokenizer, error)) - if !ok { - return nil, fmt.Errorf("New() function is of invalid type in plugin '%s'", path) +// initTokenizers initializes all bundled tokenizers. +func initTokenizers() map[string]data.Tokenizer { + return map[string]data.Tokenizer{ + "indicphone": indicphone.New(), } - - // Initialize the plugin. - p, err := f() - if err != nil { - return nil, fmt.Errorf("error initializing provider plugin '%s': %v", path, err) - } - - return p, err } // initHandlers registers HTTP handlers. @@ -183,32 +162,29 @@ func initHandlers(r *chi.Mux, app *App) { // initLangs loads language configuration into a given *App instance. func initLangs(ko *koanf.Koanf) data.LangMap { - out := make(data.LangMap) + var ( + tks = initTokenizers() + out = make(data.LangMap) + ) // Language configuration. for _, l := range ko.MapKeys("lang") { lang := data.Lang{Types: make(map[string]string)} if err := ko.UnmarshalWithConf("lang."+l, &lang, koanf.UnmarshalConf{Tag: "json"}); err != nil { - log.Fatalf("error loading languages: %v", err) + logger.Fatalf("error loading languages: %v", err) } - // Load external plugin. 
- logger.Printf("language: %s", l) - - if lang.TokenizerType == "plugin" { - tk, err := loadTokenizerPlugin(lang.TokenizerName) - if err != nil { - log.Fatalf("error loading tokenizer plugin for %s: %v", l, err) + // Does the language use a bundled tokenizer? + if lang.TokenizerType == "custom" { + t, ok := tks[lang.TokenizerName] + if !ok { + logger.Fatalf("unknown custom tokenizer '%s'", lang.TokenizerName) } - - lang.Tokenizer = tk - - // Tokenizations for search queries are looked up by the tokenizer - // ID() returned by the plugin and not the filename in the config. - lang.TokenizerName = tk.ID() - logger.Printf("loaded tokenizer %s", lang.TokenizerName) + lang.Tokenizer = t } + // Load external plugin. + logger.Printf("language: %s", l) out[l] = lang } @@ -222,10 +198,7 @@ func generateNewFiles() error { // Initialize the static file system into which all // required static assets (.sql, .js files etc.) are loaded. - fs, err := initFileSystem() - if err != nil { - return err - } + fs := initFS() // Generate config file. b, err := fs.Read("config.toml.sample") diff --git a/cmd/dictmaker/main.go b/cmd/dictmaker/main.go index be0231b..1c86457 100644 --- a/cmd/dictmaker/main.go +++ b/cmd/dictmaker/main.go @@ -77,7 +77,7 @@ func init() { f.Bool("version", false, "current version of the build") if err := f.Parse(os.Args[1:]); err != nil { - log.Fatalf("error parsing flags: %v", err) + logger.Fatalf("error parsing flags: %v", err) } if ok, _ := f.GetBool("version"); ok { @@ -113,23 +113,14 @@ func init() { func main() { // Connect to the DB. - db, err := connectDB(ko.String("db.host"), + db := initDB(ko.String("db.host"), ko.Int("db.port"), ko.String("db.user"), ko.String("db.password"), ko.String("db.db"), ) - if err != nil { - logger.Fatalf("error connecting to DB: %v", err) - } - defer db.Close() - fs, err := initFileSystem() - if err != nil { - logger.Fatal(err) - } - // Initialize the app context that's passed around. 
app := &App{ constants: constants{ @@ -137,7 +128,7 @@ func main() { RootURL: ko.String("app.root_url"), }, db: db, - fs: fs, + fs: initFS(), logger: logger, } @@ -148,7 +139,7 @@ func main() { } // Load SQL queries. - qB, err := fs.Read("/queries.sql") + qB, err := app.fs.Read("/queries.sql") if err != nil { logger.Fatalf("error reading queries.sql: %v", err) } diff --git a/go.mod b/go.mod index c634f7b..3d7825e 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/knadh/dictmaker -go 1.12 +go 1.17 require ( github.com/go-chi/chi v4.1.2+incompatible @@ -10,13 +10,13 @@ require ( github.com/knadh/koanf v0.15.0 github.com/knadh/paginator v0.0.0-20210310070812-ae09d514e148 github.com/knadh/stuffbin v1.1.0 - github.com/kr/pretty v0.1.0 // indirect github.com/lib/pq v1.10.0 github.com/mitchellh/mapstructure v1.4.1 // indirect github.com/pelletier/go-toml v1.8.1 // indirect github.com/spf13/pflag v1.0.5 gitlab.com/joice/mlphone-go v0.0.0-20201001084309-2bb02984eed8 golang.org/x/sys v0.0.0-20210326220804-49726bf1d181 // indirect - gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect gopkg.in/volatiletech/null.v6 v6.0.0-20170828023728-0bef4e07ae1b ) + +require github.com/fsnotify/fsnotify v1.4.9 // indirect diff --git a/go.sum b/go.sum index 50b6264..75a389c 100644 --- a/go.sum +++ b/go.sum @@ -59,11 +59,6 @@ github.com/knadh/paginator v0.0.0-20210310070812-ae09d514e148 h1:5KojMX5qCcq89QL github.com/knadh/paginator v0.0.0-20210310070812-ae09d514e148/go.mod h1:80FK5OPRRQQKEK75ahG+92/MdX/lu4dE8loTzJRVcCQ= github.com/knadh/stuffbin v1.1.0 h1:f5S5BHzZALjuJEgTIOMC9NidEnBJM7Ze6Lu1GHR/lwU= github.com/knadh/stuffbin v1.1.0/go.mod h1:yVCFaWaKPubSNibBsTAJ939q2ABHudJQxRWZWV5yh+4= -github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI= -github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= -github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= -github.com/kr/text v0.1.0 
h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= -github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/lib/pq v1.2.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo= github.com/lib/pq v1.10.0 h1:Zx5DJFEYQXio93kgXnQ09fXNiUKsqv4OUEu2UtGcB1E= github.com/lib/pq v1.10.0/go.mod h1:AlVN5x4E4T544tWzH6hKfbfQvm3HdbOxrmggDNAPY9o= @@ -113,6 +108,7 @@ golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73r golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859 h1:R/3boaszxrf1GEUWTVDzSKVwLmSJpwZ1yqXm8j0v2QI= golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -130,6 +126,7 @@ golang.org/x/sys v0.0.0-20200331124033-c3d80250170d/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20210326220804-49726bf1d181 h1:64ChN/hjER/taL4YJuA+gpLfIMT+/NFherRZixbxOhg= golang.org/x/sys v0.0.0-20210326220804-49726bf1d181/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.1-0.20181227161524-e6919f6577db h1:6/JqlYfC1CCaLnGceQTI+sDGhC9UBSPAsBqI0Gun6kU= golang.org/x/text v0.3.1-0.20181227161524-e6919f6577db/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod 
h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= @@ -146,8 +143,6 @@ google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZi google.golang.org/grpc v1.22.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= gopkg.in/asn1-ber.v1 v1.0.0-20181015200546-f715ec2f112d/go.mod h1:cuepJuh7vyXfUyUwEgHQXw849cJrilpS5NeIjOWESAw= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= -gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY= -gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/square/go-jose.v2 v2.3.1/go.mod h1:M9dMgbHiYLoDGQrXy7OpJDJWiKiU//h+vD76mk0e1AI= gopkg.in/volatiletech/null.v6 v6.0.0-20170828023728-0bef4e07ae1b h1:P+3+n9hUbqSDkSdtusWHVPQRrpRpLiLFzlZ02xXskM0= gopkg.in/volatiletech/null.v6 v6.0.0-20170828023728-0bef4e07ae1b/go.mod h1:0LRKfykySnChgQpG3Qpk+bkZFWazQ+MMfc5oldQCwnY= diff --git a/internal/data/data.go b/internal/data/data.go index a9c35f7..a997c58 100644 --- a/internal/data/data.go +++ b/internal/data/data.go @@ -32,16 +32,13 @@ type LangMap map[string]Lang // Tokenizer represents a function that takes a string // and returns a list of Postgres tsvector tokens. type Tokenizer interface { - ID() string - Name() string - // Tokenize takes a string and tokenizes it into a list of tsvector tokens // that can be stored in the database for fulltext search. - ToTokens(string) []string + ToTokens(s string, lang string) ([]string, error) // ToTSQuery takes a search string and returns a Postgres tsquery string, // for example 'fat & cat`. - ToQuery(string) string + ToQuery(s string, lang string) (string, error) } // Token represents a Postgres tsvector token. 
@@ -182,7 +179,11 @@ func (d *Data) Search(q Query) (Entries, int, error) {
 	} else {
 		// If there's an external tokenizer loaded, run it to get the tokens
 		// and pass it to the DB directly instructing the DB not to tokenize internally.
-		tsVectorQuery = tk.ToQuery(q.Query)
+		var err error
+		tsVectorQuery, err = tk.ToQuery(q.Query, q.FromLang)
+		if err != nil {
+			return nil, 0, err
+		}
 	}
 
 	// Filters ($1 to $3)
@@ -293,7 +294,11 @@ func (d *Data) InsertEntry(e Entry) (int, error) {
 		} else {
 			// If there's an external tokenizer loaded, run it to get the tokens
 			// and pass it to the DB directly instructing the DB not to tokenize internally.
-			tokens = strings.Join(lang.Tokenizer.ToTokens(e.Content), " ")
+			t, err := lang.Tokenizer.ToTokens(e.Content, e.Lang)
+			if err != nil {
+				return 0, err
+			}
+			tokens = strings.Join(t, " ")
 		}
 	}
 
diff --git a/tokenizers/indicphone/indicphone.go b/tokenizers/indicphone/indicphone.go
new file mode 100644
index 0000000..6849ca0
--- /dev/null
+++ b/tokenizers/indicphone/indicphone.go
@@ -0,0 +1,79 @@
+package indicphone
+
+import (
+	"errors"
+	"fmt"
+	"strings"
+
+	"github.com/knadh/dictmaker/internal/data"
+	"github.com/knadh/knphone"
+	"gitlab.com/joice/mlphone-go"
+)
+
+// IndicPhone is a phonetic tokenizer that generates phonetic tokens for
+// Indian languages. It is similar to Metaphone for English.
+type IndicPhone struct {
+	kn *knphone.KNphone
+	ml *mlphone.MLPhone
+}
+
+// New returns a new instance of the IndicPhone tokenizer.
+func New() *IndicPhone {
+	return &IndicPhone{
+		kn: knphone.New(),
+		ml: mlphone.New(),
+	}
+}
+
+// ToTokens tokenizes a string in the given language and returns an array of tsvector tokens.
+// eg: [KRM0 KRM] or [KRM:2 KRM:1] with weights.
+func (ip *IndicPhone) ToTokens(s string, lang string) ([]string, error) {
+	if lang != "kannada" && lang != "malayalam" {
+		return nil, errors.New("unknown language to tokenize")
+	}
+
+	var (
+		chunks = strings.Split(s, " ")
+		tokens = make([]data.Token, 0, len(chunks)*3) // Three keys per chunk.
+
+		key0, key1, key2 string
+	)
+	for _, c := range chunks {
+		switch lang {
+		case "kannada":
+			key0, key1, key2 = ip.kn.Encode(c)
+		case "malayalam":
+			key0, key1, key2 = ip.ml.Encode(c)
+		}
+
+		tokens = append(tokens,
+			data.Token{Token: key0, Weight: 3},
+			data.Token{Token: key1, Weight: 2},
+			data.Token{Token: key2, Weight: 1})
+	}
+
+	return data.TokensToTSVector(tokens), nil
+}
+
+// ToQuery encodes a search string in the given language ("kannada" or
+// "malayalam") into a phonetic Postgres tsquery string.
+func (ip *IndicPhone) ToQuery(s string, lang string) (string, error) {
+	var key0, key1, key2 string
+
+	switch lang {
+	case "kannada":
+		key0, key1, key2 = ip.kn.Encode(s)
+	case "malayalam":
+		key0, key1, key2 = ip.ml.Encode(s) // Same encoder ToTokens uses for Malayalam.
+	}
+
+	if key0 == "" { // No key produced (this includes any other lang value).
+		return "", nil
+	}
+
+	if key0 != key1 {
+		return fmt.Sprintf("%s | (%s & %s) ", key2, key1, key0), nil
+	}
+
+	return key0, nil
+}