Skip to content

Commit

Permalink
related: Add config option cardinalityThreshold
Browse files Browse the repository at this point in the history
Fixes #10744
  • Loading branch information
bep committed Feb 23, 2023
1 parent d5601e8 commit e442a63
Show file tree
Hide file tree
Showing 4 changed files with 106 additions and 2 deletions.
4 changes: 4 additions & 0 deletions docs/content/en/content-management/related.md
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,10 @@ applyFilter
weight
: An integer weight that indicates _how important_ this parameter is relative to the other parameters. It can be 0, which has the effect of turning this index off, or even negative. Test with different values to see what fits your content best.


cardinalityThreshold (default 0)
: {{< new-in "0.111.0" >}}. A percentage (0-100) used to remove common keywords from the index. As an example, setting this to 50 will remove all keywords that are used in more than 50% of the documents in the index. The default value of 0 disables this filtering.

pattern
: This is currently only relevant for dates. When listing related content, we may want to list content that is also close in time. Setting "2006" (default value for date indexes) as the pattern for a date index will add weight to pages published in the same year. For busier blogs, "200601" (year and month) may be a better default.

Expand Down
65 changes: 63 additions & 2 deletions related/inverted_index.go
Original file line number Diff line number Diff line change
Expand Up @@ -135,9 +135,21 @@ type IndexConfig struct {
// This field's weight when doing multi-index searches. Higher is "better".
Weight int

// A percentage (0-100) used to remove common keywords from the index.
// As an example, setting this to 50 will remove all keywords that are
// used in more than 50% of the documents in the index.
CardinalityThreshold int

// Will lower case all string values in and queries to this index.
// May give more accurate results, but at a slight performance cost.
ToLower bool

// Counts the number of documents in the index.
numDocs int
}

// incrNumDocs increments the count of documents recorded for this index
// configuration. It mutates the receiver, so it must be called on the stored
// config element rather than on a copy.
func (cfg *IndexConfig) incrNumDocs() {
cfg.numDocs++
}

// Document is the interface an indexable document in Hugo must fulfill.
Expand Down Expand Up @@ -169,6 +181,9 @@ type InvertedIndex struct {

minWeight int
maxWeight int

// No modifications after this is set.
finalized bool
}

func (idx *InvertedIndex) getIndexCfg(name string) (IndexConfig, bool) {
Expand Down Expand Up @@ -202,38 +217,80 @@ func NewInvertedIndex(cfg Config) *InvertedIndex {
// Add documents to the inverted index.
// The value must support == and !=.
func (idx *InvertedIndex) Add(ctx context.Context, docs ...Document) error {
if idx.finalized {
panic("index is finalized")
}
var err error
for _, config := range idx.cfg.Indices {
for i, config := range idx.cfg.Indices {
if config.Weight == 0 {
// Disabled
continue
}
setm := idx.index[config.Name]

for _, doc := range docs {
var added bool
var words []Keyword
words, err = doc.RelatedKeywords(config)
if err != nil {
continue
}

for _, keyword := range words {
added = true
setm[keyword] = append(setm[keyword], doc)
}

if config.Type == TypeFragments {
if fp, ok := doc.(FragmentProvider); ok {
for _, fragment := range fp.Fragments(ctx).Identifiers {
added = true
setm[FragmentKeyword(fragment)] = append(setm[FragmentKeyword(fragment)], doc)
}
}
}

if added {
c := &idx.cfg.Indices[i]
(*c).incrNumDocs()
}
}
}

return err
}

// Finalize prunes overly common keywords from the index and then marks the
// index as finalized.
//
// For every index configuration with a non-zero CardinalityThreshold and at
// least one counted document, a keyword is removed when it is attached to
// strictly more than CardinalityThreshold percent of the documents counted
// for that index. A keyword used by exactly that percentage is kept.
//
// Calling Finalize on an already finalized index is a no-op. ctx is currently
// unused, and the returned error is always nil.
func (idx *InvertedIndex) Finalize(ctx context.Context) error {
	if idx.finalized {
		return nil
	}

	for _, config := range idx.cfg.Indices {
		threshold := config.CardinalityThreshold
		numDocs := config.numDocs
		if threshold == 0 || numDocs == 0 {
			// Filtering disabled, or nothing was indexed.
			continue
		}
		setm := idx.index[config.Name]

		// Remove high cardinality terms.
		//
		// len(v)/numDocs*100 > threshold is evaluated in integers as
		// len(v)*100 > threshold*numDocs. Doing it in floating point and
		// rounding up misclassifies exact matches: 3 of 10 documents yields
		// 30.000000000000004, which rounds up to 31 and would be removed at
		// a threshold of 30.
		limit := threshold * numDocs
		for k, v := range setm {
			if len(v)*100 > limit {
				delete(setm, k)
			}
		}
	}

	idx.finalized = true

	return nil
}

// queryElement holds the index name and keywords that can be used to compose a
// search for related content.
type queryElement struct {
Expand Down Expand Up @@ -548,12 +605,16 @@ func DecodeConfig(m maps.Params) (Config, error) {
}
}
for i := range c.Indices {
if c.Indices[i].Type == "" {
icfg := c.Indices[i]
if icfg.Type == "" {
c.Indices[i].Type = TypeBasic
}
if !validTypes[c.Indices[i].Type] {
return c, fmt.Errorf("invalid index type %q. Must be one of %v", c.Indices[i].Type, xmaps.Keys(validTypes))
}
if icfg.CardinalityThreshold < 0 || icfg.CardinalityThreshold > 100 {
return Config{}, errors.New("cardinalityThreshold threshold must be between 0 and 100")
}
}

return c, nil
Expand Down
35 changes: 35 additions & 0 deletions related/inverted_index_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,41 @@ func (d *testDoc) PublishDate() time.Time {
return d.date
}

// TestCardinalityThreshold checks that Finalize removes keywords that exceed
// an index's CardinalityThreshold and leaves the others in place.
func TestCardinalityThreshold(t *testing.T) {
	c := qt.New(t)
	config := Config{
		Threshold:    90,
		IncludeNewer: false,
		Indices: IndexConfigs{
			IndexConfig{Name: "tags", Weight: 50, CardinalityThreshold: 79},
			IndexConfig{Name: "keywords", Weight: 65, CardinalityThreshold: 90},
		},
	}

	idx := NewInvertedIndex(config)
	hasKeyword := func(index, keyword string) bool {
		_, found := idx.index[index][StringKeyword(keyword)]
		return found
	}

	docs := []Document{
		newTestDoc("tags", "a", "b", "c", "d"),
		newTestDoc("tags", "b", "d", "g"),
		newTestDoc("tags", "b", "d", "g"),
		newTestDoc("tags", "b", "h").addKeywords("keywords", "a"),
		newTestDoc("tags", "g", "h").addKeywords("keywords", "a", "b", "z"),
	}

	// A failure while indexing would invalidate every assertion below, so
	// check it instead of discarding it.
	c.Assert(idx.Add(context.Background(), docs...), qt.IsNil)
	c.Assert(idx.Finalize(context.Background()), qt.IsNil)

	// Only tags=b should be removed: it is on 4 of the 5 documents (80%),
	// which is above the 79% threshold configured for the tags index.
	c.Assert(hasKeyword("tags", "a"), qt.Equals, true)
	c.Assert(hasKeyword("tags", "b"), qt.Equals, false)
	c.Assert(hasKeyword("tags", "d"), qt.Equals, true)
	c.Assert(hasKeyword("keywords", "b"), qt.Equals, true)
}

func TestSearch(t *testing.T) {
config := Config{
Threshold: 90,
Expand Down
4 changes: 4 additions & 0 deletions resources/page/pages_related.go
Original file line number Diff line number Diff line change
Expand Up @@ -236,5 +236,9 @@ func (s *RelatedDocsHandler) getOrCreateIndex(ctx context.Context, p Pages) (*re

s.postingLists = append(s.postingLists, &cachedPostingList{p: p, postingList: searchIndex})

if err := searchIndex.Finalize(ctx); err != nil {
return nil, err
}

return searchIndex, nil
}

0 comments on commit e442a63

Please sign in to comment.