forked from atom/flight-manual.atom.io
-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.js
421 lines (367 loc) · 13.2 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
/*!
* lunr.Index
* Copyright (C) @YEAR Oliver Nightingale
*/
/**
 * lunr.Index is an object that manages a search index. It holds the document
 * store, the token store and the sorted set of corpus tokens, and it provides
 * the main user facing API for the library.
 *
 * Inverse document frequencies are memoised in `_idfCache`; a handler
 * registered for the 'add', 'remove' and 'update' events replaces that cache
 * with an empty object.
 *
 * @constructor
 */
lunr.Index = function () {
  var index = this

  // Indexing configuration: the searchable fields and the name of the
  // property that identifies a document.
  index._fields = []
  index._ref = 'id'

  // Collaborating objects.
  index.pipeline = new lunr.Pipeline
  index.documentStore = new lunr.Store
  index.tokenStore = new lunr.TokenStore
  index.corpusTokens = new lunr.SortedSet
  index.eventEmitter = new lunr.EventEmitter

  index._idfCache = {}

  // Cached idf values are reset whenever the handler below is invoked.
  index.on('add', 'remove', 'update', function () {
    index._idfCache = {}
  })
}
/**
 * Binds a handler to one or more events emitted by the index.
 *
 * Every argument is forwarded unchanged to the `addListener` method of the
 * index's event emitter, e.g. `idx.on('add', 'remove', handler)`.
 *
 * @param {String} [eventName] The name(s) of events to bind the function to.
 * @param {Function} handler The function to bind to the named events.
 * @returns {*} Whatever the event emitter's `addListener` returns.
 * @memberOf Index
 */
lunr.Index.prototype.on = function () {
  var emitter = this.eventEmitter

  return emitter.addListener.apply(emitter, arguments)
}
/**
 * Removes a handler from an event emitted by the index.
 *
 * The call is delegated to the `removeListener` method of the index's event
 * emitter.
 *
 * @param {String} name The name of the event to remove the function from.
 * @param {Function} fn The handler to remove.
 * @returns {*} Whatever the event emitter's `removeListener` returns.
 * @memberOf Index
 */
lunr.Index.prototype.off = function (name, fn) {
  var emitter = this.eventEmitter

  return emitter.removeListener(name, fn)
}
/**
 * Loads a previously serialised index.
 *
 * Issues a warning through `lunr.utils.warn` if the index being imported was
 * serialised by a different version of lunr; loading continues regardless.
 *
 * @param {Object} serialisedData The serialised index to load, with
 * `version`, `fields`, `ref`, `documentStore`, `tokenStore`, `corpusTokens`
 * and `pipeline` properties.
 * @returns {lunr.Index}
 * @memberOf Index
 */
lunr.Index.load = function (serialisedData) {
  var importedVersion = serialisedData.version

  if (importedVersion !== lunr.version) {
    lunr.utils.warn('version mismatch: current ' + lunr.version + ' importing ' + importedVersion)
  }

  // `this` is the constructor that `load` was called on.
  var restored = new this

  restored._fields = serialisedData.fields
  restored._ref = serialisedData.ref

  // Each collaborator is rebuilt by its own loader.
  restored.documentStore = lunr.Store.load(serialisedData.documentStore)
  restored.tokenStore = lunr.TokenStore.load(serialisedData.tokenStore)
  restored.corpusTokens = lunr.SortedSet.load(serialisedData.corpusTokens)
  restored.pipeline = lunr.Pipeline.load(serialisedData.pipeline)

  return restored
}
/**
 * Adds a field to the list of fields that will be searchable within documents
 * in the index.
 *
 * An optional boost can be passed in the options object to affect how much
 * tokens in this field rank in search results; by default the boost is 1.
 *
 * Fields should be added before any documents are added to the index. Fields
 * added afterwards only apply to documents added from then on.
 *
 * @param {String} fieldName The name of the field within the document that
 * should be indexed.
 * @param {Object} [opts] Optional settings for the field.
 * @param {Number} [opts.boost] Boost applied to terms in this field. Any
 * falsy value (missing, or 0) falls back to 1.
 * @returns {lunr.Index} The index itself, so calls can be chained.
 * @memberOf Index
 */
lunr.Index.prototype.field = function (fieldName, opts) {
  var options = opts || {}

  this._fields.push({
    name: fieldName,
    boost: options.boost || 1
  })

  return this
}
/**
 * Sets the property used to uniquely identify documents added to the index;
 * by default this property is 'id'.
 *
 * This should only be changed before adding documents to the index. Changing
 * the ref property without resetting the index can lead to unexpected
 * results.
 *
 * @param {String} refName The property to use to uniquely identify the
 * documents in the index.
 * @returns {lunr.Index} The index itself, so calls can be chained.
 * @memberOf Index
 */
lunr.Index.prototype.ref = function (refName) {
  var index = this

  index._ref = refName

  return index
}
/**
 * Add a document to the index.
 *
 * This is the way new documents enter the index. Each configured field of the
 * document is tokenised and run through the index's pipeline, the resulting
 * tokens are recorded in the document store and the corpus token set, and a
 * term frequency for every distinct token is written to the token store.
 *
 * The term frequency of a token is the sum, over all non-empty fields, of
 * (occurrences in field / number of tokens in field) * field boost.
 *
 * An 'add' event is emitted with the document that has been added and the index
 * the document has been added to. This event can be silenced by passing false
 * as the second argument to add.
 *
 * @param {Object} doc The document to add to the index.
 * @param {Boolean} emitEvent Whether or not to emit events, default true.
 * @memberOf Index
 */
lunr.Index.prototype.add = function (doc, emitEvent) {
  var docTokens = {},
      allDocumentTokens = new lunr.SortedSet,
      docRef = doc[this._ref],
      shouldEmit = emitEvent === undefined ? true : emitEvent,
      addToSet = lunr.SortedSet.prototype.add

  this._fields.forEach(function (field) {
    var fieldTokens = this.pipeline.run(lunr.tokenizer(doc[field.name]))

    docTokens[field.name] = fieldTokens

    // Tokens are added one call at a time. Spreading the whole array through
    // `apply` can exceed the engine's argument-count limit for large fields.
    // NOTE(review): assumes SortedSet#add treats each argument as an
    // independent element, as the previous variadic call also did.
    for (var i = 0; i < fieldTokens.length; i++) {
      addToSet.call(allDocumentTokens, fieldTokens[i])
    }
  }, this)

  this.documentStore.set(docRef, allDocumentTokens)

  // Same reasoning as above: avoid passing every distinct token as a
  // separate argument in a single call.
  var distinctTokens = allDocumentTokens.toArray()
  for (var j = 0; j < distinctTokens.length; j++) {
    addToSet.call(this.corpusTokens, distinctTokens[j])
  }

  for (var t = 0; t < allDocumentTokens.length; t++) {
    var token = allDocumentTokens.elements[t],
        tf = 0

    for (var f = 0; f < this._fields.length; f++) {
      var field = this._fields[f],
          tokensInField = docTokens[field.name],
          fieldLength = tokensInField.length

      // Fields that produced no tokens contribute nothing.
      if (!fieldLength) continue

      // Count occurrences in place rather than building a filtered copy of
      // the field's tokens for every distinct token.
      var tokenCount = 0
      for (var k = 0; k < fieldLength; k++) {
        if (tokensInField[k] === token) tokenCount++
      }

      tf = tf + (tokenCount / fieldLength * field.boost)
    }

    this.tokenStore.add(token, { ref: docRef, tf: tf })
  }

  if (shouldEmit) this.eventEmitter.emit('add', doc, this)
}
/**
 * Removes a document from the index.
 *
 * To make sure documents no longer show up in search results they can be
 * removed from the index using this method. Nothing happens, and no event is
 * emitted, when the document store has no entry for the document's ref.
 *
 * The document passed only needs to have the same ref property value as the
 * document that was added to the index; it can otherwise be a completely
 * different object.
 *
 * A 'remove' event is emitted with the document that has been removed and the
 * index it has been removed from. This event can be silenced by passing false
 * as the second argument to remove.
 *
 * @param {Object} doc The document to remove from the index.
 * @param {Boolean} emitEvent Whether to emit remove events, defaults to true
 * @memberOf Index
 */
lunr.Index.prototype.remove = function (doc, emitEvent) {
  var index = this,
      docRef = doc[index._ref],
      shouldEmit = emitEvent === undefined || emitEvent

  // Unknown document: nothing to remove.
  if (!index.documentStore.has(docRef)) return

  // Fetch the stored tokens before the document store entry is dropped.
  var storedTokens = index.documentStore.get(docRef)
  index.documentStore.remove(docRef)

  storedTokens.forEach(function (token) {
    index.tokenStore.remove(token, docRef)
  }, index)

  if (shouldEmit) index.eventEmitter.emit('remove', doc, index)
}
/**
 * Updates a document in the index.
 *
 * When a document contained within the index changes (fields changed, added
 * or removed) it should be updated in the index so that it is matched
 * correctly against search queries.
 *
 * This method is just a wrapper around `remove` and `add`.
 *
 * An 'update' event is emitted with the document that has been updated and
 * the index. This event can be silenced by passing false as the second
 * argument to update. Only an update event is fired; the 'add' and 'remove'
 * events of the underlying calls are silenced.
 *
 * @param {Object} doc The document to update in the index.
 * @param {Boolean} emitEvent Whether to emit update events, defaults to true
 * @see Index.prototype.remove
 * @see Index.prototype.add
 * @memberOf Index
 */
lunr.Index.prototype.update = function (doc, emitEvent) {
  var shouldEmit = emitEvent === undefined || emitEvent

  // Both underlying calls are made with their own events switched off.
  this.remove(doc, false)
  this.add(doc, false)

  if (shouldEmit) this.eventEmitter.emit('update', doc, this)
}
/**
 * Calculates the inverse document frequency for a term within the index.
 *
 * The value is 1 + ln(tokenStore.length / tokenStore.count(term)) when the
 * count is positive, and 1 otherwise. Results are memoised in `_idfCache`
 * under the key "@" + term.
 *
 * @param {String} term The term to calculate the idf of.
 * @returns {Number} The idf of the term.
 * @private
 * @memberOf Index
 */
lunr.Index.prototype.idf = function (term) {
  var cache = this._idfCache,
      cacheKey = "@" + term

  if (Object.prototype.hasOwnProperty.call(cache, cacheKey)) {
    return cache[cacheKey]
  }

  var documentFrequency = this.tokenStore.count(term),
      idf = documentFrequency > 0
        ? 1 + Math.log(this.tokenStore.length / documentFrequency)
        : 1

  this._idfCache[cacheKey] = idf

  return idf
}
/**
 * Searches the index using the passed query.
 *
 * Queries should be a string. Multiple words are allowed and lead to an AND
 * based query, e.g. `idx.search('foo bar')` will run a search for documents
 * containing both 'foo' and 'bar'.
 *
 * All query tokens are passed through the same pipeline that document tokens
 * are passed through, so any language processing involved will be run on
 * every query term.
 *
 * Each query term is expanded, so that the term 'he' might be expanded to
 * 'hello' and 'help' if those terms were already included in the index.
 * Expanded terms that differ from the query term are scored lower than exact
 * matches.
 *
 * Matching documents are returned as an array of objects, sorted by
 * descending score. Each object contains the matching document ref, as set
 * for this index, and the similarity score for this document against the
 * query. An empty array is returned when none of the query tokens is in the
 * token store.
 *
 * @param {String} query The query to search the index with.
 * @returns {Object[]} Array of `{ ref, score }` results, best match first.
 * @see Index.prototype.idf
 * @see Index.prototype.documentVector
 * @memberOf Index
 */
lunr.Index.prototype.search = function (query) {
  var index = this,
      queryTokens = index.pipeline.run(lunr.tokenizer(query)),
      queryVector = new lunr.Vector,
      documentSets = [],
      fieldBoosts = 0

  index._fields.forEach(function (field) {
    fieldBoosts = fieldBoosts + field.boost
  })

  var hasSomeToken = queryTokens.some(function (token) {
    return index.tokenStore.has(token)
  })

  if (!hasSomeToken) return []

  // Weighting applied to an expanded key: 1 for an exact match, otherwise
  // 1 / ln(d) where d is the length difference, never taken as less than 3.
  var similarityBoostFor = function (key, token) {
    if (key === token) return 1

    return 1 / Math.log(Math.max(3, key.length - token.length))
  }

  queryTokens.forEach(function (token, i, tokens) {
    var tf = 1 / tokens.length * index._fields.length * fieldBoosts,
        expandedKeys = index.tokenStore.expand(token)

    var tokenDocuments = expandedKeys.reduce(function (matched, key) {
      var pos = index.corpusTokens.indexOf(key),
          idf = index.idf(key),
          similarityBoost = similarityBoostFor(key, token),
          keyDocuments = new lunr.SortedSet

      // Record the query's tf-idf score for this key, but only for keys
      // that have a position in the corpus.
      if (pos > -1) queryVector.insert(pos, tf * idf * similarityBoost)

      // Collect the refs of every document that contains this key.
      Object.keys(index.tokenStore.get(key)).forEach(function (ref) {
        keyDocuments.add(ref)
      })

      return matched.union(keyDocuments)
    }, new lunr.SortedSet)

    documentSets.push(tokenDocuments)
  })

  // AND semantics: keep only documents present in every token's set. There
  // is at least one set here because at least one query token exists.
  var documentSet = documentSets[0]
  for (var n = 1; n < documentSets.length; n++) {
    documentSet = documentSet.intersect(documentSets[n])
  }

  var results = documentSet.map(function (ref) {
    return { ref: ref, score: queryVector.similarity(index.documentVector(ref)) }
  }, index)

  return results.sort(function (a, b) {
    return b.score - a.score
  })
}
/**
 * Generates a vector for the document matching the passed documentRef.
 *
 * For every token stored for the document, the tf-idf score (the term
 * frequency held in the token store multiplied by the token's idf) is
 * inserted into the vector at the token's position within the corpus tokens.
 *
 * @param {Object} documentRef The ref to find the document with.
 * @returns {lunr.Vector}
 * @private
 * @memberOf Index
 */
lunr.Index.prototype.documentVector = function (documentRef) {
  var documentTokens = this.documentStore.get(documentRef),
      tokenCount = documentTokens.length,
      vector = new lunr.Vector

  for (var i = 0; i < tokenCount; i += 1) {
    var token = documentTokens.elements[i]
    var tf = this.tokenStore.get(token)[documentRef].tf
    var idf = this.idf(token)
    var position = this.corpusTokens.indexOf(token)

    vector.insert(position, tf * idf)
  }

  return vector
}
/**
 * Returns a representation of the index ready for serialisation.
 *
 * The result carries the current lunr version, the field definitions, the
 * ref property name and the `toJSON` output of the document store, token
 * store, corpus tokens and pipeline.
 *
 * @returns {Object}
 * @memberOf Index
 */
lunr.Index.prototype.toJSON = function () {
  var serialised = {}

  serialised.version = lunr.version
  serialised.fields = this._fields
  serialised.ref = this._ref
  serialised.documentStore = this.documentStore.toJSON()
  serialised.tokenStore = this.tokenStore.toJSON()
  serialised.corpusTokens = this.corpusTokens.toJSON()
  serialised.pipeline = this.pipeline.toJSON()

  return serialised
}
/**
 * Applies a plugin to the current index.
 *
 * A plugin is just a function that encapsulates some custom behaviour that
 * should be applied to the index, customising or extending it in some way.
 *
 * The plugin function is called with the index both as its context (`this`)
 * and as its first argument. Any additional arguments passed to `use` follow
 * the index in the plugin's argument list.
 *
 * Example:
 *
 *     var myPlugin = function (idx, arg1, arg2) {
 *       // `this` is the index to be extended
 *       // apply any extensions etc here.
 *     }
 *
 *     var idx = lunr(function () {
 *       this.use(myPlugin, 'arg1', 'arg2')
 *     })
 *
 * @param {Function} plugin The plugin to apply.
 * @memberOf Index
 */
lunr.Index.prototype.use = function (plugin) {
  var extraArgs = Array.prototype.slice.call(arguments, 1),
      pluginArgs = [this].concat(extraArgs)

  plugin.apply(this, pluginArgs)
}