From adfbe5bc7e908a57d0bf197c611ff03cd77d15b3 Mon Sep 17 00:00:00 2001 From: Tariq Ali Date: Tue, 29 Nov 2016 11:03:41 -0600 Subject: [PATCH] Fix searching issues with classifier-reborn (#77) * Raise Runtime error if no documents are similar to the document you are searching for * Reject documents that are composed of stopwords and words that are 2 characters or less * Remove remnants of debugging code --- lib/classifier-reborn/lsi.rb | 8 ++++++++ test/lsi/lsi_test.rb | 20 ++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/lib/classifier-reborn/lsi.rb b/lib/classifier-reborn/lsi.rb index 93c54a7..7776691 100644 --- a/lib/classifier-reborn/lsi.rb +++ b/lib/classifier-reborn/lsi.rb @@ -64,6 +64,9 @@ def needs_rebuild? # def add_item(item, *categories, &block) clean_word_hash = Hasher.clean_word_hash((block ? block.call(item) : item.to_s), @language) + if clean_word_hash.empty? + raise "#{item} is composed entirely of stopwords and words that are 2 characters or less. Classifier-Reborn cannot handle this document properly, and thus summarily rejected it." + end @items[item] = if @cache_node_vectors CachedContentNode.new(clean_word_hash, *categories) else @@ -200,6 +203,11 @@ def proximity_norms_for_content(doc, &block) return [] if needs_rebuild? content_node = node_for_content(doc, &block) + + if $GSL && content_node.raw_norm.isnan?.all? 
+ raise "There are no documents that are similar to #{doc}" + end + result = @items.keys.collect do |item| if $GSL diff --git a/test/lsi/lsi_test.rb b/test/lsi/lsi_test.rb index 563be06..6e3760a 100644 --- a/test/lsi/lsi_test.rb +++ b/test/lsi/lsi_test.rb @@ -181,6 +181,26 @@ def test_keyword_search assert_equal [:dog, :text, :deal], lsi.highest_ranked_stems(@str1) end + def test_invalid_searching_when_using_gsl + return unless $GSL + lsi = ClassifierReborn::LSI.new + lsi.add_item @str1, 'Dog' + lsi.add_item @str2, 'Dog' + lsi.add_item @str3, 'Cat' + lsi.add_item @str4, 'Cat' + lsi.add_item @str5, 'Bird' + assert_raises RuntimeError do + lsi.search('penguin') + end + end + + def test_raise_error_when_adding_bad_document + lsi = ClassifierReborn::LSI.new + assert_raises RuntimeError do + lsi.add_item("i can") + end + end + def test_summary assert_equal 'This text involves dogs too [...] This text also involves cats', Summarizer.summary([@str1, @str2, @str3, @str4, @str5].join, 2) end