-
Notifications
You must be signed in to change notification settings - Fork 0
/
Searcher.java
124 lines (108 loc) · 5.15 KB
/
Searcher.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.Vector;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import opennlp.tools.stemmer.PorterStemmer;
import org.apache.commons.io.FileUtils;
/**
 * ***************** DO NOT MODIFY THIS FILE **************************
 * Abstract base class for searchers. It provides default mechanisms to parse documents from a file and to
 * preprocess textual content, and declares the abstract {@link #search(String, int)} method that concrete
 * searchers implement.
 *
 * @author Dr. Suppawong Tuarob, (copyrighted 2018)
 */
public abstract class Searcher {

    /** Shared stemmer applied to every surviving token in {@link #tokenize(String)}. */
    public static PorterStemmer porterStemmer = new PorterStemmer();

    /**
     * Stop words that {@link #tokenize(String)} drops before stemming.
     * Note: {@code tokenize} replaces apostrophes with spaces before the lookup, so the contraction
     * entries (e.g. "aren't") can never match a token; they are kept so the list stays unchanged.
     */
    public static final Set<String> stopWords = Stream.of("a","about","above","after","again","against","all","am","an","and","any","are","aren't","as","at","be","because","been","before","being","below","between","both","but","by","can't","cannot","could","couldn't","did","didn't","do","does","doesn't","doing","don't","down","during","each","few","for","from","further","had","hadn't","has","hasn't","have","haven't","having","he","he'd","he'll","he's","her","here","here's","hers","herself","him","himself","his","how","how's","i","i'd","i'll","i'm","i've","if","in","into","is","isn't","it","it's","its","itself","let's","me","more","most","mustn't","my","myself","no","nor","not","of","off","on","once","only","or","other","ought","our","ours","ourselves","out","over","own","same","shan't","she","she'd","she'll","she's","should","shouldn't","so","some","such","than","that","that's","the","their","theirs","them","themselves","then","there","there's","these","they","they'd","they'll","they're","they've","this","those","through","to","too","under","until","up","very","was","wasn't","we","we'd","we'll","we're","we've","were","weren't","what","what's","when","when's","where","where's","which","while","who","who's","whom","why","why's","with","won't","would","wouldn't","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves").collect(Collectors.toSet());

    /** Matches every character that is not an ASCII letter or digit; compiled once instead of per call. */
    private static final Pattern NON_ALPHANUMERIC = Pattern.compile("[^a-zA-Z0-9]");

    /** Matches a run of whitespace; used to split the cleaned text into candidate tokens. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /**
     * Documents loaded by the constructor. This is {@code null} when the document file could not be read,
     * because {@link #parseDocumentFromFile(String)} returns {@code null} in that case.
     */
    protected List<Document> documents = null;

    /**
     * Default constructor. Loads the raw documents in the given file into Document objects in memory.
     *
     * @param docFilename path of the document file to load
     */
    public Searcher(String docFilename) {
        this.documents = Searcher.parseDocumentFromFile(docFilename);
    }

    /**
     * Reads a UTF-8 document file and converts each non-blank line into a Document.
     * Each line is trimmed and split on tab characters: the first field is parsed as the integer
     * document id and the second field is the raw text, which is tokenized with
     * {@link #tokenize(String)}. Any further tab-separated fields are ignored.
     *
     * @param filename path of the document file
     * @return the parsed documents in file order, or {@code null} if the file could not be read
     * @throws IllegalArgumentException if a non-blank line has no tab-separated text field
     * @throws NumberFormatException if the id field of a line is not a valid integer
     */
    public static List<Document> parseDocumentFromFile(String filename) {
        // Load the document file; an unreadable file is reported and signalled with null.
        List<String> lines;
        try {
            lines = FileUtils.readLines(new File(filename), "UTF-8");
        } catch (IOException e) {
            System.out.println("### Error reading file "+filename);
            e.printStackTrace();
            return null;
        }

        List<Document> documents = new Vector<>();
        int lineNumber = 0;
        for (String line : lines) {
            lineNumber++;
            line = line.trim();
            if (line.isEmpty()) {
                continue;
            }

            // Parse the necessary document information.
            String[] parts = line.split("\\t");
            if (parts.length < 2) {
                // Fail with context instead of a bare ArrayIndexOutOfBoundsException on parts[1].
                throw new IllegalArgumentException("Malformed document at line " + lineNumber + " of "
                        + filename + ": expected <id>\\t<text>");
            }
            Integer id = Integer.parseInt(parts[0]);
            String rawText = parts[1];
            List<String> tokens = tokenize(rawText);

            // Add a document entry to documents.
            documents.add(new Document(id, rawText, tokens));
        }
        System.out.println("@@@ Finished loading "+documents.size()+" documents from "+filename);
        return documents;
    }

    /**
     * Default static method for preprocessing and tokenizing raw text. You are required to use this method to
     * tokenize raw document and query text, to produce the same set of vocabulary.
     * The text is lower-cased, every character other than an ASCII letter or digit is replaced by a space,
     * the result is split on whitespace, tokens of length one or less and stop words are discarded, and the
     * remaining tokens are stemmed.
     *
     * @param rawText the raw text to tokenize
     * @return the stemmed tokens in their order of appearance
     */
    public static List<String> tokenize(String rawText) {
        // Lower-case with a fixed locale: under e.g. a Turkish default locale "I" would become a
        // dotless i, which the noise filter below would then strip, splitting the word.
        String text = rawText.toLowerCase(Locale.ROOT);

        // Remove noise.
        text = NON_ALPHANUMERIC.matcher(text).replaceAll(" ");

        // Tokenize.
        String[] tokenArray = WHITESPACE.split(text);

        // Clean individual characters, remove stop words, and stem.
        List<String> tokens = new Vector<>();
        for (String t : tokenArray) {
            if (t.length() <= 1 || stopWords.contains(t)) {
                continue;
            }
            String stemmed = porterStemmer.stem(t);
            tokens.add(stemmed);
        }
        return tokens;
    }

    /**
     * Prints the search results to standard output. Each result is prefixed with its 1-based rank in angle
     * brackets and followed directly by the result's string form.
     *
     * @param results the search results to display, in rank order
     */
    public static void displaySearchResults(List<SearchResult> results) {
        StringBuilder str = new StringBuilder();
        // TODO: display the search results in table form.
        int rank = 1;
        for (SearchResult result : results) {
            str.append('<').append(rank).append('>').append(result);
            rank++;
        }
        System.out.println(str);
    }

    /**
     * Abstract method for searching documents given a raw-text query, *queryString*.
     * Return a list of *k* SearchResult objects of the top documents that match the search query,
     * ranked by the SearchResult.score. If two documents have the same relevance score, the document whose
     * document id is smaller is ranked higher.
     * A SearchResult is basically an object containing a document and its relevance score.
     * If there are fewer than k documents in the corpus, return the ranked list of all the documents.
     * The ranking will be determined by different search techniques.
     *
     * @param queryString the raw-text query
     * @param k the maximum number of results to return
     * @return the ranked list of at most k search results
     */
    public abstract List<SearchResult> search(String queryString, int k);
}