-
Notifications
You must be signed in to change notification settings - Fork 13
/
Makefile
91 lines (64 loc) · 3.3 KB
/
Makefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# Delete a target when its recipe exits non-zero. Nearly every recipe below
# redirects straight into $@, so without this a failed command would leave a
# truncated file that looks up to date on the next run. (For a pipeline, make
# only sees the exit status of the last command.)
.DELETE_ON_ERROR:

# Interpreter used to run the helper scripts under scripts/.
PYTHON := python3

# Data directories, relative to the directory make is invoked from.
SEARCH_DIR := data/search
WORDLIST_DIR := data/wordlists
CORPUS_DIR := data/corpora
DB_DIR := data/db
# Default goal: build every word list. Declared phony so that a stray file
# named "all" can never make this goal look up to date.
.PHONY: all
all: wordlists
# Remove the generated top-level word lists and the search index directory.
# $(RM) is "rm -f", so the recipe also succeeds when nothing has been built
# yet or the search directory does not exist.
.PHONY: clean
clean:
	$(RM) $(WORDLIST_DIR)/*.txt
	$(RM) -r $(SEARCH_DIR)
# Individual word lists; prerequisites of both `wordlists` and combined.txt.
# Each file is listed exactly once, and the last entry carries no trailing
# backslash so the assignment cannot run on into the lines that follow it.
WORDLISTS := \
    $(WORDLIST_DIR)/enable.txt \
    $(WORDLIST_DIR)/twl06.txt \
    $(WORDLIST_DIR)/google-books-phrases.freq.txt \
    $(WORDLIST_DIR)/google-books-1grams.txt \
    $(WORDLIST_DIR)/google-books-1grams.freq.txt \
    $(WORDLIST_DIR)/wikipedia-en-links.txt \
    $(WORDLIST_DIR)/wordnet.txt \
    $(WORDLIST_DIR)/wordfreq.txt \
    $(WORDLIST_DIR)/csw2019.txt \
    $(WORDLIST_DIR)/npl-allwords.txt \
    $(WORDLIST_DIR)/wordfreq.freq.txt

# No longer used:
#   $(WORDLIST_DIR)/wikipedia-en-titles.txt
# Alias for the search database file.
# NOTE(review): the rule that builds $(DB_DIR)/search.db is commented out near
# the end of this file, so this goal only succeeds when that file already
# exists.
.PHONY: search
search: $(DB_DIR)/search.db
# Build every individual word list plus the two combined lists. Declared phony
# because "wordlists" names a goal, not a file.
.PHONY: wordlists
wordlists: $(WORDLISTS) $(WORDLIST_DIR)/combined.txt $(WORDLIST_DIR)/combined.freq.txt
# Phrase entries from the old Google Books list: keep only lines containing a
# space, then order them numerically, highest first, on the key that starts at
# the second comma-separated field.
$(WORDLIST_DIR)/google-books-phrases.freq.txt: $(WORDLIST_DIR)/raw/google-books-old.txt
	LC_ALL=C grep ' ' $< | sort -t "," -k 2 -n -r > $@
# The frequency-ordered phrase list re-sorted by whole line in the C locale
# (plain byte order).
$(WORDLIST_DIR)/google-books-phrases.txt: $(WORDLIST_DIR)/google-books-phrases.freq.txt
	LC_ALL=C sort < $< > $@
# Filter the raw 2019 1-gram file down to lines that begin with a run of ASCII
# letters/apostrophes followed by a space and four digits, upper-case the
# letters, turn every space into a comma, and sort the result.
# `grep -E` replaces the obsolescent `egrep` wrapper (same regex dialect).
# Note: LC_ALL=C applies to grep only; the sort runs in the caller's locale.
$(WORDLIST_DIR)/google-books-1grams.txt: $(WORDLIST_DIR)/raw/google-books-2019-1grams.txt
	LC_ALL=C grep -E -h "^[A-Za-z']+ [0-9]{4}" $^ | tr "a-z " "A-Z," | sort -t "," > $@
# The 1-gram list reordered numerically, highest first, on the key that starts
# at the second comma-separated field.
$(WORDLIST_DIR)/google-books-1grams.freq.txt: $(WORDLIST_DIR)/google-books-1grams.txt
	sort -t "," -k 2 -n -r $< > $@
# Combined list ordered numerically, highest first, on the key that starts at
# the second comma-separated field, with every line ending in ",1" removed.
# ($$ is make's escape for a literal $ handed to the shell.)
$(WORDLIST_DIR)/combined.freq.txt: $(WORDLIST_DIR)/combined.txt
	sort -t "," -k 2 -n -r $< | grep -v ",1$$" > $@
# The wordfreq frequency list re-sorted by whole line in the C locale
# (plain byte order).
$(WORDLIST_DIR)/wordfreq.txt: $(WORDLIST_DIR)/wordfreq.freq.txt
	LC_ALL=C sort < $< > $@
# Generated by the build_wordfreq.py script: whatever the script prints on
# stdout becomes the target. The script itself is the only prerequisite, so
# editing it triggers a rebuild.
$(WORDLIST_DIR)/wordfreq.freq.txt: scripts/build_wordfreq.py
	$(PYTHON) $< > $@
# ENABLE list: upper-case the raw file and pipe it through the freq1.sh helper.
# $(word 2,$^) is the second prerequisite, i.e. shell/freq1.sh.
$(WORDLIST_DIR)/enable.txt: $(WORDLIST_DIR)/raw/enable.txt shell/freq1.sh
	tr a-z A-Z < $< | $(word 2,$^) > $@
# CSW2019 list: feed the raw file through the freq1.sh helper unchanged (no
# case conversion here, unlike the enable/twl06 rules).
# $(word 2,$^) is the second prerequisite, i.e. shell/freq1.sh.
$(WORDLIST_DIR)/csw2019.txt: $(WORDLIST_DIR)/raw/csw2019.txt shell/freq1.sh
	$(word 2,$^) < $< > $@
# TWL06 list: upper-case the raw file and pipe it through the freq1.sh helper.
# $(word 2,$^) is the second prerequisite, i.e. shell/freq1.sh.
$(WORDLIST_DIR)/twl06.txt: $(WORDLIST_DIR)/raw/twl06.txt shell/freq1.sh
	tr a-z A-Z < $< | $(word 2,$^) > $@
# Wikipedia titles (not part of WORDLISTS any more): drop every line that
# contains three or more spaces, then pipe the rest through freq1.sh.
# `grep -E` replaces the obsolescent `egrep` wrapper, and freq1.sh is listed as
# a prerequisite (as in the enable/csw2019/twl06 rules) so that editing the
# helper triggers a rebuild. The recipe reads only $<, the raw title file.
$(WORDLIST_DIR)/wikipedia-en-titles.txt: $(WORDLIST_DIR)/raw/wikipedia-en-titles.txt shell/freq1.sh
	grep -E -hv " .* .* " $< | shell/freq1.sh > $@
# Count how often each value of the second tab-separated field occurs in the
# compressed link dump: decompress, cut field 2, sort, and let `uniq -c`
# prefix each distinct line with its count.
# LC_ALL=C (rather than LANG=C) forces byte-order sorting even when the caller
# has LC_ALL or LC_COLLATE set, matching the other sort recipes in this file.
$(WORDLIST_DIR)/wikipedia-en-links-orig.txt: $(WORDLIST_DIR)/raw/wp-links.txt.gz
	zcat $< | cut -f 2 | LC_ALL=C sort | uniq -c > $@
# Order the counted link list numerically, highest first, on its leading
# field, then pass it through transform_wp_freq.py, whose stdout becomes the
# target. The script is listed as a prerequisite (like the other script-driven
# rules in this file) so that editing it triggers a rebuild; the recipe reads
# only $<, the counted list.
$(WORDLIST_DIR)/wikipedia-en-links.txt: $(WORDLIST_DIR)/wikipedia-en-links-orig.txt scripts/transform_wp_freq.py
	sort -nrk 1 $< | $(PYTHON) scripts/transform_wp_freq.py > $@
# WordNet list: keep lines made up solely of ASCII letters, digits, apostrophe,
# slash, space and hyphen; upper-case them; turn hyphens into spaces; then pipe
# through freq1.sh. ($$ is make's escape for the regex end-of-line anchor.)
# `grep -E` replaces the obsolescent `egrep` wrapper, and freq1.sh is listed as
# a prerequisite (as in the enable/csw2019/twl06 rules) so that editing the
# helper triggers a rebuild. The recipe reads only $<, the raw file.
$(WORDLIST_DIR)/wordnet.txt: $(WORDLIST_DIR)/raw/wordnet.txt shell/freq1.sh
	LC_ALL=C grep -E -h "^[A-Za-z0-9'/ -]+$$" $< | tr a-z A-Z | tr '-' ' ' | shell/freq1.sh > $@
# NPL list: keep lines made up solely of ASCII letters, digits, apostrophe,
# space and hyphen; upper-case them; then pipe through freq1.sh.
# ($$ is make's escape for the regex end-of-line anchor.)
# `grep -E` replaces the obsolescent `egrep` wrapper, and freq1.sh is listed as
# a prerequisite (as in the enable/csw2019/twl06 rules) so that editing the
# helper triggers a rebuild. The recipe reads only $<, the raw file.
$(WORDLIST_DIR)/npl-allwords.txt: $(WORDLIST_DIR)/raw/npl_allwords2.txt shell/freq1.sh
	LC_ALL=C grep -E -h "^[A-Za-z0-9' -]+$$" $< | tr a-z A-Z | shell/freq1.sh > $@
# Combined list, rebuilt whenever any individual word list or the build script
# changes. The script is run with no arguments and no redirection.
# NOTE(review): presumably build_combined.py locates its inputs and writes this
# target by itself -- confirm against the script.
# $(lastword $^) is the final prerequisite, i.e. scripts/build_combined.py.
$(WORDLIST_DIR)/combined.txt: $(WORDLISTS) scripts/build_combined.py
	$(PYTHON) $(lastword $^)
# Wikipedia corpus: decompress the link dump and stream it through
# join_wp_links.py, whose stdout becomes the target.
# $(@D) is the target's directory; it is created first because the corpus
# directory may not exist yet. $(word 2,$^) is the script prerequisite.
$(CORPUS_DIR)/wikipedia.txt: $(WORDLIST_DIR)/raw/wp-links.txt.gz scripts/join_wp_links.py
	mkdir -p $(@D)
	zcat $< | $(PYTHON) $(word 2,$^) > $@
# Full corpus: concatenate all prerequisite files in the order listed and
# replace every double-quote character with a space.
# $(@D) is the target's directory; it is created first in case it is missing.
$(CORPUS_DIR)/all.txt: $(CORPUS_DIR)/wikipedia.txt $(CORPUS_DIR)/crossword_clues.txt $(CORPUS_DIR)/more_crossword_clues.txt
	mkdir -p $(@D)
	cat $^ | tr '"' ' ' > $@
# $(DB_DIR)/search.db: $(CORPUS_DIR)/all.txt
# rm -f $@ && sqlite3 $@ < scripts/load_clues.sql
# $(SEARCH_DIR)/_MAIN_1.toc: scripts/build_search_index.py $(CORPUS_DIR)/crossword_clues.txt
# $(PYTHON) scripts/build_search_index.py