-
Notifications
You must be signed in to change notification settings - Fork 1
/
vocab.c
207 lines (182 loc) · 7.19 KB
/
vocab.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
// Simple tool to extract unigram counts
// Jeffrey Pennington ([email protected])
// From http://nlp.stanford.edu/projects/glove/ covered under APACHE LICENSE, a copy of which follows
//
// This software is licensed under the Apache 2 license, quoted below.
//
// Copyright 2014 Jeffrey Pennington <[email protected]>
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAX_STRING_LENGTH 1000
#define TSIZE 1048576
#define SEED 1159241
#define HASHFN bitwisehash
typedef struct vocabulary {
char *word;
long long count;
} VOCAB;
typedef struct hashrec {
char *word;
long long count;
struct hashrec *next;
} HASHREC;
int verbose = 2; // 0, 1, or 2
long long min_count = 1; // min occurrences for inclusion in vocab
long long max_vocab = 0; // max_vocab = 0 for no limit
/* Efficient string comparison */
int scmp( char *s1, char *s2 ) {
while(*s1 != '\0' && *s1 == *s2) {s1++; s2++;}
return(*s1 - *s2);
}
/* Vocab frequency comparison; break ties alphabetically */
int CompareVocabTie(const void *a, const void *b) {
long long c;
if( (c = ((VOCAB *) b)->count - ((VOCAB *) a)->count) != 0) return ( c > 0 ? 1 : -1 );
else return (scmp(((VOCAB *) a)->word,((VOCAB *) b)->word));
}
/* Vocab frequency comparison; no tie-breaker */
int CompareVocab(const void *a, const void *b) {
long long c;
if( (c = ((VOCAB *) b)->count - ((VOCAB *) a)->count) != 0) return ( c > 0 ? 1 : -1 );
else return 0;
}
/* Move-to-front hashing and hash function from Hugh Williams, http://www.seg.rmit.edu.au/code/zwh-ipl/ */
/* Simple bitwise hash function */
unsigned int bitwisehash(char *word, int tsize, unsigned int seed) {
char c;
unsigned int h;
h = seed;
for(; (c =* word) != '\0'; word++) h ^= ((h << 5) + c + (h >> 2));
return((unsigned int)((h&0x7fffffff) % tsize));
}
/* Create hash table, initialise pointers to NULL */
HASHREC ** inithashtable() {
int i;
HASHREC **ht;
ht = (HASHREC **) malloc( sizeof(HASHREC *) * TSIZE );
for(i = 0; i < TSIZE; i++) ht[i] = (HASHREC *) NULL;
return(ht);
}
/* Search hash table for given string, insert if not found */
void hashinsert(HASHREC **ht, char *w) {
HASHREC *htmp, *hprv;
unsigned int hval = HASHFN(w, TSIZE, SEED);
for(hprv = NULL, htmp = ht[hval]; htmp != NULL && scmp(htmp->word, w) != 0; hprv = htmp, htmp = htmp->next);
if(htmp == NULL) {
htmp = (HASHREC *) malloc( sizeof(HASHREC) );
htmp->word = (char *) malloc( strlen(w) + 1 );
strcpy(htmp->word, w);
htmp->count = 1;
htmp->next = NULL;
if( hprv==NULL )
ht[hval] = htmp;
else
hprv->next = htmp;
}
else {
/* new records are not moved to front */
htmp->count++;
if(hprv != NULL) {
/* move to front on access */
hprv->next = htmp->next;
htmp->next = ht[hval];
ht[hval] = htmp;
}
}
return;
}
int get_counts() {
long long i = 0, j = 0, vocab_size = 12500;
char format[20];
char str[MAX_STRING_LENGTH + 1];
HASHREC **vocab_hash = inithashtable();
HASHREC *htmp;
VOCAB *vocab;
FILE *fid = stdin;
fprintf(stderr, "BUILDING VOCABULARY\n");
if(verbose > 1) fprintf(stderr, "Processed %lld tokens.", i);
sprintf(format,"%%%ds",MAX_STRING_LENGTH);
while(fscanf(fid, format, str) != EOF) { // Insert all tokens into hashtable
hashinsert(vocab_hash, str);
if(((++i)%100000) == 0) if(verbose > 1) fprintf(stderr,"\033[11G%lld tokens.", i);
}
if(verbose > 1) fprintf(stderr, "\033[0GProcessed %lld tokens.\n", i);
vocab = malloc(sizeof(VOCAB) * vocab_size);
for(i = 0; i < TSIZE; i++) { // Migrate vocab to array
htmp = vocab_hash[i];
while (htmp != NULL) {
vocab[j].word = htmp->word;
vocab[j].count = htmp->count;
j++;
if(j>=vocab_size) {
vocab_size += 2500;
vocab = (VOCAB *)realloc(vocab, sizeof(VOCAB) * vocab_size);
}
htmp = htmp->next;
}
}
if(verbose > 1) fprintf(stderr, "Counted %lld unique words.\n", j);
if(max_vocab > 0 && max_vocab < j)
// If the vocabulary exceeds limit, first sort full vocab by frequency without alphabetical tie-breaks.
// This results in pseudo-random ordering for words with same frequency, so that when truncated, the words span whole alphabet
qsort(vocab, j, sizeof(VOCAB), CompareVocab);
else max_vocab = j;
qsort(vocab, max_vocab, sizeof(VOCAB), CompareVocabTie); //After (possibly) truncating, sort (possibly again), breaking ties alphabetically
for(i = 0; i < max_vocab; i++) {
if(vocab[i].count < min_count) { // If a minimum frequency cutoff exists, truncate vocabulary
if(verbose > 0) fprintf(stderr, "Truncating vocabulary at min count %lld.\n",min_count);
break;
}
printf("%s %lld\n",vocab[i].word,vocab[i].count);
}
if(i == max_vocab && max_vocab < j) if(verbose > 0) fprintf(stderr, "Truncating vocabulary at size %lld.\n", max_vocab);
fprintf(stderr, "Using vocabulary of size %lld.\n\n", i);
return 0;
}
int find_arg(char *str, int argc, char **argv) {
int i;
for (i = 1; i < argc; i++) {
if(!scmp(str, argv[i])) {
if (i == argc - 1) {
printf("No argument given for %s\n", str);
exit(1);
}
return i;
}
}
return -1;
}
int main(int argc, char **argv) {
int i;
if (argc == 1) {
printf("Simple tool to extract unigram counts\n");
printf("Author: Jeffrey Pennington ([email protected])\n\n");
printf("Usage options:\n");
printf("\t-verbose <int>\n");
printf("\t\tSet verbosity: 0, 1, or 2 (default)\n");
printf("\t-max-vocab <int>\n");
printf("\t\tUpper bound on vocabulary size, i.e. keep the <int> most frequent words. The minimum frequency words are randomly sampled so as to obtain an even distribution over the alphabet.\n");
printf("\t-min-count <int>\n");
printf("\t\tLower limit such that words which occur fewer than <int> times are discarded.\n");
printf("\nExample usage:\n");
printf("./vocab_count -verbose 2 -max-vocab 100000 -min-count 10 < corpus.txt > vocab.txt\n");
return 0;
}
if ((i = find_arg((char *)"-verbose", argc, argv)) > 0) verbose = atoi(argv[i + 1]);
if ((i = find_arg((char *)"-max-vocab", argc, argv)) > 0) max_vocab = atoll(argv[i + 1]);
if ((i = find_arg((char *)"-min-count", argc, argv)) > 0) min_count = atoll(argv[i + 1]);
return get_counts();
}