-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathcorpus.h
62 lines (56 loc) · 1.13 KB
/
corpus.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
// Classes for reading documents in lda-c format: Document holds a single
// document and Corpus holds the collection of documents.
//
#ifndef CORPUS_H
#define CORPUS_H
#include <stdio.h>
#include <vector>
using namespace std;
// One document stored as two parallel int arrays, words_ and counts_
// (presumably word ids and their per-document counts, as in the lda-c format
// named in the file header).
//
// The object owns words_ and counts_ and releases both with delete[] in its
// destructor. Copy construction and copy assignment make deep copies, so two
// Document objects never share -- and never double-free -- the same arrays.
// Copying assumes length_ is the number of elements allocated in each non-NULL
// array; code that assigns the public members directly must keep that true.
class Document {
public:
  /* for document itself */
  int id_;       // document identifier; constructors set it to -1
  int* words_;   // owned array of length_ ints, or NULL if nothing is allocated
  int* counts_;  // owned array of length_ ints, parallel to words_, or NULL
  int length_;   // number of entries in words_ / counts_
  int total_;    // constructors set it to 0; presumably the sum of counts_,
                 // maintained by the caller -- this class never computes it

public:
  // Creates an empty document that owns no arrays.
  Document() : id_(-1), words_(NULL), counts_(NULL), length_(0), total_(0) {}

  // Creates a document with room for `len` entries. Both arrays are
  // zero-filled so that reading an entry before it is assigned is well defined.
  // A negative `len` makes the array allocation fail (it throws).
  Document(int len)
      : id_(-1), words_(NULL), counts_(NULL), length_(len), total_(0) {
    words_ = new int[len]();
    try {
      counts_ = new int[len]();
    } catch (...) {
      // The destructor does not run for a half-built object, so release the
      // first array here before propagating the failure.
      delete [] words_;
      throw;
    }
  }

  // Deep copy: the new document gets its own arrays with the same contents.
  Document(const Document& other)
      : id_(other.id_), words_(NULL), counts_(NULL),
        length_(other.length_), total_(other.total_) {
    words_ = clone_array(other.words_, other.length_);
    try {
      counts_ = clone_array(other.counts_, other.length_);
    } catch (...) {
      delete [] words_;
      throw;
    }
  }

  // Deep-copy assignment. The replacement arrays are allocated before the old
  // ones are released, so *this is left unchanged if an allocation fails.
  Document& operator=(const Document& other) {
    if (this != &other) {
      int* new_words = clone_array(other.words_, other.length_);
      int* new_counts = NULL;
      try {
        new_counts = clone_array(other.counts_, other.length_);
      } catch (...) {
        delete [] new_words;
        throw;
      }
      delete [] words_;
      delete [] counts_;
      words_ = new_words;
      counts_ = new_counts;
      length_ = other.length_;
      total_ = other.total_;
      id_ = other.id_;
    }
    return *this;
  }

  // Releases both arrays. delete[] on a NULL pointer is a no-op, so each array
  // is freed independently of whether the other one was ever allocated.
  ~Document() {
    delete [] words_;
    delete [] counts_;
  }

private:
  // Returns a newly allocated copy of the first `n` ints of `src`, or NULL
  // when `src` is NULL. The caller takes ownership of the returned array.
  static int* clone_array(const int* src, int n) {
    if (src == NULL) {
      return NULL;
    }
    int* dst = new int[n];
    for (int i = 0; i < n; ++i) {
      dst[i] = src[i];
    }
    return dst;
  }
};
class Corpus {
public:
Corpus();
~Corpus();
void read_data(const char* data_filename, int OFFSET=0);
int read_data(FILE* fileptr, int buffer_size=10000, int OFFSET=0);
int remove_and_fetch(FILE* fileptr, int size, int OFFSET=0);
int max_corpus_length() const;
void free_corpus();
public:
int num_docs_;
int size_vocab_;
int num_total_words_;
vector<Document*> docs_;
};
#endif // CORPUS_H