-
Notifications
You must be signed in to change notification settings - Fork 51
/
convertvec.c
107 lines (88 loc) · 2.22 KB
/
convertvec.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
// Code to convert word2vec vectors between text and binary format
// Created by Marek Rei
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <malloc.h>
#include <stdlib.h>
const long long max_w = 2000;

// Returns nonzero if c is one of the whitespace characters that fscanf's "%s"
// treats as a token separator in the "C" locale.
static int txt2bin_is_space(int c){
  return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r';
}

// Read one whitespace-delimited token from fi into buf (capacity cap bytes).
// Leading whitespace is skipped; the delimiter that ends the token is left in
// the stream. buf is NUL-terminated on success.
// Returns 1 on success, 0 if the input ended before a token started, and -1 if
// the token does not fit into buf.
static int txt2bin_read_token(FILE * fi, char * buf, size_t cap){
  int c = fgetc(fi);
  while (c != EOF && txt2bin_is_space(c))
    c = fgetc(fi);
  if (c == EOF)
    return 0;

  size_t len = 0;
  while (c != EOF && !txt2bin_is_space(c)) {
    if (len + 1 >= cap)
      return -1;
    buf[len++] = (char)c;
    c = fgetc(fi);
  }
  buf[len] = '\0';
  if (c != EOF)
    ungetc(c, fi);
  return 1;
}

// Convert from text format to binary
//
// Input:  a header line "<words> <size>", then one row per word of the form
//         "<word> <v1> <v2> ... <vsize>".
// Output: the same header line, then for every row the word, a single space,
//         `size` raw floats (native byte order) and a newline.
//
// Problems are reported on stderr and end the conversion; in that case the
// output file may be incomplete. If the input cannot be opened, the output
// file is not touched. An input that ends cleanly between rows before `words`
// rows were read only triggers a warning.
// Words are copied through unchanged and may be of any length; a numeric token
// is limited to max_w - 1 characters.
void txt2bin(char * input_path, char * output_path){
  FILE * fi = fopen(input_path, "rb");
  if (fi == NULL) {
    fprintf(stderr, "Cannot open input file: %s\n", input_path);
    return;
  }
  FILE * fo = fopen(output_path, "wb");
  if (fo == NULL) {
    fprintf(stderr, "Cannot open output file: %s\n", output_path);
    fclose(fi);
    return;
  }

  // Declared before the first goto: jumping into the scope of a
  // variable-length array is not allowed.
  char token[max_w];
  long long words = 0, size = 0;
  long long b = 0;

  if (fscanf(fi, "%lld", &words) != 1 || fscanf(fi, "%lld", &size) != 1
      || words < 0 || size < 0) {
    fprintf(stderr, "Invalid header in input file: %s\n", input_path);
    goto done;
  }
  fscanf(fi, "%*[ ]");
  fscanf(fi, "%*[\n]");
  fprintf(fo, "%lld %lld\n", words, size);

  for (b = 0; b < words; b++) {
    // End of input exactly at a row boundary: stop without an error.
    int c = fgetc(fi);
    if (c == EOF)
      break;

    // This kind of whitespace handling is a bit more explicit than usual.
    // It allows us to correctly handle special characters that would otherwise be skipped.
    // The word is everything up to the next space and is copied byte by byte,
    // so its length is not limited by any buffer.
    while (c != EOF && c != ' ') {
      fputc(c, fo);
      c = fgetc(fi);
    }
    if (c == EOF) {
      fprintf(stderr, "Unexpected end of file in word %lld of %s\n", b + 1, input_path);
      goto done;
    }
    fputc(' ', fo);

    for (long long a = 0; a < size; a++) {
      int status = txt2bin_read_token(fi, token, sizeof token);
      if (status == 0) {
        fprintf(stderr, "Unexpected end of file in vector %lld of %s\n", b + 1, input_path);
        goto done;
      }
      if (status < 0) {
        fprintf(stderr, "Numeric token too long in vector %lld of %s\n", b + 1, input_path);
        goto done;
      }

      // strtod yields the same value as atof but lets us detect a token
      // that does not start with a number.
      char * end = NULL;
      float value = (float)strtod(token, &end);
      if (end == token) {
        fprintf(stderr, "Invalid number \"%s\" in vector %lld of %s\n", token, b + 1, input_path);
        goto done;
      }
      if (fwrite(&value, sizeof value, 1, fo) != 1) {
        fprintf(stderr, "Write error on output file: %s\n", output_path);
        goto done;
      }
      fscanf(fi, "%*[ ]");
    }
    fscanf(fi, "%*[\n]");
    fputc('\n', fo);
  }

  if (b < words)
    fprintf(stderr, "Warning: header of %s declares %lld words but only %lld were found\n",
            input_path, words, b);

done:
  fclose(fi);
  if (ferror(fo))
    fprintf(stderr, "Write error on output file: %s\n", output_path);
  if (fclose(fo) != 0)
    fprintf(stderr, "Error closing output file: %s\n", output_path);
}
// Convert from binary to text format
//
// Input:  a header line "<words> <size>", then for every word the word
//         itself, a single space, `size` raw floats (native byte order) and
//         optional newline characters.
// Output: the same header line, then one row per word of the form
//         "<word> <v1> <v2> ... <vsize> " with each value printed via "%lf".
//
// Problems are reported on stderr and end the conversion; in that case the
// output file may be incomplete. If the input cannot be opened, the output
// file is not touched. An input that ends cleanly between rows before `words`
// rows were read only triggers a warning.
// Words are copied through byte by byte, so their length is not limited.
void bin2txt(char * input_path, char * output_path){
  FILE * fi = fopen(input_path, "rb");
  if (fi == NULL) {
    fprintf(stderr, "Cannot open input file: %s\n", input_path);
    return;
  }
  FILE * fo = fopen(output_path, "wb");
  if (fo == NULL) {
    fprintf(stderr, "Cannot open output file: %s\n", output_path);
    fclose(fi);
    return;
  }

  long long words = 0, size = 0;
  long long b = 0;

  if (fscanf(fi, "%lld", &words) != 1 || fscanf(fi, "%lld", &size) != 1
      || words < 0 || size < 0) {
    fprintf(stderr, "Invalid header in input file: %s\n", input_path);
    goto done;
  }
  fscanf(fi, "%*[ ]");
  fscanf(fi, "%*[\n]");
  fprintf(fo, "%lld %lld\n", words, size);

  for (b = 0; b < words; b++) {
    // End of input exactly at a row boundary: stop without an error.
    int c = fgetc(fi);
    if (c == EOF)
      break;

    // The word is everything up to the next space.
    while (c != EOF && c != ' ') {
      fputc(c, fo);
      c = fgetc(fi);
    }
    if (c == EOF) {
      fprintf(stderr, "Unexpected end of file in word %lld of %s\n", b + 1, input_path);
      goto done;
    }
    fputc(' ', fo);

    for (long long a = 0; a < size; a++) {
      float value;
      // A short read means the vector is truncated; never print a stale value.
      if (fread(&value, sizeof value, 1, fi) != 1) {
        fprintf(stderr, "Unexpected end of file in vector %lld of %s\n", b + 1, input_path);
        goto done;
      }
      fprintf(fo, "%lf ", value);
    }
    fscanf(fi, "%*[\n]");
    fputc('\n', fo);
  }

  if (b < words)
    fprintf(stderr, "Warning: header of %s declares %lld words but only %lld were found\n",
            input_path, words, b);

done:
  fclose(fi);
  if (ferror(fo))
    fprintf(stderr, "Write error on output file: %s\n", output_path);
  if (fclose(fo) != 0)
    fprintf(stderr, "Error closing output file: %s\n", output_path);
}
// Command-line entry point.
//
// Usage: convertvec method input_path output_path
// where method is "bin2txt" or "txt2bin".
//
// Returns EXIT_FAILURE (with a message on stderr) when too few arguments are
// given or the method is unknown, so that calling scripts can detect misuse;
// returns EXIT_SUCCESS after dispatching to a converter.
int main(int argc, char **argv) {
  if (argc < 4) {
    fprintf(stderr, "USAGE: convertvec method input_path output_path\n");
    fprintf(stderr, "Method is either bin2txt or txt2bin\n");
    return EXIT_FAILURE;
  }

  if (strcmp(argv[1], "bin2txt") == 0) {
    bin2txt(argv[2], argv[3]);
  } else if (strcmp(argv[1], "txt2bin") == 0) {
    txt2bin(argv[2], argv[3]);
  } else {
    fprintf(stderr, "Unknown method: %s\n", argv[1]);
    return EXIT_FAILURE;
  }
  return EXIT_SUCCESS;
}