-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataprocessing.py
88 lines (78 loc) · 3.34 KB
/
dataprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
from dataset_property import DATASET_SIZE, LEFT_SIZE, RIGHT_SIZE
# Total number of nodes in the extracted subgraph (sub indices run 0..NUM_NODES_SUB-1).
NUM_NODES_SUB = 90
# Alias of LEFT_SIZE; not referenced by any code visible in this file.
COUNT_ONE = LEFT_SIZE
# Share of the subgraph's nodes taken from the left id range (ids below LEFT_SIZE);
# the remaining 1 - RATIO share is taken from the ids starting at LEFT_SIZE.
RATIO = 2/3
import pdb
def node_attr_classifier_left(d):
    """Map a left-side node degree to a discrete attribute label.

    For non-negative degrees the mapping is:
      * d below 500      -> buckets of width 5, labels 0..99
      * d from 500 to 999 -> buckets of width 10, labels 100..149
      * d of 1000 or more -> the single label 150

    Args:
        d: Degree (edge count) of the node.

    Returns:
        The integer bucket label for ``d``.
    """
    if d < 500:
        # Fine-grained buckets for low degrees.
        return int(d / 5)
    if d < 1000:
        # Coarser buckets, numbered after the 100 fine-grained ones.
        above_500 = d - 500
        return 100 + int(above_500 / 10)
    # Every remaining degree shares one catch-all label.
    return 150
def node_attr_classifier_right(d):
    """Map a right-side node degree to a discrete attribute label.

    Applies the same degree bucketing as ``node_attr_classifier_left`` and
    shifts the result by 151, so for non-negative degrees the right-side
    labels start where the left-side labels (0..150) end.

    Args:
        d: Degree (edge count) of the node.

    Returns:
        The left-side bucket label for ``d`` plus 151.
    """
    left_label = node_attr_classifier_left(d)
    return left_label + 151
def is_in_subgraph(number):
    """Tell whether a whole-graph node id belongs to the extracted subgraph.

    The subgraph is made of two id ranges:
      * ``[0, NUM_NODES_SUB * RATIO)``
      * ``[LEFT_SIZE, LEFT_SIZE + NUM_NODES_SUB * (1 - RATIO))``

    Negative ids are never members.

    Args:
        number: Node id in the whole graph.

    Returns:
        True if ``number`` lies in either range, False otherwise.
    """
    left_cutoff = NUM_NODES_SUB * RATIO
    right_cutoff = LEFT_SIZE + NUM_NODES_SUB * (1 - RATIO)
    in_left_part = 0 <= number < left_cutoff
    in_right_part = LEFT_SIZE <= number < right_cutoff
    return in_left_part or in_right_part
def match_from_sub_to_whole(sub_index):
    """Translate a subgraph node index into its id in the whole graph.

    Indices below ``NUM_NODES_SUB * RATIO`` keep their value. The remaining
    indices (up to ``NUM_NODES_SUB``) are shifted so that they start at
    ``LEFT_SIZE``.

    Args:
        sub_index: Node index inside the subgraph.

    Returns:
        The corresponding node id in the whole graph.

    Raises:
        ValueError: If ``sub_index`` is outside ``[0, NUM_NODES_SUB)``.
    """
    # Without this check an index >= NUM_NODES_SUB would silently yield None
    # and a negative index would be passed through as if it were valid.
    if not 0 <= sub_index < NUM_NODES_SUB:
        raise ValueError("This index is not in the subgraph")

    left_count = NUM_NODES_SUB * RATIO
    if sub_index < left_count:
        return sub_index
    # left_count is a float, so convert back to an integer node id.
    return int(sub_index - left_count + LEFT_SIZE)
def match_from_whole_to_sub(whole_index):
    """Translate a whole-graph node id into its index inside the subgraph.

    Ids below ``NUM_NODES_SUB * RATIO`` keep their value. Member ids from
    the range starting at ``LEFT_SIZE`` are shifted down so that they follow
    directly after the first range.

    Args:
        whole_index: Node id in the whole graph.

    Returns:
        The node's index in the subgraph.

    Raises:
        ValueError: If ``whole_index`` is not a member of the subgraph.
    """
    if not is_in_subgraph(whole_index):
        raise ValueError("This node is not in the subgraph")

    left_count = NUM_NODES_SUB * RATIO
    if whole_index < left_count:
        return whole_index
    # Membership was verified above, so any id reaching this point lies in
    # the range that starts at LEFT_SIZE.
    return int(whole_index - LEFT_SIZE + left_count)
if __name__ == '__main__':
    # Every file is opened in a ``with`` block so it is flushed and closed
    # even if a malformed line raises part-way through, and input files are
    # streamed line by line instead of being loaded whole with readlines().

    # Step 1: convert the raw edge list to "node1,node2,attr" lines and count
    # node degrees. Input ids are treated as 1-based: each id is decremented,
    # and second-column ids are additionally offset by LEFT_SIZE so both
    # sides share one id space. The third column is decremented by one too.
    left_degree = {node: 0 for node in range(LEFT_SIZE)}
    right_degree = {node: 0 for node in range(RIGHT_SIZE)}
    with open("./movielens-1m-original/out.movielens-1m", 'r') as raw_graph, \
            open("./movielens-1m/edge.txt", 'w') as edge_file:
        for line in raw_graph:
            items = line.strip().split(' ')
            node1 = int(items[0])
            node2 = int(items[1])
            attr = int(items[2])
            left_degree[node1 - 1] += 1
            right_degree[node2 - 1] += 1
            edge_file.write("{},{},{}\n".format(
                node1 - 1, node2 + LEFT_SIZE - 1, attr - 1))

    # Step 2: write one "node,label" line per node, where the label is the
    # degree bucket computed by the side-specific classifier.
    with open("./movielens-1m/node_attr.txt", 'w') as node_attr_file:
        for i in range(LEFT_SIZE):
            node_attr_file.write("{},{}\n".format(
                i, node_attr_classifier_left(left_degree[i])))
        for j in range(RIGHT_SIZE):
            node_attr_file.write("{},{}\n".format(
                j + LEFT_SIZE, node_attr_classifier_right(right_degree[j])))

    # Step 3: keep only the nodes that belong to the subgraph, renumbered
    # with their subgraph indices.
    with open("./movielens-1m/node_attr.txt", 'r') as whole_node_attr_file, \
            open("./movielens-1m/node_attr_sub.txt", 'w') as sub_node_attr_file:
        for line in whole_node_attr_file:
            items = line.strip().split(',')
            node = int(items[0])
            attr = int(items[1])
            if is_in_subgraph(node):
                sub_node_attr_file.write("{},{}\n".format(
                    match_from_whole_to_sub(node), attr))

    # Step 4: keep only the edges whose two endpoints are both in the
    # subgraph, again renumbered with subgraph indices.
    with open("./movielens-1m/edge.txt", 'r') as whole_edge_file, \
            open("./movielens-1m/edge_sub.txt", 'w') as sub_edge_file:
        for line in whole_edge_file:
            items = line.strip().split(',')
            node1 = int(items[0])
            node2 = int(items[1])
            attr = int(items[2])
            if is_in_subgraph(node1) and is_in_subgraph(node2):
                sub_edge_file.write("{},{},{}\n".format(
                    match_from_whole_to_sub(node1),
                    match_from_whole_to_sub(node2),
                    attr))

    # Step 5: record the "whole_id,sub_index" correspondence for every
    # subgraph node.
    with open("./movielens-1m/ground_truth.txt", 'w') as ground_truth_file:
        for i in range(NUM_NODES_SUB):
            ground_truth_file.write("{},{}\n".format(
                match_from_sub_to_whole(i), i))