-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataprocessing_temporal_wikilens.py
141 lines (122 loc) · 5.57 KB
/
dataprocessing_temporal_wikilens.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
DATASET_SIZE = 5437
LEFT_SIZE = 326
RIGHT_SIZE = 5111
DATASET_NAME = 'wikilens'
NUM_NODES_SUB = 150
RATIO = 1/10
COUNT_ONE = LEFT_SIZE
INITIAL_RATIO = 0.2
def node_attr_classifier_left(d):
if d < 200:
return int(d/5)
else:
return 40
def node_attr_classifier_right(d):
return 41 + node_attr_classifier_left(d)
def is_in_subgraph(number):
if (number < 0):
return False
if (number < NUM_NODES_SUB*RATIO):
return True
elif (number < LEFT_SIZE):
return False
elif (number < LEFT_SIZE + NUM_NODES_SUB*(1-RATIO)):
return True
else:
return False
def match_from_sub_to_whole(sub_index):
if (sub_index < NUM_NODES_SUB*RATIO):
return sub_index
elif (sub_index < NUM_NODES_SUB):
return int(sub_index - NUM_NODES_SUB*RATIO + LEFT_SIZE)
def match_from_whole_to_sub(whole_index):
if not is_in_subgraph(whole_index):
raise ValueError("This node is not in the subgraph")
if (whole_index < NUM_NODES_SUB*RATIO):
return whole_index
elif (whole_index < LEFT_SIZE + NUM_NODES_SUB*(1-RATIO)):
return int(whole_index - LEFT_SIZE + NUM_NODES_SUB*RATIO)
if __name__ == '__main__':
graph_file_open = open("./" + DATASET_NAME + "-original/out.wikilens-ratings", 'r')
node_attr_file = open("./" + DATASET_NAME + "-temporal/node_attr.txt", 'w')
edge_file = open("./" + DATASET_NAME + "-temporal/edge.txt", 'w')
node_degree_count_left = {node: 0 for node in range(LEFT_SIZE)}
node_degree_count_right = {node: 0 for node in range(RIGHT_SIZE)}
all_possible_timestamp_set = set()
for line in graph_file_open.readlines():
items = line.strip().split('\t')
node1 = int(items[0])
node2 = int(items[1])
attr = int(float(items[2]) * 2)
time = int(items[3])
all_possible_timestamp_set.add(int(time))
node_degree_count_left[node1 - 1] += 1
node_degree_count_right[node2 - 1] += 1
edge_file.write(str(node1 - 1) + ',' + str(node2 + LEFT_SIZE - 1) + ',' + str(attr) + ',' + str(time) + '\n')
for i in range(LEFT_SIZE):
node_attr_file.write(str(i) + ',' + str(node_attr_classifier_left(node_degree_count_left[i])) + '\n')
for j in range(RIGHT_SIZE):
node_attr_file.write(str(j + LEFT_SIZE) + ',' + str(node_attr_classifier_right(node_degree_count_right[j])) + '\n')
edge_file.close()
node_attr_file.close()
graph_file_open.close()
new_node_attr_file = open("./" + DATASET_NAME + "-temporal/node_attr.txt", 'r')
sub_node_attr_file = open("./" + DATASET_NAME + "-temporal/node_attr_sub.txt", 'w')
for line in new_node_attr_file.readlines():
items = line.strip().split(',')
node = int(items[0])
attr = int(items[1])
if is_in_subgraph(node):
sub_node_attr_file.write(str(match_from_whole_to_sub(node)) + ',' + str(attr) + '\n')
new_node_attr_file.close()
sub_node_attr_file.close()
all_possible_timestamp_list = []
for item in all_possible_timestamp_set:
all_possible_timestamp_list.append(item)
all_possible_timestamp_list.sort()
timestamp_to_line_counter = {timestamp: [] for timestamp in all_possible_timestamp_list}
graph_file_open = open("./" + DATASET_NAME + "-temporal/edge.txt", 'r')
all_lines = graph_file_open.readlines()
line_counter = 0
for line in all_lines:
items = line.strip().split(',')
time = int(items[3])
timestamp_to_line_counter[time].append(line_counter)
line_counter += 1
graph_file_open.close()
temporal_edge_file = open("" + DATASET_NAME + "-temporal/edge_sorted.txt", 'w')
for timestamp in all_possible_timestamp_list:
for line_counter in timestamp_to_line_counter[timestamp]:
temporal_edge_file.write(all_lines[line_counter])
temporal_edge_file.close()
temporal_edge_file = open("./" + DATASET_NAME + "-temporal/edge_sorted.txt", 'r')
temporal_edge_file_sub = open("./" + DATASET_NAME + "-temporal/edge_sorted_sub.txt", 'w')
temporal_edge_file_initial = open("./" + DATASET_NAME + "-temporal/edge_sorted_initial.txt", 'w')
temporal_edge_file_initial_sub = open("./" + DATASET_NAME + "-temporal/edge_sorted_initial_sub.txt", 'w')
temporal_lines = temporal_edge_file.readlines()
for line in temporal_lines:
items = line.strip().split(',')
node1 = int(items[0])
node2 = int(items[1])
attr = int(items[2])
time = int(items[3])
if (is_in_subgraph(node1) and is_in_subgraph(node2)):
temporal_edge_file_sub.write(str(match_from_whole_to_sub(node1)) + ',' + str(match_from_whole_to_sub(node2)) + ',' + str(attr) + ',' + str(time) + '\n')
upper = int(INITIAL_RATIO * len(temporal_lines))
for i in range(upper):
temporal_edge_file_initial.write(temporal_lines[i])
items = temporal_lines[i].strip().split(',')
node1 = int(items[0])
node2 = int(items[1])
attr = int(items[2])
time = int(items[3])
if (is_in_subgraph(node1) and is_in_subgraph(node2)):
temporal_edge_file_initial_sub.write(str(match_from_whole_to_sub(node1)) + ',' + str(match_from_whole_to_sub(node2)) + ',' + str(attr) + ',' + str(time) + '\n')
temporal_edge_file.close()
temporal_edge_file_sub.close()
temporal_edge_file_initial.close()
temporal_edge_file_initial_sub.close()
ground_truth_file = open("./" + DATASET_NAME + "-temporal/ground_truth.txt", 'w')
for i in range(NUM_NODES_SUB):
ground_truth_file.write(str(match_from_sub_to_whole(i)) + ',' + str(i) + '\n')
ground_truth_file.close()