-
Notifications
You must be signed in to change notification settings - Fork 0
/
dataprocessing_temporal.py
138 lines (119 loc) · 5.6 KB
/
dataprocessing_temporal.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
from dataprocessing import DATASET_NAME, NUM_NODES_SUB, RATIO
from dataset_property import DATASET_SIZE, LEFT_SIZE, RIGHT_SIZE
COUNT_ONE = LEFT_SIZE
INITIAL_RATIO = 0.2
import pdb
def node_attr_classifier_left(d):
    """Bucket the value ``d`` into an attribute label.

    Values below 200 are grouped into bins of width 5 (``int(d / 5)``, i.e.
    labels 0..39 for non-negative ``d``); every value of 200 or more shares
    the single overflow label 40. In this file it is applied to the degree
    counts of the left-side nodes.
    """
    return int(d / 5) if d < 200 else 40
def node_attr_classifier_right(d):
    """Bucket the value ``d`` into an attribute label for a right-side node.

    Uses the same bucketing as ``node_attr_classifier_left`` and shifts the
    result up by 41, so right-side labels never collide with the 0..40 range
    that the left-side classifier produces for non-negative input.
    """
    left_label = node_attr_classifier_left(d)
    return 41 + left_label
def is_in_subgraph(number):
    """Return True when whole-graph node index ``number`` is in the subgraph.

    Two index ranges count as inside the subgraph:

    * ``[0, NUM_NODES_SUB * RATIO)``
    * ``[LEFT_SIZE, LEFT_SIZE + NUM_NODES_SUB * (1 - RATIO))``

    Negative indices and everything outside those ranges are rejected.
    """
    first_range_end = NUM_NODES_SUB*RATIO
    second_range_end = LEFT_SIZE + NUM_NODES_SUB*(1-RATIO)

    if number < 0:
        return False
    if number < first_range_end:
        return True
    # Gap between the end of the first range and the start of the second.
    if number < LEFT_SIZE:
        return False
    if number < second_range_end:
        return True
    return False
def match_from_sub_to_whole(sub_index):
    """Translate a subgraph node index into the matching whole-graph index.

    Subgraph indices in ``[0, NUM_NODES_SUB * RATIO)`` keep their value.
    Indices from there up to ``NUM_NODES_SUB`` are shifted so that the first
    of them lands on ``LEFT_SIZE``.

    Args:
        sub_index: node index inside the subgraph.

    Returns:
        The corresponding node index in the whole graph.

    Raises:
        ValueError: if ``sub_index`` lies outside ``[0, NUM_NODES_SUB)``.
            This mirrors ``match_from_whole_to_sub``, which also rejects
            nodes outside the subgraph with a ``ValueError``, instead of
            silently returning ``None`` (or echoing a negative index back).
    """
    if sub_index < 0 or sub_index >= NUM_NODES_SUB:
        raise ValueError("This index is not in the subgraph")
    first_range_end = NUM_NODES_SUB*RATIO
    if sub_index < first_range_end:
        return sub_index
    # first_range_end may be a float, hence the int() on the shifted result.
    return int(sub_index - first_range_end + LEFT_SIZE)
def match_from_whole_to_sub(whole_index):
    """Translate a whole-graph node index into its subgraph index.

    Indices in ``[0, NUM_NODES_SUB * RATIO)`` keep their value. Indices in
    the second subgraph range (the one starting at ``LEFT_SIZE``) are shifted
    down so that ``LEFT_SIZE`` maps to ``NUM_NODES_SUB * RATIO``.

    Raises:
        ValueError: if ``is_in_subgraph(whole_index)`` is false.
    """
    if not is_in_subgraph(whole_index):
        raise ValueError("This node is not in the subgraph")

    first_range_end = NUM_NODES_SUB*RATIO
    if whole_index < first_range_end:
        return whole_index
    if whole_index < LEFT_SIZE + NUM_NODES_SUB*(1-RATIO):
        return int(whole_index - LEFT_SIZE + first_range_end)
    # Not expected to be reached once the membership check above has passed.
    return None
def _csv_line(fields):
    """Join ``fields`` with commas and terminate the line with a newline."""
    return ','.join(str(field) for field in fields) + '\n'


def _subgraph_edges(edges):
    """Keep edges with both endpoints in the subgraph, re-indexed to subgraph ids."""
    return [
        (match_from_whole_to_sub(node1), match_from_whole_to_sub(node2), attr, timestamp)
        for node1, node2, attr, timestamp in edges
        if is_in_subgraph(node1) and is_in_subgraph(node2)
    ]


def _write_lines(path, lines):
    """Write ``lines`` to ``path``; the file is closed even if a write fails."""
    with open(path, 'w') as out_file:
        out_file.writelines(lines)


if __name__ == '__main__':
    # Builds the "<DATASET_NAME>-temporal" files from the raw ratings file:
    #   edge.txt                       edges in input order
    #   node_attr.txt                  degree-bucket label for every node
    #   node_attr_sub.txt              the same, restricted to the subgraph
    #   edge_sorted.txt                edges ordered by timestamp
    #   edge_sorted_sub.txt            sorted edges inside the subgraph
    #   edge_sorted_initial.txt        first INITIAL_RATIO share of sorted edges
    #   edge_sorted_initial_sub.txt    that share, restricted to the subgraph
    #   ground_truth.txt               whole-graph index,subgraph index pairs
    source_path = "./" + DATASET_NAME + "-original/out.wikilens-ratings"
    # One shared prefix so every output is addressed the same way.
    out_dir = "./" + DATASET_NAME + "-temporal/"

    # Dicts (not lists) so that an out-of-range node id raises KeyError
    # instead of silently wrapping around via negative indexing.
    node_degree_count_left = {node: 0 for node in range(LEFT_SIZE)}
    node_degree_count_right = {node: 0 for node in range(RIGHT_SIZE)}

    # Each edge is (node1, node2, attr, timestamp) in whole-graph indexing:
    # input ids are shifted down by one, and second-column ids are further
    # offset by LEFT_SIZE so both node sets share one index space.
    edges = []
    with open(source_path, 'r') as graph_file:
        for line in graph_file:
            stripped = line.strip()
            if not stripped:
                # A blank (e.g. trailing) line carries no edge; skip it
                # rather than failing on int('').
                continue
            items = stripped.split('\t')
            node1 = int(items[0])
            node2 = int(items[1])
            # Third column is doubled and truncated to an int
            # (presumably to turn half-step ratings into integers - TODO confirm).
            attr = int(float(items[2]) * 2)
            timestamp = int(items[3])
            node_degree_count_left[node1 - 1] += 1
            node_degree_count_right[node2 - 1] += 1
            edges.append((node1 - 1, node2 + LEFT_SIZE - 1, attr, timestamp))

    _write_lines(out_dir + "edge.txt", (_csv_line(edge) for edge in edges))

    # Node attributes: left nodes first, then right nodes offset by LEFT_SIZE.
    node_attrs = [
        (i, node_attr_classifier_left(node_degree_count_left[i]))
        for i in range(LEFT_SIZE)
    ]
    node_attrs.extend(
        (j + LEFT_SIZE, node_attr_classifier_right(node_degree_count_right[j]))
        for j in range(RIGHT_SIZE)
    )
    _write_lines(out_dir + "node_attr.txt", (_csv_line(pair) for pair in node_attrs))
    _write_lines(
        out_dir + "node_attr_sub.txt",
        (
            _csv_line((match_from_whole_to_sub(node), label))
            for node, label in node_attrs
            if is_in_subgraph(node)
        ),
    )

    # sorted() is stable, so edges sharing a timestamp keep their input order.
    sorted_edges = sorted(edges, key=lambda edge: edge[3])
    _write_lines(out_dir + "edge_sorted.txt", (_csv_line(edge) for edge in sorted_edges))
    _write_lines(
        out_dir + "edge_sorted_sub.txt",
        (_csv_line(edge) for edge in _subgraph_edges(sorted_edges)),
    )

    # The earliest INITIAL_RATIO share of the edges forms the "initial" graph.
    upper = int(INITIAL_RATIO * len(sorted_edges))
    initial_edges = sorted_edges[:upper]
    _write_lines(
        out_dir + "edge_sorted_initial.txt",
        (_csv_line(edge) for edge in initial_edges),
    )
    _write_lines(
        out_dir + "edge_sorted_initial_sub.txt",
        (_csv_line(edge) for edge in _subgraph_edges(initial_edges)),
    )

    # Ground truth: one "whole-graph index,subgraph index" line per subgraph node.
    _write_lines(
        out_dir + "ground_truth.txt",
        (_csv_line((match_from_sub_to_whole(i), i)) for i in range(NUM_NODES_SUB)),
    )