-
Notifications
You must be signed in to change notification settings - Fork 0
/
stats_dataset_features.py
206 lines (171 loc) · 8.67 KB
/
stats_dataset_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
# -*- coding: gbk -*-
import json
import path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import to_rgba
from mappings import *
# Name of the dataset being summarized; output file names are derived from it
file_name = 'StatQA'

# Title (and file stem) of the dataset distribution chart
figure_name = file_name + ' - Dataset feature and distribution'

# Fill color of each task's wedge on the inner ring
task_colors = {
    'Correlation Analysis': '#fecccb',          # light red
    'Distribution Compliance Test': '#bed9ee',  # light blue
    'Contingency Table Test': '#c4dfb2',        # light green
    'Descriptive Statistics': '#f0daef',        # light purple
    'Variance Test': '#fed9a6',                 # light orange
}

# Load the dataset CSV; directory pieces come from the project's `path` module
file_path = ''.join([path.integ_dataset_path, path.balance_path, file_name, '.csv'])
data = pd.read_csv(file_path)
# Adaptive text size based on sector size
def adjust_text_size_inner(sizes, min_size=10, max_size=14):
    """Pick a font size for every sector, proportional to the sector's size.

    The largest sector gets ``max_size``; every other sector is interpolated
    linearly between ``min_size`` and ``max_size`` by its ratio to the largest.

    Args:
        sizes: Iterable of sector sizes (list, NumPy array, pandas Series, ...).
        min_size: Font size approached as a sector's size goes to zero.
        max_size: Font size given to the largest sector.

    Returns:
        A list of font sizes in the same order as ``sizes``. Empty input
        yields an empty list.
    """
    # Materialize once so the input can be scanned twice (max + loop),
    # even when it is a one-shot iterator.
    sizes = list(sizes)
    if not sizes:
        return []

    size_range = max_size - min_size
    largest = max(sizes)

    text_sizes = []
    for size in sizes:
        # When every sector is zero there is nothing to scale by; fall back
        # to the minimum size instead of dividing by zero.
        fraction = size / largest if largest else 0.0
        text_sizes.append(min_size + fraction * size_range)
    return text_sizes
def adjust_text_size_outer(sizes, min_size=10, max_size=14, small_threshold=5, small_size=12):
    """Pick a font size for every outer sector, with a fixed size for small ones.

    Sectors larger than ``small_threshold`` are interpolated linearly between
    ``min_size`` and ``max_size`` by their ratio to the largest sector.
    Sectors at or below the threshold always get ``small_size``, regardless of
    ``min_size`` (presumably so labels still fit in narrow wedges).

    Args:
        sizes: Iterable of sector sizes (list, NumPy array, pandas Series, ...).
        min_size: Font size approached as a sector's size goes to zero.
        max_size: Font size given to the largest sector.
        small_threshold: Sectors with ``size <= small_threshold`` are "small".
        small_size: Font size used for small sectors.

    Returns:
        A list of font sizes in the same order as ``sizes``. Empty input
        yields an empty list.
    """
    # Materialize once so the input can be scanned twice (max + loop),
    # even when it is a one-shot iterator.
    sizes = list(sizes)
    if not sizes:
        return []

    size_range = max_size - min_size
    largest = max(sizes)

    text_sizes = []
    for size in sizes:
        if size <= small_threshold:
            text_sizes.append(small_size)
            continue
        # `largest` is non-zero here only if some sector is; guard anyway so
        # unusual thresholds cannot trigger a division by zero.
        fraction = size / largest if largest else 0.0
        text_sizes.append(min_size + fraction * size_range)
    return text_sizes
# Full legend entries for the outer ring (one per method wedge, in plot order)
outer_legend_labels = []

# Number of rows per task (absolute frequency), most frequent first
task_counts = data['task'].value_counts()
total_task_count = task_counts.sum()

# Prepare the plot data
# Inner ring: one wedge per task, labelled "<abbreviation>\n<share of rows>"
inner_sizes = task_counts.values
inner_labels = [
    f"{task_abbreviations.get(task, task)}\n{count/total_task_count:.1%}"
    for task, count in zip(task_counts.index, task_counts.values)
]
inner_colors = [task_colors.get(task, '#999999') for task in task_counts.index]

# Outer ring: filled task by task so method wedges line up with their task wedge
outer_sizes = []
outer_labels = []
outer_colors = []

# Angle in degrees that each task occupies on the inner ring
angle_per_task = {task: count / total_task_count * 360 for task, count in task_counts.items()}
cumulative_angles = 360  # not referenced elsewhere in this script
start_angle = 140  # starting angle shared by both rings

for task in task_counts.index:
    task_data = data[data['task'] == task]
    # Each 'results' cell is a JSON list of {"method": ..., "conclusion": ...} records
    methods_json = task_data['results'].apply(json.loads)

    # Count how often each method is applicable within this task
    method_applicable_counts = {}
    for methods in methods_json:
        for entry in methods:
            if entry['conclusion'] != "Not applicable":
                method_name = entry['method']
                method_applicable_counts[method_name] = method_applicable_counts.get(method_name, 0) + 1

    # Total number of applicable method occurrences for the current task
    total_applicable_methods = sum(method_applicable_counts.values())

    # NOTE(review): a task with no applicable method adds no outer wedge at all,
    # which would leave the outer ring out of step with the inner ring —
    # confirm the dataset never contains such a task.
    task_angle = angle_per_task[task]
    task_color = task_colors.get(task, '#999999')
    for method, m_count in method_applicable_counts.items():
        # A method's wedge is its share of the task's applicable methods,
        # scaled to the angle the task occupies on the inner ring.
        method_angle = (m_count / total_applicable_methods) * task_angle
        outer_sizes.append(method_angle)
        outer_labels.append(method_abbreviations[method])
        outer_colors.append(task_color)
        # Shorten long method names so the legend stays compact
        method_to_show = method.replace('distribution', 'distr.').replace('Exponential', 'Exp.').replace('Uniform', 'Uni.').replace('Gamma', 'Gam.')
        outer_legend_labels.append(f"{method_abbreviations[method]}: {method_to_show} ({m_count})")

# Convert colors to RGBA format and add transparency to outer ring colors
outer_colors_alpha = [to_rgba(color, alpha=0.6) for color in outer_colors]

# Inner and outer text size list
inner_text_sizes = adjust_text_size_inner(inner_sizes, min_size=22, max_size=24)
outer_text_sizes = adjust_text_size_outer(outer_sizes, min_size=20, max_size=22)
# Draw chart for dataset distribution
fig, ax = plt.subplots()

# Plot inner ring (tasks)
inner_pie = ax.pie(inner_sizes, radius=1.6, colors=inner_colors, labeldistance=0.9,
                   wedgeprops=dict(width=0.9, edgecolor='w'), startangle=start_angle)

# Put each inner label at the angular center of its wedge
for i, (wedge, label) in enumerate(zip(inner_pie[0], inner_labels)):
    angle = (wedge.theta2 - wedge.theta1) / 2. + wedge.theta1  # center angle of the sector
    x = 1.15 * np.cos(np.deg2rad(angle))  # radius chosen to sit inside the ring band
    y = 1.15 * np.sin(np.deg2rad(angle))
    ax.text(x, y, label, ha='center', va='center', fontsize=inner_text_sizes[i])

# Plot outer ring (methods)
outer_pie = ax.pie(outer_sizes, radius=2.4, colors=outer_colors_alpha,
                   wedgeprops=dict(width=0.8, edgecolor='w'), startangle=start_angle)

# Set the position and orientation of the outer ring label text
for i, wedge in enumerate(outer_pie[0]):
    angle = (wedge.theta2 - wedge.theta1) / 2. + wedge.theta1
    x = 2.0 * np.cos(np.deg2rad(angle))
    y = 2.0 * np.sin(np.deg2rad(angle))

    # Wedge angles are counted up from start_angle, so they can exceed 360.
    # Fold into [0, 360) first; otherwise labels in the last part of the ring
    # miss the left-half test below and end up upside-down.
    direction = angle % 360

    # Labels on the left half are flipped by 180 degrees so they stay readable
    if 90 < direction < 270:
        text_rotation = direction - 180
    else:
        text_rotation = direction
    # Keep the rotation within (-180, 180]
    if text_rotation > 180:
        text_rotation -= 360

    ax.text(x, y, outer_labels[i], ha='center', va='center', fontsize=outer_text_sizes[i],
            rotation=text_rotation, rotation_mode='anchor')
# ---------------------------------------------------------------------------
# Plot and save dataset distribution double layer donut chart
# ---------------------------------------------------------------------------

# Legend entries for tasks: "<abbreviation> (Task): <task> (<count>/<total>)"
task_legend_labels = []
for task, count in task_counts.items():
    abbreviation = task_abbreviations.get(task, task)
    task_legend_labels.append(f"{abbreviation} (Task): {task} ({count}/{total_task_count})")

# Task entries come first, then method entries, matching the order in which
# the inner and outer rings were drawn.
combined_legend_labels = task_legend_labels + outer_legend_labels
plt.legend(
    combined_legend_labels,
    loc="center left",
    prop={'size': 20},
    bbox_to_anchor=(1.55, 0, 0.5, 1),
    ncol=2,
)

# Write the chart as a PDF under the "Dataset Notable Statistics" folder
donut_pdf_path = path.chart_dir + 'Dataset Notable Statistics/' + figure_name + ".pdf"
plt.savefig(donut_pdf_path, format='pdf', bbox_inches='tight', dpi=600)
print(f"[+] {figure_name} has been saved to {path.chart_dir}.")
# ---------------------------------------------------------------------------
# Save the distribution information to a csv file
# ---------------------------------------------------------------------------
all_legend_labels = task_legend_labels + outer_legend_labels

# Each label looks like "<abbr>[ (Task)]: <full name> (<count>)".
# Break it into abbreviation, full name, and count.
legend_data = []
for label in all_legend_labels:
    # Split at the FIRST ": " only, so a name that itself contains ": "
    # is kept whole instead of being truncated.
    abbreviation_part, description = label.split(": ", 1)
    # The count is the last parenthesized group of the description
    full_name, count_text = description.rsplit(" (", 1)
    legend_data.append([
        abbreviation_part.split(" (")[0],  # drop the " (Task)" marker if present
        full_name,
        count_text.strip(")"),
    ])

# Convert to DataFrame
legend_df = pd.DataFrame(legend_data, columns=['Abbreviation', 'Full Name', 'Count/Percentage'])

# Save to CSV
table_name = file_name + ' - Dataset distribution table.csv'
legend_df.to_csv(path.chart_dir + 'Dataset Notable Statistics/' + table_name, index=False)
print("[+] " + table_name + " has been saved to " + path.chart_dir + '.')
# ---------------------------------------------------------------------------
# Calculate and plot difficulty distribution
# ---------------------------------------------------------------------------

# Number of rows per difficulty level and the matching percentages
difficulty_counts = data['difficulty'].value_counts()
percentage = 100.0 * difficulty_counts / difficulty_counts.sum()

difficulty_colors = ['#fbb4ae', '#b3cde3', '#ccebc5']
fig, ax = plt.subplots(figsize=(10, 8))
scenes, _ = ax.pie(difficulty_counts, startangle=140, colors=difficulty_colors)

# In-wedge labels: "<difficulty> (<percent>%)"
labels = [f'{level} ({share:.1f}%)' for level, share in zip(difficulty_counts.index, percentage)]

# Adjust text sizes
text_sizes = adjust_text_size_outer(difficulty_counts, min_size=14, max_size=20)

# Place each label halfway out along the center angle of its wedge.
# The text is always centered on that point, so no per-side alignment is needed.
for i, wedge in enumerate(scenes):
    angle = (wedge.theta2 - wedge.theta1) / 2. + wedge.theta1
    x = 0.5 * np.cos(np.deg2rad(angle))
    y = 0.5 * np.sin(np.deg2rad(angle))
    ax.text(x, y, labels[i], ha='center', va='center', fontsize=text_sizes[i])
ax.axis('equal')  # keep the pie circular

# Legend with category, percentage, and count
legend_labels = [
    f'{level} ({share:.1f}%, {count})'
    for level, share, count in zip(difficulty_counts.index, percentage, difficulty_counts)
]
plt.legend(loc="center left", bbox_to_anchor=(1, 0.5), title="Difficulty", labels=legend_labels, prop={'size': 10})
plt.title('Difficulty Distribution', fontsize=18)
plt.savefig(path.chart_dir + 'Dataset Notable Statistics/' + file_name + " - Difficulty Distribution.pdf", format='pdf', bbox_inches='tight', dpi=300)
print(f"[+] {file_name} - Difficulty Distribution chart saved to " + path.chart_dir)