-
Notifications
You must be signed in to change notification settings - Fork 90
/
parcoords.py
393 lines (310 loc) · 11.5 KB
/
parcoords.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
"""This module offers a general purpose parallel coordinate plotting Class
using matplotlib.
"""
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import pandas as pd
import seaborn as sns
from pandas.api.types import CategoricalDtype
from sklearn import preprocessing
# Created on 11 Sep 2017
#
# .. codeauthor::jhkwakkel <j.h.kwakkel (at) tudelft (dot) nl>
__all__ = ["ParallelAxes", "get_limits"]
def _add_limit_labels(ax, x, label, minima, maxima, formatter, fs):
    """Write the formatted minimum and maximum of one dimension onto ``ax``.

    Parameters
    ----------
    ax : matplotlib Axes
         axes the two text items are added to
    x : float
        x position (in data coordinates) of the vertical axis being labelled
    label : str
            name of the dimension; used as key into minima, maxima and
            formatter
    minima : Series or dict-like
    maxima : Series or dict-like
    formatter : dict
                precision format strings keyed by label; ".2f" is used for
                labels that are not present
    fs : int
         fontsize of the text items

    Returns
    -------
    tuple
        (min_text, max_text), the two created text artists

    """
    try:
        precision = formatter[label]
    except KeyError:
        precision = ".2f"

    # format both values before creating any artist, so a bad format string
    # does not leave a half-labelled axis behind
    max_str = f"{maxima[label]:{precision}}"
    min_str = f"{minima[label]:{precision}}"

    # the maximum sits just above the spine (y=1), the minimum just below (y=0)
    max_text = ax.text(x, 1.01, max_str, va="bottom", ha="center", fontsize=fs)
    min_text = ax.text(x, -0.01, min_str, va="top", ha="center", fontsize=fs)
    return min_text, max_text


def setup_parallel_plot(labels, minima, maxima, formatter=None, fs=14, rot=90):
    """helper function for setting up the parallel axes plot

    One subplot is created for every pair of neighbouring dimensions, so
    ``len(labels) - 1`` axes are returned. Each axes spans x from ``i`` to
    ``i + 1`` (1-based) and y from -0.1 to 1.1.

    Parameters
    ----------
    labels : list of str
    minima : ndarray
    maxima : ndarray
    formatter : dict with precision format strings for labels, optional
                defaults to .2f
    fs : int, optional
         fontsize for defaults text items
    rot : float, optional
          rotation of axis labels

    Returns
    -------
    fig : matplotlib Figure
    axes : list of matplotlib Axes
    tick_labels : dict
                  maps each label to its (min_text, max_text) artists

    Raises
    ------
    ValueError
        if fewer than two labels are given; a parallel axes plot needs at
        least two dimensions to draw a single subplot

    """
    nr_columns = len(labels)
    if nr_columns < 2:
        # checked before any figure is created so a bad call has no side effects
        raise ValueError(f"a parallel axes plot needs at least 2 dimensions, got {nr_columns}")

    if formatter is None:
        formatter = {}

    sns.set_style("white")

    # labels is a list, minima and maxima pd series
    fig = plt.figure()
    axes = []
    tick_labels = {}

    # we need one axes less than the number of dimensions
    for position, label in enumerate(labels[:-1], start=1):
        ax = fig.add_subplot(1, nr_columns - 1, position, ylim=(-0.1, 1.1))
        axes.append(ax)
        ax.set_xlim([position, position + 1])
        ax.xaxis.set_major_locator(ticker.FixedLocator([position]))
        ax.xaxis.set_ticklabels([label], rotation=rot, fontsize=fs)
        ax.xaxis.set_tick_params(bottom=False, top=False)

        # let's put our own tick labels
        ax.yaxis.set_ticks([])
        tick_labels[label] = _add_limit_labels(ax, position, label, minima, maxima, formatter, fs)

        ax.spines["left"].set_bounds(0, 1)
        ax.spines["right"].set_bounds(0, 1)
        ax.spines["top"].set_visible(False)
        ax.spines["bottom"].set_visible(False)

    # for the last axis, we need 2 ticks (also for the right hand side)
    ax.xaxis.set_major_locator(ticker.FixedLocator([position, position + 1]))
    ax.xaxis.set_ticklabels(labels[position - 1 : position + 1], fontsize=fs, rotation=rot)

    last_label = labels[-1]
    tick_labels[last_label] = _add_limit_labels(
        ax, position + 1, last_label, minima, maxima, formatter, fs
    )

    # add the tick labels to the rightmost spine; Tick.label2On no longer
    # exists in current matplotlib, so toggle the label artist itself
    for tick in ax.yaxis.get_major_ticks():
        tick.label2.set_visible(True)

    # stack the subplots together
    fig.subplots_adjust(wspace=0)

    return fig, axes, tick_labels
def get_limits(data):
    """Derive per-column limits from a DataFrame, for use with ParallelAxes.

    Columns with dtype ``object`` are treated as categorical: both rows of
    the result hold the set of values found in the column. For every other
    column the first row holds the minimum and the second row the maximum.

    Parameters
    ----------
    data : DataFrame

    Returns
    -------
    DataFrame
        two rows, one column per column of ``data``

    """

    def _column_limits(column):
        if column.dtype == "object":
            # two independent sets, one per row of the result
            return pd.Series([set(column), set(column)])
        lower = column.min()
        upper = column.max()
        return pd.Series([lower, upper])

    return data.apply(_column_limits)
class ParallelAxes:
    """Base class for creating a parallel axis plot.

    Parameters
    ----------
    limits : DataFrame
             A DataFrame specifying the limits for each dimension in the
             data set. For categorical data, the first cell should contain all
             categories. See get_limits for more details.
    formatter : dict , optional
                dict with precision format strings for minima and maxima, use
                column name as key. If column is not present, or no formatter
                dict is provided, precision formatting defaults to .2f
    fontsize : int, optional
               fontsize for defaults text items
    rot : float, optional
          rotation of axis labels

    Attributes
    ----------
    limits : DataFrame
             copy of the given limits, with categorical columns replaced by
             the range of their integer codes
    recoding : dict
               CategoricalDtype per categorical column
    flipped_axes : set
                   names of the axes that are currently inverted
    axis_labels : list
                  column names of limits, in plotting order
    fig, axes, ticklabels
        figure, list of axes and dict of (min, max) text artists as returned
        by setup_parallel_plot
    datalabels : list
                 (label, color) pairs collected by plot for use in legend

    """

    def __init__(self, limits, formatter=None, fontsize=14, rot=90):
        """

        Parameters
        ----------
        limits : DataFrame
                 categorical data, first cell should contain all categories
        formatter : dict, optional
                    specify precision formatters for minima and maxima,
                    defaults to .2f
        fontsize : int, optional
                   fontsize for defaults text items
        rot : float, optional
              rotation of axis labels

        """
        self.limits = limits.copy()  # copy to avoid side effects
        self.recoding = {}
        self.flipped_axes = set()
        self.axis_labels = list(limits.columns.values)
        self.fontsize = fontsize

        # recode categorical columns to the range of their integer codes
        for column, dtype in limits.dtypes.items():
            if dtype == "object":
                # positional access: the *first* cell holds the categories,
                # whatever the index labels of limits happen to be
                cats = limits[column].iloc[0]

                if isinstance(cats, (set, frozenset)):
                    # set iteration order is hash dependent (and for strings
                    # differs between interpreter runs); sort so that the
                    # category codes, and thus the plot, are reproducible
                    try:
                        cats = sorted(cats)
                    except TypeError:
                        # members that cannot be ordered: keep set order
                        cats = list(cats)

                self.recoding[column] = CategoricalDtype(categories=cats, ordered=False)
                self.limits.loc[:, column] = [0, len(cats) - 1]

        # scale every dimension to [0, 1] based on the limits
        self.normalizer = preprocessing.MinMaxScaler()
        self.normalizer.fit(self.limits)

        fig, axes, ticklabels = setup_parallel_plot(
            self.axis_labels,
            self.limits.min(),
            self.limits.max(),
            fs=self.fontsize,
            rot=rot,
            formatter=formatter,
        )

        self.fig = fig
        self.axes = axes
        self.ticklabels = ticklabels
        self.datalabels = []

        # TODO:: can't we force the wspace attribute instead having
        # to reset it after tight_layout?
        self.fig.tight_layout(h_pad=0, w_pad=0)
        self.fig.subplots_adjust(wspace=0)

    def plot(self, data, color=None, label=None, **kwargs):
        """plot data on parallel axes

        Parameters
        ----------
        data : DataFrame or Series
        color : valid mpl color, optional
        label : str, optional
                if given, a (label, color) entry is stored for the legend

        any additional kwargs will be passed to matplotlib's plot
        method.

        Data is normalized using the limits specified when initializing
        ParallelAxes.

        """
        if isinstance(data, pd.Series):
            # a single observation becomes a one-row frame
            data = data.to_frame().T

        if label:
            self.datalabels.append((label, color))

        # ensures any data to be plotted is in the same order
        # as the limits; copy to avoid side effects on the caller's data
        recoded = data[self.axis_labels].copy()

        # recode the categorical columns to their integer codes
        for column, cat_dtype in self.recoding.items():
            recoded[column] = recoded[column].astype(cat_dtype).cat.codes

        # normalize the data
        normalized_data = pd.DataFrame(self.normalizer.transform(recoded), columns=recoded.columns)

        # plot the data
        self._plot(normalized_data, color=color, **kwargs)

    def legend(self):
        """add a legend to the figure

        One entry is created for every (label, color) pair stored by plot.

        """
        artists = []
        labels = []

        for label, color in self.datalabels:
            # proxy artist; it is only used as legend handle
            artists.append(plt.Line2D([0, 1], [0, 1], color=color))
            labels.append(label)

        self.fig.legend(
            artists,
            labels,
            ncol=1,
            fontsize=self.fontsize,
            loc=2,
            borderaxespad=0.1,
            bbox_to_anchor=(1.025, 0.925),
        )

        # act on this plot's own figure rather than on whatever figure
        # happens to be pyplot's current one
        self.fig.tight_layout(h_pad=0, w_pad=0)
        self.fig.subplots_adjust(wspace=0)

    def _plot(self, data, **kwargs):
        """Plot the data onto the parallel axis

        Parameters
        ----------
        data : DataFrame
               normalized data with one column per axis label

        """
        label_pairs = zip(self.axes, self.axis_labels[:-1], self.axis_labels[1:])

        for left_x, (ax, label_i, label_j) in enumerate(label_pairs, start=1):
            plotdata = data.loc[:, [label_i, label_j]]
            # one line per row, running from the left to the right spine
            lines = ax.plot([left_x, left_x + 1], plotdata.values.T, **kwargs)

            # newly added lines have to follow axes that are already inverted
            if label_i in self.flipped_axes:
                self._update_plot_data(ax, 0, lines=lines)
            if label_j in self.flipped_axes:
                self._update_plot_data(ax, 1, lines=lines)

    def invert_axis(self, axis):
        """flip direction for specified axis

        Inverting an axis that is already inverted restores it.

        Parameters
        ----------
        axis : str or list of str

        """
        if isinstance(axis, str):
            axis = [axis]

        for entry in axis:
            self._invert_axis(entry)
            # keep track of flipped axes: toggle membership
            self.flipped_axes.symmetric_difference_update({entry})

    def _invert_axis(self, axis):
        """Mirror the plotted data and the min/max labels of a single axis.

        Parameters
        ----------
        axis : str

        """
        ids = self._get_axes_ids(axis)

        if len(ids) == 1:
            # outermost dimension, drawn on one subplot only. The endpoint is
            # derived from the column position and not from the axes id: with
            # exactly two dimensions both columns map onto axes 0
            is_first_column = self.limits.columns.get_loc(axis) == 0
            endpoint = 0 if is_first_column else 1
            self._update_plot_data(self.axes[ids[0]], endpoint)
        else:
            # inner dimension: it is the left end (0) of the lines in the
            # axes to its right and the right end (1) in the axes to its left
            left_axes_id, right_axes_id = ids
            self._update_plot_data(self.axes[right_axes_id], 0)
            self._update_plot_data(self.axes[left_axes_id], 1)

        self._update_ticklabels(axis)

    def _update_plot_data(self, ax, index, lines=None):
        """Mirror one endpoint of lines around 0.5 (y becomes 1 - y).

        Parameters
        ----------
        ax : matplotlib Axes
        index : {0, 1}
                endpoint of the lines to mirror, 0 is left, 1 is right
        lines : list of lines, optional
                defaults to all lines of ax

        """
        if lines is None:
            lines = ax.get_lines()

        for line in lines:
            ydata = line.get_data()[1]
            ydata[index] = 1 - ydata[index]
            line.set_ydata(ydata)

    def _update_ticklabels(self, axis):
        """Swap the positions of the minimum and maximum label of an axis.

        Parameters
        ----------
        axis : str

        """
        for text in self.ticklabels[axis]:
            x, y = text.get_position()

            # labels only ever sit at -0.01 (below) or 1.01 (above), the
            # exact values they were created with, so == is safe here
            if y == -0.01:
                y = 1.01
                text.set_va("bottom")
            else:
                y = -0.01
                text.set_va("top")
            text.set_position((x, y))

    # TODO:: more fine-grained control for intermediate ticklabels
    # probably enable this by default for categorical axes
    # while having it disabled for continuous variables
    # from http://benalexkeen.com/parallel-coordinates-in-matplotlib/
    # # Set the tick positions and labels on y axis for each plot
    # # Tick positions based on normalised data
    # # Tick labels are based on original data
    # def set_ticks_for_axis(dim, ax, ticks):
    #     min_val, max_val, val_range = min_max_range[cols[dim]] # the limits
    #     step = val_range / float(ticks-1)
    #     tick_labels = [round(min_val + step * i, 2) for i in range(ticks)]
    #     norm_min = df[cols[dim]].min()
    #     norm_range = np.ptp(df[cols[dim]])
    #     norm_step = norm_range / float(ticks-1)
    #     ticks = [round(norm_min + norm_step * i, 2) for i in range(ticks)]
    #     ax.yaxis.set_ticks(ticks)
    #     ax.set_yticklabels(tick_labels)

    def _get_axes_ids(self, column):
        """Return the indices into self.axes on which column is drawn.

        Parameters
        ----------
        column : str

        Returns
        -------
        tuple of int
            a single index for the first and the last column, otherwise the
            indices of the axes to the left and to the right of the column

        """
        index = self.limits.columns.get_loc(column)
        last_axes_id = len(self.axes) - 1

        if index == 0 or index > last_axes_id:
            # outermost columns belong to one subplot only; the last column
            # has no axes of its own and is clamped onto the last axes
            return (min(index, last_axes_id),)

        return index - 1, index