-
Notifications
You must be signed in to change notification settings - Fork 3
/
wordcloud-1.py
executable file
·183 lines (154 loc) · 6.29 KB
/
wordcloud-1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#! /usr/bin/env python
# Author: Andreas Christian Mueller <[email protected]>
# (c) 2012
#
# License: MIT
import random
from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
import numpy as np
from query_integral_image import query_integral_image
# Default TrueType font file; make_wordcloud falls back to this path when it
# is called without an explicit ``font_path``.
FONT_PATH = "/usr/share/fonts/truetype/droid/DroidSansMono.ttf"
def make_wordcloud(words, counts, fname, font_path=None, width=400, height=200,
                   margin=5, ranks_only=False):
    """Build a word cloud from word counts and store it in an image file.

    Words are processed from the highest to the lowest count. Each word is
    first laid out on a greyscale canvas, shrinking the font until a free
    spot is found, and the final layout is then redrawn in color, shown on
    screen and written to ``fname``.

    Parameters
    ----------
    words : numpy array (or sequence) of strings
        Words that will be drawn in the image.

    counts : numpy array (or sequence) of word counts
        Word counts or weighting of words. Determines the order (and thereby
        the size) of the words in the final image.
        Will be normalized by dividing by the largest count.

    fname : string
        Output filename. The extension determines the image type
        (written with PIL).

    font_path : string, optional
        Path to the font that will be used.
        Defaults to the module-level ``FONT_PATH``.

    width : int (default=400)
        Width of the word cloud image.

    height : int (default=200)
        Height of the word cloud image.

    margin : int (default=5)
        Extra pixels added to the height and width of a word's box when
        searching for free space; the word itself is drawn ``margin // 2``
        pixels inside that box.

    ranks_only : boolean (default=False)
        Only use the rank of the words, not the actual counts. When True the
        count-based cap on the font size is skipped and the size only ever
        shrinks when a word does not fit.

    Raises
    ------
    ValueError
        If ``counts`` is empty.

    Notes
    -----
    Larger images will make the code significantly slower.
    If you need a large image, you can try running the algorithm at a lower
    resolution and then drawing the result at the desired resolution.

    Because the counts are normalized before the font size is derived from
    them, the relative differences between counts hardly matter; the result
    depends essentially on the rank of each word.
    Play with setting ``font_size`` in the main loop for different styles.

    Colors are chosen completely at random: every word gets a random hue
    with fixed saturation and lightness ("hsl(<hue>, 80%, 50%)").
    Adjusting the percentages at the very end gives different color ranges.
    """
    # Accept plain sequences as well as numpy arrays; the fancy indexing
    # below needs arrays.
    words = np.asarray(words)
    counts = np.asarray(counts)

    if len(counts) == 0:
        # Fail early with a clear message instead of letting counts.max()
        # blow up on an empty array further down.
        raise ValueError("We need at least 1 word to plot a word cloud, "
                         "got %d." % len(counts))

    if font_path is None:
        font_path = FONT_PATH

    # normalize counts
    counts = counts / float(counts.max())
    # sort words by counts, largest first
    inds = np.argsort(counts)[::-1]
    counts = counts[inds]
    words = words[inds]

    # Greyscale canvas used only for layout: non-zero pixels mark space that
    # is already occupied by a word.
    img_grey = Image.new("L", (width, height))
    draw = ImageDraw.Draw(img_grey)
    # Integral image (2-D cumulative sum) of the greyscale canvas.
    integral = np.zeros((height, width), dtype=np.uint32)
    font_sizes, positions, orientations = [], [], []
    # initialize font size "large enough"
    font_size = 1000

    # start drawing grey image
    for word, count in zip(words, counts):
        # alternative way to set the font size
        if not ranks_only:
            font_size = min(font_size, int(100 * np.log(count + 100)))

        result = None
        # Shrink the font until the word fits. Never build a font of size
        # zero (or less): that is not a valid size and nothing would be
        # drawn with it anyway.
        while font_size > 0:
            font = ImageFont.truetype(font_path, font_size)
            # transpose font optionally
            orientation = random.choice([None, Image.ROTATE_90])
            transposed_font = ImageFont.TransposedFont(
                font, orientation=orientation)
            # get size of resulting text; the font is passed explicitly
            # rather than through the deprecated draw.setfont()
            box_size = draw.textsize(word, font=transposed_font)
            # find possible places using integral image
            # NOTE(review): query_integral_image is defined elsewhere; it
            # presumably returns the (row, col) of a free box or None --
            # confirm against its implementation.
            result = query_integral_image(integral, box_size[1] + margin,
                                          box_size[0] + margin)
            if result is not None:
                break
            # if we didn't find a place, make font smaller
            font_size -= 1

        if result is None:
            # we were unable to draw any more
            break

        x, y = np.array(result) + margin // 2
        # actually draw the text (PIL expects (column, row))
        draw.text((y, x), word, fill="white", font=transposed_font)
        positions.append((x, y))
        orientations.append(orientation)
        font_sizes.append(font_size)

        # recompute integral image
        img_array = np.asarray(img_grey)
        # Only the part below/right of the new word can have changed.
        # the order of the cumsum's is important for speed ?!
        partial_integral = np.cumsum(np.cumsum(img_array[x:, y:], axis=1),
                                     axis=0)
        # paste recomputed part into old image
        # if x or y is zero it is a bit annoying
        if x > 0:
            if y > 0:
                partial_integral += (integral[x - 1, y:]
                                     - integral[x - 1, y - 1])
            else:
                partial_integral += integral[x - 1, y:]
        if y > 0:
            partial_integral += integral[x:, y - 1][:, np.newaxis]

        integral[x:, y:] = partial_integral

    # redraw in color
    img = Image.new("RGB", (width, height))
    draw = ImageDraw.Draw(img)
    # zip stops at the shortest list, so words that were never placed are
    # skipped automatically.
    everything = zip(words, font_sizes, positions, orientations)
    for word, font_size, position, orientation in everything:
        font = ImageFont.truetype(font_path, font_size)
        # transpose font optionally
        transposed_font = ImageFont.TransposedFont(font,
                                                   orientation=orientation)
        draw.text((position[1], position[0]), word, font=transposed_font,
                  fill="hsl(%d, 80%%, 50%%)" % random.randint(0, 255))
    img.show()
    img.save(fname)
if __name__ == "__main__":
    # Command-line entry point: count the words of the input text and draw
    # them as a word cloud into "<first source name>_.png".
    import os
    import sys
    from sklearn.feature_extraction.text import CountVectorizer

    if "-" in sys.argv:
        # A "-" argument means: read the text from standard input.
        lines = sys.stdin.readlines()
        sources = ['stdin']
    else:
        # Use every existing path given on the command line; fall back to
        # "constitution.txt" when none is given.
        sources = ([arg for arg in sys.argv[1:] if os.path.exists(arg)]
                   or ["constitution.txt"])
        lines = []
        for s in sources:
            with open(s) as f:
                lines.extend(f.readlines())
    text = "".join(lines)

    # ``decode_error`` is the current name of the keyword formerly called
    # ``charset_error``.
    cv = CountVectorizer(min_df=1, decode_error="ignore",
                         stop_words="english", max_features=200)
    counts = cv.fit_transform([text]).toarray().ravel()
    # Newer scikit-learn releases only provide get_feature_names_out();
    # older ones only get_feature_names(). Support both.
    get_names = getattr(cv, "get_feature_names_out", None)
    if get_names is None:
        get_names = cv.get_feature_names
    words = np.array(get_names())

    # throw away words that occur only once
    keep = counts > 1
    if not keep.any():
        sys.exit("No word occurs more than once in the input; "
                 "nothing to draw.")
    words = words[keep]
    counts = counts[keep]

    output_filename = (os.path.splitext(os.path.basename(sources[0]))[0]
                       + "_.png")
    # make_wordcloud returns nothing, so its result is not assigned.
    make_wordcloud(words, counts, output_filename)