-
Notifications
You must be signed in to change notification settings - Fork 0
/
lexconvert.py
646 lines (623 loc) · 40 KB
/
lexconvert.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
#!/usr/bin/env python
program_name = "lexconvert v0.142 - convert between lexicons of different speech synthesizers\n(c) 2007-2008 Silas S. Brown. License: GPL"
# with contributions from Jan Weiss (x-sampa, acapela-uk, cmu and ms-sapi)
# Modified 20080402 by E Jackson to include IPA superscripted tone numbers into X-SAMPA
# Run without arguments for usage information
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# Phoneme Conversion Table:
# Column 'festival': phoneme in Festival's British English notation
# Column 'espeak': approximation in eSpeak, but note that eSpeak's phoneme representation is not that simple (hence some of the code below)
# Column 'sapi': approximation in American English in MS SAPI notation
# Column 'cepstral': phoneme in Cepstral's British English SSML phoneset
# Column 'mac': approximation in American English in Apple's speech notation ([[inpt PHON]]...[[inpt TEXT]] with no spaces between the phonemes)
# Column 'x-sampa': X-SAMPA
# Column 'acapela-uk': acapela optimized X-SAMPA for UK English voices (e.g. "Peter")
# Column 'cmu': format of the US English "Carnegie Mellon University Pronouncing Dictionary" (http://www.speech.cs.cmu.edu/cgi-bin/cmudict)
# Column 'ms-sapi': (?? something that Jan added, hidden from help text for now by a 'filter(lambda x:not x=="ms-sapi"' in the code)
# Column 'bbcmicro': BBC Micro Speech program by David J. Hoskins / Superior
# Column 'unicode-ipa': Unicode IPA, as used on an increasing number of websites
table = [
('festival', 'espeak', 'sapi', 'cepstral', 'mac', 'x-sampa', 'acapela-uk', 'cmu', 'ms-sapi', 'bbcmicro', 'unicode-ipa'),
# The first entry MUST be the syllable separator:
('0', '%', '-', '0', '=', '.', '', '0', '.', '', '.'),
('1', "'", '1', '1', '1', '"', 0, '1', '1', '1', u'\u02c8'), # primary stress - ignoring this for acapela-uk
('2', ',', '2', '0', '2', '%', 0, '2', '2', '2', u'\u02cc'), # secondary stress - ignoring this for acapela-uk
('', '', '', '', '', '', 0, '', '-', '', '-'),
(0, 0, 0, 0, 0, 0, 0, 0, '#', '', '#'),
(0, 0, 0, 0, 0, 0, 0, 0, ' ', '', ' '),
(0, 0, 0, 0, 0, 0, 0, 0, '_', '', '_'),
(0, 0, 0, 0, 0, 0, 0, 0, '?', '', '?'),
(0, 0, 0, 0, 0, 0, 0, 0, '!', '', '!'),
(0, 0, 0, 0, 0, 0, 0, 0, ',', '', ','),
('aa', ['A:', 'A@', 'aa'], 'aa', 'a', 'AA', 'A', 'A:', 'AA', 'aa', 'AA', u'\u0251'),
(0, 'A', 0, 0, 0, 0, 0, '2', 0, 0, u'\u2051'),
(0, 'A:', 0, 0, 0, ':', 0, '1', 0, 0, u'\u02d0'),
(0, 0, 0, 0, 0, 'A:', 0, 'AA', 0, 0, u'\u0251\u02d0'),
(0, 0, 0, 0, 0, 'Ar\\', 0, 0, 0, 0, u'\u0251\u0279'),
(0, 0, 0, 0, 'aa', 'a:', 0, 0, 0, 0, 'a\\u02d0'),
('a', ['a', '&'], 'ae', 'ae', 'AE', '{', '{', 'AE', 'ae', 'AE', [u'\xe6','a']),
('uh', 'V', 'ah', 'ah', 'UX', 'V', 'V', 'AH', 'ah', 'OH', u'\u028c'),
('o', '0', 'ao', 'oa', 'AA', 'Q', 'Q', 'AA', 'ao', 'O', u'\u0252'),
(0, 0, 0, 0, 0, 'A', 'A', 0, 0, 0, u'\u0251'),
(0, 0, 0, 0, 0, 'O', 'O', 0, 0, 0, u'\u0254'),
('au', 'aU', 'aw', 'aw', 'AW', 'aU', 'aU', 'AW', 'aw', 'AW', u'a\u028a'),
(0, 0, 0, 0, 0, '{O', '{O', 0, 0, 0, u'\xe6\u0254'),
('@', '@', 'ax', 'ah', 'AX', '@', '@', 'AH', 'ax', 'AH', u'\u0259'),
('@@', '3:', 'er', 'er', 0, '3:', '3:', 'ER', 'er', 'ER', u'\u0259\u02d0'),
('@', '3', 'ax', 'ah', 0, '@', '@', 'AH', 0, 'AH', u'\u025a'),
('@1', 'a2', 0, 0, 0, 0, 0, 0, 0, 0, u'\u0259'),
('@2', '@', 0, 0, 0, 0, 0, 0, 0, 0, 0),
('ai', 'aI', 'ay', 'ay', 'AY', 'aI', 'aI', 'AY', 'ay', 'IY', u'a\u026a'),
(0, 0, 0, 0, 0, 'Ae', 'A e', 0, 0, 0, u'\u0251e'),
('b', 'b', 'b', 'b', 'b', 'b', 'b', 'B ', 'b', 'B', 'b'),
('ch', 'tS', 'ch', 'ch', 'C', 'tS', 't S', 'CH', 'ch', 'CH', [u't\u0283', u'\u02a7']),
('d', 'd', 'd', 'd', 'd', 'd', 'd', 'D ', 'd', 'D', 'd'),
('dh', 'D', 'dh', 'dh', 'D', 'D', 'D', 'DH', 'dh', 'DH', u'\xf0'),
('e', 'E', 'eh', 'eh', 'EH', 'E', 'e', 'EH', 'eh', 'EH', u'\u025b'),
(0, 0, 0, 0, 0, 'e', 0, 0, 'ey', 0, 'e'),
('@@', '3:', 'er', 'er', 'AX', '3:', '3:', 'ER', 'er', 'ER', [u'\u025d', u'\u025c\u02d0']),
('e@', 'e@', 'eh r', 'e@', 'EH r', 'E@', 'e @', 0, 'eh r', 'AI', u'\u025b\u0259'),
(0, 0, 0, 0, 0, 'Er\\', 'e r', 0, 0, 0, u'\u025b\u0279'),
(0, 0, 0, 0, 0, 'e:', 'e :', 0, 0, 0, u'e\u02d0'),
(0, 0, 0, 0, 0, 'E:', 0, 0, 0, 0, u'\u025b\u02d0'),
(0, 0, 0, 0, 0, 'e@', 'e @', 0, 0, 0, u'e\u0259'),
('ei', 'eI', 'ey', 'ey', 'EY', 'eI', 'eI', 'EY', 'ey', 'AY', u'e\u026a'),
(0, 0, 0, 0, 0, '{I', '{I', 0, 0, 0, u'\xe6\u026a'),
('f', 'f', 'f', 'f', 'f', 'f', 'f', 'F ', 'f', 'F', 'f'),
('g', 'g', 'g', 'g', 'g', 'g', 'g', 'G ', 'g', 'G', [u'\u0261', 'g']),
('h', 'h', 'h', 'h', 'h', 'h', 'h', 'HH', 'hh', '/H', 'h'),
('i', 'I', 'ih', 'ih', 'IH', 'I', 'I', 'IH', 'ih', 'IH', u'\u026a'),
(0, 0, 0, 0, 0, '1', '1', 0, 0, 0, u'\u0268'),
(0, ['I', 'I2'], 0, 0, 'IX', 'I', 'I', 0, 0, 'IX', u'\u026a'),
('i@', 'i@', 'iy ah', 'i ah', 'IY UX', 'I@', 'I@', 'EY AH', 'iy ah', 'IXAH', u'\u026a\u0259'),
(0, 0, 0, 0, 0, 'Ir\\', 'I r', 0, 0, 0, u'\u026a\u0279'),
('ii', 'i:', 'iy', 'i', 'IY', 'i', 'i', 'IY', 'iy', 'EE', 'i'),
(0, 0, 0, 0, 0, 'i:', 'i:', 0, 0, 0, u'i\u02d0'),
('jh', 'dZ', 'jh', 'jh', 'J', 'dZ', 'dZ', 'JH', 'jh', 'J', [u'd\u0292', u'\u02a4']),
('k', 'k', 'k', 'k', 'k', 'k', 'k', 'K ', 'k', 'K', 'k'),
(0, 'x', 0, 0, 0, 'x', 'x', 0, 'l', 0, 'x'), # actually 'x' is like Scottish 'loch', but if the synth can't do that then try k
('l', ['l', 'L'], 'l', 'l', 'l', 'l', 'l', 'L ', 0, 'L', ['l', u'd\u026b']),
('m', 'm', 'm', 'm', 'm', 'm', 'm', 'M ', 'm', 'M', 'm'),
('n', 'n', 'n', 'n', 'n', 'n', 'n', 'N ', 'n', 'N', 'n'),
('ng', 'N', 'ng', 'ng', 'N', 'N', 'N', 'NG', 'nx', 'NX', u'\u014b'),
('ou', 'oU', 'ow', 'ow', 'OW', '@U', '@U', 'OW', 'ow', 'OW', [u'\u0259\u028a', 'o']),
(0, 0, 0, 0, 0, 'oU', 'o U', 0, 0, 0, u'o\u028a'),
(0, 0, 0, 0, 0, '@}', '@ }', 0, 0, 0, u'\u0259\u0289'),
('oi', 'OI', 'oy', 'oy', 'OY', 'OI', 'OI', 'OY', 'oy', 'OY', u'\u0254\u026a'),
(0, 0, 0, 0, 0, 'oI', 'o I', 0, 0, 0, u'o\u026a'),
('p', 'p', 'p', 'p', 'p', 'p', 'p', 'P ', 'p', 'P', 'p'),
('r', 'r', 'r', 'r', 'r', 'r\\', 'r', 'R ', 'r', 'R', u'\u0279'),
(0, 0, 0, 0, 0, 'r', 0, 0, 0, 0, 'r'),
('s', 's', 's', 's', 's', 's', 's', 'S ', 's', 'S', 's'),
('sh', 'S', 'sh', 'sh', 'S', 'S', 'S', 'SH', 'sh', 'SH', u'\u0283'),
('t', 't', 't', 't', 't', 't', 't', 'T ', 't', 'T', ['t', u'\u027e']),
('th', 'T', 'th', 'th', 'T', 'T', 'T', 'TH', 'th', 'TH', u'\u03b8'),
('u@', 'U@', 'uh', 'uh', 'UH', 'U@', 'U@', 'UH', 'uh', 'UH', u'\u028a\u0259'),
(0, 0, 0, 0, 0, 'Ur\\', 'U r', 0, 0, 0, u'\u028a\u0279'),
('u', 'U', 0, 0, 0, 'U', 'U', 0, 0, '/U', u'\u028a'),
('uu', 'u:', 'uw', 'uw', 'UW', '}:', 'u:', 'UW', 'uw', ['UW','UX'], u'\u0289\u02d0'),
(0, 0, 0, 0, 0, 'u:', 0, 0, 0, 0, [u'u\u02d0', 'u']),
('oo', 'O:', 'ax', 'ao', 'AO', 'O:', 'O:', 'AO', 'AO', 'AO', u'\u0254\u02d0'),
(0, 0, 0, 0, 0, 'O', 'O', 0, 0, 0, u'\u0254'),
(0, 0, 0, 0, 0, 'o:', 'O:', 0, 0, 0, u'o\u02d0'),
(0, ['O@', 'o@', 'O'], 0, 0, 0, 'O:', 0, 0, 0, 0, u'\u0254\u02d0'),
('v', 'v', 'v', 'v', 'v', 'v', 'v', 'V ', 'v', 'V', 'v'),
('w', 'w', 'w', 'w', 'w', 'w', 'w', 'W ', 'w', 'W', 'w'),
(0, 0, 0, 0, 0, 'W', 0, 0, 'x', 0, u'\u028d'),
('y', 'j', 'y', 'j', 'y', 'j', 'j', 'Y ', 'y', 'Y', 'j'),
('z', 'z', 'z', 'z', 'z', 'z', 'z', 'Z ', 'z', 'Z', 'z'),
('zh', 'Z', 'zh', 'zh', 'Z', 'Z', 'Z', 'ZH', 'zh', 'ZH', u'\u0292'),
# Tone numbers added by E Jackson
(0, 0, 0, 0, 0, '_1', 0, 0, 0, 0, u'\u00b9'),
(0, 0, 0, 0, 0, '_2', 0, 0, 0, 0, u'\u00b2'),
(0, 0, 0, 0, 0, '_3', 0, 0, 0, 0, u'\u00b3'),
(0, 0, 0, 0, 0, '_4', 0, 0, 0, 0, u'\u2074'),
(0, 0, 0, 0, 0, '_5', 0, 0, 0, 0, u'\u2075'),
(0, 0, 0, 0, 0, '_6', 0, 0, 0, 0, u'\u2076'),
(0, 0, 0, 0, 0, '_7', 0, 0, 0, 0, u'\u2077'),
(0, 0, 0, 0, 0, '_8', 0, 0, 0, 0, u'\u2078'),
(0, 0, 0, 0, 0, '_9', 0, 0, 0, 0, u'\u2079'),
(0, 0, 0, 0, 0, '_0', 0, 0, 0, 0, u'\u2070'),
# End changes by E Jackson
# TODO \u0294 (glottal stop) to eSpeak? (for local dialects)
# TODO bbcmicro also has CT as in fact, DR as in dragon, DUX as in duke, TR as in track
# Hack (must be at end) - make sure all dictionaries have an entry for '@', for the @l rule:
('@', '@', '@', 'ah', 'AX', '@', '@', '@', '@', 'AH', u'\u0259'),
(0, 0, 'ax', '@', 0, 0, 0, 0, 0, 0, 0),
(0, 0, 0, 'ah', '@', 0, 0, 0, 0, 0, 0),
(0, 0, 0, 0, 'AX', 0, 0, 0, 0, '@', '@'),
]
formats_where_space_separates_words = ["espeak","mac","unicode-ipa","x-sampa","bbcmicro"]
espeak_consonants = "bdDfghklmnNprsStTvwjzZ"
import commands,sys,re,os
for row in table: assert len(row)==len(table[0]) # sanity-check the table
def compare_tables(table1,table2,colsToIgnore):
# Debug function to compare 2 versions of the table
def checkDuplicateRows(t,which):
d = {}
for i in t:
if d.has_key(i): print "Warning: duplicate row in "+which+":",i
d[i]=1
checkDuplicateRows(table1,"first table") ; checkDuplicateRows(table2,"second table")
for col in table1[0]:
if col not in table2[0]: print "Deleted column:",col
for col in table2[0]:
if col not in table1[0]: print "Added column:",col
maps = [{}, {}]
commonCols = filter(lambda x:x in table2[0],table1[0])
for dx in [0,1]:
table = [table1,table2][dx]
for col1 in commonCols:
if col1 in colsToIgnore: continue
i1 = list(table[0]).index(col1)
for col2 in filter(lambda x: x>col1, commonCols):
if col2 in colsToIgnore: continue
i2 = list(table[0]).index(col2)
for row in table[1:]:
if not row[i1] or not row[i2]: continue # don't bother reporting mappings involving null strings - they don't matter
maps[dx][(col1,row[i1],col2,row[i2])] = True
deleted_mappings = filter(lambda k:not maps[1].has_key(k), maps[0].keys())
added_mappings = filter(lambda k:not maps[0].has_key(k), maps[1].keys())
def simplify_mappings(mappings,which):
byRow = {} ; byCol = {}
for k1,v1,k2,v2 in mappings:
if not byRow.has_key((k1,v1)): byRow[(k1,v1)]=[]
byRow[(k1,v1)].append((k2,v2))
if not byCol.has_key((k2,v2)): byCol[(k2,v2)]=[]
byCol[(k2,v2)].append((k1,v1))
byColCopy = byCol.copy()
for r in byRow.keys():
rowLen = len(r)
maxColLen = max(map(lambda c:len(byCol[c]),byRow[r]))
if rowLen > maxColLen:
if len(byRow[r])>1: word="mappings:"
else: word="mapping"
print which,word,r,"to",byRow[r]
for k in byColCopy.keys():
if r in byColCopy[k]: byColCopy[k].remove(r)
for c in filter(lambda x:byColCopy[x],byColCopy.keys()):
if len(byColCopy[c])>1: word="mappings:"
else: word="mapping"
print which,word,c,"to",byColCopy[c]
simplify_mappings(deleted_mappings,"Deleted")
simplify_mappings(added_mappings,"Added")
# e.g. if someone sends in newlexconvert.py and you want to review all changes not involving sampa/acapela:
# import lexconvert,newlexconvert
# lexconvert.compare_tables(lexconvert.table,newlexconvert.table,['x-sampa', 'acapela-uk'])
def squash_table(colsToDelete):
# For maintenance use. Deletes colsToDelete, and introduces dittos/list-alternatives
colsToDelete=map(lambda c:list(table[0]).index(c),colsToDelete)
colsToDelete.sort() ; colsToDelete.reverse()
ret = [] ; lastRow = None
for row in table:
row=list(row)
for c in colsToDelete: del row[c]
if ret:
for i in range(len(row)):
k=-1
while ret[k][i]==0: k -= 1
if row[i]==ret[k][i]: row[i]=0
if len(filter(lambda x:not x==0,row))==1:
# only 1 thing changed
ret[-1]=list(ret[-1])
for i in range(len(row)):
if not row[i]==0:
if not type(ret[-1][i])==type([]): ret[-1][i]=[ret[-1][i]]
ret[-1][i].append(row[i])
ret[-1]=tuple(ret[-1])
continue
ret.append(tuple(row))
# return ret
print "table = ["
for r in ret: print " "+repr(r)+","
print "]"
# Deal with any rows that contain lists of alternatives
# or 0 (ditto) marks
# If multiple lists in a row e.g. ([ab],[cd]), give [(a,c), (b,c), (a,d)], as (b,d) would never be reached anyway.
# (Removing duplicates and redundant rows is not really necessary but may help debugging)
newTable=[] ; hadAlready = {}
prevLine = None
for line in table:
line=list(line)
for i in range(len(line)):
if line[i]==0: line[i]=prevLine[i]
line=tuple(line)
colsWithLists = filter(lambda col: type(line[col])==type([]), range(len(line)))
if not colsWithLists: newTable.append(line)
for col in colsWithLists:
def firstItemIfList(l):
if type(l)==type([]): return l[0]
else: return l
for extraTuple in [tuple(map(lambda x:firstItemIfList(x),list(line[:col])+[i]+list(line[col+1:]))) for i in line[col]]:
if hadAlready.has_key(extraTuple): continue
hadAlready[extraTuple]=1
newTable.append(extraTuple)
prevLine = line
table = newTable
def OED_alt_to_espeak(oed_alt):
# Converts values of ALT attributes from OED website into eSpeak.
# Be sure to include the /.../ either side of each pronunciation (you can pass in more than one).
# Returns a LIST of eSpeak entries (because some OED entries are
# Note that OED's keyboard equivalents are almost identical to eSpeak,
# except for Q (which is 0 in eSpeak), and VI (aI in eSpeak) and one or two
# foreign symbols.
# (OED puts accent marks before the syllable rather than before the vowel like espeak;
# however espeak can correct this by itself so we don't have to. But do be careful if
# converting directly to another format - run though espeak -x first, or improve
# convert(), or add stress-mark-move-forward code to this function.)
oed_alt = oed_alt.replace("{lsyllab}","l") \
.replace("{msyllab}","m") \
.replace("{nsyllab}","n") \
.replace("{zh}","Z").replace("{edh}","D").replace("{ng}","N") \
.replace("{sh}","S").replace("{vdftheta}","T").replace("{lbelt}","L") \
.replace("{shtu}","U").replace("{fata}","A").replace("{lm}",":") \
.replace("{shti}","I").replace("{ope}","E").replace("{schwa}","@") \
.replace("{recv}","O").replace("{rfa}","0").replace("{revv}","V") \
.replace("{rfatilde}","0~").replace("{sm}","'").replace("{smm}",",") \
.replace("VI","aI").replace('\xe6','a').strip()
ret = []
def add(word):
if word.startswith("(") and word.endswith(")"): word=word[1:-1] # some entries have ()s around them
# TODO if there is more instance than one of each type of variable pronunciation in the word, do we want to add ALL combinations?
# currently just vary all-or-nothing, to give a general idea of the variability
if "'," in word: return add(word.replace("',","'")),add(word.replace("',",",")) # ', can be either primary or secondary stress
if "{shtibar}" in word: return add(word.replace("{shtibar}","I")),add(word.replace("{shtibar}","@"))
if "{shtubar}" in word: return add(word.replace("{shtubar}","U")),add(word.replace("{shtubar}","@"))
if "(" in word and ")" in word: return add(word.replace("(","",1).replace(")","",1)), add(word[:word.index("(")]+word[word.index(")")+1:])
for c in word:
found = False
for t in table[1:]:
if c in t[1]:
found = True ; break
if not found:
sys.stderr.write("NB omitting "+repr(word)+" because it still contains unknown characters (e.g. '"+c+"')\n")
return
ret.append(word)
to_add = [] ; adding=False
if not '/' in oed_alt:
sys.stderr.write("Warning: no / found, assuming entire input is one pronunciation entry\n")
oed_alt = "/"+oed_alt+"/"
for c in oed_alt:
if c=='/':
adding = not adding
if not adding:
add(''.join(to_add)) ; to_add = []
elif adding: to_add.append(c)
if to_add: sys.stderr.write("Warning: unterminated word (did you include both the starting and the ending slashes?)\n")
return ret
cached_source,cached_dest,cached_dict = None,None,None
def make_dictionary(source,dest):
global cached_source,cached_dest,cached_dict
if (source,dest) == (cached_source,cached_dest): return cached_dict
types = list(table[0])
assert source in types,"Unknown synthesizer name to convert from"
assert dest in types, "Unknown synthesizer name to convert to"
source,dest = types.index(source), types.index(dest)
d = {}
global dest_consonants ; dest_consonants = []
global dest_syllable_sep ; dest_syllable_sep = table[1][dest]
for l in table[1:]:
if not d.has_key(l[source]): d[l[source]]=l[dest]
is_in_espeak_consonants=True
for cTest in l[1]:
if cTest not in espeak_consonants:
is_in_espeak_consonants=False; break
if is_in_espeak_consonants: dest_consonants.append(l[dest])
cached_source,cached_dest,cached_dict=source,dest,d
return d
def convert(pronunc,source,dest):
if source=="unicode-ipa":
# try to decode it
if "\\u" in pronunc and not '"' in pronunc: # maybe \uNNNN copied from Gecko on X11, can just evaluate it to get the unicode
# (NB make sure to quote the \'s if pasing in on the command line)
try: pronunc=eval('u"'+pronunc+'"')
except: pass
else: # see if it makes sense as utf-8
try: pronunc = pronunc.decode('utf-8')
except: pass
ret = [] ; toAddAfter = None
dictionary = make_dictionary(source,dest)
while pronunc:
for lettersToTry in [2,1,0]:
if not lettersToTry and source=="espeak": sys.stderr.write("Warning: ignoring unknown espeak phoneme "+repr(pronunc[0])+"\n")
if not lettersToTry: pronunc=pronunc[1:] # ignore
elif dictionary.has_key(pronunc[:lettersToTry]):
toAdd=dictionary[pronunc[:lettersToTry]]
if toAdd in ['0','1','2'] and not dest=="espeak": # it's a stress mark in a notation that places stress marks AFTER vowels (not dest=="espeak" added because espeak uses 0 for other purposes)
if dest=="bbcmicro": # not sure which pitch levels to map the stresses to; try these:
if toAdd=='1': toAdd='3'
elif toAdd=='2': toAdd='4'
if source in ["espeak","unicode-ipa"]: # stress should be moved from before the vowel to after it
toAdd, toAddAfter = "",toAdd
else:
# With Cepstral synth, stress mark should be placed EXACTLY after the vowel and not any later. Might as well do this for others also.
# (not dest=="espeak" because that uses 0 as a phoneme; anyway it's dealt with separately below)
r=len(ret)-1
while ret[r] in dest_consonants or ret[r].endswith("*added"): r -= 1 # (if that raises IndexError then the input had a stress mark before any vowel) ("*added" condition is there so that implicit vowels don't get the stress)
ret.insert(r+1,toAdd) ; toAdd=""
elif toAdd in u"',\u02c8\u02cc" and dest in ["espeak","unicode-ipa"] and not source in ["espeak","unicode-ipa"]: # it's a stress mark that should be moved from after the vowel to before it
i=len(ret)
while i and (ret[i-1] in dest_consonants or ret[i-1].endswith("*added")): i -= 1
if i: i-=1
ret.insert(i,toAdd)
toAdd = ""
# attempt to sort out the festival dictionary's (and other's) implicit @ :
if ret and ret[-1] and toAdd in ['n','l'] and ret[-1] in dest_consonants: ret.append(dictionary['@']+'*added')
elif len(ret)>2 and ret[-2].endswith('*added') and toAdd and not toAdd in dest_consonants and not toAdd==dest_syllable_sep: del ret[-2]
# OK, add it:
if toAdd:
toAdd=toAdd.split()
ret.append(toAdd[0])
if toAddAfter and not toAdd[0] in dest_consonants:
ret.append(toAddAfter)
toAddAfter=None
ret += toAdd[1:]
# TODO: the above few lines make sure that toAddAfter goes after the FIRST phoneme if toAdd is multiple phonemes, but works only when converting from eSpeak to non-eSpeak; it ought to work when converting from non-eSpeak to non-eSpeak also (doesn't matter when converting TO eSpeak)
if source=="espeak" and pronunc[:lettersToTry]=="e@" and len(pronunc)>lettersToTry and pronunc[lettersToTry]=="r" and (len(pronunc)==lettersToTry+1 or pronunc[lettersToTry+1] in espeak_consonants): lettersToTry += 1 # hack because the 'r' is implicit in other synths (but DO have it if there's another vowel to follow)
pronunc=pronunc[lettersToTry:]
break
if toAddAfter: ret.append(toAddAfter)
if ret[-1]==dest_syllable_sep: del ret[-1] # spurious syllable separator at end
if dest in formats_where_space_separates_words: separator = ''
else: separator = ' '
ret=separator.join(ret).replace('*added','')
if dest=="cepstral": return ret.replace(" 1","1").replace(" 0","0")
if dest=="espeak": return cleanup_espeak_entry(ret)
else: return ret
def cleanup_espeak_entry(r):
r = r.replace("k'a2n","k'@n").replace("ka2n","k@n").replace("gg","g")
if r.endswith("i@r"): return r[:-3]+"i@"
if r.endswith("U@r"): return r[:-3]+"U@"
elif r.endswith("@r") and not r.endswith("e@r"): return r[:-2]+"3"
if r.endswith("A:r"): return r[:-3]+"A@"
if r.endswith("O:r"): return r[:-3]+"O@"
if r.endswith("@l") and not r.endswith("i@l") and not r.endswith("U@l"): return r[:-2]+"@L"
if r.endswith("rr") or r.endswith("3:r"): return r[:-1]
# TODO: 'declared' & 'declare' the 'r' after the 'E' sounds a bit 'regional' (but pretty). but sounds incomplete w/out 'r', and there doesn't seem to be an E2 or E@
# TODO: consider adding 'g' to words ending in 'N' (if want the 'g' pronounced in '-ng' words) (however, careful of words like 'yankee' where the 'g' would be followed by a 'k'; this may also be a problem going into the next word)
return r
def espeak_probably_right_already(existing_pronunc,new_pronunc):
# Compares our "new" pronunciation with eSpeak's existing pronunciation. As our transcription into eSpeak notation is only approximate, it could be that our new pronunciation is not identical to the existing one but the existing one is actually correct.
if existing_pronunc==new_pronunc: return True
def simplify(pronunc): return \
pronunc.replace(";","").replace("%","") \
.replace("a2","@") \
.replace("3","@") \
.replace("L","l") \
.replace("I2","i:") \
.replace("I","i:").replace("i@","i:@") \
.replace(",","") \
.replace("s","z") \
.replace("aa","A:") \
.replace("A@","A:") \
.replace("O@","O:") \
.replace("o@","O:") \
.replace("r-","r")
# TODO: rewrite @ to 3 whenever not followed by a vowel?
if simplify(existing_pronunc)==simplify(new_pronunc): return True # almost the same, and festival @/a2 etc seems to be a bit ambiguous so leave it alone
def parse_festival_dict(festival_location):
ret = []
for line in open(festival_location).xreadlines():
line=line.strip()
if "((pos" in line: line=line[:line.index("((pos")]
if line.startswith('( "'): line=line[3:]
line=line.replace('"','').replace('(','').replace(')','')
try:
word, pos, pronunc = line.split(None,2)
except ValueError: continue # malformed line
if pos not in ['n','v','a','cc','dt','in','j','k','nil','prp','uh']: continue # two or more words
yield (word.lower(), pos, pronunc)
def convert_system_festival_dictionary_to_espeak(festival_location,check_existing_pronunciation,add_user_dictionary_also):
os.system("mv en_extra en_extra~") # start with blank 'extra' dictionary
if check_existing_pronunciation: os.system("espeak --compile=en") # so that the pronunciation we're checking against is not influenced by a previous version of en_extra
outFile=open("en_extra","w")
print "Reading dictionary lists"
wordDic = {} ; ambiguous = {}
for line in filter(lambda x:x.split() and not re.match(r'^[a-z]* *\$',x),open("en_list").read().split('\n')): ambiguous[line.split()[0]]=ambiguous[line.split()[0]+'s']=True # this stops the code below from overriding anything already in espeak's en_list. If taking out then you need to think carefully about words like "a", "the" etc.
for word,pos,pronunc in parse_festival_dict(festival_location):
pronunc=pronunc.replace("i@ 0 @ 0","ii ou 2 ").replace("i@ 0 u 0","ii ou ") # (hack for OALD's "radio"/"video"/"stereo"/"embryo" etc)
pronunc=pronunc.replace("0","") # 0's not necessary, and OALD sometimes puts them in wrong places, confusing the converter
if wordDic.has_key(word):
ambiguous[word] = True
del wordDic[word] # better not go there
if not ambiguous.has_key(word):
wordDic[word] = (pronunc, pos)
toDel = []
if check_existing_pronunciation:
print "Checking existing pronunciation"
proc=os.popen("espeak -q -x -v en-rp > /tmp/.pronunc 2>&1","w")
wList = []
progressCount=0
for word,(pronunc,pos) in wordDic.items():
if check_existing_pronunciation:
sys.stdout.write(str(int(progressCount*100/len(wordDic)))+"%\r") ; sys.stdout.flush()
progressCount += 1
if not re.match("^[A-Za-z-]*$",word):
# contains special characters - better not go there
toDel.append(word)
elif word[-1]=="s" and wordDic.has_key(word[:-1]):
# unnecessary plural (espeak will pick up on them anyway)
toDel.append(word)
elif word.startswith("year") or "quarter" in word: toDel.append(word) # don't like festival's pronunciation of those (TODO: also 'memorial' why start with [m'I])
elif check_existing_pronunciation:
proc.write(word+"\n")
proc.flush() # so the progress indicator works
wList.append(word)
if check_existing_pronunciation:
proc.close() ; print
oldPronDic = {}
for k,v in zip(wList,open("/tmp/.pronunc").read().split("\n")): oldPronDic[k]=v.strip().replace(" ","")
for w in toDel: del wordDic[w]
print "Doing the conversion"
lines_output = 0
total_lines = 0
not_output_because_ok = []
items = wordDic.items() ; items.sort() # necessary because of the hacks below which check for the presence of truncated versions of the word (want to have decided whether or not to output those truncated versions before reaching the hacks)
for word,(pronunc,pos) in items:
total_lines += 1
new_e_pronunc = convert(pronunc,"festival","espeak")
if new_e_pronunc.count("'")==2 and not '-' in word: new_e_pronunc=new_e_pronunc.replace("'",",",1) # if 2 primary accents then make the first one a secondary (except on hyphenated words)
# TODO if not en-rp? - if (word.endswith("y") or word.endswith("ie")) and new_e_pronunc.endswith("i:"): new_e_pronunc=new_e_pronunc[:-2]+"I"
unrelated_word = None
if check_existing_pronunciation: espeakPronunc = oldPronDic.get(word,"")
else: espeakPronunc = ""
if word[-1]=='e' and wordDic.has_key(word[:-1]): unrelated_word, espeakPronunc = word[:-1],"" # hack: if word ends with 'e' and dropping the 'e' leaves a valid word that's also in the dictionary, we DON'T want to drop this word on the grounds that espeak already gets it right, because if we do then adding 's' to this word may cause espeak to add 's' to the OTHER word ('-es' rule).
if espeak_probably_right_already(espeakPronunc,new_e_pronunc):
not_output_because_ok.append(word)
continue
if not unrelated_word: lines_output += 1
outFile.write(word+" "+new_e_pronunc+" // from Festival's ("+pronunc+")")
if espeakPronunc: outFile.write(", not [["+espeakPronunc+"]]")
elif unrelated_word: outFile.write(" (here to stop espeak's affix rules getting confused by Festival's \""+unrelated_word+"\")")
outFile.write("\n")
print "Corrected(?) %d entries out of %d" % (lines_output,total_lines)
if add_user_dictionary_also: convert_user_lexicon("festival","espeak",outFile)
outFile.close()
os.system("espeak --compile=en")
if not_output_because_ok:
print "Checking for unwanted side-effects of those corrections" # e.g. terrible as Terr + ible, inducing as in+Duce+ing
proc=os.popen("espeak -q -x -v en-rp > /tmp/.pronunc 2>&1","w")
progressCount = 0
for w in not_output_because_ok:
proc.write(w+"\n") ; proc.flush()
sys.stdout.write(str(int(progressCount*100/len(not_output_because_ok)))+"%\r") ; sys.stdout.flush()
progressCount += 1
proc.close()
outFile=open("en_extra","a") # append to it
for word,pronunc in zip(not_output_because_ok,open("/tmp/.pronunc").read().split("\n")):
pronunc = pronunc.strip().replace(" ","")
if not pronunc==oldPronDic[word] and not espeak_probably_right_already(oldPronDic[word],pronunc):
outFile.write(word+" "+oldPronDic[word]+" // (undo affix-side-effect from previous words that gave \""+pronunc+"\")\n")
outFile.close()
os.system("espeak --compile=en")
return not_output_because_ok
def convert_user_lexicon(fromFormat,toFormat,outFile):
if fromFormat=="festival": lex = eval('['+commands.getoutput("grep '^(lex.add.entry' ~/.festivalrc | sed -e 's/;.*//' -e 's/[^\"]*\"/[\"/' -e 's/\" . /\",(\"/' -e 's/$/\"],/' -e 's/[()]/ /g' -e 's/ */ /g'")+']')
elif fromFormat=="espeak": lex = filter(lambda x:len(x)==2,[l.split()[:2] for l in open("en_extra").readlines()])
elif fromFormat=="cepstral":
lex = []
for l in open("lexicon.txt").readlines():
word, ignore, pronunc = l.split(None,2)
lex.append((word,pronunc))
else: assert 0, "Reading from '%s' lexicon file not yet implemented" % (fromFormat,)
if toFormat=="mac": outFile.write("# I don't yet know how to add to the Mac OS X lexicon,\n# so here is a 'sed' command you can run on your text\n# to put the pronunciation inline:\n\nsed")
elif toFormat=="sapi": outFile.write("rem You have to run this file\nrem with ptts.exe in the same directory\nrem to add these words to the SAPI lexicon\n\n")
elif toFormat=="unicode-ipa": outFile.write("<HTML><HEAD>\n<META HTTP-EQUIV=\"Content-Type\" CONTENT=\"text/html; charset=utf-8\">\n</HEAD><BODY><TABLE>\n")
for word, pronunc in lex:
pronunc = convert(pronunc,fromFormat,toFormat)
if toFormat=="espeak": outFile.write(word+" "+pronunc+"\n")
elif toFormat=="sapi": outFile.write("ptts -la "+word+" \""+pronunc+"\"\n")
elif toFormat=="cepstral": outFile.write(word.lower()+" 0 "+pronunc+"\n")
elif toFormat=="mac": outFile.write(" -e \"s/"+word+"/[[inpt PHON]]"+pronunc+"[[inpt TEXT]]/g\"")
elif toFormat=="bbcmicro": outFile.write("> "+word.upper()+"_"+chr(128)+pronunc) # (specifying 'whole word'; remove the space before or the _ after if you want)
elif toFormat=="unicode-ipa": outFile.write("<TR><TD>"+word+"</TD><TD>"+pronunc.encode("UTF-8")+"</TD></TR>\n")
else: assert 0, "Writing to lexicon in %s format not yet implemented" % (format,)
if toFormat=="mac": outFile.write("\n")
elif toFormat=="bbcmicro": outFile.write(">**")
elif toFormat=="unicode-ipa": outFile.write("</TABLE></BODY></HTML>\n")
first_word=True # hack for bbcmicro
def markup_inline_word(format,pronunc):
global first_word
if format=="espeak": return "[["+pronunc+"]]"
elif format=="mac": return "[[inpt PHON]]"+pronunc+"[[inpt TEXT]]"
elif format=="sapi": return "<pron sym=\""+pronunc+"\"/>"
elif format=="cepstral": return "<phoneme ph='"+pronunc+"'>p</phoneme>"
elif format=="acapela-uk": return "\\Prn="+pronunc+"\\"
elif format=="bbcmicro" and first_word:
first_word = False
return "*SPEAK "+pronunc
elif format=="unicode-ipa": return pronunc.encode("utf-8") # UTF-8 output - ok for pasting into Firefox etc *IF* the terminal/X11 understands utf-8 (otherwise redirect to a file, point the browser at it, and set encoding to utf-8, or try --convert'ing which will o/p HTML)
else: return pronunc # fallback - assume the user knows what to do with it
def main():
if '--festival-dictionary-to-espeak' in sys.argv:
try: festival_location=sys.argv[sys.argv.index('--festival-dictionary-to-espeak')+1]
except IndexError:
sys.stderr.write("Error: --festival-dictionary-to-espeak must be followed by the location of the festival OALD file (see help text)\n") ; sys.exit(1)
try: open(festival_location)
except:
sys.stderr.write("Error: The specified OALD location '"+festival_location+"' could not be opened\n") ; sys.exit(1)
try: open("en_list")
except:
sys.stderr.write("Error: en_list could not be opened (did you remember to cd to the eSpeak dictsource directory first?\n") ; sys.exit(1)
convert_system_festival_dictionary_to_espeak(festival_location,not '--without-check' in sys.argv,not os.system("test -e ~/.festivalrc"))
elif '--oed' in sys.argv:
sys.stderr.write("Copy the pronunciation entries from the OED and paste into here\n"
"The browser should copy the images' ALT text i.e. {zh}, {edh}, {ng}, etc.\n"
"Make sure to include the /.../ around each entry.\n"
"NOTE: Best use British pronunciations only, because some versions of eSpeak\nwill OMIT phonemes if you give them an American pronunciation.\n"
"When done, send EOF (Unix/Mac/etc Ctrl-D, Windows Ctrl-Z).\n")
os.popen("espeak -x","w").write(", ".join([markup_inline_word("espeak",w) for w in OED_alt_to_espeak(sys.stdin.read())]))
# (use espeak rather than converting directly to some other format - see comments in OED_alt_to_espeak)
elif '--try' in sys.argv:
i=sys.argv.index('--try')
espeak = convert(' '.join(sys.argv[i+2:]),sys.argv[i+1],'espeak')
os.popen("espeak -x","w").write(markup_inline_word("espeak",espeak))
elif '--trymac' in sys.argv:
i=sys.argv.index('--trymac')
mac = convert(' '.join(sys.argv[i+2:]),sys.argv[i+1],'mac')
os.popen("say","w").write(markup_inline_word("mac",mac))
elif '--phones' in sys.argv:
i=sys.argv.index('--phones')
format=sys.argv[i+1]
w,r=os.popen4("espeak -q -x")
w.write(' '.join(sys.argv[i+2:])) ; w.close()
print ", ".join([" ".join([markup_inline_word(format,convert(word,"espeak",format)) for word in line.split()]) for line in filter(lambda x:x,r.read().split("\n"))])
elif '--phones2phones' in sys.argv:
i=sys.argv.index('--phones2phones')
format1,format2 = sys.argv[i+1],sys.argv[i+2]
text=' '.join(sys.argv[i+3:])
if format1 in formats_where_space_separates_words:
for w in text.split(): print markup_inline_word(format2, convert(w,format1,format2))
else: print markup_inline_word(format2, convert(text,format1,format2))
elif '--convert' in sys.argv:
i=sys.argv.index('--convert')
fromFormat = sys.argv[i+1]
toFormat = sys.argv[i+2]
assert not fromFormat==toFormat, "cannot convert a lexicon to its own format (that could result in it being truncated)"
outFile = None
if toFormat=="cepstral": fname="lexicon.txt"
elif toFormat=="sapi": fname="run-ptts.bat"
elif toFormat=="mac": fname="substitute.sh"
elif toFormat=="bbcmicro": fname="BBCLEX"
elif toFormat=="espeak":
try: open("en_list")
except: assert 0, "You should cd to the espeak source directory before running this"
os.system("mv en_extra en_extra~ ; grep \"// from Festival.s\" en_extra~ > en_extra") # keep the "from Festival's" entries, so can incrementally update the user lexicon only
fname="en_extra"
outFile=open(fname,"a")
elif toFormat=="unicode-ipa":
fname="words-ipa.html" # just make a table of words and pronunciation
try:
open(fname)
assert 0, fname+" already exists, I'd rather not overwrite it; delete it yourself if you want"
except IOError: pass
outFile=open(fname,"w")
else: assert 0, "Don't know where to put lexicon of format '%s', try using --phones or --phones2phones options instead" % (toFormat,)
if not outFile:
len = 0
try: len = open(fname).read()
except: pass
assert not len, "File"+fname+"already exists and is not empty; are you sure you want to overwrite it? (Delete it first if so)"
outFile=open(fname,"w")
print "Writing lexicon entries to",fname
convert_user_lexicon(fromFormat,toFormat,outFile)
outFile.close()
if toFormat=="espeak": os.system("espeak --compile=en")
else:
print program_name
print "\nAvailable pronunciation formats:",', '.join(filter(lambda x:not x=="ms-sapi",list(table[0])))
print "\nUse --convert <from-format> <to-format> to convert a user lexicon file. Expects Festival's .festivalrc to be in the home directory, or espeak's en_extra or Cepstral's lexicon.txt to be in the current directory.\nE.g.: python lexconvert.py --convert festival cepstral"
print "\nUse --try <format> <pronunciation> to try a pronunciation with eSpeak (requires 'espeak' command),\n e.g.: python lexconvert.py --try festival h @0 l ou1\n or: python lexconvert.py --try unicode-ipa '\\u02c8\\u0279\\u026adn\\u0329' (for Unicode put '\\uNNNN' or UTF-8)\n (it converts to espeak format and then uses espeak to play it)\nUse --trymac to do the same as --try but with Mac OS 'say' instead of 'espeak'"
print "\nUse --oed to try some pronunciations from the Oxford English Dictionary (OED) website: it will prompt you to paste in the pronunciations and use eSpeak to display and pronounce each alternative. (Note the OED notation can specify alternatives even in one marking, so you may get more than you expect.)"
print "\nUse --phones <format> <words> to convert 'words' to phones in format 'format'. espeak will be run to do the text-to-phoneme conversion, and the output will then be converted to 'format'.\nE.g.: python lexconvert.py --phones unicode-ipa This is a test sentence.\nNote that some commercial speech synthesizers do not work well when driven entirely from phones, because their internal format is different and is optimised for normal text."
print "\nUse --phones2phones <format1> <format2> <phones in format1> to perform a one-off conversion of phones from format1 to format2."
print "\nUse --festival-dictionary-to-espeak <location> to convert the Festival Oxford Advanced Learners Dictionary (OALD) pronunciation lexicon to ESpeak.\nYou need to specify the location of the OALD file in <location>,\ne.g. for Debian festlex-oald package: python lexconvert.py --festival-dictionary-to-espeak /usr/share/festival/dicts/oald/all.scm\nor if you can't install the Debian package, try downloading http://ftp.debian.org/debian/pool/non-free/f/festlex-oald/festlex-oald_1.4.0.orig.tar.gz, unpack it into /tmp, and do: python lexconvert.py --festival-dictionary-to-espeak /tmp/festival/lib/dicts/oald/oald-0.4.out\nIn all cases you need to cd to the espeak source directory before running this. en_extra will be overwritten. Converter will also read your ~/.festivalrc if it exists. (You can later incrementally update from ~/.festivalrc using the --convert option; the entries from the system dictionary will not be overwritten in this case.) Specify --without-check to bypass checking the existing espeak pronunciation for OALD entries (much faster, but makes a larger file and in some cases compromises the pronunciation quality)."
if __name__ == "__main__": main()