-
Notifications
You must be signed in to change notification settings - Fork 98
/
encoding.py
46 lines (41 loc) · 990 Bytes
/
encoding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# -*- coding: utf-8 -*-
import charade
def detect(s):
    '''Guess the character encoding of *s* with ``charade.detect``.

    ``str`` input is encoded as UTF-8 first so that the detector is always
    handed the encoded form; any other input is passed through unchanged.

    Returns whatever ``charade.detect`` returns. The examples below show a
    dict with ``'confidence'`` and ``'encoding'`` keys.

    >>> detect('ascii')
    {'confidence': 1.0, 'encoding': 'ascii'}
    >>> detect('abcdé')
    {'confidence': 0.505, 'encoding': 'utf-8'}
    >>> detect(bytes('abcdé', 'utf-8'))
    {'confidence': 0.505, 'encoding': 'utf-8'}
    >>> detect(bytes('\222\222\223\225', 'latin-1'))
    {'confidence': 0.5, 'encoding': 'windows-1252'}
    '''
    # No UnicodeDecodeError fallback here: for text, retrying with
    # s.encode('utf-8') would repeat the exact same call (UTF-8 is already
    # the default), and for bytes it would raise AttributeError (bytes has no
    # ``encode``) and hide the real error. Errors from the detector therefore
    # propagate unchanged.
    data = s.encode('utf-8') if isinstance(s, str) else s
    return charade.detect(data)
def convert(s):
    '''Return *s* as text, decoding bytes with their detected encoding.

    ``str`` input is already decoded and is returned unchanged. Any other
    input is passed to ``detect`` and decoded with the reported encoding;
    when the reported encoding is ``None``, UTF-8 is used instead.

    Raises:
        UnicodeDecodeError: if the bytes are not valid in the chosen encoding.
        LookupError: if the reported encoding name has no Python codec.

    >>> convert('ascii')
    'ascii'
    >>> convert('abcdé')
    'abcdé'
    >>> convert(bytes('abcdé', 'utf-8'))
    'abcdé'
    >>> convert(bytes('\222\222\223\225', 'latin-1'))
    '\u2019\u2019\u201c\u2022'
    '''
    if isinstance(s, str):
        # Text needs no conversion. Encoding it and guessing the encoding
        # back could only reproduce it or, on a wrong guess, corrupt it.
        return s
    # bytes.decode(None) raises TypeError, so fall back to UTF-8 when no
    # encoding is reported.
    # NOTE(review): charade appears to report None when it cannot decide
    # (e.g. empty input) - confirm against the installed version.
    encoding = detect(s)['encoding'] or 'utf-8'
    return s.decode(encoding)
if __name__ == "__main__":
    # Executed as a script: run the examples embedded in the docstrings of
    # this module as doctests.
    import doctest

    doctest.testmod()