forked from pydicom/pydicom
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcharset.py
134 lines (111 loc) · 4.97 KB
/
charset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# charset.py
"""Handle alternate character sets for character strings."""
#
# Copyright (c) 2008-2012 Darcy Mason
# This file is part of pydicom, released under a modified MIT license.
# See the file license.txt included with this distribution, also
# available at https://github.com/darcymason/pydicom
#
from pydicom import compat
from pydicom.config import logger
from pydicom.valuerep import PersonNameUnicode, text_VRs
from pydicom.compat import in_py2
# Map DICOM Specific Character Set to python equivalent
python_encoding = {
'': 'iso8859', # default character set for DICOM
'ISO_IR 6': 'iso8859', # alias for latin_1 too
'ISO_IR 100': 'latin_1',
'ISO 2022 IR 87': 'iso2022_jp',
'ISO 2022 IR 13': 'shift_jis',
'ISO 2022 IR 100': 'latin_1',
'ISO 2022 IR 149': 'euc_kr', # needs cleanup via clean_escseq()
'ISO_IR 192': 'UTF8', # from Chinese example, 2008 PS3.5 Annex J p1-4
'GB18030': 'GB18030',
'ISO_IR 126': 'iso_ir_126', # Greek
'ISO_IR 127': 'iso_ir_127', # Arab
'ISO_IR 138': 'iso_ir_138', # Hebrew
'ISO_IR 144': 'iso_ir_144', # Russian
}
default_encoding = "iso8859"
def clean_escseq(element, encodings):
"""Remove escape sequences that Python does not remove from
Korean encoding ISO 2022 IR 149 due to the G1 code element.
"""
if 'euc_kr' in encodings:
return element.replace(
"\x1b\x24\x29\x43", "").replace("\x1b\x28\x42", "")
else:
return element
# DICOM PS3.5-2008 6.1.1 (p 18) says:
# default is ISO-IR 6 G0, equiv to common chr set of ISO 8859 (PS3.5 6.1.2.1)
# (0008,0005) value 1 can *replace* the default encoding...
# for VRs of SH, LO, ST, LT, PN and UT (PS3.5 6.1.2.3)...
# with a single-byte character encoding
# if (0008,0005) is multi-valued, then value 1 (or default if blank)...
# is used until code extension escape sequence is hit,
# which can be at start of string, or after CR/LF, FF, or
# in Person Name PN, after ^ or =
# NOTE also that 7.5.3 SEQUENCE INHERITANCE states that if (0008,0005)
# is not present in a sequence item then it is inherited from its parent.
def convert_encodings(encodings):
"""Converts DICOM encodings into corresponding python encodings"""
# If a list if passed, we don't want to modify the list in place so copy it
encodings = encodings[:]
if isinstance(encodings, compat.string_types):
encodings = [encodings]
elif not encodings[0]:
encodings[0] = 'ISO_IR 6'
try:
encodings = [python_encoding[x] for x in encodings]
except KeyError: # Assume that it is already the python encoding (is there a way to check this?)
pass
if len(encodings) == 1:
encodings = [encodings[0]] * 3
elif len(encodings) == 2:
encodings.append(encodings[1])
return encodings
def decode(data_element, dicom_character_set):
"""Apply the DICOM character encoding to the data element
data_element -- DataElement instance containing a value to convert
dicom_character_set -- the value of Specific Character Set (0008,0005),
which may be a single value,
a multiple value (code extension), or
may also be '' or None.
If blank or None, ISO_IR 6 is used.
"""
if not dicom_character_set:
dicom_character_set = ['ISO_IR 6']
encodings = convert_encodings(dicom_character_set)
# decode the string value to unicode
# PN is special case as may have 3 components with differenct chr sets
if data_element.VR == "PN":
# logger.warn("%s ... type: %s" %(str(data_element), type(data_element.VR)))
if not in_py2:
if data_element.VM == 1:
data_element.value = data_element.value.decode(encodings)
else:
data_element.value = [val.decode(encodings) for val in data_element.value]
else:
if data_element.VM == 1:
data_element.value = PersonNameUnicode(data_element.value, encodings)
else:
data_element.value = [PersonNameUnicode(value, encodings)
for value in data_element.value]
if data_element.VR in text_VRs:
# Remove the first encoding if this is a multi-byte encoding
if len(encodings) > 1:
del encodings[0]
# You can't re-decode unicode (string literals in py3)
if data_element.VM == 1:
if isinstance(data_element.value, compat.text_type):
return
data_element.value = clean_escseq(
data_element.value.decode(encodings[0]), encodings)
else:
output = list()
for value in data_element.value:
if isinstance(value, compat.text_type):
output.append(value)
else:
output.append(clean_escseq(value.decode(encodings[0]), encodings))
data_element.value = output