
[SPARK-3103] [PySpark] fix saveAsTextFile() with utf-8
bugfix: saveAsTextFile() raised an exception when it tried to encode non-ASCII byte strings, because Python 2 implicitly decodes a str as ASCII before re-encoding it. Only unicode strings should be encoded as "utf-8"; byte strings should be passed through unchanged.

Author: Davies Liu <[email protected]>

Closes #2018 from davies/fix_utf8 and squashes the following commits:

4db7967 [Davies Liu] fix saveAsTextFile() with utf-8
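
For context, a minimal Python 2 sketch of the failure mode described in the commit message (the string value and variable name are illustrative, not part of the patch):

    # Python 2: calling .encode() on a byte string implicitly decodes it with the
    # default ASCII codec first, so non-ASCII bytes raise UnicodeDecodeError.
    s = u"\u00A1Hola, mundo!".encode("utf-8")   # a non-ASCII str (bytes)
    s.encode("utf-8")                           # UnicodeDecodeError: 'ascii' codec can't decode byte 0xc2 ...
    u"\u00A1Hola, mundo!".encode("utf-8")       # fine: unicode -> UTF-8 bytes

This is why the fix below only re-encodes values that are actually unicode objects and yields existing byte strings as-is.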
davies authored and JoshRosen committed Aug 18, 2014
1 parent 3a5962f commit d1d0ee4
Showing 2 changed files with 12 additions and 1 deletion.
4 changes: 3 additions & 1 deletion python/pyspark/rdd.py
@@ -1191,7 +1191,9 @@ def func(split, iterator):
             for x in iterator:
                 if not isinstance(x, basestring):
                     x = unicode(x)
-                yield x.encode("utf-8")
+                if isinstance(x, unicode):
+                    x = x.encode("utf-8")
+                yield x
         keyed = self.mapPartitionsWithIndex(func)
         keyed._bypass_serializer = True
         keyed._jrdd.map(self.ctx._jvm.BytesToString()).saveAsTextFile(path)
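
With this change, an RDD mixing byte strings, unicode strings, and non-string values can be saved without tripping the ASCII codec. A rough usage sketch, assuming a live SparkContext named sc and an illustrative output path:

    # Byte strings pass through as-is, unicode is encoded as UTF-8,
    # and other values are first converted with unicode().
    data = [u"caf\u00e9".encode("utf-8"), u"\u00A1Hola, mundo!", 42]
    sc.parallelize(data).saveAsTextFile("/tmp/utf8-out")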
9 changes: 9 additions & 0 deletions python/pyspark/tests.py
@@ -256,6 +256,15 @@ def test_save_as_textfile_with_unicode(self):
         raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
         self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))
 
+    def test_save_as_textfile_with_utf8(self):
+        x = u"\u00A1Hola, mundo!"
+        data = self.sc.parallelize([x.encode("utf-8")])
+        tempFile = tempfile.NamedTemporaryFile(delete=True)
+        tempFile.close()
+        data.saveAsTextFile(tempFile.name)
+        raw_contents = ''.join(input(glob(tempFile.name + "/part-0000*")))
+        self.assertEqual(x, unicode(raw_contents.strip(), "utf-8"))
+
     def test_transforming_cartesian_result(self):
         # Regression test for SPARK-1034
         rdd1 = self.sc.parallelize([1, 2])
