From a84d6affc6bcdf6bf60082f10821bff4c91c7013 Mon Sep 17 00:00:00 2001
From: Matei Zaharia <matei@databricks.com>
Date: Sat, 5 Apr 2014 18:33:33 -0700
Subject: [PATCH] SPARK-1421. Make MLlib work on Python 2.6

MLlib did not work on Python 2.6 because we passed a bytearray to
stream.write(), which is supported in Python 2.7 but not in 2.6. (This
array came from NumPy when we converted data to send it over to Java.)
Now, on Python 2.6 and older, we convert those bytearrays to strings of
bytes before writing them, which preserves nonprintable characters as well.
---
 python/pyspark/mllib/__init__.py |  6 +-----
 python/pyspark/serializers.py    | 11 ++++++++++-
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/python/pyspark/mllib/__init__.py b/python/pyspark/mllib/__init__.py
index b420d7a7f23ba..538ff26ce7c33 100644
--- a/python/pyspark/mllib/__init__.py
+++ b/python/pyspark/mllib/__init__.py
@@ -19,11 +19,7 @@
 Python bindings for MLlib.
 """
 
-# MLlib currently needs Python 2.7+ and NumPy 1.7+, so complain if lower
-
-import sys
-if sys.version_info[0:2] < (2, 7):
-    raise Exception("MLlib requires Python 2.7+")
+# MLlib currently needs NumPy 1.7+, so complain if lower
 
 import numpy
 if numpy.version.version < '1.7':
diff --git a/python/pyspark/serializers.py b/python/pyspark/serializers.py
index 4d802924df4a1..b253807974a2e 100644
--- a/python/pyspark/serializers.py
+++ b/python/pyspark/serializers.py
@@ -64,6 +64,7 @@
 from itertools import chain, izip, product
 import marshal
 import struct
+import sys
 from pyspark import cloudpickle
 
 
@@ -113,6 +114,11 @@ class FramedSerializer(Serializer):
     where C{length} is a 32-bit integer and data is C{length} bytes.
     """
 
+    def __init__(self):
+        # On Python 2.6, we can't write bytearrays to streams, so we need to convert them
+        # to strings first. Check if the version number is that old.
+        self._only_write_strings = sys.version_info[0:2] <= (2, 6)
+
     def dump_stream(self, iterator, stream):
         for obj in iterator:
             self._write_with_length(obj, stream)
@@ -127,7 +133,10 @@ def load_stream(self, stream):
     def _write_with_length(self, obj, stream):
         serialized = self.dumps(obj)
         write_int(len(serialized), stream)
-        stream.write(serialized)
+        if self._only_write_strings:
+            stream.write(str(serialized))
+        else:
+            stream.write(serialized)
 
     def _read_with_length(self, stream):
         length = read_int(stream)