From c304cc8ae1a39b3c58f21153e7311cfab7dfac9e Mon Sep 17 00:00:00 2001
From: Nick Pentreath
Date: Sun, 15 Dec 2013 08:32:37 +0200
Subject: [PATCH] Add supporting sequence files for tests. Clean up

---
 .../apache/spark/api/python/PythonRDD.scala | 165 ++++++++++--------
 .../apache/spark/api/python/SerDeUtil.scala | 64 ++++---
 .../api/python/WriteInputFormatTests.scala | 64 +++++++
 python/pyspark/context.py | 32 ++--
 .../test_support/data/sfarray/._SUCCESS.crc | Bin 0 -> 8 bytes
 .../data/sfarray/.part-r-00000.crc | Bin 0 -> 12 bytes
 .../data/sfarray/.part-r-00001.crc | Bin 0 -> 12 bytes
 python/test_support/data/sfarray/_SUCCESS | 0
 python/test_support/data/sfarray/part-r-00000 | Bin 0 -> 134 bytes
 python/test_support/data/sfarray/part-r-00001 | Bin 0 -> 174 bytes
 .../test_support/data/sfclass/._SUCCESS.crc | Bin 0 -> 8 bytes
 .../data/sfclass/.part-r-00000.crc | Bin 0 -> 12 bytes
 .../data/sfclass/.part-r-00001.crc | Bin 0 -> 12 bytes
 python/test_support/data/sfclass/_SUCCESS | 0
 python/test_support/data/sfclass/part-r-00000 | Bin 0 -> 151 bytes
 python/test_support/data/sfclass/part-r-00001 | Bin 0 -> 181 bytes
 .../data/sfdouble/.part-00000.crc | Bin 12 -> 12 bytes
 .../data/sfdouble/.part-00001.crc | Bin 12 -> 12 bytes
 python/test_support/data/sfdouble/part-00000 | Bin 145 -> 145 bytes
 python/test_support/data/sfdouble/part-00001 | Bin 145 -> 145 bytes
 .../test_support/data/sfint/.part-00000.crc | Bin 12 -> 12 bytes
 .../test_support/data/sfint/.part-00001.crc | Bin 12 -> 12 bytes
 python/test_support/data/sfint/part-00000 | Bin 130 -> 145 bytes
 python/test_support/data/sfint/part-00001 | Bin 130 -> 145 bytes
 python/test_support/data/sftext/._SUCCESS.crc | Bin 0 -> 8 bytes
 .../test_support/data/sftext/.part-00000.crc | Bin 12 -> 12 bytes
 .../test_support/data/sftext/.part-00001.crc | Bin 12 -> 12 bytes
 python/test_support/data/sftext/_SUCCESS | 0
 python/test_support/data/sftext/part-00000 | Bin 117 -> 123 bytes
 python/test_support/data/sftext/part-00001 | Bin 117 -> 123 bytes
 30 files changed, 213 insertions(+), 112 deletions(-)
 create mode 100644 core/src/main/scala/org/apache/spark/api/python/WriteInputFormatTests.scala
 create mode 100644 python/test_support/data/sfarray/._SUCCESS.crc
 create mode 100644 python/test_support/data/sfarray/.part-r-00000.crc
 create mode 100644 python/test_support/data/sfarray/.part-r-00001.crc
 create mode 100755 python/test_support/data/sfarray/_SUCCESS
 create mode 100755 python/test_support/data/sfarray/part-r-00000
 create mode 100755 python/test_support/data/sfarray/part-r-00001
 create mode 100644 python/test_support/data/sfclass/._SUCCESS.crc
 create mode 100644 python/test_support/data/sfclass/.part-r-00000.crc
 create mode 100644 python/test_support/data/sfclass/.part-r-00001.crc
 create mode 100755 python/test_support/data/sfclass/_SUCCESS
 create mode 100755 python/test_support/data/sfclass/part-r-00000
 create mode 100755 python/test_support/data/sfclass/part-r-00001
 create mode 100644 python/test_support/data/sftext/._SUCCESS.crc
 create mode 100755 python/test_support/data/sftext/_SUCCESS

diff --git a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
index 63f837388e21b..52d443f94a315 100644
--- a/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
+++ b/core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
@@ -34,10 +34,10 @@ import org.apache.spark.api.java.function.PairFunction
 import scala.util.{Success, Failure, Try}
 import org.msgpack
 import
org.msgpack.ScalaMessagePack -import org.apache.hadoop.mapreduce.InputFormat +import org.apache.hadoop.mapred.InputFormat import org.apache.hadoop.mapreduce.{InputFormat => NewInputFormat} -import org.apache.hadoop.mapred.SequenceFileOutputFormat +import org.apache.hadoop.mapred.{JobConf, SequenceFileOutputFormat} import org.apache.hadoop.conf.Configuration import java.util @@ -196,69 +196,6 @@ private object SpecialLengths { val TIMING_DATA = -3 } -case class TestClass(var id: String, var number: Int) { - def this() = this("", 0) -} - -object TestHadoop extends App { - - //PythonRDD.writeToStream((1, "bar"), new DataOutputStream(new FileOutputStream("/tmp/test.out"))) - - - //val n = new NullWritable - - import SparkContext._ - - val path = "/tmp/spark/test/sfarray/" - //val path = "/Users/Nick/workspace/java/faunus/output/job-0/" - - val sc = new SparkContext("local[2]", "test") - - //val rdd = sc.sequenceFile[NullWritable, FaunusVertex](path) - //val data = Seq((1.0, "aa"), (2.0, "bb"), (2.0, "aa"), (3.0, "cc"), (2.0, "bb"), (1.0, "aa")) - val data = Seq( - (1, Array(1.0, 2.0, 3.0)), - (2, Array(3.0, 4.0, 5.0)), - (3, Array(4.0, 5.0, 6.0)) - ) - val d = new DoubleWritable(5.0) - val a = new ArrayWritable(classOf[DoubleWritable], Array(d)) - - val rdd = sc.parallelize(data, numSlices = 2) - //.map({ case (k, v) => (new IntWritable(k), v.map(new DoubleWritable(_))) }) - .map{ case (k, v) => (new IntWritable(k), new ArrayWritable(classOf[DoubleWritable], v.map(new DoubleWritable(_)))) } - rdd.saveAsNewAPIHadoopFile[org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat[IntWritable, ArrayWritable]](path) - - /* - val data = Seq( - ("1", TestClass("test1", 123)), - ("2", TestClass("test2", 456)), - ("1", TestClass("test3", 123)), - ("3", TestClass("test56", 456)), - ("2", TestClass("test2", 123)) - ) - val rdd = sc.parallelize(data, numSlices = 2).map{ case (k, v) => (new Text(k), v) } - rdd.saveAsNewAPIHadoopFile(path, - classOf[Text], classOf[TestClass], - classOf[org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat[Text, TestClass]]) - - //val rdd2 = Seq((1, )) - - val seq = sc.sequenceFile[Double, String](path) - val seqR = seq.collect() - - val packed = PythonRDD.serMsgPack(rdd) - val packedR = packed.collect() - val packed2 = PythonRDD.serMsgPack(seq) - val packedR2 = packed2.collect() - - println(seqR.mkString(",")) - println(packedR.mkString(",")) - println(packedR2.mkString(",")) - */ - -} - private[spark] object PythonRDD extends Logging { def readRDDFromFile(sc: JavaSparkContext, filename: String, parallelism: Int): @@ -281,7 +218,7 @@ private[spark] object PythonRDD extends Logging { // PySpark / Hadoop InputFormat stuff - // SequenceFile + /** Create and RDD from a path using [[org.apache.hadoop.mapred.SequenceFileInputFormat]] */ def sequenceFile[K ,V](sc: JavaSparkContext, path: String, keyClass: String, @@ -299,8 +236,11 @@ private[spark] object PythonRDD extends Logging { JavaRDD.fromRDD(SerDeUtil.serMsgPack[K, V](converted)) } - // Arbitrary Hadoop InputFormat, key class and value class - def newHadoopFile[K, V, F <: NewInputFormat[K, V]](sc: JavaSparkContext, + /** + * Create an RDD from a file path, using an arbitrary [[org.apache.hadoop.mapreduce.InputFormat]], + * key and value class + */ + def newAPIHadoopFile[K, V, F <: NewInputFormat[K, V]](sc: JavaSparkContext, path: String, inputFormatClazz: String, keyClazz: String, @@ -312,19 +252,36 @@ private[spark] object PythonRDD extends Logging { val baseConf = sc.hadoopConfiguration() val mergedConf = 
PythonHadoopUtil.mergeConfs(baseConf, conf) val rdd = - newHadoopFileFromClassNames[K, V, F](sc, - path, inputFormatClazz, keyClazz, valueClazz, keyWrapper, valueWrapper, mergedConf) + newAPIHadoopRDDFromClassNames[K, V, F](sc, + Some(path), inputFormatClazz, keyClazz, valueClazz, mergedConf) + val converted = SerDeUtil.convertRDD[K, V](rdd) + JavaRDD.fromRDD(SerDeUtil.serMsgPack[K, V](converted)) + } + + /** + * Create an RDD from a [[org.apache.hadoop.conf.Configuration]] converted from a map that is passed in from Python, + * using an arbitrary [[org.apache.hadoop.mapreduce.InputFormat]], key and value class + */ + def newAPIHadoopRDD[K, V, F <: NewInputFormat[K, V]](sc: JavaSparkContext, + inputFormatClazz: String, + keyClazz: String, + valueClazz: String, + keyWrapper: String, + valueWrapper: String, + confAsMap: java.util.HashMap[String, String]) = { + val conf = PythonHadoopUtil.mapToConf(confAsMap) + val rdd = + newAPIHadoopRDDFromClassNames[K, V, F](sc, + None, inputFormatClazz, keyClazz, valueClazz, conf) val converted = SerDeUtil.convertRDD[K, V](rdd) JavaRDD.fromRDD(SerDeUtil.serMsgPack[K, V](converted)) } - private def newHadoopFileFromClassNames[K, V, F <: NewInputFormat[K, V]](sc: JavaSparkContext, - path: String, + private def newAPIHadoopRDDFromClassNames[K, V, F <: NewInputFormat[K, V]](sc: JavaSparkContext, + path: Option[String] = None, inputFormatClazz: String, keyClazz: String, valueClazz: String, - keyWrapper: String, - valueWrapper: String, conf: Configuration) = { implicit val kcm = ClassManifest.fromClass(Class.forName(keyClazz)).asInstanceOf[ClassManifest[K]] implicit val vcm = ClassManifest.fromClass(Class.forName(valueClazz)).asInstanceOf[ClassManifest[V]] @@ -332,10 +289,66 @@ private[spark] object PythonRDD extends Logging { val kc = kcm.erasure.asInstanceOf[Class[K]] val vc = vcm.erasure.asInstanceOf[Class[V]] val fc = fcm.erasure.asInstanceOf[Class[F]] - sc.sc.newAPIHadoopFile(path, fc, kc, vc, conf) + val rdd = if (path.isDefined) { + sc.sc.newAPIHadoopFile[K, V, F](path.get, fc, kc, vc, conf) + } else { + sc.sc.newAPIHadoopRDD[K, V, F](conf, fc, kc, vc) + } + rdd + } + + def hadoopFile[K, V, F <: InputFormat[K, V]](sc: JavaSparkContext, + path: String, + inputFormatClazz: String, + keyClazz: String, + valueClazz: String, + keyWrapper: String, + valueWrapper: String, + confAsMap: java.util.HashMap[String, String]) = { + val conf = PythonHadoopUtil.mapToConf(confAsMap) + val baseConf = sc.hadoopConfiguration() + val mergedConf = PythonHadoopUtil.mergeConfs(baseConf, conf) + val rdd = + hadoopRDDFromClassNames[K, V, F](sc, + Some(path), inputFormatClazz, keyClazz, valueClazz, mergedConf) + val converted = SerDeUtil.convertRDD[K, V](rdd) + JavaRDD.fromRDD(SerDeUtil.serMsgPack[K, V](converted)) + } + + def hadoopRDD[K, V, F <: InputFormat[K, V]](sc: JavaSparkContext, + inputFormatClazz: String, + keyClazz: String, + valueClazz: String, + keyWrapper: String, + valueWrapper: String, + confAsMap: java.util.HashMap[String, String]) = { + val conf = PythonHadoopUtil.mapToConf(confAsMap) + val rdd = + hadoopRDDFromClassNames[K, V, F](sc, + None, inputFormatClazz, keyClazz, valueClazz, conf) + val converted = SerDeUtil.convertRDD[K, V](rdd) + JavaRDD.fromRDD(SerDeUtil.serMsgPack[K, V](converted)) } - // + private def hadoopRDDFromClassNames[K, V, F <: InputFormat[K, V]](sc: JavaSparkContext, + path: Option[String] = None, + inputFormatClazz: String, + keyClazz: String, + valueClazz: String, + conf: Configuration) = { + implicit val kcm = 
ClassManifest.fromClass(Class.forName(keyClazz)).asInstanceOf[ClassManifest[K]] + implicit val vcm = ClassManifest.fromClass(Class.forName(valueClazz)).asInstanceOf[ClassManifest[V]] + implicit val fcm = ClassManifest.fromClass(Class.forName(inputFormatClazz)).asInstanceOf[ClassManifest[F]] + val kc = kcm.erasure.asInstanceOf[Class[K]] + val vc = vcm.erasure.asInstanceOf[Class[V]] + val fc = fcm.erasure.asInstanceOf[Class[F]] + val rdd = if (path.isDefined) { + sc.sc.hadoopFile(path.get, fc, kc, vc) + } else { + sc.sc.hadoopRDD(new JobConf(conf), fc, kc, vc) + } + rdd + } def writeToStream(elem: Any, dataOut: DataOutputStream)(implicit m: ClassManifest[Any]) { elem match { diff --git a/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala b/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala index 8a6119e0e452f..169987823efb7 100644 --- a/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala +++ b/core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala @@ -15,43 +15,60 @@ import scala.util.Failure */ private[python] object SerDeUtil extends Logging { + /** Attempts to register a class with MsgPack, only if it is not a primitive or a String */ def register[T](clazz: Class[T], msgpack: ScalaMessagePack) { + //implicit val kcm = ClassManifest.fromClass(clazz) + //val kc = kcm.erasure Try { - log.info("%s".format(clazz)) - clazz match { - case c if c.isPrimitive => - case c if c.isInstanceOf[java.lang.String] => - case _ => msgpack.register(clazz) - } - }.getOrElse(log.warn("Failed to register class (%s) with MsgPack. ".format(clazz.getName) + - "Falling back to default MsgPack serialization, or 'toString' as last resort")) + //if (kc.isInstance("") || kc.isPrimitive) { + // log.info("Class: %s doesn't need to be registered".format(kc.getName)) + //} else { + msgpack.register(clazz) + log.info("Registered key/value class with MsgPack: %s".format(clazz)) + //} + + } match { + case Failure(err) => + log.warn("Failed to register class (%s) with MsgPack. ".format(clazz.getName) + + "Falling back to default MsgPack serialization, or 'toString' as last resort. 
" + + "Error: %s".format(err.getMessage)) + case Success(result) => + } } - // serialize and RDD[(K, V)] -> RDD[Array[Byte]] using MsgPack + /** Serializes an RDD[(K, V)] -> RDD[Array[Byte]] using MsgPack */ def serMsgPack[K, V](rdd: RDD[(K, V)]) = { import org.msgpack.ScalaMessagePack._ - val msgpack = new ScalaMessagePack with Serializable - val first = rdd.first() - val kc = ClassManifest.fromClass(first._1.getClass).asInstanceOf[ClassManifest[K]].erasure.asInstanceOf[Class[K]] - val vc = ClassManifest.fromClass(first._2.getClass).asInstanceOf[ClassManifest[V]].erasure.asInstanceOf[Class[V]] - register(kc, msgpack) - register(vc, msgpack) - rdd.map{ pair => + rdd.mapPartitions{ pairs => + val mp = new ScalaMessagePack + var triedReg = false + pairs.map{ pair => Try { - msgpack.write(pair) + if (!triedReg) { + register(pair._1.getClass, mp) + register(pair._2.getClass, mp) + triedReg = true + } + mp.write(pair) } match { case Failure(err) => - Try { - write((pair._1.toString, pair._2.toString)) - } match { - case Success(result) => result - case Failure(e) => throw e - } + log.debug("Failed to write", err) + Try { + write((pair._1.toString, pair._2.toString)) + } match { + case Success(result) => result + case Failure(e) => throw e + } case Success(result) => result } } } + } + /** + * Converts an RDD of (K, V) pairs, where K and/or V could be instances of [[org.apache.hadoop.io.Writable]], + * into an RDD[(K, V)] + */ def convertRDD[K, V](rdd: RDD[(K, V)]) = { rdd.map{ case (k: Writable, v: Writable) => (convert(k).asInstanceOf[K], convert(v).asInstanceOf[V]) @@ -61,6 +78,7 @@ private[python] object SerDeUtil extends Logging { } } + /** Converts a [[org.apache.hadoop.io.Writable]] to the underlying primitive, String or object representation */ def convert(writable: Writable): Any = { import collection.JavaConversions._ writable match { diff --git a/core/src/main/scala/org/apache/spark/api/python/WriteInputFormatTests.scala b/core/src/main/scala/org/apache/spark/api/python/WriteInputFormatTests.scala new file mode 100644 index 0000000000000..41e4612857d7e --- /dev/null +++ b/core/src/main/scala/org/apache/spark/api/python/WriteInputFormatTests.scala @@ -0,0 +1,64 @@ +package org.apache.spark.api.python + +import org.apache.spark.SparkContext +import org.apache.hadoop.io._ +import scala.Array +import java.io.{DataOutput, DataInput} + +case class TestWritable(var str: String, var numi: Int, var numd: Double) extends Writable { + def this() = this("", 0, 0.0) + + def write(p1: DataOutput) = { + p1.writeUTF(str) + p1.writeInt(numi) + p1.writeDouble(numd) + } + + def readFields(p1: DataInput) = { + str = p1.readUTF() + numi = p1.readInt() + numd = p1.readDouble() + } +} + +object WriteInputFormatTests extends App { + import SparkContext._ + + val sc = new SparkContext("local[2]", "test") + + val textPath = "../python/test_support/data/sftext/" + val intPath = "../python/test_support/data/sfint/" + val doublePath = "../python/test_support/data/sfdouble/" + val arrPath = "../python/test_support/data/sfarray/" + val classPath = "../python/test_support/data/sfclass/" + + val intKeys = Seq((1.0, "aa"), (2.0, "bb"), (2.0, "aa"), (3.0, "cc"), (2.0, "bb"), (1.0, "aa")) + sc.parallelize(intKeys).saveAsSequenceFile(intPath) + sc.parallelize(intKeys.map{ case (k, v) => (k.toDouble, v) }).saveAsSequenceFile(doublePath) + sc.parallelize(intKeys.map{ case (k, v) => (k.toString, v) }).saveAsSequenceFile(textPath) + + val data = Seq( + (1, Array(1.0, 2.0, 3.0)), + (2, Array(3.0, 4.0, 5.0)), + (3, Array(4.0, 
5.0, 6.0)) + ) + sc.parallelize(data, numSlices = 2) + .map{ case (k, v) => + (new IntWritable(k), new ArrayWritable(classOf[DoubleWritable], v.map(new DoubleWritable(_)))) + } + .saveAsNewAPIHadoopFile[org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat[IntWritable, ArrayWritable]](arrPath) + + val testClass = Seq( + ("1", TestWritable("test1", 123, 54.0)), + ("2", TestWritable("test2", 456, 8762.3)), + ("1", TestWritable("test3", 123, 423.1)), + ("3", TestWritable("test56", 456, 423.5)), + ("2", TestWritable("test2", 123, 5435.2)) + ) + val rdd = sc.parallelize(testClass, numSlices = 2).map{ case (k, v) => (new Text(k), v) } + rdd.saveAsNewAPIHadoopFile(classPath, + classOf[Text], classOf[TestWritable], + classOf[org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat[Text, TestWritable]]) + + +} \ No newline at end of file diff --git a/python/pyspark/context.py b/python/pyspark/context.py index 393aef37000d3..ae29728551992 100644 --- a/python/pyspark/context.py +++ b/python/pyspark/context.py @@ -214,17 +214,6 @@ def textFile(self, name, minSplits=None): MUTF8Deserializer()) ### - def sequenceFileAsText(self, name): - """ - Read a Hadoopp SequenceFile with arbitrary key and value class from HDFS, - a local file system (available on all nodes), or any Hadoop-supported file system URI, - and return it as an RDD of (String, String) where the key and value representations - are generated using the 'toString()' method of the relevant Java class. - """ - #minSplits = minSplits or min(self.defaultParallelism, 2) - jrdd = self._jvm.PythonRDD.sequenceFileAsText(self._jsc, name) - return RDD(jrdd, self, MsgPackDeserializer()) # MsgPackDeserializer PairMUTF8Deserializer - def sequenceFile(self, name, keyClass="org.apache.hadoop.io.Text", valueClass="org.apache.hadoop.io.Text", keyWrapper="", valueWrapper="", minSplits=None): """ @@ -235,6 +224,8 @@ def sequenceFile(self, name, keyClass="org.apache.hadoop.io.Text", valueClass="o >>> sc.sequenceFile("test_support/data/sfint/").collect() [(1, 'aa'), (2, 'bb'), (2, 'aa'), (3, 'cc'), (2, 'bb'), (1, 'aa')] + >>> sc.sequenceFile("test_support/data/sftext/").collect() + [('1', 'aa'), ('2', 'bb'), ('2', 'aa'), ('3', 'cc'), ('2', 'bb'), ('1', 'aa')] """ minSplits = minSplits or min(self.defaultParallelism, 2) jrdd = self._jvm.PythonRDD.sequenceFile(self._jsc, name, keyClass, valueClass, keyWrapper, valueWrapper, @@ -242,7 +233,22 @@ def sequenceFile(self, name, keyClass="org.apache.hadoop.io.Text", valueClass="o #jrdd = self._jvm.PythonRDD.sequenceFile(self._jsc, name, keyWrapper, valueWrapper, minSplits) return RDD(jrdd, self, MsgPackDeserializer()) # MsgPackDeserializer PairMUTF8Deserializer - def newHadoopFile(self, name, inputFormat, keyClass, valueClass, keyWrapper="toString", valueWrapper="toString", + def newAPIHadoopFile(self, name, inputFormat, keyClass, valueClass, keyWrapper="toString", valueWrapper="toString", + conf = {}): + """ + Read a Hadoopp file with arbitrary InputFormat, key and value class from HDFS, + a local file system (available on all nodes), or any Hadoop-supported file system URI, + and return it as an RDD of (String, String), where the key and value representations + are generated using the 'toString()' method of the relevant Java class. 
+ """ + jconf = self._jvm.java.util.HashMap() + for k, v in conf.iteritems(): + jconf[k] = v + jrdd = self._jvm.PythonRDD.newAPIHadoopFile(self._jsc, name, inputFormat, keyClass, valueClass, keyWrapper, + valueWrapper, jconf) + return RDD(jrdd, self, MsgPackDeserializer()) + + def newAPIHadoopRDD(self, inputFormat, keyClass, valueClass, keyWrapper="toString", valueWrapper="toString", conf = {}): """ Read a Hadoopp file with arbitrary InputFormat, key and value class from HDFS, @@ -253,7 +259,7 @@ def newHadoopFile(self, name, inputFormat, keyClass, valueClass, keyWrapper="toS jconf = self._jvm.java.util.HashMap() for k, v in conf.iteritems(): jconf[k] = v - jrdd = self._jvm.PythonRDD.newHadoopFile(self._jsc, name, inputFormat, keyClass, valueClass, keyWrapper, + jrdd = self._jvm.PythonRDD.newAPIHadoopRDD(self._jsc, inputFormat, keyClass, valueClass, keyWrapper, valueWrapper, jconf) return RDD(jrdd, self, MsgPackDeserializer()) diff --git a/python/test_support/data/sfarray/._SUCCESS.crc b/python/test_support/data/sfarray/._SUCCESS.crc new file mode 100644 index 0000000000000000000000000000000000000000..3b7b044936a890cd8d651d349a752d819d71d22c GIT binary patch literal 8 PcmYc;N@ieSU}69O2$TUk literal 0 HcmV?d00001 diff --git a/python/test_support/data/sfarray/.part-r-00000.crc b/python/test_support/data/sfarray/.part-r-00000.crc new file mode 100644 index 0000000000000000000000000000000000000000..7edf7b94e5361d066d25e0039d4184eeb884e012 GIT binary patch literal 12 TcmYc;N@ieSU}A8-VOaqH5&i=Q literal 0 HcmV?d00001 diff --git a/python/test_support/data/sfarray/.part-r-00001.crc b/python/test_support/data/sfarray/.part-r-00001.crc new file mode 100644 index 0000000000000000000000000000000000000000..6bcabe4c213e59aca7813adbc7147979d6254ad8 GIT binary patch literal 12 TcmYc;N@ieSU}9LC-O>dB69)rf literal 0 HcmV?d00001 diff --git a/python/test_support/data/sfarray/_SUCCESS b/python/test_support/data/sfarray/_SUCCESS new file mode 100755 index 0000000000000..e69de29bb2d1d diff --git a/python/test_support/data/sfarray/part-r-00000 b/python/test_support/data/sfarray/part-r-00000 new file mode 100755 index 0000000000000000000000000000000000000000..4baeb05d9f66dd8011f122a322432256fe96c5d8 GIT binary patch literal 134 zcmWG`4P;ZuFG|--EJ#ewNY%?oOv%qL(96u%^UNy=FUl-QOv*`B!m7Zrs3@@#ri1|u s3hVDJtNYg)U~`n=yUWV7QlOv$5VHU=BM>v&e*ls|-~c9pGzWwM0KVoVo&W#< literal 0 HcmV?d00001 diff --git a/python/test_support/data/sfarray/part-r-00001 b/python/test_support/data/sfarray/part-r-00001 new file mode 100755 index 0000000000000000000000000000000000000000..13c4b47c7799f533f5c31609cb6d62c3477437c0 GIT binary patch literal 174 zcmWG`4P;ZuFG|--EJ#ewNY%?oOv%qL(96u%^UNy=FUl-QOv*`B!m7Zrs3@@#ri1|u z#LhjaXDgrhZ@*AnRn?1$MLMS VMA8T%X~e+5c*5bJtXdin008>1EjR!G literal 0 HcmV?d00001 diff --git a/python/test_support/data/sfclass/part-r-00001 b/python/test_support/data/sfclass/part-r-00001 new file mode 100755 index 0000000000000000000000000000000000000000..e6bae57a5c4c89a9ca00cc5ce83a0743351334c8 GIT binary patch literal 181 zcmWG`4P=wdFG|--EJ#ewNY%?oOv%qL(96u%3rVdg(Ljt1;GHK)d-^12q;wTu+`ca1ORHJGBp4I literal 0 HcmV?d00001 diff --git a/python/test_support/data/sfdouble/.part-00000.crc b/python/test_support/data/sfdouble/.part-00000.crc index d3ec08da4420ca261a8ac24a63656856b9de3b42..a12b149d2d616a4dbf82c434119baa88d5bafc57 100644 GIT binary patch literal 12 TcmYc;N@ieSU}8vnx8Mx`6b=L% literal 12 TcmYc;N@ieSU}DfQ>X--s5N86i diff --git a/python/test_support/data/sfdouble/.part-00001.crc b/python/test_support/data/sfdouble/.part-00001.crc index 
6aff915ed35af73b5bacb8e06979cdd9dc73e27b..05fcd0d484b2aad42e209350ee0e81b8b413d053 100644 GIT binary patch literal 12 TcmYc;N@ieSU}7i}JH-wF5j6ss literal 12 TcmYc;N@ieSU}A{7Zovlt5&HtR diff --git a/python/test_support/data/sfdouble/part-00000 b/python/test_support/data/sfdouble/part-00000 index 3f6b37bd218d9691789d8fe5f50958c96cb89bbd..824db337e9a275bd76b02a187f33c5133a3831c0 100755 GIT binary patch delta 26 icmbQpIFWIJhk((N&jq#L%|oS(GXw+{d0d+qVF>_){R#a5 delta 26 icmbQpIFWIJhXCgczC9<6&p9&Q@QC!V&<1ISNAn diff --git a/python/test_support/data/sfdouble/part-00001 b/python/test_support/data/sfdouble/part-00001 index 4555179e42a2c266923cbe5db3275b18b7519938..a1e3b0504fc13ec6b2a92d1e58ee2ffebef27a1a 100755 GIT binary patch delta 26 icmbQpIFWIJhrqVVkcMg5g|Ztq7+qMn;8^O!2ulEvU<;1` delta 26 icmbQpIFWIJhkyXPH218FRx_N8w{BwA{&!?zge3rPO$l`X diff --git a/python/test_support/data/sfint/.part-00000.crc b/python/test_support/data/sfint/.part-00000.crc index e71a63f1f95d3468b584dc7bad0e39564dcb6481..85a67858b046a80b399232ff4135c188ac6ed8c5 100644 GIT binary patch literal 12 TcmYc;N@ieSU}AW8sjCqH6+;9D literal 12 TcmYc;N@ieSU}Dhyk*Ep)5(5I} diff --git a/python/test_support/data/sfint/.part-00001.crc b/python/test_support/data/sfint/.part-00001.crc index 39403cee2721dc121fd3b925f554d87a3012b9de..74ced30437cf50d4d36089902629d05e7dfd9b5a 100644 GIT binary patch literal 12 TcmYc;N@ieSU}Es~?NbH-5ZVHU literal 12 TcmYc;N@ieSU}D(4?PM#Ht`9wW5Rp44w+c nCV#iMT_QVSg4DEK$#Ht`9wW5Rp46J8{ pne0tJ>3gPl{f=JmspUXHb|7W}VrHh~WF#&VQ&JKVmyszk5dedqAvgd4 diff --git a/python/test_support/data/sftext/._SUCCESS.crc b/python/test_support/data/sftext/._SUCCESS.crc new file mode 100644 index 0000000000000000000000000000000000000000..3b7b044936a890cd8d651d349a752d819d71d22c GIT binary patch literal 8 PcmYc;N@ieSU}69O2$TUk literal 0 HcmV?d00001 diff --git a/python/test_support/data/sftext/.part-00000.crc b/python/test_support/data/sftext/.part-00000.crc index 0ea1d96c249ae31b3a85750d4c8ab2a11eb22ecb..c85057ef6db055ab4d337676c78fda3296015664 100644 GIT binary patch literal 12 TcmYc;N@ieSU}88ewDTYU6BPrJ literal 12 TcmYc;N@ieSU}AXYThtB!6KVrz diff --git a/python/test_support/data/sftext/.part-00001.crc b/python/test_support/data/sftext/.part-00001.crc index 3c0294ef8e9e434ce6a52c87a228274a5535e48e..c01c28a4fc606948bb03fa9d63ed901385db4e8e 100644 GIT binary patch literal 12 TcmYc;N@ieSU}7kmc~lhu62b#8 literal 12 TcmYc;N@ieSU}CsAv1K{{6gC6E diff --git a/python/test_support/data/sftext/_SUCCESS b/python/test_support/data/sftext/_SUCCESS new file mode 100755 index 0000000000000..e69de29bb2d1d diff --git a/python/test_support/data/sftext/part-00000 b/python/test_support/data/sftext/part-00000 index 312e146946c4425e2c7c546253a18350603b113f..c5fcff085eb59a9f7ad3ceedbd5f6962b14d6e7c 100755 GIT binary patch delta 66 zcmXS|o?vHtaWB_Ad6om!^*8=)l{Hya$-uzC4#X_XhI$4}iHUHo5s;gdgu+cs1OR&+ B4mAJ( delta 60 zcmb;}onU9~Wa)jU?tc9=xs`RIhn7D3$H2hA3dBr|iA;%!a843aQWBh#43q%?KF$t1 diff --git a/python/test_support/data/sftext/part-00001 b/python/test_support/data/sftext/part-00001 index 86b35d3b1fb93cda1430992392e0dd168f17e027..d7cfd1d14561dab923aa2054560248457a16c22b 100755 GIT binary patch delta 66 zcmXS|o?vIYH7?3|F7x$!r)+!I^ILT2F)%Q&12GG;v7P}_ax$E21mq?q!MTP&Zek(; DRUHj& delta 60 zcmb;}onU9a(kAZS{A1r4t}ZmoUBGHw&A`CG3dBr|$xO+~a84pqQWBh#!jzZ@04SCX AaR2}S
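
For reference, a minimal PySpark usage sketch of the read methods this patch adds to
python/pyspark/context.py, mirroring the doctests in sequenceFile(). It assumes it is run
from the python/ directory so the relative test_support paths resolve; the
"mapred.input.dir" key and the "sequencefile-example" app name are illustrative
assumptions (the exact input-path key depends on the Hadoop version in use).

    from pyspark import SparkContext

    sc = SparkContext("local[2]", "sequencefile-example")

    # SequenceFile of primitive Writables: keys/values are converted on the
    # Scala side and shipped back to Python via MsgPack.
    pairs = sc.sequenceFile("test_support/data/sfint/")
    pairs.collect()   # [(1, 'aa'), (2, 'bb'), (2, 'aa'), (3, 'cc'), (2, 'bb'), (1, 'aa')] per the doctest

    # The same style of data read through an explicit new-API InputFormat and
    # key/value classes.
    text_pairs = sc.newAPIHadoopFile(
        "test_support/data/sftext/",
        "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat",
        "org.apache.hadoop.io.Text",
        "org.apache.hadoop.io.Text")

    # Configuration-driven variant (no path argument): the input location is
    # taken from the Hadoop conf passed in as a dict.
    conf = {"mapred.input.dir": "test_support/data/sftext/"}
    conf_pairs = sc.newAPIHadoopRDD(
        "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat",
        "org.apache.hadoop.io.Text",
        "org.apache.hadoop.io.Text",
        conf=conf)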