SchemaRDD now has all RDD operations

witgo · Apr 15, 2014 · c608947 · c608947
1 parent 725c91e
commit c608947
Showing 1 changed file with 13 additions and 1 deletion.
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
@@ -1401,14 +1401,26 @@ def __init__(self, jschema_rdd, sql_ctx):
         self._sc = sql_ctx._sc
         self._jschema_rdd = jschema_rdd
 
+        self._jrdd = self.toPython()._jrdd
+        self.is_cached = False
+        self.is_checkpointed = False
+        self.ctx = self.sql_ctx._sc
+        self._jrdd_deserializer = self.ctx.serializer
+        # TODO: Figure out how to make this lazy
+        #self._id = self._jrdd.id()
+
     def registerAsTable(self, name):
         self._jschema_rdd.registerAsTable(name)
 
     def toPython(self):
         jrdd = self._jschema_rdd.javaToPython()
-        #jrdd = self._sc._javaToPython(self._jschema_rdd)
         return RDD(jrdd, self._sc, self._sc.serializer).map(lambda d: Row(d))
 
+customRDDDict = dict(RDD.__dict__)
+del customRDDDict["__init__"]
+
+SchemaRDD.__dict__.update(customRDDDict)
+
 def _test():
     import doctest
     from pyspark.context import SparkContext