Commit
Cache underlying SchemaRDD instead of generating and caching PythonRDD
ahirreddy committed Apr 15, 2014
1 parent f2312c7 commit 29245bf
Showing 1 changed file with 31 additions and 0 deletions.
python/pyspark/sql.py
@@ -309,6 +309,37 @@ def _toPython(self):
        # pickle serializer in Pyrolite
        return RDD(jrdd, self._sc, self._sc.serializer).map(lambda d: Row(d))

    # We override the default cache/persist/checkpoint behavior as we want to cache the underlying
    # SchemaRDD object in the JVM, not the PythonRDD checkpointed by the super class
    def cache(self):
        self.is_cached = True
        self._jschema_rdd.cache()
        return self

    def persist(self, storageLevel):
        self.is_cached = True
        javaStorageLevel = self.ctx._getJavaStorageLevel(storageLevel)
        self._jschema_rdd.persist(javaStorageLevel)
        return self

    def unpersist(self):
        self.is_cached = False
        self._jschema_rdd.unpersist()
        return self

    def checkpoint(self):
        self.is_checkpointed = True
        self._jschema_rdd.checkpoint()

    def isCheckpointed(self):
        return self._jschema_rdd.isCheckpointed()

    def getCheckpointFile(self):
        checkpointFile = self._jschema_rdd.getCheckpointFile()
        if checkpointFile.isDefined():
            return checkpointFile.get()
        else:
            return None

def _test():
    import doctest
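With this change, SchemaRDD.cache() and friends act on the JVM-side SchemaRDD (through the _jschema_rdd Py4J handle) instead of on the serialized PythonRDD that the inherited RDD.cache() would pin. A minimal usage sketch of the new behavior, assuming the Spark 1.0-era PySpark SQL API (a live SparkContext `sc` and SQLContext.inferSchema over an RDD of dicts); the checkpoint directory path is hypothetical:

from pyspark import StorageLevel
from pyspark.sql import SQLContext

sqlCtx = SQLContext(sc)  # sc: an existing SparkContext
rdd = sc.parallelize([{"field1": 1, "field2": "row1"}])
srdd = sqlCtx.inferSchema(rdd)

# cache() now pins the JVM SchemaRDD itself, so later queries reuse the
# cached rows without re-serializing them through a PythonRDD.
srdd.cache()
srdd.count()      # an action materializes the cache

# persist() translates the Python StorageLevel into its Java equivalent
# before handing it to the JVM SchemaRDD.
srdd.unpersist()  # clear the old storage level before re-persisting
srdd.persist(StorageLevel.MEMORY_AND_DISK)

# checkpoint() likewise targets the JVM SchemaRDD; a checkpoint directory
# must be set first, and an action must run before the file exists.
sc.setCheckpointDir("/tmp/spark-checkpoints")  # hypothetical local path
srdd.checkpoint()
srdd.count()
print(srdd.isCheckpointed())     # True
print(srdd.getCheckpointFile())  # a path under /tmp/spark-checkpoints

Because each wrapper returns self (except checkpoint, matching the base RDD API), the calls chain exactly as before; only the object being cached or checkpointed changes.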
