Skip to content

Commit

Permalink
added count operation but this implementation need double check
Browse files Browse the repository at this point in the history
  • Loading branch information
Ken Takagiwa authored and giwa committed Sep 20, 2014
1 parent 15feea9 commit d3ee86a
Showing 1 changed file with 23 additions and 2 deletions.
25 changes: 23 additions & 2 deletions python/pyspark/streaming/dstream.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from collections import defaultdict
from itertools import chain, ifilter, imap
import operator

import logging

from pyspark.serializers import NoOpSerializer,\
BatchedSerializer, CloudPickleSerializer, pack_long
Expand All @@ -24,6 +27,18 @@ def generatedRDDs(self):
"""
pass

def count(self):
"""
"""
#TODO make sure count implementation, thiis different from what pyspark does
return self.mapPartitions(lambda i: [sum(1 for _ in i)]).sum().map(lambda x: x[1])

def sum(self):
"""
"""
return self.mapPartitions(lambda x: [sum(x)]).reduce(operator.add)

def print_(self):
"""
"""
Expand Down Expand Up @@ -63,9 +78,9 @@ def reduce(self, func, numPartitions=None):
"""
"""
return self._combineByKey(lambda x:x, func, func, numPartitions)
return self.combineByKey(lambda x:x, func, func, numPartitions)

def _combineByKey(self, createCombiner, mergeValue, mergeCombiners,
def combineByKey(self, createCombiner, mergeValue, mergeCombiners,
numPartitions = None):
"""
"""
Expand All @@ -74,6 +89,12 @@ def _combineByKey(self, createCombiner, mergeValue, mergeCombiners,
def combineLocally(iterator):
combiners = {}
for x in iterator:

#TODO for count operation make sure count implementation
# This is different from what pyspark does
if isinstance(x, int):
x = ("", x)

(k, v) = x
if k not in combiners:
combiners[k] = createCombiner(v)
Expand Down

0 comments on commit d3ee86a

Please sign in to comment.