Skip to content

Commit

Permalink
updated with changes suggested by sethah
Browse files Browse the repository at this point in the history
  • Loading branch information
setjet committed Oct 16, 2016
1 parent ca7cd78 commit 035aeb6
Showing 1 changed file with 10 additions and 2 deletions.
12 changes: 10 additions & 2 deletions examples/src/main/python/mllib/chisq_selector_example.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import ChiSqSelector
from pyspark.mllib.util import MLUtils
from pyspark.mllib.linalg import Vectors
# $example off$

if __name__ == "__main__":
Expand All @@ -35,7 +36,8 @@

# Discretize data in 16 equal bins since ChiSqSelector requires categorical features
def distributeOverBins(lp):
    """Map every feature of a labeled point to the index of its width-16 bin.

    :param lp: object whose ``features`` attribute exposes ``toArray()``
               returning a NumPy array of numeric values.
    :return: NumPy array of the same shape holding ``floor(value / 16)``
             for each feature.
    """
    # NOTE(review): this yields 16 equal bins only if the values lie in
    # [0, 256); the value range is not visible here -- confirm against the
    # input data.
    # A single vectorised floor-division keeps values grouped by range
    # (``x % 16`` would wrap around instead) and avoids
    # ``np.array(map(...))``, which does not build a numeric array on
    # Python 3.
    return np.floor(lp.features.toArray() / 16)


# Even though features are doubles, the ChiSqSelector treats each unique value as a category
# Each record keeps its original label; only its features are replaced by the
# bin indices computed by distributeOverBins.
discretizedData = data.map(lambda lp: LabeledPoint(lp.label, distributeOverBins(lp)))
Expand All @@ -47,8 +49,14 @@ def distributeOverBins(lp):
# Fit the selector on the discretized data; the fitted result is what is used
# below to transform feature vectors.
# NOTE(review): `selector` is constructed outside this view -- confirm its settings there.
transformer = selector.fit(discretizedData)

# Filter the top 50 features from each feature vector
# The fitted model is applied once to the whole RDD of discretized feature
# vectors, so every record is filtered using its own features. Calling
# transform() on a constant vector inside a per-record map() would ignore the
# record's data entirely. The result holds feature vectors only (labels are
# not carried over).
filteredData = transformer.transform(discretizedData.map(lambda lp: lp.features))


# $example off$

# Print a header, then print every element of the filtered RDD.
# NOTE(review): passing `print` as a function needs Python 3 or
# `from __future__ import print_function` -- confirm at the top of the file.
print('filtered data:')
filteredData.foreach(print)

# `sc` is created outside this view (presumably the SparkContext); stop it
# once the example is done.
sc.stop()

0 comments on commit 035aeb6

Please sign in to comment.