Skip to content

Commit

Permalink
Add parse function
Browse files Browse the repository at this point in the history
  • Loading branch information
MechCoder committed Mar 1, 2015
1 parent 65bbbe9 commit 1cdd7b5
Showing 1 changed file with 10 additions and 13 deletions.
23 changes: 10 additions & 13 deletions docs/mllib-naive-bayes.md
Original file line number Diff line number Diff line change
Expand Up @@ -120,19 +120,16 @@ from pyspark.mllib.classification import NaiveBayes
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

data = sc.textFile("data/mllib/sample_naive_bayes_data.txt")

# Preprocessing
splitData = data.map(lambda line: line.split(','))
parsedData = splitData.map(
lambda parts: LabeledPoint(
float(parts[0]),
Vectors.dense(map(lambda x: float(x), parts[1].split(' ')))
)
)

# Split data into training (60%) and test (40%)
training, test = parsedData.randomSplit([0.6, 0.4], seed = 0)
def parseLine(line):
    """Parse one line of text into a LabeledPoint.

    The line is expected to look like ``label,f1 f2 f3``: the text before
    the first comma is the label, and the second comma-separated field
    holds the feature values separated by whitespace.

    Raises IndexError if the line contains no comma, and ValueError if the
    label or any feature value cannot be converted to float.
    """
    parts = line.split(',')
    label = float(parts[0])
    # split() with no separator collapses runs of whitespace and ignores
    # leading/trailing whitespace, so stray spaces do not produce empty
    # tokens that float() would reject.
    features = Vectors.dense([float(x) for x in parts[1].split()])
    return LabeledPoint(label, features)

data = sc.textFile('data/mllib/sample_naive_bayes_data.txt').map(parseLine)

# Split data approximately into training (60%) and test (40%)
training, test = data.randomSplit([0.6, 0.4], seed = 0)

# Train a naive Bayes model.
model = NaiveBayes.train(training, 1.0)
Expand Down

0 comments on commit 1cdd7b5

Please sign in to comment.