Skip to content

Commit

Permalink
[SPARK-7604] [MLLIB] Python API for PCA and PCAModel
Browse files Browse the repository at this point in the history
Python API for PCA and PCAModel

Author: Yanbo Liang <[email protected]>

Closes #6315 from yanboliang/spark-7604 and squashes the following commits:

1d58734 [Yanbo Liang] remove transform() in PCAModel, use default behavior
4d9d121 [Yanbo Liang] Python API for PCA and PCAModel
  • Loading branch information
yanboliang authored and jkbradley committed Jun 21, 2015
1 parent a1e3649 commit 32e3cda
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,16 @@ private[python] class PythonMLLibAPI extends Serializable {
new ChiSqSelector(numTopFeatures).fit(data.rdd)
}

/**
 * Java stub for PCA.fit(): builds a PCA estimator with `k` components and
 * fits it on the Scala RDD underlying `data`.
 *
 * The stub hands back a handle to the fitted Java object rather than the
 * object's contents, so the Python caller is responsible for making sure the
 * handle is released on exit; see the Py4J documentation.
 *
 * @param k    number of principal components passed to the PCA constructor
 * @param data input vectors, wrapped as a JavaRDD
 * @return the PCAModel produced by fitting on `data.rdd`
 */
def fitPCA(k: Int, data: JavaRDD[Vector]): PCAModel = {
  val estimator = new PCA(k)
  val vectors = data.rdd
  estimator.fit(vectors)
}

/**
* Java stub for IDF.fit(). This stub returns a
* handle to the Java object instead of the content of the Java object.
Expand Down
35 changes: 35 additions & 0 deletions python/pyspark/mllib/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,41 @@ def fit(self, data):
return ChiSqSelectorModel(jmodel)


class PCAModel(JavaVectorTransformer):
    """
    Fitted PCA model, as returned by :class:`PCA`'s ``fit`` method.

    Projects vectors to a low-dimensional space using PCA. The class adds
    no methods of its own; ``transform`` is the default behavior provided
    by the base class.
    """


class PCA(object):
    """
    A feature transformer that projects vectors to a low-dimensional space using PCA.

    >>> data = [Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),
    ...     Vectors.dense([2.0, 0.0, 3.0, 4.0, 5.0]),
    ...     Vectors.dense([4.0, 0.0, 0.0, 6.0, 7.0])]
    >>> model = PCA(2).fit(sc.parallelize(data))
    >>> pcArray = model.transform(Vectors.sparse(5, [(1, 1.0), (3, 7.0)])).toArray()
    >>> pcArray[0]
    1.648...
    >>> pcArray[1]
    -4.013...
    """

    def __init__(self, k):
        """
        :param k: number of principal components. The value is converted
                  with ``int()`` and must be at least 1.
        :raises ValueError: if ``k`` is smaller than 1 after conversion.
        """
        k = int(k)
        # Fail fast with a readable Python error instead of deferring the
        # problem to fit(); the JVM-side PCA is expected to reject such
        # values anyway (NOTE(review): confirm against the Scala PCA class).
        if k < 1:
            raise ValueError("PCA requires a number of principal components "
                             "k >= 1 but was given %d" % k)
        self.k = k

    def fit(self, data):
        """
        Computes a :class:`PCAModel` that contains the principal components
        of the input vectors.

        :param data: source vectors (the example above passes an RDD built
                     with ``sc.parallelize``).
        :return: a :class:`PCAModel` wrapping the model object returned by
                 the ``fitPCA`` MLlib call.
        """
        jmodel = callMLlibFunc("fitPCA", self.k, data)
        return PCAModel(jmodel)


class HashingTF(object):
"""
.. note:: Experimental
Expand Down

0 comments on commit 32e3cda

Please sign in to comment.