Skip to content

Commit

Permalink
[SPARK-6845] Add isTransposed flag to DenseMatrix
Browse files (browse the repository at this point in the history)
  • Loading branch information
MechCoder committed Apr 17, 2015
1 parent 8220d52 commit cc0b90a
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 17 deletions.
48 changes: 32 additions & 16 deletions python/pyspark/mllib/linalg.py
Original file line number Diff line number Diff line change
Expand Up @@ -638,9 +638,10 @@ class Matrix(object):
Represents a local matrix.
"""

def __init__(self, numRows, numCols):
def __init__(self, numRows, numCols, isTransposed=False):
self.numRows = numRows
self.numCols = numCols
self.isTransposed = isTransposed

def toArray(self):
"""
Expand All @@ -662,14 +663,15 @@ class DenseMatrix(Matrix):
"""
Column-major dense matrix.
"""
def __init__(self, numRows, numCols, values):
Matrix.__init__(self, numRows, numCols)
def __init__(self, numRows, numCols, values, isTransposed=False):
Matrix.__init__(self, numRows, numCols, isTransposed)
values = self._convert_to_array(values, np.float64)
assert len(values) == numRows * numCols
self.values = values

def __reduce__(self):
return DenseMatrix, (self.numRows, self.numCols, self.values.tostring())
return DenseMatrix, (
self.numRows, self.numCols, self.values.tostring(), self.isTransposed)

def toArray(self):
"""
Expand All @@ -680,15 +682,23 @@ def toArray(self):
array([[ 0., 2.],
[ 1., 3.]])
"""
return self.values.reshape((self.numRows, self.numCols), order='F')
if self.isTransposed:
return np.asfortranarray(
self.values.reshape((self.numRows, self.numCols)))
else:
return self.values.reshape((self.numRows, self.numCols), order='F')

def toSparse(self):
"""Convert to SparseMatrix"""
indices = np.nonzero(self.values)[0]
if self.isTransposed:
values = np.ravel(self.toArray(), order='F')
else:
values = self.values
indices = np.nonzero(values)[0]
colCounts = np.bincount(indices // self.numRows)
colPtrs = np.cumsum(np.hstack(
(0, colCounts, np.zeros(self.numCols - colCounts.size))))
values = self.values[indices]
values = values[indices]
rowIndices = indices % self.numRows

return SparseMatrix(self.numRows, self.numCols, colPtrs, rowIndices, values)
Expand All @@ -701,21 +711,28 @@ def __getitem__(self, indices):
if j >= self.numCols or j < 0:
raise ValueError("Column index %d is out of range [0, %d)"
% (j, self.numCols))
return self.values[i + j * self.numRows]

if self.isTransposed:
return self.values[i * self.numCols + j]
else:
return self.values[i + j * self.numRows]

def __eq__(self, other):
return (isinstance(other, DenseMatrix) and
self.numRows == other.numRows and
self.numCols == other.numCols and
all(self.values == other.values))
if (not isinstance(other, DenseMatrix) or
self.numRows != other.numRows or
self.numCols != other.numCols):
return False

self_values = np.ravel(self.toArray(), order='F')
other_values = np.ravel(other.toArray(), order='F')
return all(self_values == other_values)


class SparseMatrix(Matrix):
"""Sparse Matrix stored in CSC format."""
def __init__(self, numRows, numCols, colPtrs, rowIndices, values,
isTransposed=False):
Matrix.__init__(self, numRows, numCols)
self.isTransposed = isTransposed
Matrix.__init__(self, numRows, numCols, isTransposed)
self.colPtrs = self._convert_to_array(colPtrs, np.int32)
self.rowIndices = self._convert_to_array(rowIndices, np.int32)
self.values = self._convert_to_array(values, np.float64)
Expand Down Expand Up @@ -777,8 +794,7 @@ def toArray(self):
return A

def toDense(self):
densevals = np.reshape(
self.toArray(), (self.numRows * self.numCols), order='F')
densevals = np.ravel(self.toArray(), order='F')
return DenseMatrix(self.numRows, self.numCols, densevals)

# TODO: More efficient implementation:
Expand Down
18 changes: 17 additions & 1 deletion python/pyspark/mllib/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def test_serialize(self):
self._test_serialize(DenseVector(pyarray.array('d', range(10))))
self._test_serialize(SparseVector(4, {1: 1, 3: 2}))
self._test_serialize(SparseVector(3, {}))
self._test_serialize(DenseMatrix(2, 3, range(6)))
# self._test_serialize(DenseMatrix(2, 3, range(6)))

def test_dot(self):
sv = SparseVector(4, {1: 1, 3: 2})
Expand Down Expand Up @@ -193,6 +193,22 @@ def test_sparse_matrix(self):
self.assertEquals(expected[i][j], sm1t[i, j])
self.assertTrue(array_equal(sm1t.toArray(), expected))

def test_dense_matrix_is_transposed(self):
mat1 = DenseMatrix(3, 2, [0, 4, 1, 6, 3, 9], isTransposed=True)
mat = DenseMatrix(3, 2, [0, 1, 3, 4, 6, 9])
self.assertEquals(mat1, mat)

expected = [[0, 4], [1, 6], [3, 9]]
for i in range(3):
for j in range(2):
self.assertEquals(mat1[i, j], expected[i][j])
self.assertTrue(array_equal(mat1.toArray(), expected))

sm = mat1.toSparse()
self.assertTrue(array_equal(sm.rowIndices, [1, 2, 0, 1, 2]))
self.assertTrue(array_equal(sm.colPtrs, [0, 2, 5]))
self.assertTrue(array_equal(sm.values, [1, 3, 4, 6, 9]))


class ListTests(PySparkTestCase):

Expand Down

0 comments on commit cc0b90a

Please sign in to comment.