[SPARK-22922][ML][PySpark] Pyspark portion of the fit-multiple API #20058

Closed
wants to merge 6 commits
Changes from 4 commits
54 changes: 51 additions & 3 deletions python/pyspark/ml/base.py
@@ -18,13 +18,40 @@
from abc import ABCMeta, abstractmethod

import copy
import threading

from pyspark import since
from pyspark.ml.param import Params
from pyspark.ml.param.shared import *
from pyspark.ml.common import inherit_doc
from pyspark.sql.functions import udf
from pyspark.sql.types import StructField, StructType, DoubleType
from pyspark.sql.types import StructField, StructType


class FitMutlipleIterator(object):
Member: typo: Mutliple -> Multiple

Contributor: What about making FitMutlipleIterator an inner class of the default fitMultiple implementation? If we put it outside, I don't think it will have any other usage.

Contributor Author: I'm open to this, but I didn't initially do it this way because I've been bitten by nested classes in Python before. There are subtle issues with them; the one that comes to mind is serialization (which isn't an issue here), but it's not the only one.
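
(For illustration, a minimal sketch of the pickling pitfall mentioned above; makeIterator and InnerIterator are hypothetical names, not from this PR:)

import pickle

def makeIterator():
    class InnerIterator(object):  # class defined inside a function body
        pass
    return InnerIterator()

try:
    pickle.dumps(makeIterator())
except (pickle.PicklingError, AttributeError) as e:
    # pickle looks classes up by their module-level name, so instances of
    # nested classes fail to serialize: PicklingError on Python 2,
    # AttributeError ("Can't pickle local object ...") on Python 3.
    print("cannot pickle a nested-class instance: %s" % e)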

Contributor Author: @jkbradley @WeichenXu123 I made FitMultipleIterator a private class. Is that good enough, or should I make it internal to the fitMultiple method?

"""
Used by default implementation of Estimator.fitMultiple to produce models in a thread safe
iterator.
Member: It'd be nice to document what fitSingleModel should do, plus what the iterator returns.
nit: How about renaming numModel -> numModels?

"""
def __init__(self, fitSingleModel, numModel):
self.fitSingleModel = fitSingleModel
self.numModel = numModel
self.counter = 0
self.lock = threading.Lock()

def __iter__(self):
return self

def __next__(self):
with self.lock:
index = self.counter
if index >= self.numModel:
raise StopIteration("No models remaining.")
self.counter += 1
return index, self.fitSingleModel(index)

def next(self):
"""For python2 compatibility."""
return self.__next__()
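
(A quick sketch, not part of the diff, of how the iterator behaves; the lambda stands in for a real fitSingleModel:)

fitter = FitMutlipleIterator(lambda index: "model-%d" % index, numModel=3)
for index, model in fitter:
    print(index, model)  # (0, 'model-0'), (1, 'model-1'), (2, 'model-2'), then StopIteration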


@inherit_doc
@@ -47,6 +74,24 @@ def _fit(self, dataset):
"""
raise NotImplementedError()

@since("2.3.0")
def fitMultiple(self, dataset, params):
Contributor: In Scala Spark we use the fit function rather than separate functions. Also, the params name is different from the Scala one. Any reason for the difference?

Member (@jkbradley, Dec 29, 2017): Check out the discussion on the JIRA and the linked design doc. Basically, we need the same argument types but different return types from what the current fit() method provides. (It's a somewhat long chain of discussion stemming from adding the "parallelism" Param to meta-algorithms in master.)

Contributor Author: We couldn't use fit because it would have the same signature as the existing fit method but return a different type (Iterator[(Int, Model)] instead of Seq[Model]). I was trying to be consistent with Estimator.fit, whose params argument is already named differently from the Scala equivalent. Happy to change it.

Member (@jkbradley, Dec 29, 2017): That's a good point that we could rename "params" to be clearer in this new API. How about "paramMaps"?

Contributor Author: I made this change.

"""
Fits a model to the input dataset for each param map in params.

:param dataset: input dataset, which is an instance of :py:class:`pyspark.sql.DataFrame`.
:param params: A list/tuple of param maps.
Member: Let's explicitly check that this is a list or tuple and throw a good error message if not.

Contributor Author: I changed the docstring to Sequence instead of list/tuple; is that OK? Do you want to explicitly restrict the input to be a list or tuple?

Member: Is there another Sequence type this could be other than list or tuple?
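
(A sketch of the explicit check suggested above; the error message wording is illustrative:)

if not isinstance(params, (list, tuple)):
    raise TypeError("params must be a list or tuple of param maps, "
                    "but got %s." % type(params))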

:return: A thread-safe iterable which contains one model for each param map. Each
call to `next(modelIterator)` will return `(index, model)` where the model was fit
using `params[index]`. Param maps may be fit in an order different from their
order in params.

.. note:: Experimental
Member: Let's use .. note:: DeveloperApi too.

"""
def fitSingleModel(index):
return self.fit(dataset, params[index])
Member: Shall we make a copy of the Estimator before defining fitSingleModel, to be extra safe in case some other thread modifies the Params in this Estimator before a call to fit()? You can do self.copy() beforehand to get a copy.
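
(What the suggested change might look like, as a sketch: snapshot the Params with self.copy() so another thread mutating this Estimator cannot affect queued fits:)

estimator = self.copy()  # freeze the current Params before any fits run

def fitSingleModel(index):
    return estimator.fit(dataset, params[index])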

return FitMutlipleIterator(fitSingleModel, len(params))
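
(A sketch of the intended parallel use; est, df, and paramMaps are assumed to be an existing Estimator, DataFrame, and list of param maps, not names from the PR:)

from multiprocessing.pool import ThreadPool

modelIter = est.fitMultiple(df, paramMaps)

def fitNext(_):
    # safe to call from many worker threads: the iterator's lock hands
    # each index out exactly once
    return next(modelIter)

models = [None] * len(paramMaps)
pool = ThreadPool(processes=4)
for index, model in pool.imap_unordered(fitNext, range(len(paramMaps))):
    models[index] = model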

@since("1.3.0")
def fit(self, dataset, params=None):
"""
@@ -61,7 +106,10 @@ def fit(self, dataset, params=None):
if params is None:
params = dict()
if isinstance(params, (list, tuple)):
return [self.fit(dataset, paramMap) for paramMap in params]
models = [None] * len(params)
for index, model in self.fitMultiple(dataset, params):
models[index] = model
return models
elif isinstance(params, dict):
if params:
return self.copy(params)._fit(dataset)
15 changes: 15 additions & 0 deletions python/pyspark/ml/tests.py
@@ -2359,6 +2359,21 @@ def test_unary_transformer_transform(self):
self.assertEqual(res.input + shiftVal, res.output)


class TestFit(unittest.TestCase):
Member: nit: How about EstimatorTest, since this is testing part of the Estimator API?


def testDefaultFitMultiple(self):
N = 4
data = MockDataset()
estimator = MockEstimator()
params = [{estimator.fake: i} for i in range(N)]
modelIter = estimator.fitMultiple(data, params)
indexList = []
for index, model in modelIter:
self.assertEqual(model.getFake(), index)
indexList.append(index)
self.assertEqual(sorted(indexList), list(range(N)))


if __name__ == "__main__":
from pyspark.ml.tests import *
if xmlrunner:
33 changes: 18 additions & 15 deletions python/pyspark/ml/tuning.py
@@ -31,6 +31,17 @@
'TrainValidationSplitModel']


def parallelFitTasks(est, train, eva, validation, epm):
Member: How about a brief doc string?
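
A docstring along these lines (a sketch in response to the comment above, not part of this commit) would cover it:

"""
Creates a list of callables which can be called from different threads to fit and
evaluate an estimator in parallel.

:param est: Estimator, the estimator to be fit.
:param train: DataFrame, training data set, used for fitting.
:param eva: Evaluator, used to compute `metric`.
:param validation: DataFrame, validation data set, used for evaluation.
:param epm: Sequence of param maps to be used during fitting and evaluation.
:return: a list of callables; calling one fits a model and returns an `(index, metric)` pair.
"""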

modelIter = est.fitMultiple(train, epm)

def singleTask():
index, model = next(modelIter)
metric = eva.evaluate(model.transform(validation, epm[index]))
return index, metric

return [singleTask] * len(epm)


class ParamGridBuilder(object):
r"""
Builder for a param grid used in grid search-based model selection.
@@ -266,15 +277,9 @@ def _fit(self, dataset):
validation = df.filter(condition).cache()
train = df.filter(~condition).cache()

def singleTrain(paramMap):
model = est.fit(train, paramMap)
# TODO: duplicate evaluator to take extra params from input
metric = eva.evaluate(model.transform(validation, paramMap))
return metric

currentFoldMetrics = pool.map(singleTrain, epm)
for j in range(numModels):
metrics[j] += (currentFoldMetrics[j] / nFolds)
tasks = parallelFitTasks(est, train, eva, validation, epm)
for j, metric in pool.imap_unordered(lambda f: f(), tasks):
metrics[j] += (metric / nFolds)
validation.unpersist()
train.unpersist()

@@ -523,13 +528,11 @@ def _fit(self, dataset):
validation = df.filter(condition).cache()
train = df.filter(~condition).cache()

def singleTrain(paramMap):
model = est.fit(train, paramMap)
metric = eva.evaluate(model.transform(validation, paramMap))
return metric

tasks = parallelFitTasks(est, train, eva, validation, epm)
pool = ThreadPool(processes=min(self.getParallelism(), numModels))
metrics = pool.map(singleTrain, epm)
metrics = [None] * numModels
for j, metric in pool.imap_unordered(lambda f: f(), tasks):
metrics[j] = metric
train.unpersist()
validation.unpersist()
