Generic Spark Integration #101

Merged: 7 commits, Apr 17, 2020
Changes from 2 commits
112 changes: 112 additions & 0 deletions flytekit/common/tasks/generic_spark_task.py
@@ -0,0 +1,112 @@
from __future__ import absolute_import

try:
    from inspect import getfullargspec as _getargspec
except ImportError:
    from inspect import getargspec as _getargspec

from flytekit import __version__
import sys as _sys
import six as _six
from flytekit.common.tasks import task as _base_tasks
from flytekit.models import literals as _literal_models, task as _task_models
from google.protobuf.json_format import MessageToDict as _MessageToDict
from flytekit.common import interface as _interface
from flytekit.models import interface as _interface_model
from flytekit.configuration import internal as _internal_config

class SdkGenericSparkTask(_base_tasks.SdkTask):
    """
    This class includes the additional logic for building a task that executes as a Spark Job.
    """

    def __init__(
            self,
            task_type,
            discovery_version,
            retries,
            interruptible,
            task_inputs,
            deprecated,
            discoverable,
            timeout,
            spark_type,
            main_class,
            main_application_file,
            spark_conf,
            hadoop_conf,
            environment,
    ):
        """
        :param Text task_type: string describing the task type
        :param Text discovery_version: string describing the version for task discovery purposes
        :param int retries: Number of retries to attempt
        :param bool interruptible: Whether or not the task is interruptible
        :param dict task_inputs: Task inputs, mapping each input name to its SDK type (e.g. Types.Integer)
        :param Text deprecated: Deprecation message for this task, if any
        :param bool discoverable: Whether the task's outputs should be cached and discoverable
        :param datetime.timedelta timeout: Maximum amount of time the task may run before timing out
        :param Text spark_type: Type of Spark Job: Scala/Java
        :param Text main_class: Main class to execute for Scala/Java jobs
        :param Text main_application_file: Main application file
        :param dict[Text,Text] spark_conf: Key-value pairs of Spark configuration for the job
        :param dict[Text,Text] hadoop_conf: Key-value pairs of Hadoop configuration for the job
        :param dict[Text,Text] environment: [optional] environment variables to set when executing this task.
        """

        spark_job = _task_models.SparkJob(
            spark_conf=spark_conf,
            hadoop_conf=hadoop_conf,
            type=spark_type,
            application_file=main_application_file,
            main_class=main_class,
            executor_path=_sys.executable,
        ).to_flyte_idl()

        # No output support
        input_variables = {k: _interface_model.Variable(v.to_flyte_literal_type(), k) for k, v in _six.iteritems(task_inputs)}

        super(SdkGenericSparkTask, self).__init__(
            task_type,
            _task_models.TaskMetadata(
                discoverable,
                _task_models.RuntimeMetadata(
                    _task_models.RuntimeMetadata.RuntimeType.FLYTE_SDK,
                    __version__,
                    'spark'
                ),
                timeout,
                _literal_models.RetryStrategy(retries),
                interruptible,
                discovery_version,
                deprecated
            ),
            _interface.TypedInterface(input_variables, {}),
            _MessageToDict(spark_job),
            container=self._get_container_definition(
                task_inputs=task_inputs,
                environment=environment
            )
        )

    def _get_container_definition(
            self,
            task_inputs=None,
            environment=None,
    ):
        """
        :rtype: Container
        """

        args = []
        for k, v in _six.iteritems(task_inputs):
            args.append("--{}".format(k))
            args.append("{{{{.Inputs.{}}}}}".format(k))

        return _task_models.Container(
            image=_internal_config.IMAGE.get(),
            command=[],
            args=args,
            resources=_task_models.Resources([], []),
            env=environment,
            config={}
        )

Contributor (on _get_container_definition):
I think this is where you will have to check the types and raise an alert. You can make this a common method for now; when we add support for all types, we can remove this check.

Contributor Author (@akhurana001, Apr 13, 2020):
Done, added it in the validate_inputs above.
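To make the argument templating above concrete, here is a minimal standalone sketch, assuming a single input named partitions as in the Scala example workflow later in this PR. Only the dict keys matter when building the arguments; the {{.Inputs.*}} placeholders are filled in by Flyte with the actual input values at execution time, and the real method additionally wraps the args in a _task_models.Container with the configured image.

# Standalone sketch of the arg construction performed by _get_container_definition.
task_inputs = {"partitions": None}  # only the keys are used when building args
args = []
for k in task_inputs:
    args.append("--{}".format(k))
    args.append("{{{{.Inputs.{}}}}}".format(k))
print(args)  # ['--partitions', '{{.Inputs.partitions}}']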
2 changes: 2 additions & 0 deletions flytekit/common/tasks/spark_task.py
@@ -88,6 +88,8 @@ def __init__(
            hadoop_conf=hadoop_conf,
            application_file="local://" + spark_exec_path,
            executor_path=_sys.executable,
            main_class="",
            type="PYTHON",
        ).to_flyte_idl()
        super(SdkSparkTask, self).__init__(
            task_function,
44 changes: 42 additions & 2 deletions flytekit/models/task.py
@@ -541,10 +541,9 @@ def from_flyte_idl(cls, pb2_object):
            template=TaskTemplate.from_flyte_idl(pb2_object.template)
        )


class SparkJob(_common.FlyteIdlEntity):

-    def __init__(self, application_file, spark_conf, hadoop_conf, executor_path):
+    def __init__(self, type, application_file, main_class, spark_conf, hadoop_conf, executor_path):
        """
        This defines a SparkJob target. It will execute the appropriate SparkJob.

@@ -553,10 +552,28 @@ def __init__(self, application_file, spark_conf, hadoop_conf, executor_path):
        :param dict[Text, Text] hadoop_conf: A definition of key-value pairs for hadoop config for the job.
        """
        self._application_file = application_file
        self._type = type
        self._main_class = main_class
        self._executor_path = executor_path
        self._spark_conf = spark_conf
        self._hadoop_conf = hadoop_conf

    @property
    def main_class(self):
        """
        The main class to execute
        :rtype: Text
        """
        return self._main_class

    @property
    def type(self):
        """
        Spark Job Type
        :rtype: Text
        """
        return self._type

    @property
    def application_file(self):
        """
@@ -593,8 +610,20 @@ def to_flyte_idl(self):
"""
:rtype: flyteidl.plugins.spark_pb2.SparkJob
"""

# Default to Python
application_type = _spark_task.SparkApplication.PYTHON
if self.type == "SCALA":
application_type = _spark_task.SparkApplication.SCALA
elif self.type == "JAVA":
application_type = _spark_task.SparkApplication.JAVA
elif self.type == "R":
application_type = _spark_task.SparkApplication.R

return _spark_task.SparkJob(
applicationType=application_type,
mainApplicationFile=self.application_file,
mainClass=self.main_class,
executorPath=self.executor_path,
sparkConf=self.spark_conf,
hadoopConf=self.hadoop_conf,
@@ -606,9 +635,20 @@ def from_flyte_idl(cls, pb2_object):
        :param flyteidl.plugins.spark_pb2.SparkJob pb2_object:
        :rtype: SparkJob
        """
        # Default to Python
        type = "PYTHON"
        if pb2_object.applicationType == _spark_task.SparkApplication.SCALA:
            type = "SCALA"
        elif pb2_object.applicationType == _spark_task.SparkApplication.JAVA:
            type = "JAVA"
        elif pb2_object.applicationType == _spark_task.SparkApplication.R:
            type = "R"

        return cls(
            type=type,
            spark_conf=pb2_object.sparkConf,
            application_file=pb2_object.mainApplicationFile,
            main_class=pb2_object.mainClass,
            hadoop_conf=pb2_object.hadoopConf,
            executor_path=pb2_object.executorPath,
        )
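To make the new string-to-enum mapping concrete, here is a small round-trip sketch built only from the SparkJob constructor and the to_flyte_idl / from_flyte_idl methods shown above; the configuration values are illustrative only.

# Round-trip sketch for the SparkJob model above; the values are illustrative.
from flytekit.models.task import SparkJob

job = SparkJob(
    type="SCALA",
    application_file="local:///opt/spark/examples/jars/spark-examples.jar",
    main_class="org.apache.spark.examples.SparkPi",
    spark_conf={"spark.executor.instances": "2"},
    hadoop_conf={},
    executor_path="/usr/bin/python",
)
idl = job.to_flyte_idl()  # applicationType is set to SparkApplication.SCALA
assert SparkJob.from_flyte_idl(idl).type == "SCALA"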
26 changes: 24 additions & 2 deletions flytekit/sdk/tasks.py
@@ -6,7 +6,7 @@
from flytekit.common import constants as _common_constants
from flytekit.common.exceptions import user as _user_exceptions
from flytekit.common.tasks import sdk_runnable as _sdk_runnable_tasks, sdk_dynamic as _sdk_dynamic, \
-    spark_task as _sdk_spark_tasks, hive_task as _sdk_hive_tasks, sidecar_task as _sdk_sidecar_tasks
+    spark_task as _sdk_spark_tasks, generic_spark_task as _sdk_generic_spark_task, hive_task as _sdk_hive_tasks, sidecar_task as _sdk_sidecar_tasks
from flytekit.common.tasks import task as _task
from flytekit.common.types import helpers as _type_helpers
from flytekit.models import interface as _interface_model
@@ -406,12 +406,16 @@ def spark_task(
        cache_version='',
        retries=0,
        interruptible=None,
        inputs=None,

Contributor (@kumare3, Apr 12, 2020):
Is this handled differently compared to the @inputs decorator? If so, we should document that correctly, as this is very confusing. Also, at the moment I think only a few input types are supported, like primitives and blobs, right? If so, can we verify this statically and raise an exception?

Contributor Author:
Moved to use the inputs decorator. Added the check for primitives for now.

Collaborator:
I wouldn't mix the decorator with the generic usage. The decorator patterns are part of the SDK's basic Python programming model, so the assumption is that they will always be Python. It's easy enough to expose a helper function or class for Scala Spark jobs elsewhere.

Collaborator:
(If you want to make it available through this file, that's fine too. I just think mixing a decorator and common object definitions will eventually cause a problem.)

Collaborator:
Continuing Ketan's thought, I would also just avoid adding the inputs arg that way altogether, especially since, as your test below shows, you use the inputs decorator as a helper function, which is a bit confusing. Better to use a new interface and just a normal {'a': Types.Integer} style annotation.

Contributor Author:
I originally wasn't using the decorator but changed it after Ketan's comment, to be in sync with how we do this for Presto as well: https://github.com/lyft/flytekit/blob/master/tests/flytekit/common/workflows/presto.py#L11

Contributor Author:
Updated to stop using the existing spark_task decorator. I do think we should make the generic_spark_task available from this file as well. Added it separately as a helper function.

        deprecated='',
        cache=False,
        timeout=None,
        spark_conf=None,
        hadoop_conf=None,
        environment=None,
        spark_type=None,
        main_class=None,
        main_application_file=None,
        cls=None
):
    """
@@ -485,7 +489,25 @@ def wrapper(fn):
    if _task_function:
        return wrapper(_task_function)
    else:
        if spark_type is None or spark_type == "PYTHON":
            return wrapper
        else:
            return _sdk_generic_spark_task.SdkGenericSparkTask(
                task_type=_common_constants.SdkTaskType.SPARK_TASK,
                discovery_version=cache_version,
                retries=retries,
                interruptible=interruptible,
                deprecated=deprecated,
                discoverable=cache,
                timeout=timeout or _datetime.timedelta(seconds=0),
                spark_type=spark_type,
                task_inputs=inputs or {},
                main_class=main_class or "",
                main_application_file=main_application_file or "",
                spark_conf=spark_conf or {},
                hadoop_conf=hadoop_conf or {},
                environment=environment or {},
            )


def qubole_spark_task(*args, **kwargs):
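The primitive-only input check referenced in the review thread above (validate_inputs) is not part of the two commits shown in this diff. Purely as a hypothetical sketch of what such a check could look like, assuming each SDK type exposes to_flyte_literal_type() (as used in SdkGenericSparkTask) and that primitive types populate LiteralType.simple:

# Hypothetical sketch only; this is not the validate_inputs that was merged.
import six as _six

from flytekit.common.exceptions import user as _user_exceptions


def validate_inputs(task_inputs):
    for name, sdk_type in _six.iteritems(task_inputs):
        if sdk_type.to_flyte_literal_type().simple is None:
            raise _user_exceptions.FlyteValidationException(
                "Input '{}' is of type {}; only primitive inputs are currently "
                "supported for generic Spark tasks.".format(name, sdk_type)
            )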
40 changes: 40 additions & 0 deletions tests/flytekit/common/workflows/scala_spark.py
@@ -0,0 +1,40 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from flytekit.sdk.tasks import spark_task, inputs, python_task
from flytekit.sdk.types import Types
from flytekit.sdk.workflow import workflow_class, Input


scala_spark = spark_task(spark_type="SCALA",
                         inputs={"partitions": Types.Integer},
                         main_class="org.apache.spark.examples.SparkPi",
                         main_application_file="local:///opt/spark/examples/jars/spark-examples.jar",
                         spark_conf={
                             'spark.driver.memory': "1000M",
                             'spark.executor.memory': "1000M",
                             'spark.executor.cores': '1',
                             'spark.executor.instances': '2',
                         },
                         cache_version='1'
                         )

Contributor (on spark_type):
I think we should do one of two things: make spark_type an enum, which it seems it already is in the proto definition, or make special task wrappers like scala_spark etc. which fix the type.

Contributor Author:
Made an enum.


@inputs(date_triggered=Types.Datetime)
@python_task(cache_version='1')
def print_every_time(workflow_parameters, date_triggered):
    print("My input : {}".format(date_triggered))


@workflow_class
class SparkTasksWorkflow(object):
    triggered_date = Input(Types.Datetime)
    partitions = Input(Types.Integer)
    sparkTask = scala_spark(partitions=partitions)
    print_always = print_every_time(
        date_triggered=triggered_date)

Collaborator (on sparkTask): nit: no camel case
Contributor Author: done


if __name__ == '__main__':
    print(scala_spark)
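The spark_type enum referred to in the review thread above ("made an enum") is added in a later commit and is not part of the two commits shown here. Purely as a hypothetical sketch, it could be as simple as a set of string constants mirroring the SparkApplication values in flyteidl:

# Hypothetical sketch only; the enum actually added later in this PR is not shown
# in this diff.  One plausible shape, mirroring flyteidl's SparkApplication values:
class SparkType(object):
    PYTHON = "PYTHON"
    SCALA = "SCALA"
    JAVA = "JAVA"
    R = "R"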