Apply review comments

apache · Feb 23, 2017 · c86febe · c86febe
1 parent 80c3775
commit c86febe
Show file tree

Hide file tree

Showing 6 changed files with 17 additions and 8 deletions.
diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
@@ -368,7 +368,7 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
                          If None is set, it uses the default value, session local timezone.
 
                 * ``PERMISSIVE`` : sets other fields to ``null`` when it meets a corrupted record.
-                    If users set a string-type field named ``columnNameOfCorruptRecord`` in a
+                    If users set a string type field named ``columnNameOfCorruptRecord`` in a
                     user-specified ``schema``, it puts the malformed string into the field. When
                     a ``schema`` is set by user, it sets ``null`` for extra fields.
                 * ``DROPMALFORMED`` : ignores the whole corrupted records.

diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py
@@ -558,7 +558,8 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
             comment=None, header=None, inferSchema=None, ignoreLeadingWhiteSpace=None,
             ignoreTrailingWhiteSpace=None, nullValue=None, nanValue=None, positiveInf=None,
             negativeInf=None, dateFormat=None, timestampFormat=None, maxColumns=None,
-            maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None, timeZone=None):
+            maxCharsPerColumn=None, maxMalformedLogPerPartition=None, mode=None, timeZone=None,
+            columnNameOfCorruptRecord=None):
         """Loads a CSV file stream and returns the result as a  :class:`DataFrame`.
 
         This function will go through the input once to determine the input schema if
@@ -619,10 +620,18 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
                          If None is set, it uses the default value, session local timezone.
 
                 * ``PERMISSIVE`` : sets other fields to ``null`` when it meets a corrupted record.
-                    When a schema is set by user, it sets ``null`` for extra fields.
+                    If users set a string type field named ``columnNameOfCorruptRecord`` in a
+                    user-specified ``schema``, it puts the malformed string into the field. When
+                    a ``schema`` is set by user, it sets ``null`` for extra fields.
                 * ``DROPMALFORMED`` : ignores the whole corrupted records.
                 * ``FAILFAST`` : throws an exception when it meets corrupted records.
 
+        :param columnNameOfCorruptRecord: defines a field name for malformed strings created
+                                          by ``PERMISSIVE`` mode. If a user-specified ``schema``
+                                          has a field with this name, Spark puts malformed
+                                          strings in this field. This overrides
+                                          ``spark.sql.columnNameOfCorruptRecord``.
+
         >>> csv_sdf = spark.readStream.csv(tempfile.mkdtemp(), schema = sdf_schema)
         >>> csv_sdf.isStreaming
         True
@@ -636,7 +645,8 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
             nanValue=nanValue, positiveInf=positiveInf, negativeInf=negativeInf,
             dateFormat=dateFormat, timestampFormat=timestampFormat, maxColumns=maxColumns,
             maxCharsPerColumn=maxCharsPerColumn,
-            maxMalformedLogPerPartition=maxMalformedLogPerPartition, mode=mode, timeZone=timeZone)
+            maxMalformedLogPerPartition=maxMalformedLogPerPartition, mode=mode, timeZone=timeZone,
+            columnNameOfCorruptRecord=columnNameOfCorruptRecord)
         if isinstance(path, basestring):
             return self._df(self._jreader.csv(path))
         else:

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -423,7 +423,7 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
    *    during parsing.
    *   <ul>
    *     <li>`PERMISSIVE` : sets other fields to `null` when it meets a corrupted record. If users
-   *     set a string-type field named `columnNameOfCorruptRecord` in a user-specified `schema`,
+   *     set a string type field named `columnNameOfCorruptRecord` in a user-specified `schema`,
    *     it puts the malformed string into the field. When a `schema` is set by user, it sets `null`
    *     for extra fields.</li>
    *     <li>`DROPMALFORMED` : ignores the whole corrupted records.</li>

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/UnivocityParser.scala
@@ -208,7 +208,7 @@ private[csv] class UnivocityParser(
   }
 
   private def convertWithParseMode(
-    input: String)(convert: Array[String] => InternalRow): Option[InternalRow] = {
+      input: String)(convert: Array[String] => InternalRow): Option[InternalRow] = {
     val tokens = parser.parseLine(input)
     if (options.dropMalformed && inputSchema.length != tokens.length) {
       if (numMalformedRecords < options.maxMalformedLogPerPartition) {

diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala
@@ -246,7 +246,7 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo
    *    during parsing.
    *   <ul>
    *     <li>`PERMISSIVE` : sets other fields to `null` when it meets a corrupted record. If users
-   *     set a string-type field named `columnNameOfCorruptRecord` in a user-specified `schema`,
+   *     set a string type field named `columnNameOfCorruptRecord` in a user-specified `schema`,
    *     it puts the malformed string into the field. When a `schema` is set by user, it sets `null`
    *     for extra fields.</li>
    *     <li>`DROPMALFORMED` : ignores the whole corrupted records.</li>

diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -987,7 +987,6 @@ class CSVSuite extends QueryTest with SharedSQLContext with SQLTestUtils {
       Nil)
 
     // We put a `columnNameOfCorruptRecord` field in the middle of a schema
-    new StructType
     val schemaWithCorrField2 = new StructType()
       .add("a", IntegerType)
       .add(columnNameOfCorruptRecord, StringType)