Skip to content

Commit

Permalink
Add md5 function
Browse files Browse the repository at this point in the history
  • Loading branch information
qiansl127 committed Jun 17, 2015
1 parent 0b8c8fd commit c166167
Show file tree
Hide file tree
Showing 5 changed files with 117 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,9 @@ object FunctionRegistry {
expression[ToDegrees]("degrees"),
expression[ToRadians]("radians"),

// misc functions
expression[Md5]("md5"),

// aggregate functions
expression[Average]("avg"),
expression[Count]("count"),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.catalyst.expressions

import java.security.MessageDigest

import org.apache.commons.codec.digest.DigestUtils
import org.apache.spark.sql.types.{BinaryType, StringType, DataType}
import org.apache.spark.unsafe.types.UTF8String

/**
* A function that calculates an MD5 128-bit checksum for the string or binary.
* Defined for String and Binary types.
*/
case class Md5(child: Expression)
extends UnaryExpression with ExpectsInputTypes {

override def dataType: DataType = StringType

override def expectedChildTypes: Seq[DataType] =
if (child.dataType == BinaryType) Seq(BinaryType) else Seq(StringType)

override def children: Seq[Expression] = child :: Nil

override def eval(input: Row): Any = {
val value = child.eval(input)
if (value == null) {
null
} else if (child.dataType == BinaryType) {
UTF8String.fromString(DigestUtils.md5Hex(value.asInstanceOf[Array[Byte]]))
} else {
UTF8String.fromString(DigestUtils.md5Hex(value.asInstanceOf[UTF8String].getBytes))
}
}

override def toString: String = s"md5($child)"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.dsl.expressions._

class MiscFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {

test("md5") {
val s1 = 'a.string.at(0)
val s2 = 'a.binary.at(0)
checkEvaluation(Md5(s1), "902fbdd2b1df0c4f70b4a5d23525e932", create_row("ABC"))
checkEvaluation(Md5(s2), "6ac1e56bc78f031059be7be854522c4c", create_row(Array[Byte](1,2,3,4,5,6)))
}

}
19 changes: 19 additions & 0 deletions sql/core/src/main/scala/org/apache/spark/sql/functions.scala
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ import org.apache.spark.util.Utils
* @groupname sort_funcs Sorting functions
* @groupname normal_funcs Non-aggregate functions
* @groupname math_funcs Math functions
* @groupname misc_funcs Misc functions
* @groupname window_funcs Window functions
* @groupname string_funcs String functions
* @groupname Ungrouped Support functions for DataFrames.
Expand Down Expand Up @@ -1334,6 +1335,24 @@ object functions {
*/
def toRadians(columnName: String): Column = toRadians(Column(columnName))

//////////////////////////////////////////////////////////////////////////////////////////////
// Misc functions
//////////////////////////////////////////////////////////////////////////////////////////////

/**
* Calculates an MD5 128-bit checksum for the string or binary
* @group misc_funcs
* @since 1.5.0
*/
def md5(e: Column): Column = Md5(e.expr)

/**
* Calculates an MD5 128-bit checksum for the string or binary
* @group misc_funcs
* @since 1.5.0
*/
def md5(columnName: String): Column = md5(Column(columnName))

//////////////////////////////////////////////////////////////////////////////////////////////
// String functions
//////////////////////////////////////////////////////////////////////////////////////////////
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,17 @@ class DataFrameFunctionsSuite extends QueryTest {
Row("x", "y", null))
}

test("misc md5 function") {
val df = Seq(("ABC", Array[Byte](1, 2, 3, 4, 5, 6))).toDF("a", "b")
checkAnswer(
df.select(md5($"a"), md5("b")),
Row("902fbdd2b1df0c4f70b4a5d23525e932", "6ac1e56bc78f031059be7be854522c4c"))

checkAnswer(
df.selectExpr("md5(a)", "md5(b)"),
Row("902fbdd2b1df0c4f70b4a5d23525e932", "6ac1e56bc78f031059be7be854522c4c"))
}

test("string length function") {
checkAnswer(
nullStrings.select(strlen($"s"), strlen("s")),
Expand Down

0 comments on commit c166167

Please sign in to comment.