apache · tarekbecker · Jun 13, 2015 · Jun 14, 2015 · Jun 15, 2015 · Jun 15, 2015
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeRow.java
@@ -310,7 +310,6 @@ public double getDouble(int i) {
 
   public UTF8String getUTF8String(int i) {
     assertIndexIsValid(i);
-    final UTF8String str = new UTF8String();
     final long offsetToStringSize = getLong(i);
     final int stringSizeInBytes =
       (int) PlatformDependent.UNSAFE.getLong(baseObject, baseOffset + offsetToStringSize);
@@ -322,8 +321,7 @@ public UTF8String getUTF8String(int i) {
       PlatformDependent.BYTE_ARRAY_OFFSET,
       stringSizeInBytes
     );
-    str.set(strBytes);
-    return str;
+    return UTF8String.fromBytes(strBytes);
   }
 
   @Override

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/Cast.scala
@@ -437,17 +437,18 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w
 
+      // fromBytes/fromString are static factories: emit `Type.fromX(...)`, not `Type().fromX(...)`.
       case (BinaryType, StringType) =>
         defineCodeGen (ctx, ev, c =>
-          s"new ${ctx.stringType}().set($c)")
+          s"${ctx.stringType}.fromBytes($c)")
       case (DateType, StringType) =>
         defineCodeGen(ctx, ev, c =>
-          s"""new ${ctx.stringType}().set(
+          s"""${ctx.stringType}.fromString(
                 org.apache.spark.sql.catalyst.util.DateUtils.toString($c))""")
       // Special handling required for timestamps in hive test cases since the toString function
       // does not match the expected output.
       case (TimestampType, StringType) =>
         super.genCode(ctx, ev)
       case (_, StringType) =>
-        defineCodeGen(ctx, ev, c => s"new ${ctx.stringType}().set(String.valueOf($c))")
+        defineCodeGen(ctx, ev, c => s"${ctx.stringType}.fromString(String.valueOf($c))")
 
       // fallback for DecimalType, this must be before other numeric types
       case (_, dt: DecimalType) =>

diff --git a/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -20,7 +20,7 @@
 import java.io.Serializable;
 import java.io.UnsupportedEncodingException;
 import java.util.Arrays;
-import javax.annotation.Nullable;
+import javax.annotation.Nonnull;
 
 import org.apache.spark.unsafe.PlatformDependent;
 
@@ -34,7 +34,7 @@
  */
 public final class UTF8String implements Comparable<UTF8String>, Serializable {
 
-  @Nullable
+  @Nonnull
   private byte[] bytes;
 
   private static int[] bytesOfCodePointInUTF8 = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
@@ -55,22 +55,26 @@ public static UTF8String fromString(String str) {
   /**
    * Updates the UTF8String with String.
    */
-  public UTF8String set(final String str) {
-    try {
-      bytes = str.getBytes("utf-8");
-    } catch (UnsupportedEncodingException e) {
-      // Turn the exception into unchecked so we can find out about it at runtime, but
-      // don't need to add lots of boilerplate code everywhere.
-      PlatformDependent.throwException(e);
+  protected UTF8String set(final String str) {
+    if (str == null) {
+      bytes = new byte[0];
+    } else {
+      try {
+        bytes = str.getBytes("utf-8");
+      } catch (UnsupportedEncodingException e) {
+        // Turn the exception into unchecked so we can find out about it at runtime, but
+        // don't need to add lots of boilerplate code everywhere.
+        PlatformDependent.throwException(e);
+      }
     }
     return this;
   }
 
   /**
    * Updates the UTF8String with byte[], which should be encoded in UTF-8.
    */
-  public UTF8String set(final byte[] bytes) {
-    this.bytes = bytes;
+  protected UTF8String set(final byte[] bytes) {
+    this.bytes = (bytes != null) ? bytes : new byte[0];
     return this;
   }
 
@@ -125,30 +129,37 @@ public UTF8String substring(final int start, final int until) {
   }
 
   public boolean contains(final UTF8String substring) {
+    if (substring == null) return false;
     final byte[] b = substring.getBytes();
     if (b.length == 0) {
       return true;
     }
 
     for (int i = 0; i <= bytes.length - b.length; i++) {
-      // TODO: Avoid copying.
-      if (bytes[i] == b[0] && Arrays.equals(Arrays.copyOfRange(bytes, i, i + b.length), b)) {
+      if (bytes[i] == b[0] && startsWith(b, i)) {
         return true;
       }
     }
     return false;
   }
 
+  /** Returns whether {@code prefix} matches this string's bytes beginning at {@code offset}. */
+  private boolean startsWith(final byte[] prefix, int offset) {
+    if (offset < 0 || prefix.length + offset > bytes.length) {
+      return false;
+    }
+    for (int k = 0; k < prefix.length; k++) {
+      if (bytes[offset + k] != prefix[k]) return false;
+    }
+    return true;
+  }
+
   public boolean startsWith(final UTF8String prefix) {
-    final byte[] b = prefix.getBytes();
-    // TODO: Avoid copying.
-    return b.length <= bytes.length && Arrays.equals(Arrays.copyOfRange(bytes, 0, b.length), b);
+    return prefix != null && startsWith(prefix.getBytes(), 0);
   }
 
   public boolean endsWith(final UTF8String suffix) {
-    final byte[] b = suffix.getBytes();
-    return b.length <= bytes.length &&
-      Arrays.equals(Arrays.copyOfRange(bytes, bytes.length - b.length, bytes.length), b);
+    return suffix != null && startsWith(suffix.getBytes(), bytes.length - suffix.getBytes().length);
   }
 
   public UTF8String toUpperCase() {
@@ -178,6 +189,7 @@ public UTF8String clone() {
 
   @Override
   public int compareTo(final UTF8String other) {
+    if (other == null) return 1;
     final byte[] b = other.getBytes();
     for (int i = 0; i < bytes.length && i < b.length; i++) {
       int res = bytes[i] - b[i];

diff --git a/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -46,6 +46,7 @@ public void basicTest() throws UnsupportedEncodingException {
 
   @Test
   public void contains() {
+    Assert.assertFalse(UTF8String.fromString("hello").contains(null));
     Assert.assertTrue(UTF8String.fromString("hello").contains(UTF8String.fromString("ello")));
     Assert.assertFalse(UTF8String.fromString("hello").contains(UTF8String.fromString("vello")));
     Assert.assertFalse(UTF8String.fromString("hello").contains(UTF8String.fromString("hellooo")));
@@ -57,6 +58,7 @@ public void contains() {
 
   @Test
   public void startsWith() {
+    Assert.assertFalse(UTF8String.fromString("hello").startsWith(null));
     Assert.assertTrue(UTF8String.fromString("hello").startsWith(UTF8String.fromString("hell")));
     Assert.assertFalse(UTF8String.fromString("hello").startsWith(UTF8String.fromString("ell")));
     Assert.assertFalse(UTF8String.fromString("hello").startsWith(UTF8String.fromString("hellooo")));
@@ -68,6 +70,7 @@ public void startsWith() {
 
   @Test
   public void endsWith() {
+    Assert.assertFalse(UTF8String.fromString("hello").endsWith(null));
     Assert.assertTrue(UTF8String.fromString("hello").endsWith(UTF8String.fromString("ello")));
     Assert.assertFalse(UTF8String.fromString("hello").endsWith(UTF8String.fromString("ellov")));
     Assert.assertFalse(UTF8String.fromString("hello").endsWith(UTF8String.fromString("hhhello")));