Skip to content

Commit

Permalink
[SPARK-8301] [SQL] Improve UTF8String substring/startsWith/endsWith/c…
Browse files Browse the repository at this point in the history
…ontains performance

Jira: https://issues.apache.org/jira/browse/SPARK-8301

Added the private method startsWith(prefix, offset) to implement startsWith, endsWith and contains without copying the array.

I hope that the component tag [SQL] is still correct; I copied it from the Jira ticket.

Author: Tarek Auel <[email protected]>
Author: Tarek Auel <[email protected]>

Closes apache#6804 from tarekauel/SPARK-8301 and squashes the following commits:

f5d6b9a [Tarek Auel] fixed parentheses and annotation
6d7b068 [Tarek Auel] [SPARK-8301] removed null checks
9ca0473 [Tarek Auel] [SPARK-8301] removed null checks
1c327eb [Tarek Auel] [SPARK-8301] removed new
9f17cc8 [Tarek Auel] [SPARK-8301] fixed conversion byte to string in codegen
3a0040f [Tarek Auel] [SPARK-8301] changed call of UTF8String.set to UTF8String.from
e4530d2 [Tarek Auel] [SPARK-8301] changed call of UTF8String.set to UTF8String.from
a5f853a [Tarek Auel] [SPARK-8301] changed visibility of set to protected. Changed annotation of bytes from Nullable to Nonnull
d2fb05f [Tarek Auel] [SPARK-8301] added additional null checks
79cb55b [Tarek Auel] [SPARK-8301] null check. Added test cases for null check.
b17909e [Tarek Auel] [SPARK-8301] removed unnecessary copying of UTF8String. Added a private function startsWith(prefix, offset) to implement the check for startsWith, endsWith and contains.
  • Loading branch information
tarekbecker authored and Davies Liu committed Jun 21, 2015
1 parent 004f573 commit 41ab285
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,6 @@ public double getDouble(int i) {

public UTF8String getUTF8String(int i) {
assertIndexIsValid(i);
final UTF8String str = new UTF8String();
final long offsetToStringSize = getLong(i);
final int stringSizeInBytes =
(int) PlatformDependent.UNSAFE.getLong(baseObject, baseOffset + offsetToStringSize);
Expand All @@ -324,8 +323,7 @@ public UTF8String getUTF8String(int i) {
PlatformDependent.BYTE_ARRAY_OFFSET,
stringSizeInBytes
);
str.set(strBytes);
return str;
return UTF8String.fromBytes(strBytes);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -438,17 +438,17 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w

case (BinaryType, StringType) =>
defineCodeGen (ctx, ev, c =>
s"new ${ctx.stringType}().set($c)")
s"${ctx.stringType}.fromBytes($c)")
case (DateType, StringType) =>
defineCodeGen(ctx, ev, c =>
s"""new ${ctx.stringType}().set(
s"""${ctx.stringType}.fromString(
org.apache.spark.sql.catalyst.util.DateUtils.toString($c))""")
// Special handling required for timestamps in hive test cases since the toString function
// does not match the expected output.
case (TimestampType, StringType) =>
super.genCode(ctx, ev)
case (_, StringType) =>
defineCodeGen(ctx, ev, c => s"new ${ctx.stringType}().set(String.valueOf($c))")
defineCodeGen(ctx, ev, c => s"${ctx.stringType}.fromString(String.valueOf($c))")

// fallback for DecimalType, this must be before other numeric types
case (_, dt: DecimalType) =>
Expand Down
30 changes: 18 additions & 12 deletions unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.util.Arrays;
import javax.annotation.Nullable;
import javax.annotation.Nonnull;

import org.apache.spark.unsafe.PlatformDependent;

Expand All @@ -34,7 +34,7 @@
*/
public final class UTF8String implements Comparable<UTF8String>, Serializable {

@Nullable
@Nonnull
private byte[] bytes;

private static int[] bytesOfCodePointInUTF8 = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
Expand All @@ -55,7 +55,7 @@ public static UTF8String fromString(String str) {
/**
* Updates the UTF8String with String.
*/
public UTF8String set(final String str) {
protected UTF8String set(final String str) {
try {
bytes = str.getBytes("utf-8");
} catch (UnsupportedEncodingException e) {
Expand All @@ -69,7 +69,7 @@ public UTF8String set(final String str) {
/**
* Updates the UTF8String with byte[], which should be encoded in UTF-8.
*/
public UTF8String set(final byte[] bytes) {
protected UTF8String set(final byte[] bytes) {
this.bytes = bytes;
return this;
}
Expand Down Expand Up @@ -131,24 +131,30 @@ public boolean contains(final UTF8String substring) {
}

for (int i = 0; i <= bytes.length - b.length; i++) {
// TODO: Avoid copying.
if (bytes[i] == b[0] && Arrays.equals(Arrays.copyOfRange(bytes, i, i + b.length), b)) {
if (bytes[i] == b[0] && startsWith(b, i)) {
return true;
}
}
return false;
}

/**
 * Tests whether {@code prefix} occurs in this string's byte array beginning at
 * {@code offsetInBytes}. The comparison is done in place, byte by byte, so no
 * temporary array is allocated.
 *
 * @param prefix the bytes to look for
 * @param offsetInBytes index into this string's bytes at which matching starts
 * @return {@code true} if every byte of {@code prefix} equals the corresponding
 *         byte starting at {@code offsetInBytes}; {@code false} if the offset is
 *         negative, the prefix would run past the end, or any byte differs
 */
private boolean startsWith(final byte[] prefix, int offsetInBytes) {
  // Reject windows that fall outside the backing array before touching any byte.
  if (prefix.length + offsetInBytes > bytes.length || offsetInBytes < 0) {
    return false;
  }
  for (int k = 0; k < prefix.length; k++) {
    if (prefix[k] != bytes[k + offsetInBytes]) {
      // First mismatch settles the answer.
      return false;
    }
  }
  return true;
}

public boolean startsWith(final UTF8String prefix) {
final byte[] b = prefix.getBytes();
// TODO: Avoid copying.
return b.length <= bytes.length && Arrays.equals(Arrays.copyOfRange(bytes, 0, b.length), b);
return startsWith(prefix.getBytes(), 0);
}

public boolean endsWith(final UTF8String suffix) {
final byte[] b = suffix.getBytes();
return b.length <= bytes.length &&
Arrays.equals(Arrays.copyOfRange(bytes, bytes.length - b.length, bytes.length), b);
return startsWith(suffix.getBytes(), bytes.length - suffix.getBytes().length);
}

public UTF8String toUpperCase() {
Expand Down

0 comments on commit 41ab285

Please sign in to comment.