-
Notifications
You must be signed in to change notification settings - Fork 28.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-8301][SQL] Improve UTF8String substring/startsWith/endsWith/contains performance #6804
Changes from 8 commits
b17909e
79cb55b
d2fb05f
a5f853a
e4530d2
3a0040f
9f17cc8
1c327eb
9ca0473
6d7b068
f5d6b9a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -437,17 +437,17 @@ case class Cast(child: Expression, dataType: DataType) extends UnaryExpression w | |
|
||
case (BinaryType, StringType) => | ||
defineCodeGen (ctx, ev, c => | ||
s"new ${ctx.stringType}().set($c)") | ||
s"${ctx.stringType}().fromBytes($c)") | ||
case (DateType, StringType) => | ||
defineCodeGen(ctx, ev, c => | ||
s"""new ${ctx.stringType}().set( | ||
s"""${ctx.stringType}().fromString( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No |
||
org.apache.spark.sql.catalyst.util.DateUtils.toString($c))""") | ||
// Special handling required for timestamps in hive test cases since the toString function | ||
// does not match the expected output. | ||
case (TimestampType, StringType) => | ||
super.genCode(ctx, ev) | ||
case (_, StringType) => | ||
defineCodeGen(ctx, ev, c => s"new ${ctx.stringType}().set(String.valueOf($c))") | ||
defineCodeGen(ctx, ev, c => s"${ctx.stringType}().fromString(String.valueOf($c))") | ||
|
||
// fallback for DecimalType, this must be before other numeric types | ||
case (_, dt: DecimalType) => | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,7 +20,7 @@ | |
import java.io.Serializable; | ||
import java.io.UnsupportedEncodingException; | ||
import java.util.Arrays; | ||
import javax.annotation.Nullable; | ||
import javax.annotation.Nonnull; | ||
|
||
import org.apache.spark.unsafe.PlatformDependent; | ||
|
||
|
@@ -34,7 +34,7 @@ | |
*/ | ||
public final class UTF8String implements Comparable<UTF8String>, Serializable { | ||
|
||
@Nullable | ||
@Nonnull | ||
private byte[] bytes; | ||
|
||
private static int[] bytesOfCodePointInUTF8 = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, | ||
|
@@ -55,22 +55,26 @@ public static UTF8String fromString(String str) { | |
/** | ||
* Updates the UTF8String with String. | ||
*/ | ||
public UTF8String set(final String str) { | ||
try { | ||
bytes = str.getBytes("utf-8"); | ||
} catch (UnsupportedEncodingException e) { | ||
// Turn the exception into unchecked so we can find out about it at runtime, but | ||
// don't need to add lots of boilerplate code everywhere. | ||
PlatformDependent.throwException(e); | ||
protected UTF8String set(final String str) { | ||
if (str == null) { | ||
bytes = new byte[0]; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This's probably not right, as a null string will be converted into empty string "". There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's safe to remove this check, because There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. okay I am going to remove this |
||
} else { | ||
try { | ||
bytes = str.getBytes("utf-8"); | ||
} catch (UnsupportedEncodingException e) { | ||
// Turn the exception into unchecked so we can find out about it at runtime, but | ||
// don't need to add lots of boilerplate code everywhere. | ||
PlatformDependent.throwException(e); | ||
} | ||
} | ||
return this; | ||
} | ||
|
||
/** | ||
* Updates the UTF8String with byte[], which should be encoded in UTF-8. | ||
*/ | ||
public UTF8String set(final byte[] bytes) { | ||
this.bytes = bytes; | ||
protected UTF8String set(final byte[] bytes) { | ||
this.bytes = (bytes != null) ? bytes : new byte[0]; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will it be better if we throw exception if null passed in? Or leave it as it was. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we don't want to accept There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same to my another comment, this |
||
return this; | ||
} | ||
|
||
|
@@ -125,30 +129,37 @@ public UTF8String substring(final int start, final int until) { | |
} | ||
|
||
public boolean contains(final UTF8String substring) { | ||
if (substring == null) return false; | ||
final byte[] b = substring.getBytes(); | ||
if (b.length == 0) { | ||
return true; | ||
} | ||
|
||
for (int i = 0; i <= bytes.length - b.length; i++) { | ||
// TODO: Avoid copying. | ||
if (bytes[i] == b[0] && Arrays.equals(Arrays.copyOfRange(bytes, i, i + b.length), b)) { | ||
if (bytes[i] == b[0] && startsWith(b, i)) { | ||
return true; | ||
} | ||
} | ||
return false; | ||
} | ||
|
||
private boolean startsWith(final byte[] prefix, int offset) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. At the first sight, I thought the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. or There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is adopted from the original java String API. But yes it's true that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. how about remaining it offsetInBytes? |
||
if (prefix.length + offset > bytes.length || offset < 0) { | ||
return false; | ||
} | ||
int i = 0; | ||
while (i < prefix.length && prefix[i] == bytes[i + offset]) { | ||
i++; | ||
} | ||
return i == prefix.length; | ||
} | ||
|
||
public boolean startsWith(final UTF8String prefix) { | ||
final byte[] b = prefix.getBytes(); | ||
// TODO: Avoid copying. | ||
return b.length <= bytes.length && Arrays.equals(Arrays.copyOfRange(bytes, 0, b.length), b); | ||
return prefix != null && startsWith(prefix.getBytes(), 0); | ||
} | ||
|
||
public boolean endsWith(final UTF8String suffix) { | ||
final byte[] b = suffix.getBytes(); | ||
return b.length <= bytes.length && | ||
Arrays.equals(Arrays.copyOfRange(bytes, bytes.length - b.length, bytes.length), b); | ||
return suffix != null && startsWith(suffix.getBytes(), bytes.length - suffix.getBytes().length); | ||
} | ||
|
||
public UTF8String toUpperCase() { | ||
|
@@ -178,6 +189,7 @@ public UTF8String clone() { | |
|
||
@Override | ||
public int compareTo(final UTF8String other) { | ||
if (other == null) return 1; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we should support ordering a column with null in it, even Scala can't. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it should be able to sort columns with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The expression |
||
final byte[] b = other.getBytes(); | ||
for (int i = 0; i < bytes.length && i < b.length; i++) { | ||
int res = bytes[i] - b[i]; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -46,6 +46,7 @@ public void basicTest() throws UnsupportedEncodingException { | |
|
||
@Test | ||
public void contains() { | ||
Assert.assertFalse(UTF8String.fromString("hello").contains(null)); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I was trying the "a".contains(null) in java, and it throws exception like:
Should we follow the same pattern for |
||
Assert.assertTrue(UTF8String.fromString("hello").contains(UTF8String.fromString("ello"))); | ||
Assert.assertFalse(UTF8String.fromString("hello").contains(UTF8String.fromString("vello"))); | ||
Assert.assertFalse(UTF8String.fromString("hello").contains(UTF8String.fromString("hellooo"))); | ||
|
@@ -57,6 +58,7 @@ public void contains() { | |
|
||
@Test | ||
public void startsWith() { | ||
Assert.assertFalse(UTF8String.fromString("hello").startsWith(null)); | ||
Assert.assertTrue(UTF8String.fromString("hello").startsWith(UTF8String.fromString("hell"))); | ||
Assert.assertFalse(UTF8String.fromString("hello").startsWith(UTF8String.fromString("ell"))); | ||
Assert.assertFalse(UTF8String.fromString("hello").startsWith(UTF8String.fromString("hellooo"))); | ||
|
@@ -68,6 +70,7 @@ public void startsWith() { | |
|
||
@Test | ||
public void endsWith() { | ||
Assert.assertFalse(UTF8String.fromString("hello").endsWith(null)); | ||
Assert.assertTrue(UTF8String.fromString("hello").endsWith(UTF8String.fromString("ello"))); | ||
Assert.assertFalse(UTF8String.fromString("hello").endsWith(UTF8String.fromString("ellov"))); | ||
Assert.assertFalse(UTF8String.fromString("hello").endsWith(UTF8String.fromString("hhhello"))); | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No
()
here