Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

jruby: Implement set_encoding_by_bom #101

Merged
merged 6 commits into from
Sep 25, 2024
Merged
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 97 additions & 8 deletions ext/java/org/jruby/ext/stringio/StringIO.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@

import org.jcodings.Encoding;
import org.jcodings.specific.ASCIIEncoding;
import org.jcodings.specific.UTF16BEEncoding;
import org.jcodings.specific.UTF16LEEncoding;
import org.jcodings.specific.UTF32BEEncoding;
import org.jcodings.specific.UTF32LEEncoding;
import org.jcodings.specific.UTF8Encoding;
import org.jruby.*;
import org.jruby.anno.FrameField;
import org.jruby.anno.JRubyClass;
Expand All @@ -51,8 +56,10 @@
import org.jruby.util.ByteList;
import org.jruby.util.StringSupport;
import org.jruby.util.TypeConverter;
import org.jruby.util.func.ObjectObjectIntFunction;
import org.jruby.util.io.EncodingUtils;
import org.jruby.util.io.Getline;
import org.jruby.util.io.IOEncodable;
import org.jruby.util.io.ModeFlags;
import org.jruby.util.io.OpenFile;

Expand All @@ -62,6 +69,7 @@
import java.util.Arrays;
import java.util.concurrent.atomic.AtomicReferenceFieldUpdater;

import static java.lang.Byte.toUnsignedInt;
import static org.jruby.RubyEnumerator.enumeratorize;
import static org.jruby.runtime.Visibility.PRIVATE;
import static org.jruby.util.RubyStringBuilder.str;
Expand Down Expand Up @@ -93,6 +101,10 @@ static class StringIOData {

private static final AtomicReferenceFieldUpdater<StringIOData, Object> LOCKED_UPDATER = AtomicReferenceFieldUpdater.newUpdater(StringIOData.class, Object.class, "owner");

private static final ThreadLocal<Object> VMODE_VPERM_TL = ThreadLocal.withInitial(() -> EncodingUtils.vmodeVperm(null, null));
private static final ThreadLocal<int[]> FMODE_TL = ThreadLocal.withInitial(() -> new int[]{0});
private static final int[] OFLAGS_UNUSED = new int[]{0};

public static RubyClass createStringIOClass(final Ruby runtime) {
RubyClass stringIOClass = runtime.defineClass(
"StringIO", runtime.getObject(), StringIO::new);
Expand Down Expand Up @@ -298,12 +310,22 @@ private void strioInit(ThreadContext context, int argc, IRubyObject arg0, IRubyO
Encoding encoding = null;

IRubyObject options = ArgsUtil.getOptionsArg(runtime, maybeOptions);
IOEncodable.ConvConfig ioEncodable = new IOEncodable.ConvConfig();
if (!options.isNil()) {
argc--;
IRubyObject encodingOpt = ArgsUtil.extractKeywordArg(context, "encoding", (RubyHash) options);
if (!encodingOpt.isNil()) {
encoding = EncodingUtils.toEncoding(context, encodingOpt);
}

int[] fmode = {0};
Object vmodeAndVpermP = VMODE_VPERM_TL.get();

// switch to per-use oflags if it is ever used in the future
EncodingUtils.extractModeEncoding(context, ioEncodable, vmodeAndVpermP, options, OFLAGS_UNUSED, FMODE_TL.get());

// clear shared vmodeVperm
EncodingUtils.vmode(vmodeAndVpermP, null);
EncodingUtils.vperm(vmodeAndVpermP, null);

ptr.flags = fmode[0];
encoding = ioEncodable.enc;
}

switch (argc) {
Expand All @@ -312,11 +334,11 @@ private void strioInit(ThreadContext context, int argc, IRubyObject arg0, IRubyO
final boolean trunc;
if (mode instanceof RubyFixnum) {
int flags = RubyFixnum.fix2int(mode);
ptr.flags = ModeFlags.getOpenFileFlagsFor(flags);
ptr.flags |= ModeFlags.getOpenFileFlagsFor(flags);
trunc = (flags & ModeFlags.TRUNC) != 0;
} else {
String m = arg1.convertToString().toString();
ptr.flags = OpenFile.ioModestrFmode(runtime, m);
ptr.flags |= OpenFile.ioModestrFmode(runtime, m);
trunc = m.length() > 0 && m.charAt(0) == 'w';
}
string = arg0.convertToString();
Expand All @@ -329,11 +351,11 @@ private void strioInit(ThreadContext context, int argc, IRubyObject arg0, IRubyO
break;
case 1:
string = arg0.convertToString();
ptr.flags = string.isFrozen() ? OpenFile.READABLE : OpenFile.READWRITE;
ptr.flags |= string.isFrozen() ? OpenFile.READABLE : OpenFile.READWRITE;
break;
case 0:
string = RubyString.newEmptyString(runtime, runtime.getDefaultExternalEncoding());
ptr.flags = OpenFile.READWRITE;
ptr.flags |= OpenFile.READWRITE;
break;
default:
// should not be possible
Expand All @@ -344,6 +366,7 @@ private void strioInit(ThreadContext context, int argc, IRubyObject arg0, IRubyO
ptr.enc = encoding;
ptr.pos = 0;
ptr.lineno = 0;
if ((ptr.flags & OpenFile.SETENC_BY_BOM) != 0) setEncodingByBOM(context);
// funky way of shifting readwrite flags into object flags
flags |= (ptr.flags & OpenFile.READWRITE) * (STRIO_READABLE / OpenFile.READABLE);
} finally {
Expand Down Expand Up @@ -1636,6 +1659,72 @@ public IRubyObject set_encoding(ThreadContext context, IRubyObject enc, IRubyObj
return set_encoding(context, enc);
}

@JRubyMethod
public IRubyObject set_encoding_by_bom(ThreadContext context) {
if (setEncodingByBOM(context) == null) return context.nil;

return context.runtime.getEncodingService().convertEncodingToRubyEncoding(ptr.enc);
}

private Encoding setEncodingByBOM(ThreadContext context) {
Encoding enc = detectBOM(context, ptr.string, (ctx, enc2, bomlen) -> {
ptr.pos = bomlen;
if (writable()) {
ptr.string.setEncoding(enc2);
}
return enc2;
});
ptr.enc = enc;
return enc;
}

private static Encoding detectBOM(ThreadContext context, RubyString str, ObjectObjectIntFunction<ThreadContext, Encoding, Encoding> callback) {
int p;
int len;

ByteList byteList = str.getByteList();
byte[] bytes = byteList.unsafeBytes();
p = byteList.begin();
len = byteList.realSize();

if (len < 1) return null;
switch (toUnsignedInt(bytes[p])) {
case 0xEF:
if (len < 3) break;
if (toUnsignedInt(bytes[p + 1]) == 0xBB && toUnsignedInt(bytes[p + 2]) == 0xBF) {
return callback.apply(context, UTF8Encoding.INSTANCE, 3);
}
kou marked this conversation as resolved.
Show resolved Hide resolved
}
break;

case 0xFE:
if (len < 2) break;
if (toUnsignedInt(bytes[p + 1]) == 0xFF) {
return callback.apply(context, UTF16BEEncoding.INSTANCE, 2);
}
break;

case 0xFF:
if (len < 2) break;
if (toUnsignedInt(bytes[p + 1]) == 0xFE) {
if (len >= 4 && toUnsignedInt(bytes[p + 2]) == 0 && toUnsignedInt(bytes[p + 3]) == 0) {
return callback.apply(context, UTF32LEEncoding.INSTANCE, 4);
}
return callback.apply(context, UTF16LEEncoding.INSTANCE, 2);
}
break;

case 0:
if (len < 4) break;
if (toUnsignedInt(bytes[p + 1]) == 0 && toUnsignedInt(bytes[p + 2]) == 0xFE && toUnsignedInt(bytes[p + 3]) == 0xFF) {
return callback.apply(context, UTF32BEEncoding.INSTANCE, 4);
}
break;
}
return callback.apply(context, null, 0);
}


@JRubyMethod
public IRubyObject external_encoding(ThreadContext context) {
return context.runtime.getEncodingService().convertEncodingToRubyEncoding(getEncoding());
Expand Down
Loading