Skip to content
This repository has been archived by the owner on Dec 15, 2022. It is now read-only.

Fix two issues in utf8 <-> utf16 offset & lengths conversions #49

Merged
merged 2 commits into from
Jan 29, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions spec/onig-scanner-spec.coffee
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,10 @@ describe "OnigScanner", ->
match = scanner.findNextMatchSync('ab…cde21', 5)
expect(match.index).toBe 1

scanner = new OnigScanner(['\"'])
match = scanner.findNextMatchSync('{"…": 1}', 1)
expect(match.captureIndices).toEqual [{index: 0, start: 1, end: 2, length: 1}]

describe "when the string searched contains surrogate pairs", ->
it "counts paired characters as 2 characters in both arguments and return values", ->
scanner = new OnigScanner(["Y", "X"])
Expand Down Expand Up @@ -53,6 +57,12 @@ describe "OnigScanner", ->
expect(scanner.findNextMatchSync('a1', false).index).toBe 0
expect(scanner.findNextMatchSync('a1', 'food').index).toBe 0

describe "when the regular expression contains double byte characters", ->
  # The pattern and the start of the searched text are the same seven
  # Cyrillic letters; the single capture is expected to run from 0 to 7.
  it "returns the correct match length", ->
    cyrillicScanner = new OnigScanner(["Возврат"])
    result = cyrillicScanner.findNextMatchSync('Возврат long_var_name;', 0)
    expectedCaptures = [{index: 0, start: 0, end: 7, length: 7}]
    expect(result.captureIndices).toEqual(expectedCaptures)

describe "::findNextMatch", ->
matchCallback = null

Expand Down
6 changes: 3 additions & 3 deletions src/onig-scanner-worker.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,13 @@ void OnigScannerWorker::HandleOKCallback() {
Local<Array> captures = Nan::New<Array>(resultCount);
for (int index = 0; index < resultCount; index++) {
int captureStart = source->ConvertUtf8OffsetToUtf16(bestResult->LocationAt(index));
int captureLength = source->ConvertUnicodeLengthToUtf16(captureStart, bestResult->LengthAt(index));
int captureEnd = source->ConvertUtf8OffsetToUtf16(bestResult->LocationAt(index) + bestResult->LengthAt(index));

Local<Object> capture = Nan::New<Object>();
capture->Set(Nan::New<String>("index").ToLocalChecked(), Nan::New<Number>(index));
capture->Set(Nan::New<String>("start").ToLocalChecked(), Nan::New<Number>(captureStart));
capture->Set(Nan::New<String>("end").ToLocalChecked(), Nan::New<Number>(captureStart + captureLength));
capture->Set(Nan::New<String>("length").ToLocalChecked(), Nan::New<Number>(captureLength));
capture->Set(Nan::New<String>("end").ToLocalChecked(), Nan::New<Number>(captureEnd));
capture->Set(Nan::New<String>("length").ToLocalChecked(), Nan::New<Number>(captureEnd - captureStart));
captures->Set(index, capture);
}
result->Set(Nan::New<String>("captureIndices").ToLocalChecked(), captures);
Expand Down
6 changes: 3 additions & 3 deletions src/onig-scanner.cc
Original file line number Diff line number Diff line change
Expand Up @@ -102,13 +102,13 @@ Local<Value> OnigScanner::CaptureIndicesForMatch(OnigResult* result, OnigString*

for (int index = 0; index < resultCount; index++) {
int captureStart = source->ConvertUtf8OffsetToUtf16(result->LocationAt(index));
int captureLength = source->ConvertUnicodeLengthToUtf16(captureStart, result->LengthAt(index));
int captureEnd = source->ConvertUtf8OffsetToUtf16(result->LocationAt(index) + result->LengthAt(index));

Local<Object> capture = Nan::New<Object>();
capture->Set(Nan::New<String>("index").ToLocalChecked(), Nan::New<Number>(index));
capture->Set(Nan::New<String>("start").ToLocalChecked(), Nan::New<Number>(captureStart));
capture->Set(Nan::New<String>("end").ToLocalChecked(), Nan::New<Number>(captureStart + captureLength));
capture->Set(Nan::New<String>("length").ToLocalChecked(), Nan::New<Number>(captureLength));
capture->Set(Nan::New<String>("end").ToLocalChecked(), Nan::New<Number>(captureEnd));
capture->Set(Nan::New<String>("length").ToLocalChecked(), Nan::New<Number>(captureEnd - captureStart));
captures->Set(index, capture);
}

Expand Down
56 changes: 21 additions & 35 deletions src/onig-string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,27 +18,30 @@ NAN_METHOD(OnigString::New) {
}

OnigString::OnigString(Local<String> value)
: utf8Value(value) {
: utf8Value(value), utf8_length_(utf8Value.length()) {
static int idGenerator = 0;
uniqueId_ = ++idGenerator;

hasMultiByteChars = (value->Length() != value->Utf8Length());
hasMultiByteChars = (value->Length() != utf8_length_);

if (hasMultiByteChars) {
String::Value utf16Value(value);
utf16_length_ = utf16Value.length();

utf16OffsetToUtf8 = new int[utf16Value.length()];
utf16OffsetIsCodePointEnd = new bool[utf16Value.length()];
utf8OffsetToUtf16 = new int[utf8Value.length()];
utf16OffsetToUtf8 = new int[utf16_length_ + 1];
utf16OffsetToUtf8[utf16_length_] = utf8_length_;

utf8OffsetToUtf16 = new int[utf8_length_ + 1];
utf8OffsetToUtf16[utf8_length_] = utf16_length_;

// http://stackoverflow.com/a/148766
unsigned int codepoint = 0;
int i16_codepoint_start = 0;
int i8 = 0;
for (int i16 = 0, len = utf16Value.length(); i16 < len; i16++) {
for (int i16 = 0, len = utf16_length_; i16 < len; i16++) {
uint16_t in = (*utf16Value)[i16];

utf16OffsetToUtf8[i16] = i8;
utf16OffsetIsCodePointEnd[i16] = false;

if (in >= 0xd800 && in <= 0xdbff) {
codepoint = ((in - 0xd800) << 10) + 0x10000;
Expand All @@ -49,33 +52,33 @@ OnigString::OnigString(Local<String> value)
codepoint = in;
}

utf16OffsetIsCodePointEnd[i16] = true;
if (codepoint <= 0x7f) {
utf8OffsetToUtf16[i8] = i16;
utf8OffsetToUtf16[i8] = i16_codepoint_start;
i8++;
} else if (codepoint <= 0x7ff) {
utf8OffsetToUtf16[i8] = i16;
utf8OffsetToUtf16[i8] = i16_codepoint_start;
i8++;
utf8OffsetToUtf16[i8] = i16;
utf8OffsetToUtf16[i8] = i16_codepoint_start;
i8++;
} else if (codepoint <= 0xffff) {
utf8OffsetToUtf16[i8] = i16 - 1;
utf8OffsetToUtf16[i8] = i16_codepoint_start;
i8++;
utf8OffsetToUtf16[i8] = i16 - 1;
utf8OffsetToUtf16[i8] = i16_codepoint_start;
i8++;
utf8OffsetToUtf16[i8] = i16 - 1;
utf8OffsetToUtf16[i8] = i16_codepoint_start;
i8++;
} else {
utf8OffsetToUtf16[i8] = i16 - 1;
utf8OffsetToUtf16[i8] = i16_codepoint_start;
i8++;
utf8OffsetToUtf16[i8] = i16 - 1;
utf8OffsetToUtf16[i8] = i16_codepoint_start;
i8++;
utf8OffsetToUtf16[i8] = i16 - 1;
utf8OffsetToUtf16[i8] = i16_codepoint_start;
i8++;
utf8OffsetToUtf16[i8] = i16 - 1;
utf8OffsetToUtf16[i8] = i16_codepoint_start;
i8++;
}
codepoint = 0;
i16_codepoint_start = i16 + 1;
}
}
}
Expand All @@ -84,7 +87,6 @@ OnigString::OnigString(Local<String> value)
// Releases the offset lookup tables. They are only allocated when the string
// was flagged as containing multi-byte characters, so otherwise there is
// nothing to free.
OnigString::~OnigString() {
  if (!hasMultiByteChars) {
    return;
  }

  delete[] utf16OffsetToUtf8;
  delete[] utf16OffsetIsCodePointEnd;
  delete[] utf8OffsetToUtf16;
}
Expand All @@ -102,19 +104,3 @@ int OnigString::ConvertUtf16OffsetToUtf8(int utf16Offset) {
}
return utf16Offset;
}

// Converts a length counted in code points into a length counted in UTF-16
// code units, measured from the UTF-16 position `utf16Offset`.
//
// Walks forward from `utf16Offset`, counting every code unit visited, until
// `codePointLength` units flagged as the end of a code point have been seen.
// Returns the number of code units visited. When the string has no
// multi-byte characters, `codePointLength` is returned unchanged.
int OnigString::ConvertUnicodeLengthToUtf16(int utf16Offset, int codePointLength) {
  if (!hasMultiByteChars) {
    return codePointLength;
  }

  int unitCount = 0;
  for (int remaining = codePointLength; remaining > 0; unitCount++) {
    // Only the unit that completes a code point consumes one of the
    // requested code points; every visited unit is counted.
    if (utf16OffsetIsCodePointEnd[utf16Offset + unitCount]) {
      remaining--;
    }
  }
  return unitCount;
}
8 changes: 5 additions & 3 deletions src/onig-string.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,21 +20,23 @@ class OnigString : public node::ObjectWrap {
int uniqueId() { return uniqueId_; }

const char* utf8_value() const { return *utf8Value; }
size_t utf8_length() const { return utf8Value.length(); }
size_t utf8_length() const { return utf8_length_; }

int ConvertUtf8OffsetToUtf16(int utf8Offset);
int ConvertUtf16OffsetToUtf8(int utf16Offset);
int ConvertUnicodeLengthToUtf16(int utf16Offset, int codePointLength);

private:
static NAN_METHOD(New);

int uniqueId_;
String::Utf8Value utf8Value;
size_t utf8_length_;
bool hasMultiByteChars;

// - the following members are used only if hasMultiByteChars is true
size_t utf16_length_;
int *utf16OffsetToUtf8;
int *utf8OffsetToUtf16;
bool *utf16OffsetIsCodePointEnd;
};

#endif // SRC_ONIG_STRING_H_