-
-
Notifications
You must be signed in to change notification settings - Fork 694
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
gherkin: (C) Use UTF-16 when wchar_t is 2 bytes in size.
On Windows, wchar_t is 2 bytes wide and holds UTF-16 code units. This means that when wchar_t is only 2 bytes wide, a code point > 0xFFFF read from the UTF-8 source needs to be converted into two UTF-16 surrogates (two wchar_t wide characters).
- Loading branch information
1 parent
4b996e4
commit 8ea804f
Showing
8 changed files
with
129 additions
and
79 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
#include "unicode_utilities.h" | ||
|
||
/**
 * Decode one code point from a UTF-8 byte source.
 *
 * The lead byte is classified first to determine how many continuation bytes
 * follow; exactly that many bytes are then read and the low six bits of each
 * are appended to the payload bits of the lead byte. Sequences of 1 to 6
 * bytes are accepted (the 5- and 6-byte forms are kept for compatibility with
 * the previous behaviour of this function).
 *
 * A byte that cannot start a sequence (0x80-0xBF, 0xFE, 0xFF) yields WEOF
 * without reading any further bytes from the source, so one bad byte does not
 * swallow the bytes that follow it.
 *
 * Continuation bytes are not checked for the 10xxxxxx pattern; only their low
 * six bits are used.
 *
 * @param utf8_source source the bytes are obtained from via Utf8Source_read().
 * @return the decoded code point, or WEOF (converted to long) when the lead
 *         byte is not a valid start of a sequence.
 */
long UnicodeUtilities_read_code_point_from_utf8_source(Utf8Source* utf8_source) {
    unsigned char lead = Utf8Source_read(utf8_source);
    long code_point;
    int continuation_count;
    int i;

    /* Single-byte (ASCII) sequence: the byte is the code point. */
    if (lead < 0x80) {
        return (long)lead;
    }

    /* The number of leading 1 bits in the lead byte gives the sequence length;
     * the remaining low bits are the most significant payload bits. */
    if ((lead & 0xE0) == 0xC0) {
        code_point = (long)(lead & 0x1F);
        continuation_count = 1;
    } else if ((lead & 0xF0) == 0xE0) {
        code_point = (long)(lead & 0x0F);
        continuation_count = 2;
    } else if ((lead & 0xF8) == 0xF0) {
        code_point = (long)(lead & 0x07);
        continuation_count = 3;
    } else if ((lead & 0xFC) == 0xF8) {
        code_point = (long)(lead & 0x03);
        continuation_count = 4;
    } else if ((lead & 0xFE) == 0xFC) {
        code_point = (long)(lead & 0x01);
        continuation_count = 5;
    } else {
        /* Not a lead byte: report failure before consuming anything else. */
        return WEOF;
    }

    /* Each continuation byte contributes its low six bits. */
    for (i = 0; i < continuation_count; ++i) {
        unsigned char next = Utf8Source_read(utf8_source);
        code_point = (code_point << 6) | (long)(next & 0x3F);
    }
    return code_point;
}
|
||
Utf16Surrogates UnicodeUtilities_get_utf16_surrogates(long code_point){ | ||
Utf16Surrogates surrogates; | ||
long surrogates_base = code_point - 0x10000; | ||
surrogates.leading = 0xD800 + (surrogates_base >> 10); | ||
surrogates.trailing = 0xDC00 + (surrogates_base & 0x3FF); | ||
return surrogates; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
#ifndef GHERKIN_UNICODE_UTILITIES_H_ | ||
#define GHERKIN_UNICODE_UTILITIES_H_ | ||
|
||
#include <wchar.h> | ||
|
||
#include "utf8_source.h" | ||
|
||
#ifdef __cplusplus | ||
extern "C" { | ||
#endif | ||
|
||
/**
 * A pair of UTF-16 code units that together represent one code point
 * above 0xFFFF.
 */
typedef struct Utf16Surrogates {
    /* First code unit of the pair (leading surrogate). */
    wchar_t leading;
    /* Second code unit of the pair (trailing surrogate). */
    wchar_t trailing;
} Utf16Surrogates;
|
||
/**
 * Read the bytes of one UTF-8 encoded character from utf8_source and return
 * the decoded code point.
 *
 * @param utf8_source source the bytes are read from.
 * @return the decoded code point, or WEOF (converted to long) when the first
 *         byte read is not a valid UTF-8 lead byte.
 */
long UnicodeUtilities_read_code_point_from_utf8_source(Utf8Source* utf8_source); | ||
|
||
/**
 * Convert a code point into its UTF-16 leading/trailing surrogate pair.
 *
 * NOTE(review): the definition performs no range check; presumably callers
 * only pass code points above 0xFFFF - confirm against the call sites.
 *
 * @param code_point the code point to convert.
 * @return the surrogate pair for code_point.
 */
Utf16Surrogates UnicodeUtilities_get_utf16_surrogates(long code_point); | ||
|
||
#ifdef __cplusplus | ||
} | ||
#endif | ||
|
||
#endif /* GHERKIN_UNICODE_UTILITIES_H_ */ |
This file was deleted.
Oops, something went wrong.
This file was deleted.
Oops, something went wrong.
8ea804f
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi @brasmusson - is this meant to address #168? Is it still WIP, or shall I merge to master, sync to the gherkin-c subrepo and see if the build passes?
8ea804f
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
#168 points out one problem on Windows: that Windows does not use UTF-8 by default. This commit fixes another problem that occurs with most Windows compilers (i686-w64-mingw32-gcc being the exception). I still consider it WIP; I'm working on fixing all the Windows problems - a PR is coming, probably this week.