gherkin: (C) Use UTF-16 when wchar_t is 2 bytes in size.

On Windows, wchar_t is 2 bytes wide and holds UTF-16 code units. This means that when wchar_t is only 2 bytes wide, a code point > 0xFFFF read from the UTF-8 source needs to be converted to two UTF-16 surrogates (two wchar_t wide characters).
cucumber · Apr 15, 2017 · 8ea804f · 8ea804f · aslakhellesoy · Apr 23, 2017
1 parent 4b996e4
commit 8ea804f
Show file tree

Hide file tree

Showing 8 changed files with 129 additions and 79 deletions.
diff --git a/gherkin/c/src/Makefile b/gherkin/c/src/Makefile
@@ -42,8 +42,8 @@ UTILITIES_OBJS= \
 	../objs/file_utf8_source.o \
 	../objs/print_utilities.o \
 	../objs/string_utilities.o \
-	../objs/utf8_source.o \
-	../objs/utf8_utilities.o
+	../objs/unicode_utilities.o \
+	../objs/utf8_source.o
 -include $(UTILITIES_OBJS:.o=.d)
 
 PARSER_OBJS= \

diff --git a/gherkin/c/src/file_reader.c b/gherkin/c/src/file_reader.c
@@ -1,12 +1,14 @@
 #include "file_reader.h"
 #include "file_utf8_source.h"
-#include "utf8_utilities.h"
+#include "unicode_utilities.h"
 #include <stdlib.h>
 
 typedef struct FileReader {
     const char* file_name;
 } FileReader;
 
+static void extend_buffer_if_needed(wchar_t** buffer, int* buffer_size, int pos);
+
 FileReader* FileReader_new(const char* const file_name) {
     FileReader* file_reader = (FileReader*)malloc(sizeof(FileReader));
     file_reader->file_name = file_name;
@@ -17,19 +19,24 @@ const wchar_t* FileReader_read(FileReader* file_reader) {
     int buffer_size = 256;
     wchar_t* buffer = (wchar_t*)malloc(buffer_size * sizeof(wchar_t));
     int pos = 0;
-    wchar_t c;
+    long code_point;
     FILE* file = fopen(file_reader->file_name, "r");
     Utf8Source* utf8_source = FileUtf8Source_new(file);
     do {
-        c = Utf8Utilities_read_wchar_from_utf8_source(utf8_source);
-        if (c != WEOF) {
-            buffer[pos++] = c;
-            if (pos >= buffer_size - 1) {
-                buffer_size *= 2;
-                buffer = (wchar_t*)realloc(buffer, buffer_size * sizeof(wchar_t));
+        code_point = UnicodeUtilities_read_code_point_from_utf8_source(utf8_source);
+        if (code_point != WEOF) {
+            if (code_point <= 0xFFFF || sizeof(wchar_t) > 2) {
+                buffer[pos++] = (wchar_t)code_point;
+                extend_buffer_if_needed(&buffer, &buffer_size, pos);
+            } else {
+                Utf16Surrogates surrogates = UnicodeUtilities_get_utf16_surrogates(code_point);
+                buffer[pos++] = surrogates.leading;
+                extend_buffer_if_needed(&buffer, &buffer_size, pos);
+                buffer[pos++] = surrogates.trailing;
+                extend_buffer_if_needed(&buffer, &buffer_size, pos);
             }
         }
-    } while (c != WEOF);
+    } while (code_point != WEOF);
     buffer[pos] = L'\0';
     Utf8Source_delete(utf8_source);
     fclose(file);
@@ -42,3 +49,10 @@ void FileReader_delete(FileReader* file_reader) {
     }
     free((void*)file_reader);
 }
+
+static void extend_buffer_if_needed(wchar_t** buffer, int* buffer_size, int pos) { /* Doubles *buffer_size and reallocates *buffer in place once pos reaches the last element. */
+    if (pos >= *buffer_size - 1) { /* Keeps index pos inside the buffer; FileReader_read later stores the terminating L'\0' at buffer[pos]. */
+        *buffer_size *= 2;
+        *buffer = (wchar_t*)realloc(*buffer, *buffer_size * sizeof(wchar_t)); /* NOTE(review): the realloc result is not checked; on failure *buffer becomes NULL and the old allocation is lost. */
+    }
+}
diff --git a/gherkin/c/src/file_token_scanner.c b/gherkin/c/src/file_token_scanner.c
@@ -2,7 +2,7 @@
 #include "file_utf8_source.h"
 #include "gherkin_line.h"
 #include "string_utilities.h"
-#include "utf8_utilities.h"
+#include "unicode_utilities.h"
 #include <stdlib.h>
 
 typedef struct FileTokenScanner {
@@ -16,6 +16,8 @@ typedef struct FileTokenScanner {
 
 static Token* FileTokenScanner_read(TokenScanner* token_scanner);
 
+static void extend_buffer_if_needed(FileTokenScanner* token_scanner, int pos);
+
 static void FileTokenScanner_delete(TokenScanner* token_scanner);
 
 TokenScanner* FileTokenScanner_new(const char* const file_name) {
@@ -51,30 +53,42 @@ static Token* FileTokenScanner_read(TokenScanner* token_scanner) {
     if (feof(file_token_scanner->file))
         return Token_new(0, file_token_scanner->line);
     int pos = 0;
-    wchar_t c;
+    long code_point;
     do {
-        c = Utf8Utilities_read_wchar_from_utf8_source(file_token_scanner->utf8_source);
-        if (c != WEOF && c != L'\r' && c != L'\n') {
-            file_token_scanner->buffer[pos++] = c;
-            if (pos >= file_token_scanner->buffer_size - 1) {
-                file_token_scanner->buffer_size *= 2;
-                file_token_scanner->buffer = (wchar_t*)realloc(file_token_scanner->buffer, file_token_scanner->buffer_size * sizeof(wchar_t));
+        code_point = UnicodeUtilities_read_code_point_from_utf8_source(file_token_scanner->utf8_source);
+        if (code_point != WEOF && code_point != L'\r' && code_point != L'\n') {
+            if (code_point <= 0xFFFF || sizeof(wchar_t) > 2) {
+                file_token_scanner->buffer[pos++] = (wchar_t)code_point;
+                extend_buffer_if_needed(file_token_scanner, pos);
+            } else {
+                Utf16Surrogates surrogates = UnicodeUtilities_get_utf16_surrogates(code_point);
+                file_token_scanner->buffer[pos++] = surrogates.leading;
+                extend_buffer_if_needed(file_token_scanner, pos);
+                file_token_scanner->buffer[pos++] = surrogates.trailing;
+                extend_buffer_if_needed(file_token_scanner, pos);
             }
         }
-    } while (c != WEOF && c != L'\r' && c != L'\n');
-    if (c == L'\r') {
+    } while (code_point != WEOF && code_point != L'\r' && code_point != L'\n');
+    if (code_point == L'\r') {
         unsigned char next_char = fgetc(file_token_scanner->file);
         if (next_char != L'\n') {
             ungetc(next_char, file_token_scanner->file);
         }
     }
     file_token_scanner->buffer[pos] = L'\0';
     const GherkinLine* line;
-    if (c != WEOF || pos != 0) {
+    if (code_point != WEOF || pos != 0) {
         wchar_t* text = StringUtilities_copy_string_part(file_token_scanner->buffer, pos);
         line = GherkinLine_new(text, file_token_scanner->line);
     }
     else
         line = (GherkinLine*)0;
     return Token_new(line, file_token_scanner->line);
 }
+
+static void extend_buffer_if_needed(FileTokenScanner* file_token_scanner, int pos){ /* Doubles the scanner's line buffer once pos reaches its last element. */
+    if (pos >= file_token_scanner->buffer_size - 1) { /* Keeps index pos inside the buffer; FileTokenScanner_read later stores the terminating L'\0' at buffer[pos]. */
+        file_token_scanner->buffer_size *= 2;
+        file_token_scanner->buffer = (wchar_t*)realloc(file_token_scanner->buffer, file_token_scanner->buffer_size * sizeof(wchar_t)); /* NOTE(review): the realloc result is not checked; on failure the buffer pointer becomes NULL and the old allocation is lost. */
+    }
+}
diff --git a/gherkin/c/src/string_utilities.c b/gherkin/c/src/string_utilities.c
@@ -1,5 +1,5 @@
 #include "string_utilities.h"
-#include "utf8_utilities.h"
+#include "unicode_utilities.h"
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
@@ -32,15 +32,22 @@ wchar_t* StringUtilities_copy_to_wide_string(const char* string) {
     int length = strlen(string);
     wchar_t* copy = (wchar_t*)malloc((length + 1) * sizeof(wchar_t));
     Utf8Source* utf8_source = StringUtf8Source_new(string);
+    int to_index = 0;
     int i;
     for (i = 0; i < length; ++i) {
-        wchar_t c = Utf8Utilities_read_wchar_from_utf8_source(utf8_source);
-        if (c == WEOF) {
+        long code_point = UnicodeUtilities_read_code_point_from_utf8_source(utf8_source);
+        if (code_point == WEOF) {
             break;
         }
-        copy[i] = c;
+        if (code_point <= 0xFFFF || sizeof(wchar_t) > 2) {
+            copy[to_index++] = (wchar_t)code_point;
+        } else {
+            Utf16Surrogates surrogates = UnicodeUtilities_get_utf16_surrogates(code_point);
+            copy[to_index++] = surrogates.leading;
+            copy[to_index++] = surrogates.trailing;
+        }
     }
-    copy[i] = L'\0';
+    copy[to_index] = L'\0';
     Utf8Source_delete(utf8_source);
     return copy;
 }

diff --git a/gherkin/c/src/unicode_utilities.c b/gherkin/c/src/unicode_utilities.c
@@ -0,0 +1,42 @@
+#include "unicode_utilities.h"
+
+long UnicodeUtilities_read_code_point_from_utf8_source(Utf8Source* utf8_source) { /* Decodes one UTF-8 sequence from utf8_source; returns its code point, or WEOF when the lead byte matches none of the patterns below. */
+    unsigned char c = Utf8Source_read(utf8_source); /* Lead byte; it alone decides how many further bytes are consumed. */
+    if (c < 0x80) { /* 1-byte form 0xxxxxxx: the byte is the code point. */
+        return (long)c;
+    }
+    unsigned char c2 = Utf8Source_read(utf8_source); /* NOTE(review): following bytes are not checked for the 10xxxxxx form; only their low 6 bits are used. */
+    long lower_part = (long)(c2 & 0x3F); /* Accumulates 6 payload bits per following byte. */
+    if ((c & 0xE0) == 0xC0) { /* 2-byte form, lead 110xxxxx. */
+        return (((long)(c & 0x1F)) << 6) | lower_part;
+    }
+    c2 = Utf8Source_read(utf8_source);
+    lower_part =  (lower_part << 6) | (long)(c2 & 0x3F);
+    if ((c & 0xF0) == 0xE0) { /* 3-byte form, lead 1110xxxx. */
+        return (((long)(c & 0x0F)) << 12) | lower_part;
+    }
+    c2 = Utf8Source_read(utf8_source);
+    lower_part =  (lower_part << 6) | (long)(c2 & 0x3F);
+    if ((c & 0xF8) == 0xF0) { /* 4-byte form, lead 11110xxx. */
+        return (((long)(c & 0x07)) << 18) | lower_part;
+    }
+    c2 = Utf8Source_read(utf8_source);
+    lower_part =  (lower_part << 6) | (long)(c2 & 0x3F);
+    if ((c & 0xFC) == 0xF8) { /* 5-byte form, lead 111110xx; not part of UTF-8 as restricted by RFC 3629. */
+        return (((long)(c & 0x03)) << 24) | lower_part;
+    }
+    c2 = Utf8Source_read(utf8_source);
+    lower_part =  (lower_part << 6) | (long)(c2 & 0x3F);
+    if ((c & 0xFE) == 0xFC) { /* 6-byte form, lead 1111110x; not part of UTF-8 as restricted by RFC 3629. */
+        return (((long)(c & 0x01)) << 30) | lower_part;
+    }
+    return WEOF; /* Reached for lead bytes 0x80-0xBF, 0xFE and 0xFF, after five further reads. NOTE(review): presumably end of input arrives here as an EOF value truncated to 0xFF; confirm against Utf8Source_read. */
+}
+
+Utf16Surrogates UnicodeUtilities_get_utf16_surrogates(long code_point){ /* Splits code_point into a UTF-16 surrogate pair; the callers in this change only pass values above 0xFFFF. */
+    Utf16Surrogates surrogates;
+    long surrogates_base = code_point - 0x10000; /* Offset into the supplementary range. NOTE(review): no range check; a code_point above 0x10FFFF pushes leading past 0xDBFF. */
+    surrogates.leading = 0xD800 + (surrogates_base >> 10); /* Bits above the low 10 select the leading surrogate. */
+    surrogates.trailing = 0xDC00 + (surrogates_base & 0x3FF); /* Low 10 bits select the trailing surrogate. */
+    return surrogates;
+}
diff --git a/gherkin/c/src/unicode_utilities.h b/gherkin/c/src/unicode_utilities.h
@@ -0,0 +1,25 @@
+#ifndef GHERKIN_UNICODE_UTILITIES_H_
+#define GHERKIN_UNICODE_UTILITIES_H_
+
+#include <wchar.h>
+
+#include "utf8_source.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct Utf16Surrogates { /* Pair of wchar_t values returned by UnicodeUtilities_get_utf16_surrogates. */
+    wchar_t leading; /* First unit of the pair, computed as 0xD800 plus the upper offset bits. */
+    wchar_t trailing; /* Second unit of the pair, computed as 0xDC00 plus the low 10 offset bits. */
+} Utf16Surrogates;
+
+long UnicodeUtilities_read_code_point_from_utf8_source(Utf8Source* utf8_source); /* Reads one UTF-8 sequence from utf8_source and returns its code point, or WEOF when the lead byte is not recognised. */
+
+Utf16Surrogates UnicodeUtilities_get_utf16_surrogates(long code_point); /* Returns the leading/trailing UTF-16 surrogate pair computed from code_point - 0x10000. */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* GHERKIN_UNICODE_UTILITIES_H_ */
diff --git a/gherkin/c/src/utf8_utilities.c b/gherkin/c/src/utf8_utilities.c
diff --git a/gherkin/c/src/utf8_utilities.h b/gherkin/c/src/utf8_utilities.h