Skip to content

Commit

Permalink
gherkin: (C) Use UTF-16 when wchar_t is of 2 bytes size.
Browse files Browse the repository at this point in the history
On Windows, wchar_t is 2 bytes in size and holds UTF-16. This means that
when a code point is > 0xFFFF (and wchar_t is only 2 bytes in size),
the code point read from the UTF-8 source needs to be converted
to two UTF-16 surrogates (wchar_t wide characters).
  • Loading branch information
brasmusson committed Apr 15, 2017
1 parent 4b996e4 commit 8ea804f
Show file tree
Hide file tree
Showing 8 changed files with 129 additions and 79 deletions.
4 changes: 2 additions & 2 deletions gherkin/c/src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ UTILITIES_OBJS= \
../objs/file_utf8_source.o \
../objs/print_utilities.o \
../objs/string_utilities.o \
../objs/utf8_source.o \
../objs/utf8_utilities.o
../objs/unicode_utilities.o \
../objs/utf8_source.o
-include $(UTILITIES_OBJS:.o=.d)

PARSER_OBJS= \
Expand Down
32 changes: 23 additions & 9 deletions gherkin/c/src/file_reader.c
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
#include "file_reader.h"
#include "file_utf8_source.h"
#include "utf8_utilities.h"
#include "unicode_utilities.h"
#include <stdlib.h>

/* State of a file reader: only the name of the file to be read. */
typedef struct FileReader {
/* Stored as passed to FileReader_new(); handed to fopen() by FileReader_read(). */
const char* file_name;
} FileReader;

static void extend_buffer_if_needed(wchar_t** buffer, int* buffer_size, int pos);

FileReader* FileReader_new(const char* const file_name) {
FileReader* file_reader = (FileReader*)malloc(sizeof(FileReader));
file_reader->file_name = file_name;
Expand All @@ -17,19 +19,24 @@ const wchar_t* FileReader_read(FileReader* file_reader) {
int buffer_size = 256;
wchar_t* buffer = (wchar_t*)malloc(buffer_size * sizeof(wchar_t));
int pos = 0;
wchar_t c;
long code_point;
FILE* file = fopen(file_reader->file_name, "r");
Utf8Source* utf8_source = FileUtf8Source_new(file);
do {
c = Utf8Utilities_read_wchar_from_utf8_source(utf8_source);
if (c != WEOF) {
buffer[pos++] = c;
if (pos >= buffer_size - 1) {
buffer_size *= 2;
buffer = (wchar_t*)realloc(buffer, buffer_size * sizeof(wchar_t));
code_point = UnicodeUtilities_read_code_point_from_utf8_source(utf8_source);
if (code_point != WEOF) {
if (code_point <= 0xFFFF || sizeof(wchar_t) > 2) {
buffer[pos++] = (wchar_t)code_point;
extend_buffer_if_needed(&buffer, &buffer_size, pos);
} else {
Utf16Surrogates surrogates = UnicodeUtilities_get_utf16_surrogates(code_point);
buffer[pos++] = surrogates.leading;
extend_buffer_if_needed(&buffer, &buffer_size, pos);
buffer[pos++] = surrogates.trailing;
extend_buffer_if_needed(&buffer, &buffer_size, pos);
}
}
} while (c != WEOF);
} while (code_point != WEOF);
buffer[pos] = L'\0';
Utf8Source_delete(utf8_source);
fclose(file);
Expand All @@ -42,3 +49,10 @@ void FileReader_delete(FileReader* file_reader) {
}
free((void*)file_reader);
}

/*
 * Grows a wide-character buffer when the write position has reached its
 * last slot.
 *
 * buffer       in/out: the heap buffer; replaced by the realloc() result.
 * buffer_size  in/out: capacity in wchar_t elements; doubled on growth.
 * pos          index of the next element to be written.
 *
 * Nothing happens while pos is below *buffer_size - 1, which keeps one slot
 * free (the caller writes a terminating L'\0' at pos). The result of
 * realloc() is not checked.
 */
static void extend_buffer_if_needed(wchar_t** buffer, int* buffer_size, int pos) {
  if (pos < *buffer_size - 1) {
    return;
  }
  *buffer_size *= 2;
  wchar_t* grown = (wchar_t*)realloc(*buffer, *buffer_size * sizeof(wchar_t));
  *buffer = grown;
}
36 changes: 25 additions & 11 deletions gherkin/c/src/file_token_scanner.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
#include "file_utf8_source.h"
#include "gherkin_line.h"
#include "string_utilities.h"
#include "utf8_utilities.h"
#include "unicode_utilities.h"
#include <stdlib.h>

typedef struct FileTokenScanner {
Expand All @@ -16,6 +16,8 @@ typedef struct FileTokenScanner {

static Token* FileTokenScanner_read(TokenScanner* token_scanner);

static void extend_buffer_if_needed(FileTokenScanner* token_scanner, int pos);

static void FileTokenScanner_delete(TokenScanner* token_scanner);

TokenScanner* FileTokenScanner_new(const char* const file_name) {
Expand Down Expand Up @@ -51,30 +53,42 @@ static Token* FileTokenScanner_read(TokenScanner* token_scanner) {
if (feof(file_token_scanner->file))
return Token_new(0, file_token_scanner->line);
int pos = 0;
wchar_t c;
long code_point;
do {
c = Utf8Utilities_read_wchar_from_utf8_source(file_token_scanner->utf8_source);
if (c != WEOF && c != L'\r' && c != L'\n') {
file_token_scanner->buffer[pos++] = c;
if (pos >= file_token_scanner->buffer_size - 1) {
file_token_scanner->buffer_size *= 2;
file_token_scanner->buffer = (wchar_t*)realloc(file_token_scanner->buffer, file_token_scanner->buffer_size * sizeof(wchar_t));
code_point = UnicodeUtilities_read_code_point_from_utf8_source(file_token_scanner->utf8_source);
if (code_point != WEOF && code_point != L'\r' && code_point != L'\n') {
if (code_point <= 0xFFFF || sizeof(wchar_t) > 2) {
file_token_scanner->buffer[pos++] = (wchar_t)code_point;
extend_buffer_if_needed(file_token_scanner, pos);
} else {
Utf16Surrogates surrogates = UnicodeUtilities_get_utf16_surrogates(code_point);
file_token_scanner->buffer[pos++] = surrogates.leading;
extend_buffer_if_needed(file_token_scanner, pos);
file_token_scanner->buffer[pos++] = surrogates.trailing;
extend_buffer_if_needed(file_token_scanner, pos);
}
}
} while (c != WEOF && c != L'\r' && c != L'\n');
if (c == L'\r') {
} while (code_point != WEOF && code_point != L'\r' && code_point != L'\n');
if (code_point == L'\r') {
unsigned char next_char = fgetc(file_token_scanner->file);
if (next_char != L'\n') {
ungetc(next_char, file_token_scanner->file);
}
}
file_token_scanner->buffer[pos] = L'\0';
const GherkinLine* line;
if (c != WEOF || pos != 0) {
if (code_point != WEOF || pos != 0) {
wchar_t* text = StringUtilities_copy_string_part(file_token_scanner->buffer, pos);
line = GherkinLine_new(text, file_token_scanner->line);
}
else
line = (GherkinLine*)0;
return Token_new(line, file_token_scanner->line);
}

/*
 * Grows the scanner's line buffer when the write position has reached its
 * last slot.
 *
 * While pos is below buffer_size - 1 nothing happens; otherwise buffer_size
 * is doubled and the buffer is reallocated to hold that many wchar_t
 * elements. The result of realloc() is not checked.
 */
static void extend_buffer_if_needed(FileTokenScanner* file_token_scanner, int pos){
  if (pos < file_token_scanner->buffer_size - 1) {
    return;
  }
  file_token_scanner->buffer_size *= 2;
  wchar_t* grown = (wchar_t*)realloc(file_token_scanner->buffer, file_token_scanner->buffer_size * sizeof(wchar_t));
  file_token_scanner->buffer = grown;
}
17 changes: 12 additions & 5 deletions gherkin/c/src/string_utilities.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#include "string_utilities.h"
#include "utf8_utilities.h"
#include "unicode_utilities.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
Expand Down Expand Up @@ -32,15 +32,22 @@ wchar_t* StringUtilities_copy_to_wide_string(const char* string) {
int length = strlen(string);
wchar_t* copy = (wchar_t*)malloc((length + 1) * sizeof(wchar_t));
Utf8Source* utf8_source = StringUtf8Source_new(string);
int to_index = 0;
int i;
for (i = 0; i < length; ++i) {
wchar_t c = Utf8Utilities_read_wchar_from_utf8_source(utf8_source);
if (c == WEOF) {
long code_point = UnicodeUtilities_read_code_point_from_utf8_source(utf8_source);
if (code_point == WEOF) {
break;
}
copy[i] = c;
if (code_point <= 0xFFFF || sizeof(wchar_t) > 2) {
copy[to_index++] = (wchar_t)code_point;
} else {
Utf16Surrogates surrogates = UnicodeUtilities_get_utf16_surrogates(code_point);
copy[to_index++] = surrogates.leading;
copy[to_index++] = surrogates.trailing;
}
}
copy[i] = L'\0';
copy[to_index] = L'\0';
Utf8Source_delete(utf8_source);
return copy;
}
Expand Down
42 changes: 42 additions & 0 deletions gherkin/c/src/unicode_utilities.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#include "unicode_utilities.h"

/*
 * Decodes the next UTF-8 sequence from utf8_source into a code point.
 *
 * A first byte below 0x80 is returned unchanged. Otherwise following bytes
 * are read one at a time, their low 6 bits being accumulated, until the
 * first byte matches the lead-byte pattern for the number of bytes read so
 * far. Sequences of 2 to 6 bytes are recognized. The following bytes are
 * not validated.
 *
 * Returns WEOF when the first byte matches none of the lead-byte patterns;
 * by then five further bytes have been read from the source.
 */
long UnicodeUtilities_read_code_point_from_utf8_source(Utf8Source* utf8_source) {
  /* Lead-byte layouts for 2- to 6-byte sequences, shortest first. */
  static const struct {
    unsigned char mask;         /* bits that identify the lead byte form */
    unsigned char marker;       /* expected value of those bits */
    unsigned char payload_bits; /* bits of the lead byte carrying data */
    int payload_shift;          /* position of that data in the result */
  } lead_forms[] = {
    {0xE0, 0xC0, 0x1F, 6},
    {0xF0, 0xE0, 0x0F, 12},
    {0xF8, 0xF0, 0x07, 18},
    {0xFC, 0xF8, 0x03, 24},
    {0xFE, 0xFC, 0x01, 30}
  };
  const int form_count = (int)(sizeof(lead_forms) / sizeof(lead_forms[0]));
  unsigned char lead = Utf8Source_read(utf8_source);
  long tail = 0;
  int i;

  if (lead < 0x80) {
    return (long)lead;
  }
  for (i = 0; i < form_count; ++i) {
    /* One more byte is consumed before each form is tested. */
    unsigned char next = Utf8Source_read(utf8_source);
    tail = (tail << 6) | (long)(next & 0x3F);
    if ((lead & lead_forms[i].mask) == lead_forms[i].marker) {
      return (((long)(lead & lead_forms[i].payload_bits)) << lead_forms[i].payload_shift) | tail;
    }
  }
  return WEOF;
}

/*
 * Splits code_point into a pair of UTF-16 surrogates.
 *
 * 0x10000 is subtracted from code_point; the bits above the low ten are
 * added to 0xD800 to form the leading surrogate, and the low ten bits are
 * added to 0xDC00 to form the trailing surrogate. The argument is not
 * range-checked.
 */
Utf16Surrogates UnicodeUtilities_get_utf16_surrogates(long code_point){
  const long offset = code_point - 0x10000;
  const long upper_bits = offset >> 10;
  const long lower_ten_bits = offset & 0x3FF;
  Utf16Surrogates pair;

  pair.leading = 0xD800 + upper_bits;
  pair.trailing = 0xDC00 + lower_ten_bits;
  return pair;
}
25 changes: 25 additions & 0 deletions gherkin/c/src/unicode_utilities.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#ifndef GHERKIN_UNICODE_UTILITIES_H_
#define GHERKIN_UNICODE_UTILITIES_H_

#include <wchar.h>

#include "utf8_source.h"

#ifdef __cplusplus
extern "C" {
#endif

/*
 * The two wide characters produced by UnicodeUtilities_get_utf16_surrogates()
 * for a single code point.
 */
typedef struct Utf16Surrogates {
/* 0xD800 plus the bits of (code point - 0x10000) above the low ten. */
wchar_t leading;
/* 0xDC00 plus the low ten bits of (code point - 0x10000). */
wchar_t trailing;
} Utf16Surrogates;

/*
 * Reads one UTF-8 encoded sequence (1 to 6 bytes) from utf8_source and
 * returns the decoded code point. Returns WEOF when the first byte read is
 * not a recognized lead byte.
 */
long UnicodeUtilities_read_code_point_from_utf8_source(Utf8Source* utf8_source);

/*
 * Converts code_point into a leading (0xD800-based) and trailing
 * (0xDC00-based) UTF-16 surrogate. The argument is not range-checked.
 * NOTE(review): presumably meant only for code points above 0xFFFF - confirm
 * at the call sites.
 */
Utf16Surrogates UnicodeUtilities_get_utf16_surrogates(long code_point);

#ifdef __cplusplus
}
#endif

#endif /* GHERKIN_UNICODE_UTILITIES_H_ */
34 changes: 0 additions & 34 deletions gherkin/c/src/utf8_utilities.c

This file was deleted.

18 changes: 0 additions & 18 deletions gherkin/c/src/utf8_utilities.h

This file was deleted.

2 comments on commit 8ea804f

@aslakhellesoy
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @brasmusson - is this meant to address #168? Is it still WIP or shall I merge to master, sync to gherkin-c subrepo and see if the build passes?

@brasmusson
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#168 points out one problem on Windows: Windows does not use UTF-8 by default. This commit fixes another problem, which occurs with most Windows compilers (i686-w64-mingw32-gcc being the exception). I still consider it WIP; I'm working on fixing all the Windows problems - a PR is coming, probably this week.

Please sign in to comment.