-
Notifications
You must be signed in to change notification settings - Fork 0
/
gen_utf8.rs
38 lines (35 loc) · 1.23 KB
/
gen_utf8.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
use std::fs::File;
use std::io::Write;
/// Returns the UTF-8 encoding of every Unicode scalar value, concatenated in
/// ascending code point order.
///
/// The range `'\0'..=char::MAX` covers U+0000 through U+10FFFF and skips the
/// surrogate block U+D800..=U+DFFF: surrogates have no meaning on their own
/// and are not valid `char` values, so the range iterator never yields them.
fn all_scalar_values_utf8() -> Vec<u8> {
    // Exact output size, so the buffer is allocated once instead of regrown:
    //   1-byte sequences: U+0000..U+0080
    //   2-byte sequences: U+0080..U+0800
    //   3-byte sequences: U+0800..U+10000, minus the 0x800 surrogates
    //   4-byte sequences: U+10000..U+110000
    const ENCODED_LEN: usize = 0x80
        + 2 * (0x800 - 0x80)
        + 3 * (0x1_0000 - 0x800 - 0x800)
        + 4 * (0x11_0000 - 0x1_0000);

    let mut text = String::with_capacity(ENCODED_LEN);
    // `String` stores its contents as UTF-8, so pushing each `char` performs
    // the same encoding the bit arithmetic used to do by hand.
    text.extend('\0'..=char::MAX);
    debug_assert_eq!(text.len(), ENCODED_LEN);
    text.into_bytes()
}

/// Creates the file at `path` and fills it with the bytes produced by
/// `all_scalar_values_utf8`.
///
/// # Errors
///
/// Returns the underlying I/O error if the file cannot be created or written.
/// Because the file is opened with `create_new(true)`, an already existing
/// `path` is reported as an error rather than being overwritten.
fn write_utf8_table(path: &str) -> std::io::Result<()> {
    // Open first so that a pre-existing file fails fast, before any encoding work.
    let mut output_file_handle = File::options().write(true).create_new(true).open(path)?;
    output_file_handle.write_all(&all_scalar_values_utf8())
}

/// Writes the UTF-8 encoding of every Unicode scalar value to `utf-8.txt` in
/// the current working directory.
///
/// On failure, prints the reason to stderr and exits with status 1.
fn main() {
    const OUTPUT_PATH: &str = "utf-8.txt";

    if let Err(error) = write_utf8_table(OUTPUT_PATH) {
        eprintln!("gen_utf8: cannot write {}: {}", OUTPUT_PATH, error);
        std::process::exit(1);
    }
}