Skip to content

Commit

Permalink
Autocorrection: handle typos containing ' char.
Browse files Browse the repository at this point in the history
Previously, autocorrection was limited to detecting typos with chars a-z
and "word breaks" with the special : char. This commit adds support for
the ' single quote char, useful for typos involving misplaced
apostrophes. Thanks to reddit user u/Dutchnesss1 for suggesting this.

As an example, the entry `dosen't -> doesn't` is added to the default
dictionary.

The script make_autocorrection_data.py is revised to support the ' char.
Additionally, the fallback `CORRECT_WORDS` list is expanded a bit.
  • Loading branch information
getreuer committed Feb 22, 2022
1 parent e18e15e commit d8f6562
Show file tree
Hide file tree
Showing 4 changed files with 123 additions and 79 deletions.
51 changes: 35 additions & 16 deletions features/autocorrection.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,38 +47,56 @@ bool process_autocorrection(uint16_t keycode, keyrecord_t* record) {
return true;
}

// The following switch cases address various kinds of keycodes. This logic is
// split over two switches rather than merged into one. The first switch may
// extract a basic keycode which is then further handled by the second switch,
// e.g. a layer-tap key with Caps Lock `LT(layer, KC_CAPS)`.
switch (keycode) {
// Ignore shifts, one-shot mods, and layer switch keys.
case KC_LSFT:
case KC_RSFT:
case QK_ONE_SHOT_MOD ... QK_ONE_SHOT_MOD_MAX:
case QK_TO ... QK_TO_MAX:
case QK_MOMENTARY ... QK_MOMENTARY_MAX:
case QK_DEF_LAYER ... QK_DEF_LAYER_MAX:
case QK_TOGGLE_LAYER ... QK_TOGGLE_LAYER_MAX:
case QK_ONE_SHOT_LAYER ... QK_ONE_SHOT_LAYER_MAX:
case QK_LAYER_TAP_TOGGLE ... QK_LAYER_TAP_TOGGLE_MAX:
case QK_LAYER_MOD ... QK_LAYER_MOD_MAX:
return true; // Ignore these keys.

#ifndef NO_ACTION_TAPPING
case QK_MOD_TAP ... QK_MOD_TAP_MAX: // Tap-hold keys.
#ifndef NO_ACTION_LAYER
case QK_LAYER_TAP ... QK_LAYER_TAP_MAX:
#endif // NO_ACTION_LAYER
// Ignore when tap-hold keys are held.
if (record->tap.count == 0) { return true; }
// Otherwise when tapped, get the base keycode.
// Otherwise when tapped, get the basic keycode.
// Fallthrough intended.
#endif // NO_ACTION_TAPPING

// Handle shifted keys, e.g. symbols like KC_EXLM = S(KC_1).
case QK_LSFT ... QK_LSFT + 255:
case QK_RSFT ... QK_RSFT + 255:
keycode &= 0xff; // Get the base keycode.
keycode &= 0xff; // Get the basic keycode.
break;

// NOTE: Space Cadet keys expose no info to check whether they are being
// tapped vs. held. This makes autocorrection ambiguous, e.g. KC_LCPO might
// be '(', which we would treat as a word break, or it might be shift, which
// we would treat as having no effect. To behave cautiously, we allow Space
// Cadet keycodes to fall to the logic below and clear autocorrection state.
}

switch (keycode) {
// Ignore shifts, Caps Lock, one-shot mods, and layer switch keys.
case KC_NO:
case KC_LSFT:
case KC_RSFT:
case KC_CAPS:
case QK_ONE_SHOT_MOD ... QK_ONE_SHOT_MOD_MAX:
case QK_TO ... QK_TO_MAX:
case QK_MOMENTARY ... QK_MOMENTARY_MAX:
case QK_DEF_LAYER ... QK_DEF_LAYER_MAX:
case QK_TOGGLE_LAYER ... QK_TOGGLE_LAYER_MAX:
case QK_ONE_SHOT_LAYER ... QK_ONE_SHOT_LAYER_MAX:
case QK_LAYER_TAP_TOGGLE ... QK_LAYER_TAP_TOGGLE_MAX:
case QK_LAYER_MOD ... QK_LAYER_MOD_MAX:
return true; // Ignore these keys.
}

if (!(KC_A <= keycode && keycode <= KC_Z)) {
if (keycode == KC_QUOT) {
// Treat " (shifted ') as a word boundary.
if ((mods & MOD_MASK_SHIFT) != 0) { keycode = KC_SPC; }
} else if (!(KC_A <= keycode && keycode <= KC_Z)) {
if (keycode == KC_BSPC) {
// Remove last character from the buffer.
if (typo_buffer_size > 0) { --typo_buffer_size; }
Expand All @@ -103,6 +121,7 @@ bool process_autocorrection(uint16_t keycode, keyrecord_t* record) {
}

// Append `keycode` to the buffer.
// NOTE: `keycode` must be a basic keycode (0-255) by this point.
typo_buffer[typo_buffer_size++] = (uint8_t) keycode;
if (typo_buffer_size < AUTOCORRECTION_MAX_LENGTH) {
typo_buffer[typo_buffer_size] = 0;
Expand Down
101 changes: 59 additions & 42 deletions features/autocorrection_data.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,20 @@
// Copyright 2021-2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Generated code.

// Autocorrection dictionary (70 entries):
// Autocorrection dictionary (71 entries):
// :guage -> gauge
// :the:the: -> the
// :thier -> their
Expand All @@ -22,6 +36,7 @@
// contians -> contains
// cosnt -> const
// dervied -> derived
// dosen't -> doesn't
// fales -> false
// fasle -> false
// fitler -> filter
Expand Down Expand Up @@ -75,34 +90,34 @@
#define AUTOCORRECTION_MIN_LENGTH 5 // ":ture"
#define AUTOCORRECTION_MAX_LENGTH 10 // "accomodate"

static const uint8_t autocorrection_data[1104] PROGMEM = {108, 43, 0, 6, 71, 0, 7, 81,
0, 8, 199, 0, 9, 240, 1, 10, 250, 1, 11, 26, 2, 17, 53, 2, 18, 190, 2, 19,
202, 2, 21, 212, 2, 22, 20, 3, 23, 67, 3, 28, 16, 4, 0, 72, 50, 0, 22, 60, 0,
0, 11, 23, 44, 8, 11, 23, 44, 0, 132, 0, 8, 22, 18, 18, 15, 0, 132, 115, 101,
115, 0, 11, 23, 12, 26, 22, 0, 129, 99, 104, 0, 68, 94, 0, 8, 106, 0, 15, 174,
0, 21, 187, 0, 0, 12, 15, 25, 17, 12, 0, 131, 97, 108, 105, 100, 0, 74, 119,
0, 12, 129, 0, 21, 140, 0, 24, 165, 0, 0, 17, 12, 22, 0, 131, 103, 110, 101,
100, 0, 25, 21, 8, 7, 0, 131, 105, 118, 101, 100, 0, 72, 147, 0, 24, 156, 0,
0, 9, 8, 21, 0, 129, 114, 101, 100, 0, 6, 6, 18, 0, 129, 114, 101, 100, 0, 15,
6, 17, 12, 0, 129, 100, 101, 0, 18, 22, 8, 21, 11, 23, 0, 130, 104, 111, 108,
100, 0, 4, 26, 18, 9, 0, 131, 114, 119, 97, 114, 100, 0, 68, 233, 0, 6, 246,
0, 7, 4, 1, 8, 16, 1, 10, 52, 1, 15, 81, 1, 21, 90, 1, 22, 117, 1, 23, 144, 1,
24, 215, 1, 25, 228, 1, 0, 6, 19, 22, 8, 16, 4, 17, 0, 130, 97, 99, 101, 0,
19, 4, 22, 8, 16, 4, 17, 0, 131, 112, 97, 99, 101, 0, 12, 21, 8, 25, 18, 0,
130, 114, 105, 100, 101, 0, 23, 0, 68, 25, 1, 17, 36, 1, 0, 21, 4, 24, 10, 0,
130, 110, 116, 101, 101, 0, 4, 21, 24, 4, 10, 0, 135, 117, 97, 114, 97, 110,
116, 101, 101, 0, 68, 59, 1, 7, 69, 1, 0, 24, 10, 44, 0, 131, 97, 117, 103,
101, 0, 8, 15, 12, 25, 12, 21, 19, 0, 130, 103, 101, 0, 22, 4, 9, 0, 130, 108,
115, 101, 0, 76, 97, 1, 24, 109, 1, 0, 24, 20, 4, 0, 132, 99, 113, 117, 105,
114, 101, 0, 23, 44, 0, 130, 114, 117, 101, 0, 4, 0, 79, 126, 1, 24, 134, 1,
0, 9, 0, 131, 97, 108, 115, 101, 0, 6, 8, 5, 0, 131, 97, 117, 115, 101, 0, 4,
0, 71, 156, 1, 19, 193, 1, 21, 203, 1, 0, 18, 16, 0, 80, 166, 1, 18, 181, 1,
0, 18, 6, 4, 0, 135, 99, 111, 109, 109, 111, 100, 97, 116, 101, 0, 6, 6, 4, 0,
132, 109, 111, 100, 97, 116, 101, 0, 7, 24, 0, 132, 112, 100, 97, 116, 101, 0,
8, 19, 8, 22, 0, 132, 97, 114, 97, 116, 101, 0, 10, 8, 15, 15, 18, 6, 0, 130,
97, 103, 117, 101, 0, 8, 12, 6, 8, 21, 0, 131, 101, 105, 118, 101, 0, 12, 8,
11, 6, 0, 130, 105, 101, 102, 0, 17, 0, 76, 3, 2, 21, 16, 2, 0, 15, 8, 12, 6,
0, 133, 101, 105, 108, 105, 110, 103, 0, 12, 23, 22, 0, 131, 114, 105, 110,
static const uint8_t autocorrection_data[1120] PROGMEM = {108, 43, 0, 6, 71, 0,
7, 81, 0, 8, 199, 0, 9, 240, 1, 10, 250, 1, 11, 26, 2, 17, 53, 2, 18, 190, 2,
19, 202, 2, 21, 212, 2, 22, 20, 3, 23, 67, 3, 28, 32, 4, 0, 72, 50, 0, 22, 60,
0, 0, 11, 23, 44, 8, 11, 23, 44, 0, 132, 0, 8, 22, 18, 18, 15, 0, 132, 115,
101, 115, 0, 11, 23, 12, 26, 22, 0, 129, 99, 104, 0, 68, 94, 0, 8, 106, 0, 15,
174, 0, 21, 187, 0, 0, 12, 15, 25, 17, 12, 0, 131, 97, 108, 105, 100, 0, 74,
119, 0, 12, 129, 0, 21, 140, 0, 24, 165, 0, 0, 17, 12, 22, 0, 131, 103, 110,
101, 100, 0, 25, 21, 8, 7, 0, 131, 105, 118, 101, 100, 0, 72, 147, 0, 24, 156,
0, 0, 9, 8, 21, 0, 129, 114, 101, 100, 0, 6, 6, 18, 0, 129, 114, 101, 100, 0,
15, 6, 17, 12, 0, 129, 100, 101, 0, 18, 22, 8, 21, 11, 23, 0, 130, 104, 111,
108, 100, 0, 4, 26, 18, 9, 0, 131, 114, 119, 97, 114, 100, 0, 68, 233, 0, 6,
246, 0, 7, 4, 1, 8, 16, 1, 10, 52, 1, 15, 81, 1, 21, 90, 1, 22, 117, 1, 23,
144, 1, 24, 215, 1, 25, 228, 1, 0, 6, 19, 22, 8, 16, 4, 17, 0, 130, 97, 99,
101, 0, 19, 4, 22, 8, 16, 4, 17, 0, 131, 112, 97, 99, 101, 0, 12, 21, 8, 25,
18, 0, 130, 114, 105, 100, 101, 0, 23, 0, 68, 25, 1, 17, 36, 1, 0, 21, 4, 24,
10, 0, 130, 110, 116, 101, 101, 0, 4, 21, 24, 4, 10, 0, 135, 117, 97, 114, 97,
110, 116, 101, 101, 0, 68, 59, 1, 7, 69, 1, 0, 24, 10, 44, 0, 131, 97, 117,
103, 101, 0, 8, 15, 12, 25, 12, 21, 19, 0, 130, 103, 101, 0, 22, 4, 9, 0, 130,
108, 115, 101, 0, 76, 97, 1, 24, 109, 1, 0, 24, 20, 4, 0, 132, 99, 113, 117,
105, 114, 101, 0, 23, 44, 0, 130, 114, 117, 101, 0, 4, 0, 79, 126, 1, 24, 134,
1, 0, 9, 0, 131, 97, 108, 115, 101, 0, 6, 8, 5, 0, 131, 97, 117, 115, 101, 0,
4, 0, 71, 156, 1, 19, 193, 1, 21, 203, 1, 0, 18, 16, 0, 80, 166, 1, 18, 181,
1, 0, 18, 6, 4, 0, 135, 99, 111, 109, 109, 111, 100, 97, 116, 101, 0, 6, 6, 4,
0, 132, 109, 111, 100, 97, 116, 101, 0, 7, 24, 0, 132, 112, 100, 97, 116, 101,
0, 8, 19, 8, 22, 0, 132, 97, 114, 97, 116, 101, 0, 10, 8, 15, 15, 18, 6, 0,
130, 97, 103, 117, 101, 0, 8, 12, 6, 8, 21, 0, 131, 101, 105, 118, 101, 0, 12,
8, 11, 6, 0, 130, 105, 101, 102, 0, 17, 0, 76, 3, 2, 21, 16, 2, 0, 15, 8, 12,
6, 0, 133, 101, 105, 108, 105, 110, 103, 0, 12, 23, 22, 0, 131, 114, 105, 110,
103, 0, 70, 33, 2, 23, 44, 2, 0, 12, 23, 26, 22, 0, 131, 105, 116, 99, 104, 0,
10, 12, 8, 11, 0, 129, 104, 116, 0, 72, 69, 2, 10, 80, 2, 18, 89, 2, 21, 156,
2, 24, 167, 2, 0, 22, 18, 18, 11, 6, 0, 131, 115, 101, 110, 0, 12, 21, 23, 22,
Expand All @@ -118,18 +133,20 @@ static const uint8_t autocorrection_data[1104] PROGMEM = {108, 43, 0, 6, 71, 0,
110, 101, 114, 0, 23, 4, 21, 8, 23, 17, 12, 0, 135, 116, 101, 114, 97, 116,
111, 114, 0, 72, 30, 3, 17, 38, 3, 24, 51, 3, 0, 15, 4, 9, 0, 129, 115, 101,
0, 4, 12, 23, 17, 18, 6, 0, 131, 97, 105, 110, 115, 0, 22, 17, 8, 6, 17, 18,
6, 0, 133, 115, 101, 110, 115, 117, 115, 0, 74, 86, 3, 11, 96, 3, 15, 118, 3,
17, 129, 3, 22, 218, 3, 24, 232, 3, 0, 11, 24, 4, 6, 0, 130, 103, 104, 116, 0,
71, 103, 3, 10, 110, 3, 0, 12, 26, 0, 129, 116, 104, 0, 17, 8, 15, 0, 129,
116, 104, 0, 22, 24, 8, 21, 0, 131, 115, 117, 108, 116, 0, 68, 139, 3, 8, 150,
3, 22, 210, 3, 0, 21, 4, 19, 19, 4, 0, 130, 101, 110, 116, 0, 85, 157, 3, 25,
200, 3, 0, 68, 164, 3, 21, 175, 3, 0, 19, 4, 0, 132, 112, 97, 114, 101, 110,
116, 0, 4, 19, 0, 68, 185, 3, 19, 193, 3, 0, 133, 112, 97, 114, 101, 110, 116,
6, 0, 133, 115, 101, 110, 115, 117, 115, 0, 116, 89, 3, 10, 102, 3, 11, 112,
3, 15, 134, 3, 17, 145, 3, 22, 234, 3, 24, 248, 3, 0, 17, 8, 22, 18, 7, 0,
132, 101, 115, 110, 39, 116, 0, 11, 24, 4, 6, 0, 130, 103, 104, 116, 0, 71,
119, 3, 10, 126, 3, 0, 12, 26, 0, 129, 116, 104, 0, 17, 8, 15, 0, 129, 116,
104, 0, 22, 24, 8, 21, 0, 131, 115, 117, 108, 116, 0, 68, 155, 3, 8, 166, 3,
22, 226, 3, 0, 21, 4, 19, 19, 4, 0, 130, 101, 110, 116, 0, 85, 173, 3, 25,
216, 3, 0, 68, 180, 3, 21, 191, 3, 0, 19, 4, 0, 132, 112, 97, 114, 101, 110,
116, 0, 4, 19, 0, 68, 201, 3, 19, 209, 3, 0, 133, 112, 97, 114, 101, 110, 116,
0, 4, 0, 131, 101, 110, 116, 0, 8, 15, 8, 21, 0, 130, 97, 110, 116, 0, 18, 6,
0, 130, 110, 115, 116, 0, 12, 9, 8, 17, 4, 16, 0, 132, 105, 102, 101, 115,
116, 0, 83, 239, 3, 23, 6, 4, 0, 87, 246, 3, 24, 254, 3, 0, 17, 12, 0, 131,
112, 117, 116, 0, 18, 0, 130, 116, 112, 117, 116, 0, 19, 24, 18, 0, 131, 116,
112, 117, 116, 0, 70, 29, 4, 8, 41, 4, 11, 51, 4, 21, 69, 4, 0, 8, 24, 20, 8,
21, 9, 0, 129, 110, 99, 121, 0, 23, 9, 4, 22, 0, 130, 101, 116, 121, 0, 6, 21,
4, 21, 12, 8, 11, 0, 135, 105, 101, 114, 97, 114, 99, 104, 121, 0, 4, 5, 12,
15, 0, 130, 114, 97, 114, 121, 0};
116, 0, 83, 255, 3, 23, 22, 4, 0, 87, 6, 4, 24, 14, 4, 0, 17, 12, 0, 131, 112,
117, 116, 0, 18, 0, 130, 116, 112, 117, 116, 0, 19, 24, 18, 0, 131, 116, 112,
117, 116, 0, 70, 45, 4, 8, 57, 4, 11, 67, 4, 21, 85, 4, 0, 8, 24, 20, 8, 21,
9, 0, 129, 110, 99, 121, 0, 23, 9, 4, 22, 0, 130, 101, 116, 121, 0, 6, 21, 4,
21, 12, 8, 11, 0, 135, 105, 101, 114, 97, 114, 99, 104, 121, 0, 4, 5, 12, 15,
0, 130, 114, 97, 114, 121, 0};

4 changes: 3 additions & 1 deletion features/autocorrection_dict.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2021 Google LLC
# Copyright 2021-2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -33,6 +33,7 @@ concensus -> consensus
contians -> contains
cosnt -> const
dervied -> derived
dosen't -> doesn't
fales -> false
fasle -> false
fitler -> filter
Expand Down Expand Up @@ -82,3 +83,4 @@ swtich -> switch
thresold -> threshold
udpate -> update
widht -> width

46 changes: 26 additions & 20 deletions features/make_autocorrection_data.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright 2021 Google LLC
# Copyright 2021-2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -29,6 +29,7 @@
Example:
:thier -> their
dosen't -> doesn't
fitler -> filter
lenght -> length
ouput -> output
Expand All @@ -51,21 +52,34 @@
'correctly spelled word. To check for this, install the english_words '
'package and rerun this script:\n\n pip install english_words\n')
# Use a minimal word list as a fallback.
CORRECT_WORDS = ('information', 'available', 'international', 'language',
'loosest', 'reference', 'wealthier', 'entertainment',
'association', 'provides', 'technology', 'statehood')
CORRECT_WORDS = ('apparent', 'association', 'available', 'classification',
'effect', 'entertainment', 'fantastic', 'information',
'integrate', 'international', 'language', 'loosest',
'manual', 'nothing', 'provides', 'reference', 'statehood',
'technology', 'virtually', 'wealthier', 'wonderful')

KC_A = 4
KC_SPC = 0x2c
KC_QUOT = 0x34

TYPO_CHARS = dict(
[
("'", KC_QUOT),
(':', KC_SPC), # "Word break" character.
] +
# Characters a-z.
[(chr(c), c + KC_A - ord('a')) for c in range(ord('a'), ord('z') + 1)]
)


def parse_file(file_name: str) -> List[Tuple[str, str]]:
"""Parses autocorrections dictionary file.
Each line of the file defines one typo and its correction with the syntax
"typo -> correction". Blank lines or lines starting with '#' are ignored. The
function validates that typos only have characters a-z and that typos are not
substrings of other typos, otherwise the longer typo would never trigger.
function validates that typos only have characters in TYPO_CHARS, that
typos are not substrings of other typos, and checking that typos don't trigger
on CORRECT_WORDS.
Args:
file_name: String, path of the autocorrections dictionary.
Expand All @@ -81,9 +95,9 @@ def parse_file(file_name: str) -> List[Tuple[str, str]]:
continue

# Check that `typo` is valid.
if not(all([ord('a') <= ord(c) <= ord('z') or c == ':' for c in typo])):
if not(all([c in TYPO_CHARS for c in typo])):
print(f'Error:{line_number}: Typo "{typo}" has '
'characters other than a-z and :.')
'characters other than ' + ''.join(TYPO_CHARS.keys()))
sys.exit(1)
for other_typo in typos:
if typo in other_typo or other_typo in typo:
Expand Down Expand Up @@ -179,7 +193,7 @@ def serialize_trie(autocorrections: List[Tuple[str, str]],
table = []

# Traverse trie in depth first order.
def traverse(trie_node):
def traverse(trie_node: Dict[str, Any]) -> Dict[str, Any]:
if 'LEAF' in trie_node: # Handle a leaf trie node.
typo, correction = trie_node['LEAF']
word_boundary_ending = typo[-1] == ':'
Expand Down Expand Up @@ -214,23 +228,15 @@ def traverse(trie_node):

traverse(trie)

def serialize(e):
def kc_code(c):
if ord('a') <= ord(c) <= ord('z'):
return ord(c) - ord('a') + KC_A
elif c == ':':
return KC_SPC
else:
raise ValueError(f'Invalid character: {c}')

def serialize(e: Dict[str, Any]) -> List[int]:
if not e['links']: # Handle a leaf table entry.
return e['data']
elif len(e['links']) == 1: # Handle a chain table entry.
return list(map(kc_code, e['chars'])) + [0] #+ encode_link(e['links'][0]))
return [TYPO_CHARS[c] for c in e['chars']] + [0]
else: # Handle a branch table entry.
data = []
for c, link in zip(e['chars'], e['links']):
data += [kc_code(c) | (0 if data else 64)] + encode_link(link)
data += [TYPO_CHARS[c] | (0 if data else 64)] + encode_link(link)
return data + [0]

byte_offset = 0
Expand Down

0 comments on commit d8f6562

Please sign in to comment.