Merge pull request #224 from helix-editor/line_ending_detection

Line ending detection
helix-editor · Jun 22, 2021 · a70de6e · a70de6e
2 parents c704970 + f2954fa
commit a70de6e
Show file tree

Hide file tree

Showing 17 changed files with 562 additions and 205 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/helix-core/src/auto_pairs.rs b/helix-core/src/auto_pairs.rs
@@ -12,7 +12,7 @@ pub const PAIRS: &[(char, char)] = &[
     ('`', '`'),
 ];
 
-const CLOSE_BEFORE: &str = ")]}'\":;> \n"; // includes space and newline
+const CLOSE_BEFORE: &str = ")]}'\":;> \n\r\u{000B}\u{000C}\u{0085}\u{2028}\u{2029}"; // includes space and newlines
 
 // insert hook:
 // Fn(doc, selection, char) => Option<Transaction>

diff --git a/helix-core/src/chars.rs b/helix-core/src/chars.rs
@@ -1,25 +1,44 @@
-/// Determine whether a character is a line break.
-pub fn char_is_linebreak(c: char) -> bool {
-    matches!(
-        c,
-        '\u{000A}' | // LineFeed
-        '\u{000B}' | // VerticalTab
-        '\u{000C}' | // FormFeed
-        '\u{000D}' | // CarriageReturn
-        '\u{0085}' | // NextLine
-        '\u{2028}' | // Line Separator
-        '\u{2029}' // ParagraphSeparator
-    )
+use crate::LineEnding;
+
+/// Coarse classification of a single character, as produced by
+/// [`categorize_char`].
+#[derive(Debug, Clone, Copy, Eq, PartialEq)]
+pub enum CharCategory {
+    /// Whitespace that is not a line ending.
+    Whitespace,
+    /// A line-ending character.
+    Eol,
+    /// A word character (alphanumeric or `_`).
+    Word,
+    /// A punctuation character, or a math, currency or modifier symbol.
+    Punctuation,
+    /// Anything that matched none of the other categories.
+    Unknown,
+}
+
+/// Classify `ch` into a [`CharCategory`].
+///
+/// The checks run in a fixed order and the first one that matches wins:
+/// line ending, then whitespace, then word character, then punctuation.
+/// A character that passes none of them is reported as
+/// [`CharCategory::Unknown`].
+#[inline]
+pub fn categorize_char(ch: char) -> CharCategory {
+    // Line endings are tested before whitespace so that they are reported
+    // as `Eol` rather than `Whitespace`.
+    match ch {
+        c if char_is_line_ending(c) => CharCategory::Eol,
+        c if c.is_whitespace() => CharCategory::Whitespace,
+        c if char_is_word(c) => CharCategory::Word,
+        c if char_is_punctuation(c) => CharCategory::Punctuation,
+        _ => CharCategory::Unknown,
+    }
+}
+
+/// Report whether `ch` is a line-ending character.
+///
+/// A character counts as a line ending exactly when
+/// [`LineEnding::from_char`] yields a value for it.
+#[inline]
+pub fn char_is_line_ending(ch: char) -> bool {
+    let line_ending = LineEnding::from_char(ch);
+    line_ending.is_some()
 }
 
 /// Determine whether a character qualifies as (non-line-break)
 /// whitespace.
-pub fn char_is_whitespace(c: char) -> bool {
+#[inline]
+pub fn char_is_whitespace(ch: char) -> bool {
     // TODO: this is a naive binary categorization of whitespace
     // characters.  For display, word wrapping, etc. we'll need a better
     // categorization based on e.g. breaking vs non-breaking spaces
     // and whether they're zero-width or not.
-    match c {
+    match ch {
         //'\u{1680}' | // Ogham Space Mark (here for completeness, but usually displayed as a dash, not as whitespace)
         '\u{0009}' | // Character Tabulation
         '\u{0020}' | // Space
@@ -34,8 +53,81 @@ pub fn char_is_whitespace(c: char) -> bool {
         // En Quad, Em Quad, En Space, Em Space, Three-per-em Space,
         // Four-per-em Space, Six-per-em Space, Figure Space,
         // Punctuation Space, Thin Space, Hair Space, Zero Width Space.
-        c if ('\u{2000}' ..= '\u{200B}').contains(&c) => true,
+        ch if ('\u{2000}' ..= '\u{200B}').contains(&ch) => true,
 
         _ => false,
     }
 }
+
+/// Report whether `ch` is treated as punctuation.
+///
+/// The decision is based on the Unicode general category of `ch`: every
+/// punctuation category qualifies, and so do the math, currency and
+/// modifier symbol categories.
+#[inline]
+pub fn char_is_punctuation(ch: char) -> bool {
+    use unicode_general_category::{get_general_category, GeneralCategory as Category};
+
+    let category = get_general_category(ch);
+    matches!(
+        category,
+        // Punctuation proper.
+        Category::ConnectorPunctuation
+            | Category::DashPunctuation
+            | Category::OpenPunctuation
+            | Category::ClosePunctuation
+            | Category::InitialPunctuation
+            | Category::FinalPunctuation
+            | Category::OtherPunctuation
+            // Symbol categories that are grouped with punctuation here.
+            | Category::MathSymbol
+            | Category::CurrencySymbol
+            | Category::ModifierSymbol
+    )
+}
+
+/// Report whether `ch` is a word character: an underscore or anything
+/// `char::is_alphanumeric` accepts.
+#[inline]
+pub fn char_is_word(ch: char) -> bool {
+    matches!(ch, '_') || ch.is_alphanumeric()
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    /// Assert that every character of `input` is categorized as `expected`.
+    ///
+    /// The failure message prints the offending character with `{:?}` so
+    /// that invisible characters show up as escapes.
+    fn assert_all_categorized(input: &str, expected: CharCategory) {
+        for ch in input.chars() {
+            let actual = categorize_char(ch);
+            assert_eq!(
+                expected,
+                actual,
+                "Testing {:?}, but got `{:?}` instead of `CharCategory::{:?}`",
+                ch,
+                actual,
+                expected
+            );
+        }
+    }
+
+    /// Check `categorize_char` against a sample string for each of the
+    /// `Eol`, `Whitespace`, `Word` and `Punctuation` categories.
+    #[test]
+    fn test_categorize() {
+        const EOL_TEST_CASE: &str = "\n\r\u{000B}\u{000C}\u{0085}\u{2028}\u{2029}";
+        const WORD_TEST_CASE: &str =
+            "_hello_world_あいうえおー1234567890１２３４５６７８９０";
+        const PUNCTUATION_TEST_CASE: &str =
+            "!\"#$%&\'()*+,-./:;<=>?@[\\]^`{|}~！”＃＄％＆’（）＊＋、。：；＜＝＞？＠「」＾｀｛｜｝～";
+        const WHITESPACE_TEST_CASE: &str = "  　   ";
+
+        assert_all_categorized(EOL_TEST_CASE, CharCategory::Eol);
+        assert_all_categorized(WHITESPACE_TEST_CASE, CharCategory::Whitespace);
+        assert_all_categorized(WORD_TEST_CASE, CharCategory::Word);
+        assert_all_categorized(PUNCTUATION_TEST_CASE, CharCategory::Punctuation);
+    }
+}
diff --git a/helix-core/src/lib.rs b/helix-core/src/lib.rs
@@ -6,6 +6,7 @@ pub mod diagnostic;
 pub mod graphemes;
 pub mod history;
 pub mod indent;
+pub mod line_ending;
 pub mod macros;
 pub mod match_brackets;
 pub mod movement;
@@ -106,6 +107,7 @@ pub use tendril::StrTendril as Tendril;
 #[doc(inline)]
 pub use {regex, tree_sitter};
 
+pub use graphemes::RopeGraphemes;
 pub use position::{coords_at_pos, pos_at_coords, Position};
 pub use selection::{Range, Selection};
 pub use smallvec::SmallVec;
@@ -114,4 +116,5 @@ pub use syntax::Syntax;
 pub use diagnostic::Diagnostic;
 pub use state::State;
 
+pub use line_ending::{LineEnding, DEFAULT_LINE_ENDING};
 pub use transaction::{Assoc, Change, ChangeSet, Operation, Transaction};