Skip to content

Commit

Permalink
Merge pull request #224 from helix-editor/line_ending_detection
Browse files Browse the repository at this point in the history
Line ending detection
  • Loading branch information
archseer authored Jun 22, 2021
2 parents c704970 + f2954fa commit a70de6e
Show file tree
Hide file tree
Showing 17 changed files with 562 additions and 205 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion helix-core/src/auto_pairs.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ pub const PAIRS: &[(char, char)] = &[
('`', '`'),
];

const CLOSE_BEFORE: &str = ")]}'\":;> \n"; // includes space and newline
// Besides the closing delimiters and punctuation `)]}'":;>`, this set contains
// the space character and every line-break character: LF (`\n`), CR (`\r`),
// vertical tab (U+000B), form feed (U+000C), next line (U+0085),
// line separator (U+2028) and paragraph separator (U+2029).
const CLOSE_BEFORE: &str = ")]}'\":;> \n\r\u{000B}\u{000C}\u{0085}\u{2028}\u{2029}";

// insert hook:
// Fn(doc, selection, char) => Option<Transaction>
Expand Down
122 changes: 107 additions & 15 deletions helix-core/src/chars.rs
Original file line number Diff line number Diff line change
@@ -1,25 +1,44 @@
/// Determine whether a character is a line break.
pub fn char_is_linebreak(c: char) -> bool {
matches!(
c,
'\u{000A}' | // LineFeed
'\u{000B}' | // VerticalTab
'\u{000C}' | // FormFeed
'\u{000D}' | // CarriageReturn
'\u{0085}' | // NextLine
'\u{2028}' | // Line Separator
'\u{2029}' // ParagraphSeparator
)
use crate::LineEnding;

/// Coarse classification of a single character, as produced by
/// `categorize_char`.
///
/// The enum is a plain tag with no payload, so it is `Copy`: values can be
/// compared, stored and passed around freely without borrowing.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum CharCategory {
    /// Whitespace that is not a line ending.
    Whitespace,
    /// A line-ending character.
    Eol,
    /// An alphanumeric character or an underscore.
    Word,
    /// A punctuation or symbol character.
    Punctuation,
    /// Anything that does not fall into the other categories.
    Unknown,
}

/// Classify `ch` into a [`CharCategory`].
///
/// The checks run in a fixed order and the first one that matches wins:
/// line ending, whitespace, word character, punctuation. A character that
/// matches none of them is reported as `CharCategory::Unknown`.
#[inline]
pub fn categorize_char(ch: char) -> CharCategory {
    match ch {
        // Line endings must be tested before whitespace: characters such as
        // '\n' also satisfy `char::is_whitespace`.
        c if char_is_line_ending(c) => CharCategory::Eol,
        c if c.is_whitespace() => CharCategory::Whitespace,
        c if char_is_word(c) => CharCategory::Word,
        c if char_is_punctuation(c) => CharCategory::Punctuation,
        _ => CharCategory::Unknown,
    }
}

/// Report whether `ch` is a line-ending character.
///
/// A character counts as a line ending exactly when
/// `LineEnding::from_char` yields `Some` for it.
#[inline]
pub fn char_is_line_ending(ch: char) -> bool {
    let detected = LineEnding::from_char(ch);
    detected.is_some()
}

/// Determine whether a character qualifies as (non-line-break)
/// whitespace.
pub fn char_is_whitespace(c: char) -> bool {
#[inline]
pub fn char_is_whitespace(ch: char) -> bool {
// TODO: this is a naive binary categorization of whitespace
// characters. For display, word wrapping, etc. we'll need a better
// categorization based on e.g. breaking vs non-breaking spaces
// and whether they're zero-width or not.
match c {
match ch {
//'\u{1680}' | // Ogham Space Mark (here for completeness, but usually displayed as a dash, not as whitespace)
'\u{0009}' | // Character Tabulation
'\u{0020}' | // Space
Expand All @@ -34,8 +53,81 @@ pub fn char_is_whitespace(c: char) -> bool {
// En Quad, Em Quad, En Space, Em Space, Three-per-em Space,
// Four-per-em Space, Six-per-em Space, Figure Space,
// Punctuation Space, Thin Space, Hair Space, Zero Width Space.
c if ('\u{2000}' ..= '\u{200B}').contains(&c) => true,
ch if ('\u{2000}' ..= '\u{200B}').contains(&ch) => true,

_ => false,
}
}

/// Report whether `ch` is treated as punctuation.
///
/// The decision is based on the Unicode general category of `ch`. Besides
/// the punctuation categories proper (other, open, close, initial, final,
/// connector, dash), the math, currency and modifier symbol categories are
/// accepted as well.
#[inline]
pub fn char_is_punctuation(ch: char) -> bool {
    use unicode_general_category::{get_general_category, GeneralCategory};

    let category = get_general_category(ch);
    matches!(
        category,
        GeneralCategory::OtherPunctuation
            | GeneralCategory::OpenPunctuation
            | GeneralCategory::ClosePunctuation
            | GeneralCategory::InitialPunctuation
            | GeneralCategory::FinalPunctuation
            | GeneralCategory::ConnectorPunctuation
            | GeneralCategory::DashPunctuation
            | GeneralCategory::MathSymbol
            | GeneralCategory::CurrencySymbol
            | GeneralCategory::ModifierSymbol
    )
}

/// Report whether `ch` is a word character: an underscore, or any character
/// for which `char::is_alphanumeric` is true.
#[inline]
pub fn char_is_word(ch: char) -> bool {
    match ch {
        '_' => true,
        other => other.is_alphanumeric(),
    }
}

#[cfg(test)]
mod test {
    use super::*;

    /// Checks that every character of each sample string is classified into
    /// the category the sample is named after.
    #[test]
    fn test_categorize() {
        // `const` string slices are implicitly `'static`; no explicit
        // lifetime is needed.
        const EOL_TEST_CASE: &str = "\n\r\u{000B}\u{000C}\u{0085}\u{2028}\u{2029}";
        const WORD_TEST_CASE: &str =
            "_hello_world_あいうえおー12345678901234567890";
        const PUNCTUATION_TEST_CASE: &str =
            "!\"#$%&\'()*+,-./:;<=>?@[\\]^`{|}~!”#$%&’()*+、。:;<=>?@「」^`{|}~";
        const WHITESPACE_TEST_CASE: &str = "      ";

        // One table drives all four checks so every category reports
        // failures the same way.
        let cases = [
            (CharCategory::Eol, EOL_TEST_CASE),
            (CharCategory::Whitespace, WHITESPACE_TEST_CASE),
            (CharCategory::Word, WORD_TEST_CASE),
            (CharCategory::Punctuation, PUNCTUATION_TEST_CASE),
        ];

        for (expected, sample) in &cases {
            for ch in sample.chars() {
                let actual = categorize_char(ch);
                // `{:?}` on the char keeps line breaks and other invisible
                // characters readable in the failure message.
                assert_eq!(
                    *expected, actual,
                    "Testing {:?}, but got `CharCategory::{:?}` instead of `CharCategory::{:?}`",
                    ch, actual, expected
                );
            }
        }
    }
}
3 changes: 3 additions & 0 deletions helix-core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ pub mod diagnostic;
pub mod graphemes;
pub mod history;
pub mod indent;
pub mod line_ending;
pub mod macros;
pub mod match_brackets;
pub mod movement;
Expand Down Expand Up @@ -106,6 +107,7 @@ pub use tendril::StrTendril as Tendril;
#[doc(inline)]
pub use {regex, tree_sitter};

pub use graphemes::RopeGraphemes;
pub use position::{coords_at_pos, pos_at_coords, Position};
pub use selection::{Range, Selection};
pub use smallvec::SmallVec;
Expand All @@ -114,4 +116,5 @@ pub use syntax::Syntax;
pub use diagnostic::Diagnostic;
pub use state::State;

pub use line_ending::{LineEnding, DEFAULT_LINE_ENDING};
pub use transaction::{Assoc, Change, ChangeSet, Operation, Transaction};
Loading

0 comments on commit a70de6e

Please sign in to comment.