From 484368fd8ac18030f45c915436d2dd4ba637e238 Mon Sep 17 00:00:00 2001
From: Elijah Potter
Date: Sat, 2 Mar 2024 11:24:17 -0700
Subject: [PATCH] feat: now parses URLs as their own tokens

---
 Cargo.lock                              |   2 +
 demo.md                                 |   2 +-
 harper-core/src/document.rs             |   5 +-
 harper-core/src/lexing/email_address.rs |  72 +-------
 harper-core/src/lexing/hostname.rs      |  54 ++++++
 harper-core/src/lexing/mod.rs           |   5 +
 harper-core/src/lexing/url.rs           | 226 ++++++++++++++++++++++++
 harper-core/src/linting/matcher.rs      |   2 +
 harper-core/src/token.rs                |   1 +
 harper-ls/Cargo.toml                    |   2 +
 harper-ls/src/backend.rs                |  22 ++-
 harper-ls/src/main.rs                   |  14 +-
 harper-wasm/Cargo.toml                  |   4 -
 precommit.sh                            |   1 +
 14 files changed, 328 insertions(+), 84 deletions(-)
 create mode 100644 harper-core/src/lexing/hostname.rs
 create mode 100644 harper-core/src/lexing/url.rs

diff --git a/Cargo.lock b/Cargo.lock
index e6a2dc2f..15dbda74 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -533,6 +533,8 @@ dependencies = [
  "serde_json",
  "tokio",
  "tower-lsp",
+ "tracing",
+ "tracing-subscriber",
  "tree-sitter",
  "tree-sitter-c",
  "tree-sitter-c-sharp",
diff --git a/demo.md b/demo.md
index daf6deae..72adbd7b 100644
--- a/demo.md
+++ b/demo.md
@@ -1,7 +1,7 @@
 There are some cases where the the standard grammar checkers don't cut it.
 That s where Harper comes in handy.
 
-Harper is a language checker for developers. it can detect
+Harper is an language checker for developers. it can detect
 improper capitalization and misspellled words.
 
 Harper works everywhere, even offline. Since you r data
diff --git a/harper-core/src/document.rs b/harper-core/src/document.rs
index 091efe44..64d946ca 100644
--- a/harper-core/src/document.rs
+++ b/harper-core/src/document.rs
@@ -5,8 +5,7 @@ use itertools::Itertools;
 use crate::linting::Suggestion;
 use crate::parsers::{Markdown, Parser, PlainEnglish};
 use crate::span::Span;
-use crate::Punctuation::{self};
-use crate::{FatToken, Token, TokenKind, TokenStringExt};
+use crate::{FatToken, Punctuation, Token, TokenKind, TokenStringExt};
 
 pub struct Document {
     source: Vec<char>,
@@ -114,7 +113,7 @@ impl Document {
         first_sentence.into_iter().chain(rest).chain(last)
     }
 
-    /** Returns all tokens whose `kind` is [`Punctuation::Word`] */
+    /** Returns all tokens whose `kind` is [`TokenKind::Word`] */
     pub fn words(&self) -> impl Iterator<Item = &Token> + '_ {
         self.tokens
             .iter()
diff --git a/harper-core/src/lexing/email_address.rs b/harper-core/src/lexing/email_address.rs
index c50860e2..bd859d45 100644
--- a/harper-core/src/lexing/email_address.rs
+++ b/harper-core/src/lexing/email_address.rs
@@ -1,5 +1,6 @@
 use itertools::Itertools;
 
+use super::hostname::lex_hostname;
 use super::FoundToken;
 use crate::TokenKind;
 
@@ -13,20 +14,7 @@ pub fn lex_email_address(source: &[char]) -> Option<FoundToken> {
         return None;
     }
 
-    let mut domain_part_len = source[at_loc + 1..]
-        .iter()
-        .position(|c| c.is_whitespace())
-        .unwrap_or(source.len() - 1 - at_loc);
-
-    loop {
-        let domain_part = &source[at_loc + 1..at_loc + 1 + domain_part_len];
-
-        if validate_hostname(domain_part) {
-            break;
-        }
-
-        domain_part_len -= 1;
-    }
+    let domain_part_len = lex_hostname(&source[at_loc + 1..])?;
 
     Some(FoundToken {
         next_index: at_loc + 1 + domain_part_len,
@@ -112,31 +100,10 @@ fn valid_unquoted_character(c: char) -> bool {
     false
 }
 
-/// Check if a host name is valid.
-fn validate_hostname(source: &[char]) -> bool {
-    if source.len() > 253 || source.is_empty() {
-        return false;
-    }
-
-    for label in source.split(|c| *c == '.') {
-        if label.is_empty() || label.len() > 63 {
-            return false;
-        }
-
-        for c in label {
-            if !matches!(c, 'A'..='Z' | 'a'..='z' | '0'..='9' | '-') {
-                return false;
-            }
-        }
-    }
-
-    true
-}
-
 #[cfg(test)]
 mod tests {
+    use super::super::hostname::tests::example_domain_parts;
     use super::{lex_email_address, validate_local_part};
-    use crate::lexing::email_address::validate_hostname;
 
     fn example_local_parts() -> impl Iterator<Item = Vec<char>> {
         [
@@ -162,31 +129,6 @@ mod tests {
         ]
         .into_iter()
         .map(|s| s.chars().collect())
     }
 
-    fn example_domain_parts() -> impl Iterator<Item = Vec<char>> {
-        [
-            r#"example.com"#,
-            r#"example.com"#,
-            r#"example.com"#,
-            r#"and.subdomains.example.com"#,
-            r#"example.com"#,
-            r#"example.com"#,
-            r#"example"#,
-            r#"s.example"#,
-            r#"example.org"#,
-            r#"example.org"#,
-            r#"example.org"#,
-            r#"strange.example.com"#,
-            r#"example.org"#,
-            r#"example.org"# /* The existing parser intentionally doesn't support IP addresses
-                             * It simply isn't worth the effort at the moment.
-                             * r#"[123.123.123.123]"#,
-                             * r#"[IPv6:2001:0db8:85a3:0000:0000:8a2e:0370:7334]"#,
-                             * r#"[IPv6:2001:0db8:85a3:0000:0000:8a2e:0370:7334]"#, */
-        ]
-        .into_iter()
-        .map(|s| s.chars().collect())
-    }
-
     #[test]
     fn example_local_parts_pass_validation() {
         for local in example_local_parts() {
             dbg!(local.iter().collect::<String>());
             assert!(validate_local_part(&local));
         }
     }
 
-    #[test]
-    fn example_domain_parts_pass_validation() {
-        for domain in example_domain_parts() {
-            dbg!(domain.iter().collect::<String>());
-            assert!(validate_hostname(&domain));
-        }
-    }
-
     #[test]
     fn test_many_example_email_addresses() {
         for local in example_local_parts() {
diff --git a/harper-core/src/lexing/hostname.rs b/harper-core/src/lexing/hostname.rs
new file mode 100644
index 00000000..dd731b45
--- /dev/null
+++ b/harper-core/src/lexing/hostname.rs
@@ -0,0 +1,54 @@
+pub fn lex_hostname(source: &[char]) -> Option<usize> {
+    let mut passed_chars = 0;
+
+    for label in source.split(|c| *c == '.') {
+        for c in label {
+            passed_chars += 1;
+            if !matches!(c, 'A'..='Z' | 'a'..='z' | '0'..='9' | '-') {
+                return Some(passed_chars);
+            }
+        }
+
+        passed_chars += 1;
+    }
+
+    if passed_chars == 0 {
+        None
+    } else {
+        Some(passed_chars - 1)
+    }
+}
+
+#[cfg(test)]
+pub mod tests {
+    use super::lex_hostname;
+
+    pub fn example_domain_parts() -> impl Iterator<Item = Vec<char>> {
+        [
+            r#"example.com"#,
+            r#"example.com"#,
+            r#"example.com"#,
+            r#"and.subdomains.example.com"#,
+            r#"example.com"#,
+            r#"example.com"#,
+            r#"example"#,
+            r#"s.example"#,
+            r#"example.org"#,
+            r#"example.org"#,
+            r#"example.org"#,
+            r#"strange.example.com"#,
+            r#"example.org"#,
+            r#"example.org"#
+        ]
+        .into_iter()
+        .map(|s| s.chars().collect())
+    }
+
+    #[test]
+    fn can_parse_example_hostnames() {
+        for domain in example_domain_parts() {
+            dbg!(domain.iter().collect::<String>());
+            assert_eq!(lex_hostname(&domain), Some(domain.len()));
+        }
+    }
+}
diff --git a/harper-core/src/lexing/mod.rs b/harper-core/src/lexing/mod.rs
index 895b6511..8b0cd7a2 100644
--- a/harper-core/src/lexing/mod.rs
+++ b/harper-core/src/lexing/mod.rs
@@ -1,4 +1,8 @@
 mod email_address;
+mod hostname;
+mod url;
+
+use url::lex_url;
 
 use self::email_address::lex_email_address;
 use crate::token::{Punctuation, Quote, TokenKind};
@@ -17,6 +21,7 @@ pub fn lex_token(source: &[char]) -> Option<FoundToken> {
         lex_spaces,
         lex_newlines,
         lex_number,
+        lex_url,
         lex_email_address,
         lex_word
     ];
diff --git a/harper-core/src/lexing/url.rs b/harper-core/src/lexing/url.rs
new file mode 100644
index 00000000..0bac6491
--- /dev/null
+++ b/harper-core/src/lexing/url.rs
@@ -0,0 +1,226 @@
+/// This module implements parsing of URIs.
+/// See RFC 1738 for more information.
+use super::{hostname::lex_hostname, FoundToken};
+use crate::TokenKind;
+
+pub fn lex_url(source: &[char]) -> Option<FoundToken> {
+    let sep = source.iter().position(|c| *c == ':')?;
+
+    if !validate_scheme(&source[0..sep]) {
+        return None;
+    }
+
+    let url_end = lex_ip_schemepart(&source[sep + 1..])?;
+
+    Some(FoundToken {
+        next_index: url_end + sep + 1,
+        token: TokenKind::Url
+    })
+}
+
+/// Checks whether a given char string is a valid "scheme" part of a URI.
+fn validate_scheme(source: &[char]) -> bool {
+    source.iter().all(|c: &char| valid_scheme_char(*c))
+}
+
+fn lex_ip_schemepart(source: &[char]) -> Option<usize> {
+    if !matches!(source, ['/', '/', ..]) {
+        return None;
+    }
+
+    let rest = &source[2..];
+
+    let login_end = lex_login(rest).unwrap_or(0);
+
+    let mut cursor = login_end;
+
+    // Parse endpoint path
+    while cursor != rest.len() {
+        if rest[cursor] != '/' {
+            break;
+        }
+
+        cursor += 1;
+
+        let next_idx = lex_xchar_string(&rest[cursor..]);
+
+        if next_idx == 0 {
+            break;
+        }
+
+        cursor += next_idx;
+    }
+
+    Some(cursor + 2)
+}
+
+fn lex_login(source: &[char]) -> Option<usize> {
+    let hostport_start = if let Some(cred_end) = source.iter().position(|c| *c == '@') {
+        if let Some(pass_beg) = source[0..cred_end].iter().position(|c| *c == ':') {
+            if !is_uchar_plus_string(&source[pass_beg + 1..cred_end]) {
+                return None;
+            }
+        }
+
+        // Check username
+        if !is_uchar_plus_string(&source[0..cred_end]) {
+            return None;
+        }
+
+        cred_end + 1
+    } else {
+        0
+    };
+
+    let hostport_source = &source[hostport_start..];
+
+    let hostport_end = lex_hostport(hostport_source)?;
+
+    Some(hostport_start + hostport_end)
+}
+
+fn lex_hostport(source: &[char]) -> Option<usize> {
+    let hostname_end = lex_hostname(source)?;
+
+    if source.get(hostname_end) == Some(&':') {
+        Some(
+            source
+                .iter()
+                .enumerate()
+                .find(|(_, c)| !{
+                    let c = **c;
+                    c.is_ascii_digit()
+                })
+                .map(|(i, _)| i)
+                .unwrap_or(source.len())
+        )
+    } else {
+        Some(hostname_end)
+    }
+}
+
+fn valid_scheme_char(c: char) -> bool {
+    c.is_ascii_alphabetic() || c.is_ascii_digit() || matches!(c, '.' | '-' | '+')
+}
+
+fn is_reserved(c: char) -> bool {
+    matches!(c, ';' | '/' | '?' | ':' | '@' | '&' | '=')
+}
+
+fn is_safe(c: char) -> bool {
+    matches!(c, '$' | '-' | '_' | '.' | '+')
+}
+
+fn is_extra(c: char) -> bool {
+    matches!(c, '!' | '*' | '\'' | '(' | ')' | ',')
+}
+
+fn is_unreserved(c: char) -> bool {
+    c.is_ascii_alphabetic() || c.is_ascii_digit() || is_safe(c) || is_extra(c)
+}
+
+fn is_hex(c: char) -> bool {
+    c.is_ascii_digit() || matches!(c, 'A'..='F' | 'a'..='f')
+}
+
+/// Lex an escaped hex code, returning the subsequent index
+fn lex_escaped(source: &[char]) -> Option<usize> {
+    if source.len() < 3 {
+        return None;
+    }
+
+    if source[0] == '%' && is_hex(source[1]) && is_hex(source[2]) {
+        Some(3)
+    } else {
+        None
+    }
+}
+
+fn lex_xchar_string(source: &[char]) -> usize {
+    let mut cursor = 0;
+
+    while cursor != source.len() {
+        let Some(next) = lex_xchar(&source[cursor..]) else {
+            break;
+        };
+
+        cursor += next;
+    }
+
+    cursor
+}
+
+fn is_xchar_string(source: &[char]) -> bool {
+    lex_xchar_string(source) == source.len()
+}
+
+/// Used for passwords and usernames
+fn is_uchar_plus_string(source: &[char]) -> bool {
+    let mut cursor = 0;
+
+    while cursor != source.len() {
+        if matches!(source[cursor], ';' | '?' | '&' | '=') {
+            cursor += 1;
+            continue;
+        }
+
+        let Some(next) = lex_uchar(&source[cursor..]) else {
+            return false;
+        };
+
+        cursor += next;
+    }
+
+    true
+}
+
+fn lex_xchar(source: &[char]) -> Option<usize> {
+    if is_reserved(source[0]) {
+        return Some(1);
+    }
+
+    lex_uchar(source)
+}
+
+fn lex_uchar(source: &[char]) -> Option<usize> {
+    if is_unreserved(source[0]) {
+        return Some(1);
+    }
+
+    lex_escaped(source)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::lex_url;
+
+    fn assert_consumes_full(url: &str) {
+        assert_consumes_part(url, url.len());
+    }
+
+    fn assert_consumes_part(url: &str, len: usize) {
+        let url = url.chars().collect::<Vec<_>>();
+
+        assert_eq!(lex_url(&url).unwrap().next_index, len);
+    }
+
+    #[test]
+    fn consumes_google() {
+        assert_consumes_full("https://google.com")
+    }
+
+    #[test]
+    fn consumes_wikipedia() {
+        assert_consumes_full("https://wikipedia.com")
+    }
+
+    #[test]
+    fn consumes_youtube() {
+        assert_consumes_full("https://youtube.com")
+    }
+
+    #[test]
+    fn consumes_youtube_not_garbage() {
+        assert_consumes_part("https://youtube.com aklsjdha", 20);
+    }
+}
diff --git a/harper-core/src/linting/matcher.rs b/harper-core/src/linting/matcher.rs
index 41d0187f..ccc32be2 100644
--- a/harper-core/src/linting/matcher.rs
+++ b/harper-core/src/linting/matcher.rs
@@ -99,6 +99,8 @@ impl Matcher {
         // This match list needs to be automatically expanded instead of explicitly
         // defined like it is now.
         let mut triggers = pt! {
+            "repo" => "repository",
+            "repos" => "repositories",
             "my","self" => "myself",
             "human","live" => "human life",
             "eight","grade" => "eighth grade",
diff --git a/harper-core/src/token.rs b/harper-core/src/token.rs
index c114f464..a078547b 100644
--- a/harper-core/src/token.rs
+++ b/harper-core/src/token.rs
@@ -42,6 +42,7 @@ pub enum TokenKind {
     /// A sequence of "\n" newlines
     Newline(usize),
     EmailAddress,
+    Url,
     /// A special token used for things like inline code blocks that should be
     /// ignored by all linters.
     Unlintable
diff --git a/harper-ls/Cargo.toml b/harper-ls/Cargo.toml
index 3ff33070..a8473938 100644
--- a/harper-ls/Cargo.toml
+++ b/harper-ls/Cargo.toml
@@ -30,3 +30,5 @@ dirs = "5.0.1"
 anyhow = "1.0.80"
 serde_json = "1.0.114"
 itertools = "0.12.1"
+tracing = "0.1.40"
+tracing-subscriber = "0.3.18"
diff --git a/harper-ls/src/backend.rs b/harper-ls/src/backend.rs
index c9d7b1f2..e52cfff9 100644
--- a/harper-ls/src/backend.rs
+++ b/harper-ls/src/backend.rs
@@ -35,6 +35,7 @@ use tower_lsp::lsp_types::{
     Url
 };
 use tower_lsp::{Client, LanguageServer};
+use tracing::error;
 
 use crate::config::Config;
 use crate::diagnostics::{lint_to_code_actions, lints_to_diagnostics};
@@ -90,7 +91,11 @@ impl Backend {
     async fn load_file_dictionary(&self, url: &Url) -> FullDictionary {
         match load_dict(self.get_file_dict_path(url)).await {
             Ok(dict) => dict,
-            Err(_) => FullDictionary::new()
+            Err(err) => {
+                error!("Problem loading file dictionary: {}", err);
+
+                FullDictionary::new()
+            }
         }
     }
 
@@ -101,7 +106,11 @@ impl Backend {
     async fn load_user_dictionary(&self) -> FullDictionary {
         match load_dict(&self.config.user_dict_path).await {
             Ok(dict) => dict,
-            Err(_) => FullDictionary::new()
+            Err(err) => {
+                error!("Problem loading user dictionary: {}", err);
+
+                FullDictionary::new()
+            }
         }
     }
 
@@ -133,9 +142,12 @@ impl Backend {
     }
 
     async fn update_document_from_file(&self, url: &Url) -> anyhow::Result<()> {
-        let Ok(content) = tokio::fs::read_to_string(url.path()).await else {
-            // TODO: Proper error handling here.
-            return Ok(());
+        let content = match tokio::fs::read_to_string(url.path()).await {
+            Ok(content) => content,
+            Err(err) => {
+                error!("Error updating document from file: {}", err);
+                return Ok(());
+            }
         };
 
         self.update_document(url, &content).await
diff --git a/harper-ls/src/main.rs b/harper-ls/src/main.rs
index eb2e8826..3821ff81 100644
--- a/harper-ls/src/main.rs
+++ b/harper-ls/src/main.rs
@@ -1,3 +1,5 @@
+use std::io::stderr;
+
 use config::Config;
 use tokio::fs;
 use tokio::net::TcpListener;
@@ -11,6 +13,8 @@ mod tree_sitter_parser;
 use backend::Backend;
 use clap::Parser;
 use tower_lsp::{LspService, Server};
+use tracing::Level;
+use tracing_subscriber::FmtSubscriber;
 
 #[derive(Debug, Parser)]
 struct Args {
@@ -20,11 +24,17 @@
 
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
-    let args = Args::parse();
+    let subscriber = FmtSubscriber::builder()
+        .map_writer(move |_| stderr)
+        .with_max_level(Level::TRACE)
+        .finish();
+    tracing::subscriber::set_global_default(subscriber)?;
+
+    let args = Args::parse();
     let config = Config::default();
 
-    // Make sure this is available.
+    // Make sure these are available.
     fs::create_dir_all(config.user_dict_path.parent().unwrap()).await?;
     fs::create_dir_all(&config.file_dict_path).await?;
diff --git a/harper-wasm/Cargo.toml b/harper-wasm/Cargo.toml
index 763e6b12..fbb649ee 100644
--- a/harper-wasm/Cargo.toml
+++ b/harper-wasm/Cargo.toml
@@ -8,10 +8,6 @@ publish = false
 [lib]
 crate-type = ["cdylib", "rlib"]
 
-[profile.release]
-opt-level = 3
-strip = true
-
 [dependencies]
 console_error_panic_hook = "0.1.7"
 tracing = "0.1.40"
diff --git a/precommit.sh b/precommit.sh
index 835c8eee..d111dc80 100755
--- a/precommit.sh
+++ b/precommit.sh
@@ -5,6 +5,7 @@ set -eo pipefail
 
 R=$(pwd)
 
+RUSTDOCFLAGS="-D warnings"
 cargo +nightly fmt
 cargo clippy -- -Dwarnings
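A quick sketch of how the new lexer behaves at its boundary, mirroring the
consumes_youtube_not_garbage test above. This is illustrative only and not part
of the patch: lex_url and FoundToken are internal to harper-core, so a check
like this would live alongside the existing tests in
harper-core/src/lexing/url.rs, and the test name here is hypothetical.

    use super::lex_url;
    use crate::TokenKind;

    #[test]
    fn url_token_ends_before_following_word() {
        let source: Vec<char> = "https://youtube.com aklsjdha".chars().collect();
        let found = lex_url(&source).expect("expected a URL token");

        // `lex_hostname` counts the character that terminates the hostname,
        // so `next_index` comes back as 20 (one past the 19-character URL),
        // leaving the trailing "aklsjdha" for the other lexers.
        assert_eq!(found.next_index, 20);
        assert!(matches!(found.token, TokenKind::Url));
    }

Because lex_url runs before lex_email_address in the lex_token list, a bare URL
in prose should now come back as a single TokenKind::Url token rather than a
run of word and punctuation tokens.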