From 484368fd8ac18030f45c915436d2dd4ba637e238 Mon Sep 17 00:00:00 2001
From: Elijah Potter
Date: Sat, 2 Mar 2024 11:24:17 -0700
Subject: [PATCH] feat: now parses URLs as their own tokens

---
 Cargo.lock                              |   2 +
 demo.md                                 |   2 +-
 harper-core/src/document.rs             |   5 +-
 harper-core/src/lexing/email_address.rs |  72 +-------
 harper-core/src/lexing/hostname.rs      |  54 ++++++
 harper-core/src/lexing/mod.rs           |   5 +
 harper-core/src/lexing/url.rs           | 226 ++++++++++++++++++++++++
 harper-core/src/linting/matcher.rs      |   2 +
 harper-core/src/token.rs                |   1 +
 harper-ls/Cargo.toml                    |   2 +
 harper-ls/src/backend.rs                |  22 ++-
 harper-ls/src/main.rs                   |  14 +-
 harper-wasm/Cargo.toml                  |   4 -
 precommit.sh                            |   1 +
 14 files changed, 328 insertions(+), 84 deletions(-)
 create mode 100644 harper-core/src/lexing/hostname.rs
 create mode 100644 harper-core/src/lexing/url.rs

diff --git a/Cargo.lock b/Cargo.lock
index e6a2dc2f..15dbda74 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -533,6 +533,8 @@ dependencies = [
  "serde_json",
  "tokio",
  "tower-lsp",
+ "tracing",
+ "tracing-subscriber",
  "tree-sitter",
  "tree-sitter-c",
  "tree-sitter-c-sharp",
diff --git a/demo.md b/demo.md
index daf6deae..72adbd7b 100644
--- a/demo.md
+++ b/demo.md
@@ -1,7 +1,7 @@
 There are some cases where the the standard grammar checkers don't cut it.
 That s where Harper comes in handy.
 
-Harper is a language checker for developers. it can detect
+Harper is an language checker for developers. it can detect
 improper capitalization and misspellled words.
 
 Harper works everywhere, even offline. Since you r data
diff --git a/harper-core/src/document.rs b/harper-core/src/document.rs
index 091efe44..64d946ca 100644
--- a/harper-core/src/document.rs
+++ b/harper-core/src/document.rs
@@ -5,8 +5,7 @@ use itertools::Itertools;
 use crate::linting::Suggestion;
 use crate::parsers::{Markdown, Parser, PlainEnglish};
 use crate::span::Span;
-use crate::Punctuation::{self};
-use crate::{FatToken, Token, TokenKind, TokenStringExt};
+use crate::{FatToken, Punctuation, Token, TokenKind, TokenStringExt};
 
 pub struct Document {
     source: Vec<char>,
@@ -114,7 +113,7 @@ impl Document {
         first_sentence.into_iter().chain(rest).chain(last)
     }
 
-    /** Returns all tokens whose `kind` is [`Punctuation::Word`] */
+    /** Returns all tokens whose `kind` is [`TokenKind::Word`] */
     pub fn words(&self) -> impl Iterator<Item = &Token> + '_ {
         self.tokens
             .iter()
diff --git a/harper-core/src/lexing/email_address.rs b/harper-core/src/lexing/email_address.rs
index c50860e2..bd859d45 100644
--- a/harper-core/src/lexing/email_address.rs
+++ b/harper-core/src/lexing/email_address.rs
@@ -1,5 +1,6 @@
 use itertools::Itertools;
 
+use super::hostname::lex_hostname;
 use super::FoundToken;
 use crate::TokenKind;
 
@@ -13,20 +14,7 @@ pub fn lex_email_address(source: &[char]) -> Option<FoundToken> {
         return None;
     }
 
-    let mut domain_part_len = source[at_loc + 1..]
-        .iter()
-        .position(|c| c.is_whitespace())
-        .unwrap_or(source.len() - 1 - at_loc);
-
-    loop {
-        let domain_part = &source[at_loc + 1..at_loc + 1 + domain_part_len];
-
-        if validate_hostname(domain_part) {
-            break;
-        }
-
-        domain_part_len -= 1;
-    }
+    let domain_part_len = lex_hostname(&source[at_loc + 1..])?;
 
     Some(FoundToken {
         next_index: at_loc + 1 + domain_part_len,
@@ -112,31 +100,10 @@ fn valid_unquoted_character(c: char) -> bool {
     false
 }
 
-/// Check if a host name is valid.
-fn validate_hostname(source: &[char]) -> bool {
-    if source.len() > 253 || source.is_empty() {
-        return false;
-    }
-
-    for label in source.split(|c| *c == '.') {
-        if label.is_empty() || label.len() > 63 {
-            return false;
-        }
-
-        for c in label {
-            if !matches!(c, 'A'..='Z' | 'a'..='z' | '0'..='9' | '-') {
-                return false;
-            }
-        }
-    }
-
-    true
-}
-
 #[cfg(test)]
 mod tests {
+    use super::super::hostname::tests::example_domain_parts;
     use super::{lex_email_address, validate_local_part};
-    use crate::lexing::email_address::validate_hostname;
 
     fn example_local_parts() -> impl Iterator<Item = Vec<char>> {
         [
@@ -162,31 +129,6 @@ mod tests {
         ]
         .into_iter()
         .map(|s| s.chars().collect())
     }
 
-    fn example_domain_parts() -> impl Iterator<Item = Vec<char>> {
-        [
-            r#"example.com"#,
-            r#"example.com"#,
-            r#"example.com"#,
-            r#"and.subdomains.example.com"#,
-            r#"example.com"#,
-            r#"example.com"#,
-            r#"example"#,
-            r#"s.example"#,
-            r#"example.org"#,
-            r#"example.org"#,
-            r#"example.org"#,
-            r#"strange.example.com"#,
-            r#"example.org"#,
-            r#"example.org"# /* The existing parser intentionally doesn't support IP addresses
-                             * It simply isn't worth the effort at the moment.
-                             * r#"[123.123.123.123]"#,
-                             * r#"[IPv6:2001:0db8:85a3:0000:0000:8a2e:0370:7334]"#,
-                             * r#"[IPv6:2001:0db8:85a3:0000:0000:8a2e:0370:7334]"#, */
-        ]
-        .into_iter()
-        .map(|s| s.chars().collect())
-    }
-
     #[test]
     fn example_local_parts_pass_validation() {
         for local in example_local_parts() {
             dbg!(local.iter().collect::<String>());
             assert!(validate_local_part(&local));
         }
     }
 
-    #[test]
-    fn example_domain_parts_pass_validation() {
-        for domain in example_domain_parts() {
-            dbg!(domain.iter().collect::<String>());
-            assert!(validate_hostname(&domain));
-        }
-    }
-
     #[test]
     fn test_many_example_email_addresses() {
         for local in example_local_parts() {
diff --git a/harper-core/src/lexing/hostname.rs b/harper-core/src/lexing/hostname.rs
new file mode 100644
index 00000000..dd731b45
--- /dev/null
+++ b/harper-core/src/lexing/hostname.rs
@@ -0,0 +1,54 @@
+pub fn lex_hostname(source: &[char]) -> Option<usize> {
+    let mut passed_chars = 0;
+
+    for label in source.split(|c| *c == '.') {
+        for c in label {
+            passed_chars += 1;
+            if !matches!(c, 'A'..='Z' | 'a'..='z' | '0'..='9' | '-') {
+                return Some(passed_chars);
+            }
+        }
+
+        passed_chars += 1;
+    }
+
+    if passed_chars == 0 {
+        None
+    } else {
+        Some(passed_chars - 1)
+    }
+}
+
+#[cfg(test)]
+pub mod tests {
+    use super::lex_hostname;
+
+    pub fn example_domain_parts() -> impl Iterator<Item = Vec<char>> {
+        [
+            r#"example.com"#,
+            r#"example.com"#,
+            r#"example.com"#,
+            r#"and.subdomains.example.com"#,
+            r#"example.com"#,
+            r#"example.com"#,
+            r#"example"#,
+            r#"s.example"#,
+            r#"example.org"#,
+            r#"example.org"#,
+            r#"example.org"#,
+            r#"strange.example.com"#,
+            r#"example.org"#,
+            r#"example.org"#
+        ]
+        .into_iter()
+        .map(|s| s.chars().collect())
+    }
+
+    #[test]
+    fn can_parse_example_hostnames() {
+        for domain in example_domain_parts() {
+            dbg!(domain.iter().collect::<String>());
+            assert_eq!(lex_hostname(&domain), Some(domain.len()));
+        }
+    }
+}
diff --git a/harper-core/src/lexing/mod.rs b/harper-core/src/lexing/mod.rs
index 895b6511..8b0cd7a2 100644
--- a/harper-core/src/lexing/mod.rs
+++ b/harper-core/src/lexing/mod.rs
@@ -1,4 +1,8 @@
 mod email_address;
+mod hostname;
+mod url;
+
+use url::lex_url;
 
 use self::email_address::lex_email_address;
 use crate::token::{Punctuation, Quote, TokenKind};
@@ -17,6 +21,7 @@ pub fn lex_token(source: &[char]) -> Option<FoundToken> {
         lex_spaces,
         lex_newlines,
         lex_number,
+        lex_url,
         lex_email_address,
         lex_word
     ];
diff --git a/harper-core/src/lexing/url.rs b/harper-core/src/lexing/url.rs
new file mode 100644
index 00000000..0bac6491
--- /dev/null
+++ b/harper-core/src/lexing/url.rs
@@ -0,0 +1,226 @@
+/// This module implements parsing of URIs.
+/// See RFC 1738 for more information.
+use super::{hostname::lex_hostname, FoundToken};
+use crate::TokenKind;
+
+pub fn lex_url(source: &[char]) -> Option<FoundToken> {
+    let sep = source.iter().position(|c| *c == ':')?;
+
+    if !validate_scheme(&source[0..sep]) {
+        return None;
+    }
+
+    let url_end = lex_ip_schemepart(&source[sep + 1..])?;
+
+    Some(FoundToken {
+        next_index: url_end + sep + 1,
+        token: TokenKind::Url
+    })
+}
+
+/// Checks whether a given char string is a valid "scheme" part of a URI.
+fn validate_scheme(source: &[char]) -> bool {
+    source.iter().all(|c: &char| valid_scheme_char(*c))
+}
+
+fn lex_ip_schemepart(source: &[char]) -> Option<usize> {
+    if !matches!(source, ['/', '/', ..]) {
+        return None;
+    }
+
+    let rest = &source[2..];
+
+    let login_end = lex_login(rest).unwrap_or(0);
+
+    let mut cursor = login_end;
+
+    // Parse endpoint path
+    while cursor != rest.len() {
+        if rest[cursor] != '/' {
+            break;
+        }
+
+        cursor += 1;
+
+        let next_idx = lex_xchar_string(&rest[cursor..]);
+
+        if next_idx == 0 {
+            break;
+        }
+
+        cursor += next_idx;
+    }
+
+    Some(cursor + 2)
+}
+
+fn lex_login(source: &[char]) -> Option<usize> {
+    let hostport_start = if let Some(cred_end) = source.iter().position(|c| *c == '@') {
+        if let Some(pass_beg) = source[0..cred_end].iter().position(|c| *c == ':') {
+            if !is_uchar_plus_string(&source[pass_beg + 1..cred_end]) {
+                return None;
+            }
+        }
+
+        // Check username
+        if !is_uchar_plus_string(&source[0..cred_end]) {
+            return None;
+        }
+
+        cred_end + 1
+    } else {
+        0
+    };
+
+    let hostport_source = &source[hostport_start..];
+
+    let hostport_end = lex_hostport(hostport_source)?;
+
+    Some(hostport_start + hostport_end)
+}
+
+fn lex_hostport(source: &[char]) -> Option<usize> {
+    let hostname_end = lex_hostname(source)?;
+
+    if source.get(hostname_end) == Some(&':') {
+        Some(
+            source
+                .iter()
+                .enumerate()
+                .find(|(_, c)| !{
+                    let c = **c;
+                    c.is_ascii_digit()
+                })
+                .map(|(i, _)| i)
+                .unwrap_or(source.len())
+        )
+    } else {
+        Some(hostname_end)
+    }
+}
+
+fn valid_scheme_char(c: char) -> bool {
+    c.is_ascii_alphabetic() || c.is_ascii_digit() || matches!(c, '.' | '-' | '+')
+}
+
+fn is_reserved(c: char) -> bool {
+    matches!(c, ';' | '/' | '?' | ':' | '@' | '&' | '=')
+}
+
+fn is_safe(c: char) -> bool {
+    matches!(c, '$' | '-' | '_' | '.' | '+')
+}
+
+fn is_extra(c: char) -> bool {
+    matches!(c, '!' | '*' | '\'' | '(' | ')' | ',')
+}
+
+fn is_unreserved(c: char) -> bool {
+    c.is_ascii_alphabetic() || c.is_ascii_digit() || is_safe(c) || is_extra(c)
+}
+
+fn is_hex(c: char) -> bool {
+    c.is_ascii_digit() || matches!(c, 'A'..='F' | 'a'..='f')
+}
+
+/// Lex an escaped hex code, returning the subsequent index
+fn lex_escaped(source: &[char]) -> Option<usize> {
+    if source.len() < 3 {
+        return None;
+    }
+
+    if source[0] == '%' && is_hex(source[1]) && is_hex(source[2]) {
+        Some(3)
+    } else {
+        None
+    }
+}
+
+fn lex_xchar_string(source: &[char]) -> usize {
+    let mut cursor = 0;
+
+    while cursor != source.len() {
+        let Some(next) = lex_xchar(&source[cursor..]) else {
+            break;
+        };
+
+        cursor += next;
+    }
+
+    cursor
+}
+
+fn is_xchar_string(source: &[char]) -> bool {
+    lex_xchar_string(source) == source.len()
+}
+
+/// Used for passwords and usernames
+fn is_uchar_plus_string(source: &[char]) -> bool {
+    let mut cursor = 0;
+
+    while cursor != source.len() {
+        if matches!(source[cursor], ';' | '?' | '&' | '=') {
+            cursor += 1;
+            continue;
+        }
+
+        let Some(next) = lex_uchar(&source[cursor..]) else {
+            return false;
+        };
+
+        cursor += next;
+    }
+
+    true
+}
+
+fn lex_xchar(source: &[char]) -> Option<usize> {
+    if is_reserved(source[0]) {
+        return Some(1);
+    }
+
+    lex_uchar(source)
+}
+
+fn lex_uchar(source: &[char]) -> Option<usize> {
+    if is_unreserved(source[0]) {
+        return Some(1);
+    }
+
+    lex_escaped(source)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::lex_url;
+
+    fn assert_consumes_full(url: &str) {
+        assert_consumes_part(url, url.len());
+    }
+
+    fn assert_consumes_part(url: &str, len: usize) {
+        let url = url.chars().collect::<Vec<_>>();
+
+        assert_eq!(lex_url(&url).unwrap().next_index, len);
+    }
+
+    #[test]
+    fn consumes_google() {
+        assert_consumes_full("https://google.com")
+    }
+
+    #[test]
+    fn consumes_wikipedia() {
+        assert_consumes_full("https://wikipedia.com")
+    }
+
+    #[test]
+    fn consumes_youtube() {
+        assert_consumes_full("https://youtube.com")
+    }
+
+    #[test]
+    fn consumes_youtube_not_garbage() {
+        assert_consumes_part("https://youtube.com aklsjdha", 20);
+    }
+}
diff --git a/harper-core/src/linting/matcher.rs b/harper-core/src/linting/matcher.rs
index 41d0187f..ccc32be2 100644
--- a/harper-core/src/linting/matcher.rs
+++ b/harper-core/src/linting/matcher.rs
@@ -99,6 +99,8 @@ impl Matcher {
         // This match list needs to be automatically expanded instead of explicitly
         // defined like it is now.
         let mut triggers = pt! {
+            "repo" => "repository",
+            "repos" => "repositories",
             "my","self" => "myself",
             "human","live" => "human life",
             "eight","grade" => "eighth grade",
diff --git a/harper-core/src/token.rs b/harper-core/src/token.rs
index c114f464..a078547b 100644
--- a/harper-core/src/token.rs
+++ b/harper-core/src/token.rs
@@ -42,6 +42,7 @@ pub enum TokenKind {
     /// A sequence of "\n" newlines
     Newline(usize),
     EmailAddress,
+    Url,
     /// A special token used for things like inline code blocks that should be
     /// ignored by all linters.
     Unlintable
diff --git a/harper-ls/Cargo.toml b/harper-ls/Cargo.toml
index 3ff33070..a8473938 100644
--- a/harper-ls/Cargo.toml
+++ b/harper-ls/Cargo.toml
@@ -30,3 +30,5 @@ dirs = "5.0.1"
 anyhow = "1.0.80"
 serde_json = "1.0.114"
 itertools = "0.12.1"
+tracing = "0.1.40"
+tracing-subscriber = "0.3.18"
diff --git a/harper-ls/src/backend.rs b/harper-ls/src/backend.rs
index c9d7b1f2..e52cfff9 100644
--- a/harper-ls/src/backend.rs
+++ b/harper-ls/src/backend.rs
@@ -35,6 +35,7 @@ use tower_lsp::lsp_types::{
     Url
 };
 use tower_lsp::{Client, LanguageServer};
+use tracing::error;
 
 use crate::config::Config;
 use crate::diagnostics::{lint_to_code_actions, lints_to_diagnostics};
@@ -90,7 +91,11 @@ impl Backend {
     async fn load_file_dictionary(&self, url: &Url) -> FullDictionary {
         match load_dict(self.get_file_dict_path(url)).await {
             Ok(dict) => dict,
-            Err(_) => FullDictionary::new()
+            Err(err) => {
+                error!("Problem loading file dictionary: {}", err);
+
+                FullDictionary::new()
+            }
         }
     }
 
@@ -101,7 +106,11 @@ impl Backend {
     async fn load_user_dictionary(&self) -> FullDictionary {
         match load_dict(&self.config.user_dict_path).await {
             Ok(dict) => dict,
-            Err(_) => FullDictionary::new()
+            Err(err) => {
+                error!("Problem loading user dictionary: {}", err);
+
+                FullDictionary::new()
+            }
         }
     }
 
@@ -133,9 +142,12 @@ impl Backend {
     }
 
     async fn update_document_from_file(&self, url: &Url) -> anyhow::Result<()> {
-        let Ok(content) = tokio::fs::read_to_string(url.path()).await else {
-            // TODO: Proper error handling here.
-            return Ok(());
+        let content = match tokio::fs::read_to_string(url.path()).await {
+            Ok(content) => content,
+            Err(err) => {
+                error!("Error updating document from file: {}", err);
+                return Ok(());
+            }
         };
 
         self.update_document(url, &content).await
diff --git a/harper-ls/src/main.rs b/harper-ls/src/main.rs
index eb2e8826..3821ff81 100644
--- a/harper-ls/src/main.rs
+++ b/harper-ls/src/main.rs
@@ -1,3 +1,5 @@
+use std::io::stderr;
+
 use config::Config;
 use tokio::fs;
 use tokio::net::TcpListener;
@@ -11,6 +13,8 @@ mod tree_sitter_parser;
 use backend::Backend;
 use clap::Parser;
 use tower_lsp::{LspService, Server};
+use tracing::Level;
+use tracing_subscriber::FmtSubscriber;
 
 #[derive(Debug, Parser)]
 struct Args {
@@ -20,11 +24,17 @@
 
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
-    let args = Args::parse();
+    let subscriber = FmtSubscriber::builder()
+        .map_writer(move |_| stderr)
+        .with_max_level(Level::TRACE)
+        .finish();
+    tracing::subscriber::set_global_default(subscriber)?;
+
+    let args = Args::parse();
     let config = Config::default();
 
-    // Make sure this is available.
+    // Make sure these are available.
     fs::create_dir_all(config.user_dict_path.parent().unwrap()).await?;
     fs::create_dir_all(&config.file_dict_path).await?;
diff --git a/harper-wasm/Cargo.toml b/harper-wasm/Cargo.toml
index 763e6b12..fbb649ee 100644
--- a/harper-wasm/Cargo.toml
+++ b/harper-wasm/Cargo.toml
@@ -8,10 +8,6 @@ publish = false
 [lib]
 crate-type = ["cdylib", "rlib"]
 
-[profile.release]
-opt-level = 3
-strip = true
-
 [dependencies]
 console_error_panic_hook = "0.1.7"
 tracing = "0.1.40"
diff --git a/precommit.sh b/precommit.sh
index 835c8eee..d111dc80 100755
--- a/precommit.sh
+++ b/precommit.sh
@@ -5,6 +5,7 @@ set -eo pipefail
 
 R=$(pwd)
 
+RUSTDOCFLAGS="-D warnings"
 cargo +nightly fmt
 cargo clippy -- -Dwarnings
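A quick sketch of how the new lexer behaves at its boundary, mirroring the
consumes_youtube_not_garbage test above. This is illustrative only and not part
of the patch: lex_url and FoundToken are internal to harper-core, so a check
like this would live alongside the existing tests in
harper-core/src/lexing/url.rs, and the test name here is hypothetical.

    use super::lex_url;
    use crate::TokenKind;

    #[test]
    fn url_token_ends_before_following_word() {
        let source: Vec<char> = "https://youtube.com aklsjdha".chars().collect();
        let found = lex_url(&source).expect("expected a URL token");

        // `lex_hostname` counts the character that terminates the hostname,
        // so `next_index` comes back as 20 (one past the 19-character URL),
        // leaving the trailing "aklsjdha" for the other lexers.
        assert_eq!(found.next_index, 20);
        assert!(matches!(found.token, TokenKind::Url));
    }

Because lex_url runs before lex_email_address in the lex_token list, a bare URL
in prose should now come back as a single TokenKind::Url token rather than a
run of word and punctuation tokens.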