
Commit

feat: now parses URLs as their own tokens
elijah-potter committed Mar 2, 2024
1 parent aa3c6c6 commit 484368f
Showing 14 changed files with 328 additions and 84 deletions.
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion demo.md
@@ -1,7 +1,7 @@
There are some cases where the the standard grammar checkers
don't cut it. That s where Harper comes in handy.

-Harper is a language checker for developers. it can detect
+Harper is an language checker for developers. it can detect
improper capitalization and misspellled words.

Harper works everywhere, even offline. Since you r data
5 changes: 2 additions & 3 deletions harper-core/src/document.rs
@@ -5,8 +5,7 @@ use itertools::Itertools;
use crate::linting::Suggestion;
use crate::parsers::{Markdown, Parser, PlainEnglish};
use crate::span::Span;
-use crate::Punctuation::{self};
-use crate::{FatToken, Token, TokenKind, TokenStringExt};
+use crate::{FatToken, Punctuation, Token, TokenKind, TokenStringExt};

pub struct Document {
source: Vec<char>,
@@ -114,7 +113,7 @@ impl Document {
first_sentence.into_iter().chain(rest).chain(last)
}

-/** Returns all tokens whose `kind` is [`Punctuation::Word`] */
+/** Returns all tokens whose `kind` is [`TokenKind::Word`] */
pub fn words(&self) -> impl Iterator<Item = Token> + '_ {
self.tokens
.iter()
72 changes: 3 additions & 69 deletions harper-core/src/lexing/email_address.rs
@@ -1,5 +1,6 @@
use itertools::Itertools;

+ use super::hostname::lex_hostname;
use super::FoundToken;
use crate::TokenKind;

@@ -13,20 +14,7 @@ pub fn lex_email_address(source: &[char]) -> Option<FoundToken> {
return None;
}

- let mut domain_part_len = source[at_loc + 1..]
- .iter()
- .position(|c| c.is_whitespace())
- .unwrap_or(source.len() - 1 - at_loc);
-
- loop {
- let domain_part = &source[at_loc + 1..at_loc + 1 + domain_part_len];
-
- if validate_hostname(domain_part) {
- break;
- }
-
- domain_part_len -= 1;
- }
+ let domain_part_len = lex_hostname(&source[at_loc + 1..])?;

Some(FoundToken {
next_index: at_loc + 1 + domain_part_len,
@@ -112,31 +100,10 @@ fn valid_unquoted_character(c: char) -> bool {
false
}

- /// Check if a host name is valid.
- fn validate_hostname(source: &[char]) -> bool {
- if source.len() > 253 || source.is_empty() {
- return false;
- }
-
- for label in source.split(|c| *c == '.') {
- if label.is_empty() || label.len() > 63 {
- return false;
- }
-
- for c in label {
- if !matches!(c, 'A'..='Z' | 'a'..='z' | '0'..='9' | '-') {
- return false;
- }
- }
- }
-
- true
- }
-
#[cfg(test)]
mod tests {
+ use super::super::hostname::tests::example_domain_parts;
use super::{lex_email_address, validate_local_part};
- use crate::lexing::email_address::validate_hostname;

fn example_local_parts() -> impl Iterator<Item = Vec<char>> {
[
@@ -162,31 +129,6 @@ mod tests {
.map(|s| s.chars().collect())
}

- fn example_domain_parts() -> impl Iterator<Item = Vec<char>> {
- [
- r#"example.com"#,
- r#"example.com"#,
- r#"example.com"#,
- r#"and.subdomains.example.com"#,
- r#"example.com"#,
- r#"example.com"#,
- r#"example"#,
- r#"s.example"#,
- r#"example.org"#,
- r#"example.org"#,
- r#"example.org"#,
- r#"strange.example.com"#,
- r#"example.org"#,
- r#"example.org"# /* The existing parser intentionally doesn't support IP addresses
- * It simply isn't worth the effort at the moment.
- * r#"[123.123.123.123]"#,
- * r#"[IPv6:2001:0db8:85a3:0000:0000:8a2e:0370:7334]"#,
- * r#"[IPv6:2001:0db8:85a3:0000:0000:8a2e:0370:7334]"#, */
- ]
- .into_iter()
- .map(|s| s.chars().collect())
- }
-
#[test]
fn example_local_parts_pass_validation() {
for local in example_local_parts() {
@@ -195,14 +137,6 @@ mod tests {
}
}

- #[test]
- fn example_domain_parts_pass_validation() {
- for domain in example_domain_parts() {
- dbg!(domain.iter().collect::<String>());
- assert!(validate_hostname(&domain));
- }
- }
-
#[test]
fn test_many_example_email_addresses() {
for local in example_local_parts() {
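Taken together, the changes in this file drop the hand-rolled validate_hostname and the shrink-until-valid loop, and delegate the domain to the new shared lex_hostname helper, with `?` turning a failed hostname read into an early None. A hypothetical helper (not part of the commit, assuming the same `use super::hostname::lex_hostname` import shown above) illustrating that delegation pattern in isolation:

// Hypothetical, for illustration only: everything after the '@' is handed to
// lex_hostname; if no hostname can be read there, `?` propagates None so the
// caller can decline the candidate email address.
fn domain_end(source: &[char], at_loc: usize) -> Option<usize> {
    let domain_part_len = lex_hostname(&source[at_loc + 1..])?;
    Some(at_loc + 1 + domain_part_len)
}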
54 changes: 54 additions & 0 deletions harper-core/src/lexing/hostname.rs
@@ -0,0 +1,54 @@
pub fn lex_hostname(source: &[char]) -> Option<usize> {
    let mut passed_chars = 0;

    for label in source.split(|c| *c == '.') {
        for c in label {
            passed_chars += 1;
            if !matches!(c, 'A'..='Z' | 'a'..='z' | '0'..='9' | '-') {
                return Some(passed_chars);
            }
        }

        passed_chars += 1;
    }

    if passed_chars == 0 {
        None
    } else {
        Some(passed_chars - 1)
    }
}

#[cfg(test)]
pub mod tests {
    use super::lex_hostname;

    pub fn example_domain_parts() -> impl Iterator<Item = Vec<char>> {
        [
            r#"example.com"#,
            r#"example.com"#,
            r#"example.com"#,
            r#"and.subdomains.example.com"#,
            r#"example.com"#,
            r#"example.com"#,
            r#"example"#,
            r#"s.example"#,
            r#"example.org"#,
            r#"example.org"#,
            r#"example.org"#,
            r#"strange.example.com"#,
            r#"example.org"#,
            r#"example.org"#
        ]
        .into_iter()
        .map(|s| s.chars().collect())
    }

    #[test]
    fn can_parse_example_hostnames() {
        for domain in example_domain_parts() {
            dbg!(domain.iter().collect::<String>());
            assert_eq!(lex_hostname(&domain), Some(domain.len()));
        }
    }
}
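A note on the counting in lex_hostname: passed_chars is bumped once per character and once after each label (the slot a separating dot would occupy), so on a fully valid hostname the trailing `- 1` cancels the dot the final label does not have and the result equals the hostname's length, which is what can_parse_example_hostnames asserts. A small worked check in the same style, offered as a sketch rather than part of the commit, assuming it sits inside the tests module above:

#[test]
fn dot_accounting_sketch() {
    // "a.bc": 1 ('a') + 1 (label end) + 2 ('b', 'c') + 1 (label end) = 5,
    // and the trailing `- 1` yields 4 == "a.bc".len().
    let host: Vec<char> = "a.bc".chars().collect();
    assert_eq!(lex_hostname(&host), Some(4));
}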
5 changes: 5 additions & 0 deletions harper-core/src/lexing/mod.rs
@@ -1,4 +1,8 @@
mod email_address;
+ mod hostname;
+ mod url;
+
+ use url::lex_url;

use self::email_address::lex_email_address;
use crate::token::{Punctuation, Quote, TokenKind};
@@ -17,6 +21,7 @@ pub fn lex_token(source: &[char]) -> Option<FoundToken> {
lex_spaces,
lex_newlines,
lex_number,
+ lex_url,
lex_email_address,
lex_word
];
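Only part of lex_token's body is shown here, but the list makes the dispatch clear: each lexer is tried in order and the first FoundToken wins, so placing lex_url ahead of lex_email_address and lex_word is what lets a URL come out as one token instead of a run of words and punctuation. A rough sketch of that first-match shape, assuming each entry is a plain fn(&[char]) -> Option<FoundToken> and omitting any list entries the hunk cuts off; this is not the verbatim committed body:

// First-match dispatch sketch; signatures assumed, earlier entries omitted.
pub fn lex_token_sketch(source: &[char]) -> Option<FoundToken> {
    let lexers: &[fn(&[char]) -> Option<FoundToken>] = &[
        lex_spaces,
        lex_newlines,
        lex_number,
        lex_url, // runs before the email and word lexers
        lex_email_address,
        lex_word,
    ];

    // The first lexer that recognizes a prefix of `source` decides the token.
    lexers.iter().find_map(|lex| lex(source))
}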
(The diffs for the remaining changed files, including harper-core/src/lexing/url.rs, are not rendered here.)
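Since url.rs is not rendered, the following is purely an illustrative sketch of a URL lexer in this style: an assumption about its shape, not the committed code. It accepts an alphabetic scheme followed by "://", hands the host to the new lex_hostname helper (assumed to be in scope), then takes the rest of the path up to the first whitespace character, and reports how many characters the candidate URL spans. The real lex_url presumably returns a FoundToken with a URL token kind rather than a bare length.

// Illustrative sketch only; NOT the committed url.rs.
fn lex_url_sketch(source: &[char]) -> Option<usize> {
    // Scheme: one or more ASCII letters, then "://".
    let scheme_len = source
        .iter()
        .take_while(|c| c.is_ascii_alphabetic())
        .count();
    if scheme_len == 0 || !source[scheme_len..].starts_with(&[':', '/', '/']) {
        return None;
    }

    // Host: delegate to the shared hostname lexer.
    let host_start = scheme_len + 3;
    let host_len = lex_hostname(&source[host_start..])?;

    // Path: greedily take non-whitespace characters after the host.
    let path_len = source[host_start + host_len..]
        .iter()
        .take_while(|c| !c.is_whitespace())
        .count();

    Some(host_start + host_len + path_len)
}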
