diff --git a/crates/weaver_forge/src/extensions/code.rs b/crates/weaver_forge/src/extensions/code.rs
index 1c339680..420eb421 100644
--- a/crates/weaver_forge/src/extensions/code.rs
+++ b/crates/weaver_forge/src/extensions/code.rs
@@ -184,14 +184,17 @@ pub(crate) fn comment(
             } else {
                 &subsequent_indent
             };
-            // TODO - if render format is html, we need a new word-separator that understands markdown.
+            // We use textwrap library with heavily customized algorithms to prevent bad
+            // scenarios around breaking up markdown.
             let wrap_options =
                 textwrap::Options::new(comment_format.line_length.unwrap_or(usize::MAX))
                     .initial_indent(initial_indent)
                     .subsequent_indent(&subsequent_indent)
                     .wrap_algorithm(textwrap::WrapAlgorithm::FirstFit)
                     .break_words(false)
-                    .word_separator(textwrap::WordSeparator::AsciiSpace);
+                    .word_separator(textwrap::WordSeparator::Custom(
+                        find_words_dont_split_markdown,
+                    ));
 
             // Wrap the comment as configured.
             comment = textwrap::fill(&comment, wrap_options);
@@ -282,13 +285,149 @@ pub(crate) fn map_text(
     }
 }
 
+/// States of the scanner that keeps markdown links together while a line is
+/// split into words. Two link shapes are targeted:
+///
+/// - `[Title]: link`
+/// - `[Title](link)`
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum MarkdownLinkParserState {
+    /// Not inside a link; regular whitespace splitting applies.
+    Inactive,
+    /// Inside `[...]`, waiting for the closing `]`.
+    Title,
+    /// Right after `]`; a `:` here continues a `[Title]: link` form.
+    Colon,
+    /// After `]:`, skipping the spaces in front of the link target.
+    Space,
+    /// Inside the link target that follows `[Title]: `.
+    Link,
+}
+
+/// Splits `line` into words on ASCII spaces without splitting markdown links.
+///
+/// Each yielded word is built from a slice of `line` that includes the spaces
+/// following the word. Text between `[` and `]` is never split, and neither is
+/// a `: link` that directly follows the `]`, so `[Title]: link` and
+/// `[Title](link)` (with a space-free target) each stay in one word, whether
+/// they start the line or follow other words.
+///
+/// A `[` that is never closed keeps the rest of the line in a single word.
+fn find_words_dont_split_markdown<'a>(
+    line: &'a str,
+) -> Box<dyn Iterator<Item = textwrap::core::Word<'a>> + 'a> {
+    // Byte offset where the word currently being scanned starts.
+    let mut start = 0;
+    // Whether the last character handled by the plain splitter was a space.
+    let mut in_whitespace = false;
+    let mut markdown_parser_state = MarkdownLinkParserState::Inactive;
+    let mut char_indices = line.char_indices();
+    Box::new(std::iter::from_fn(move || {
+        // Each call resumes the scan where the previous call stopped and
+        // returns as soon as one complete word has been found.
+        for (idx, ch) in char_indices.by_ref() {
+            match markdown_parser_state {
+                MarkdownLinkParserState::Title => {
+                    // Track `[{name}]`: nothing inside the brackets splits.
+                    if ch == ']' {
+                        markdown_parser_state = MarkdownLinkParserState::Colon;
+                    }
+                    continue;
+                }
+                MarkdownLinkParserState::Colon => {
+                    // Track `: {url}` portion of markdown link.
+                    if ch == ':' {
+                        markdown_parser_state = MarkdownLinkParserState::Space;
+                        continue;
+                    }
+                    // Anything else, including the `(` of `[title](link)`,
+                    // falls back to the regular whitespace handling below.
+                    markdown_parser_state = MarkdownLinkParserState::Inactive;
+                }
+                MarkdownLinkParserState::Space => {
+                    if ch != ' ' {
+                        markdown_parser_state = MarkdownLinkParserState::Link;
+                        in_whitespace = false;
+                    }
+                    continue;
+                }
+                MarkdownLinkParserState::Inactive | MarkdownLinkParserState::Link => {
+                    // We can just let the regular whitespace control work.
+                }
+            }
+
+            if in_whitespace && ch != ' ' {
+                // First non-space character after a run of spaces: the word
+                // before it, trailing spaces included, is complete.
+                let word = textwrap::core::Word::from(&line[start..idx]);
+                start = idx;
+                in_whitespace = false;
+                // `ch` already belongs to the next word and is not visited
+                // again, so a `[` here must open the title right away;
+                // otherwise a link that follows other words would be split.
+                markdown_parser_state = if ch == '[' {
+                    MarkdownLinkParserState::Title
+                } else {
+                    MarkdownLinkParserState::Inactive
+                };
+                return Some(word);
+            }
+            if markdown_parser_state == MarkdownLinkParserState::Inactive && ch == '[' {
+                markdown_parser_state = MarkdownLinkParserState::Title;
+            }
+            in_whitespace = ch == ' ';
+        }
+        // Return last word.
+        if start < line.len() {
+            let word = textwrap::core::Word::from(&line[start..]);
+            // Mark the line as consumed so that later calls yield `None`.
+            start = line.len();
+            return Some(word);
+        }
+        None
+    }))
+}
+
 #[cfg(test)]
 mod tests {
+    use itertools::Itertools;
+
     use super::*;
     use crate::config::{CommentFormat, IndentType};
     use crate::extensions::code;
     use crate::formats::html::HtmlRenderOptions;
 
+    // Simplify test cases by forcing word split into a vector.
+    fn find_words_into_vec(line: &str) -> Vec<String> {
+        find_words_dont_split_markdown(line)
+            .map(|w| w.to_string())
+            .collect_vec()
+    }
+
+    #[test]
+    fn test_find_words_dont_split_markdown() {
+        assert_eq!(
+            find_words_into_vec("test the words"),
+            vec!("test", "the", "words")
+        );
+        assert_eq!(find_words_into_vec("[test] link"), vec!("[test]", "link"));
+        assert_eq!(find_words_into_vec("[test]: link"), vec!("[test]: link"));
+        assert_eq!(find_words_into_vec("[test](link)"), vec!("[test](link)"));
+        assert_eq!(
+            find_words_into_vec("[test]: link-with-hyphen"),
+            vec!("[test]: link-with-hyphen")
+        );
+        // Links that follow other words must stay together as well.
+        assert_eq!(
+            find_words_into_vec("see [test]: link"),
+            vec!("see", "[test]: link")
+        );
+        assert_eq!(
+            find_words_into_vec("see [my test](link) now"),
+            vec!("see", "[my test](link)", "now")
+        );
+    }
+
     #[test]
     fn test_comment() -> Result<(), Error> {
         let mut env = Environment::new();