From 224fe75a76f5406b7bc671fedb862c5b9bc49876 Mon Sep 17 00:00:00 2001 From: Brent Westbrook <36778786+ntBre@users.noreply.github.com> Date: Thu, 28 Nov 2024 13:29:23 -0500 Subject: [PATCH] [`ruff`] Implement `unnecessary-regular-expression` (`RUF055`) (#14659) Co-authored-by: Micha Reiser Co-authored-by: Alex Waygood Co-authored-by: Simon Brugman --- .../resources/test/fixtures/ruff/RUF055.py | 76 ++++++ .../src/checkers/ast/analyze/expression.rs | 3 + crates/ruff_linter/src/codes.rs | 1 + crates/ruff_linter/src/rules/ruff/mod.rs | 1 + .../ruff_linter/src/rules/ruff/rules/mod.rs | 2 + .../rules/unnecessary_regular_expression.rs | 250 ++++++++++++++++++ ...uff__tests__preview__RUF055_RUF055.py.snap | 129 +++++++++ ruff.schema.json | 2 + 8 files changed, 464 insertions(+) create mode 100644 crates/ruff_linter/resources/test/fixtures/ruff/RUF055.py create mode 100644 crates/ruff_linter/src/rules/ruff/rules/unnecessary_regular_expression.rs create mode 100644 crates/ruff_linter/src/rules/ruff/snapshots/ruff_linter__rules__ruff__tests__preview__RUF055_RUF055.py.snap diff --git a/crates/ruff_linter/resources/test/fixtures/ruff/RUF055.py b/crates/ruff_linter/resources/test/fixtures/ruff/RUF055.py new file mode 100644 index 0000000000000..7dd067d872e63 --- /dev/null +++ b/crates/ruff_linter/resources/test/fixtures/ruff/RUF055.py @@ -0,0 +1,76 @@ +import re + +s = "str" + +# this should be replaced with s.replace("abc", "") +re.sub("abc", "", s) + + +# this example, adapted from https://docs.python.org/3/library/re.html#re.sub, +# should *not* be replaced because repl is a function, not a string +def dashrepl(matchobj): + if matchobj.group(0) == "-": + return " " + else: + return "-" + + +re.sub("-", dashrepl, "pro----gram-files") + +# this one should be replaced with s.startswith("abc") because the Match is +# used in an if context for its truth value +if re.match("abc", s): + pass +if m := re.match("abc", s): # this should *not* be replaced + pass +re.match("abc", s) # this should not be replaced because match returns a Match + +# this should be replaced with "abc" in s +if re.search("abc", s): + pass +re.search("abc", s) # this should not be replaced + +# this should be replaced with "abc" == s +if re.fullmatch("abc", s): + pass +re.fullmatch("abc", s) # this should not be replaced + +# this should be replaced with s.split("abc") +re.split("abc", s) + +# these currently should not be modified because the patterns contain regex +# metacharacters +re.sub("ab[c]", "", s) +re.match("ab[c]", s) +re.search("ab[c]", s) +re.fullmatch("ab[c]", s) +re.split("ab[c]", s) + +# test that all of the metacharacters prevent the rule from triggering, also +# use raw strings in line with RUF039 +re.sub(r"abc.", "", s) +re.sub(r"^abc", "", s) +re.sub(r"abc$", "", s) +re.sub(r"abc*", "", s) +re.sub(r"abc+", "", s) +re.sub(r"abc?", "", s) +re.sub(r"abc{2,3}", "", s) +re.sub(r"abc\n", "", s) # this one could be fixed but is not currently +re.sub(r"abc|def", "", s) +re.sub(r"(a)bc", "", s) + +# and these should not be modified because they have extra arguments +re.sub("abc", "", s, flags=re.A) +re.match("abc", s, flags=re.I) +re.search("abc", s, flags=re.L) +re.fullmatch("abc", s, flags=re.M) +re.split("abc", s, maxsplit=2) + +# this should trigger an unsafe fix because of the presence of comments +re.sub( + # pattern + "abc", + # repl + "", + s, # string +) diff --git a/crates/ruff_linter/src/checkers/ast/analyze/expression.rs b/crates/ruff_linter/src/checkers/ast/analyze/expression.rs index 7d7e5fcc7028e..a0003462af188 100644 --- a/crates/ruff_linter/src/checkers/ast/analyze/expression.rs +++ b/crates/ruff_linter/src/checkers/ast/analyze/expression.rs @@ -1081,6 +1081,9 @@ pub(crate) fn expression(expr: &Expr, checker: &mut Checker) { if checker.enabled(Rule::AirflowDagNoScheduleArgument) { airflow::rules::dag_no_schedule_argument(checker, expr); } + if checker.enabled(Rule::UnnecessaryRegularExpression) { + ruff::rules::unnecessary_regular_expression(checker, call); + } } Expr::Dict(dict) => { if checker.any_enabled(&[ diff --git a/crates/ruff_linter/src/codes.rs b/crates/ruff_linter/src/codes.rs index a52c57b618a54..dc28b55489446 100644 --- a/crates/ruff_linter/src/codes.rs +++ b/crates/ruff_linter/src/codes.rs @@ -984,6 +984,7 @@ pub fn code_to_rule(linter: Linter, code: &str) -> Option<(RuleGroup, Rule)> { (Ruff, "040") => (RuleGroup::Preview, rules::ruff::rules::InvalidAssertMessageLiteralArgument), (Ruff, "041") => (RuleGroup::Preview, rules::ruff::rules::UnnecessaryNestedLiteral), (Ruff, "048") => (RuleGroup::Preview, rules::ruff::rules::MapIntVersionParsing), + (Ruff, "055") => (RuleGroup::Preview, rules::ruff::rules::UnnecessaryRegularExpression), (Ruff, "100") => (RuleGroup::Stable, rules::ruff::rules::UnusedNOQA), (Ruff, "101") => (RuleGroup::Stable, rules::ruff::rules::RedirectedNOQA), diff --git a/crates/ruff_linter/src/rules/ruff/mod.rs b/crates/ruff_linter/src/rules/ruff/mod.rs index fb5bda947e28b..870cd5f3eb53a 100644 --- a/crates/ruff_linter/src/rules/ruff/mod.rs +++ b/crates/ruff_linter/src/rules/ruff/mod.rs @@ -409,6 +409,7 @@ mod tests { #[test_case(Rule::MapIntVersionParsing, Path::new("RUF048_1.py"))] #[test_case(Rule::UnrawRePattern, Path::new("RUF039.py"))] #[test_case(Rule::UnrawRePattern, Path::new("RUF039_concat.py"))] + #[test_case(Rule::UnnecessaryRegularExpression, Path::new("RUF055.py"))] fn preview_rules(rule_code: Rule, path: &Path) -> Result<()> { let snapshot = format!( "preview__{}_{}", diff --git a/crates/ruff_linter/src/rules/ruff/rules/mod.rs b/crates/ruff_linter/src/rules/ruff/rules/mod.rs index 33c98e4e6a708..a53994e3c928e 100644 --- a/crates/ruff_linter/src/rules/ruff/rules/mod.rs +++ b/crates/ruff_linter/src/rules/ruff/rules/mod.rs @@ -33,6 +33,7 @@ pub(crate) use test_rules::*; pub(crate) use unnecessary_iterable_allocation_for_first_element::*; pub(crate) use unnecessary_key_check::*; pub(crate) use unnecessary_nested_literal::*; +pub(crate) use unnecessary_regular_expression::*; pub(crate) use unraw_re_pattern::*; pub(crate) use unsafe_markup_use::*; pub(crate) use unused_async::*; @@ -79,6 +80,7 @@ pub(crate) mod test_rules; mod unnecessary_iterable_allocation_for_first_element; mod unnecessary_key_check; mod unnecessary_nested_literal; +mod unnecessary_regular_expression; mod unraw_re_pattern; mod unsafe_markup_use; mod unused_async; diff --git a/crates/ruff_linter/src/rules/ruff/rules/unnecessary_regular_expression.rs b/crates/ruff_linter/src/rules/ruff/rules/unnecessary_regular_expression.rs new file mode 100644 index 0000000000000..94769dab8af1f --- /dev/null +++ b/crates/ruff_linter/src/rules/ruff/rules/unnecessary_regular_expression.rs @@ -0,0 +1,250 @@ +use ruff_diagnostics::{AlwaysFixableViolation, Applicability, Diagnostic, Edit, Fix}; +use ruff_macros::{derive_message_formats, ViolationMetadata}; +use ruff_python_ast::{ + Arguments, CmpOp, Expr, ExprAttribute, ExprCall, ExprCompare, ExprContext, Identifier, +}; +use ruff_python_semantic::{Modules, SemanticModel}; +use ruff_text_size::TextRange; + +use crate::checkers::ast::Checker; + +/// ## What it does +/// +/// Checks for uses of the `re` module that can be replaced with builtin `str` methods. +/// +/// ## Why is this bad? +/// +/// Performing checks on strings directly can make the code simpler, may require +/// less escaping, and will often be faster. +/// +/// ## Example +/// +/// ```python +/// re.sub("abc", "", s) +/// ``` +/// +/// Use instead: +/// +/// ```python +/// s.replace("abc", "") +/// ``` +/// +/// ## Details +/// +/// The rule reports the following calls when the first argument to the call is +/// a plain string literal, and no additional flags are passed: +/// +/// - `re.sub` +/// - `re.match` +/// - `re.search` +/// - `re.fullmatch` +/// - `re.split` +/// +/// For `re.sub`, the `repl` (replacement) argument must also be a string literal, +/// not a function. For `re.match`, `re.search`, and `re.fullmatch`, the return +/// value must also be used only for its truth value. +/// +/// ## Fix safety +/// +/// This rule's fix is marked as unsafe if the affected expression contains comments. Otherwise, +/// the fix can be applied safely. +/// +/// ## References +/// - [Python Regular Expression HOWTO: Common Problems - Use String Methods](https://docs.python.org/3/howto/regex.html#use-string-methods) +#[derive(ViolationMetadata)] +pub(crate) struct UnnecessaryRegularExpression { + replacement: String, +} + +impl AlwaysFixableViolation for UnnecessaryRegularExpression { + #[derive_message_formats] + fn message(&self) -> String { + "Plain string pattern passed to `re` function".to_string() + } + + fn fix_title(&self) -> String { + format!("Replace with `{}`", self.replacement) + } +} + +/// RUF055 +pub(crate) fn unnecessary_regular_expression(checker: &mut Checker, call: &ExprCall) { + // adapted from unraw_re_pattern + let semantic = checker.semantic(); + + if !semantic.seen_module(Modules::RE) { + return; + } + + let Some(qualified_name) = semantic.resolve_qualified_name(&call.func) else { + return; + }; + + let ["re", func] = qualified_name.segments() else { + return; + }; + + // skip calls with more than `pattern` and `string` arguments (and `repl` + // for `sub`) + let Some(re_func) = ReFunc::from_call_expr(semantic, call, func) else { + return; + }; + + // For now, restrict this rule to string literals + let Some(string_lit) = re_func.pattern.as_string_literal_expr() else { + return; + }; + + // For now, reject any regex metacharacters. Compare to the complete list + // from https://docs.python.org/3/howto/regex.html#matching-characters + let has_metacharacters = string_lit + .value + .to_str() + .contains(['.', '^', '$', '*', '+', '?', '{', '[', '\\', '|', '(']); + + if has_metacharacters { + return; + } + + // Here we know the pattern is a string literal with no metacharacters, so + // we can proceed with the str method replacement + let new_expr = re_func.replacement(); + + let repl = checker.generator().expr(&new_expr); + let diagnostic = Diagnostic::new( + UnnecessaryRegularExpression { + replacement: repl.clone(), + }, + call.range, + ); + + let fix = Fix::applicable_edit( + Edit::range_replacement(repl, call.range), + if checker + .comment_ranges() + .has_comments(call, checker.source()) + { + Applicability::Unsafe + } else { + Applicability::Safe + }, + ); + + checker.diagnostics.push(diagnostic.with_fix(fix)); +} + +/// The `re` functions supported by this rule. +#[derive(Debug)] +enum ReFuncKind<'a> { + Sub { repl: &'a Expr }, + Match, + Search, + Fullmatch, + Split, +} + +#[derive(Debug)] +struct ReFunc<'a> { + kind: ReFuncKind<'a>, + pattern: &'a Expr, + string: &'a Expr, +} + +impl<'a> ReFunc<'a> { + fn from_call_expr( + semantic: &SemanticModel, + call: &'a ExprCall, + func_name: &str, + ) -> Option { + // the proposed fixes for match, search, and fullmatch rely on the + // return value only being used for its truth value + let in_if_context = semantic.in_boolean_test(); + + match (func_name, call.arguments.len()) { + // `split` is the safest of these to fix, as long as metacharacters + // have already been filtered out from the `pattern` + ("split", 2) => Some(ReFunc { + kind: ReFuncKind::Split, + pattern: call.arguments.find_argument("pattern", 0)?, + string: call.arguments.find_argument("string", 1)?, + }), + // `sub` is only safe to fix if `repl` is a string. `re.sub` also + // allows it to be a function, which will *not* work in the str + // version + ("sub", 3) => { + let repl = call.arguments.find_argument("repl", 1)?; + if !repl.is_string_literal_expr() { + return None; + } + Some(ReFunc { + kind: ReFuncKind::Sub { repl }, + pattern: call.arguments.find_argument("pattern", 0)?, + string: call.arguments.find_argument("string", 2)?, + }) + } + ("match", 2) if in_if_context => Some(ReFunc { + kind: ReFuncKind::Match, + pattern: call.arguments.find_argument("pattern", 0)?, + string: call.arguments.find_argument("string", 1)?, + }), + ("search", 2) if in_if_context => Some(ReFunc { + kind: ReFuncKind::Search, + pattern: call.arguments.find_argument("pattern", 0)?, + string: call.arguments.find_argument("string", 1)?, + }), + ("fullmatch", 2) if in_if_context => Some(ReFunc { + kind: ReFuncKind::Fullmatch, + pattern: call.arguments.find_argument("pattern", 0)?, + string: call.arguments.find_argument("string", 1)?, + }), + _ => None, + } + } + + fn replacement(&self) -> Expr { + match self.kind { + // string.replace(pattern, repl) + ReFuncKind::Sub { repl } => { + self.method_expr("replace", vec![self.pattern.clone(), repl.clone()]) + } + // string.startswith(pattern) + ReFuncKind::Match => self.method_expr("startswith", vec![self.pattern.clone()]), + // pattern in string + ReFuncKind::Search => self.compare_expr(CmpOp::In), + // string == pattern + ReFuncKind::Fullmatch => self.compare_expr(CmpOp::Eq), + // string.split(pattern) + ReFuncKind::Split => self.method_expr("split", vec![self.pattern.clone()]), + } + } + + /// Return a new compare expr of the form `self.pattern op self.string` + fn compare_expr(&self, op: CmpOp) -> Expr { + Expr::Compare(ExprCompare { + left: Box::new(self.pattern.clone()), + ops: Box::new([op]), + comparators: Box::new([self.string.clone()]), + range: TextRange::default(), + }) + } + + /// Return a new method call expression on `self.string` with `args` like + /// `self.string.method(args...)` + fn method_expr(&self, method: &str, args: Vec) -> Expr { + let method = Expr::Attribute(ExprAttribute { + value: Box::new(self.string.clone()), + attr: Identifier::new(method, TextRange::default()), + ctx: ExprContext::Load, + range: TextRange::default(), + }); + Expr::Call(ExprCall { + func: Box::new(method), + arguments: Arguments { + args: args.into_boxed_slice(), + keywords: Box::new([]), + range: TextRange::default(), + }, + range: TextRange::default(), + }) + } +} diff --git a/crates/ruff_linter/src/rules/ruff/snapshots/ruff_linter__rules__ruff__tests__preview__RUF055_RUF055.py.snap b/crates/ruff_linter/src/rules/ruff/snapshots/ruff_linter__rules__ruff__tests__preview__RUF055_RUF055.py.snap new file mode 100644 index 0000000000000..9624f9143627f --- /dev/null +++ b/crates/ruff_linter/src/rules/ruff/snapshots/ruff_linter__rules__ruff__tests__preview__RUF055_RUF055.py.snap @@ -0,0 +1,129 @@ +--- +source: crates/ruff_linter/src/rules/ruff/mod.rs +snapshot_kind: text +--- +RUF055.py:6:1: RUF055 [*] Plain string pattern passed to `re` function + | +5 | # this should be replaced with s.replace("abc", "") +6 | re.sub("abc", "", s) + | ^^^^^^^^^^^^^^^^^^^^ RUF055 + | + = help: Replace with `s.replace("abc", "")` + +ℹ Safe fix +3 3 | s = "str" +4 4 | +5 5 | # this should be replaced with s.replace("abc", "") +6 |-re.sub("abc", "", s) + 6 |+s.replace("abc", "") +7 7 | +8 8 | +9 9 | # this example, adapted from https://docs.python.org/3/library/re.html#re.sub, + +RUF055.py:22:4: RUF055 [*] Plain string pattern passed to `re` function + | +20 | # this one should be replaced with s.startswith("abc") because the Match is +21 | # used in an if context for its truth value +22 | if re.match("abc", s): + | ^^^^^^^^^^^^^^^^^^ RUF055 +23 | pass +24 | if m := re.match("abc", s): # this should *not* be replaced + | + = help: Replace with `s.startswith("abc")` + +ℹ Safe fix +19 19 | +20 20 | # this one should be replaced with s.startswith("abc") because the Match is +21 21 | # used in an if context for its truth value +22 |-if re.match("abc", s): + 22 |+if s.startswith("abc"): +23 23 | pass +24 24 | if m := re.match("abc", s): # this should *not* be replaced +25 25 | pass + +RUF055.py:29:4: RUF055 [*] Plain string pattern passed to `re` function + | +28 | # this should be replaced with "abc" in s +29 | if re.search("abc", s): + | ^^^^^^^^^^^^^^^^^^^ RUF055 +30 | pass +31 | re.search("abc", s) # this should not be replaced + | + = help: Replace with `"abc" in s` + +ℹ Safe fix +26 26 | re.match("abc", s) # this should not be replaced because match returns a Match +27 27 | +28 28 | # this should be replaced with "abc" in s +29 |-if re.search("abc", s): + 29 |+if "abc" in s: +30 30 | pass +31 31 | re.search("abc", s) # this should not be replaced +32 32 | + +RUF055.py:34:4: RUF055 [*] Plain string pattern passed to `re` function + | +33 | # this should be replaced with "abc" == s +34 | if re.fullmatch("abc", s): + | ^^^^^^^^^^^^^^^^^^^^^^ RUF055 +35 | pass +36 | re.fullmatch("abc", s) # this should not be replaced + | + = help: Replace with `"abc" == s` + +ℹ Safe fix +31 31 | re.search("abc", s) # this should not be replaced +32 32 | +33 33 | # this should be replaced with "abc" == s +34 |-if re.fullmatch("abc", s): + 34 |+if "abc" == s: +35 35 | pass +36 36 | re.fullmatch("abc", s) # this should not be replaced +37 37 | + +RUF055.py:39:1: RUF055 [*] Plain string pattern passed to `re` function + | +38 | # this should be replaced with s.split("abc") +39 | re.split("abc", s) + | ^^^^^^^^^^^^^^^^^^ RUF055 +40 | +41 | # these currently should not be modified because the patterns contain regex + | + = help: Replace with `s.split("abc")` + +ℹ Safe fix +36 36 | re.fullmatch("abc", s) # this should not be replaced +37 37 | +38 38 | # this should be replaced with s.split("abc") +39 |-re.split("abc", s) + 39 |+s.split("abc") +40 40 | +41 41 | # these currently should not be modified because the patterns contain regex +42 42 | # metacharacters + +RUF055.py:70:1: RUF055 [*] Plain string pattern passed to `re` function + | +69 | # this should trigger an unsafe fix because of the presence of comments +70 | / re.sub( +71 | | # pattern +72 | | "abc", +73 | | # repl +74 | | "", +75 | | s, # string +76 | | ) + | |_^ RUF055 + | + = help: Replace with `s.replace("abc", "")` + +ℹ Unsafe fix +67 67 | re.split("abc", s, maxsplit=2) +68 68 | +69 69 | # this should trigger an unsafe fix because of the presence of comments +70 |-re.sub( +71 |- # pattern +72 |- "abc", +73 |- # repl +74 |- "", +75 |- s, # string +76 |-) + 70 |+s.replace("abc", "") diff --git a/ruff.schema.json b/ruff.schema.json index 8b4c92ef4b94d..40cdf7e96d4a9 100644 --- a/ruff.schema.json +++ b/ruff.schema.json @@ -3843,6 +3843,8 @@ "RUF040", "RUF041", "RUF048", + "RUF05", + "RUF055", "RUF1", "RUF10", "RUF100",