Skip to content

Commit

Permalink
[refurb] implement hardcoded-string-charset (FURB156) (#13530)
Browse files Browse the repository at this point in the history
Co-authored-by: Micha Reiser <[email protected]>
  • Loading branch information
alex-700 and MichaReiser authored Oct 7, 2024
1 parent 38d872e commit 73aa6ea
Show file tree
Hide file tree
Showing 8 changed files with 722 additions and 1 deletion.
53 changes: 53 additions & 0 deletions crates/ruff_linter/resources/test/fixtures/refurb/FURB156.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Errors

_ = "0123456789"
_ = "01234567"
_ = "0123456789abcdefABCDEF"
_ = "abcdefghijklmnopqrstuvwxyz"
_ = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
_ = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
_ = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""
_ = " \t\n\r\v\f"

_ = "" in "1234567890"
_ = "" in "12345670"
_ = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'
_ = (
'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&'
"'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c"
)
_ = id("0123"
"4567"
"89")
_ = "" in ("123"
"456"
"789"
"0")

_ = "" in ( # comment
"123"
"456"
"789"
"0")


_ = "" in (
"123"
"456" # inline comment
"789"
"0")

_ = (
"0123456789"
).capitalize()

_ = (
"0123456789"
# with comment
).capitalize()

# Ok

_ = "1234567890"
_ = "1234"
_ = "" in "1234"
8 changes: 7 additions & 1 deletion crates/ruff_linter/src/checkers/ast/analyze/expression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1355,6 +1355,9 @@ pub(crate) fn expression(expr: &Expr, checker: &mut Checker) {
if checker.enabled(Rule::SingleItemMembershipTest) {
refurb::rules::single_item_membership_test(checker, expr, left, ops, comparators);
}
if checker.enabled(Rule::HardcodedStringCharset) {
refurb::rules::hardcoded_string_charset_comparison(checker, compare);
}
}
Expr::NumberLiteral(number_literal @ ast::ExprNumberLiteral { .. }) => {
if checker.source_type.is_stub() && checker.enabled(Rule::NumericLiteralTooLong) {
Expand All @@ -1364,7 +1367,7 @@ pub(crate) fn expression(expr: &Expr, checker: &mut Checker) {
refurb::rules::math_constant(checker, number_literal);
}
}
Expr::StringLiteral(ast::ExprStringLiteral { value, range: _ }) => {
Expr::StringLiteral(string_like @ ast::ExprStringLiteral { value, range: _ }) => {
if checker.enabled(Rule::UnicodeKindPrefix) {
for string_part in value {
pyupgrade::rules::unicode_kind_prefix(checker, string_part);
Expand All @@ -1375,6 +1378,9 @@ pub(crate) fn expression(expr: &Expr, checker: &mut Checker) {
ruff::rules::missing_fstring_syntax(checker, string_literal);
}
}
if checker.enabled(Rule::HardcodedStringCharset) {
refurb::rules::hardcoded_string_charset_literal(checker, string_like);
}
}
Expr::If(
if_exp @ ast::ExprIf {
Expand Down
1 change: 1 addition & 0 deletions crates/ruff_linter/src/codes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1055,6 +1055,7 @@ pub fn code_to_rule(linter: Linter, code: &str) -> Option<(RuleGroup, Rule)> {
(Refurb, "148") => (RuleGroup::Preview, rules::refurb::rules::UnnecessaryEnumerate),
(Refurb, "152") => (RuleGroup::Preview, rules::refurb::rules::MathConstant),
(Refurb, "154") => (RuleGroup::Preview, rules::refurb::rules::RepeatedGlobal),
(Refurb, "156") => (RuleGroup::Preview, rules::refurb::rules::HardcodedStringCharset),
(Refurb, "157") => (RuleGroup::Preview, rules::refurb::rules::VerboseDecimalConstructor),
(Refurb, "161") => (RuleGroup::Stable, rules::refurb::rules::BitCount),
(Refurb, "163") => (RuleGroup::Stable, rules::refurb::rules::RedundantLogBase),
Expand Down
1 change: 1 addition & 0 deletions crates/ruff_linter/src/rules/refurb/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ mod tests {
#[test_case(Rule::UnnecessaryEnumerate, Path::new("FURB148.py"))]
#[test_case(Rule::MathConstant, Path::new("FURB152.py"))]
#[test_case(Rule::RepeatedGlobal, Path::new("FURB154.py"))]
#[test_case(Rule::HardcodedStringCharset, Path::new("FURB156.py"))]
#[test_case(Rule::VerboseDecimalConstructor, Path::new("FURB157.py"))]
#[test_case(Rule::UnnecessaryFromFloat, Path::new("FURB164.py"))]
#[test_case(Rule::PrintEmptyString, Path::new("FURB105.py"))]
Expand Down
171 changes: 171 additions & 0 deletions crates/ruff_linter/src/rules/refurb/rules/hardcoded_string_charset.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
use crate::checkers::ast::Checker;
use crate::importer::ImportRequest;
use ruff_diagnostics::{AlwaysFixableViolation, Diagnostic, Edit, Fix};
use ruff_macros::{derive_message_formats, violation};
use ruff_python_ast::{CmpOp, Expr, ExprCompare, ExprStringLiteral};
use ruff_text_size::TextRange;

/// ## What it does
/// Checks for hardcoded string literals that spell out a character set which is
/// already available as a constant in Python's `string` module.
///
/// ## Why is this bad?
/// Using the named constants from the standard library is more readable and less
/// error-prone than retyping the characters by hand.
///
/// ## Example
/// ```python
/// x = "0123456789"
/// y in "abcdefghijklmnopqrstuvwxyz"
/// ```
///
/// Use instead
/// ```python
/// import string
///
/// x = string.digits
/// y in string.ascii_lowercase
/// ```
///
/// ## References
/// - [Python documentation: String constants](https://docs.python.org/3/library/string.html#string-constants)
#[violation]
pub struct HardcodedStringCharset {
/// Name of the `string` module constant that should replace the literal
/// (interpolated into the fix title as `string.{name}`).
name: &'static str,
}

impl AlwaysFixableViolation for HardcodedStringCharset {
// The message carries no interpolated values; the matched constant is only
// named in the fix title below.
#[derive_message_formats]
fn message(&self) -> String {
format!("Use of hardcoded string charset")
}

// Names the concrete replacement, e.g. `string.digits`.
fn fix_title(&self) -> String {
let HardcodedStringCharset { name } = self;
format!("Replace hardcoded charset with `string.{name}`")
}
}

/// A character-set constant of Python's `string` module, stored in the two forms
/// the rule matches against.
struct NamedCharset {
/// Attribute name of the constant inside the `string` module (e.g. `digits`).
name: &'static str,
/// The constant's value, byte for byte; compared for exact matches.
bytes: &'static [u8],
/// The characters of `bytes` as a bitset; compared when order and repetition
/// of the characters should be ignored.
ascii_char_set: AsciiCharSet,
}

/// A set of ASCII characters, stored as a 128-bit mask in which bit `b` is set
/// exactly when the byte with value `b` belongs to the set.
#[derive(Copy, Clone, Eq, PartialEq)]
struct AsciiCharSet(u128);

impl AsciiCharSet {
    /// Builds the set of characters occurring in `bytes`; the order of the bytes
    /// and any repetition among them do not affect the result.
    ///
    /// Returns `None` as soon as a non-ASCII byte is encountered.
    const fn from_bytes(bytes: &[u8]) -> Option<Self> {
        // TODO: simplify implementation, when const-traits are supported
        // https://github.com/rust-lang/rust-project-goals/issues/106
        //
        // Iterators cannot be used in a `const fn`, so the slice is walked by
        // repeatedly splitting off its first byte.
        let mut mask: u128 = 0;
        let mut remaining = bytes;
        while let [byte, tail @ ..] = remaining {
            if !byte.is_ascii() {
                return None;
            }
            // An ASCII byte is below 128, so the shift always stays inside the mask.
            mask |= 1u128 << *byte;
            remaining = tail;
        }
        Some(Self(mask))
    }
}

impl NamedCharset {
    /// Creates a named charset from the constant's `name` and its value `bytes`,
    /// precomputing the bitset form of `bytes`.
    ///
    /// `bytes` must be pure ASCII; any other input hits `unreachable!()`.
    const fn new(name: &'static str, bytes: &'static [u8]) -> Self {
        // Every known charset consists of ASCII bytes only, so building the bitset
        // cannot fail for them.
        // TODO: replace with `.unwrap()`, when `Option::unwrap` will be stable in `const fn`
        // https://github.com/rust-lang/rust/issues/67441
        let ascii_char_set = match AsciiCharSet::from_bytes(bytes) {
            Some(char_set) => char_set,
            None => unreachable!(),
        };

        Self {
            name,
            bytes,
            ascii_char_set,
        }
    }
}

/// The `string` module constants recognised by this rule, each given as the
/// constant's name and its value.
///
/// No two entries contain the same set of characters, so at most one entry can
/// match a given literal, whether it is compared exactly or as a set.
const KNOWN_NAMED_CHARSETS: [NamedCharset; 9] = [
NamedCharset::new(
"ascii_letters",
b"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ",
),
NamedCharset::new("ascii_lowercase", b"abcdefghijklmnopqrstuvwxyz"),
NamedCharset::new("ascii_uppercase", b"ABCDEFGHIJKLMNOPQRSTUVWXYZ"),
NamedCharset::new("digits", b"0123456789"),
NamedCharset::new("hexdigits", b"0123456789abcdefABCDEF"),
NamedCharset::new("octdigits", b"01234567"),
NamedCharset::new("punctuation", b"!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"),
// digits + ascii_letters + punctuation + whitespace, in that order.
NamedCharset::new(
"printable",
b"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\"\
#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c",
),
NamedCharset::new("whitespace", b" \t\n\r\x0b\x0c"),
];

/// Looks up the known charset made of exactly the characters in `bytes`,
/// ignoring their order and any repetition.
///
/// Returns `None` if `bytes` contains a non-ASCII byte or matches no known charset.
fn check_charset_as_set(bytes: &[u8]) -> Option<&NamedCharset> {
    AsciiCharSet::from_bytes(bytes).and_then(|wanted| {
        KNOWN_NAMED_CHARSETS
            .iter()
            .find(|known| known.ascii_char_set == wanted)
    })
}

fn check_charset_exact(bytes: &[u8]) -> Option<&NamedCharset> {
KNOWN_NAMED_CHARSETS
.iter()
.find(|&charset| charset.bytes == bytes)
}

/// Reports a [`HardcodedStringCharset`] violation for the literal at `range` and
/// attaches a fix that replaces the literal with `string.<charset.name>`, adding
/// `import string` when needed.
///
/// `fix_is_safe` selects the applicability of the fix: pass `true` only when the
/// replacement evaluates to exactly the same string as the literal it replaces.
fn push_diagnostic(
    checker: &mut Checker,
    range: TextRange,
    charset: &NamedCharset,
    fix_is_safe: bool,
) {
    let name = charset.name;
    let mut diagnostic = Diagnostic::new(HardcodedStringCharset { name }, range);
    diagnostic.try_set_fix(|| {
        let (import_edit, binding) = checker.importer().get_or_import_symbol(
            &ImportRequest::import("string", name),
            range.start(),
            checker.semantic(),
        )?;
        let replacement = Edit::range_replacement(binding, range);
        Ok(if fix_is_safe {
            Fix::safe_edits(replacement, [import_edit])
        } else {
            Fix::unsafe_edits(replacement, [import_edit])
        })
    });
    checker.diagnostics.push(diagnostic);
}

/// FURB156
///
/// Handles `x in "<literal>"` / `x not in "<literal>"` where the literal contains
/// exactly the characters of a known charset, but in a different order or with
/// repetitions (e.g. `c in "1234567890"`).
///
/// The fix offered here is unsafe: membership of a *single character* is
/// unaffected by reordering, but `in` on strings is a substring test, so for a
/// longer left operand the result can change. For example,
/// `"90" in "1234567890"` is `True` while `"90" in string.digits` is `False`.
pub(crate) fn hardcoded_string_charset_comparison(checker: &mut Checker, compare: &ExprCompare) {
    // Only a single `in` / `not in` against a string literal is considered.
    let (
        [CmpOp::In | CmpOp::NotIn],
        [Expr::StringLiteral(string_literal @ ExprStringLiteral { value, .. })],
    ) = (compare.ops.as_ref(), compare.comparators.as_ref())
    else {
        return;
    };

    let bytes = value.to_str().as_bytes();

    let Some(charset) = check_charset_as_set(bytes) else {
        return;
    };

    // In this case the diagnostic will be emitted via string_literal check.
    if charset.bytes == bytes {
        return;
    }

    push_diagnostic(checker, string_literal.range, charset, false);
}

/// FURB156
///
/// Handles any string literal whose value is byte-for-byte equal to a known
/// charset. The replacement constant has the same value as the literal, so the
/// fix is safe.
pub(crate) fn hardcoded_string_charset_literal(checker: &mut Checker, expr: &ExprStringLiteral) {
    if let Some(charset) = check_charset_exact(expr.value.to_str().as_bytes()) {
        push_diagnostic(checker, expr.range, charset, true);
    }
}
2 changes: 2 additions & 0 deletions crates/ruff_linter/src/rules/refurb/rules/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ pub(crate) use check_and_remove_from_set::*;
pub(crate) use delete_full_slice::*;
pub(crate) use for_loop_set_mutations::*;
pub(crate) use fstring_number_format::*;
pub(crate) use hardcoded_string_charset::*;
pub(crate) use hashlib_digest_hex::*;
pub(crate) use if_exp_instead_of_or_operator::*;
pub(crate) use if_expr_min_max::*;
Expand Down Expand Up @@ -36,6 +37,7 @@ mod check_and_remove_from_set;
mod delete_full_slice;
mod for_loop_set_mutations;
mod fstring_number_format;
mod hardcoded_string_charset;
mod hashlib_digest_hex;
mod if_exp_instead_of_or_operator;
mod if_expr_min_max;
Expand Down
Loading

0 comments on commit 73aa6ea

Please sign in to comment.