Skip to content

Commit

Permalink
Add Expression::canonicalize (#64)
Browse files Browse the repository at this point in the history
* Add Expression::canonicalize

* Update CHANGELOG

* Add example
  • Loading branch information
Jake-Shadle authored Jan 4, 2024
1 parent 725564d commit 658d151
Show file tree
Hide file tree
Showing 5 changed files with 117 additions and 16 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Changed
- [PR#63](https://github.com/EmbarkStudios/spdx/pull/63) update SPDX license list to 3.22.

### Added
- [PR#64](https://github.com/EmbarkStudios/spdx/pull/64) resolved [#64](https://github.com/EmbarkStudios/spdx/issues/64) by adding `Expression::canonicalize` which fixes otherwise valid expressions into a form parsable with `ParseMode::STRICT`

## [0.10.2] - 2023-07-14
### Changed
- [PR#61](https://github.com/EmbarkStudios/spdx/pull/61) updated the SPDX license list from `3.20` => `3.21`.
Expand Down
65 changes: 65 additions & 0 deletions src/expression/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,71 @@ impl Expression {
Self::parse_mode(original, ParseMode::STRICT)
}

/// Canonicalizes the input expression into a form that can be parsed with
/// [`ParseMode::STRICT`]
///
/// ## Transforms
///
/// 1. '/' is replaced with ' OR '
/// 1. Lower-cased operators ('or', 'and', 'with') are upper-cased
/// 1. '+' is transformed to `-or-later` for GNU licenses
/// 1. Invalid/imprecise license identifiers (eg. `apache2`) are replaced
///    with their valid identifiers
///
/// If the provided expression is not modified then `None` is returned
///
/// Note that this only does fixup of otherwise valid expressions, passing
/// the resulting string to [`Expression::parse`] can still result in
/// additional parse errors, eg. unbalanced parentheses
///
/// ## Errors
///
/// The input is tokenized with [`ParseMode::LAX`]; the first error yielded
/// by the lexer is returned as-is.
///
/// ```
/// assert_eq!(spdx::Expression::canonicalize("apache with LLVM-exception/gpl-3.0+").unwrap().unwrap(), "Apache-2.0 WITH LLVM-exception OR GPL-3.0-or-later");
/// ```
pub fn canonicalize(original: &str) -> Result<Option<String>, ParseError> {
    let mut can = String::with_capacity(original.len());

    let lexer = Lexer::new_mode(original, ParseMode::LAX);

    // Tracks whether the token *immediately* before the current one was a GNU
    // license id, as those use the `-or-later` suffix rather than the `+`
    // used by all other licenses. This is recomputed for every token so that
    // a `+` trailing something that is not a license id (a license ref, an
    // exception, a parenthesis, ...) is emitted verbatim instead of being
    // rewritten based on a GNU license seen earlier in the expression.
    let mut prev_is_gnu = false;
    for tok in lexer {
        let tok = tok?;

        let mut is_gnu = false;
        match tok.token {
            Token::Spdx(id) => {
                is_gnu = id.is_gnu();
                can.push_str(id.name);
            }
            Token::And => can.push_str(" AND "),
            Token::Or => can.push_str(" OR "),
            Token::With => can.push_str(" WITH "),
            Token::Plus => {
                if prev_is_gnu {
                    can.push_str("-or-later");
                } else {
                    can.push('+');
                }
            }
            Token::OpenParen => can.push('('),
            Token::CloseParen => can.push(')'),
            Token::Exception(exc) => can.push_str(exc.name),
            Token::LicenseRef { doc_ref, lic_ref } => {
                if let Some(dr) = doc_ref {
                    can.push_str("DocumentRef-");
                    can.push_str(dr);
                    can.push(':');
                }

                can.push_str("LicenseRef-");
                can.push_str(lic_ref);
            }
        }
        prev_is_gnu = is_gnu;
    }

    // Only hand back a string when canonicalization actually changed something
    Ok((can != original).then_some(can))
}

/// Parses an expression with the specified `ParseMode`. With
/// `ParseMode::LAX` it permits some non-SPDX syntax, such as imprecise
/// license names and "/" used instead of "OR" in expressions.
Expand Down
27 changes: 16 additions & 11 deletions src/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,23 @@ pub struct ParseMode {
/// The `AND`, `OR`, and `WITH` operators are required to be uppercase in
/// the SPDX spec, but enabling this option allows them to be lowercased
pub allow_lower_case_operators: bool,
/// Allows the use of `/` as a synonym for the `OR` operator. This also
/// allows for not having whitespace between the `/` and the terms on either
/// side
/// Allows the use of `/` as a synonym for the `OR` operator.
///
/// This also allows for not having whitespace between the `/` and the terms
/// on either side
pub allow_slash_as_or_operator: bool,
/// Allows some invalid/imprecise identifiers as synonyms for an actual
/// license identifier. See [`IMPRECISE_NAMES`](crate::identifiers::IMPRECISE_NAMES)
/// for a list of the current synonyms. Note that this list is not
/// comprehensive but can be expanded upon when invalid identifiers are
/// found in the wild.
/// license identifier.
///
/// See [`IMPRECISE_NAMES`](crate::identifiers::IMPRECISE_NAMES) for a list
/// of the current synonyms. Note that this list is not comprehensive but
/// can be expanded upon when invalid identifiers are found in the wild.
pub allow_imprecise_license_names: bool,
/// The various GPL licenses diverge from every other license in the SPDX
/// license list by having an `-or-later` variant that used as a suffix on a
/// base license (eg. `GPL-3.0-or-later`) rather than the canonical `GPL-3.0+`.
/// license list by having an `-or-later` variant that is used as a suffix
/// on a base license (eg. `GPL-3.0-or-later`) rather than the canonical
/// `GPL-3.0+`.
///
/// This option just allows GPL licenses to be treated similarly to all of
/// the other SPDX licenses.
pub allow_postfix_plus_on_gpl: bool,
Expand Down Expand Up @@ -63,8 +67,7 @@ impl ParseMode {
pub enum Token<'a> {
/// A recognized SPDX license id
Spdx(LicenseId),
/// A `LicenseRef-` prefixed id, with an optional
/// `DocRef-`
/// A `LicenseRef-` prefixed id, with an optional `DocumentRef-`
LicenseRef {
doc_ref: Option<&'a str>,
lic_ref: &'a str,
Expand Down Expand Up @@ -250,6 +253,8 @@ impl<'a> Iterator for Lexer<'a> {
ok_token(Token::And)
} else if self.mode.allow_lower_case_operators && m == "or" {
ok_token(Token::Or)
} else if self.mode.allow_lower_case_operators && m == "with" {
ok_token(Token::With)
} else if let Some(lic_id) = crate::license_id(m) {
ok_token(Token::Spdx(lic_id))
} else if let Some(exc_id) = crate::exception_id(m) {
Expand Down
7 changes: 2 additions & 5 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,7 @@ pub fn license_id(name: &str) -> Option<LicenseId> {
}

/// Find license partially matching the name, e.g. "apache" => "Apache-2.0"
///
/// Returns length (in bytes) of the string matched. Garbage at the end is
/// ignored. See
/// [`identifiers::IMPRECISE_NAMES`](identifiers/constant.IMPRECISE_NAMES.html)
Expand All @@ -510,11 +511,7 @@ pub fn imprecise_license_id(name: &str) -> Option<(LicenseId, usize)> {
for (prefix, correct_name) in identifiers::IMPRECISE_NAMES {
if let Some(name_prefix) = name.as_bytes().get(0..prefix.len()) {
if prefix.as_bytes().eq_ignore_ascii_case(name_prefix) {
let mut len = prefix.len();
if name.as_bytes().get(len).copied() == Some(b'+') {
len += 1;
}
return license_id(correct_name).map(|lic| (lic, len));
return license_id(correct_name).map(|lic| (lic, prefix.len()));
}
}
}
Expand Down
31 changes: 31 additions & 0 deletions tests/validation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -224,3 +224,34 @@ fn validates_excessive_parens() {
]
]);
}

#[test]
fn canonicalization() {
    use spdx::Expression;

    // Each entry pairs an input expression with the canonical form expected
    // back from `Expression::canonicalize`; `None` means the input is already
    // canonical and must be reported as unmodified.
    let cases: [(&str, Option<&str>); 5] = [
        ("Apache-2.0 OR MIT", None),
        ("Apache-2.0/MIT", Some("Apache-2.0 OR MIT")),
        ("MIT and GPL-3.0+", Some("MIT AND GPL-3.0-or-later")),
        (
            "simplified bsd license or gpl-2.0+",
            Some("BSD-2-Clause OR GPL-2.0-or-later"),
        ),
        (
            "apache with LLVM-exception/mpl",
            Some("Apache-2.0 WITH LLVM-exception OR MPL-2.0"),
        ),
    ];

    for (input, expected) in cases {
        let canonical = Expression::canonicalize(input).unwrap();
        assert_eq!(
            canonical.as_deref(),
            expected,
            "unexpected canonical form for `{}`",
            input
        );
    }
}

0 comments on commit 658d151

Please sign in to comment.