From 32fd6c88ff73ca7c7ff62d0686340a785217b7ed Mon Sep 17 00:00:00 2001 From: BMARR Date: Fri, 15 Nov 2024 14:12:20 -0500 Subject: [PATCH] chore: deleted affiliation, binary, review analysis from hipcheck core --- hipcheck/src/analysis/mod.rs | 100 +------ hipcheck/src/metric/affiliation.rs | 441 ----------------------------- hipcheck/src/metric/binary.rs | 33 --- hipcheck/src/metric/mod.rs | 24 +- hipcheck/src/metric/review.rs | 44 --- 5 files changed, 12 insertions(+), 630 deletions(-) delete mode 100644 hipcheck/src/metric/affiliation.rs delete mode 100644 hipcheck/src/metric/binary.rs delete mode 100644 hipcheck/src/metric/review.rs diff --git a/hipcheck/src/analysis/mod.rs b/hipcheck/src/analysis/mod.rs index a6e8ab26..69a5c2a7 100644 --- a/hipcheck/src/analysis/mod.rs +++ b/hipcheck/src/analysis/mod.rs @@ -6,7 +6,7 @@ use crate::{ config::{AttacksConfigQuery, CommitConfigQuery, PracticesConfigQuery}, data::git::GitProvider, error::Result, - metric::{affiliation::AffiliatedType, MetricProvider}, + metric::{MetricProvider}, plugin::QueryResult, F64, }; @@ -23,12 +23,8 @@ pub trait AnalysisProvider: AttacksConfigQuery + CommitConfigQuery + GitProvider + MetricProvider + PracticesConfigQuery { - /// Returns result of affiliation analysis - fn affiliation_analysis(&self) -> Result; - - /// Returns result of binary analysis - fn binary_analysis(&self) -> Result; - + + /// Returns result of churn analysis fn churn_analysis(&self) -> Result; @@ -41,80 +37,13 @@ pub trait AnalysisProvider: /// Returns result of fuzz analysis fn fuzz_analysis(&self) -> Result; - /// Returns result of review analysis - fn review_analysis(&self) -> Result; - + /// Returns result of typo analysis fn typo_analysis(&self) -> Result; } + -pub fn affiliation_analysis(db: &dyn AnalysisProvider) -> Result { - let results = db.affiliation_metric()?; - - let affiliated_iter = results - .affiliations - .iter() - .filter(|a| a.affiliated_type.is_affiliated()); - - // @Note - policy expr json injection can't handle objs/strings currently - let value: Vec = affiliated_iter.clone().map(|_| true).collect(); - - let mut contributor_freq_map = HashMap::new(); - - for affiliation in affiliated_iter { - let commit_view = db.contributors_for_commit(Arc::clone(&affiliation.commit))?; - - let contributor = match affiliation.affiliated_type { - AffiliatedType::Author => String::from(&commit_view.author.name), - AffiliatedType::Committer => String::from(&commit_view.committer.name), - AffiliatedType::Neither => String::from("Neither"), - AffiliatedType::Both => String::from("Both"), - }; - - let count_commits_for = |contributor| { - db.commits_for_contributor(Arc::clone(contributor)) - .into_iter() - .count() as i64 - }; - - let author_commits = count_commits_for(&commit_view.author); - let committer_commits = count_commits_for(&commit_view.committer); - - let commit_count = match affiliation.affiliated_type { - AffiliatedType::Neither => 0, - AffiliatedType::Both => author_commits + committer_commits, - AffiliatedType::Author => author_commits, - AffiliatedType::Committer => committer_commits, - }; - - // Add string representation of affiliated contributor with count of associated commits - contributor_freq_map.insert(contributor, commit_count); - } - - let concerns: Vec = contributor_freq_map - .into_iter() - .map(|(contributor, count)| format!("Contributor {} has count {}", contributor, count)) - .collect(); - - Ok(QueryResult { - value: serde_json::to_value(value)?, - concerns, - }) -} - -pub fn binary_analysis(db: &dyn AnalysisProvider) -> Result { - let results = db.binary_metric()?; - let concerns: Vec = results - .binary_files - .iter() - .map(|binary_file| format!("Binary file at '{}'", binary_file)) - .collect(); - Ok(QueryResult { - value: serde_json::to_value(&results.binary_files)?, - concerns, - }) -} pub fn churn_analysis(db: &dyn AnalysisProvider) -> Result { let results = db.churn_metric()?; @@ -162,25 +91,6 @@ pub fn fuzz_analysis(db: &dyn AnalysisProvider) -> Result { }) } -pub fn review_analysis(db: &dyn AnalysisProvider) -> Result { - let results = db.review_metric()?; - let num_flagged = results - .pull_reviews - .iter() - .filter(|p| p.has_review.not()) - .count() as u64; - let percent_flagged = match (num_flagged, results.pull_reviews.len()) { - (flagged, total) if flagged != 0 && total != 0 => { - num_flagged as f64 / results.pull_reviews.len() as f64 - } - _ => 0.0, - }; - let value = F64::new(percent_flagged).expect("Percent threshold should never be NaN"); - Ok(QueryResult { - value: serde_json::to_value(value)?, - concerns: vec![], - }) -} pub fn typo_analysis(db: &dyn AnalysisProvider) -> Result { let results = db.typo_metric()?; diff --git a/hipcheck/src/metric/affiliation.rs b/hipcheck/src/metric/affiliation.rs deleted file mode 100644 index d328fdaf..00000000 --- a/hipcheck/src/metric/affiliation.rs +++ /dev/null @@ -1,441 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -use crate::{ - data::git::{Commit, CommitContributorView}, - error::{Context as _, Error, Result}, - hc_error, - metric::MetricProvider, - util::fs as file, -}; -use serde::{ - de::{Error as SerdeError, Visitor}, - Deserialize, Deserializer, Serialize, -}; -use std::{ - cell::RefCell, - collections::HashMap, - convert::{TryFrom, TryInto}, - fmt, - ops::Not as _, - path::Path, - result::Result as StdResult, - sync::Arc, -}; - -#[derive(Debug, Eq, PartialEq, Serialize)] -pub struct AffiliationOutput { - pub affiliations: Vec, -} - -#[derive(Debug, Clone, Eq, PartialEq, Serialize)] -pub struct Affiliation { - pub commit: Arc, - pub affiliated_type: AffiliatedType, -} - -#[derive(Debug, Eq, PartialEq, Serialize, Clone, Copy)] -pub enum AffiliatedType { - Author, - Committer, - Both, - Neither, -} - -impl AffiliatedType { - fn is(affiliator: &Affiliator, commit_view: &CommitContributorView) -> AffiliatedType { - let author_is_match = affiliator.is_match(&commit_view.author.email); - let committer_is_match = affiliator.is_match(&commit_view.committer.email); - - match (author_is_match, committer_is_match) { - (true, true) => AffiliatedType::Both, - (true, false) => AffiliatedType::Author, - (false, true) => AffiliatedType::Committer, - (false, false) => AffiliatedType::Neither, - } - } - - pub fn is_affiliated(&self) -> bool { - matches!(self, AffiliatedType::Neither).not() - } -} - -pub(crate) fn affiliation_metric(db: &dyn MetricProvider) -> Result> { - log::debug!("running affiliation metric"); - - // Parse the Orgs file and construct an OrgSpec. - let org_spec = OrgSpec::from_file(&db.orgs_file()?).context("failed to load org spec")?; - - // Get the commits for the source. - let commits = db - .commits() - .context("failed to get commits for affiliation metric")?; - - // Use the OrgSpec to build an Affiliator. - let affiliator = Affiliator::from_spec(&org_spec) - .context("failed to build affiliation checker from org spec")?; - - // Construct a big enough Vec for the affiliation info. - let mut affiliations = Vec::with_capacity(commits.len()); - - for commit in commits.iter() { - // Check if a commit matches the affiliation rules. - let commit_view = db - .contributors_for_commit(Arc::clone(commit)) - .context("failed to get commits")?; - - let affiliated_type = AffiliatedType::is(&affiliator, &commit_view); - - affiliations.push(Affiliation { - commit: Arc::clone(commit), - affiliated_type, - }); - } - - log::info!("completed affiliation metric"); - - Ok(Arc::new(AffiliationOutput { affiliations })) -} - -/// A type which encapsulates checking whether a given string matches an org in theorgs file, -/// based on the mode in question. If the mode is Independent, then you're looking for -/// the strings that don't match any of the hosts in the set. If the mode is Affiliated, -/// you're looking for the strings that match one of the hosts in the set. -struct Affiliator<'haystack> { - patterns: Matcher<'haystack>, - mode: Mode, -} - -impl<'haystack> Affiliator<'haystack> { - /// Check whether the given string is a match for the set of hosts, based on the mode. - /// - /// If independent mode is on, you're looking for strings which do not match any of - /// the hosts. - /// - /// If affiliated mode is on, you're looking for strings which do match one of the - /// hosts. - fn is_match(&self, s: &str) -> bool { - match self.mode { - Mode::Independent => !self.patterns.is_match(s), - Mode::Affiliated => self.patterns.is_match(s), - Mode::All => true, - Mode::None => false, - } - } - - /// Construct a new Affiliator from a given OrgSpec (built from an Orgs.toml file). - fn from_spec(spec: &'haystack OrgSpec) -> Result> { - let patterns = spec.patterns()?; - let mode = spec.mode(); - Ok(Affiliator { patterns, mode }) - } -} - -#[derive(Default)] -struct Matcher<'haystack> { - cache: RefCell>, - hosts: Vec<&'haystack str>, -} - -impl<'haystack> Matcher<'haystack> { - fn new(hosts: Vec<&'haystack str>) -> Matcher<'haystack> { - Matcher { - hosts, - ..Matcher::default() - } - } - - fn is_match(&self, s: &str) -> bool { - if let Some(prior_result) = self.cache.borrow().get(s) { - return *prior_result; - } - - for host in &self.hosts { - if s.ends_with(host) { - self.cache.borrow_mut().insert(s.to_owned(), true); - return true; - } - } - - false - } -} - -/// An overall organization metric specification, with a strategy for how the -/// metric will be performed, and a list of organizations. -#[derive(Deserialize)] -struct OrgSpec { - strategy: Strategy, - orgs: Vec, -} - -impl OrgSpec { - fn from_file(orgs_file: &Path) -> Result { - let raw: RawOrgSpec = file::read_toml(orgs_file).context("failed to read orgs file")?; - raw.try_into() - } - - /// Get the patterns to check against based on the org spec contents. - fn patterns(&self) -> Result> { - match self.strategy { - Strategy::All(_) => { - let mut hosts = Vec::new(); - - for org in &self.orgs { - for host in &org.hosts { - hosts.push(host.as_ref()); - } - } - - Ok(Matcher::new(hosts)) - } - Strategy::Specified(ref spec) => { - let mut hosts = Vec::new(); - - for org in &spec - .orgs_to_analyze(&self.orgs) - .context("can't resolve orgs to analyze from spec")? - { - for host in &org.hosts { - hosts.push(host.as_ref()); - } - } - - Ok(Matcher::new(hosts)) - } - } - } - - /// Get the mode associated with the OrgSpec. - fn mode(&self) -> Mode { - match self.strategy { - Strategy::All(mode) => mode, - Strategy::Specified(ref strategy) => strategy.mode, - } - } -} - -/// An organization metric strategy. It either implicitly includes _all_ -/// organizations in the Orgs struct, or has a more detailed custom specification. -#[derive(Deserialize)] -enum Strategy { - All(Mode), - Specified(Spec), -} - -/// The two modes for an metric strategy. The analyzer can either look for all -/// commits which are independent of the listed orgs, or all commits which are -/// affiliated with the listed orgs. -#[derive(Deserialize, Clone, Copy)] -enum Mode { - Independent, - Affiliated, - All, - None, -} - -/// A specification for a custom strategy. Includes a mode (independent or -/// affiliated), and a list of organization specifiers. This allows for -/// selection of orgs on both an org-by-org and a country-wide basis. Such -/// specifiers may be combined (for example, analyzing all commits from some -/// country, plus commits from an organization not from that country). -#[derive(Deserialize)] -struct Spec { - mode: Mode, - list: Vec, -} - -impl Spec { - /// Find all orgs in a given org list matching the org specifiers. - fn orgs_to_analyze<'spec>(&self, full_list: &'spec [Org]) -> Result> { - let mut orgs = vec![]; - - for specifier in &self.list { - let mut addition = match specifier.kind { - Kind::Country => get_by_country(&specifier.value, full_list) - .context("can't resolve country specifier to list of orgs")?, - Kind::Name => get_by_name(&specifier.value, full_list) - .context("can't resolve name specifier to a specific org")?, - }; - - orgs.append(&mut addition); - } - - Ok(orgs) - } -} - -/// Filter a list of orgs based on the country they're affiliated with. -fn get_by_country<'spec>(country: &str, list: &'spec [Org]) -> Result> { - let orgs: Vec<_> = list.iter().filter(|org| org.country == country).collect(); - - if orgs.is_empty() { - Err(hc_error!("invalid country name '{}'", country)) - } else { - Ok(orgs) - } -} - -/// Find a specific org in a list of orgs. -/// -/// Returns a Vec with one element, for symmetry with `get_by_country`. -fn get_by_name<'spec>(name: &str, list: &'spec [Org]) -> Result> { - let org = list.iter().find(|org| org.name == name); - - match org { - Some(org) => Ok(vec![org]), - None => Err(hc_error!("invalid org name '{}'", name)), - } -} - -/// A single organization, with a name, a list of hosts (which form the basis -/// for the hosts used in the analyzer), and an affiliated country. -#[derive(Clone, Deserialize, Debug)] -struct Org { - name: String, - hosts: Vec, - country: String, -} - -/// An org specifier, which marks whether an organization's country or name is -/// being referenced, and carries the actual value of the reference. -#[derive(Debug)] -struct Reference { - kind: Kind, - value: String, -} - -/// A specifier kind, which identifies whether the specifier is referencing an -/// organization's name or its country. -#[derive(Debug)] -enum Kind { - Country, - Name, -} - -impl<'a> TryFrom<&'a str> for Kind { - type Error = Error; - - fn try_from(s: &'a str) -> StdResult { - match s { - "country" => Ok(Kind::Country), - "org" => Ok(Kind::Name), - _ => Err(hc_error!( - "invalid org reference '{}' (expected: 'country' or 'org')", - s - )), - } - } -} - -impl<'de> Deserialize<'de> for Reference { - fn deserialize(deserializer: D) -> StdResult - where - D: Deserializer<'de>, - { - deserializer.deserialize_str(ReferenceVisitor) - } -} - -struct ReferenceVisitor; - -impl<'de> Visitor<'de> for ReferenceVisitor { - type Value = Reference; - - fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { - formatter.write_str("a string representing an org spec reference") - } - - // Split string on colon. - // Left side is the kind (should be "Country" or "Name") - // Right side is the value - fn visit_str(self, s: &str) -> StdResult - where - E: SerdeError, - { - let parts: Vec<&str> = s.split(':').collect(); - - if parts.len() != 2 { - return Err(SerdeError::custom("invalid reference")); - } - - let kind = parts[0].try_into().map_err(SerdeError::custom)?; - let value = parts[1].into(); - Ok(Reference { kind, value }) - } -} - -#[derive(Deserialize, Debug)] -struct RawOrgSpec { - strategy: Option, - strategy_spec: Option, - orgs: Vec, -} - -#[derive(Deserialize, Debug)] -struct RawSpec { - mode: String, - list: Vec, -} - -impl TryInto for RawOrgSpec { - type Error = Error; - - // PANIC: Safe to unwrap the try_into() functions, because if there is nothing to convert, the match will go to a different arm. - fn try_into(self) -> StdResult { - let strategy = match (self.strategy, self.strategy_spec) { - (Some(strat), None) => { - // Convert the strat String into a Strategy::All(Mode) with a TryFrom for String -> Mode - Strategy::All(strat.try_into().map_err(|_| { - hc_error!("failed to convert strat String into a Strategy::All(Mode)") - })?) - } - (None, Some(spec)) => { - // Convert the RawSpec into a Spec (add its own TryInto impl) - Strategy::Specified( - spec.try_into() - .map_err(|_| hc_error!("failed to convert RawSpec into a Spec"))?, - ) - } - (None, None) => { - // Default: Use the Strategy::All(Mode::Affiliated) - Strategy::All(Mode::Affiliated) - } - (Some(_), Some(_)) => { - // Error: Can't have both a strategy and a strategy_spec - return Err(Error::msg("ambiguous strategy specifier (orgs file can't contain both 'strategy' and 'strategy_spec')")); - } - }; - - let orgs = self.orgs; - - Ok(OrgSpec { strategy, orgs }) - } -} - -impl TryFrom for Mode { - type Error = Error; - - fn try_from(s: String) -> StdResult { - match s.as_ref() { - "affiliated" => Ok(Mode::Affiliated), - "independent" => Ok(Mode::Independent), - "all" => Ok(Mode::All), - "none" => Ok(Mode::None), - _ => Err(hc_error!( - "invalid mode '{}' (expected: 'affiliated', 'independent', 'all' or 'none')", - s - )), - } - } -} - -impl TryInto for RawSpec { - type Error = Error; - - fn try_into(self) -> StdResult { - let mode: Mode = self.mode.try_into()?; - let list = self.list; - - Ok(Spec { mode, list }) - } -} diff --git a/hipcheck/src/metric/binary.rs b/hipcheck/src/metric/binary.rs deleted file mode 100644 index 74c0aa0e..00000000 --- a/hipcheck/src/metric/binary.rs +++ /dev/null @@ -1,33 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -use crate::{ - error::Result, - metric::{binary_detector::detect_binary_files, MetricProvider}, - TryFilter, -}; -use serde::Serialize; -use std::sync::Arc; - -#[derive(Debug, Eq, PartialEq, Serialize)] -pub struct BinaryOutput { - pub binary_files: Vec>, -} - -/// Determine which files in a repository are of a binary format. -/// -/// Collect the paths to all non-plaintext files, filter out non-code -/// binaries (like images or audio, which may be valid parts of a project's -/// source), and return the rest to be counted for Hipcheck's report. -pub fn binary_metric(db: &dyn MetricProvider) -> Result> { - log::debug!("running binary metric"); - - let pathbuf_rc = db.local(); - let binary_files = detect_binary_files(&pathbuf_rc)? - .into_iter() - .try_filter(|f| db.is_likely_binary_file(Arc::clone(f))) - .collect::>()?; - - log::info!("completed binary metric"); - - Ok(Arc::new(BinaryOutput { binary_files })) -} diff --git a/hipcheck/src/metric/mod.rs b/hipcheck/src/metric/mod.rs index 71093f8c..74c4f6a6 100644 --- a/hipcheck/src/metric/mod.rs +++ b/hipcheck/src/metric/mod.rs @@ -1,7 +1,7 @@ // SPDX-License-Identifier: Apache-2.0 -pub mod affiliation; -pub mod binary; + + pub mod binary_detector; pub mod churn; pub mod commit_trust; @@ -11,7 +11,7 @@ pub mod fuzz; pub mod identity; pub mod linguist; mod math; -pub mod review; + pub mod typo; use crate::{ @@ -19,10 +19,10 @@ use crate::{ data::{git::GitProvider, DependenciesProvider, FuzzProvider, PullRequestReviewProvider}, error::Result, metric::{ - affiliation::AffiliationOutput, binary::BinaryOutput, + binary_detector::BinaryFile, churn::ChurnOutput, commit_trust::CommitTrustOutput, contributor_trust::ContributorTrustOutput, entropy::EntropyOutput, fuzz::FuzzOutput, - identity::IdentityOutput, linguist::Linguist, review::ReviewOutput, typo::TypoOutput, + identity::IdentityOutput, linguist::Linguist, typo::TypoOutput, }, }; use std::sync::Arc; @@ -40,14 +40,7 @@ pub trait MetricProvider: + PullRequestReviewProvider { - /// Returns result of affiliation metric - #[salsa::invoke(affiliation::affiliation_metric)] - fn affiliation_metric(&self) -> Result>; - - /// Returns result of binary metric - #[salsa::invoke(binary::binary_metric)] - fn binary_metric(&self) -> Result>; - + /// Returns result of churn metric #[salsa::invoke(churn::churn_metric)] fn churn_metric(&self) -> Result>; @@ -72,10 +65,7 @@ pub trait MetricProvider: #[salsa::invoke(fuzz::fuzz_metric)] fn fuzz_metric(&self) -> Result>; - /// Returns result of review metric - #[salsa::invoke(review::review_metric)] - fn review_metric(&self) -> Result>; - + /// Returns result of typo metric #[salsa::invoke(typo::typo_metric)] fn typo_metric(&self) -> Result>; diff --git a/hipcheck/src/metric/review.rs b/hipcheck/src/metric/review.rs deleted file mode 100644 index df4a9eff..00000000 --- a/hipcheck/src/metric/review.rs +++ /dev/null @@ -1,44 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 - -use crate::{ - data::PullRequest, - error::{Context as _, Result}, - metric::MetricProvider, -}; -use serde::Serialize; -use std::sync::Arc; - -#[derive(Debug, Eq, PartialEq, Serialize)] -pub struct ReviewOutput { - pub pull_reviews: Vec, -} - -#[derive(Debug, Clone, Eq, PartialEq, Serialize)] -pub struct PullReview { - pub pull_request: Arc, - pub has_review: bool, -} - -pub fn review_metric(db: &dyn MetricProvider) -> Result> { - log::debug!("running review metric"); - - let pull_requests = db - .pull_request_reviews() - .context("failed to get pull request reviews")?; - - log::trace!("got pull requests [requests='{:#?}']", pull_requests); - - let mut pull_reviews = Vec::with_capacity(pull_requests.len()); - - for pull_request in pull_requests.as_ref() { - let has_review = pull_request.reviews > 0; - pull_reviews.push(PullReview { - pull_request: Arc::clone(pull_request), - has_review, - }); - } - - log::info!("completed review metric"); - - Ok(Arc::new(ReviewOutput { pull_reviews })) -}