Optimize a little image compare algorithm #528

Merged
merged 3 commits on Dec 28, 2021
Changes from all commits
162 changes: 80 additions & 82 deletions czkawka_core/src/similar_images.rs
@@ -1,4 +1,4 @@
use std::collections::{BTreeMap, BTreeSet};
use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
use std::fs::OpenOptions;
use std::fs::{File, Metadata};
use std::io::Write;
@@ -47,7 +47,6 @@ const LOOP_DURATION: u32 = 200; //ms

#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Debug, Serialize, Deserialize)]
pub enum Similarity {
None,
Similar(u32),
}
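With the unused None variant removed above, Similarity is now a single-variant enum, which is what allows the new find_similar_hashes further down to unpack it with a plain let instead of a match that panics on None. A minimal standalone sketch of that pattern (illustrative code, not part of the project):

    pub enum Similarity {
        Similar(u32),
    }

    fn main() {
        let s = Similarity::Similar(3);
        // A single-variant enum gives an irrefutable pattern, so no match or panic fallback is needed.
        let Similarity::Similar(value) = s;
        assert_eq!(value, 3);
    }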

@@ -239,7 +238,11 @@ impl SimilarImages {
self.stopped_search = true;
return;
}
if !self.sort_images(stop_receiver, progress_sender) {
if !self.hash_images(stop_receiver, progress_sender) {
self.stopped_search = true;
return;
}
if !self.find_similar_hashes(stop_receiver, progress_sender) {
self.stopped_search = true;
return;
}
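This hunk splits what used to be a single sort_images call into two phases: hash_images (load cached hashes and hash new files) and find_similar_hashes (group the hashes), each of which can abort when the stop receiver fires. A skeletal sketch of the resulting control flow; the outer method name and the simplified signatures are illustrative, not the project's exact API:

    struct SimilarImages {
        stopped_search: bool,
        // ... other fields omitted
    }

    impl SimilarImages {
        fn find_similar_images(&mut self) {
            // Phase 1: load cached hashes, hash any new images, save the cache.
            if !self.hash_images() {
                self.stopped_search = true;
                return;
            }
            // Phase 2: group identical hashes, then search the BK-tree distance by distance.
            if !self.find_similar_hashes() {
                self.stopped_search = true;
                return;
            }
        }

        fn hash_images(&mut self) -> bool {
            true // placeholder: would return false when the user stops the scan
        }

        fn find_similar_hashes(&mut self) -> bool {
            true // placeholder: would return false when the user stops the scan
        }
    }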
@@ -412,7 +415,7 @@ impl SimilarImages {
},

hash: Vec::new(),
similarity: Similarity::None,
similarity: Similarity::Similar(0),
};

fe_result.push((current_file_name.to_string_lossy().to_string(), fe));
@@ -450,7 +453,7 @@ impl SimilarImages {
// - Join the already read hashes with the hashes which were read from the file
// - Join all hashes and save them to the file

fn sort_images(&mut self, stop_receiver: Option<&Receiver<()>>, progress_sender: Option<&futures::channel::mpsc::UnboundedSender<ProgressData>>) -> bool {
fn hash_images(&mut self, stop_receiver: Option<&Receiver<()>>, progress_sender: Option<&futures::channel::mpsc::UnboundedSender<ProgressData>>) -> bool {
let hash_map_modification = SystemTime::now();

let loaded_hash_map;
@@ -579,7 +582,6 @@ impl SimilarImages {
for (file_entry, buf) in &vec_file_entry {
// Only use non-broken hashes for comparing (hashes that are all 0 or all 255 mean that the algorithm failed to decode them, e.g. because the image contains a lot of alpha channel)
if !(buf.iter().all(|e| *e == 0) || buf.iter().all(|e| *e == 255)) {
self.bktree.add(buf.clone());
self.image_hashes.entry(buf.clone()).or_insert_with(Vec::<FileEntry>::new);
self.image_hashes.get_mut(buf).unwrap().push(file_entry.clone());
}
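The guard above drops degenerate hashes: a hash that is all 0x00 or all 0xFF usually means the hashing backend effectively failed (for example on images dominated by the alpha channel), and keeping such hashes would group unrelated images at distance 0. The same condition as a small standalone helper (illustrative only):

    // A hash is usable only if it is not uniformly 0 and not uniformly 255.
    fn is_usable_hash(buf: &[u8]) -> bool {
        !(buf.iter().all(|e| *e == 0) || buf.iter().all(|e| *e == 255))
    }

    fn main() {
        assert!(!is_usable_hash(&[0, 0, 0, 0]));
        assert!(!is_usable_hash(&[255, 255, 255, 255]));
        assert!(is_usable_hash(&[0, 17, 255, 3]));
    }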
@@ -595,18 +597,32 @@ }
}

Common::print_time(hash_map_modification, SystemTime::now(), "sort_images - saving data to files".to_string());
let hash_map_modification = SystemTime::now();
true
}

let similarity: u32 = match self.similarity {
Similarity::Similar(k) => k,
_ => panic!(),
};
fn find_similar_hashes(&mut self, stop_receiver: Option<&Receiver<()>>, progress_sender: Option<&futures::channel::mpsc::UnboundedSender<ProgressData>>) -> bool {
let hash_map_modification = SystemTime::now();
let Similarity::Similar(similarity) = self.similarity;

// Results
let mut collected_similar_images: BTreeMap<Vec<u8>, Vec<FileEntry>> = Default::default();

let mut available_hashes = self.image_hashes.clone();
let mut temp_hashes = Default::default();
mem::swap(&mut temp_hashes, &mut self.image_hashes);

let mut this_time_check_hashes;
let mut master_of_group: BTreeSet<Vec<u8>> = Default::default(); // List of all master hashes, which are responsible for comparing
let mut master_of_group: HashSet<Vec<u8>> = Default::default(); // List of all master hashes, which are responsible for comparing

let mut available_hashes: HashMap<Vec<u8>, Vec<FileEntry>> = Default::default();
for (hash, vec_file_entry) in temp_hashes {
// There exist 2 or more files with the same hash
if vec_file_entry.len() >= 2 {
collected_similar_images.insert(hash, vec_file_entry);
} else {
self.bktree.add(hash.clone());
available_hashes.insert(hash, vec_file_entry);
}
}
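This block is the heart of the optimization: instead of cloning self.image_hashes, the map is taken out with mem::swap, entries whose hashes are exactly equal immediately form a finished group, and only the remaining unique hashes are added to the BK-tree for the fuzzy distance search. A rough standalone sketch of that pre-grouping step, with illustrative names and String paths standing in for FileEntry:

    use std::collections::HashMap;

    // Split hashes into groups of exact duplicates (no tree search needed)
    // and lone entries that still need a fuzzy BK-tree lookup.
    fn pre_group(
        all_hashes: HashMap<Vec<u8>, Vec<String>>,
    ) -> (HashMap<Vec<u8>, Vec<String>>, HashMap<Vec<u8>, Vec<String>>) {
        let mut exact_groups = HashMap::new();
        let mut needs_fuzzy_search = HashMap::new();
        for (hash, files) in all_hashes {
            if files.len() >= 2 {
                exact_groups.insert(hash, files); // identical hashes are already a group
            } else {
                needs_fuzzy_search.insert(hash, files); // single file: compare via BK-tree later
            }
        }
        (exact_groups, needs_fuzzy_search)
    }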

//// PROGRESS THREAD START
let progress_thread_run = Arc::new(AtomicBool::new(true));
@@ -636,81 +652,62 @@ impl SimilarImages {
thread::spawn(|| {})
};
//// PROGRESS THREAD END
if similarity >= 1 {
for current_similarity in 1..=similarity {
this_time_check_hashes = available_hashes.clone();

for current_similarity in 0..=similarity {
this_time_check_hashes = available_hashes.clone();

if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() {
// End the thread which sends info to the GUI
progress_thread_run.store(false, Ordering::Relaxed);
progress_thread_handle.join().unwrap();
return false;
}

for (hash, vec_file_entry) in &this_time_check_hashes {
atomic_mode_counter.fetch_add(1, Ordering::Relaxed);

let vector_with_found_similar_hashes = self
.bktree
.find(hash, similarity)
.filter(|r| (r.0 == current_similarity) && !master_of_group.contains(r.1) && available_hashes.contains_key(r.1))
.collect::<Vec<_>>();

// No hash with the specific distance was found
if vector_with_found_similar_hashes.is_empty() {
continue;
}

// This picture has no similar pictures other than itself at similarity 0
if current_similarity == 0 && vector_with_found_similar_hashes.len() == 1 {
continue;
}

// This shouldn't be executed too many times, so checking it should be quite fast
if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() {
// End the thread which sends info to the GUI
progress_thread_run.store(false, Ordering::Relaxed);
progress_thread_handle.join().unwrap();
return false;
}

// If the master group hasn't been added yet, add it now to the already processed ones
if !master_of_group.contains(hash) {
master_of_group.insert(hash.clone());
collected_similar_images.insert(hash.clone(), Vec::new());

let mut things: Vec<FileEntry> = vec_file_entry
.iter()
.map(|fe| FileEntry {
path: fe.path.clone(),
size: fe.size,
dimensions: fe.dimensions.clone(),
modified_date: fe.modified_date,
hash: fe.hash.clone(),
similarity: Similarity::Similar(0),
})
.collect();
collected_similar_images.get_mut(hash).unwrap().append(&mut things);
}
for (hash, vec_file_entry) in this_time_check_hashes.into_iter() {
atomic_mode_counter.fetch_add(1, Ordering::Relaxed);

// Since we already checked this hash, we don't need to check it again
if current_similarity != 0 {
vector_with_found_similar_hashes.iter().for_each(|e| {
let mut things: Vec<FileEntry> = available_hashes
.get_mut(e.1)
.unwrap()
.iter()
.map(|fe| FileEntry {
path: fe.path.clone(),
size: fe.size,
dimensions: fe.dimensions.clone(),
modified_date: fe.modified_date,
hash: Vec::new(),
similarity: Similarity::Similar(current_similarity),
// Finds hashes at the specific distance from the current hash
let vector_with_found_similar_hashes = self
.bktree
.find(&hash, similarity)
.filter(|(similarity, hash)| (*similarity == current_similarity) && !master_of_group.contains(*hash) && available_hashes.contains_key(*hash))
.collect::<Vec<_>>();

// No hash with the specific distance was found
if vector_with_found_similar_hashes.is_empty() {
continue;
}

// The currently checked hash isn't in any similarity group yet, so create one, since similar images were found
if !master_of_group.contains(&hash) {
master_of_group.insert(hash.clone());
collected_similar_images.insert(hash.clone(), Vec::new());

let mut things: Vec<FileEntry> = vec_file_entry
.into_iter()
.map(|mut fe| {
fe.similarity = Similarity::Similar(0);
fe
})
.collect::<Vec<_>>();
collected_similar_images.get_mut(hash).unwrap().append(&mut things);
available_hashes.remove(e.1);
.collect();
collected_similar_images.get_mut(&hash).unwrap().append(&mut things);

// This shouldn't be executed too many times, so checking it should be quite fast
if stop_receiver.is_some() && stop_receiver.unwrap().try_recv().is_ok() {
// End the thread which sends info to the GUI
progress_thread_run.store(false, Ordering::Relaxed);
progress_thread_handle.join().unwrap();
return false;
}
}

vector_with_found_similar_hashes.iter().for_each(|(_similarity, other_hash)| {
let mut vec_fe = available_hashes.remove(*other_hash).unwrap();
for fe in &mut vec_fe {
fe.similarity = Similarity::Similar(current_similarity)
}

collected_similar_images.get_mut(&hash).unwrap().append(&mut vec_fe);
});
}
}
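In the rewritten loop above, the distance the BK-tree query filters on is the Hamming distance between raw hash bytes, and each pass over current_similarity accepts only matches at exactly that distance before removing them from available_hashes. A tiny naive stand-in for bktree.find plus that equality filter (the real code uses the BK-tree; the helper names are illustrative):

    // Bitwise Hamming distance between two perceptual hashes of equal length.
    fn hamming(a: &[u8], b: &[u8]) -> u32 {
        a.iter().zip(b).map(|(x, y)| (x ^ y).count_ones()).sum()
    }

    // Naive equivalent of `bktree.find(&hash, similarity)` followed by the
    // `*similarity == current_similarity` filter from the diff.
    fn matches_at_distance<'a>(query: &[u8], candidates: &'a [Vec<u8>], distance: u32) -> Vec<&'a Vec<u8>> {
        candidates.iter().filter(|&c| hamming(query, c) == distance).collect()
    }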
@@ -719,7 +716,8 @@ impl SimilarImages {
progress_thread_run.store(false, Ordering::Relaxed);
progress_thread_handle.join().unwrap();

self.similar_vectors = collected_similar_images.values().cloned().collect();
// self.similar_vectors = collected_similar_images.into_values().collect(); // TODO use this in Rust 1.54.0
self.similar_vectors = collected_similar_images.values().cloned().collect(); // 1.53.0 version

if self.exclude_images_with_same_size {
let mut new_vector = Default::default();
@@ -1028,9 +1026,9 @@ pub fn get_string_from_similarity(similarity: &Similarity, hash_size: u8) -> Str
};

match similarity {
Similarity::None => {
panic!()
}
// Similarity::None => {
// panic!()
// }
Similarity::Similar(h) => {
// #[cfg(debug_assertions)]
// {