Skip to content

Commit

Permalink
config
Browse files Browse the repository at this point in the history
git-svn-id: https://projectname.googlecode.com/svn/trunk@19 c416075f-80b4-e980-4839-00ea3ed24e77
  • Loading branch information
[email protected] committed May 27, 2011
1 parent bffbbce commit 2a0f098
Show file tree
Hide file tree
Showing 10 changed files with 152 additions and 90 deletions.
6 changes: 3 additions & 3 deletions trunk/src/default_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,15 +43,15 @@ class accumulator: public arr<boost::shared_ptr<T>, boost::shared_ptr<T> > {
}
};

void default_main(const program_options& po) {
void default_main() {
struct : end< boost::shared_ptr<file_type::base> > {
void next(const boost::shared_ptr<file_type::base>& t) { std::cout << t->path().string() << "\n"; }
void stop() { std::cout << "\n"; }
} output;

make_pair(po.input_files().begin(), po.input_files().end())
make_pair(program_options::input_files().begin(), program_options::input_files().end())
>>= fs::recursive()
>>= (po.extensions().empty() ? The< arr<fs::path, fs::path> >() : The<elem_filter>(po.extensions()))
>>= (program_options::extensions().empty() ? The< arr<fs::path, fs::path> >() : The<elem_filter>(program_options::extensions()))
>>= file_typer_match_first()
>>= clusterization()
>>= comparator<file_type::base, file_type::base>()
Expand Down
11 changes: 11 additions & 0 deletions trunk/src/file_types/base.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,15 @@ inline comparison_result base::precompare(const boost::shared_ptr<base>& a) cons
return _size < a->_size ? less : _size == a->_size ? equal : greater;
}

// Reports whether this file's type is listed in `types`.
//
// When `_mime` is non-empty it is the value searched for in `types`;
// otherwise the lower-cased extension of `_path` is searched for instead.
// Returns true if the chosen value is found, false otherwise.
bool base::check_type(const std::vector<std::string>& types) const {
    // A non-empty MIME string takes precedence over the extension.
    if (!_mime.empty()) {
        return std::find(types.begin(), types.end(), _mime) != types.end();
    }
    // No MIME string: fall back to the lower-cased path extension.
    const std::string extension = boost::to_lower_copy(_path.extension().string());
    return std::find(types.begin(), types.end(), extension) != types.end();
}

}
12 changes: 1 addition & 11 deletions trunk/src/file_types/base.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,17 +30,7 @@ class base {
virtual boost::shared_ptr<base> compare(const boost::shared_ptr<base>& a) const;
virtual comparison_result precompare(const boost::shared_ptr<base>& a) const;

template<typename It1, typename It2>
bool check_type(It1 mimes, It1 mimes_end, It2 exts, It2 exts_end) const {
bool res;
if (!_mime.empty()) {
res = std::find(mimes, mimes_end, _mime) != mimes_end;
} else {
std::string ext = boost::to_lower_copy(_path.extension().string());
res = std::find(exts, exts_end, ext) != exts_end;
}
return res;
}
bool check_type(const std::vector<std::string>& types) const;
};

}
Expand Down
35 changes: 18 additions & 17 deletions trunk/src/file_types/img.cpp
Original file line number Diff line number Diff line change
@@ -1,25 +1,26 @@
#include "img.h"
#include "../kleisli.h"
#include "../logger.h"
#include "../program_options.h"

namespace file_type {

static const unsigned int PIX_SIZE = 128;
static const double THRESHOLD = 0.005;
static const unsigned int PIX_IN_INT = sizeof(unsigned int) >= sizeof(Magick::PixelPacket)
? sizeof(unsigned int) / sizeof(Magick::PixelPacket) : 1;
static const float EPSILON = 0.00001;
static const unsigned int MAX_DIFF = 200000;

img::img(const fs::path& p, const Magick::Image& image): base(p), _pixels(0) {
const Magick::PixelPacket* pix = image.getConstPixels(0, 0, image.size().width(), image.size().height());
static const unsigned int BUCKET_SIZE = 65536 / BUCKET_COUNT;
for (unsigned int j = 0; j < BUCKET_COUNT; ++j) {
bucket[0][j] = bucket[1][j] = bucket[2][j] = 0;
static const unsigned int BUCKET_SIZE = 65536 / program_options::image_bucket_count();
for (unsigned int j = 0; j < program_options::image_bucket_count(); ++j) {
bucket[0].push_back(0);
bucket[1].push_back(0);
bucket[2].push_back(0);
}
unsigned int size = image.size().width() * image.size().height();
for (unsigned int i = 0; i < size; ++i, ++pix) {
for (unsigned int j = 0, c = BUCKET_SIZE; j < BUCKET_COUNT; ++j, c += BUCKET_SIZE) {
for (unsigned int j = 0, c = BUCKET_SIZE;
j < program_options::image_bucket_count(); ++j, c += BUCKET_SIZE) {
if (pix->red < c) {
++bucket[0][j];
}
Expand All @@ -31,7 +32,7 @@ img::img(const fs::path& p, const Magick::Image& image): base(p), _pixels(0) {
}
}
}
for (unsigned int j = 0; j < BUCKET_COUNT; ++j) {
for (unsigned int j = 0; j < program_options::image_bucket_count(); ++j) {
for (unsigned int k = 0; k < HISTOGRAM_COUNT; ++k) {
bucket[k][j] /= size;
}
Expand All @@ -45,10 +46,7 @@ img::~img() {
}

boost::shared_ptr<img> img::try_file(const boost::shared_ptr<base>& file) {
static const std::string mimes[] = { "image/jpeg", "image/png" };
static const std::string exts[] = { ".jpg", ".png" };
if (file->check_type(mimes, mimes + sizeof(mimes)/sizeof(std::string),
exts, exts + sizeof(exts)/sizeof(std::string))) {
if (file->check_type(program_options::image_formats())) {
try {
Magick::Image image;
image.read(file->path().string());
Expand All @@ -65,15 +63,17 @@ static double abs(double x) { return x >= 0 ? x : -x; }
boost::shared_ptr<base> img::compare(const boost::shared_ptr<base>& _a) const {
const img* a = static_cast<const img*>(_a.get());
{ double res = 0;
for (unsigned int i = 0; i < BUCKET_COUNT; ++i) {
for (unsigned int i = 0; i < program_options::image_bucket_count(); ++i) {
for (unsigned int k = 0; k < HISTOGRAM_COUNT; ++k) {
res += abs(bucket[k][i] - a->bucket[k][i]);
}
}
if (res > THRESHOLD) {
if (res > program_options::image_threshold()) {
return boost::shared_ptr<img>();
}
} { init_pixels();
}
if (program_options::image_precise()) {
init_pixels();
a->init_pixels();
unsigned int res = 0;
const unsigned int width0 = width(), width1 = a->width();
Expand All @@ -88,7 +88,7 @@ boost::shared_ptr<base> img::compare(const boost::shared_ptr<base>& _a) const {
res += abs(_pixels[p].red - a->_pixels[p1].red)
+ abs(_pixels[p].green - a->_pixels[p1].green)
+ abs(_pixels[p].blue - a->_pixels[p1].blue);
if (res > MAX_DIFF) {
if (res > program_options::image_max_diff()) {
return boost::shared_ptr<img>();
}
}
Expand Down Expand Up @@ -118,7 +118,8 @@ void img::init_pixels() const {
if (_pixels == 0) {
Magick::Image im;
im.read(path().string());
#define CALC_SIZE(x) ((x) >= PIX_SIZE ? PIX_SIZE : round(x))
#define CALC_SIZE(x) ((x) >= program_options::image_img_size() \
? program_options::image_img_size() : round(x))
im.scale(Magick::Geometry(CALC_SIZE(im.size().width()), CALC_SIZE(im.size().height())));
const Magick::PixelPacket* pix = im.getConstPixels(0, 0, im.size().width(), im.size().height());
_pixels = new Magick::PixelPacket[im.size().width() * im.size().height() + 2 * PIX_IN_INT];
Expand Down
4 changes: 2 additions & 2 deletions trunk/src/file_types/img.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#ifndef _FILE_TYPE_IMG_H_
#define _FILE_TYPE_IMG_H_

#include <vector>
#include <boost/shared_ptr.hpp>
#include <Magick++.h>

Expand All @@ -16,9 +17,8 @@ struct img: base {
boost::shared_ptr<base> compare(const boost::shared_ptr<base>& a) const;
comparison_result precompare(const boost::shared_ptr<base>& a) const;
private:
static const unsigned int BUCKET_COUNT = 4;
static const unsigned int HISTOGRAM_COUNT = 3;
double bucket[HISTOGRAM_COUNT][BUCKET_COUNT];
std::vector<double> bucket[HISTOGRAM_COUNT];
mutable Magick::PixelPacket* _pixels;
float _aspect_ratio;
void init_pixels() const;
Expand Down
57 changes: 23 additions & 34 deletions trunk/src/file_types/text.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
#include <boost/make_shared.hpp>
#include <string.h>
#include <ctype.h>
#include <algorithm>
#include <map>

#include "text.h"
#include "../kleisli.h"
#include "../program_options.h"

namespace file_type {

Expand Down Expand Up @@ -88,74 +88,63 @@ text::text(const fs::path& p): base(p) {
char buf[buf_size];
fs::ifstream file(p);
uint32_t hash = hash_init();
std::string last;
std::map<std::string, unsigned int> counts;
unsigned int cur_size = 0;
std::map<uint32_t, unsigned int> counts;
while (!file.eof()) {
file.read(buf, buf_size);
for (int i = 0; i < file.gcount(); ++i) {
if (buf[i] == '.' || buf[i] == '!' || buf[i] == '?') {
_hashes.push_back(hash_final(hash));
hash = hash_init();
} else
if (isalnum(buf[i])) {
hash_update(hash, tolower(buf[i]));
}
if (isalnum(buf[i])) {
last += /*tolower*/(buf[i]);
++cur_size;
} else {
if (last.length() > 3) {
++counts[last];
last = "";
if (cur_size > 3) {
++counts[hash_final(hash)];
}
hash = hash_init();
cur_size = 0;
}
}
}
std::map<unsigned int, std::vector<std::string> > rev;
for (std::map<std::string, unsigned int>::iterator it = counts.begin(); it != counts.end(); ++it) {
std::map<unsigned int, std::vector<uint32_t> > rev;
for (std::map<uint32_t, unsigned int>::iterator it = counts.begin(); it != counts.end(); ++it) {
rev[it->second].push_back(it->first);
}
std::map<unsigned int, std::vector<std::string> >::iterator it = --rev.end();
for (unsigned int i = 0; i < WORDS_COUNT;) {
for (unsigned int j = 0; j < it->second.size(); ++j, ++i) {
if (i >= WORDS_COUNT) {
std::map<unsigned int, std::vector<uint32_t> >::iterator it = --rev.end();
for (unsigned int i = 0; i < program_options::text_words_count();) {
for (unsigned int j = 0; j < it->second.size(); ++j) {
if (i++ >= program_options::text_words_count()) {
break;
}
_words[i] = it->second[j];
_words.push_back(it->second[j]);
}
--it;
}
std::sort(_words, _words + WORDS_COUNT);
std::sort(_hashes.begin(), _hashes.end());
std::sort(_words.begin(), _words.end());
}

boost::shared_ptr<text> text::try_file(const boost::shared_ptr<base>& file) {
static const std::string mimes[] = { "text/plain" };
static const std::string exts[] = { ".txt" };
return file->check_type(mimes, mimes + sizeof(mimes)/sizeof(std::string),
exts, exts + sizeof(exts)/sizeof(std::string)) ?
return file->check_type(program_options::text_formats()) ?
boost::make_shared<text>(file->path()) : boost::shared_ptr<text>();
}

boost::shared_ptr<base> text::compare(const boost::shared_ptr<base>& _a) const {
const text* a = static_cast<const text*>(_a.get());
unsigned int r = 0;
const std::string *first1 = _words, *last1 = _words + WORDS_COUNT,
*first2 = a->_words, *last2 = a->_words + WORDS_COUNT;
// std::vector<uint32_t>::const_iterator first1 = _hashes.begin(),
// last1 = _hashes.end(), first2 = a->_hashes.begin(), last2 = a->_hashes.end();
std::vector<uint32_t>::const_iterator
first1 = _words.begin(), last1 = _words.end(),
first2 = a->_words.begin(), last2 = a->_words.end();
while (true) {
if (first1 == last1)
{ r += distance(first2, last2); break; }
if (first2 == last2)
{ r += distance(first1, last1); break; }
if (*first1 < *first2)
{ ++r; *first1++; }
{ ++r; first1++; }
else if (*first2 < *first1)
{ ++r; *first2++; }
{ ++r; first2++; }
else { first1++; first2++; }
}
return r <= 2 ? _a : boost::shared_ptr<text>();
// return float(r) / (_hashes.size() + a->_hashes.size()) < THRESHOLD ? _a : boost::shared_ptr<text>();
return r <= program_options::text_threshold() ? _a : boost::shared_ptr<text>();
}

}
4 changes: 1 addition & 3 deletions trunk/src/file_types/text.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,7 @@ struct text: base {
boost::shared_ptr<base> compare(const boost::shared_ptr<base>& a) const;
comparison_result precompare(const boost::shared_ptr<base>& a) const { return equal; }
private:
static const unsigned int WORDS_COUNT = 10;
std::vector<uint32_t> _hashes;
std::string _words[WORDS_COUNT];
std::vector<uint32_t> _words;
};

}
Expand Down
6 changes: 3 additions & 3 deletions trunk/src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,17 @@
#include "program_options.h"
#include "magic_file.h"

void default_main(const program_options& po);
void default_main();

int main(int argc, char* argv[]) {
stderr_logger std_logger(argv[0]);
logger::set_std(&std_logger);

program_options po(argc, argv);
program_options::initialize(argc, argv);

magic::initialize();

default_main(po);
default_main();

magic::destroy();

Expand Down
Loading

0 comments on commit 2a0f098

Please sign in to comment.