Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
…80b4-e980-4839-00ea3ed24e77
  • Loading branch information
[email protected] committed Apr 25, 2011
1 parent 7db308b commit bffbbce
Show file tree
Hide file tree
Showing 5 changed files with 227 additions and 75 deletions.
9 changes: 5 additions & 4 deletions trunk/Makefile
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
LIBS = boost_program_options boost_filesystem boost_system magic
CFLAGS = -O2 -Wall `Magick++-config --cppflags`
CFLAGS = -Wall `Magick++-config --cppflags`
LINKER_FLAGS = `Magick++-config --libs`
PROJECT_NAME = pn
SRC_DIR = src
SRCS = $(wildcard $(SRC_DIR)/*.cpp $(SRC_DIR)/*/*.cpp)
OBJS = $(patsubst $(SRC_DIR)/%.cpp,%.o,$(SRCS))

builds = release debug
debug_flags = -ggdb
release_flags = -DNDEBUG
builds = release debug valgrind
debug_flags = -ggdb -O2
release_flags = -DNDEBUG -O2
valgrind_flags = -g -O0

.PHONY: $(builds) clean
$(builds): %: %-makedirs %/$(PROJECT_NAME)
Expand Down
79 changes: 70 additions & 9 deletions trunk/src/file_types/img.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,14 @@

namespace file_type {

const double THRESHOLD = 0.005;
// Largest width/height kept in the cached pixel buffer: init_pixels() scales any
// dimension that is >= PIX_SIZE down to PIX_SIZE.
static const unsigned int PIX_SIZE = 128;
// Upper bound on the summed absolute histogram-bucket difference accepted by
// img::compare() before two images are declared different.
static const double THRESHOLD = 0.005;
// Number of PixelPackets that fit into one unsigned int (never less than 1);
// used by init_pixels() to size the extra slots reserved for the width/height header.
static const unsigned int PIX_IN_INT = sizeof(unsigned int) >= sizeof(Magick::PixelPacket)
? sizeof(unsigned int) / sizeof(Magick::PixelPacket) : 1;
// Tolerance applied to the aspect-ratio difference in img::precompare().
static const float EPSILON = 0.00001;
// img::compare() gives up as soon as the accumulated per-channel pixel
// difference exceeds this value.
static const unsigned int MAX_DIFF = 200000;

img::img(const fs::path& p, const Magick::Image& image): base(p), _image(image) {
img::img(const fs::path& p, const Magick::Image& image): base(p), _pixels(0) {
const Magick::PixelPacket* pix = image.getConstPixels(0, 0, image.size().width(), image.size().height());
static const unsigned int BUCKET_SIZE = 65536 / BUCKET_COUNT;
for (unsigned int j = 0; j < BUCKET_COUNT; ++j) {
Expand All @@ -33,6 +38,12 @@ img::img(const fs::path& p, const Magick::Image& image): base(p), _image(image)
}
}

// Releases the lazily created pixel cache, if init_pixels() ever built one.
img::~img() {
    if (_pixels != 0) {
        // init_pixels() stores `_pixels` advanced two elements past the address
        // returned by new[] (the skipped slots hold the width/height header).
        // delete[] must receive the address new[] returned, so step back first;
        // deleting the advanced pointer is undefined behaviour.
        delete [] (_pixels - 2);
    }
}

boost::shared_ptr<img> img::try_file(const boost::shared_ptr<base>& file) {
static const std::string mimes[] = { "image/jpeg", "image/png" };
static const std::string exts[] = { ".jpg", ".png" };
Expand All @@ -53,18 +64,68 @@ static double abs(double x) { return x >= 0 ? x : -x; }

boost::shared_ptr<base> img::compare(const boost::shared_ptr<base>& _a) const {
const img* a = static_cast<const img*>(_a.get());
double res = 0;
for (unsigned int i = 0; i < BUCKET_COUNT; ++i) {
for (unsigned int k = 0; k < HISTOGRAM_COUNT; ++k) {
res += abs(bucket[k][i] - a->bucket[k][i]);
{ double res = 0;
for (unsigned int i = 0; i < BUCKET_COUNT; ++i) {
for (unsigned int k = 0; k < HISTOGRAM_COUNT; ++k) {
res += abs(bucket[k][i] - a->bucket[k][i]);
}
}
if (res > THRESHOLD) {
return boost::shared_ptr<img>();
}
} { init_pixels();
a->init_pixels();
unsigned int res = 0;
const unsigned int width0 = width(), width1 = a->width();
const unsigned int height0 = height(), height1 = a->height();
const unsigned int k = width0 > width1 ? width0 / width1 : 1;
const unsigned int k1 = width1 > width0 ? width1 / width0 : 1;
const unsigned int m = height0 > height1 ? height0 / height1 : 1;
const unsigned int m1 = height1 > height0 ? height1 / height0 : 1;
for (unsigned int i = 0, i1 = 0; i < width0 && i1 < width1; i += k, i1 += k1) {
for (unsigned int j = 0, j1 = 0; j < height0 && j1 < height1; j += m, j1 += m1) {
unsigned int p = i + j * width0, p1 = i1 + j1 * width1;
res += abs(_pixels[p].red - a->_pixels[p1].red)
+ abs(_pixels[p].green - a->_pixels[p1].green)
+ abs(_pixels[p].blue - a->_pixels[p1].blue);
if (res > MAX_DIFF) {
return boost::shared_ptr<img>();
}
}
}
}
return res > THRESHOLD ? boost::shared_ptr<img>() : _a;
return _a;
}

inline comparison_result img::precompare(const boost::shared_ptr<base>& a) const {
// Cheap ordering of two images by aspect ratio.
// Returns `greater` / `less` when this image's aspect ratio differs from a's by
// more than EPSILON in the respective direction, and `equal` otherwise.
// `a` is cast to img without a check, so it must actually point to an img.
// (An earlier variant compared bucket[0][0] against THRESHOLD instead.)
comparison_result img::precompare(const boost::shared_ptr<base>& a) const {
    const float other_ratio = static_cast<img*>(a.get())->_aspect_ratio;
    const float delta = _aspect_ratio - other_ratio;
    if (delta > EPSILON) {
        return greater;
    }
    if (delta < -EPSILON) {
        return less;
    }
    return equal;
}

// Rounds x to the nearest power of two; exact ties go to the larger power.
// round(0) yields 0. Values above the largest power of two representable in an
// unsigned int are clamped to that power (previously the search loop shifted
// its probe to 0 and never terminated for such inputs).
static unsigned int round(unsigned int x) {
    // Highest single-bit value of unsigned int, computed without <limits>.
    const unsigned int top = ~0u - (~0u >> 1);
    if (x >= top) {
        return top;
    }
    // Smallest power of two that is >= x (cannot overflow because x < top).
    unsigned int upper = 1;
    while (upper < x) {
        upper <<= 1;
    }
    const unsigned int lower = upper >> 1;
    return upper - x > x - lower ? lower : upper;
}

void img::init_pixels() const {
if (_pixels == 0) {
Magick::Image im;
im.read(path().string());
#define CALC_SIZE(x) ((x) >= PIX_SIZE ? PIX_SIZE : round(x))
im.scale(Magick::Geometry(CALC_SIZE(im.size().width()), CALC_SIZE(im.size().height())));
const Magick::PixelPacket* pix = im.getConstPixels(0, 0, im.size().width(), im.size().height());
_pixels = new Magick::PixelPacket[im.size().width() * im.size().height() + 2 * PIX_IN_INT];
*reinterpret_cast<unsigned int*>(_pixels) = im.size().width();
*(reinterpret_cast<unsigned int*>(_pixels) + 1) = im.size().height();
memcpy(_pixels += 2, pix, im.size().width() * im.size().height());
}
}

}
7 changes: 6 additions & 1 deletion trunk/src/file_types/img.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,19 @@ namespace file_type {

// Image file type: compared through per-channel histogram buckets and a lazily
// loaded pixel buffer.
// NOTE(review): the struct owns the raw `_pixels` buffer and declares a destructor
// but no copy constructor / assignment here; presumably instances are only held
// through boost::shared_ptr — confirm, since a copy would share the buffer.
struct img: base {
img(const fs::path& p, const Magick::Image& image);
virtual ~img();
// Returns an img for `file` when it is recognised as an image, otherwise an empty pointer.
static boost::shared_ptr<img> try_file(const boost::shared_ptr<base>& file);
// Full comparison; returns `a` when the images are considered similar, otherwise an empty pointer.
boost::shared_ptr<base> compare(const boost::shared_ptr<base>& a) const;
// Cheap ordering by aspect ratio (less / greater / equal).
comparison_result precompare(const boost::shared_ptr<base>& a) const;
private:
static const unsigned int BUCKET_COUNT = 4;
static const unsigned int HISTOGRAM_COUNT = 3;
// Histogram values indexed as bucket[histogram][bucket].
double bucket[HISTOGRAM_COUNT][BUCKET_COUNT];
// NOTE(review): the constructor initialiser list `base(p), _pixels(0)` does not
// set this member; it looks like a leftover — confirm whether it is still needed.
Magick::Image _image;
// Pixel cache filled on demand by init_pixels(); 0 until then. Mutable so the
// const compare() can populate it.
mutable Magick::PixelPacket* _pixels;
float _aspect_ratio;
void init_pixels() const;
// Cached image width/height, read from the two unsigned ints stored immediately
// before `_pixels`. Both dereference `_pixels` unconditionally, so they are only
// valid after init_pixels() has run.
unsigned int width() const { return *(reinterpret_cast<unsigned int*>(_pixels) - 2); }
unsigned int height() const { return *(reinterpret_cast<unsigned int*>(_pixels) - 1); }
};

}
Expand Down
196 changes: 137 additions & 59 deletions trunk/src/file_types/text.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,82 +2,160 @@
#include <boost/make_shared.hpp>
#include <string.h>
#include <ctype.h>
#include <algorithm>
#include <map>

#include "text.h"
#include "../kleisli.h"

namespace file_type {

// Wraps `file` in a text object when base::check_type() accepts it for the
// MIME type "text/plain" or the extension ".txt"; returns an empty pointer otherwise.
boost::shared_ptr<text> text::try_file(const boost::shared_ptr<base>& file) {
    static const std::string mimes[] = { "text/plain" };
    static const std::string exts[] = { ".txt" };
    const std::string* mimes_end = mimes + sizeof(mimes) / sizeof(std::string);
    const std::string* exts_end = exts + sizeof(exts) / sizeof(std::string);
    if (!file->check_type(mimes, mimes_end, exts, exts_end)) {
        return boost::shared_ptr<text>();
    }
    return boost::make_shared<text>(file->path());
}
// Maximum ratio of differing sentence hashes for two texts to count as similar.
// In the visible code it is only referenced by the commented-out hash-based
// check at the end of text::compare().
static const double THRESHOLD = 0.02;

// Compacts the first `size` bytes of `s` in place so that only alphanumeric
// characters remain, preserving their order.
// Returns the number of characters kept; bytes past that count are left as-is
// and no terminator is written.
static int clean_str(char* s, int size) {
    int kept = 0;
    for (int i = 0; i < size; ++i) {
        // <ctype.h> functions require a value representable as unsigned char (or
        // EOF); passing a negative plain char (bytes >= 0x80 on signed-char
        // platforms) is undefined behaviour, hence the cast.
        if (isalnum(static_cast<unsigned char>(s[i]))) {
            s[kept++] = s[i];
        }
    }
    return kept;
}
// Byte-indexed lookup table consumed by hash_update().
// The values appear to be the widely published reflected CRC-32 table
// (polynomial 0xEDB88320, which shows up as entry 128).
const uint32_t crc32Table[256] = {
0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA,
0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3,
0x0EDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988,
0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91,
0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE,
0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC,
0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5,
0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172,
0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B,
0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940,
0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116,
0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F,
0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924,
0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D,
0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A,
0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818,
0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01,
0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E,
0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457,
0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C,
0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2,
0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB,
0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0,
0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9,
0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086,
0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4,
0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD,
0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A,
0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683,
0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8,
0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE,
0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7,
0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC,
0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5,
0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252,
0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60,
0xDF60EFC3, 0xA867DF55, 0x316E8EEF, 0x4669BE79,
0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236,
0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F,
0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04,
0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A,
0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713,
0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38,
0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21,
0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E,
0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C,
0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45,
0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2,
0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB,
0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0,
0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6,
0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF,
0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94,
0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
};

boost::shared_ptr<base> text::compare(const boost::shared_ptr<base>& a) const {
// Starting value of the running checksum: all 32 bits set.
static inline uint32_t hash_init() {
    return ~static_cast<uint32_t>(0);
}
// Folds one character into the running checksum `crc` using the byte-wise
// table lookup: the low byte of (crc XOR s) selects the table entry, which is
// combined with the checksum shifted right by eight bits.
static inline void hash_update(uint32_t& crc, char s) {
    const uint32_t slot = (crc ^ s) & 0xFF;
    crc = crc32Table[slot] ^ (crc >> 8);
}
// Finishes a checksum by inverting every bit of the running value.
static inline uint32_t hash_final(uint32_t crc) {
    return static_cast<uint32_t>(~crc);
}

text::text(const fs::path& p): base(p) {
const int buf_size = 4096;
char buf1[buf_size], buf2[buf_size];
char *cuf1 = buf1, *cuf2 = buf2;
fs::ifstream file1(path()), file2(a->path()), *file;
while (true) {
file1.read(cuf1, buf_size - (cuf1 - buf1));
file2.read(cuf2, buf_size - (cuf2 - buf2));
int r1 = clean_str(cuf1, file1.gcount());
int r2 = clean_str(cuf2, file2.gcount());
if (memcmp(cuf1, cuf2, r1 < r2 ? r1 : r2) != 0) {
return boost::shared_ptr<base>();
}
if (file1.eof() && file2.eof()) {
return a;
}
if (r1 <= r2) {
cuf1 = buf1;
cuf2 = buf2 + r2 - r1;
} else {
cuf1 = buf1 + r1 - r2;
cuf2 = buf2;
}
if (file1.eof()) {
if (cuf1 != buf1) {
return boost::shared_ptr<base>();
char buf[buf_size];
fs::ifstream file(p);
uint32_t hash = hash_init();
std::string last;
std::map<std::string, unsigned int> counts;
while (!file.eof()) {
file.read(buf, buf_size);
for (int i = 0; i < file.gcount(); ++i) {
if (buf[i] == '.' || buf[i] == '!' || buf[i] == '?') {
_hashes.push_back(hash_final(hash));
hash = hash_init();
} else
if (isalnum(buf[i])) {
hash_update(hash, tolower(buf[i]));
}
file = &file2;
break;
}
if (file2.eof()) {
if (cuf2 != buf2) {
return boost::shared_ptr<base>();
if (isalnum(buf[i])) {
last += /*tolower*/(buf[i]);
} else {
if (last.length() > 3) {
++counts[last];
last = "";
}
}
file = &file1;
break;
}
}
while (true) {
file->read(buf1, buf_size);
int r = clean_str(buf1, file->gcount());
if (r > 0) {
return boost::shared_ptr<base>();
}
if (file->eof()) {
return a;
std::map<unsigned int, std::vector<std::string> > rev;
for (std::map<std::string, unsigned int>::iterator it = counts.begin(); it != counts.end(); ++it) {
rev[it->second].push_back(it->first);
}
std::map<unsigned int, std::vector<std::string> >::iterator it = --rev.end();
for (unsigned int i = 0; i < WORDS_COUNT;) {
for (unsigned int j = 0; j < it->second.size(); ++j, ++i) {
if (i >= WORDS_COUNT) {
break;
}
_words[i] = it->second[j];
}
--it;
}
std::sort(_words, _words + WORDS_COUNT);
std::sort(_hashes.begin(), _hashes.end());
}

inline comparison_result text::precompare(const boost::shared_ptr<base>& a) const {
return equal;
// Wraps `file` in a text object when base::check_type() accepts it for the
// MIME type "text/plain" or the extension ".txt"; returns an empty pointer otherwise.
boost::shared_ptr<text> text::try_file(const boost::shared_ptr<base>& file) {
    static const std::string mimes[] = { "text/plain" };
    static const std::string exts[] = { ".txt" };
    const std::string* mimes_end = mimes + sizeof(mimes) / sizeof(std::string);
    const std::string* exts_end = exts + sizeof(exts) / sizeof(std::string);
    if (!file->check_type(mimes, mimes_end, exts, exts_end)) {
        return boost::shared_ptr<text>();
    }
    return boost::make_shared<text>(file->path());
}

// Compares two texts by their `_words` arrays.
// Both arrays are walked in step like a sorted merge (this relies on them being
// in ascending order) and every word found in only one of them is counted.
// Returns `_a` when at most two words differ, otherwise an empty pointer.
// `_a` is cast to text without a check, so it must actually point to a text.
// (A disabled alternative compared the sorted `_hashes` vectors the same way
// and tested the mismatch ratio against THRESHOLD.)
boost::shared_ptr<base> text::compare(const boost::shared_ptr<base>& _a) const {
    const text* other = static_cast<const text*>(_a.get());
    const std::string* lhs = _words;
    const std::string* const lhs_end = _words + WORDS_COUNT;
    const std::string* rhs = other->_words;
    const std::string* const rhs_end = other->_words + WORDS_COUNT;

    unsigned int mismatches = 0;
    while (lhs != lhs_end && rhs != rhs_end) {
        if (*lhs < *rhs) {
            ++mismatches;
            ++lhs;
        } else if (*rhs < *lhs) {
            ++mismatches;
            ++rhs;
        } else {
            ++lhs;
            ++rhs;
        }
    }
    // At most one side still has words left; all of them are unmatched.
    if (lhs == lhs_end) {
        mismatches += distance(rhs, rhs_end);
    } else {
        mismatches += distance(lhs, lhs_end);
    }

    if (mismatches <= 2) {
        return _a;
    }
    return boost::shared_ptr<text>();
}

}
11 changes: 9 additions & 2 deletions trunk/src/file_types/text.h
Original file line number Diff line number Diff line change
@@ -1,16 +1,23 @@
#ifndef _FILE_TYPE_TEXT_H_
#define _FILE_TYPE_TEXT_H_

#include <stdint.h>
#include <vector>

#include "base.h"
#include "../type_list.h"

namespace file_type {

struct text: base {
text(const fs::path& p): base(p) {}
text(const fs::path& p);
static boost::shared_ptr<text> try_file(const boost::shared_ptr<base>& file);
boost::shared_ptr<base> compare(const boost::shared_ptr<base>& a) const;
comparison_result precompare(const boost::shared_ptr<base>& a) const;
comparison_result precompare(const boost::shared_ptr<base>& a) const { return equal; }
private:
static const unsigned int WORDS_COUNT = 10;
std::vector<uint32_t> _hashes;
std::string _words[WORDS_COUNT];
};

}
Expand Down

0 comments on commit bffbbce

Please sign in to comment.