Development (#197) #285

Closed
wants to merge 1 commit into from
30 changes: 23 additions & 7 deletions .mk/docker.mk
@@ -1,16 +1,22 @@

.PHONY: docker-setup

## Setup environment variables required for docker-compose
## Setup environment variables required for docker-compose if needed
docker-setup:
	@scripts/docker-setup.sh

.PHONY: docker-setup-update

## Update environment variables required for docker-compose
docker-setup-update:
	@scripts/docker-setup.sh --force-update


.PHONY: docker-run

## Run application using docker-compose
docker-run: docker-setup
	@scripts/docker-run.sh
	sudo docker-compose up -d


.PHONY: docker-stop
@@ -19,11 +25,21 @@ docker-run: docker-setup
docker-stop:
	sudo docker-compose stop

.PHONY: docker-build

## Build docker images for docker-compose application
docker-build:
	sudo docker-compose build
	@scripts/docker-build.sh


.PHONY: docker-pull

## Update docker images (rebuild locally or pull the latest from the repository, depending on configuration).
docker-pull:
	sudo docker-compose pull

.PHONY: docker-purge

## Rebuild docker images
docker-rebuild:
	sudo docker-compose rm -s -f
	sudo docker-compose build
## Shut down the docker-compose application and remove all its images and volumes.
docker-purge:
	sudo docker-compose down --rmi all -v --remove-orphans --timeout 0
22 changes: 19 additions & 3 deletions Makefile
@@ -18,12 +18,28 @@ stop: docker-stop

.PHONY: setup

## Setup docker-compose application (generate .env file)
## Setup docker-compose application (generate .env file if needed)
setup: docker-setup

## Rebuild docker-compose images
rebuild: docker-rebuild
.PHONY: setup-update

## Update docker-compose application (regenerate .env file)
setup-update: docker-setup-update

.PHONY: purge

## Remove docker-compose application and all its images and volumes.
purge: docker-purge

# Define default goal
.DEFAULT_GOAL := help

.PHONY: build

## Build Docker images locally.
build: docker-build

.PHONY: pull

## Pull images from Docker Hub
pull: docker-pull
31 changes: 31 additions & 0 deletions benchmarks/augmented_dataset/config.yml
@@ -0,0 +1,31 @@
sources:
  root: data/augmented_dataset
  extensions:
    - mp4
    - ogv
    - webm
    - avi

repr:
  directory: data/benchmark_output/representations

processing:
  frame_sampling: 1
  save_frames: true
  match_distance: 0.75
  video_list_filename: video_dataset_list.txt
  filter_dark_videos: true
  filter_dark_videos_thr: 2
  min_video_duration_seconds: 3
  detect_scenes: true
  pretrained_model_local_path: null
  keep_fileoutput: true

database:
  use: false
  uri: postgres://postgres:admin@localhost:5432/videodeduplicationdb

templates:
  source_path: data/templates/test-group/CCSI Object Recognition External/

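For context, benchmarks/evaluate.py (added below) loads this file through winnow.utils.resolve_config and reads the nested keys as attributes. A minimal sketch of that pattern, assuming resolve_config behaves the same way it is called in evaluate.py:

from winnow.utils import resolve_config

# Load the benchmark configuration (same path evaluate.py builds from the
# --benchmark option).
config = resolve_config('benchmarks/augmented_dataset/config.yml')

# Nested YAML keys are exposed as attributes (as used in evaluate.py).
print(config.sources.root)     # data/augmented_dataset
print(config.repr.directory)   # data/benchmark_output/representations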
3,064 changes: 3,064 additions & 0 deletions benchmarks/augmented_dataset/labels.csv

Large diffs are not rendered by default.

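The labels file itself is not rendered here, but benchmarks/utils.py reads it as a CSV with at least original_filename and new_filename columns (each row maps an augmented copy back to its source video). A small inspection sketch, with column names inferred from that code rather than from the file:

import pandas as pd

labels = pd.read_csv('benchmarks/augmented_dataset/labels.csv')

# Sources with many augmented copies become evaluation queries
# (see get_queries in benchmarks/utils.py).
print(labels['original_filename'].value_counts().head())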
121 changes: 121 additions & 0 deletions benchmarks/evaluate.py
@@ -0,0 +1,121 @@
import pandas as pd
from glob import glob
from utils import get_result, download_dataset, get_frame_sampling_permutations
import os
from winnow.utils import resolve_config
import click
from winnow.utils import scan_videos
import subprocess
import shlex
import numpy as np
import json

pd.options.mode.chained_assignment = None

@click.command()
@click.option(
    '--benchmark', '-b',
    help='Name of the benchmark to be evaluated.',
    default='augmented_dataset')
@click.option(
    '--force-download', '-fd',
    help='Force download of the dataset (even if an existing directory for the dataset has been detected).',
    default=False, is_flag=True)
@click.option(
    '--overwrite', '-o',
    help='Force feature extraction, even if we detect that signatures have already been processed.',
    default=False, is_flag=True)
def main(benchmark, force_download, overwrite):

    config_path = os.path.join('benchmarks', benchmark, 'config.yml')
    config = resolve_config(config_path)
    source_folder = config.sources.root

    videos = scan_videos(source_folder, '**')

    if len(videos) == 0 or force_download:
        download_dataset(source_folder, url='https://winnowpre.s3.amazonaws.com/augmented_dataset.tar.xz')
        videos = scan_videos(source_folder, '**')
        print(f'Videos found after download: {len(videos)}')

    if len(videos) > 0:
        print('Video files found. Checking for existing signatures...')

        signatures_path = os.path.join(
            config.repr.directory,
            'video_signatures', '**',
            '**.npy')

        signatures = glob(signatures_path, recursive=True)

        if len(signatures) == 0 or overwrite:
            # Extract features and signatures for the benchmark dataset
            command = f'python extract_features.py -cp {config_path}'
            command = shlex.split(command)
            subprocess.run(command, check=True)

            # Check that signatures were generated properly
            signatures = glob(signatures_path, recursive=True)
            assert len(signatures) > 0, 'No signature files were found.'

        available_df = pd.read_csv(
            os.path.join(
                'benchmarks',
                benchmark,
                'labels.csv'))
        frame_level = glob(
            os.path.join(
                config.repr.directory,
                'frame_level', '**',
                '**.npy'), recursive=True)

        signatures_permutations = get_frame_sampling_permutations(
            list(range(1, 6)),
            frame_level)

        scoreboard = dict()

        for fs, sigs in signatures_permutations.items():

            results_analysis = dict()

            for r in np.linspace(0.1, 0.25, num=10):

                results = []

                for i in range(5):
                    mAP, pr_curve = get_result(
                        available_df,
                        sigs,
                        ratio=r,
                        file_index=frame_level)
                    results.append(mAP)

                results_analysis[r] = results

            scoreboard[fs] = results_analysis

        with open('benchmarks/scoreboard.json', 'w') as results_file:
            json.dump(scoreboard, results_file)
        print('Saved scoreboard to {}'.format('benchmarks/scoreboard.json'))

    else:
        print(f'Please review the dataset (@ {source_folder})')


if __name__ == '__main__':
    main()
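The resulting benchmarks/scoreboard.json maps each frame-sampling value to a dict of confounder ratios, each holding five mAP samples. A small illustrative helper (not part of the PR) for summarizing it; note that JSON round-tripping turns the numeric keys into strings:

import json
import numpy as np

with open('benchmarks/scoreboard.json') as f:
    scoreboard = json.load(f)

# Average the five mAP samples for each (frame_sampling, ratio) pair.
for fs, by_ratio in scoreboard.items():
    means = {ratio: float(np.mean(maps)) for ratio, maps in by_ratio.items()}
    print(f'frame_sampling={fs}: {means}')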
159 changes: 159 additions & 0 deletions benchmarks/utils.py
@@ -0,0 +1,159 @@
import pandas as pd
import numpy as np
from winnow.feature_extraction.loading_utils import evaluate, calculate_similarities, global_vector
from winnow.feature_extraction.utils import load_image, download_file
from winnow.feature_extraction import SimilarityModel
from collections import defaultdict
import os
import shutil
from glob import glob

def get_queries(min_num_of_samples, df, col='original_filename'):
    """Return source filenames with at least min_num_of_samples entries."""
    fc = df[col].value_counts()
    msk = fc >= min_num_of_samples
    return fc[msk].index.values


def get_query_dataset(df, query, ratio=.22, col='original_filename'):
    """Build a query set: all positives for `query` plus sampled confounders."""
    msk = df[col] == query
    occ = df.loc[msk, :]
    negative = df.loc[~msk, :]
    n_positive_samples = len(occ)
    positive_head = occ.sample(1)['new_filename'].values[0]

    query_total = n_positive_samples / ratio
    to_be_sampled = int(query_total - n_positive_samples)
    confounders = negative.sample(to_be_sampled)
    confounders.loc[:, 'label'] = 'X'
    occ.loc[:, 'label'] = 'E'
    merged = pd.concat([confounders, occ])

    query_d = dict()
    for i, row in merged.iterrows():
        query_d[row['new_filename']] = row['label']

    return positive_head, query_d


def get_ground_truth(available_df, queries, min_samples=4, ratio=0.2):

    ground_truth = dict()
    for query in queries:
        head, query_ds = get_query_dataset(available_df, query, ratio=ratio)
        ground_truth[head] = query_ds

    return ground_truth


def convert_ground_truth(gt, base_to_idx):
    """Translate filename-keyed ground truth into index-keyed ground truth."""
    queries = list(gt.keys())
    qi = {base_to_idx[x]: i + 1 for i, x in enumerate(queries)}

    new_ds = dict()
    for k, v in gt.items():
        sub_d = dict()
        for kk, vv in v.items():
            sub_d[base_to_idx[kk]] = vv
        new_ds[qi[base_to_idx[k]]] = sub_d

    return new_ds


def get_result(df,
               signatures,
               min_samples=4,
               ratio=0.25,
               all_videos=False,
               file_index=None):

    if file_index is None:
        # `signatures` is a list of .npy file paths
        signatures_data = np.array([np.load(x) for x in signatures])
        basename = [os.path.basename(x)[:-4] for x in signatures]
    else:
        # `signatures` already holds signature arrays; file names come from
        # `file_index`
        basename = [os.path.basename(x)[:-4] for x in file_index]
        signatures_data = np.array(signatures)
        signatures = file_index

    basename_to_idx = {x: i for i, x in enumerate(basename)}

    queries = get_queries(min_samples, df)
    query_idx = [basename_to_idx[x] for x in queries]
    similarities = calculate_similarities(query_idx, signatures_data)

    ground_truth = get_ground_truth(df, queries, ratio=ratio)
    final_gt = convert_ground_truth(ground_truth, basename_to_idx)
    mAP, pr_curve = evaluate(final_gt, similarities, all_videos=all_videos)
    return mAP, pr_curve


def download_dataset(
        dst,
        url="https://winnowpre.s3.amazonaws.com/augmented_dataset.tar.xz"):

    if not os.path.exists(dst):
        os.makedirs(dst)

    number_of_files = len(glob(dst + '/**'))
    print('Files found:', number_of_files)

    if number_of_files < 2:
        print('Downloading sample dataset to: {}'.format(dst))
        fp = os.path.join(dst, 'dataset.tar.gz')
        if not os.path.isfile(fp):
            download_file(fp, url)
        # Unpack the archive, then delete the tarball
        print('Unpacking', fp)
        shutil.unpack_archive(fp, dst)
        os.unlink(fp)
    else:
        print('Files have already been downloaded')


def get_frame_sampling_permutations(frame_samplings, frame_level_files):
    """Subsample frame-level features at each frame-sampling rate and build
    video-level signatures for every rate."""
    d = defaultdict(list)
    for v in frame_level_files:
        data = np.load(v)
        for frame_sampling in frame_samplings:
            d[frame_sampling].append(data[::frame_sampling])

    sm = SimilarityModel()

    signatures = defaultdict(list)
    for fs in d.keys():
        # Assumption: global_vector returns an array of shape (1, dim), so the
        # stacked array is (n_videos, 1, dim); the reshape drops the singleton
        # axis before feeding the similarity model.
        video_level = np.array([global_vector(x) for x in d[fs]])
        signatures[fs].append(
            sm.predict_from_features(
                video_level.reshape(
                    video_level.shape[0],
                    video_level.shape[2])))

    return signatures
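For a single signature set, get_result can also be driven directly from signature files (the file_index=None branch above). A hedged usage sketch, with paths assumed from the benchmark config rather than taken from the PR:

from glob import glob
import pandas as pd
from utils import get_result

labels = pd.read_csv('benchmarks/augmented_dataset/labels.csv')

# Signature files written under repr.directory by extract_features.py
# (path assumed from config.yml above).
signature_files = glob(
    'data/benchmark_output/representations/video_signatures/**/**.npy',
    recursive=True)

mAP, pr_curve = get_result(labels, signature_files, ratio=0.2)
print('mAP:', mAP)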