# process_data.py

#!/usr/bin/python3
"""Process data generated by hogwild runs"""
# pylint: disable=C0103

import argparse
import csv
import gzip
import logging
import os
import shutil
import time
from functools import partial
from itertools import count
from multiprocessing import Pool

import matplotlib.pyplot as plt
import numpy as np

# Size of every multiprocessing pool created in this module
NUM_WORKERS = 10
# Negative so it can be used directly as a "keep the trailing rows" slice
# start (data[NUM_POINTS:]); its magnitude is the number of rows kept
NUM_POINTS = -25000

# Command line parser; arguments are registered by the script entry point
parser = argparse.ArgumentParser(
    description='Wrapper for data parallelism')


def load_csv_file(fname, skip_header=0, skip_size=1):
    """Load a comma-separated numeric log and make its time column monotonic.

    Very useful to parallelize loading. Returns None on failure, to allow for
    pruning of failed files without crashing.

    The file is kept on disk in gzip form: if ``<fname>.gz`` exists it is
    unpacked to ``fname`` (overwriting any stale unzipped copy), read, and the
    unzipped copy is removed again. If only ``fname`` exists it is read,
    compressed to ``<fname>.gz`` and then removed. Compression is best effort:
    when it fails the unzipped file is left in place and a warning is logged.

    After loading, column 0 is scanned for places where the value drops below
    the previous row. Each such drop is treated as the start of an appended
    log (eg, manual learning rate decay or simulated attack) and every
    following row is shifted so column 0 keeps increasing.

    Args:
        fname: path of the uncompressed file (without the ``.gz`` suffix).
        skip_header: number of leading lines ``np.genfromtxt`` should skip.
        skip_size: stride used when scanning for appended logs; only rows at
            multiples of ``skip_size`` are compared with their predecessor.

    Returns:
        The loaded ``numpy`` array, or None when neither ``fname`` nor
        ``<fname>.gz`` can be read. Results that are not two dimensional (for
        instance a file holding a single row) are returned without the
        appended-log correction.
    """
    zipped = "{}.gz".format(fname)

    # check if zipped version exists, and use that if it does
    if os.path.isfile(zipped):
        logging.debug('Found %s', zipped)
        # Decompress in-process rather than through a shell command so file
        # names containing spaces or shell metacharacters are handled safely.
        try:
            with gzip.open(zipped, 'rb') as src, open(fname, 'wb') as dst:
                shutil.copyfileobj(src, dst)
        except (OSError, EOFError) as err:
            logging.error('Could not unzip %s: %s', zipped, err)
            if os.path.isfile(fname):
                os.remove(fname)  # do not leave a truncated copy behind
            return None
        logging.info('Unzipped %s', fname)

    if not os.path.isfile(fname):
        logging.error("%s not found", fname)
        return None

    # Wall-clock timer: CPU time would not include time spent waiting on disk
    strt = time.perf_counter()
    data = np.genfromtxt(fname, delimiter=',', dtype=float,
                         skip_header=skip_header)
    read_done = time.perf_counter()
    logging.debug(' - Disk Read time: %sS', read_done - strt)

    # handle appended logs by offsetting each appended time by the end time
    # of the previous log -> converts all time into a monotonically
    # increasing counter
    if data.ndim == 2:
        accum_total = 0
        # start at skip_size (row 0 has no predecessor) and include the last
        # row so a one-row appended log is corrected too
        for i in range(skip_size, len(data), skip_size):
            if data[i, 0] < data[i-1, 0]:
                # data[i-1, 0] already carries the earlier offsets, so only
                # the not-yet-applied part is added to the remaining rows
                data[i:, 0] += data[i-1, 0] - accum_total
                accum_total = data[i-1, 0]
                logging.debug('Found an appended log for %s at %i', fname, i)

    etime = time.perf_counter()
    logging.debug(' - Appended processing time: %sS/%sS', etime - read_done,
                  etime - strt)

    if os.path.isfile(zipped):  # zipped exists
        os.remove(fname)  # remove unzipped
    else:  # create zipped
        # Write to a temporary name first so an interrupted compression can
        # never be mistaken for a complete archive on the next load.
        partial_zip = "{}.tmp".format(zipped)
        try:
            with open(fname, 'rb') as src, \
                    gzip.open(partial_zip, 'wb') as dst:
                shutil.copyfileobj(src, dst)
            os.replace(partial_zip, zipped)
        except OSError as err:
            logging.warning('Could not zip %s: %s', fname, err)
            if os.path.isfile(partial_zip):
                os.remove(partial_zip)
        else:
            os.remove(fname)  # remove unzipped

    return data


class hogwild_run(object):
    """Object to hold information for each run, for post-processing

    If initiated with a filepath, it will extract the run configuration from
    the file name.

    This object will also generate filenames (optionally, with filepaths) from
    the given configuration for a user-specified number of runs

    File formats should be:
        Single run -> [name]-[workers], eg baseline-3, indsc-3
        Multiple runs -> [name]-[workers]-[run], eg baseline-3-0, indsc-3-0
        targeted -> [name]-[workers]-[target]-[bias]-[run], eg targ-3-6-20-0

    The trailing [run] component of a parsed name is not stored; run numbers
    are regenerated from ``runs`` when file names are built.
    """

    def __init__(self, filepath=None):
        """Either instantiate an empty object or extract information from path

        If empty (ie, no path), must call setup before any further use

        Raises:
            NotImplementedError: the file name does not split into 2, 3 or 5
                dash-separated components.
        """
        if filepath is None:
            logging.debug('Instantiated an empty run')
            self.runname = None
            self.workers = None
            self.target = None
            self.bias = None
            self.single_run = None
            self.path = None
            self.runs = None
            return

        # extract the run info from the file name
        # SETS UP PATH
        logging.debug('Instantiated a run from a filepath %s', filepath)
        filepath = filepath[:-1] if filepath.endswith('/') else filepath
        # everything before the last '/' is the directory ('' if none)
        directory, _, basename = filepath.rpartition('/')
        # remove the file extension, then break into individual components
        runname = basename.split('.')[0].split('-')

        if len(runname) == 2:  # single run
            self.setup(runname[0], workers=runname[1], single_run=True,
                       path=directory)

        elif len(runname) == 3:  # non-targeted multiple run
            self.setup(runname[0], workers=runname[1], single_run=False,
                       path=directory)

        elif len(runname) == 5:  # targeted run (multiple runs)
            self.setup(runname[0], workers=runname[1], target=runname[2],
                       bias=runname[3], path=directory)
        else:
            raise NotImplementedError(
                'Unrecognised run name: {}'.format(basename))

    def setup(self, runname, workers=1, target=None,  # pylint: disable=R0913
              bias=None, single_run=False, path=None, runs=50):
        """Assign run configuration information

        Called by init, or allows a manual user override. This function must be
        called at least once before using any others

        ``target`` is stored as an int when given and left as None otherwise,
        so non-targeted configurations can be set up as well. All other
        values are stored exactly as passed (when parsed from a file name,
        ``workers`` and ``bias`` are strings).
        """
        self.runname = runname
        self.workers = workers
        self.target = int(target) if target is not None else None
        self.bias = bias
        self.single_run = single_run
        self.path = path
        self.runs = runs
        logging.debug('runname is %s', runname)
        logging.debug('workers is %s', workers)
        logging.debug('target is %s', target)
        logging.debug('bias is %s', bias)
        logging.debug('single_run is %s', single_run)
        logging.debug('path is %s', path)
        logging.debug('runs is %s', runs)

    def format_name(self, append=None):
        """Returns either the hogwild name or a nice name for graph titles

        With ``append`` left as None the file-style name from simple_name is
        returned; otherwise the title from nice_name(append) is returned.
        """
        assert(self.runname is not None), 'get_filename called before setup!'
        # if one is none, both must be none
        if self.target is None or self.bias is None:
            assert(self.target is None), 'Target is set but bias is not!'
            assert(self.bias is None), 'Bias is set but target is not!'

        if append is None:
            return self.simple_name()
        return self.nice_name(append)

    def nice_name(self, append):
        """Generate a nice name for graph titles

        ``append`` is added on a second line unless it is the empty string.
        """
        rval = "{}, {} {}".format(self.runname, self.workers, "Workers" if
                                  int(self.workers) > 1 else "Worker")

        if self.target is not None:
            # The bias may be a bare number ("20", as in targ-3-6-20-0) or
            # carry a prefix before an underscore; in the latter case the
            # field after the first underscore is the number.
            fields = str(self.bias).split('_')
            bias = float(fields[1] if len(fields) > 1 else fields[0])
            # scale down by powers of ten until it is at most 100
            while bias > 100:
                bias /= 10
            rval = "Target {} at {:2.0f}% Bias {}".format(self.target,
                                                          bias, rval)

        if append != '':
            rval = "{}\n{}".format(rval, append)

        return rval

    def simple_name(self):
        """Generate filename strings to match hogwild runs

        Do not use this information directly unless there is a single run and
        no run information in the filename! This function is most useful for
        generating a string which can be used to identify runs (eg, for a plot
        title)"""
        if self.target is not None and self.bias is not None:
            return "{}-{}-{}-{}".format(self.runname, self.workers,
                                        self.target, self.bias)

        return "{}-{}".format(self.runname, self.workers)

    def get_filename(self, runs=None):
        """Same as format_name, but returns a list of filenames instead

        This function does return run information too, hence the list. This is
        the function which should be used to actually access files

        ``runs`` overrides the configured number of runs when given."""
        assert(self.runname is not None), 'get_filename called before setup!'
        runs = self.runs if runs is None else runs

        if self.single_run:
            # only ran a single instance of each single_run, doesn't have
            # multiple runs!
            return [self.format_name()]

        base = self.format_name()
        return ["{}-{}".format(base, run) for run in range(runs)]

    def get_fullnames(self, path=None):
        """Use this function to load data files

        Uses the output of get_filename (because it includes the run
        information), prepends a path and appends ``.hogwild``.

        Will not return a filepath unless it can verify its existence"""
        npath = path if path is not None else self.path
        assert(npath is not None), 'Path was not assigned'

        names = ["{}/{}.hogwild".format(npath, fname) for fname in
                 self.get_filename()]
        logging.debug('Before pruning: %s', names)
        names = [x for x in names if os.path.exists(x)]
        logging.debug('After pruning: %s', names)
        assert(len(names) != 0), 'No folder matching this configuration found!'

        return names

    def find_runs(self):
        """Look for runs, instead of forcing the user to specify"""
        # TODO look for files with matching file names and different run
        # numbers to count the runs
        raise NotImplementedError

    def load_all_preds(self):
        """load confidence files for each run -> single run at a time

        Generator: for every run directory returned by get_fullnames, loads
        ``conf.0`` .. ``conf.9`` in parallel and yields the list of ten
        arrays. A run is skipped (with an error logged per missing file) when
        any of its files fails to load."""
        load_func = partial(load_csv_file, skip_header=0)
        # resolve the run directories once instead of on every iteration
        run_dirs = self.get_fullnames()
        for run in run_dirs:
            logging.info('Loading confidences for run %s/%s', run,
                         len(run_dirs))
            # pylint: disable=E1101
            strt = time.time()
            with Pool(NUM_WORKERS) as p:  # pylint: disable=E1129
                data = p.map(load_func, ["{}/conf.{}".format(run, corr_label)
                                         for corr_label in range(10)])
            logging.info('Load time: %sS', time.time() - strt)

            # make sure all predictions loaded correctly before handing the
            # run out, so no downstream work is wasted on a partial run
            failed = [idx for idx, preds in enumerate(data) if preds is None]
            for idx in failed:
                logging.error('Failed to load predictions for %s in %s',
                              idx, run)

            if not failed:
                yield data

    def load_all_eval(self):
        """Load all eval files

        Uses get_fullname, so a path must be set before using this function

        Generator: yields the array loaded from ``<run>/eval`` (first line
        skipped as a header) for every run directory; runs whose eval file
        cannot be loaded are skipped."""
        for fname in self.get_fullnames():
            # pylint: disable=E1101
            # wall-clock timer so time spent waiting on disk is included
            strt = time.perf_counter()
            data = load_csv_file("{}/eval".format(fname), skip_header=1)
            if data is not None:
                logging.debug('Load time: %sS', time.perf_counter() - strt)
                yield data


def _count_labels(predictions, num_classes):
    """Count how many entries of ``predictions`` fall on each class index."""
    rates = np.zeros(num_classes)
    labels, counts = np.unique(predictions, return_counts=True)
    for lbl, cnt in zip(labels, counts):
        rates[int(lbl)] = cnt
    return rates


def average_at_evals(curr_run, pred_rate=None, num_classes=10):
    """Average the confidences for each evaluation - ie, all the ones with the
    same time

    Call once for each run

    Rows are grouped into steps of consecutive rows sharing the same value in
    column 0. Each step is reduced to its column-wise mean. When ``pred_rate``
    is given, the predicted labels belonging to each step are counted per
    class as well.

    Args:
        curr_run: iterable with one 2-D array per correct label.
        pred_rate: optional iterable, parallel to ``curr_run``, holding one
            predicted label per row.
        num_classes: length of each per-step count vector.

    Returns:
        ``fdata`` when ``pred_rate`` is None, else ``(fdata, fpdata)``.
        ``fdata[label]`` is a list with one mean row per step and
        ``fpdata[label]`` a list with one count vector per step. Empty input
        arrays produce empty lists.
    """
    fdata = []  # per label: list of per-step mean rows
    fpdata = []  # per label: list of per-step prediction counts
    # count() is only a placeholder partner for zip when no predictions exist
    per_label_preds = pred_rate if pred_rate is not None else count()

    for corr_label, curr_pred in zip(curr_run, per_label_preds):
        corr_label = np.asarray(corr_label)
        sdata = []
        pdata = []

        if len(corr_label) > 0:
            times = corr_label[:, 0]
            # a new step starts wherever the time differs from the row before
            starts = np.concatenate(
                ([0], np.flatnonzero(times[1:] != times[:-1]) + 1))
            # each step ends where the next begins; the last one runs to the
            # end of the data so its final row is included
            ends = np.append(starts[1:], len(corr_label))

            for begin, end in zip(starts, ends):
                # Averaging in-process gives the same values as mapping
                # np.mean over a worker pool, without spawning processes and
                # shipping every step to them.
                sdata.append(np.mean(corr_label[begin:end], axis=0))
                if pred_rate is not None:
                    pdata.append(_count_labels(curr_pred[begin:end],
                                               num_classes))

        fdata.append(sdata)
        fpdata.append(pdata)

    if pred_rate is None:
        return fdata
    return fdata, fpdata


def compute_targeted(curr_run, runInfo):
    """Find the tolerance of the correct label to the target label for the
    run

    For every correct label the tolerance is the confidence assigned to that
    label minus the confidence assigned to ``runInfo.target``. Only the last
    ``abs(NUM_POINTS)`` rows of each array are used when more are available.

    Args:
        curr_run: iterable with one 2-D array per correct label; column 0 is
            the time and column ``k + 1`` is read as the confidence for
            label ``k``.
        runInfo: run description providing the integer ``target`` label.

    Returns:
        The result of average_at_evals on the [time, tolerance] rows.

    Raises:
        ValueError: ``runInfo.target`` is None.
    """
    if runInfo.target is None:
        raise ValueError('compute_targeted requires a target label')

    # Column 0 holds the time, so label k lives in column k + 1. This is the
    # same offset applied to the correct label below.
    target_col = runInfo.target + 1

    tolerance_to_targ = []
    # pylint: disable=E1101
    strt = time.process_time()
    for cidx, corr_label in enumerate(curr_run):
        # keep only the trailing rows when there are more than the limit
        if abs(NUM_POINTS) < len(corr_label):
            corr_label = corr_label[NUM_POINTS:]

        tol = np.zeros((len(corr_label), 2))
        tol[:, 0] = corr_label[:, 0]
        tol[:, 1] = corr_label[:, cidx+1] - corr_label[:, target_col]
        tolerance_to_targ.append(tol)
    logging.info('%.4fS to compute', time.process_time() - strt)

    return average_at_evals(tolerance_to_targ)


def subtract_max(row, corr):
    """Compute the difference between the correct prediction and the next
    highest confidence prediction

    Computes only on a single row - this function is split over all rows by a
    mp pool

    Args:
        row: sequence whose first entry is the time and whose remaining
            entries are the per-label confidences. Any number of labels
            (at least two) is accepted.
        corr: index of the correct label.

    Returns:
        A tuple ``(time, tolerance, predicted)`` where ``tolerance`` is the
        confidence of the correct label minus the highest confidence among
        all other labels, and ``predicted`` is the index of the label with
        the highest confidence overall.
    """
    confidences = row[1:]
    # Only take the max over non-correct predictions
    candidates = np.delete(confidences, corr)

    return row[0], row[corr+1] - np.max(candidates), np.argmax(confidences)


def compute_indiscriminate(curr_run):
    """Find the tolerance of the correct label to the next highest confidence
    prediction

    Every row of every label array is passed to subtract_max; only the last
    ``abs(NUM_POINTS)`` rows of an array are used when more are available.
    The per-row results are then averaged per evaluation step by
    average_at_evals.

    Args:
        curr_run: iterable with one 2-D array per correct label.

    Returns:
        A tuple ``(tta, pr)``: ``tta`` is the per-label list of averaged
        [time, tolerance] rows; ``pr`` is an array with one row per step
        holding the prediction counts summed over all labels and divided by
        the row total.
    """
    tolerance_to_any = []
    prediction_rates = []
    # pylint: disable=E1101
    # Wall-clock time: the work runs in worker processes, which CPU time of
    # this process would not account for.
    strt = time.time()
    # one pool serves all labels instead of being re-created per label
    with Pool(NUM_WORKERS) as p:  # pylint: disable=E1129
        for cidx, corr_label in enumerate(curr_run):
            if abs(NUM_POINTS) < len(corr_label):
                corr_label = corr_label[NUM_POINTS:]
            max_func = partial(subtract_max, corr=cidx)
            tol_and_lbl = np.asarray(p.map(max_func, corr_label))
            # columns: time, tolerance, predicted label
            tolerance_to_any.append(tol_and_lbl[:, :2])
            prediction_rates.append(tol_and_lbl[:, 2])
    logging.info('%.4fS to compute', time.time() - strt)

    assert(len(tolerance_to_any) != 0), 'Tolerances are the wrong length'
    assert(len(prediction_rates) != 0), 'Predictions are the wrong length'

    tta, pr = average_at_evals(tolerance_to_any, pred_rate=prediction_rates)
    # combine the counts of all correct labels, then turn each step's counts
    # into fractions of that step's total
    pr = np.sum(pr, axis=0)
    for row in pr:
        row /= np.sum(row)

    return tta, pr


def plot_eval(runInfo):
    """Plot evaluation accuracy over time for each run in the configuration

    One figure is written per run yielded by ``runInfo.load_all_eval()``,
    named ``<format_name>_<run>_eval.png``. Column 0 of the loaded data is
    plotted on the x axis and column 1 on the y axis.
    """
    for run, d in enumerate(runInfo.load_all_eval()):
        nd = np.asarray(d)

        accuracy_fig = plt.figure(figsize=(20, 15))
        accuracy_axs = accuracy_fig.add_subplot(1, 1, 1)
        accuracy_axs.set_title(runInfo.format_name(
            'Accuracy on Validation set'))
        accuracy_axs.set_xlabel('Time (Seconds since start of training)')
        accuracy_axs.set_ylabel('Top-1 Accuracy')

        accuracy_axs.plot(nd[:, 0], nd[:, 1], label="Run {}".format(run))
        # the legend is built from the lines present when it is requested,
        # so it has to come after plotting
        accuracy_axs.legend(loc='lower right')

        accuracy_fig.savefig(runInfo.format_name() + '_' + str(run) +
                             '_eval.png')
        # release the figure; pyplot otherwise keeps every figure alive
        plt.close(accuracy_fig)


def plot_pred_rate(prediction_rates, runInfo, run):
    """Calculate and plot global prediction rates

    Draws one line per class (column of ``prediction_rates``) against the
    step index, saves the plot as ``<format_name>_<run>_predR.png`` and
    writes the plotted values to ``<format_name>_<run>_predR.csv`` (first row
    the x values, then one row per class).

    Args:
        prediction_rates: 2-D array-like, one row per step and one column
            per class.
        runInfo: run description used for the title and file names.
        run: run identifier inserted into the title and file names.
    """
    prediction_rates = np.asarray(prediction_rates)
    num_steps = len(prediction_rates)

    pred_rate_fig = plt.figure()
    pred_rate_axs = pred_rate_fig.add_subplot(1, 1, 1)
    title = "Prediction Rates ({})".format(run)
    pred_rate_axs.set_title(runInfo.format_name(title))
    pred_rate_axs.set_xlabel('Time')
    pred_rate_axs.set_ylabel('Prediction rate (%)')

    # one extra x position for the duplicated final value
    x_axis_values = list(range(num_steps + 1))
    save_vals = [x_axis_values]

    # add a line for each class
    for lbl in range(prediction_rates.shape[1]):
        # duplicate the last value for clarity
        nt_extended = np.zeros(num_steps + 1)
        nt_extended[:-1] = prediction_rates[:, lbl]
        nt_extended[-1] = prediction_rates[-1, lbl]
        save_vals.append(nt_extended)

        pred_rate_axs.plot(x_axis_values, nt_extended,
                           label='Label {}'.format(lbl))

    pred_rate_axs.legend(loc='lower left')

    # save plot
    save_name = runInfo.format_name() + '_{}_predR.png'.format(run)
    pred_rate_fig.savefig(save_name)
    # release the figure; pyplot otherwise keeps every figure alive
    plt.close(pred_rate_fig)
    logging.info('Saved %s', save_name)

    # save csv for plotting manually
    save_name = runInfo.format_name() + '_{}_predR.csv'.format(run)
    with open(save_name, "w", newline="") as f:
        writer = csv.writer(f)
        for row in save_vals:
            writer.writerow(row)
            logging.info(row)
    logging.info('Saved %s', save_name)


def _extend_for_plot(steps, pad=10):
    """Return ``steps`` as an array whose last row is repeated ``pad`` later.

    The repeated row keeps its value but has ``pad`` added to column 0, which
    draws the final value as a short flat segment.
    """
    points = np.asarray(steps)
    extended = np.vstack((points, points[-1]))
    extended[-1, 0] += pad
    return extended


def plot_confidences(runInfo, targ_axs=None, indsc_axs=None):
    """Plot targeted and indiscriminate tolerance for each run in the
    configuration

    Can directly modify axis instead of creating new ones, eg, for generating a
    page of graphs

    For every run yielded by ``runInfo.load_all_preds()`` this plots one
    indiscriminate-tolerance line per label and, when ``runInfo.target`` is
    set, one targeted-tolerance line per label. It also plots the run's
    prediction rates and, after the last run, the prediction rates averaged
    over all runs.

    Args:
        runInfo: run description (provides data loading, names and target).
        targ_axs: optional axes for the targeted plot. When given, no
            targeted figure is created or saved here.
        indsc_axs: optional axes for the indiscriminate plot. When given, no
            indiscriminate figure is created or saved here.

    Figures created by this function are saved as
    ``<format_name>_<run>_targ.png`` / ``<format_name>_<run>_indsc.png`` and
    then closed.
    """
    targeted = runInfo.target is not None
    average_pred_rate = None
    counter = 0

    for ridx, run in enumerate(runInfo.load_all_preds()):
        logging.info('Processing %i', ridx)

        # targeted tolerances figure; only if this run had a target label
        targ_tol_fig = None
        targ_tol_axs = None
        if targeted:
            if targ_axs is None:
                targ_tol_fig = plt.figure()
                targ_tol_axs = targ_tol_fig.add_subplot(1, 1, 1)
            else:
                targ_tol_axs = targ_axs
            targ_tol_axs.set_title(runInfo.format_name(
                'Targeted to {}'.format(runInfo.target)))
            targ_tol_axs.set_xlabel('Time (Seconds since start of training)')
            targ_tol_axs.set_ylabel('Tolerance towards label {}'.format(
                runInfo.target))

        # indiscriminate tolerances figure
        indsc_tol_fig = None
        if indsc_axs is None:
            indsc_tol_fig = plt.figure()
            indsc_tol_axs = indsc_tol_fig.add_subplot(1, 1, 1)
        else:
            indsc_tol_axs = indsc_axs
        indsc_tol_axs.set_title(runInfo.format_name('Indiscriminate'))
        indsc_tol_axs.set_xlabel('Time (Seconds since start of training)')
        indsc_tol_axs.set_ylabel('Tolerance towards next highest')

        # actually calculate the tolerances and prediction rates
        indsc_tolerance, pred_rate = compute_indiscriminate(run)
        pred_rate = np.asarray(pred_rate)

        # Running sum of the prediction rates. Runs may differ in length; in
        # that case only the trailing steps both have in common are kept.
        if average_pred_rate is None:
            # copy so the running sum never aliases this run's own data
            average_pred_rate = pred_rate.copy()
        else:
            keep = min(len(pred_rate), len(average_pred_rate))
            if len(pred_rate) != len(average_pred_rate):
                logging.debug('Resized predictions to %i steps', keep)
            average_pred_rate = (average_pred_rate[-keep:] +
                                 pred_rate[-keep:])
        counter += 1

        plot_pred_rate(pred_rate, runInfo, ridx)

        # for each correct label, plot!
        # tolerances: generates a single line for each class
        for lbl, it in enumerate(indsc_tolerance):
            line = _extend_for_plot(it)
            indsc_tol_axs.plot(line[:, 0], line[:, 1],
                               label='Label {}'.format(lbl))

        if targeted:
            targ_tolerance = compute_targeted(run, runInfo)
            for lbl, tt in enumerate(targ_tolerance):
                line = _extend_for_plot(tt)
                targ_tol_axs.plot(line[:, 0], line[:, 1],
                                  label='Label {}'.format(lbl))
            # legends are built from the lines present when requested, so
            # they have to come after plotting
            targ_tol_axs.legend(loc='lower right')
            # caller-supplied axes belong to the caller's figure, which is
            # neither saved nor closed here
            if targ_tol_fig is not None:
                targ_tol_fig.savefig(runInfo.format_name() +
                                     '_{}_targ.png'.format(ridx))
                plt.close(targ_tol_fig)

        indsc_tol_axs.legend(loc='lower right')
        if indsc_tol_fig is not None:
            indsc_tol_fig.savefig(runInfo.format_name() +
                                  '_{}_indsc.png'.format(ridx))
            plt.close(indsc_tol_fig)

    if counter == 0:
        # nothing loaded, so there is no average to plot
        logging.warning('No runs were loaded; skipping average plot')
        return

    plot_pred_rate(average_pred_rate / counter, runInfo, 'Average')


if __name__ == '__main__':
    # Script entry point: takes the path of one run, parses the run
    # configuration out of its file name and writes all plots for it.
    FORMAT = '%(message)s [%(levelno)s-%(asctime)s %(module)s:%(funcName)s]'
    logging.basicConfig(format=FORMAT, level=logging.DEBUG)

    # the single positional argument is the path to a run file or directory
    parser.add_argument('filepath', type=str)
    run_info = hogwild_run(parser.parse_args().filepath)

    # evaluation accuracy first, then the confidence based plots
    plot_eval(run_info)
    plot_confidences(run_info)

    logging.info('Finished plotting')