"""aquavis: this program takes a CSV file and displays the data in a useful way."""

from abc import ABC

import pandas as pd


# ---------------------------------------------------------------------------
# data_loaders/data_loader.py
# ---------------------------------------------------------------------------
class DataLoader(ABC):
    """
    Base class of the objects that load a data set

    Subclasses override load(); the base implementation does nothing.
    """

    def load(self):
        """
        Loads the data set (no-op in the base class, returns None)
        """
        pass


# ---------------------------------------------------------------------------
# data_loaders/csv_loader.py
# ---------------------------------------------------------------------------
class CsvLoader(DataLoader):
    def __init__(self, name, delimiter):
        """
        Remembers which csv file to read and how its columns are separated

        @type name: str
        @param name: the name of the csv file
        @type delimiter: str
        @param delimiter: the delimiter separating the column names in the csv file
        """

        self.name1 = name
        self.delim = delimiter

    def load(self):
        """
        Reads the csv file into a dataframe

        @rtype: Pandas DataFrame
        @returns: the pandas dataframe containing the read data from the csv file
        """

        # The delimiter has to be passed by keyword: pandas >= 2.0 accepts only
        # the path positionally, so read_csv(path, delim) raises a TypeError.
        return pd.read_csv(self.name1, sep=self.delim)
import argparse
import json
from abc import ABC

import numpy as np


# ---------------------------------------------------------------------------
# models/model.py
# ---------------------------------------------------------------------------
class Model(ABC):
    """
    Common interface of the interpolation models

    The base implementations do nothing and return None; subclasses override them.
    """

    def fit(self):
        """
        Fits the model (no-op in the base class)
        """
        pass

    def predict(self):
        """
        Predicts the interpolated data (no-op in the base class)
        """
        pass

    def mse(self):
        """
        Calculates the mean squared error (no-op in the base class)
        """
        pass


# ---------------------------------------------------------------------------
# processors/processor.py
# ---------------------------------------------------------------------------
class Processor(ABC):
    """
    Common interface of the data-cleaning steps
    """

    def process(self):
        """
        Runs the step (no-op in the base class, returns None)
        """
        pass


# ---------------------------------------------------------------------------
# processors/distance_calculator.py
# ---------------------------------------------------------------------------
class DistanceCalculator(Processor):
    def __init__(self, lat_name, long_name, file):
        """
        Calculates the distance from the original point to each of the other points based on longitude and latitude

        @type lat_name: str
        @param lat_name: the name of the latitude column in the csv file
        @type long_name: str
        @param long_name: the name of the longitude column in the csv file
        @type file: Pandas DataFrame
        @param file: the pandas dataframe that holds all of the read data from the csv file
        """

        self.lat_list = file[lat_name].values
        self.lon_list = file[long_name].values

    def process(self):
        """
        Uses latitude and longitude data to calculate the distance from the first point to the others

        @rtype: array_like
        @returns: the list containing each point's distance from the first point (empty for empty input)
        """

        # Third-party dependency, imported here so that the module can be
        # imported (and the other classes used) without it.
        import utm

        # from_latlon returns (easting, northing, zone number, zone letter);
        # only the two planar coordinates are needed.
        # NOTE(review): points lying in different UTM zones would give
        # meaningless differences -- confirm the data stays inside one zone.
        planar = [utm.from_latlon(lat, lon)[:2] for lat, lon in zip(self.lat_list, self.lon_list)]
        if not planar:
            return []
        origin_east, origin_north = planar[0]
        return [float(np.hypot(east - origin_east, north - origin_north)) for east, north in planar]


# ---------------------------------------------------------------------------
# processors/drop_close.py
# ---------------------------------------------------------------------------
class DropClose(Processor):
    def __init__(self, dep_name, dist_arr, tolerance, file):
        """
        Drops points from the data set that are problematically close together on the graph

        @type dep_name: str
        @param dep_name: the name of the column that represents depth in the csv file
        @type dist_arr: array-like
        @param dist_arr: the array that contains each point's distance from the original point
        @type tolerance: float
        @param tolerance: the user's tolerance for how close points are allowed to be on the graph
        @type file: Pandas DataFrame
        @param file: the read pandas dataframe that holds all of the data from the csv file
        """

        self.depth = file[dep_name].values.tolist()
        self.distance = dist_arr
        self.tol = float(tolerance)
        self.trimmed = file.copy()

    def process(self):
        """
        Removes points from the dataframe that are too close together (based on the user's tolerance)

        Every point that has at least one other point closer than the tolerance in the
        (distance, depth) plane is removed, i.e. all members of a group of close points go.
        NOTE(review): keeping one representative per group may be preferable -- confirm intent.

        @rtype: Pandas DataFrame
        @returns: a new dataframe ridden of problematic data (the input frame is left untouched)
        @raise ValueError: if dist_arr has fewer entries than the dataframe has rows
        """

        depth = np.asarray(self.depth, dtype=float)
        count = len(depth)
        if len(self.distance) < count:
            raise ValueError("dist_arr has %d entries but the data has %d rows" % (len(self.distance), count))
        distance = np.asarray(self.distance, dtype=float)[:count]

        keep = np.ones(count, dtype=bool)
        for i in range(count):
            gap = np.hypot(distance - distance[i], depth - depth[i])
            gap[i] = np.inf  # a point is never "too close" to itself
            keep[i] = not (gap < self.tol).any()
        # DataFrame.drop returns a new frame, so the former bare
        # "self.trimmed.drop(...)" calls never removed anything; select the
        # surviving rows with a mask instead.
        return self.trimmed[keep]


# ---------------------------------------------------------------------------
# processors/outlier_removal.py
# ---------------------------------------------------------------------------
class OutlierRemoval(Processor):
    def __init__(self, tolerance, user_name, file):
        """
        Removes outliers from the data set with a user-selected tolerance

        @type tolerance: float
        @param tolerance: the user's tolerance for outliers (in standard deviations)
        @type user_name: str
        @param user_name: the name of the column of the user-selected data in the csv file
        @type file: Pandas DataFrame
        @param file: the pandas dataframe that holds all the data from the csv file
        """

        self.tol = float(tolerance)
        self.user = file[user_name].values
        self.new_file = file.copy()

    def process(self):
        """
        Calculates and removes the outlying points

        A row is an outlier when its value lies more than tolerance standard
        deviations away from the mean of the column.

        @rtype: Pandas DataFrame
        @returns: a new pandas dataframe ridden of outliers
        """

        values = np.asarray(self.user)
        limit = np.std(values) * self.tol
        # "not greater than" (rather than "<=") keeps rows whose value is NaN,
        # exactly like the comparison in the original loop.
        keep = ~(np.abs(values - np.mean(values)) > limit)
        return self.new_file[keep]


# ---------------------------------------------------------------------------
# processors/step_range.py
# ---------------------------------------------------------------------------
class StepRange(Processor):
    def __init__(self, step_name, step_min, step_max, file):
        """
        Trims the data set to only contain data within the user's desired step range

        The range starts at the first row whose step equals step_min and ends just
        before the first later row whose step exceeds step_max (or at the last row).

        @type step_name: str
        @param step_name: the name of the column in the csv file that holds current step data
        @type step_min: int
        @param step_min: the first step in the range to be examined
        @type step_max: int
        @param step_max: the last step in the range to be examined
        @type file: Pandas DataFrame
        @param file: the pandas dataframe containing the read data from the csv file
        @raise ValueError: if step_min does not occur in the step column
        """

        self.step = file[step_name].values.tolist()
        step_min = int(step_min)
        step_max = int(step_max)
        try:
            self.start = self.step.index(step_min)
        except ValueError:
            raise ValueError("step %d does not occur in column %r" % (step_min, step_name)) from None
        # Looking for the first step *greater than* step_max (instead of the
        # exact value step_max + 1) also works when step numbers have gaps or
        # when step_max lies beyond the last recorded step.
        self.end = len(self.step) - 1
        for position in range(self.start, len(self.step)):
            if self.step[position] > step_max:
                self.end = position - 1
                break
        self.cop = file.copy()

    def process(self):
        """
        Removes all data that is not within the selected step range

        @rtype: Pandas DataFrame
        @returns: the dataframe containing only the data from the selected step range
        """

        return self.cop[self.start:self.end + 1]


# ---------------------------------------------------------------------------
# shared by models/gpr.py and models/svr.py
# ---------------------------------------------------------------------------
class _GriddedModel(Model):
    """
    Data preparation shared by GPR and SV: the training points and the regular
    grid on which the fitted model is evaluated
    """

    def _prepare(self, dist, user_name, dep_name, length, file, lat_n, lon_n, dims):
        """
        Stores the training data and builds the prediction grid

        In 2D the inputs are (distance, depth); otherwise (latitude, longitude, depth).
        """

        self.dist = dist
        self.user = file[user_name].values
        self.dimensions = int(dims)
        self.depth = file[dep_name].values.tolist()
        self.is_fit = False
        if self.dimensions == 2:
            self.x_coor = np.linspace(min(self.dist), max(self.dist), length)
            self.y_coor = np.linspace(min(self.depth), max(self.depth), length)
            self.xx, self.yy = np.meshgrid(self.x_coor, self.y_coor)
            # row-major flattening visits the grid in the same order as a
            # nested "for i / for j" loop over xx
            self.x = self.xx.ravel().tolist()
            self.y = self.yy.ravel().tolist()
            self.xy = [[x, y] for x, y in zip(self.x, self.y)]
            self.x_arr = [[d, z] for d, z in zip(self.dist, self.depth)]
        else:
            self.lat = file[lat_n].values
            self.lon = file[lon_n].values
            self.fit_array = [[la, lo, z] for la, lo, z in zip(self.lat, self.lon, self.depth)]
            self.x_set = np.linspace(min(self.lat), max(self.lat), length)
            self.y_set = np.linspace(min(self.lon), max(self.lon), length)
            self.z_set = np.linspace(min(self.depth), max(self.depth), length)
            self.xxx, self.yyy, self.zzz = np.meshgrid(self.x_set, self.y_set, self.z_set)

    def _training_points(self):
        """Returns the model inputs at the known data points"""
        return self.x_arr if self.dimensions == 2 else self.fit_array

    def _fit_once(self, estimator):
        """Fits the estimator on first use only and returns it"""
        if not self.is_fit:
            estimator.fit(self._training_points(), self.user)
            # Without this flag every access refitted the model from scratch.
            self.is_fit = True
        return estimator

    def _grid3d(self):
        """Returns the flattened x, y and z grid coordinates and the matching list of points"""
        xs = self.xxx.ravel().tolist()
        ys = self.yyy.ravel().tolist()
        zs = self.zzz.ravel().tolist()
        return xs, ys, zs, [list(point) for point in zip(xs, ys, zs)]

    def mse(self):
        """
        Calculates the mean squared error of the predicted vs actual data at known datapoints

        @rtype: float
        @returns: the mean squared error between the predicted vs actual data at known datapoints
        """

        from sklearn.metrics import mean_squared_error

        predicted = self.fit().predict(self._training_points())
        return mean_squared_error(self.user, predicted)


# ---------------------------------------------------------------------------
# models/gpr.py
# ---------------------------------------------------------------------------
class GPR(_GriddedModel):
    def __init__(self, dist_array, user_name, dep_name, length, file, lat_n, lon_n, dims):
        """
        Creates a Gaussian Process Regression-based model to interpolate data

        @type dist_array: array-like
        @param dist_array: the array that contains each point's distance from the original point
        @type user_name: str
        @param user_name: the name of column of the additional data set selected by the user
        @type dep_name: str
        @param dep_name: the name of the depth column in the csv file
        @type length: int
        @param length: the number of points per row/column that will be displayed on the graph
        @type file: Pandas DataFrame
        @param file: the pandas dataframe containing all of the read data from the csv file
        @type lat_n: str
        @param lat_n: the name of the latitude column in the csv file
        @type lon_n: str
        @param lon_n: the name of the longitude column in the csv file
        @type dims: int
        @param dims: the desired dimensions of the graph representing the interpolations of the GPR
        """

        from sklearn.gaussian_process import GaussianProcessRegressor
        from sklearn.gaussian_process.kernels import ConstantKernel, Matern

        self.leng = length
        self._gp = GaussianProcessRegressor(normalize_y=True, kernel=Matern() + ConstantKernel(), alpha=0.0001)
        self._prepare(dist_array, user_name, dep_name, length, file, lat_n, lon_n, dims)

    @property
    def gp(self):
        """
        Checks to see if the model is fit whenever referenced, and, if not, fits the model

        @rtype: Gaussian Process Regressor
        @returns: the fit model
        """

        return self._fit_once(self._gp)

    def fit(self):
        """
        Gives access to the fit GPR model (used in model_manager class)

        @rtype: Gaussian Process Regressor
        @returns: the fit GPR model
        """

        return self.gp

    def predict(self):
        """
        Makes a prediction for interpolated datapoints

        @rtype: array-like
        @returns: a list containing the x and y coordinates, the predicted values and their
                  standard deviations at the interpolation points
        """

        mean, std = self.gp.predict(self.xy, return_std=True)
        # copies: callers may modify the coordinate lists (e.g. flip the depth axis)
        return [list(self.x), list(self.y), mean, std]

    def predict3d(self):
        """
        Makes a prediction for interpolated datapoints fit to a 3D visualization

        @rtype: array-like
        @returns: a list containing the xyz coordinates, the predicted values, and their standard
                  deviations at the interpolation points
        """

        xs, ys, zs, points = self._grid3d()
        mean, std = self.gp.predict(points, return_std=True)
        return [xs, ys, zs, mean, std]


# ---------------------------------------------------------------------------
# models/svr.py
# ---------------------------------------------------------------------------
class SV(_GriddedModel):
    def __init__(self, dist_arr, user_name, dep_name, length, file, lat_n, lon_n, dims):
        """
        Creates a Support Vector Regression-based model to interpolate data

        @type dist_arr: array-like
        @param dist_arr: the array that contains each point's distance from the original point
        @type user_name: str
        @param user_name: the name of column of the additional data set selected by the user
        @type dep_name: str
        @param dep_name: the name of the depth column in the csv file
        @type length: int
        @param length: the number of points per row/column that will be displayed on the graph
        @type file: Pandas DataFrame
        @param file: the pandas dataframe containing all of the read data from the csv file
        @type lat_n: str
        @param lat_n: the name of the latitude column in the csv file
        @type lon_n: str
        @param lon_n: the name of the longitude column in the csv file
        @type dims: int
        @param dims: the desired dimensions of the graph representing the interpolations of the SVR
        """

        from sklearn import svm

        self.dimns = length
        self._clf = svm.SVR(kernel='poly', gamma='scale')
        self._prepare(dist_arr, user_name, dep_name, length, file, lat_n, lon_n, dims)

    @property
    def clf(self):
        """
        Checks to see if the model is fit whenever referenced, and, if not, fits the model

        @rtype: SVR
        @returns: the fit model
        """

        return self._fit_once(self._clf)

    def fit(self):
        """
        Gives access to the fit SVR model (used in model_manager class)

        @rtype: SVR
        @returns: the fit SVR model
        """

        return self.clf

    def predict(self):
        """
        Makes a prediction for interpolated datapoints

        @rtype: array-like
        @returns: a list containing the x and y coordinates and the predicted values at the interpolation points
        """

        # copies: callers may modify the coordinate lists (e.g. flip the depth axis)
        return [list(self.x), list(self.y), self.clf.predict(self.xy)]

    def predict3d(self):
        """
        Makes a prediction for interpolated datapoints fit to a 3D visualization

        @rtype: array-like
        @returns: a list containing the xyz coordinates and the predicted values at the interpolation points
        """

        xs, ys, zs, points = self._grid3d()
        return [xs, ys, zs, self.clf.predict(points)]


# ---------------------------------------------------------------------------
# models/model_manager.py
# ---------------------------------------------------------------------------
class ModelManager(Model):
    def __init__(self, primary, secondary, tolerance, dimension):
        """
        Manages which model is more fit to graph based on MSE, user preference, and user-selected MSE tolerance

        @type primary: Gaussian Process Regressor/SVR
        @param primary: the user's preferred type of model
        @type secondary: Gaussian Process Regressor/SVR
        @param secondary: the user's non-preferred type of model
        @type tolerance: float
        @param tolerance: the user's tolerance for MSE
        @type dimension: int
        @param dimension: the user's selected dimensions for the visualization of the model (2D/3D)
        """

        self.tol = float(tolerance)
        self.pri = primary
        self.sec = secondary
        self.mod = 0  # 0: the primary model is in use, anything else: the secondary one
        self.dim = dimension

    def _choose(self, use_secondary):
        """Records which model is in use and returns that model, fitted"""
        # Assigned rather than incremented so that calling fit() again cannot
        # leave a stale selection behind.
        self.mod = 1 if use_secondary else 0
        return (self.sec if use_secondary else self.pri).fit()

    def _active(self):
        """Returns the wrapper of the model selected by fit()"""
        return self.pri if self.mod == 0 else self.sec

    def fit(self):
        """
        Picks the model based on MSE, user preference, and user-selected MSE tolerance

        @rtype: Gaussian Process Regressor/SVR
        @returns: the GPR or SVR object fit to the known data
        """

        # Each error is computed once; it used to be evaluated repeatedly.
        primary_error = self.pri.mse()
        if primary_error <= self.tol:
            print("Your selected model fit your selected tolerance.")
            return self._choose(use_secondary=False)

        print("Your selected model did not fit your selected tolerance. Checking the other model...")
        secondary_error = self.sec.mse()
        if secondary_error <= self.tol:
            print("The other model fits your selected tolerance. Using the other model...")
            return self._choose(use_secondary=True)

        print("The other model did not fit your selected tolerance either. Using the closest one...")
        if primary_error <= secondary_error:
            print("Your selected model was used")
            return self._choose(use_secondary=False)
        print("The other model was used")
        return self._choose(use_secondary=True)

    def predict(self):
        """
        Predicts the interpolated data based on the pre-selected model and dimensions

        @rtype: array-like
        @returns: a list that contains all the data needed to graph
        """

        if self.dim == 2:
            return self._active().predict()
        return self._active().predict3d()

    def mse(self):
        """
        Calculates the mean squared error of the predicted data set vs. the actual data set

        @rtype: float
        @returns: the float value of mse
        """

        return self._active().mse()


# ---------------------------------------------------------------------------
# driver.py
# ---------------------------------------------------------------------------
def _unchanged(value):
    """Returns the value as given (for settings that need no conversion)"""
    return value


def _is_enabled(value):
    """Interprets a json value as an on/off switch; bool("false") would be True"""
    if isinstance(value, str):
        return value.strip().lower() not in ("", "0", "false", "no", "n", "off")
    return bool(value)


# Every optional setting, keyed by its json key (which is also the name of the
# command-line option), with its position in the tuple built by set_defaults()
# and the conversion applied to a user-supplied value.
_SETTINGS = {
    "delimiter": (0, _unchanged),
    "latitude": (1, _unchanged),
    "longitude": (2, _unchanged),
    "depth": (3, _unchanged),
    "current_step": (4, _unchanged),
    "dist_tol": (5, float),
    "outlier_tol": (6, float),
    "first": (7, int),
    "last": (8, int),
    "mse_tol": (10, float),
    "model": (11, int),
    "size": (12, int),
    "dimension": (13, int),
    "plot": (14, int),
}
_PREDICTOR = 9  # position of the 'y'/'n' predictor switch in the settings tuple


def set_defaults():
    """
    Gives variables default values

    @rtype: tuple
    @returns: a tuple containing key variables for future methods
    """

    delimiter = ","
    lat = "Latitude"
    lon = "Longitude"
    dep = "Depth"
    range_name = "Current Step"
    tol = 0.0001
    outlier_tol = 3
    range_min = 0
    range_max = 1
    ans = 'n'
    tol_mse = 0.01
    ans1 = 1
    length = 20
    ans2 = 2
    ans3 = 2
    return (delimiter, lat, lon, dep, range_name, tol, outlier_tol, range_min, range_max, ans, tol_mse, ans1, length, ans2, ans3)


def arg_parse_and_json(tup, argv=None):
    """
    Parses user-inputted arguments and reads possible json file

    Precedence, lowest to highest: the defaults in tup, the json file, the command line.

    @type tup: tuple
    @param tup: the default settings, as returned by set_defaults()
    @type argv: list
    @param argv: the command-line arguments to parse (default = sys.argv[1:])
    @rtype: tuple
    @returns: the 15 settings followed by the user-selected column name and the csv file name
    @raise ValueError: if the json file contains a key that is not a known setting
    """

    parser = argparse.ArgumentParser(description="This program takes a csv file and displays the data in a useful way.")
    parser.add_argument('--json', '-j', help="name of json file (default = no file)")
    parser.add_argument('--file', '-f', required=True, help="name of csv file")
    parser.add_argument('--extra_data', '-e', required=True, help="name of column of user-selected data")
    parser.add_argument('--delimiter', '-d', help="delimiter separating the column names in the file (make this first in json file) (default = ',')")
    parser.add_argument('--latitude', '-l', help="name of latitude column (default = 'Latitude')")
    parser.add_argument('--longitude', '-lo', help="name of longitude column (default = 'Longitude')")
    parser.add_argument('--depth', '-de', help="name of depth column (default = 'Depth')")
    parser.add_argument('--current_step', '-c', help="name of current step column (default = 'Current Step')")
    parser.add_argument('--dist_tol', '-dt', help="tolerance for points being too close together on chart (default = 0.0001)")
    parser.add_argument('--outlier_tol', '-o', help="tolerance for outliers (in standard deviations) (default = 3)")
    parser.add_argument('--first', '-fi', help="first step in range being examined (default = 0)")
    parser.add_argument('--last', '-la', help="last step in range being examined (default = 1)")
    parser.add_argument('--predictor', '-p', help="whether or not to use a predictor model to interpolate (default = no)", action='store_true')
    parser.add_argument('--mse_tol', '-t', help="tolerance for MSE (default = 0.01)")
    parser.add_argument('--model', '-m', help="type of model to preferably use(1(GPR)/2(SVR)) (default = 1)")
    parser.add_argument('--size', '-s', help="number of points per row/column in the square/cubic grid that represents the model (default = 20)")
    parser.add_argument('--dimension', '-di', help="dimensions of model (default = 2)")
    parser.add_argument('--plot', '-pl', help="dimensions of hard data plot (default = 2)")
    args = parser.parse_args(argv)

    settings = list(tup[:15])

    if args.json is not None:
        with open(args.json) as f:
            data = json.load(f)
        for key, value in data.items():
            if key == "predictor":
                if _is_enabled(value):
                    settings[_PREDICTOR] = 'y'
            elif key in _SETTINGS:
                index, convert = _SETTINGS[key]
                settings[index] = convert(value)
            else:
                # An unknown key used to be taken for "plot" silently.
                raise ValueError("unknown setting %r in %s" % (key, args.json))

    # The command line wins over the json file (this now includes --longitude,
    # which used to be parsed but never applied).
    for key, (index, convert) in _SETTINGS.items():
        value = getattr(args, key)
        if value is not None:
            settings[index] = convert(value)
    if args.predictor:
        settings[_PREDICTOR] = 'y'

    return tuple(settings) + (args.extra_data, args.file)


def set_up(tup):
    """
    Processes and sets up data for models/graphing

    @type tup: tuple
    @param tup: the settings, as returned by arg_parse_and_json()
    @rtype: tuple
    @returns: a tuple containing key variables for future methods
    """

    (delimiter, lat, lon, dep, range_name, tol, outlier_tol, range_min, range_max,
     ans, tol_mse, ans1, length, ans2, ans3, user, file_name) = tup[:17]

    read_file = CsvLoader(name=file_name, delimiter=delimiter).load()

    file_ranged = StepRange(step_name=range_name, step_min=range_min, step_max=range_max, file=read_file).process()

    # The distances are measured on the step-filtered rows so that entry i
    # describes row i of the frame handed to DropClose; they used to be taken
    # from the unfiltered file and were misaligned unless the range began at row 0.
    distance_arr = DistanceCalculator(lat_name=lat, long_name=lon, file=file_ranged).process()

    close_dropped = DropClose(dep_name=dep, dist_arr=distance_arr, tolerance=tol, file=file_ranged).process()

    outlier_removed = OutlierRemoval(tolerance=outlier_tol, user_name=user, file=close_dropped).process()

    # rows were removed, so the distances are recomputed for the final frame
    distance_arr = DistanceCalculator(lat_name=lat, long_name=lon, file=outlier_removed).process()

    return (lat, lon, dep, ans, tol_mse, ans1, length, ans2, ans3, distance_arr, outlier_removed, user)


def execute_model(tup):
    """
    Creates a fit model and graphs predicted interpolation data

    @type tup: tuple
    @param tup: the values returned by set_up()
    @rtype: tuple
    @returns: a tuple containing key variables for future methods
    """

    lat, lon, dep, ans, tol_mse, ans1, length, ans2, ans3, distance_arr, outlier_removed, user = tup[:12]

    if ans == "y":
        import matplotlib as mpl
        from matplotlib import pyplot as plt

        Gauss = GPR(dist_array=distance_arr, user_name=user, dep_name=dep, length=length, file=outlier_removed, lat_n=lat, lon_n=lon, dims=ans2)
        svr_mod = SV(dist_arr=distance_arr, user_name=user, dep_name=dep, length=length, file=outlier_removed, lat_n=lat, lon_n=lon, dims=ans2)
        if ans1 == 1:
            model_manager = ModelManager(primary=Gauss, secondary=svr_mod, tolerance=tol_mse, dimension=ans2)
        else:
            model_manager = ModelManager(primary=svr_mod, secondary=Gauss, tolerance=tol_mse, dimension=ans2)
        model_manager.fit()
        gridpoints = model_manager.predict()

        cmap = mpl.cm.jet
        # The extra column of a GPR prediction comes from predict(..., return_std=True),
        # i.e. it is a standard deviation, not a variance.
        spread_label = "Standard deviation (" + str(user) + ")"
        if ans2 == 2:
            xs = gridpoints[0]
            # depth is drawn downwards; negate a copy, never the model's own list
            ys = [-depth for depth in gridpoints[1]]
            layers = [(gridpoints[2], user)]
            if len(gridpoints) == 4:
                layers.append((gridpoints[3], spread_label))
            for colours, label in layers:
                plt.figure()
                plt.xlabel("Distance (m)")
                plt.ylabel(dep)
                cb = plt.colorbar(plt.scatter(xs, ys, c=colours, cmap=cmap))
                cb.set_label(label)
        else:
            from mpl_toolkits import mplot3d  # noqa: F401 -- registers the '3d' projection on older Matplotlib

            xs, ys = gridpoints[0], gridpoints[1]
            zs = [-depth for depth in gridpoints[2]]
            layers = [(gridpoints[3], user)]
            if len(gridpoints) == 5:
                layers.append((gridpoints[4], spread_label))
            for colours, label in layers:
                ax = plt.figure().add_subplot(111, projection='3d')
                ax.set_xlabel(lat)
                ax.set_ylabel(lon)
                ax.set_zlabel(dep)
                cb = plt.colorbar(ax.scatter(xs, ys, zs, c=colours, cmap=cmap))
                cb.set_label(label)
        plt.show()
    return (lat, lon, dep, ans3, distance_arr, outlier_removed, user)


def plot_data(tup):
    """
    Plots and shows the hard data

    @type tup: tuple
    @param tup: the values returned by execute_model()
    """

    lat, lon, dep, ans3, distance_arr, outlier_removed, user = tup[:7]
    if ans3 == 2:
        graph = TwoDVis(dep_name=dep, dist_arr=distance_arr, user_name=user, file=outlier_removed)
    else:
        graph = ThreeDVis(dep_name=dep, dist_arr=distance_arr, user_name=user, lat_name=lat, lon_name=lon, file=outlier_removed)
    graph.plot()


def main():
    """
    Runs the whole pipeline: defaults, argument parsing, clean-up, model, plot
    """

    a = set_defaults()
    b = arg_parse_and_json(a)
    c = set_up(b)
    d = execute_model(c)
    plot_data(d)


# Guarded so that importing the module no longer parses sys.argv and opens plots.
if __name__ == "__main__":
    main()
from abc import ABC


# ---------------------------------------------------------------------------
# visualizers/visualization.py
# ---------------------------------------------------------------------------
class Visualization(ABC):
    """
    Common interface of the hard-data plots
    """

    def plot(self):
        """
        Draws the plot (no-op in the base class)
        """
        pass


# ---------------------------------------------------------------------------
# visualizers/three_d_vis.py
# ---------------------------------------------------------------------------
class ThreeDVis(Visualization):
    def __init__(self, dep_name, dist_arr, user_name, lat_name, lon_name, file):
        """
        Creates and plots a 3D visualization of the hard data

        @type dep_name: str
        @param dep_name: the name of the depth column in the csv file
        @type dist_arr: array-like
        @param dist_arr: the array that contains each point's distance from the original point
        @type user_name: str
        @param user_name: the name of the column of the user-selected data in the csv file
        @type lat_name: str
        @param lat_name: the name of the latitude column in the csv file
        @type lon_name: str
        @param lon_name: the name of the longitude column in the csv file
        @type file: Pandas DataFrame
        @param file: the pandas dataframe containing the read data from the csv file
        """

        self.depth = file[dep_name].values.tolist()
        self.user = file[user_name].values
        self.dist = dist_arr
        self.lat = file[lat_name].values
        self.lon = file[lon_name].values
        self.lat_n = lat_name
        self.lon_n = lon_name
        self.user_n = user_name
        self.dep_n = dep_name

    def plot(self):
        """
        Plots and shows a 3D graph of the hard data
        """

        # Imported here so that the class can be built without Matplotlib.
        import matplotlib as mpl
        from matplotlib import pyplot as plt
        from mpl_toolkits import mplot3d  # noqa: F401 -- registers the '3d' projection on older Matplotlib

        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        # Depth is drawn downwards. The sign is flipped on a copy: flipping
        # self.depth itself made a second plot() call draw it upside down.
        downward = [-depth for depth in self.depth]
        points = ax.scatter(self.lat, self.lon, downward, c=self.user, cmap=mpl.cm.jet)
        ax.set_xlabel(self.lat_n)
        ax.set_ylabel(self.lon_n)
        ax.set_zlabel(self.dep_n)
        cb = plt.colorbar(points)
        cb.set_label(self.user_n)
        plt.show()


# ---------------------------------------------------------------------------
# visualizers/two_d_vis.py
# ---------------------------------------------------------------------------
class TwoDVis(Visualization):
    def __init__(self, dep_name, dist_arr, user_name, file):
        """
        Creates and plots a 2D visualization of the hard data

        @type dep_name: str
        @param dep_name: the name of the depth column in the csv file
        @type dist_arr: array-like
        @param dist_arr: the array that contains each point's distance from the original point
        @type user_name: str
        @param user_name: the name of the column of the user-selected data in the csv file
        @type file: Pandas DataFrame
        @param file: the pandas dataframe containing the read data from the csv file
        """

        self.depth = file[dep_name].values.tolist()
        self.user = file[user_name].values
        self.dist = dist_arr
        self.dep_n = dep_name
        self.user_n = user_name

    def plot(self):
        """
        Plots and shows a 2D graph of the hard data
        """

        # Imported here so that the class can be built without Matplotlib.
        import matplotlib as mpl
        from matplotlib import pyplot as plt

        plt.figure()
        plt.xlabel("Distance")
        plt.ylabel(self.dep_n)
        # Depth is drawn downwards. The sign is flipped on a copy: flipping
        # self.depth itself made a second plot() call draw it upside down.
        downward = [-depth for depth in self.depth]
        points = plt.scatter(self.dist, downward, c=self.user, cmap=mpl.cm.jet)
        cb_title = plt.colorbar(points)
        cb_title.set_label(self.user_n)
        plt.show()