Merge pull request #27 from joristaglio/master

Fixed batch processing/multi-column testing, deprecated unused parameters
datarobot · Mar 1, 2018 · ea8433f · ea8433f
2 parents c44e1cb + 9206ef8
commit ea8433f
Show file tree

Hide file tree

Showing 36 changed files with 230 additions and 208 deletions.
diff --git a/.pytest_cache/v/cache/lastfailed b/.pytest_cache/v/cache/lastfailed
@@ -5,5 +5,26 @@
   "tests/image_featurizer_test.py::test_load_and_featurize_data_multiple_columns[squeezenet]": true, 
   "tests/image_featurizer_test.py::test_load_and_featurize_data_multiple_columns[vgg16]": true, 
   "tests/image_featurizer_test.py::test_load_and_featurize_data_multiple_columns[vgg19]": true, 
-  "tests/image_featurizer_test.py::test_load_and_featurize_data_multiple_columns[xception]": true
+  "tests/image_featurizer_test.py::test_load_and_featurize_data_multiple_columns[xception]": true, 
+  "tests/image_featurizer_test.py::test_load_and_featurize_data_multiple_columns_no_batch_processing[inceptionv3]": true, 
+  "tests/image_featurizer_test.py::test_load_and_featurize_data_multiple_columns_no_batch_processing[resnet50]": true, 
+  "tests/image_featurizer_test.py::test_load_and_featurize_data_multiple_columns_no_batch_processing[xception]": true, 
+  "tests/image_featurizer_test.py::test_load_and_featurize_data_multiple_columns_with_batch_processing[inceptionv3]": true, 
+  "tests/image_featurizer_test.py::test_load_and_featurize_data_multiple_columns_with_batch_processing[resnet50]": true, 
+  "tests/image_featurizer_test.py::test_load_and_featurize_data_multiple_columns_with_batch_processing[squeezenet]": true, 
+  "tests/image_featurizer_test.py::test_load_and_featurize_data_multiple_columns_with_batch_processing[vgg16]": true, 
+  "tests/image_featurizer_test.py::test_load_and_featurize_data_multiple_columns_with_batch_processing[vgg19]": true, 
+  "tests/image_featurizer_test.py::test_load_and_featurize_data_multiple_columns_with_batch_processing[xception]": true, 
+  "tests/image_featurizer_test.py::test_load_and_featurize_single_column_no_batch_processing[inceptionv3]": true, 
+  "tests/image_featurizer_test.py::test_load_and_featurize_single_column_no_batch_processing[resnet50]": true, 
+  "tests/image_featurizer_test.py::test_load_and_featurize_single_column_no_batch_processing[squeezenet]": true, 
+  "tests/image_featurizer_test.py::test_load_and_featurize_single_column_no_batch_processing[vgg16]": true, 
+  "tests/image_featurizer_test.py::test_load_and_featurize_single_column_no_batch_processing[vgg19]": true, 
+  "tests/image_featurizer_test.py::test_load_and_featurize_single_column_no_batch_processing[xception]": true, 
+  "tests/image_featurizer_test.py::test_load_and_featurize_single_column_with_batch_processing[inceptionv3]": true, 
+  "tests/image_featurizer_test.py::test_load_and_featurize_single_column_with_batch_processing[resnet50]": true, 
+  "tests/image_featurizer_test.py::test_load_and_featurize_single_column_with_batch_processing[squeezenet]": true, 
+  "tests/image_featurizer_test.py::test_load_and_featurize_single_column_with_batch_processing[vgg16]": true, 
+  "tests/image_featurizer_test.py::test_load_and_featurize_single_column_with_batch_processing[vgg19]": true, 
+  "tests/image_featurizer_test.py::test_load_and_featurize_single_column_with_batch_processing[xception]": true
 }
diff --git a/pic2vec/__init__.py b/pic2vec/__init__.py
@@ -11,7 +11,7 @@
                                                _initialize_model, _check_downsampling_mismatch,
                                                build_featurizer)
 
-from pic2vec.feature_preprocessing import (_create_csv_with_image_paths,  # NOQA
+from pic2vec.feature_preprocessing import (_create_df_with_image_paths,  # NOQA
                                                     _find_directory_image_paths,
                                                     _find_csv_image_paths,
                                                     _find_combined_image_paths,

diff --git a/pic2vec/data_featurizing.py b/pic2vec/data_featurizing.py
@@ -112,7 +112,7 @@ def _named_path_finder(csv_name, model_str, model_depth, model_output,
     return named_path
 
 
-def _create_features_df_helper(data_array, full_feature_array, image_column_header, df):
+def _create_features_df_helper(data_array, full_feature_array, image_column_header):
     # Log how many photos are missing or blank:
     zeros_index = [np.count_nonzero(array_slice) == 0 for array_slice in data_array[:]]
     logging.info('Number of missing photos: {}'.format(len(zeros_index)))
@@ -128,13 +128,12 @@ def _create_features_df_helper(data_array, full_feature_array, image_column_head
     df_missing = pd.DataFrame(data=zeros_index, columns=missing_column_header)
 
     # Create the full combined csv+features dataframe
-    df_full = pd.concat([df, df_missing, df_features], axis=1)
+    df_features_full = pd.concat([df_missing, df_features], axis=1)
 
-    return df_full, df_features
+    return df_features_full
 
 
-def create_features(data_array, new_feature_array, df_prev, image_column_header,
-                    image_list, continued_column=False, df_features_prev=pd.DataFrame(),
+def create_features(data_array, new_feature_array, image_column_header,
                     save_features=False):
     """
     Write the feature array to a new csv, and append the features to the appropriate
@@ -166,12 +165,6 @@ def create_features(data_array, new_feature_array, df_prev, image_column_header,
 
     # -------------- #
     # ERROR CHECKING #
-
-    # Raise error if the image_column_header is not in the csv
-    if image_column_header not in df_prev.columns:
-        raise ValueError('Must pass the name of the column where the images are '
-                         'stored in the csv. The column passed was not in the csv.')
-
     # Raise error if the data array has the wrong shape
     if len(data_array.shape) != 4:
         raise ValueError('Data array must be 4D array, with shape: [batch, height, width, channel].'
@@ -185,11 +178,8 @@ def create_features(data_array, new_feature_array, df_prev, image_column_header,
 
     logging.info('Adding image features to csv.')
 
-    df_full, df_features = _create_features_df_helper(data_array, new_feature_array,
-                                                      image_column_header, df_prev)
-
-    if continued_column and save_features:
-        df_features = pd.concat([df_features_prev, df_features], axis=1)
+    df_features = _create_features_df_helper(data_array, new_feature_array,
+                                             image_column_header)
 
     # Return the full combined dataframe
-    return df_full, df_features
+    return df_features
diff --git a/pic2vec/feature_preprocessing.py b/pic2vec/feature_preprocessing.py
@@ -63,7 +63,7 @@
 }
 
 
-def _create_csv_with_image_paths(list_of_images, new_csv_name, image_column_header, save_csv):
+def _create_df_with_image_paths(list_of_images, image_column_header):
     """
     Take in a list of image names, and create a new csv file where each
     image name is a new row.
@@ -86,11 +86,6 @@ def _create_csv_with_image_paths(list_of_images, new_csv_name, image_column_head
 
     """
     df = pd.DataFrame(list_of_images, columns=[image_column_header])
-
-    if save_csv:
-        _create_csv_path(new_csv_name)
-        df.to_csv(new_csv_name, index=False)
-
     return df
 
 
@@ -244,7 +239,7 @@ def _find_combined_image_paths(image_path, csv_path, image_column_header):
     return list_of_images, df
 
 
-def _image_paths_finder(image_path, csv_path, image_column_header, new_csv_name, save_csv):
+def _image_paths_finder(image_path, csv_path, image_column_header, new_csv_name):
     """
     Given an image column header, and either a csv path or an image directory,
     find the list of image paths. If just a csv, it's pulled from the column.
@@ -278,9 +273,8 @@ def _image_paths_finder(image_path, csv_path, image_column_header, new_csv_name,
         list_of_images = _find_directory_image_paths(image_path)
 
         # Create the new csv in a folder called 'featurizer_csv/'
-        df = _create_csv_with_image_paths(list_of_images, new_csv_name=new_csv_name,
-                                          image_column_header=image_column_header,
-                                          save_csv=save_csv)
+        df = _create_df_with_image_paths(list_of_images,
+                                         image_column_header=image_column_header)
 
         logging.warning('Created csv from directory. Stored at {}'.format(new_csv_name))
 

diff --git a/pic2vec/image_featurizer.py b/pic2vec/image_featurizer.py
@@ -189,8 +189,7 @@ def load_data(self,
                   csv_path='',
                   new_csv_name='featurizer_csv/generated_images_csv',
                   grayscale=False,
-                  save_data=True,
-                  save_csv=False
+                  save_data=True
                   # crop_size = (299, 299),
                   # number_crops = 0,
                   # random_crop = False,
@@ -245,13 +244,13 @@ def load_data(self,
                 raise ValueError('If building the csv from an image directory, the featurizer can '
                                  'only create a single image column. If two image columns are '
                                  'needed, please create a csv to pass in.')
-            _create_csv_path(new_csv_name)
 
         # If the image_dict hasn't been passed in (which only happens in batch processing),
         # build the full image dict and save the original dataframe
         if not image_dict:
-            image_dict, df = self._full_image_dict_finder(image_path, csv_path, image_column_headers,
-                                                          new_csv_name, save_csv)
+            image_dict, df = self._full_image_dict_finder(image_path, csv_path,
+                                                          image_column_headers,
+                                                          new_csv_name)
             self.df_original = df
             self.full_dataframe = df
             self.image_column_headers = image_column_headers
@@ -268,7 +267,6 @@ def load_data(self,
         self.csv_path = csv_path
         self.image_path = image_path
         self.scaled_size = scaled_size
-
         return full_image_data
 
     @t.guard(batch_data=t.Type(np.ndarray),
@@ -318,13 +316,13 @@ def featurize(self, batch_data=np.zeros((1)), image_column_headers='',
         if batch_processing:
             assert len(image_column_headers) == 1 or isinstance(image_column_headers, str)
         else:
-            assert len(image_column_headers) == self.data.shape[0]
+            assert len(image_column_headers) == batch_data.shape[0]
         logging.info("Trying to featurize data.")
 
         # Initialize featurized data vector with appropriate size
         features = np.zeros((batch_data.shape[1],
                              self.num_features * len(image_column_headers)))
-
+        print(features.shape)
         # Save csv
         full_dataframe, df_features = self._featurize_helper(
             features, image_column_headers, save_features, batch_data)
@@ -336,7 +334,6 @@ def featurize(self, batch_data=np.zeros((1)), image_column_headers='',
             self.save_csv(omit_model, omit_depth, omit_output, omit_time)
 
         self.full_dataframe = full_dataframe
-
         return full_dataframe, df_features
 
     def load_and_featurize_data(self,
@@ -414,7 +411,7 @@ def load_and_featurize_data(self,
         # how many images exist in total, to control batch processing.
         full_image_dict, df_original = self._full_image_dict_finder(image_path, csv_path,
                                                                     image_column_headers,
-                                                                    new_csv_name, save_csv)
+                                                                    new_csv_name)
         # Save the fixed inputs and full image dict
         self.df_original = df_original
         self.image_column_headers = image_column_headers
@@ -435,7 +432,7 @@ def load_and_featurize_data(self,
         # If batch processing is turned off, load the images in one big batch and featurizes them all
         else:
             full_data = self.load_data(image_column_headers, image_path, full_image_dict, csv_path,
-                                       new_csv_name, grayscale, save_data, save_csv)
+                                       new_csv_name, grayscale, save_data)
 
             full_df, features_df = \
                 self.featurize(full_data, image_column_headers=image_column_headers,
@@ -521,30 +518,30 @@ def _load_data_helper(self,
 
     def _featurize_helper(self, features, image_column_headers,
                           save_features, batch_data):
-        full_dataframe = pd.DataFrame()
-        # For each image column, perform the full featurization and add the features to the csv
+
+        # Save the initial features list
+        features_list = []
+
+        # For each image column, perform the full featurization and add the features to the df
         for column in range(batch_data.shape[0]):
-            if not column:
-                df_prev = self.df_original
-            else:
-                df_prev = self.full_dataframe
-            print(df_prev)
             # Featurize the data, and save it to the appropriate columns
             partial_features = featurize_data(self.featurizer, batch_data[column])
 
             features[:, self.num_features * column:self.num_features * column + self.num_features]\
                 = partial_features
 
             # Save the full dataframe
-            column_dataframe, df_features = \
+            df_features = \
                 create_features(batch_data[column],
-                                features,
-                                df_prev,
+                                partial_features,
                                 image_column_headers[column],
-                                self.image_dict[image_column_headers[column]],
-                                continued_column=bool(column),
                                 save_features=save_features)
 
+            features_list.append(df_features)
+
+        df_features = pd.concat(features_list, axis=1)
+        full_dataframe = pd.concat([self.df_original, df_features], axis=1)
+
         return full_dataframe, df_features
 
     def _batch_processing(self,
@@ -585,12 +582,12 @@ def _batch_processing(self,
                 # Load the images
                 batch_data = self.load_data(column, image_path,
                                             batch_image_dict, csv_path, new_csv_name,
-                                            grayscale, False, False)
+                                            grayscale, save_data=False)
 
                 # If this is the first batch, the batch features will be saved alone.
                 # Otherwise, they are concatenated to the last batch
                 batch_features_list.append(self.featurize(batch_data, column,
-                                                          True, save_features)[1])
+                                                          save_features, batch_processing=True)[1])
 
                 # Increment index by batch size
                 index += batch_size
@@ -605,13 +602,11 @@ def _batch_processing(self,
         # Return the full dataframe and features dataframe
         return full_df, full_features_df
 
-    def _full_image_dict_finder(self, image_path, csv_path, image_column_headers, new_csv_name,
-                                save_csv):
+    def _full_image_dict_finder(self, image_path, csv_path, image_column_headers, new_csv_name):
         full_image_dict = {}
-
         for column in image_column_headers:
             list_of_image_paths, df = _image_paths_finder(image_path, csv_path,
-                                                          column, new_csv_name, save_csv)
+                                                          column, new_csv_name)
 
             full_image_dict[column] = list_of_image_paths
         return full_image_dict, df

diff --git a/tests/ImageFeaturizer_testing/array_tests/check_prediction_array_inceptionv3.npy b/tests/ImageFeaturizer_testing/array_tests/check_prediction_array_inceptionv3.npy
diff --git a/tests/ImageFeaturizer_testing/array_tests/check_prediction_array_inceptionv3_mult.npy b/tests/ImageFeaturizer_testing/array_tests/check_prediction_array_inceptionv3_mult.npy
diff --git a/tests/ImageFeaturizer_testing/array_tests/check_prediction_array_resnet50.npy b/tests/ImageFeaturizer_testing/array_tests/check_prediction_array_resnet50.npy
diff --git a/tests/ImageFeaturizer_testing/array_tests/check_prediction_array_resnet50_mult.npy b/tests/ImageFeaturizer_testing/array_tests/check_prediction_array_resnet50_mult.npy
diff --git a/tests/ImageFeaturizer_testing/array_tests/check_prediction_array_squeezenet.npy b/tests/ImageFeaturizer_testing/array_tests/check_prediction_array_squeezenet.npy
diff --git a/tests/ImageFeaturizer_testing/array_tests/check_prediction_array_squeezenet_mult.npy b/tests/ImageFeaturizer_testing/array_tests/check_prediction_array_squeezenet_mult.npy
diff --git a/tests/ImageFeaturizer_testing/array_tests/check_prediction_array_vgg16.npy b/tests/ImageFeaturizer_testing/array_tests/check_prediction_array_vgg16.npy
diff --git a/tests/ImageFeaturizer_testing/array_tests/check_prediction_array_vgg16_mult.npy b/tests/ImageFeaturizer_testing/array_tests/check_prediction_array_vgg16_mult.npy
diff --git a/tests/ImageFeaturizer_testing/array_tests/check_prediction_array_vgg19.npy b/tests/ImageFeaturizer_testing/array_tests/check_prediction_array_vgg19.npy
diff --git a/tests/ImageFeaturizer_testing/array_tests/check_prediction_array_vgg19_mult.npy b/tests/ImageFeaturizer_testing/array_tests/check_prediction_array_vgg19_mult.npy
diff --git a/tests/ImageFeaturizer_testing/array_tests/check_prediction_array_xception.npy b/tests/ImageFeaturizer_testing/array_tests/check_prediction_array_xception.npy
diff --git a/tests/ImageFeaturizer_testing/array_tests/check_prediction_array_xception_mult.npy b/tests/ImageFeaturizer_testing/array_tests/check_prediction_array_xception_mult.npy
diff --git a/tests/ImageFeaturizer_testing/csv_checking/inceptionv3_check_csv b/tests/ImageFeaturizer_testing/csv_checking/inceptionv3_check_csv
diff --git a/tests/ImageFeaturizer_testing/csv_checking/inceptionv3_check_csv_mult b/tests/ImageFeaturizer_testing/csv_checking/inceptionv3_check_csv_mult
diff --git a/...rizer_testing/csv_checking/mult_check_csv → ...r_testing/csv_checking/mult_check_csv.csv b/...rizer_testing/csv_checking/mult_check_csv → ...r_testing/csv_checking/mult_check_csv.csv
diff --git a/tests/ImageFeaturizer_testing/csv_checking/resnet50_check_csv b/tests/ImageFeaturizer_testing/csv_checking/resnet50_check_csv
diff --git a/tests/ImageFeaturizer_testing/csv_checking/resnet50_check_csv_mult b/tests/ImageFeaturizer_testing/csv_checking/resnet50_check_csv_mult