From 7d38e102a62fa11270a6470f160f8651cd3d8abf Mon Sep 17 00:00:00 2001 From: Kayla Hardie Date: Mon, 18 Mar 2024 23:19:05 +0000 Subject: [PATCH 01/31] Create ray conda environment --- environment_maple.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/environment_maple.yml b/environment_maple.yml index 06d1303..7f89678 100644 --- a/environment_maple.yml +++ b/environment_maple.yml @@ -1,4 +1,4 @@ -name: maple_py310 +name: maple_py310_ray channels: - defaults dependencies: @@ -11,3 +11,6 @@ dependencies: - pip: - tensorflow[and-cuda] - opencv-python + - ray + - pandas==2.1.4 + - pyarrow From c89494f350422507ec273591305ce3acc30dff89 Mon Sep 17 00:00:00 2001 From: Kayla Hardie Date: Mon, 18 Mar 2024 23:43:59 +0000 Subject: [PATCH 02/31] Format file --- maple_workflow.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/maple_workflow.py b/maple_workflow.py index 9afb28b..ca52581 100644 --- a/maple_workflow.py +++ b/maple_workflow.py @@ -60,7 +60,8 @@ def tile_image(config: MPL_Config, input_img_name: str): # Create subfolder for each image new_file_name = input_img_name.split(".tif")[0] - worker_divided_img_subroot = os.path.join(worker_divided_img_root, new_file_name) + worker_divided_img_subroot = os.path.join( + worker_divided_img_root, new_file_name) print(worker_divided_img_subroot) @@ -93,7 +94,8 @@ def cal_water_mask(config: MPL_Config, input_img_name: str): """ image_file_name = (input_img_name).split(".tif")[0] - worker_water_root = config.WATER_MASK_DIR # os.path.join(worker_root, "water_shp") + # os.path.join(worker_root, "water_shp") + worker_water_root = config.WATER_MASK_DIR temp_water_root = ( config.TEMP_W_IMG_DIR ) # os.path.join(worker_root, "temp_8bitmask") @@ -130,7 +132,7 @@ def cal_water_mask(config: MPL_Config, input_img_name: str): # %% Median and Otsu value = 5 - ### UPDATED CODE - amal 01/11/2023 + # UPDATED CODE - amal 01/11/2023 # cmd line execution thrown exceptions unable to capture # Using gdal to execute the gdal_Translate # output file checked against the cmd line gdal_translate @@ -165,7 +167,8 @@ def cal_water_mask(config: MPL_Config, input_img_name: str): # dealing with a single channel so we divide all the pixels in the image by # 255 to get a value between [0, 1]. 
normalized_bilat_img = bilat_img / 255 - normalized_blurred_bilat_img = filters.gaussian(normalized_bilat_img, sigma=2.0) + normalized_blurred_bilat_img = filters.gaussian( + normalized_bilat_img, sigma=2.0) # find the threshold to filter if all values are same otsu cannot find value # hence t is made to 0.0 @@ -203,7 +206,8 @@ def infer_image(config: MPL_Config, input_img_name: str): # Create subfolder for each image new_file_name = input_img_name.split(".tif")[0] - worker_divided_img_subroot = os.path.join(worker_divided_img_root, new_file_name) + worker_divided_img_subroot = os.path.join( + worker_divided_img_root, new_file_name) print(worker_divided_img_subroot) @@ -211,14 +215,14 @@ def infer_image(config: MPL_Config, input_img_name: str): file2 = os.path.join(worker_divided_img_subroot, "image_param.h5") worker_output_shp_root = config.OUTPUT_SHP_DIR - worker_output_shp_subroot = os.path.join(worker_output_shp_root, new_file_name) + worker_output_shp_subroot = os.path.join( + worker_output_shp_root, new_file_name) try: shutil.rmtree(worker_output_shp_subroot) except: print("directory deletion failed") pass - inference.inference_image( config, worker_output_shp_subroot, @@ -251,8 +255,10 @@ def stich_shapefile(config: MPL_Config, input_img_name: str): # Create subfolder for each image within the worker img root new_file_name = input_img_name.split(".tif")[0] - worker_finaloutput_subroot = os.path.join(worker_finaloutput_root, new_file_name) - worker_output_shp_subroot = os.path.join(worker_output_shp_root, new_file_name) + worker_finaloutput_subroot = os.path.join( + worker_finaloutput_root, new_file_name) + worker_output_shp_subroot = os.path.join( + worker_output_shp_root, new_file_name) try: shutil.rmtree(worker_finaloutput_subroot) From 156fdfbcfa731f2d990a37c02428fe7fbab4b182 Mon Sep 17 00:00:00 2001 From: Kayla Hardie Date: Mon, 18 Mar 2024 23:48:09 +0000 Subject: [PATCH 03/31] Load geotiffs into ray dataset --- maple_workflow.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/maple_workflow.py b/maple_workflow.py index ca52581..5b07c03 100644 --- a/maple_workflow.py +++ b/maple_workflow.py @@ -23,6 +23,7 @@ import mpl_stitchshpfile_new as stich import numpy as np import os +import ray import sys import shutil from pathlib import Path @@ -32,12 +33,37 @@ from skimage import color, filters, io from skimage.morphology import disk import tensorflow as tf +from typing import Any, Dict # work tag WORKTAG = 1 DIETAG = 0 +def create_geotiff_images_dataset(input_image_dir: str) -> ray.data.Dataset: + return ray.data.read_binary_files(input_image_dir, include_paths=True) + + +def add_virtual_GDAL_file_path(row: Dict[str, Any]) -> Dict[str, Any]: + vfs_dir_path = '/vsimem/vsidir/' + image_file_name = os.path.basename(row["path"]) + vfs_filename = os.path.join(vfs_dir_path, image_file_name) + dst = gdal.VSIFOpenL(vfs_filename, 'wb+') + gdal.VSIFWriteL(row["bytes"], 1, len(row["bytes"]), dst) + gdal.VSIFCloseL(dst) + row["image_file_name"] = image_file_name + row["vfs_image_path"] = vfs_filename + return row + + +def test_gdal_operation(row: Dict[str, Any]) -> Dict[str, Any]: + input_image_gtif = gdal.Open(row["path"]) + vfs_image_gtif = gdal.Open(row["vfs_image_path"]) + row["gdal_test"] = np.array_equal(input_image_gtif.GetRasterBand( + 1).ReadAsArray(), vfs_image_gtif.GetRasterBand(1).ReadAsArray()) + return row + + def tile_image(config: MPL_Config, input_img_name: str): """ Tile the image into multiple pre-deifined sized parts so that the processing 
can be done on smaller parts due to @@ -326,6 +352,10 @@ def stich_shapefile(config: MPL_Config, input_img_name: str): args.root_dir, args.weight_file, num_gpus_per_core=args.gpus_per_core ) + print("0. load geotiffs into ray dataset") + dataset = create_geotiff_images_dataset( + config.INPUT_IMAGE_DIR).map(add_virtual_GDAL_file_path) + print("ray dataset:", dataset.schema()) print("1.start caculating wartermask") cal_water_mask(config, image_name) print("2. start tiling image") From de4a0193dc5103d0b2a242997c83e7256fdc9c34 Mon Sep 17 00:00:00 2001 From: Kayla Hardie Date: Mon, 18 Mar 2024 23:52:49 +0000 Subject: [PATCH 04/31] Calculate Water Mask --- maple_workflow.py | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/maple_workflow.py b/maple_workflow.py index 5b07c03..722ea83 100644 --- a/maple_workflow.py +++ b/maple_workflow.py @@ -109,16 +109,16 @@ def tile_image(config: MPL_Config, input_img_name: str): print("finished tiling") -def cal_water_mask(config: MPL_Config, input_img_name: str): +def cal_water_mask(row: Dict[str, Any], config: MPL_Config) -> Dict[str, Any]: """ This will calculate the water mask to avoid (inference) processing of the masked areas with water Uses gdal to transform the image into the required format. Parameters ---------- + row : Row in Ray Dataset, there is one row per image. config : Contains static configuration information regarding the workflow. - input_img_name : Name of the input image """ - image_file_name = (input_img_name).split(".tif")[0] + image_name = row["image_file_name"].split(".tif")[0] # os.path.join(worker_root, "water_shp") worker_water_root = config.WATER_MASK_DIR @@ -126,8 +126,8 @@ def cal_water_mask(config: MPL_Config, input_img_name: str): config.TEMP_W_IMG_DIR ) # os.path.join(worker_root, "temp_8bitmask") - worker_water_subroot = os.path.join(worker_water_root, image_file_name) - temp_water_subroot = os.path.join(temp_water_root, image_file_name) + worker_water_subroot = os.path.join(worker_water_root, image_name) + temp_water_subroot = os.path.join(temp_water_root, image_name) # Prepare to make directories to create the files try: shutil.rmtree(worker_water_subroot) @@ -146,18 +146,18 @@ def cal_water_mask(config: MPL_Config, input_img_name: str): Path(temp_water_subroot).mkdir(parents=True, exist_ok=True) output_watermask = os.path.join( - worker_water_subroot, r"%s_watermask.tif" % image_file_name + worker_water_subroot, r"%s_watermask.tif" % image_name ) output_tif_8b_file = os.path.join( - temp_water_subroot, r"%s_8bit.tif" % image_file_name + temp_water_subroot, r"%s_8bit.tif" % image_name ) nir_band = 3 # set number of NIR band - input_image = os.path.join(config.INPUT_IMAGE_DIR, input_img_name) - # %% Median and Otsu value = 5 + input_image_file_path = row["vfs_image_path"] + # UPDATED CODE - amal 01/11/2023 # cmd line execution thrown exceptions unable to capture # Using gdal to execute the gdal_Translate @@ -166,7 +166,7 @@ def cal_water_mask(config: MPL_Config, input_img_name: str): try: gdal.Translate( destName=output_tif_8b_file, - srcDS=input_image, + srcDS=input_image_file_path, format="GTiff", outputType=gdal.GDT_Byte, ) @@ -181,7 +181,7 @@ def cal_water_mask(config: MPL_Config, input_img_name: str): bilat_img = filters.rank.median(nir, disk(value)) - gtif = gdal.Open(input_image) + gtif = gdal.Open(input_image_file_path) geotransform = gtif.GetGeoTransform() sourceSR = gtif.GetProjection() @@ -214,6 +214,8 @@ def cal_water_mask(config: MPL_Config, input_img_name: 
str): dst_ds.SetProjection(sourceSR) dst_ds.FlushCache() dst_ds = None + row["mask"] = mask + return row def infer_image(config: MPL_Config, input_img_name: str): @@ -356,8 +358,10 @@ def stich_shapefile(config: MPL_Config, input_img_name: str): dataset = create_geotiff_images_dataset( config.INPUT_IMAGE_DIR).map(add_virtual_GDAL_file_path) print("ray dataset:", dataset.schema()) - print("1.start caculating wartermask") - cal_water_mask(config, image_name) + print("1. start calculating watermask") + dataset_with_water_mask = dataset.map(fn=cal_water_mask, + fn_kwargs={"config": config}) + print("dataset with water mask: ", dataset_with_water_mask.schema()) print("2. start tiling image") tile_image(config, image_name) print("3. start inferencing") From 9eb36fd4e57820092566f8568800f189b08fb316 Mon Sep 17 00:00:00 2001 From: Kayla Hardie Date: Mon, 18 Mar 2024 23:57:58 +0000 Subject: [PATCH 05/31] Apply water mask and tile image --- maple_workflow.py | 199 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 153 insertions(+), 46 deletions(-) diff --git a/maple_workflow.py b/maple_workflow.py index 722ea83..51a5af5 100644 --- a/maple_workflow.py +++ b/maple_workflow.py @@ -16,6 +16,7 @@ """ import argparse +import cv2 import mpl_clean_inference as inf_clean import mpl_divideimg_234_water_new as divide import mpl_infer_tiles_GPU_new as inference @@ -28,6 +29,7 @@ import shutil from pathlib import Path +from dataclasses import dataclass from mpl_config import MPL_Config from osgeo import gdal from skimage import color, filters, io @@ -40,6 +42,27 @@ DIETAG = 0 +@dataclass +class ImageMetadata: + dict_i_j: dict[float, dict[float, float]] + dict_n: dict[float, list[float, float]] + x_resolution: float + y_resolution: float + + +@dataclass +class ImageTileMetadata: + upper_left_row: float + upper_left_col: float + tile_num: int + + +@dataclass +class ImageTile: + tile_values: np.array + tile_metadata: ImageTileMetadata + + def create_geotiff_images_dataset(input_image_dir: str) -> ray.data.Dataset: return ray.data.read_binary_files(input_image_dir, include_paths=True) @@ -64,51 +87,6 @@ def test_gdal_operation(row: Dict[str, Any]) -> Dict[str, Any]: return row -def tile_image(config: MPL_Config, input_img_name: str): - """ - Tile the image into multiple pre-deifined sized parts so that the processing can be done on smaller parts due to - processing limitations - - Parameters - ---------- - config : Contains static configuration information regarding the workflow. 
- input_img_name : Name of the input image - """ - sys.path.append(config.ROOT_DIR) - - crop_size = config.CROP_SIZE - - # worker roots - worker_img_root = config.INPUT_IMAGE_DIR - worker_divided_img_root = config.DIVIDED_IMAGE_DIR - # input image path - input_img_path = os.path.join(worker_img_root, input_img_name) - - # Create subfolder for each image - new_file_name = input_img_name.split(".tif")[0] - worker_divided_img_subroot = os.path.join( - worker_divided_img_root, new_file_name) - - print(worker_divided_img_subroot) - - try: - shutil.rmtree(worker_divided_img_subroot) - except: - print("directory deletion failed") - pass - os.mkdir(worker_divided_img_subroot) - - file1 = os.path.join(worker_divided_img_subroot, "image_data.h5") - file2 = os.path.join(worker_divided_img_subroot, "image_param.h5") - # ---------------------------------------------------------------------------------------------------- - # Call divide image to put the water mask and also to tile and store the data - # Multiple image overlaps are NOT taken into account in called code. - # - divide.divide_image(config, input_img_path, crop_size, file1, file2) - - print("finished tiling") - - def cal_water_mask(row: Dict[str, Any], config: MPL_Config) -> Dict[str, Any]: """ This will calculate the water mask to avoid (inference) processing of the masked areas with water @@ -218,6 +196,133 @@ def cal_water_mask(row: Dict[str, Any], config: MPL_Config) -> Dict[str, Any]: return row +def tile_image(row: Dict[str, Any], config: MPL_Config) -> ray.data.Dataset: + """ + Tile the image into multiple pre-deifined sized parts so that the processing can be done on smaller parts due to + processing limitations + + Parameters + ---------- + config : Contains static configuration information regarding the workflow. + input_img_name : Name of the input image + """ + + input_image_gtif = gdal.Open(row["vfs_image_path"]) + mask = row["mask"] + + # convert the original image into the geo cordinates for further processing using gdal + # https://gdal.org/tutorials/geotransforms_tut.html + # GT(0) x-coordinate of the upper-left corner of the upper-left pixel. + # GT(1) w-e pixel resolution / pixel width. + # GT(2) row rotation (typically zero). + # GT(3) y-coordinate of the upper-left corner of the upper-left pixel. + # GT(4) column rotation (typically zero). + # GT(5) n-s pixel resolution / pixel height (negative value for a north-up image). + + # ---------------------- crop image from the water mask---------------------- + # dot product of the mask and the orignal data before breaking it for processing + # Also band 2 3 and 4 are taken because the 4 bands cannot be processed by the NN learingin algo + # Need to make sure that the training bands are the same as the bands used for inferencing. 
+ # + final_array_2 = input_image_gtif.GetRasterBand(2).ReadAsArray() + final_array_3 = input_image_gtif.GetRasterBand(3).ReadAsArray() + final_array_4 = input_image_gtif.GetRasterBand(4).ReadAsArray() + + final_array_2 = np.multiply(final_array_2, mask) + final_array_3 = np.multiply(final_array_3, mask) + final_array_4 = np.multiply(final_array_4, mask) + + # ulx, uly is the upper left corner + ulx, x_resolution, _, uly, _, y_resolution = input_image_gtif.GetGeoTransform() + + # ---------------------- Divide image (tile) ---------------------- + overlap_rate = 0.2 + block_size = config.CROP_SIZE + ysize = input_image_gtif.RasterYSize + xsize = input_image_gtif.RasterXSize + + # Load the data frame + from collections import defaultdict + + dict_ij = defaultdict(dict) + dict_n = defaultdict(dict) + tile_count = 0 + + y_list = range(0, ysize, int(block_size * (1 - overlap_rate))) + x_list = range(0, xsize, int(block_size * (1 - overlap_rate))) + dict_n["total"] = [len(y_list), len(x_list)] + + # ---------------------- Find each Upper left (x,y) for each divided images ---------------------- + # ***----------------- + # *** + # *** + # | + # | + # + tiles = [] + for id_i, i in enumerate(y_list): + # don't want moving window to be larger than row size of input raster + if i + block_size < ysize: + rows = block_size + else: + rows = ysize - i + + # read col + for id_j, j in enumerate(x_list): + if j + block_size < xsize: + cols = block_size + else: + cols = xsize - j + # get block out of the whole raster + # todo check the array values is similar as ReadAsArray() + band_1_array = final_array_4[i: i + rows, j: j + cols] + band_2_array = final_array_2[i: i + rows, j: j + cols] + band_3_array = final_array_3[i: i + rows, j: j + cols] + + # filter out black image + if ( + band_3_array[0, 0] == 0 + and band_3_array[0, -1] == 0 + and band_3_array[-1, 0] == 0 + and band_3_array[-1, -1] == 0 + ): + continue + + dict_ij[id_i][id_j] = tile_count + dict_n[tile_count] = [id_i, id_j] + + # stack three bands into one array + img = np.stack((band_1_array, band_2_array, band_3_array), axis=2) + cv2.normalize(img, img, 0, 255, cv2.NORM_MINMAX) + img = img.astype(np.uint8) + B, G, R = cv2.split(img) + out_B = cv2.equalizeHist(B) + out_R = cv2.equalizeHist(R) + out_G = cv2.equalizeHist(G) + final_image = cv2.merge((out_B, out_G, out_R)) + + # Upper left (x,y) for each images + ul_row_divided_img = uly + i * y_resolution + ul_col_divided_img = ulx + j * x_resolution + + tile_metadata = ImageTileMetadata( + upper_left_row=ul_row_divided_img, upper_left_col=ul_col_divided_img, tile_num=tile_count) + image_tile = ImageTile( + tile_values=final_image, tile_metadata=tile_metadata) + tiles.append(image_tile) + tile_count += 1 + + # --------------- Store all the title as an object file + image_metadata = ImageMetadata( + dict_i_j=dict_ij, dict_n=dict_n, x_resolution=x_resolution, y_resolution=y_resolution) + row["image_metadata"] = image_metadata + current_row = row + for image_tile in tiles: + new_row = current_row + new_row["image_tile"] = image_tile + yield new_row + + def infer_image(config: MPL_Config, input_img_name: str): """ Inference based on the trained model reperesented by the saved weights @@ -363,7 +468,9 @@ def stich_shapefile(config: MPL_Config, input_img_name: str): fn_kwargs={"config": config}) print("dataset with water mask: ", dataset_with_water_mask.schema()) print("2. 
start tiling image") - tile_image(config, image_name) + image_tiles_dataset = dataset_with_water_mask.flat_map( + fn=tile_image, fn_kwargs={"config": config}) + print("dataset with image tiles: ", image_tiles_dataset.schema()) print("3. start inferencing") infer_image(config, image_name) print("4. start stiching") From 6793e734edec08f86cdd13ddf8833d4c913b7d20 Mon Sep 17 00:00:00 2001 From: Kayla Hardie Date: Tue, 19 Mar 2024 00:11:55 +0000 Subject: [PATCH 06/31] Starting to rayify inference but there's a bug --- maple_workflow.py | 9 +++-- mpl_infer_tiles_ray.py | 89 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+), 3 deletions(-) create mode 100644 mpl_infer_tiles_ray.py diff --git a/maple_workflow.py b/maple_workflow.py index 51a5af5..9fa7bf6 100644 --- a/maple_workflow.py +++ b/maple_workflow.py @@ -18,8 +18,8 @@ import argparse import cv2 import mpl_clean_inference as inf_clean -import mpl_divideimg_234_water_new as divide import mpl_infer_tiles_GPU_new as inference +import mpl_infer_tiles_ray as ray_inference import mpl_process_shapefile as process import mpl_stitchshpfile_new as stich import numpy as np @@ -299,7 +299,7 @@ def tile_image(row: Dict[str, Any], config: MPL_Config) -> ray.data.Dataset: out_B = cv2.equalizeHist(B) out_R = cv2.equalizeHist(R) out_G = cv2.equalizeHist(G) - final_image = cv2.merge((out_B, out_G, out_R)) + final_image = np.array(cv2.merge((out_B, out_G, out_R))) # Upper left (x,y) for each images ul_row_divided_img = uly + i * y_resolution @@ -465,13 +465,16 @@ def stich_shapefile(config: MPL_Config, input_img_name: str): print("ray dataset:", dataset.schema()) print("1. start calculating watermask") dataset_with_water_mask = dataset.map(fn=cal_water_mask, - fn_kwargs={"config": config}) + fn_kwargs={"config": config}) print("dataset with water mask: ", dataset_with_water_mask.schema()) print("2. start tiling image") image_tiles_dataset = dataset_with_water_mask.flat_map( fn=tile_image, fn_kwargs={"config": config}) print("dataset with image tiles: ", image_tiles_dataset.schema()) print("3. start inferencing") + inferenced_dataset = image_tiles_dataset.map( + fn=ray_inference.MaskRCNNPredictor, fn_constructor_kwargs={"config": config}, concurrency=2) + print("inferenced?", inferenced_dataset.schema()) infer_image(config, image_name) print("4. start stiching") stich_shapefile(config, image_name) diff --git a/mpl_infer_tiles_ray.py b/mpl_infer_tiles_ray.py new file mode 100644 index 0000000..3310d12 --- /dev/null +++ b/mpl_infer_tiles_ray.py @@ -0,0 +1,89 @@ +#!/usr/bin/python3 +""" +MAPLE Workflow +(3) Inference using the trained Mask RCNN +Will load the tiled images and do the inference. + +Project: Permafrost Discovery Gateway: Mapping Application for Arctic Permafrost Land Environment(MAPLE) +PI : Chandi Witharana +Author : Rajitha Udwalpola +""" + +import model as modellib +import numpy as np +import ray +import tensorflow as tf + +from dataclasses import dataclass +from mpl_config import MPL_Config, PolygonConfig +from skimage.measure import find_contours +from typing import Any, Dict + + +@dataclass +class ShapefileResult: + polygons: np.array + class_id: str + + +class MaskRCNNPredictor: + def __init__( + self, + config: MPL_Config + ): + self.config = config + # Used to identify a specific predictor when mulitple predictors are + # created to run inference in parallel. The counter is also used to + # know which GPU to use when multiple are available. 
+ self.process_counter = 1 # TODO need to fix this process_counter + self.use_gpu = config.NUM_GPUS_PER_CORE > 0 + self.device = "/gpu:%d" % self.process_counter if self.use_gpu else "/cpu:0" + + with tf.device(self.device): + self.model = modellib.MaskRCNN( + mode="inference", model_dir=self.config.MODEL_DIR, config=PolygonConfig() + ) + self.model.keras_model.load_weights( + self.config.WEIGHT_PATH, by_name=True) + + def __call__(self, row: Dict[str, Any]) -> Dict[str, Any]: + + # get the upper left x y of the image + image_tile = row["image_tile"] + ul_row_divided_img = image_tile.tile_metadata.upper_left_row + ul_col_divided_img = image_tile.tile_metadata.upper_left_col + image_tile_values = image_tile.tile_values + image_metadata = row["image_metadata"] + x_resolution = image_metadata.x_resolution + y_resolution = image_metadata.y_resolution + + results = self.model.detect([image_tile_values], verbose=False) + + r = results[0] + shapefile_results = [] + if len(r["class_ids"]): + for id_masks in range(r["masks"].shape[2]): + # read the mask + mask = r["masks"][:, :, id_masks] + padded_mask = np.zeros( + (mask.shape[0] + 2, mask.shape[1] + 2), dtype=np.uint8 + ) + padded_mask[1:-1, 1:-1] = mask + class_id = r["class_ids"][id_masks] + + try: + contours = find_contours(padded_mask, 0.5, "high")[ + 0 + ] * np.array([[y_resolution, x_resolution]]) + contours = contours + np.array( + [[float(ul_row_divided_img), float(ul_col_divided_img)]] + ) + # swap two cols + contours.T[[0, 1]] = contours.T[[1, 0]] + shapefile_results.append(ShapefileResult( + polygons=contours, class_id=class_id)) + except: + contours = [] + pass + row["shapefile_results"] = shapefile_results + return row From d2708361d4f28f94054b49ff3583199d218922e2 Mon Sep 17 00:00:00 2001 From: Kayla Hardie Date: Tue, 19 Mar 2024 18:19:46 +0000 Subject: [PATCH 07/31] Fix inference ray bug --- maple_workflow.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/maple_workflow.py b/maple_workflow.py index 9fa7bf6..114db55 100644 --- a/maple_workflow.py +++ b/maple_workflow.py @@ -317,10 +317,12 @@ def tile_image(row: Dict[str, Any], config: MPL_Config) -> ray.data.Dataset: dict_i_j=dict_ij, dict_n=dict_n, x_resolution=x_resolution, y_resolution=y_resolution) row["image_metadata"] = image_metadata current_row = row + new_rows = [] for image_tile in tiles: new_row = current_row new_row["image_tile"] = image_tile - yield new_row + new_rows.append(new_row) + return new_rows def infer_image(config: MPL_Config, input_img_name: str): From fe30d5734ed576290aa4477a826c42954bd30148 Mon Sep 17 00:00:00 2001 From: Kayla Hardie Date: Tue, 2 Apr 2024 23:22:23 +0000 Subject: [PATCH 08/31] Fixed creating image tiles -> needed to use deepcopy, also made changes to how the dataclasses represented the dict --- maple_workflow.py | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/maple_workflow.py b/maple_workflow.py index 114db55..16ecaf9 100644 --- a/maple_workflow.py +++ b/maple_workflow.py @@ -16,6 +16,7 @@ """ import argparse +import copy import cv2 import mpl_clean_inference as inf_clean import mpl_infer_tiles_GPU_new as inference @@ -35,7 +36,7 @@ from skimage import color, filters, io from skimage.morphology import disk import tensorflow as tf -from typing import Any, Dict +from typing import Any, Dict, List # work tag WORKTAG = 1 @@ -44,8 +45,8 @@ @dataclass class ImageMetadata: - dict_i_j: dict[float, dict[float, float]] - dict_n: dict[float, list[float, float]] + len_x_list: 
int + len_y_list: int x_resolution: float y_resolution: float @@ -54,6 +55,8 @@ class ImageMetadata: class ImageTileMetadata: upper_left_row: float upper_left_col: float + id_i: int + id_j: int tile_num: int @@ -196,7 +199,7 @@ def cal_water_mask(row: Dict[str, Any], config: MPL_Config) -> Dict[str, Any]: return row -def tile_image(row: Dict[str, Any], config: MPL_Config) -> ray.data.Dataset: +def tile_image(row: Dict[str, Any], config: MPL_Config) -> List[Dict[str, Any]]: """ Tile the image into multiple pre-deifined sized parts so that the processing can be done on smaller parts due to processing limitations @@ -241,16 +244,10 @@ def tile_image(row: Dict[str, Any], config: MPL_Config) -> ray.data.Dataset: ysize = input_image_gtif.RasterYSize xsize = input_image_gtif.RasterXSize - # Load the data frame - from collections import defaultdict - - dict_ij = defaultdict(dict) - dict_n = defaultdict(dict) tile_count = 0 y_list = range(0, ysize, int(block_size * (1 - overlap_rate))) x_list = range(0, xsize, int(block_size * (1 - overlap_rate))) - dict_n["total"] = [len(y_list), len(x_list)] # ---------------------- Find each Upper left (x,y) for each divided images ---------------------- # ***----------------- @@ -288,9 +285,6 @@ def tile_image(row: Dict[str, Any], config: MPL_Config) -> ray.data.Dataset: ): continue - dict_ij[id_i][id_j] = tile_count - dict_n[tile_count] = [id_i, id_j] - # stack three bands into one array img = np.stack((band_1_array, band_2_array, band_3_array), axis=2) cv2.normalize(img, img, 0, 255, cv2.NORM_MINMAX) @@ -299,14 +293,14 @@ def tile_image(row: Dict[str, Any], config: MPL_Config) -> ray.data.Dataset: out_B = cv2.equalizeHist(B) out_R = cv2.equalizeHist(R) out_G = cv2.equalizeHist(G) - final_image = np.array(cv2.merge((out_B, out_G, out_R))) + final_image = np.array(cv2.merge((out_B, out_G, out_R))) # Upper left (x,y) for each images ul_row_divided_img = uly + i * y_resolution ul_col_divided_img = ulx + j * x_resolution tile_metadata = ImageTileMetadata( - upper_left_row=ul_row_divided_img, upper_left_col=ul_col_divided_img, tile_num=tile_count) + upper_left_row=ul_row_divided_img, upper_left_col=ul_col_divided_img, tile_num=tile_count, id_i=id_i, id_j=id_j) image_tile = ImageTile( tile_values=final_image, tile_metadata=tile_metadata) tiles.append(image_tile) @@ -314,12 +308,11 @@ def tile_image(row: Dict[str, Any], config: MPL_Config) -> ray.data.Dataset: # --------------- Store all the title as an object file image_metadata = ImageMetadata( - dict_i_j=dict_ij, dict_n=dict_n, x_resolution=x_resolution, y_resolution=y_resolution) + len_x_list=len(x_list), len_y_list=len(y_list), x_resolution=x_resolution, y_resolution=y_resolution) row["image_metadata"] = image_metadata - current_row = row new_rows = [] for image_tile in tiles: - new_row = current_row + new_row = copy.deepcopy(row) new_row["image_tile"] = image_tile new_rows.append(new_row) return new_rows @@ -472,12 +465,14 @@ def stich_shapefile(config: MPL_Config, input_img_name: str): print("2. start tiling image") image_tiles_dataset = dataset_with_water_mask.flat_map( fn=tile_image, fn_kwargs={"config": config}) + image_tiles_dataset = image_tiles_dataset.drop_columns(["bytes"]) + image_tiles_dataset = image_tiles_dataset.drop_columns(["mask"]) print("dataset with image tiles: ", image_tiles_dataset.schema()) print("3. 
start inferencing") inferenced_dataset = image_tiles_dataset.map( fn=ray_inference.MaskRCNNPredictor, fn_constructor_kwargs={"config": config}, concurrency=2) print("inferenced?", inferenced_dataset.schema()) - infer_image(config, image_name) + """ print("4. start stiching") stich_shapefile(config, image_name) process.process_shapefile(config, image_name) @@ -487,6 +482,7 @@ def stich_shapefile(config: MPL_Config, input_img_name: str): config.PROJECTED_SHP_DIR, "./data/input_bound/sample2_out_boundry.shp", ) + """ # Once you are done you can check the output on ArcGIS (win) or else you can check in QGIS (nx) Add the image and the # shp, shx, dbf as layers. From c86e41d7286af3ecef05f36cc8b8d0a5e0f9631e Mon Sep 17 00:00:00 2001 From: Kayla Hardie Date: Tue, 2 Apr 2024 23:27:05 +0000 Subject: [PATCH 09/31] Fix inference -> Ray throws type errors when we set a column value to be a List --- mpl_infer_tiles_ray.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/mpl_infer_tiles_ray.py b/mpl_infer_tiles_ray.py index 3310d12..10fa1b3 100644 --- a/mpl_infer_tiles_ray.py +++ b/mpl_infer_tiles_ray.py @@ -9,6 +9,7 @@ Author : Rajitha Udwalpola """ +import copy import model as modellib import numpy as np import ray @@ -17,13 +18,17 @@ from dataclasses import dataclass from mpl_config import MPL_Config, PolygonConfig from skimage.measure import find_contours -from typing import Any, Dict +from typing import Any, Dict, List @dataclass class ShapefileResult: polygons: np.array - class_id: str + class_id: int + +@dataclass +class ShapefileResults: + shapefile_results: List[ShapefileResult] class MaskRCNNPredictor: @@ -85,5 +90,6 @@ def __call__(self, row: Dict[str, Any]) -> Dict[str, Any]: except: contours = [] pass - row["shapefile_results"] = shapefile_results + row["num_polygons_in_tile"] = r["masks"].shape[2] + row["tile_shapefile_results"] = ShapefileResults(shapefile_results) return row From f07e7d7c448c3c8747fa7b4d0211a2f94d096517 Mon Sep 17 00:00:00 2001 From: Kayla Hardie Date: Tue, 2 Apr 2024 23:39:13 +0000 Subject: [PATCH 10/31] Remove old inference code --- maple_workflow.py | 45 --------------------------------------------- 1 file changed, 45 deletions(-) diff --git a/maple_workflow.py b/maple_workflow.py index 16ecaf9..dd67a70 100644 --- a/maple_workflow.py +++ b/maple_workflow.py @@ -19,7 +19,6 @@ import copy import cv2 import mpl_clean_inference as inf_clean -import mpl_infer_tiles_GPU_new as inference import mpl_infer_tiles_ray as ray_inference import mpl_process_shapefile as process import mpl_stitchshpfile_new as stich @@ -318,50 +317,6 @@ def tile_image(row: Dict[str, Any], config: MPL_Config) -> List[Dict[str, Any]]: return new_rows -def infer_image(config: MPL_Config, input_img_name: str): - """ - Inference based on the trained model reperesented by the saved weights - - Parameters - ---------- - config : Contains static configuration information regarding the workflow. 
- input_img_name : Name of the input image file - """ - sys.path.append(config.ROOT_DIR) - - # worker roots - worker_divided_img_root = config.DIVIDED_IMAGE_DIR - - # Create subfolder for each image - new_file_name = input_img_name.split(".tif")[0] - worker_divided_img_subroot = os.path.join( - worker_divided_img_root, new_file_name) - - print(worker_divided_img_subroot) - - file1 = os.path.join(worker_divided_img_subroot, "image_data.h5") - file2 = os.path.join(worker_divided_img_subroot, "image_param.h5") - - worker_output_shp_root = config.OUTPUT_SHP_DIR - worker_output_shp_subroot = os.path.join( - worker_output_shp_root, new_file_name) - try: - shutil.rmtree(worker_output_shp_subroot) - except: - print("directory deletion failed") - pass - - inference.inference_image( - config, - worker_output_shp_subroot, - file1, - file2, - new_file_name, - ) - - print("done") - - def stich_shapefile(config: MPL_Config, input_img_name: str): """ Put (stich) the image tiles back to the original From 81c2b23946ffa9ba717ff2d6244745a19fff5200 Mon Sep 17 00:00:00 2001 From: Kayla Hardie Date: Tue, 2 Apr 2024 23:49:03 +0000 Subject: [PATCH 11/31] Stitching image tiles back together using Ray --- maple_workflow.py | 52 +------ mpl_stitchshpfile_ray.py | 309 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 314 insertions(+), 47 deletions(-) create mode 100644 mpl_stitchshpfile_ray.py diff --git a/maple_workflow.py b/maple_workflow.py index dd67a70..88ae27c 100644 --- a/maple_workflow.py +++ b/maple_workflow.py @@ -22,6 +22,7 @@ import mpl_infer_tiles_ray as ray_inference import mpl_process_shapefile as process import mpl_stitchshpfile_new as stich +import mpl_stitchshpfile_ray as ray_stitch import numpy as np import os import ray @@ -317,50 +318,6 @@ def tile_image(row: Dict[str, Any], config: MPL_Config) -> List[Dict[str, Any]]: return new_rows -def stich_shapefile(config: MPL_Config, input_img_name: str): - """ - Put (stich) the image tiles back to the original - - Parameters - ---------- - config : Contains static configuration information regarding the workflow. - input_img_name : Name of the input image file - - Returns - ------- - - """ - sys.path.append(config.ROOT_DIR) - - worker_finaloutput_root = config.FINAL_SHP_DIR - worker_output_shp_root = config.OUTPUT_SHP_DIR - - # Create subfolder for each image within the worker img root - new_file_name = input_img_name.split(".tif")[0] - - worker_finaloutput_subroot = os.path.join( - worker_finaloutput_root, new_file_name) - worker_output_shp_subroot = os.path.join( - worker_output_shp_root, new_file_name) - - try: - shutil.rmtree(worker_finaloutput_subroot) - except: - print("directory deletion failed") - pass - os.mkdir(worker_finaloutput_subroot) - - stich.stitch_shapefile( - config, - worker_output_shp_subroot, - worker_finaloutput_subroot, - new_file_name, - new_file_name, - ) - - return "done Divide" - - if __name__ == "__main__": tf.compat.v1.disable_eager_execution() parser = argparse.ArgumentParser( @@ -426,10 +383,11 @@ def stich_shapefile(config: MPL_Config, input_img_name: str): print("3. start inferencing") inferenced_dataset = image_tiles_dataset.map( fn=ray_inference.MaskRCNNPredictor, fn_constructor_kwargs={"config": config}, concurrency=2) - print("inferenced?", inferenced_dataset.schema()) - """ + print("inferenced:", inferenced_dataset.schema()) print("4. 
start stiching") - stich_shapefile(config, image_name) + data_per_image = inferenced_dataset.groupby("image_file_name").map_groups(ray_stitch.stitch_shapefile_df) + print("grouped by file: ", data_per_image.schema()) + """ process.process_shapefile(config, image_name) print("5. start cleaning") inf_clean.clean_inference_shapes( diff --git a/mpl_stitchshpfile_ray.py b/mpl_stitchshpfile_ray.py new file mode 100644 index 0000000..8efc925 --- /dev/null +++ b/mpl_stitchshpfile_ray.py @@ -0,0 +1,309 @@ +#!/usr/bin/env python3 +""" +MAPLE Workflow +(4) Stich back to the original image dims from the tiles created by the inference process +Project: Permafrost Discovery Gateway: Mapping Application for Arctic Permafrost Land Environment(MAPLE) +PI : Chandi Witharana +Author : Rajitha Udwalpola +""" + +import glob +import numpy as np +import os +import pandas as pd +import pickle +import random +import ray +import shapefile + +from collections import defaultdict +from dataclasses import dataclass +from mpl_config import MPL_Config +from osgeo import ogr +from scipy.spatial import distance +from shapely.geometry import Polygon +from typing import Any, Dict, List + +@dataclass +class ShapefileResult: + polygons: np.array + class_id: int + +@dataclass +class ShapefileResults: + shapefile_results: List[ShapefileResult] + +@dataclass +class ImageMetadata: + len_x_list: int + len_y_list: int + x_resolution: float + y_resolution: float + +def stitch_shapefile(group): + for index, row in group.iterrows(): + assert 1 ==0, "grrrroup: {}".format(row["image_tile"]) + +#def stitch_shapefile(group: List[Dict[str, Any]]) -> Dict[str, Any]: +# return group[0] + +#def stitch_shapefile(group): +# return group + +def stitch_shapefile_df(group: pd.DataFrame): + image_shapefile_results = [] + temp_polygon_dict = defaultdict(dict) + dict_ij = defaultdict(dict) + for index, row in group.iterrows(): + image_shapefile_results.extend(row["tile_shapefile_results"].shapefile_results) + image_tile = row["image_tile"] + tile_num = image_tile.tile_metadata.tile_num + temp_polygon_dict[tile_num] = row["num_polygons_in_tile"] + id_i = image_tile.tile_metadata.id_i + id_j = image_tile.tile_metadata.id_j + dict_ij[id_i][id_j] = tile_num + + polygon_dict = defaultdict(dict) + poly_count = 0 + for k, v in temp_polygon_dict.items(): + polygon_dict[k] = [poly_count, poly_count + v] + poly_count += v + + first_row = group.head(1) + image_data_from_arbitrary_row = first_row["image_metadata"][0] + size_i, size_j = image_data_from_arbitrary_row.len_y_list, image_data_from_arbitrary_row.len_x_list + + # create a list to store those centroid point + centroid_list = list() + # create a count number for final checking + for shapefile_result in image_shapefile_results: + # create a polygon in shapely + ref_polygon = Polygon(shapefile_result.polygons) + # parse wkt return + geom = ogr.CreateGeometryFromWkt(ref_polygon.centroid.wkt) + centroid_x, centroid_y = geom.GetPoint(0)[0], geom.GetPoint(0)[1] + centroid_list.append([centroid_x, centroid_y]) + + close_list = list() + print("Total number of polygons: ", len(centroid_list)) + tile_blocksize = 4 + + for id_i in range(0, size_i, 3): + if id_i + tile_blocksize < size_i: + n_i = tile_blocksize + else: + n_i = size_i - id_i + + for id_j in range(0, size_j, 3): + if id_j + tile_blocksize < size_j: + n_j = tile_blocksize + else: + n_j = size_j - id_j + + # add to the neighbor list. 
+ centroid_neighbors = [] + poly_neighbors = [] + + for ii in range(n_i): + for jj in range(n_j): + if (ii + id_i) in dict_ij.keys(): + if (jj + id_j) in dict_ij[(ii + id_i)].keys(): + n = dict_ij[ii + id_i][jj + id_j] + poly_range = polygon_dict[n] + poly_list = [*range(poly_range[0], poly_range[1])] + poly_neighbors.extend(poly_list) + centroid_neighbors.extend( + centroid_list[poly_range[0]: poly_range[1]] + ) + + if len(centroid_neighbors) == 0: + continue + dst_array = distance.cdist( + centroid_neighbors, centroid_neighbors, "euclidean" + ) + + # filter out close objects + filter_object_array = np.argwhere( + (dst_array < 10) & (dst_array > 0)) + + filter_object_array[:, 0] = [ + poly_neighbors[i] for i in filter_object_array[:, 0] + ] + filter_object_array[:, 1] = [ + poly_neighbors[i] for i in filter_object_array[:, 1] + ] + + if filter_object_array.shape[0] != 0: + for i in filter_object_array: + close_list.append(i.tolist()) + else: + continue + + # remove duplicated index + close_list = set(frozenset(sublist) for sublist in close_list) + close_list = [list(x) for x in close_list] + + # --------------- looking for connected components in a graph --------------- + def connected_components(lists): + neighbors = defaultdict(set) + seen = set() + for each in lists: + for item in each: + neighbors[item].update(each) + + def component(node, neighbors=neighbors, seen=seen, see=seen.add): + nodes = set([node]) + next_node = nodes.pop + while nodes: + node = next_node() + see(node) + nodes |= neighbors[node] - seen + yield node + + for node in neighbors: + if node not in seen: + yield sorted(component(node)) + + close_list = list(connected_components(close_list)) + + # --------------- create a new shp file to store --------------- + # randomly pick one of many duplications + chosen_polygon_indexes = [] + for close_possible in close_list: + chosen_polygon_indexes.append(random.choice(close_possible)) + + image_shapefile_results_without_dups = [ + image_shapefile_results[index] for index in chosen_polygon_indexes] + + first_row["image_shapefile_results"] = ShapefileResults(image_shapefile_results_without_dups) + return first_row + + +def stitch_shapefile2(group: List[Dict[str, Any]]) -> Dict[str, Any]: + image_shapefile_results = [] + temp_polygon_dict = defaultdict(dict) + dict_ij = defaultdict(dict) + image_data_from_arbitrary_row = group[0]["image_metadata"] + for row in group: + image_shapefile_results.extend(row["tile_shapefile_results"]) + image_tile = row["image_tile"] + tile_num = image_tile.tile_metadata.tile_num + temp_polygon_dict[tile_num] = len(row["tile_shapefile_results"]) + id_i = image_tile.tile_metadata.id_i + id_j = image_tile.tile_metadata.id_j + dict_ij[id_i][id_j] = tile_num + + polygon_dict = defaultdict(dict) + poly_count = 0 + for k, v in temp_polygon_dict.items(): + polygon_dict[k] = [poly_count, poly_count + v[0]] + poly_count += v[0] + + size_i, size_j = image_data_from_arbitrary_row.len_y_list, image_data_from_arbitrary_row.len_x_list + + # create a list to store those centroid point + centroid_list = list() + # create a count number for final checking + for shapefile_result in image_shapefile_results: + # create a polygon in shapely + ref_polygon = Polygon(shapefile_result.polygons) + # parse wkt return + geom = ogr.CreateGeometryFromWkt(ref_polygon.centroid.wkt) + centroid_x, centroid_y = geom.GetPoint(0)[0], geom.GetPoint(0)[1] + centroid_list.append([centroid_x, centroid_y]) + + close_list = list() + print("Total number of polygons: ", len(centroid_list)) + 
tile_blocksize = 4 + + for id_i in range(0, size_i, 3): + if id_i + tile_blocksize < size_i: + n_i = tile_blocksize + else: + n_i = size_i - id_i + + for id_j in range(0, size_j, 3): + if id_j + tile_blocksize < size_j: + n_j = tile_blocksize + else: + n_j = size_j - id_j + + # add to the neighbor list. + centroid_neighbors = [] + poly_neighbors = [] + + for ii in range(n_i): + for jj in range(n_j): + if (ii + id_i) in dict_ij.keys(): + if (jj + id_j) in dict_ij[(ii + id_i)].keys(): + n = dict_ij[ii + id_i][jj + id_j] + poly_range = polygon_dict[n] + poly_list = [*range(poly_range[0], poly_range[1])] + poly_neighbors.extend(poly_list) + centroid_neighbors.extend( + centroid_list[poly_range[0]: poly_range[1]] + ) + + if len(centroid_neighbors) == 0: + continue + dst_array = distance.cdist( + centroid_neighbors, centroid_neighbors, "euclidean" + ) + + # filter out close objects + filter_object_array = np.argwhere( + (dst_array < 10) & (dst_array > 0)) + + filter_object_array[:, 0] = [ + poly_neighbors[i] for i in filter_object_array[:, 0] + ] + filter_object_array[:, 1] = [ + poly_neighbors[i] for i in filter_object_array[:, 1] + ] + + if filter_object_array.shape[0] != 0: + for i in filter_object_array: + close_list.append(i.tolist()) + else: + continue + + # remove duplicated index + close_list = set(frozenset(sublist) for sublist in close_list) + close_list = [list(x) for x in close_list] + + # --------------- looking for connected components in a graph --------------- + def connected_components(lists): + neighbors = defaultdict(set) + seen = set() + for each in lists: + for item in each: + neighbors[item].update(each) + + def component(node, neighbors=neighbors, seen=seen, see=seen.add): + nodes = set([node]) + next_node = nodes.pop + while nodes: + node = next_node() + see(node) + nodes |= neighbors[node] - seen + yield node + + for node in neighbors: + if node not in seen: + yield sorted(component(node)) + + close_list = list(connected_components(close_list)) + + # --------------- create a new shp file to store --------------- + # randomly pick one of many duplications + chosen_polygon_indexes = [] + for close_possible in close_list: + chosen_polygon_indexes.append(random.choice(close_possible)) + + image_shapefile_results_without_dups = [ + image_shapefile_results[index] for index in chosen_polygon_indexes] + + group[0]["image_shapefile_results"] = image_shapefile_results_without_dups + return group[0] + + From a71cce7ced8ad63c8534826c7a97369eb8caf0d4 Mon Sep 17 00:00:00 2001 From: Kayla Hardie Date: Fri, 5 Apr 2024 16:47:13 +0000 Subject: [PATCH 12/31] Close gdal files after opening them fix and also added code to write shapefiles --- maple_workflow.py | 14 ++++- write_shapefiles_ray.py | 129 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 141 insertions(+), 2 deletions(-) create mode 100644 write_shapefiles_ray.py diff --git a/maple_workflow.py b/maple_workflow.py index 88ae27c..6b3da68 100644 --- a/maple_workflow.py +++ b/maple_workflow.py @@ -20,6 +20,7 @@ import cv2 import mpl_clean_inference as inf_clean import mpl_infer_tiles_ray as ray_inference +import write_shapefiles_ray as write_shapefiles import mpl_process_shapefile as process import mpl_stitchshpfile_new as stich import mpl_stitchshpfile_ray as ray_stitch @@ -165,6 +166,8 @@ def cal_water_mask(row: Dict[str, Any], config: MPL_Config) -> Dict[str, Any]: gtif = gdal.Open(input_image_file_path) geotransform = gtif.GetGeoTransform() sourceSR = gtif.GetProjection() + # Close the file. 
+ gtif = None x = np.shape(image)[1] y = np.shape(image)[0] @@ -244,6 +247,9 @@ def tile_image(row: Dict[str, Any], config: MPL_Config) -> List[Dict[str, Any]]: ysize = input_image_gtif.RasterYSize xsize = input_image_gtif.RasterXSize + # Close the file. + input_image_gtif = None + tile_count = 0 y_list = range(0, ysize, int(block_size * (1 - overlap_rate))) @@ -377,7 +383,7 @@ def tile_image(row: Dict[str, Any], config: MPL_Config) -> List[Dict[str, Any]]: print("2. start tiling image") image_tiles_dataset = dataset_with_water_mask.flat_map( fn=tile_image, fn_kwargs={"config": config}) - image_tiles_dataset = image_tiles_dataset.drop_columns(["bytes"]) + #image_tiles_dataset = image_tiles_dataset.drop_columns(["bytes"]) image_tiles_dataset = image_tiles_dataset.drop_columns(["mask"]) print("dataset with image tiles: ", image_tiles_dataset.schema()) print("3. start inferencing") @@ -385,8 +391,12 @@ def tile_image(row: Dict[str, Any], config: MPL_Config) -> List[Dict[str, Any]]: fn=ray_inference.MaskRCNNPredictor, fn_constructor_kwargs={"config": config}, concurrency=2) print("inferenced:", inferenced_dataset.schema()) print("4. start stiching") - data_per_image = inferenced_dataset.groupby("image_file_name").map_groups(ray_stitch.stitch_shapefile_df) + data_per_image = inferenced_dataset.groupby("image_file_name").map_groups(ray_stitch.stitch_shapefile) print("grouped by file: ", data_per_image.schema()) + print("5. experimenting with shapefiles") + shapefiles_dataset = data_per_image.map( + fn=write_shapefiles.WriteShapefiles, fn_constructor_kwargs={"shpfile_output_dir": config.TEST_SHAPEFILE}, concurrency=2) + print("done writing to shapefiles", shapefiles_dataset.schema()) """ process.process_shapefile(config, image_name) print("5. start cleaning") diff --git a/write_shapefiles_ray.py b/write_shapefiles_ray.py new file mode 100644 index 0000000..b19084b --- /dev/null +++ b/write_shapefiles_ray.py @@ -0,0 +1,129 @@ +import os +import shutil +import shapefile + +from mpl_config import MPL_Config +from osgeo import gdal, osr +from shapely.geometry import Polygon, Point +from typing import Any, Dict + +def delete_and_create_dir(dir: str): + try: + shutil.rmtree(dir) + except Exception as e: + print(f"Error deleting directory {dir}: {e}") + + try: + os.makedirs(dir, exist_ok=True) + print(f"Created directory {dir}") + except Exception as e: + print(f"Error creating directory {dir}: {e}") + +class WriteShapefiles: + def __init__( + self, + shpfile_output_dir: str + ): + self.shpfile_output_dir = shpfile_output_dir + delete_and_create_dir(self.shpfile_output_dir) + + def get_coordinate_system_info(self, filepath: str): + try: + # Open the dataset + dataset = gdal.Open(filepath) + + # Check if the dataset is valid + if dataset is None: + raise Exception("Error: Unable to open the dataset.") + + # Get the spatial reference + spatial_ref = osr.SpatialReference() + + # Try to import the coordinate system from WKT + if spatial_ref.ImportFromWkt(dataset.GetProjection()) != gdal.CE_None: + raise Exception("Error: Unable to import coordinate system from WKT.") + + # Check if the spatial reference is valid + if spatial_ref.Validate() != gdal.CE_None: + raise Exception("Error: Invalid spatial reference.") + + # Export the spatial reference to WKT + dataset = None + return spatial_ref.ExportToWkt() + + except Exception as e: + print(f"Error when getting the coordinate system info: {e}") + dataset = None + return None + + def write_prj_file(self, geotiff_path: str, prj_file_path: str): + """ + Will 
create the prj file by getting the geo cordinate system from the input tiff file + + Parameters + ---------- + geotiff_path : Path to the geo tiff used for processing + prj_file_path : Path to the location to create the prj files + """ + try: + # Get the coordinate system information + wkt = self.get_coordinate_system_info(geotiff_path) + + print(wkt) + if wkt is not None: + # Write the WKT to a .prj file + with open(prj_file_path, "w") as prj_file: + prj_file.write(wkt) + + print(f"Coordinate system information written to {prj_file_path}") + else: + print("Failed to get coordinate system information.") + + except Exception as e: + print(f"Error when trying to write the prj file: {e}") + + def write_shapefile(self, row: Dict[str, Any], shapefile_output_dir_for_image: str): + w = shapefile.Writer(shapefile_output_dir_for_image) + w.field("Class", "C", size=5) + w.field("Sensor", "C", "10") + w.field("Date", "C", "10") + w.field("Time", "C", "10") + w.field("CatalogID", "C", "20") + w.field("Area", "N", decimal=3) + w.field("CentroidX", "N", decimal=3) + w.field("CentroidY", "N", decimal=3) + w.field("Perimeter", "N", decimal=3) + w.field("Length", "N", decimal=3) + w.field("Width", "N", decimal=3) + image_file_name = row["image_file_name"].split(".tif")[0] + for shapefile_result in row["image_shapefile_results"].shapefile_results: + polygons = shapefile_result.polygons + w.poly([polygons.tolist()]) + + poly = Polygon(polygons) + centroid = poly.centroid + box = poly.minimum_rotated_rectangle + x, y = box.exterior.coords.xy + p0 = Point(x[0], y[0]) + p1 = Point(x[1], y[1]) + p2 = Point(x[2], y[2]) + edge_length = (p0.distance(p1), p1.distance(p2)) + length = max(edge_length) + width = min(edge_length) + + w.record(Class=shapefile_result.class_id, Sensor=image_file_name[0:1], Date=image_file_name[1:2], + Time=image_file_name[2:3], CatalogID=image_file_name[3:4], Area=poly.area, + CentroidX=centroid.x, CentroidY=centroid.y, Perimeter=poly.length, Length=length, Width=width) + w.close() + + def __call__(self, row: Dict[str, Any]) -> Dict[str, Any]: + image_file_name = row["image_file_name"].split(".tif")[0] + shapefile_output_dir_for_image = os.path.join(self.shpfile_output_dir, f"{image_file_name}.shp") + self.write_shapefile(row, shapefile_output_dir_for_image) + + prj_output_dir_for_image = os.path.join(self.shpfile_output_dir, f"{image_file_name}.prj") + print("here is the virtual file path: ", row["vfs_image_path"]) + self.write_prj_file(geotiff_path=row["vfs_image_path"], prj_file_path=prj_output_dir_for_image) + row["shapefile_output_dir"] = shapefile_output_dir_for_image + return row + From 0e3f5fedfdb34ac5d090b61a66f9fd22c81ecb9d Mon Sep 17 00:00:00 2001 From: Kayla Hardie Date: Fri, 5 Apr 2024 16:50:05 +0000 Subject: [PATCH 13/31] Remove unnecessary code from stitch shapefile --- mpl_stitchshpfile_ray.py | 142 +-------------------------------------- 1 file changed, 1 insertion(+), 141 deletions(-) diff --git a/mpl_stitchshpfile_ray.py b/mpl_stitchshpfile_ray.py index 8efc925..50ab039 100644 --- a/mpl_stitchshpfile_ray.py +++ b/mpl_stitchshpfile_ray.py @@ -40,17 +40,7 @@ class ImageMetadata: x_resolution: float y_resolution: float -def stitch_shapefile(group): - for index, row in group.iterrows(): - assert 1 ==0, "grrrroup: {}".format(row["image_tile"]) - -#def stitch_shapefile(group: List[Dict[str, Any]]) -> Dict[str, Any]: -# return group[0] - -#def stitch_shapefile(group): -# return group - -def stitch_shapefile_df(group: pd.DataFrame): +def stitch_shapefile(group: pd.DataFrame): 
image_shapefile_results = [] temp_polygon_dict = defaultdict(dict) dict_ij = defaultdict(dict) @@ -177,133 +167,3 @@ def component(node, neighbors=neighbors, seen=seen, see=seen.add): first_row["image_shapefile_results"] = ShapefileResults(image_shapefile_results_without_dups) return first_row - - -def stitch_shapefile2(group: List[Dict[str, Any]]) -> Dict[str, Any]: - image_shapefile_results = [] - temp_polygon_dict = defaultdict(dict) - dict_ij = defaultdict(dict) - image_data_from_arbitrary_row = group[0]["image_metadata"] - for row in group: - image_shapefile_results.extend(row["tile_shapefile_results"]) - image_tile = row["image_tile"] - tile_num = image_tile.tile_metadata.tile_num - temp_polygon_dict[tile_num] = len(row["tile_shapefile_results"]) - id_i = image_tile.tile_metadata.id_i - id_j = image_tile.tile_metadata.id_j - dict_ij[id_i][id_j] = tile_num - - polygon_dict = defaultdict(dict) - poly_count = 0 - for k, v in temp_polygon_dict.items(): - polygon_dict[k] = [poly_count, poly_count + v[0]] - poly_count += v[0] - - size_i, size_j = image_data_from_arbitrary_row.len_y_list, image_data_from_arbitrary_row.len_x_list - - # create a list to store those centroid point - centroid_list = list() - # create a count number for final checking - for shapefile_result in image_shapefile_results: - # create a polygon in shapely - ref_polygon = Polygon(shapefile_result.polygons) - # parse wkt return - geom = ogr.CreateGeometryFromWkt(ref_polygon.centroid.wkt) - centroid_x, centroid_y = geom.GetPoint(0)[0], geom.GetPoint(0)[1] - centroid_list.append([centroid_x, centroid_y]) - - close_list = list() - print("Total number of polygons: ", len(centroid_list)) - tile_blocksize = 4 - - for id_i in range(0, size_i, 3): - if id_i + tile_blocksize < size_i: - n_i = tile_blocksize - else: - n_i = size_i - id_i - - for id_j in range(0, size_j, 3): - if id_j + tile_blocksize < size_j: - n_j = tile_blocksize - else: - n_j = size_j - id_j - - # add to the neighbor list. 
- centroid_neighbors = [] - poly_neighbors = [] - - for ii in range(n_i): - for jj in range(n_j): - if (ii + id_i) in dict_ij.keys(): - if (jj + id_j) in dict_ij[(ii + id_i)].keys(): - n = dict_ij[ii + id_i][jj + id_j] - poly_range = polygon_dict[n] - poly_list = [*range(poly_range[0], poly_range[1])] - poly_neighbors.extend(poly_list) - centroid_neighbors.extend( - centroid_list[poly_range[0]: poly_range[1]] - ) - - if len(centroid_neighbors) == 0: - continue - dst_array = distance.cdist( - centroid_neighbors, centroid_neighbors, "euclidean" - ) - - # filter out close objects - filter_object_array = np.argwhere( - (dst_array < 10) & (dst_array > 0)) - - filter_object_array[:, 0] = [ - poly_neighbors[i] for i in filter_object_array[:, 0] - ] - filter_object_array[:, 1] = [ - poly_neighbors[i] for i in filter_object_array[:, 1] - ] - - if filter_object_array.shape[0] != 0: - for i in filter_object_array: - close_list.append(i.tolist()) - else: - continue - - # remove duplicated index - close_list = set(frozenset(sublist) for sublist in close_list) - close_list = [list(x) for x in close_list] - - # --------------- looking for connected components in a graph --------------- - def connected_components(lists): - neighbors = defaultdict(set) - seen = set() - for each in lists: - for item in each: - neighbors[item].update(each) - - def component(node, neighbors=neighbors, seen=seen, see=seen.add): - nodes = set([node]) - next_node = nodes.pop - while nodes: - node = next_node() - see(node) - nodes |= neighbors[node] - seen - yield node - - for node in neighbors: - if node not in seen: - yield sorted(component(node)) - - close_list = list(connected_components(close_list)) - - # --------------- create a new shp file to store --------------- - # randomly pick one of many duplications - chosen_polygon_indexes = [] - for close_possible in close_list: - chosen_polygon_indexes.append(random.choice(close_possible)) - - image_shapefile_results_without_dups = [ - image_shapefile_results[index] for index in chosen_polygon_indexes] - - group[0]["image_shapefile_results"] = image_shapefile_results_without_dups - return group[0] - - From 833fcf20382e169053a50426789b3e27d1ec1586 Mon Sep 17 00:00:00 2001 From: Kayla Hardie Date: Mon, 8 Apr 2024 21:35:42 +0000 Subject: [PATCH 14/31] fix gdal virtual file system issue - the gdal vfs only works in a single process so instead of having the virtual file in the ray dataset, adapted the code to store the image bytes in the dataset and create the gdal virtual file locally when needed --- gdal_virtual_file_system.py | 22 ++++++++++++++++ maple_workflow.py | 48 ++++++++++++++++------------------- write_shapefiles_ray.py | 50 ++++++++++++++++++++++++------------- 3 files changed, 75 insertions(+), 45 deletions(-) create mode 100644 gdal_virtual_file_system.py diff --git a/gdal_virtual_file_system.py b/gdal_virtual_file_system.py new file mode 100644 index 0000000..3f74303 --- /dev/null +++ b/gdal_virtual_file_system.py @@ -0,0 +1,22 @@ +import os +from osgeo import gdal + +class GDALVirtualFileSystem: + def __init__( + self, + file_path: str, + file_bytes: bytes + ): + vfs_dir_path = '/vsimem/vsidir/' + file_name = os.path.basename(file_path) + self.vfs_filename = os.path.join(vfs_dir_path, file_name) + self.file_bytes = file_bytes + + def create_virtual_file(self) -> str: + dst = gdal.VSIFOpenL(self.vfs_filename, 'wb+') + gdal.VSIFWriteL(self.file_bytes, 1, len(self.file_bytes), dst) + gdal.VSIFCloseL(dst) + return self.vfs_filename + + def close_virtual_file(self): + 
gdal.Unlink(self.vfs_filename) diff --git a/maple_workflow.py b/maple_workflow.py index 6b3da68..f2cd96a 100644 --- a/maple_workflow.py +++ b/maple_workflow.py @@ -23,6 +23,7 @@ import write_shapefiles_ray as write_shapefiles import mpl_process_shapefile as process import mpl_stitchshpfile_new as stich +import gdal_virtual_file_system as gdal_vfs import mpl_stitchshpfile_ray as ray_stitch import numpy as np import os @@ -71,23 +72,8 @@ def create_geotiff_images_dataset(input_image_dir: str) -> ray.data.Dataset: return ray.data.read_binary_files(input_image_dir, include_paths=True) -def add_virtual_GDAL_file_path(row: Dict[str, Any]) -> Dict[str, Any]: - vfs_dir_path = '/vsimem/vsidir/' - image_file_name = os.path.basename(row["path"]) - vfs_filename = os.path.join(vfs_dir_path, image_file_name) - dst = gdal.VSIFOpenL(vfs_filename, 'wb+') - gdal.VSIFWriteL(row["bytes"], 1, len(row["bytes"]), dst) - gdal.VSIFCloseL(dst) - row["image_file_name"] = image_file_name - row["vfs_image_path"] = vfs_filename - return row - - -def test_gdal_operation(row: Dict[str, Any]) -> Dict[str, Any]: - input_image_gtif = gdal.Open(row["path"]) - vfs_image_gtif = gdal.Open(row["vfs_image_path"]) - row["gdal_test"] = np.array_equal(input_image_gtif.GetRasterBand( - 1).ReadAsArray(), vfs_image_gtif.GetRasterBand(1).ReadAsArray()) +def add_image_name(row: Dict[str, Any]) -> Dict[str, Any]: + row["image_name"] = os.path.basename(row["path"]).split(".tif")[0] return row @@ -100,14 +86,13 @@ def cal_water_mask(row: Dict[str, Any], config: MPL_Config) -> Dict[str, Any]: row : Row in Ray Dataset, there is one row per image. config : Contains static configuration information regarding the workflow. """ - image_name = row["image_file_name"].split(".tif")[0] - # os.path.join(worker_root, "water_shp") worker_water_root = config.WATER_MASK_DIR temp_water_root = ( config.TEMP_W_IMG_DIR ) # os.path.join(worker_root, "temp_8bitmask") + image_name = row["image_name"] worker_water_subroot = os.path.join(worker_water_root, image_name) temp_water_subroot = os.path.join(temp_water_root, image_name) # Prepare to make directories to create the files @@ -138,7 +123,10 @@ def cal_water_mask(row: Dict[str, Any], config: MPL_Config) -> Dict[str, Any]: # %% Median and Otsu value = 5 - input_image_file_path = row["vfs_image_path"] + # Create virtual file system file for image to use GDAL's file apis. + vfs = gdal_vfs.GDALVirtualFileSystem( + file_path=row["path"], file_bytes=row["bytes"]) + virtual_file_path = vfs.create_virtual_file() # UPDATED CODE - amal 01/11/2023 # cmd line execution thrown exceptions unable to capture @@ -148,7 +136,7 @@ def cal_water_mask(row: Dict[str, Any], config: MPL_Config) -> Dict[str, Any]: try: gdal.Translate( destName=output_tif_8b_file, - srcDS=input_image_file_path, + srcDS=virtual_file_path, format="GTiff", outputType=gdal.GDT_Byte, ) @@ -163,11 +151,12 @@ def cal_water_mask(row: Dict[str, Any], config: MPL_Config) -> Dict[str, Any]: bilat_img = filters.rank.median(nir, disk(value)) - gtif = gdal.Open(input_image_file_path) + gtif = gdal.Open(virtual_file_path) geotransform = gtif.GetGeoTransform() sourceSR = gtif.GetProjection() # Close the file. gtif = None + vfs.close_virtual_file() x = np.shape(image)[1] y = np.shape(image)[0] @@ -213,7 +202,11 @@ def tile_image(row: Dict[str, Any], config: MPL_Config) -> List[Dict[str, Any]]: input_img_name : Name of the input image """ - input_image_gtif = gdal.Open(row["vfs_image_path"]) + # Create virtual file system file for image to use GDAL's file apis. 
+ vfs = gdal_vfs.GDALVirtualFileSystem( + file_path=row["path"], file_bytes=row["bytes"]) + virtual_image_file_path = vfs.create_virtual_file() + input_image_gtif = gdal.Open(virtual_image_file_path) mask = row["mask"] # convert the original image into the geo cordinates for further processing using gdal @@ -249,6 +242,7 @@ def tile_image(row: Dict[str, Any], config: MPL_Config) -> List[Dict[str, Any]]: # Close the file. input_image_gtif = None + vfs.close_virtual_file() tile_count = 0 @@ -299,7 +293,7 @@ def tile_image(row: Dict[str, Any], config: MPL_Config) -> List[Dict[str, Any]]: out_B = cv2.equalizeHist(B) out_R = cv2.equalizeHist(R) out_G = cv2.equalizeHist(G) - final_image = np.array(cv2.merge((out_B, out_G, out_R))) + final_image = np.array(cv2.merge((out_B, out_G, out_R))) # Upper left (x,y) for each images ul_row_divided_img = uly + i * y_resolution @@ -374,7 +368,7 @@ def tile_image(row: Dict[str, Any], config: MPL_Config) -> List[Dict[str, Any]]: print("0. load geotiffs into ray dataset") dataset = create_geotiff_images_dataset( - config.INPUT_IMAGE_DIR).map(add_virtual_GDAL_file_path) + config.INPUT_IMAGE_DIR).map(add_image_name) print("ray dataset:", dataset.schema()) print("1. start calculating watermask") dataset_with_water_mask = dataset.map(fn=cal_water_mask, @@ -383,7 +377,6 @@ def tile_image(row: Dict[str, Any], config: MPL_Config) -> List[Dict[str, Any]]: print("2. start tiling image") image_tiles_dataset = dataset_with_water_mask.flat_map( fn=tile_image, fn_kwargs={"config": config}) - #image_tiles_dataset = image_tiles_dataset.drop_columns(["bytes"]) image_tiles_dataset = image_tiles_dataset.drop_columns(["mask"]) print("dataset with image tiles: ", image_tiles_dataset.schema()) print("3. start inferencing") @@ -391,7 +384,8 @@ def tile_image(row: Dict[str, Any], config: MPL_Config) -> List[Dict[str, Any]]: fn=ray_inference.MaskRCNNPredictor, fn_constructor_kwargs={"config": config}, concurrency=2) print("inferenced:", inferenced_dataset.schema()) print("4. start stiching") - data_per_image = inferenced_dataset.groupby("image_file_name").map_groups(ray_stitch.stitch_shapefile) + data_per_image = inferenced_dataset.groupby( + "image_name").map_groups(ray_stitch.stitch_shapefile) print("grouped by file: ", data_per_image.schema()) print("5. experimenting with shapefiles") shapefiles_dataset = data_per_image.map( diff --git a/write_shapefiles_ray.py b/write_shapefiles_ray.py index b19084b..1a9729e 100644 --- a/write_shapefiles_ray.py +++ b/write_shapefiles_ray.py @@ -2,11 +2,12 @@ import shutil import shapefile -from mpl_config import MPL_Config from osgeo import gdal, osr +import gdal_virtual_file_system as gdal_vfs from shapely.geometry import Polygon, Point from typing import Any, Dict + def delete_and_create_dir(dir: str): try: shutil.rmtree(dir) @@ -19,6 +20,7 @@ def delete_and_create_dir(dir: str): except Exception as e: print(f"Error creating directory {dir}: {e}") + class WriteShapefiles: def __init__( self, @@ -27,21 +29,28 @@ def __init__( self.shpfile_output_dir = shpfile_output_dir delete_and_create_dir(self.shpfile_output_dir) - def get_coordinate_system_info(self, filepath: str): + def get_coordinate_system_info(self, file_path: str, file_bytes: bytes): try: # Open the dataset - dataset = gdal.Open(filepath) + # Create virtual file system file for image to use GDAL's file apis. 
+ vfs = gdal_vfs.GDALVirtualFileSystem( + file_path, file_bytes) + virtual_file_path = vfs.create_virtual_file() + print("here is the virtual file path: ", virtual_file_path) + dataset = gdal.Open(virtual_file_path) # Check if the dataset is valid if dataset is None: - raise Exception("Error: Unable to open the dataset.") + print("gdal error: ", gdal.GetLastErrorMsg()) + raise Exception("Error: Unable to open the dataset") # Get the spatial reference spatial_ref = osr.SpatialReference() # Try to import the coordinate system from WKT if spatial_ref.ImportFromWkt(dataset.GetProjection()) != gdal.CE_None: - raise Exception("Error: Unable to import coordinate system from WKT.") + raise Exception( + "Error: Unable to import coordinate system from WKT.") # Check if the spatial reference is valid if spatial_ref.Validate() != gdal.CE_None: @@ -49,25 +58,28 @@ def get_coordinate_system_info(self, filepath: str): # Export the spatial reference to WKT dataset = None + vfs.close_virtual_file() return spatial_ref.ExportToWkt() except Exception as e: print(f"Error when getting the coordinate system info: {e}") dataset = None + vfs.close_virtual_file() return None - def write_prj_file(self, geotiff_path: str, prj_file_path: str): + def write_prj_file(self, geotiff_path: str, geotiff_bytes: bytes, prj_file_path: str): """ Will create the prj file by getting the geo cordinate system from the input tiff file Parameters ---------- - geotiff_path : Path to the geo tiff used for processing + geotiff_path : geotiff_path : Path to the geo tiff used for processing + geotiff_bytes: Bytes of the input image prj_file_path : Path to the location to create the prj files """ try: # Get the coordinate system information - wkt = self.get_coordinate_system_info(geotiff_path) + wkt = self.get_coordinate_system_info(geotiff_path, geotiff_bytes) print(wkt) if wkt is not None: @@ -75,7 +87,8 @@ def write_prj_file(self, geotiff_path: str, prj_file_path: str): with open(prj_file_path, "w") as prj_file: prj_file.write(wkt) - print(f"Coordinate system information written to {prj_file_path}") + print( + f"Coordinate system information written to {prj_file_path}") else: print("Failed to get coordinate system information.") @@ -95,7 +108,7 @@ def write_shapefile(self, row: Dict[str, Any], shapefile_output_dir_for_image: s w.field("Perimeter", "N", decimal=3) w.field("Length", "N", decimal=3) w.field("Width", "N", decimal=3) - image_file_name = row["image_file_name"].split(".tif")[0] + image_name = row["image_name"] for shapefile_result in row["image_shapefile_results"].shapefile_results: polygons = shapefile_result.polygons w.poly([polygons.tolist()]) @@ -111,19 +124,20 @@ def write_shapefile(self, row: Dict[str, Any], shapefile_output_dir_for_image: s length = max(edge_length) width = min(edge_length) - w.record(Class=shapefile_result.class_id, Sensor=image_file_name[0:1], Date=image_file_name[1:2], - Time=image_file_name[2:3], CatalogID=image_file_name[3:4], Area=poly.area, + w.record(Class=shapefile_result.class_id, Sensor=image_name[0:1], Date=image_name[1:2], + Time=image_name[2:3], CatalogID=image_name[3:4], Area=poly.area, CentroidX=centroid.x, CentroidY=centroid.y, Perimeter=poly.length, Length=length, Width=width) w.close() def __call__(self, row: Dict[str, Any]) -> Dict[str, Any]: - image_file_name = row["image_file_name"].split(".tif")[0] - shapefile_output_dir_for_image = os.path.join(self.shpfile_output_dir, f"{image_file_name}.shp") + image_name = row["image_name"] + shapefile_output_dir_for_image = os.path.join( + 
self.shpfile_output_dir, f"{image_name}.shp") self.write_shapefile(row, shapefile_output_dir_for_image) - prj_output_dir_for_image = os.path.join(self.shpfile_output_dir, f"{image_file_name}.prj") - print("here is the virtual file path: ", row["vfs_image_path"]) - self.write_prj_file(geotiff_path=row["vfs_image_path"], prj_file_path=prj_output_dir_for_image) + prj_output_dir_for_image = os.path.join( + self.shpfile_output_dir, f"{image_name}.prj") + self.write_prj_file( + geotiff_path=row["path"], geotiff_bytes=row["bytes"], prj_file_path=prj_output_dir_for_image) row["shapefile_output_dir"] = shapefile_output_dir_for_image return row - From e9ce98686206542d846faeb41cf851a3b5b23670 Mon Sep 17 00:00:00 2001 From: Kayla Hardie Date: Thu, 11 Apr 2024 21:00:08 +0000 Subject: [PATCH 15/31] Break logic up into different files and fix imports --- maple_workflow.py | 340 +++----------------------------- mpl_config.py | 2 + mpl_infer_tiles_ray.py | 13 +- mpl_stitchshpfile_ray.py | 169 ---------------- mpl_tile_and_stitch_ray_util.py | 329 ++++++++++++++++++++++++++++++ 5 files changed, 362 insertions(+), 491 deletions(-) delete mode 100644 mpl_stitchshpfile_ray.py create mode 100644 mpl_tile_and_stitch_ray_util.py diff --git a/maple_workflow.py b/maple_workflow.py index f2cd96a..1811094 100644 --- a/maple_workflow.py +++ b/maple_workflow.py @@ -5,67 +5,26 @@ 1. Create water mask 2. Image Tiling Classification / Inference -3. Infer "ground truth" from images based on the trained model Post processing -4. Stich back to the original image dims from the tiles (2.) -5. Clean the output based on known ground truth +3. Stich back to the original image dims from the tiles (2.) + Project: Permafrost Discovery Gateway: Mapping Application for Arctic Permafrost Land Environment(MAPLE) PI : Chandi Witharana -Author : Rajitha Udwalpola """ import argparse -import copy -import cv2 -import mpl_clean_inference as inf_clean -import mpl_infer_tiles_ray as ray_inference -import write_shapefiles_ray as write_shapefiles -import mpl_process_shapefile as process -import mpl_stitchshpfile_new as stich -import gdal_virtual_file_system as gdal_vfs -import mpl_stitchshpfile_ray as ray_stitch -import numpy as np import os -import ray -import sys -import shutil -from pathlib import Path +from typing import Any, Dict -from dataclasses import dataclass -from mpl_config import MPL_Config -from osgeo import gdal -from skimage import color, filters, io -from skimage.morphology import disk import tensorflow as tf -from typing import Any, Dict, List - -# work tag -WORKTAG = 1 -DIETAG = 0 - - -@dataclass -class ImageMetadata: - len_x_list: int - len_y_list: int - x_resolution: float - y_resolution: float - - -@dataclass -class ImageTileMetadata: - upper_left_row: float - upper_left_col: float - id_i: int - id_j: int - tile_num: int - +import ray -@dataclass -class ImageTile: - tile_values: np.array - tile_metadata: ImageTileMetadata +from mpl_config import MPL_Config +import mpl_preprocessing as preprocessing +import mpl_infer_tiles_ray as ray_inference +import write_shapefiles_ray as write_shapefiles +import mpl_tile_and_stitch_ray_util as tile_and_stitch_util def create_geotiff_images_dataset(input_image_dir: str) -> ray.data.Dataset: @@ -77,247 +36,6 @@ def add_image_name(row: Dict[str, Any]) -> Dict[str, Any]: return row -def cal_water_mask(row: Dict[str, Any], config: MPL_Config) -> Dict[str, Any]: - """ - This will calculate the water mask to avoid (inference) processing of the masked areas with water - Uses gdal to transform 
the image into the required format. - Parameters - ---------- - row : Row in Ray Dataset, there is one row per image. - config : Contains static configuration information regarding the workflow. - """ - # os.path.join(worker_root, "water_shp") - worker_water_root = config.WATER_MASK_DIR - temp_water_root = ( - config.TEMP_W_IMG_DIR - ) # os.path.join(worker_root, "temp_8bitmask") - - image_name = row["image_name"] - worker_water_subroot = os.path.join(worker_water_root, image_name) - temp_water_subroot = os.path.join(temp_water_root, image_name) - # Prepare to make directories to create the files - try: - shutil.rmtree(worker_water_subroot) - except: - print("directory deletion failed") - pass - - try: - shutil.rmtree(temp_water_subroot) - except: - print("directory deletion failed") - pass - - # check local storage for temporary storage - Path(worker_water_subroot).mkdir(parents=True, exist_ok=True) - Path(temp_water_subroot).mkdir(parents=True, exist_ok=True) - - output_watermask = os.path.join( - worker_water_subroot, r"%s_watermask.tif" % image_name - ) - output_tif_8b_file = os.path.join( - temp_water_subroot, r"%s_8bit.tif" % image_name - ) - nir_band = 3 # set number of NIR band - - # %% Median and Otsu - value = 5 - - # Create virtual file system file for image to use GDAL's file apis. - vfs = gdal_vfs.GDALVirtualFileSystem( - file_path=row["path"], file_bytes=row["bytes"]) - virtual_file_path = vfs.create_virtual_file() - - # UPDATED CODE - amal 01/11/2023 - # cmd line execution thrown exceptions unable to capture - # Using gdal to execute the gdal_Translate - # output file checked against the cmd line gdal_translate - gdal.UseExceptions() # Enable errors - try: - gdal.Translate( - destName=output_tif_8b_file, - srcDS=virtual_file_path, - format="GTiff", - outputType=gdal.GDT_Byte, - ) - except RuntimeError: - print("gdal Translate failed with", gdal.GetLastErrorMsg()) - pass - - image = io.imread( - output_tif_8b_file - ) # image[rows, columns, dimensions]-> image[:,:,3] is near Infrared - nir = image[:, :, nir_band] - - bilat_img = filters.rank.median(nir, disk(value)) - - gtif = gdal.Open(virtual_file_path) - geotransform = gtif.GetGeoTransform() - sourceSR = gtif.GetProjection() - # Close the file. - gtif = None - vfs.close_virtual_file() - - x = np.shape(image)[1] - y = np.shape(image)[0] - - # Normalize and blur before thresholding. Usually instead of normalizing - # a rgb to greyscale transformation is applied. In this case, we are already - # dealing with a single channel so we divide all the pixels in the image by - # 255 to get a value between [0, 1]. 
- normalized_bilat_img = bilat_img / 255 - normalized_blurred_bilat_img = filters.gaussian( - normalized_bilat_img, sigma=2.0) - - # find the threshold to filter if all values are same otsu cannot find value - # hence t is made to 0.0 - try: - t = filters.threshold_otsu(normalized_blurred_bilat_img) - except: - t = 0.0 - # perform inverse binary thresholding - mask = normalized_blurred_bilat_img > t - - # output np array as GeoTiff - dst_ds = gdal.GetDriverByName("GTiff").Create( - output_watermask, x, y, 1, gdal.GDT_Byte, ["NBITS=1"] - ) - dst_ds.GetRasterBand(1).WriteArray(mask) - dst_ds.SetGeoTransform(geotransform) - dst_ds.SetProjection(sourceSR) - dst_ds.FlushCache() - dst_ds = None - row["mask"] = mask - return row - - -def tile_image(row: Dict[str, Any], config: MPL_Config) -> List[Dict[str, Any]]: - """ - Tile the image into multiple pre-deifined sized parts so that the processing can be done on smaller parts due to - processing limitations - - Parameters - ---------- - config : Contains static configuration information regarding the workflow. - input_img_name : Name of the input image - """ - - # Create virtual file system file for image to use GDAL's file apis. - vfs = gdal_vfs.GDALVirtualFileSystem( - file_path=row["path"], file_bytes=row["bytes"]) - virtual_image_file_path = vfs.create_virtual_file() - input_image_gtif = gdal.Open(virtual_image_file_path) - mask = row["mask"] - - # convert the original image into the geo cordinates for further processing using gdal - # https://gdal.org/tutorials/geotransforms_tut.html - # GT(0) x-coordinate of the upper-left corner of the upper-left pixel. - # GT(1) w-e pixel resolution / pixel width. - # GT(2) row rotation (typically zero). - # GT(3) y-coordinate of the upper-left corner of the upper-left pixel. - # GT(4) column rotation (typically zero). - # GT(5) n-s pixel resolution / pixel height (negative value for a north-up image). - - # ---------------------- crop image from the water mask---------------------- - # dot product of the mask and the orignal data before breaking it for processing - # Also band 2 3 and 4 are taken because the 4 bands cannot be processed by the NN learingin algo - # Need to make sure that the training bands are the same as the bands used for inferencing. - # - final_array_2 = input_image_gtif.GetRasterBand(2).ReadAsArray() - final_array_3 = input_image_gtif.GetRasterBand(3).ReadAsArray() - final_array_4 = input_image_gtif.GetRasterBand(4).ReadAsArray() - - final_array_2 = np.multiply(final_array_2, mask) - final_array_3 = np.multiply(final_array_3, mask) - final_array_4 = np.multiply(final_array_4, mask) - - # ulx, uly is the upper left corner - ulx, x_resolution, _, uly, _, y_resolution = input_image_gtif.GetGeoTransform() - - # ---------------------- Divide image (tile) ---------------------- - overlap_rate = 0.2 - block_size = config.CROP_SIZE - ysize = input_image_gtif.RasterYSize - xsize = input_image_gtif.RasterXSize - - # Close the file. 
- input_image_gtif = None - vfs.close_virtual_file() - - tile_count = 0 - - y_list = range(0, ysize, int(block_size * (1 - overlap_rate))) - x_list = range(0, xsize, int(block_size * (1 - overlap_rate))) - - # ---------------------- Find each Upper left (x,y) for each divided images ---------------------- - # ***----------------- - # *** - # *** - # | - # | - # - tiles = [] - for id_i, i in enumerate(y_list): - # don't want moving window to be larger than row size of input raster - if i + block_size < ysize: - rows = block_size - else: - rows = ysize - i - - # read col - for id_j, j in enumerate(x_list): - if j + block_size < xsize: - cols = block_size - else: - cols = xsize - j - # get block out of the whole raster - # todo check the array values is similar as ReadAsArray() - band_1_array = final_array_4[i: i + rows, j: j + cols] - band_2_array = final_array_2[i: i + rows, j: j + cols] - band_3_array = final_array_3[i: i + rows, j: j + cols] - - # filter out black image - if ( - band_3_array[0, 0] == 0 - and band_3_array[0, -1] == 0 - and band_3_array[-1, 0] == 0 - and band_3_array[-1, -1] == 0 - ): - continue - - # stack three bands into one array - img = np.stack((band_1_array, band_2_array, band_3_array), axis=2) - cv2.normalize(img, img, 0, 255, cv2.NORM_MINMAX) - img = img.astype(np.uint8) - B, G, R = cv2.split(img) - out_B = cv2.equalizeHist(B) - out_R = cv2.equalizeHist(R) - out_G = cv2.equalizeHist(G) - final_image = np.array(cv2.merge((out_B, out_G, out_R))) - - # Upper left (x,y) for each images - ul_row_divided_img = uly + i * y_resolution - ul_col_divided_img = ulx + j * x_resolution - - tile_metadata = ImageTileMetadata( - upper_left_row=ul_row_divided_img, upper_left_col=ul_col_divided_img, tile_num=tile_count, id_i=id_i, id_j=id_j) - image_tile = ImageTile( - tile_values=final_image, tile_metadata=tile_metadata) - tiles.append(image_tile) - tile_count += 1 - - # --------------- Store all the title as an object file - image_metadata = ImageMetadata( - len_x_list=len(x_list), len_y_list=len(y_list), x_resolution=x_resolution, y_resolution=y_resolution) - row["image_metadata"] = image_metadata - new_rows = [] - for image_tile in tiles: - new_row = copy.deepcopy(row) - new_row["image_tile"] = image_tile - new_rows.append(new_row) - return new_rows - - if __name__ == "__main__": tf.compat.v1.disable_eager_execution() parser = argparse.ArgumentParser( @@ -366,40 +84,32 @@ def tile_image(row: Dict[str, Any], config: MPL_Config) -> List[Dict[str, Any]]: args.root_dir, args.weight_file, num_gpus_per_core=args.gpus_per_core ) - print("0. load geotiffs into ray dataset") + print("0. Load geotiffs into ray dataset") dataset = create_geotiff_images_dataset( config.INPUT_IMAGE_DIR).map(add_image_name) - print("ray dataset:", dataset.schema()) - print("1. start calculating watermask") - dataset_with_water_mask = dataset.map(fn=cal_water_mask, + print("Ray dataset schema:", dataset.schema()) + print("1. Start calculating watermask") + dataset_with_water_mask = dataset.map(fn=preprocessing.cal_water_mask, fn_kwargs={"config": config}) - print("dataset with water mask: ", dataset_with_water_mask.schema()) - print("2. start tiling image") + print("Dataset schema with water mask: ", dataset_with_water_mask.schema()) + print("2. 
Start tiling image") image_tiles_dataset = dataset_with_water_mask.flat_map( - fn=tile_image, fn_kwargs={"config": config}) + fn=tile_and_stitch_util.tile_image, fn_kwargs={"config": config}) image_tiles_dataset = image_tiles_dataset.drop_columns(["mask"]) - print("dataset with image tiles: ", image_tiles_dataset.schema()) - print("3. start inferencing") + print("Dataset schema with image tiles: ", image_tiles_dataset.schema()) + print("3. Start inferencing") inferenced_dataset = image_tiles_dataset.map( fn=ray_inference.MaskRCNNPredictor, fn_constructor_kwargs={"config": config}, concurrency=2) - print("inferenced:", inferenced_dataset.schema()) - print("4. start stiching") + print("Dataset schema with inferenced tiles: ", inferenced_dataset.schema()) + print("4. Start stitching") data_per_image = inferenced_dataset.groupby( - "image_name").map_groups(ray_stitch.stitch_shapefile) - print("grouped by file: ", data_per_image.schema()) - print("5. experimenting with shapefiles") + "image_name").map_groups(tile_and_stitch_util.stitch_shapefile) + print("Dataset schema where each row is an image (result of a group by tile): ", + data_per_image.schema()) + print("5. Write shapefiles") shapefiles_dataset = data_per_image.map( fn=write_shapefiles.WriteShapefiles, fn_constructor_kwargs={"shpfile_output_dir": config.TEST_SHAPEFILE}, concurrency=2) - print("done writing to shapefiles", shapefiles_dataset.schema()) - """ - process.process_shapefile(config, image_name) - print("5. start cleaning") - inf_clean.clean_inference_shapes( - config.CLEAN_DATA_DIR, - config.PROJECTED_SHP_DIR, - "./data/input_bound/sample2_out_boundry.shp", - ) - """ + print("Done writing shapefiles", shapefiles_dataset.schema()) # Once you are done you can check the output on ArcGIS (win) or else you can check in QGIS (nx) Add the image and the # shp, shx, dbf as layers. 
diff --git a/mpl_config.py b/mpl_config.py index 46105c7..6494e3e 100644 --- a/mpl_config.py +++ b/mpl_config.py @@ -48,6 +48,8 @@ def __init__( self.TEMP_W_IMG_DIR = self.ROOT_DIR + r"/data/water_mask/temp" self.OUTPUT_IMAGE_DIR = self.ROOT_DIR + r"/data/output_img" self.WORKER_ROOT = self.ROOT_DIR + r"/data" + self.MODEL_DIR = self.ROOT_DIR + r"/local_dir/datasets/logs" + self.TEST_SHAPEFILE = self.ROOT_DIR + r"/data/ray_shapefiles" # ADDED to include inference cleaning post-processing self.CLEAN_DATA_DIR = self.ROOT_DIR + r"/data/cln_data" diff --git a/mpl_infer_tiles_ray.py b/mpl_infer_tiles_ray.py index 10fa1b3..ceb0e5a 100644 --- a/mpl_infer_tiles_ray.py +++ b/mpl_infer_tiles_ray.py @@ -9,17 +9,15 @@ Author : Rajitha Udwalpola """ -import copy -import model as modellib +from dataclasses import dataclass +from typing import Any, Dict, List + import numpy as np -import ray import tensorflow as tf - -from dataclasses import dataclass -from mpl_config import MPL_Config, PolygonConfig from skimage.measure import find_contours -from typing import Any, Dict, List +import model as modellib +from mpl_config import MPL_Config, PolygonConfig @dataclass class ShapefileResult: @@ -90,6 +88,7 @@ def __call__(self, row: Dict[str, Any]) -> Dict[str, Any]: except: contours = [] pass + row["num_polygons_in_tile"] = r["masks"].shape[2] row["tile_shapefile_results"] = ShapefileResults(shapefile_results) return row diff --git a/mpl_stitchshpfile_ray.py b/mpl_stitchshpfile_ray.py deleted file mode 100644 index 50ab039..0000000 --- a/mpl_stitchshpfile_ray.py +++ /dev/null @@ -1,169 +0,0 @@ -#!/usr/bin/env python3 -""" -MAPLE Workflow -(4) Stich back to the original image dims from the tiles created by the inference process -Project: Permafrost Discovery Gateway: Mapping Application for Arctic Permafrost Land Environment(MAPLE) -PI : Chandi Witharana -Author : Rajitha Udwalpola -""" - -import glob -import numpy as np -import os -import pandas as pd -import pickle -import random -import ray -import shapefile - -from collections import defaultdict -from dataclasses import dataclass -from mpl_config import MPL_Config -from osgeo import ogr -from scipy.spatial import distance -from shapely.geometry import Polygon -from typing import Any, Dict, List - -@dataclass -class ShapefileResult: - polygons: np.array - class_id: int - -@dataclass -class ShapefileResults: - shapefile_results: List[ShapefileResult] - -@dataclass -class ImageMetadata: - len_x_list: int - len_y_list: int - x_resolution: float - y_resolution: float - -def stitch_shapefile(group: pd.DataFrame): - image_shapefile_results = [] - temp_polygon_dict = defaultdict(dict) - dict_ij = defaultdict(dict) - for index, row in group.iterrows(): - image_shapefile_results.extend(row["tile_shapefile_results"].shapefile_results) - image_tile = row["image_tile"] - tile_num = image_tile.tile_metadata.tile_num - temp_polygon_dict[tile_num] = row["num_polygons_in_tile"] - id_i = image_tile.tile_metadata.id_i - id_j = image_tile.tile_metadata.id_j - dict_ij[id_i][id_j] = tile_num - - polygon_dict = defaultdict(dict) - poly_count = 0 - for k, v in temp_polygon_dict.items(): - polygon_dict[k] = [poly_count, poly_count + v] - poly_count += v - - first_row = group.head(1) - image_data_from_arbitrary_row = first_row["image_metadata"][0] - size_i, size_j = image_data_from_arbitrary_row.len_y_list, image_data_from_arbitrary_row.len_x_list - - # create a list to store those centroid point - centroid_list = list() - # create a count number for final checking - for 
shapefile_result in image_shapefile_results: - # create a polygon in shapely - ref_polygon = Polygon(shapefile_result.polygons) - # parse wkt return - geom = ogr.CreateGeometryFromWkt(ref_polygon.centroid.wkt) - centroid_x, centroid_y = geom.GetPoint(0)[0], geom.GetPoint(0)[1] - centroid_list.append([centroid_x, centroid_y]) - - close_list = list() - print("Total number of polygons: ", len(centroid_list)) - tile_blocksize = 4 - - for id_i in range(0, size_i, 3): - if id_i + tile_blocksize < size_i: - n_i = tile_blocksize - else: - n_i = size_i - id_i - - for id_j in range(0, size_j, 3): - if id_j + tile_blocksize < size_j: - n_j = tile_blocksize - else: - n_j = size_j - id_j - - # add to the neighbor list. - centroid_neighbors = [] - poly_neighbors = [] - - for ii in range(n_i): - for jj in range(n_j): - if (ii + id_i) in dict_ij.keys(): - if (jj + id_j) in dict_ij[(ii + id_i)].keys(): - n = dict_ij[ii + id_i][jj + id_j] - poly_range = polygon_dict[n] - poly_list = [*range(poly_range[0], poly_range[1])] - poly_neighbors.extend(poly_list) - centroid_neighbors.extend( - centroid_list[poly_range[0]: poly_range[1]] - ) - - if len(centroid_neighbors) == 0: - continue - dst_array = distance.cdist( - centroid_neighbors, centroid_neighbors, "euclidean" - ) - - # filter out close objects - filter_object_array = np.argwhere( - (dst_array < 10) & (dst_array > 0)) - - filter_object_array[:, 0] = [ - poly_neighbors[i] for i in filter_object_array[:, 0] - ] - filter_object_array[:, 1] = [ - poly_neighbors[i] for i in filter_object_array[:, 1] - ] - - if filter_object_array.shape[0] != 0: - for i in filter_object_array: - close_list.append(i.tolist()) - else: - continue - - # remove duplicated index - close_list = set(frozenset(sublist) for sublist in close_list) - close_list = [list(x) for x in close_list] - - # --------------- looking for connected components in a graph --------------- - def connected_components(lists): - neighbors = defaultdict(set) - seen = set() - for each in lists: - for item in each: - neighbors[item].update(each) - - def component(node, neighbors=neighbors, seen=seen, see=seen.add): - nodes = set([node]) - next_node = nodes.pop - while nodes: - node = next_node() - see(node) - nodes |= neighbors[node] - seen - yield node - - for node in neighbors: - if node not in seen: - yield sorted(component(node)) - - close_list = list(connected_components(close_list)) - - # --------------- create a new shp file to store --------------- - # randomly pick one of many duplications - chosen_polygon_indexes = [] - for close_possible in close_list: - chosen_polygon_indexes.append(random.choice(close_possible)) - - image_shapefile_results_without_dups = [ - image_shapefile_results[index] for index in chosen_polygon_indexes] - - first_row["image_shapefile_results"] = ShapefileResults(image_shapefile_results_without_dups) - return first_row diff --git a/mpl_tile_and_stitch_ray_util.py b/mpl_tile_and_stitch_ray_util.py new file mode 100644 index 0000000..7778a2a --- /dev/null +++ b/mpl_tile_and_stitch_ray_util.py @@ -0,0 +1,329 @@ +#!/usr/bin/env python3 +""" +MAPLE Workflow +Functions for image tiling and stitching. 
+Project: Permafrost Discovery Gateway: Mapping Application for Arctic Permafrost Land Environment(MAPLE) +PI : Chandi Witharana +""" +import copy +from collections import defaultdict +from dataclasses import dataclass +from typing import Any, Dict, List +import random + +import cv2 +from osgeo import gdal, ogr +from scipy.spatial import distance +from shapely.geometry import Polygon +import numpy as np +import pandas as pd + +import gdal_virtual_file_system as gdal_vfs +from mpl_config import MPL_Config + + +@dataclass +class ImageMetadata: + len_x_list: int + len_y_list: int + x_resolution: float + y_resolution: float + + +@dataclass +class ImageTileMetadata: + upper_left_row: float + upper_left_col: float + id_i: int + id_j: int + tile_num: int + + +@dataclass +class ImageTile: + tile_values: np.array + tile_metadata: ImageTileMetadata + + +@dataclass +class ShapefileResult: + polygons: np.array + class_id: int + + +@dataclass +class ShapefileResults: + shapefile_results: List[ShapefileResult] + + +def tile_image(row: Dict[str, Any], config: MPL_Config) -> List[Dict[str, Any]]: + """ + Tile the image into multiple pre-deifined sized parts so that the processing can be done on smaller parts due to + processing limitations + + Parameters + ---------- + row : Row in ray dataset corresponding to the information for a single image. + config : Contains static configuration information regarding the workflow. + """ + + # Create virtual file system file for image to use GDAL's file apis. + vfs = gdal_vfs.GDALVirtualFileSystem( + file_path=row["path"], file_bytes=row["bytes"]) + virtual_image_file_path = vfs.create_virtual_file() + input_image_gtif = gdal.Open(virtual_image_file_path) + mask = row["mask"] + + # convert the original image into the geo cordinates for further processing using gdal + # https://gdal.org/tutorials/geotransforms_tut.html + # GT(0) x-coordinate of the upper-left corner of the upper-left pixel. + # GT(1) w-e pixel resolution / pixel width. + # GT(2) row rotation (typically zero). + # GT(3) y-coordinate of the upper-left corner of the upper-left pixel. + # GT(4) column rotation (typically zero). + # GT(5) n-s pixel resolution / pixel height (negative value for a north-up image). + + # ---------------------- crop image from the water mask---------------------- + # dot product of the mask and the orignal data before breaking it for processing + # Also band 2 3 and 4 are taken because the 4 bands cannot be processed by the NN learingin algo + # Need to make sure that the training bands are the same as the bands used for inferencing. + # + final_array_2 = input_image_gtif.GetRasterBand(2).ReadAsArray() + final_array_3 = input_image_gtif.GetRasterBand(3).ReadAsArray() + final_array_4 = input_image_gtif.GetRasterBand(4).ReadAsArray() + + final_array_2 = np.multiply(final_array_2, mask) + final_array_3 = np.multiply(final_array_3, mask) + final_array_4 = np.multiply(final_array_4, mask) + + # ulx, uly is the upper left corner + ulx, x_resolution, _, uly, _, y_resolution = input_image_gtif.GetGeoTransform() + + # ---------------------- Divide image (tile) ---------------------- + overlap_rate = 0.2 + block_size = config.CROP_SIZE + ysize = input_image_gtif.RasterYSize + xsize = input_image_gtif.RasterXSize + + # Close the file. 
+ input_image_gtif = None + vfs.close_virtual_file() + + tile_count = 0 + + y_list = range(0, ysize, int(block_size * (1 - overlap_rate))) + x_list = range(0, xsize, int(block_size * (1 - overlap_rate))) + + # ---------------------- Find each Upper left (x,y) for each divided images ---------------------- + # ***----------------- + # *** + # *** + # | + # | + # + tiles = [] + for id_i, i in enumerate(y_list): + # don't want moving window to be larger than row size of input raster + if i + block_size < ysize: + rows = block_size + else: + rows = ysize - i + + # read col + for id_j, j in enumerate(x_list): + if j + block_size < xsize: + cols = block_size + else: + cols = xsize - j + # get block out of the whole raster + # todo check the array values is similar as ReadAsArray() + band_1_array = final_array_4[i: i + rows, j: j + cols] + band_2_array = final_array_2[i: i + rows, j: j + cols] + band_3_array = final_array_3[i: i + rows, j: j + cols] + + # filter out black image + if ( + band_3_array[0, 0] == 0 + and band_3_array[0, -1] == 0 + and band_3_array[-1, 0] == 0 + and band_3_array[-1, -1] == 0 + ): + continue + + # stack three bands into one array + img = np.stack((band_1_array, band_2_array, band_3_array), axis=2) + cv2.normalize(img, img, 0, 255, cv2.NORM_MINMAX) + img = img.astype(np.uint8) + B, G, R = cv2.split(img) + out_B = cv2.equalizeHist(B) + out_R = cv2.equalizeHist(R) + out_G = cv2.equalizeHist(G) + final_image = np.array(cv2.merge((out_B, out_G, out_R))) + + # Upper left (x,y) for each images + ul_row_divided_img = uly + i * y_resolution + ul_col_divided_img = ulx + j * x_resolution + + tile_metadata = ImageTileMetadata( + upper_left_row=ul_row_divided_img, upper_left_col=ul_col_divided_img, tile_num=tile_count, id_i=id_i, id_j=id_j) + image_tile = ImageTile( + tile_values=final_image, tile_metadata=tile_metadata) + tiles.append(image_tile) + tile_count += 1 + + # --------------- Store all the title as an object file + image_metadata = ImageMetadata( + len_x_list=len(x_list), len_y_list=len(y_list), x_resolution=x_resolution, y_resolution=y_resolution) + row["image_metadata"] = image_metadata + new_rows = [] + tile_count = 0 + for image_tile in tiles: + new_row = copy.deepcopy(row) + new_row["image_tile"] = image_tile + new_row["tile_num"] = tile_count + new_rows.append(new_row) + tile_count += 1 + return new_rows + + +def stitch_shapefile(group: pd.DataFrame): + image_shapefile_results = [] + temp_polygon_dict = defaultdict(dict) + dict_ij = defaultdict(dict) + sorted_group = group.sort_values(by="tile_num") + for index, row in sorted_group.iterrows(): + image_shapefile_results.extend( + row["tile_shapefile_results"].shapefile_results) + image_tile = row["image_tile"] + tile_num = row["tile_num"] + temp_polygon_dict[tile_num] = row["num_polygons_in_tile"] + id_i = image_tile.tile_metadata.id_i + id_j = image_tile.tile_metadata.id_j + dict_ij[id_i][id_j] = tile_num + + polygon_dict = defaultdict(dict) + poly_count = 0 + for k, v in temp_polygon_dict.items(): + polygon_dict[k] = [poly_count, poly_count + v] + poly_count += v + + first_row = group.head(1) + image_data_from_arbitrary_row = first_row["image_metadata"][0] + size_i, size_j = image_data_from_arbitrary_row.len_y_list, image_data_from_arbitrary_row.len_x_list + + # create a list to store those centroid point + centroid_list = list() + # create a count number for final checking + for shapefile_result in image_shapefile_results: + # create a polygon in shapely + ref_polygon = Polygon(shapefile_result.polygons) + # parse 
wkt return + geom = ogr.CreateGeometryFromWkt(ref_polygon.centroid.wkt) + centroid_x, centroid_y = geom.GetPoint(0)[0], geom.GetPoint(0)[1] + centroid_list.append([centroid_x, centroid_y]) + + close_list = list() + print("Total number of polygons: ", len(centroid_list)) + tile_blocksize = 4 + + for id_i in range(0, size_i, 3): + if id_i + tile_blocksize < size_i: + n_i = tile_blocksize + else: + n_i = size_i - id_i + + for id_j in range(0, size_j, 3): + if id_j + tile_blocksize < size_j: + n_j = tile_blocksize + else: + n_j = size_j - id_j + + # add to the neighbor list. + centroid_neighbors = [] + poly_neighbors = [] + + for ii in range(n_i): + for jj in range(n_j): + if (ii + id_i) in dict_ij.keys(): + if (jj + id_j) in dict_ij[(ii + id_i)].keys(): + n = dict_ij[ii + id_i][jj + id_j] + poly_range = polygon_dict[n] + poly_list = [*range(poly_range[0], poly_range[1])] + poly_neighbors.extend(poly_list) + centroid_neighbors.extend( + centroid_list[poly_range[0]: poly_range[1]] + ) + + if len(centroid_neighbors) == 0: + continue + dst_array = distance.cdist( + centroid_neighbors, centroid_neighbors, "euclidean" + ) + + # filter out close objects + filter_object_array = np.argwhere( + (dst_array < 10) & (dst_array > 0)) + + filter_object_array[:, 0] = [ + poly_neighbors[i] for i in filter_object_array[:, 0] + ] + filter_object_array[:, 1] = [ + poly_neighbors[i] for i in filter_object_array[:, 1] + ] + + if filter_object_array.shape[0] != 0: + for i in filter_object_array: + close_list.append(i.tolist()) + else: + continue + + # remove duplicated index + close_list = set(frozenset(sublist) for sublist in close_list) + close_list = [list(x) for x in close_list] + + # --------------- looking for connected components in a graph --------------- + def connected_components(lists): + neighbors = defaultdict(set) + seen = set() + for each in lists: + for item in each: + neighbors[item].update(each) + + def component(node, neighbors=neighbors, seen=seen, see=seen.add): + nodes = set([node]) + next_node = nodes.pop + while nodes: + node = next_node() + see(node) + nodes |= neighbors[node] - seen + yield node + + for node in neighbors: + if node not in seen: + yield sorted(component(node)) + + close_list = list(connected_components(close_list)) + + # --------------- create a new shp file to store --------------- + # randomly pick one of many duplications + del_index_list = list() + for close_possible in close_list: + close_possible.sort() + random_id = close_possible[0] + # random_id = random.choice(close_possible) + close_possible.remove(random_id) + del_index_list.extend(close_possible) + + print("Features before: ", len(image_shapefile_results)) + + # Note that we sort the indices in reversed order to ensure that the shift of indices + # induced by the deletion of elements at lower indices won’t invalidate the index + # specifications of elements at larger indices + for index in sorted(del_index_list, reverse=True): + del image_shapefile_results[index] + + print("Features after: ", len(image_shapefile_results)) + + first_row["image_shapefile_results"] = ShapefileResults( + image_shapefile_results) + return first_row From 89d565655057aa24a188a3db75b64012eff58812 Mon Sep 17 00:00:00 2001 From: Kayla Hardie Date: Fri, 12 Apr 2024 19:01:21 +0000 Subject: [PATCH 16/31] Rename files, add compare_unordered_shapefiles for testing purposes --- compare_unordered_shapefiles.py | 51 +++++++ environment_maple.yml | 3 +- ray_image_preprocessing.py | 135 ++++++++++++++++++ mpl_infer_tiles_ray.py => 
ray_infer_tiles.py | 0 maple_workflow.py => ray_maple_workflow.py | 16 +-- ...ray_util.py => ray_tile_and_stitch_util.py | 0 ...apefiles_ray.py => ray_write_shapefiles.py | 1 + 7 files changed, 197 insertions(+), 9 deletions(-) create mode 100644 compare_unordered_shapefiles.py create mode 100644 ray_image_preprocessing.py rename mpl_infer_tiles_ray.py => ray_infer_tiles.py (100%) rename maple_workflow.py => ray_maple_workflow.py (86%) rename mpl_tile_and_stitch_ray_util.py => ray_tile_and_stitch_util.py (100%) rename write_shapefiles_ray.py => ray_write_shapefiles.py (99%) diff --git a/compare_unordered_shapefiles.py b/compare_unordered_shapefiles.py new file mode 100644 index 0000000..cca04a6 --- /dev/null +++ b/compare_unordered_shapefiles.py @@ -0,0 +1,51 @@ +import fiona +import argparse +import sys +from shapely.geometry import shape + +def feature_to_hashable(feature): + """Converts a feature into a hashable representation.""" + geometry_repr = shape(feature['geometry']).wkt # Represent geometry as WKT + attributes = tuple(sorted(feature['properties'].items())) # Sort attributes + return (geometry_repr, attributes) + +def compare_unordered_shapefiles(file1, file2): + """Compares shapefiles using the set approach.""" + with fiona.open(file1) as src1, fiona.open(file2) as src2: + set1 = {feature_to_hashable(feature) for feature in src1} + set2 = {feature_to_hashable(feature) for feature in src2} + + if len(set1) != len(set2): + print("{} doesn't have the same number of elements: {}, as {}: {}".format( + file1, len(set1), file2, len(set2) + )) + return + + if set1 == set2: + print("Shapefiles seem to have identical features") + return + + diff1 = set1 - set2 # Features only in shapefile1 + diff2 = set2 - set1 # Features only in shapefile2 + + print("Shapefiles have differences:") + print(f"{len(diff1)} features unique to {file1}") + print(f"{len(diff2)} features unique to {file2}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Compare the features in two shapefiles.") + parser.add_argument("file1", help="Path to the first shapefile") + parser.add_argument("file2", help="Path to the second shapefile") + + args = parser.parse_args() + + # Check if both file paths were provided + if not args.file1 or not args.file2: + print("Error: Please provide both shapefile paths.") + sys.exit(1) + + compare_unordered_shapefiles(args.file1, args.file2) + +# Usage: +#compare_unordered_shapefiles("/usr/local/google/home/kaylahardie/MAPLE_v3/data/ray_shapefiles/test_image_01.shp", "/usr/local/google/home/kaylahardie/MAPLE_v3/data/projected_shp/test_image_01/test_image_01.shp") +#compare_unordered_shapefiles("/usr/local/google/home/kaylahardie/MAPLE_v3/data/projected_shp copy/test_image_01/test_image_01.shp", "/usr/local/google/home/kaylahardie/MAPLE_v3/data/projected_shp/test_image_01/test_image_01.shp") \ No newline at end of file diff --git a/environment_maple.yml b/environment_maple.yml index 7f89678..221aff3 100644 --- a/environment_maple.yml +++ b/environment_maple.yml @@ -4,12 +4,13 @@ channels: dependencies: - python=3.10 - gdal + - fiona - pyshp - scikit-image - shapely - pip=23.3.1 - pip: - - tensorflow[and-cuda] + - tensorflow[and-cuda]==2.15.1 - opencv-python - ray - pandas==2.1.4 diff --git a/ray_image_preprocessing.py b/ray_image_preprocessing.py new file mode 100644 index 0000000..69109cc --- /dev/null +++ b/ray_image_preprocessing.py @@ -0,0 +1,135 @@ +""" +Preprocessing helper functions for the MAPLE pipeline. 
+ +Project: Permafrost Discovery Gateway: Mapping Application for Arctic Permafrost Land Environment(MAPLE) +PI : Chandi Witharana +""" + + +import os +from pathlib import Path +import shutil +from typing import Dict, Any + +import numpy as np +from osgeo import gdal +from skimage import filters, io +from skimage.morphology import disk + +from mpl_config import MPL_Config +import gdal_virtual_file_system as gdal_vfs + + + +def cal_water_mask(row: Dict[str, Any], config: MPL_Config) -> Dict[str, Any]: + """ + This will calculate the water mask to avoid (inference) processing of the masked areas with water + Uses gdal to transform the image into the required format. + Parameters + ---------- + row : Row in Ray Dataset, there is one row per image. + config : Contains static configuration information regarding the workflow. + """ + # os.path.join(worker_root, "water_shp") + worker_water_root = config.WATER_MASK_DIR + temp_water_root = ( + config.TEMP_W_IMG_DIR + ) # os.path.join(worker_root, "temp_8bitmask") + + image_name = row["image_name"] + worker_water_subroot = os.path.join(worker_water_root, image_name) + temp_water_subroot = os.path.join(temp_water_root, image_name) + # Prepare to make directories to create the files + try: + shutil.rmtree(worker_water_subroot) + except: + print("directory deletion failed") + pass + + try: + shutil.rmtree(temp_water_subroot) + except: + print("directory deletion failed") + pass + + # check local storage for temporary storage + Path(worker_water_subroot).mkdir(parents=True, exist_ok=True) + Path(temp_water_subroot).mkdir(parents=True, exist_ok=True) + + output_watermask = os.path.join( + worker_water_subroot, r"%s_watermask.tif" % image_name + ) + output_tif_8b_file = os.path.join( + temp_water_subroot, r"%s_8bit.tif" % image_name + ) + nir_band = 3 # set number of NIR band + + # %% Median and Otsu + value = 5 + + # Create virtual file system file for image to use GDAL's file apis. + vfs = gdal_vfs.GDALVirtualFileSystem( + file_path=row["path"], file_bytes=row["bytes"]) + virtual_file_path = vfs.create_virtual_file() + + # UPDATED CODE - amal 01/11/2023 + # cmd line execution thrown exceptions unable to capture + # Using gdal to execute the gdal_Translate + # output file checked against the cmd line gdal_translate + gdal.UseExceptions() # Enable errors + try: + gdal.Translate( + destName=output_tif_8b_file, + srcDS=virtual_file_path, + format="GTiff", + outputType=gdal.GDT_Byte, + ) + except RuntimeError: + print("gdal Translate failed with", gdal.GetLastErrorMsg()) + pass + + image = io.imread( + output_tif_8b_file + ) # image[rows, columns, dimensions]-> image[:,:,3] is near Infrared + nir = image[:, :, nir_band] + + bilat_img = filters.rank.median(nir, disk(value)) + + gtif = gdal.Open(virtual_file_path) + geotransform = gtif.GetGeoTransform() + sourceSR = gtif.GetProjection() + # Close the file. + gtif = None + vfs.close_virtual_file() + + x = np.shape(image)[1] + y = np.shape(image)[0] + + # Normalize and blur before thresholding. Usually instead of normalizing + # a rgb to greyscale transformation is applied. In this case, we are already + # dealing with a single channel so we divide all the pixels in the image by + # 255 to get a value between [0, 1]. 
+ normalized_bilat_img = bilat_img / 255 + normalized_blurred_bilat_img = filters.gaussian( + normalized_bilat_img, sigma=2.0) + + # find the threshold to filter if all values are same otsu cannot find value + # hence t is made to 0.0 + try: + t = filters.threshold_otsu(normalized_blurred_bilat_img) + except: + t = 0.0 + # perform inverse binary thresholding + mask = normalized_blurred_bilat_img > t + + # output np array as GeoTiff + dst_ds = gdal.GetDriverByName("GTiff").Create( + output_watermask, x, y, 1, gdal.GDT_Byte, ["NBITS=1"] + ) + dst_ds.GetRasterBand(1).WriteArray(mask) + dst_ds.SetGeoTransform(geotransform) + dst_ds.SetProjection(sourceSR) + dst_ds.FlushCache() + dst_ds = None + row["mask"] = mask + return row diff --git a/mpl_infer_tiles_ray.py b/ray_infer_tiles.py similarity index 100% rename from mpl_infer_tiles_ray.py rename to ray_infer_tiles.py diff --git a/maple_workflow.py b/ray_maple_workflow.py similarity index 86% rename from maple_workflow.py rename to ray_maple_workflow.py index 1811094..733f1ae 100644 --- a/maple_workflow.py +++ b/ray_maple_workflow.py @@ -21,10 +21,10 @@ import ray from mpl_config import MPL_Config -import mpl_preprocessing as preprocessing -import mpl_infer_tiles_ray as ray_inference -import write_shapefiles_ray as write_shapefiles -import mpl_tile_and_stitch_ray_util as tile_and_stitch_util +import ray_image_preprocessing +import ray_infer_tiles as ray_inference +import ray_write_shapefiles +import ray_tile_and_stitch_util def create_geotiff_images_dataset(input_image_dir: str) -> ray.data.Dataset: @@ -89,12 +89,12 @@ def add_image_name(row: Dict[str, Any]) -> Dict[str, Any]: config.INPUT_IMAGE_DIR).map(add_image_name) print("Ray dataset schema:", dataset.schema()) print("1. Start calculating watermask") - dataset_with_water_mask = dataset.map(fn=preprocessing.cal_water_mask, + dataset_with_water_mask = dataset.map(fn=ray_image_preprocessing.cal_water_mask, fn_kwargs={"config": config}) print("Dataset schema with water mask: ", dataset_with_water_mask.schema()) print("2. Start tiling image") image_tiles_dataset = dataset_with_water_mask.flat_map( - fn=tile_and_stitch_util.tile_image, fn_kwargs={"config": config}) + fn=ray_tile_and_stitch_util.tile_image, fn_kwargs={"config": config}) image_tiles_dataset = image_tiles_dataset.drop_columns(["mask"]) print("Dataset schema with image tiles: ", image_tiles_dataset.schema()) print("3. Start inferencing") @@ -103,12 +103,12 @@ def add_image_name(row: Dict[str, Any]) -> Dict[str, Any]: print("Dataset schema with inferenced tiles: ", inferenced_dataset.schema()) print("4. Start stitching") data_per_image = inferenced_dataset.groupby( - "image_name").map_groups(tile_and_stitch_util.stitch_shapefile) + "image_name").map_groups(ray_tile_and_stitch_util.stitch_shapefile) print("Dataset schema where each row is an image (result of a group by tile): ", data_per_image.schema()) print("5. 
Write shapefiles") shapefiles_dataset = data_per_image.map( - fn=write_shapefiles.WriteShapefiles, fn_constructor_kwargs={"shpfile_output_dir": config.TEST_SHAPEFILE}, concurrency=2) + fn=ray_write_shapefiles.WriteShapefiles, fn_constructor_kwargs={"shpfile_output_dir": config.TEST_SHAPEFILE}, concurrency=2) print("Done writing shapefiles", shapefiles_dataset.schema()) # Once you are done you can check the output on ArcGIS (win) or else you can check in QGIS (nx) Add the image and the diff --git a/mpl_tile_and_stitch_ray_util.py b/ray_tile_and_stitch_util.py similarity index 100% rename from mpl_tile_and_stitch_ray_util.py rename to ray_tile_and_stitch_util.py diff --git a/write_shapefiles_ray.py b/ray_write_shapefiles.py similarity index 99% rename from write_shapefiles_ray.py rename to ray_write_shapefiles.py index 1a9729e..fc351f6 100644 --- a/write_shapefiles_ray.py +++ b/ray_write_shapefiles.py @@ -131,6 +131,7 @@ def write_shapefile(self, row: Dict[str, Any], shapefile_output_dir_for_image: s def __call__(self, row: Dict[str, Any]) -> Dict[str, Any]: image_name = row["image_name"] + print("Writing shapefiles for:", image_name) shapefile_output_dir_for_image = os.path.join( self.shpfile_output_dir, f"{image_name}.shp") self.write_shapefile(row, shapefile_output_dir_for_image) From c03addd5efc918fff1793a411e05c2265f89a7ed Mon Sep 17 00:00:00 2001 From: Kayla Hardie Date: Fri, 12 Apr 2024 19:04:17 +0000 Subject: [PATCH 17/31] Add back original maple_workflow file --- maple_workflow.py | 341 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 341 insertions(+) create mode 100644 maple_workflow.py diff --git a/maple_workflow.py b/maple_workflow.py new file mode 100644 index 0000000..7886e29 --- /dev/null +++ b/maple_workflow.py @@ -0,0 +1,341 @@ +""" +MAPLE Workflow +Main Script that runs the inference workflow pipeline. +Pre Process +1. Create water mask +2. Image Tiling +Classification / Inference +3. Infer "ground truth" from images based on the trained model +Post processing +4. Stich back to the original image dims from the tiles (2.) +5. Clean the output based on known ground truth + +Project: Permafrost Discovery Gateway: Mapping Application for Arctic Permafrost Land Environment(MAPLE) +PI : Chandi Witharana +Author : Rajitha Udwalpola +""" + +import argparse +import mpl_clean_inference as inf_clean +import mpl_divideimg_234_water_new as divide +import mpl_infer_tiles_GPU_new as inference +import mpl_process_shapefile as process +import mpl_stitchshpfile_new as stich +import numpy as np +import os +import sys +import shutil +from pathlib import Path + +from mpl_config import MPL_Config +from osgeo import gdal +from skimage import color, filters, io +from skimage.morphology import disk +import tensorflow as tf + +# work tag +WORKTAG = 1 +DIETAG = 0 + + +def tile_image(config: MPL_Config, input_img_name: str): + """ + Tile the image into multiple pre-deifined sized parts so that the processing can be done on smaller parts due to + processing limitations + + Parameters + ---------- + config : Contains static configuration information regarding the workflow. 
+ input_img_name : Name of the input image + """ + sys.path.append(config.ROOT_DIR) + + crop_size = config.CROP_SIZE + + # worker roots + worker_img_root = config.INPUT_IMAGE_DIR + worker_divided_img_root = config.DIVIDED_IMAGE_DIR + # input image path + input_img_path = os.path.join(worker_img_root, input_img_name) + + # Create subfolder for each image + new_file_name = input_img_name.split(".tif")[0] + worker_divided_img_subroot = os.path.join(worker_divided_img_root, new_file_name) + + print(worker_divided_img_subroot) + + try: + shutil.rmtree(worker_divided_img_subroot) + except: + print("directory deletion failed") + pass + os.mkdir(worker_divided_img_subroot) + + file1 = os.path.join(worker_divided_img_subroot, "image_data.h5") + file2 = os.path.join(worker_divided_img_subroot, "image_param.h5") + # ---------------------------------------------------------------------------------------------------- + # Call divide image to put the water mask and also to tile and store the data + # Multiple image overlaps are NOT taken into account in called code. + # + divide.divide_image(config, input_img_path, crop_size, file1, file2) + + print("finished tiling") + + +def cal_water_mask(config: MPL_Config, input_img_name: str): + """ + This will calculate the water mask to avoid (inference) processing of the masked areas with water + Uses gdal to transform the image into the required format. + Parameters + ---------- + config : Contains static configuration information regarding the workflow. + input_img_name : Name of the input image + """ + image_file_name = (input_img_name).split(".tif")[0] + + worker_water_root = config.WATER_MASK_DIR # os.path.join(worker_root, "water_shp") + temp_water_root = ( + config.TEMP_W_IMG_DIR + ) # os.path.join(worker_root, "temp_8bitmask") + + worker_water_subroot = os.path.join(worker_water_root, image_file_name) + temp_water_subroot = os.path.join(temp_water_root, image_file_name) + # Prepare to make directories to create the files + try: + shutil.rmtree(worker_water_subroot) + except: + print("directory deletion failed") + pass + + try: + shutil.rmtree(temp_water_subroot) + except: + print("directory deletion failed") + pass + + # check local storage for temporary storage + Path(worker_water_subroot).mkdir(parents=True, exist_ok=True) + Path(temp_water_subroot).mkdir(parents=True, exist_ok=True) + + output_watermask = os.path.join( + worker_water_subroot, r"%s_watermask.tif" % image_file_name + ) + output_tif_8b_file = os.path.join( + temp_water_subroot, r"%s_8bit.tif" % image_file_name + ) + nir_band = 3 # set number of NIR band + + input_image = os.path.join(config.INPUT_IMAGE_DIR, input_img_name) + + # %% Median and Otsu + value = 5 + + ### UPDATED CODE - amal 01/11/2023 + # cmd line execution thrown exceptions unable to capture + # Using gdal to execute the gdal_Translate + # output file checked against the cmd line gdal_translate + gdal.UseExceptions() # Enable errors + try: + gdal.Translate( + destName=output_tif_8b_file, + srcDS=input_image, + format="GTiff", + outputType=gdal.GDT_Byte, + ) + except RuntimeError: + print("gdal Translate failed with", gdal.GetLastErrorMsg()) + pass + + image = io.imread( + output_tif_8b_file + ) # image[rows, columns, dimensions]-> image[:,:,3] is near Infrared + nir = image[:, :, nir_band] + + bilat_img = filters.rank.median(nir, disk(value)) + + gtif = gdal.Open(input_image) + geotransform = gtif.GetGeoTransform() + sourceSR = gtif.GetProjection() + + x = np.shape(image)[1] + y = np.shape(image)[0] + + # Normalize and 
blur before thresholding. Usually instead of normalizing + # a rgb to greyscale transformation is applied. In this case, we are already + # dealing with a single channel so we divide all the pixels in the image by + # 255 to get a value between [0, 1]. + normalized_bilat_img = bilat_img / 255 + normalized_blurred_bilat_img = filters.gaussian(normalized_bilat_img, sigma=2.0) + + # find the threshold to filter if all values are same otsu cannot find value + # hence t is made to 0.0 + try: + t = filters.threshold_otsu(normalized_blurred_bilat_img) + except: + t = 0.0 + # perform inverse binary thresholding + mask = normalized_blurred_bilat_img > t + + # output np array as GeoTiff + dst_ds = gdal.GetDriverByName("GTiff").Create( + output_watermask, x, y, 1, gdal.GDT_Byte, ["NBITS=1"] + ) + dst_ds.GetRasterBand(1).WriteArray(mask) + dst_ds.SetGeoTransform(geotransform) + dst_ds.SetProjection(sourceSR) + dst_ds.FlushCache() + dst_ds = None + + +def infer_image(config: MPL_Config, input_img_name: str): + """ + Inference based on the trained model reperesented by the saved weights + + Parameters + ---------- + config : Contains static configuration information regarding the workflow. + input_img_name : Name of the input image file + """ + sys.path.append(config.ROOT_DIR) + + # worker roots + worker_divided_img_root = config.DIVIDED_IMAGE_DIR + + # Create subfolder for each image + new_file_name = input_img_name.split(".tif")[0] + worker_divided_img_subroot = os.path.join(worker_divided_img_root, new_file_name) + + print(worker_divided_img_subroot) + + file1 = os.path.join(worker_divided_img_subroot, "image_data.h5") + file2 = os.path.join(worker_divided_img_subroot, "image_param.h5") + + worker_output_shp_root = config.OUTPUT_SHP_DIR + worker_output_shp_subroot = os.path.join(worker_output_shp_root, new_file_name) + try: + shutil.rmtree(worker_output_shp_subroot) + except: + print("directory deletion failed") + pass + + + inference.inference_image( + config, + worker_output_shp_subroot, + file1, + file2, + new_file_name, + ) + + print("done") + + +def stich_shapefile(config: MPL_Config, input_img_name: str): + """ + Put (stich) the image tiles back to the original + + Parameters + ---------- + config : Contains static configuration information regarding the workflow. + input_img_name : Name of the input image file + + Returns + ------- + + """ + sys.path.append(config.ROOT_DIR) + + worker_finaloutput_root = config.FINAL_SHP_DIR + worker_output_shp_root = config.OUTPUT_SHP_DIR + + # Create subfolder for each image within the worker img root + new_file_name = input_img_name.split(".tif")[0] + + worker_finaloutput_subroot = os.path.join(worker_finaloutput_root, new_file_name) + worker_output_shp_subroot = os.path.join(worker_output_shp_root, new_file_name) + + try: + shutil.rmtree(worker_finaloutput_subroot) + except: + print("directory deletion failed") + pass + os.mkdir(worker_finaloutput_subroot) + + stich.stitch_shapefile( + config, + worker_output_shp_subroot, + worker_finaloutput_subroot, + new_file_name, + new_file_name, + ) + + return "done Divide" + + +if __name__ == "__main__": + tf.compat.v1.disable_eager_execution() + parser = argparse.ArgumentParser( + description="Extract IWPs from satellite image scenes using MAPLE." 
+ ) + + # Optional Arguments + parser.add_argument( + "--image", + required=False, + default="test_image_01.tif", + metavar="", + help="Image name", + ) + + parser.add_argument( + "--root_dir", + required=False, + default="", + help="The directory path from where the workflow is running. If none is " + "provided, the current working directory will be used by the workflow.", + ) + + parser.add_argument( + "--weight_file", + required=False, + default="hyp_best_train_weights_final.h5", + help="The file path to where the model weights can be found. Should be " + "relative to the root directory.", + ) + + parser.add_argument( + "--gpus_per_core", + required=False, + default=1, + help="Number of GPUs available per core. Used to determine how many " + "inference processes to spin up. Set this to 0 if you want to run the " + "workflow on a CPU.", + type=int + ) + + args = parser.parse_args() + + image_name = args.image + config = MPL_Config( + args.root_dir, args.weight_file, num_gpus_per_core=args.gpus_per_core + ) + + print("1.start caculating wartermask") + cal_water_mask(config, image_name) + print("2. start tiling image") + tile_image(config, image_name) + print("3. start inferencing") + infer_image(config, image_name) + print("4. start stiching") + stich_shapefile(config, image_name) + process.process_shapefile(config, image_name) + print("5. finish stichting") + print("5. start cleaning") + inf_clean.clean_inference_shapes( + config.CLEAN_DATA_DIR, + config.PROJECTED_SHP_DIR, + "./data/input_bound/sample2_out_boundry.shp", + ) + +# Once you are done you can check the output on ArcGIS (win) or else you can check in QGIS (nx) Add the image and the +# shp, shx, dbf as layers. \ No newline at end of file From acfd23bf7194a97c2be0c4307f8baac6dd44d1aa Mon Sep 17 00:00:00 2001 From: Kayla Hardie Date: Fri, 12 Apr 2024 21:37:25 +0000 Subject: [PATCH 18/31] fix file name indexing for shapefile --- ray_write_shapefiles.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ray_write_shapefiles.py b/ray_write_shapefiles.py index fc351f6..8007d5f 100644 --- a/ray_write_shapefiles.py +++ b/ray_write_shapefiles.py @@ -124,8 +124,8 @@ def write_shapefile(self, row: Dict[str, Any], shapefile_output_dir_for_image: s length = max(edge_length) width = min(edge_length) - w.record(Class=shapefile_result.class_id, Sensor=image_name[0:1], Date=image_name[1:2], - Time=image_name[2:3], CatalogID=image_name[3:4], Area=poly.area, + w.record(Class=shapefile_result.class_id, Sensor=image_name[0:4], Date=image_name[5:13], + Time=image_name[13:19], CatalogID=image_name[20:36], Area=poly.area, CentroidX=centroid.x, CentroidY=centroid.y, Perimeter=poly.length, Length=length, Width=width) w.close() From 68c1c95a5a4c5ea46109d52377399794e1abb058 Mon Sep 17 00:00:00 2001 From: Kayla Hardie Date: Fri, 12 Apr 2024 21:45:54 +0000 Subject: [PATCH 19/31] Add documentation for compare_unordered_shapefiles --- compare_unordered_shapefiles.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/compare_unordered_shapefiles.py b/compare_unordered_shapefiles.py index cca04a6..506d09a 100644 --- a/compare_unordered_shapefiles.py +++ b/compare_unordered_shapefiles.py @@ -1,24 +1,32 @@ -import fiona +""" +Compares the features in two shapefiles. The geometry and properties for each feature are compared. +The features don't have to be in the same order in the two files for them to be equal. This approach +assumes that each feature in the file is unique (ie. 
it doesn't guarantee equality if there are +duplicate features in one of the files.) +""" import argparse import sys + +import fiona from shapely.geometry import shape + def feature_to_hashable(feature): """Converts a feature into a hashable representation.""" - geometry_repr = shape(feature['geometry']).wkt # Represent geometry as WKT - attributes = tuple(sorted(feature['properties'].items())) # Sort attributes + geometry_repr = shape(feature['geometry']).wkt + attributes = tuple(sorted(feature['properties'].items())) return (geometry_repr, attributes) + def compare_unordered_shapefiles(file1, file2): - """Compares shapefiles using the set approach.""" + """Compares shapefiles using sets. This approach doesn't work if there are duplicate features in one of the files.""" with fiona.open(file1) as src1, fiona.open(file2) as src2: set1 = {feature_to_hashable(feature) for feature in src1} set2 = {feature_to_hashable(feature) for feature in src2} if len(set1) != len(set2): - print("{} doesn't have the same number of elements: {}, as {}: {}".format( - file1, len(set1), file2, len(set2) - )) + print( + f"{file1} doesn't have the same number of elements: {len(set1)}, as {file2}: {len(set2)}") return if set1 == set2: @@ -32,8 +40,10 @@ def compare_unordered_shapefiles(file1, file2): print(f"{len(diff1)} features unique to {file1}") print(f"{len(diff2)} features unique to {file2}") + if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Compare the features in two shapefiles.") + parser = argparse.ArgumentParser( + description="Compare the features in two shapefiles.") parser.add_argument("file1", help="Path to the first shapefile") parser.add_argument("file2", help="Path to the second shapefile") @@ -47,5 +57,4 @@ def compare_unordered_shapefiles(file1, file2): compare_unordered_shapefiles(args.file1, args.file2) # Usage: -#compare_unordered_shapefiles("/usr/local/google/home/kaylahardie/MAPLE_v3/data/ray_shapefiles/test_image_01.shp", "/usr/local/google/home/kaylahardie/MAPLE_v3/data/projected_shp/test_image_01/test_image_01.shp") -#compare_unordered_shapefiles("/usr/local/google/home/kaylahardie/MAPLE_v3/data/projected_shp copy/test_image_01/test_image_01.shp", "/usr/local/google/home/kaylahardie/MAPLE_v3/data/projected_shp/test_image_01/test_image_01.shp") \ No newline at end of file +# python compare_unordered_shapefiles.py From bdbe6fb522eedb54e92d859b067b92a43e69f9a4 Mon Sep 17 00:00:00 2001 From: Kayla Hardie Date: Fri, 12 Apr 2024 22:04:00 +0000 Subject: [PATCH 20/31] Adding more documentation --- gdal_virtual_file_system.py | 5 +++++ mpl_config.py | 2 +- ray_image_preprocessing.py | 5 +++-- ray_maple_workflow.py | 2 +- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/gdal_virtual_file_system.py b/gdal_virtual_file_system.py index 3f74303..d053a8f 100644 --- a/gdal_virtual_file_system.py +++ b/gdal_virtual_file_system.py @@ -1,3 +1,8 @@ +""" +Class that manages open and closing of a single gdal file. +The pattern is to instantiate the class, call create to open it and then +call close to close the file. 
+""" import os from osgeo import gdal diff --git a/mpl_config.py b/mpl_config.py index 6494e3e..31639b6 100644 --- a/mpl_config.py +++ b/mpl_config.py @@ -49,7 +49,7 @@ def __init__( self.OUTPUT_IMAGE_DIR = self.ROOT_DIR + r"/data/output_img" self.WORKER_ROOT = self.ROOT_DIR + r"/data" self.MODEL_DIR = self.ROOT_DIR + r"/local_dir/datasets/logs" - self.TEST_SHAPEFILE = self.ROOT_DIR + r"/data/ray_shapefiles" + self.RAY_SHAPEFILES = self.ROOT_DIR + r"/data/ray_shapefiles" # ADDED to include inference cleaning post-processing self.CLEAN_DATA_DIR = self.ROOT_DIR + r"/data/cln_data" diff --git a/ray_image_preprocessing.py b/ray_image_preprocessing.py index 69109cc..6409c34 100644 --- a/ray_image_preprocessing.py +++ b/ray_image_preprocessing.py @@ -25,16 +25,17 @@ def cal_water_mask(row: Dict[str, Any], config: MPL_Config) -> Dict[str, Any]: """ This will calculate the water mask to avoid (inference) processing of the masked areas with water Uses gdal to transform the image into the required format. + The result of this function is that it will add a "mask" column with the water mask to each row. + It also writes the water mask to the config.WATER_MASK_DIR. Parameters ---------- row : Row in Ray Dataset, there is one row per image. config : Contains static configuration information regarding the workflow. """ - # os.path.join(worker_root, "water_shp") worker_water_root = config.WATER_MASK_DIR temp_water_root = ( config.TEMP_W_IMG_DIR - ) # os.path.join(worker_root, "temp_8bitmask") + ) image_name = row["image_name"] worker_water_subroot = os.path.join(worker_water_root, image_name) diff --git a/ray_maple_workflow.py b/ray_maple_workflow.py index 733f1ae..7ca941c 100644 --- a/ray_maple_workflow.py +++ b/ray_maple_workflow.py @@ -108,7 +108,7 @@ def add_image_name(row: Dict[str, Any]) -> Dict[str, Any]: data_per_image.schema()) print("5. Write shapefiles") shapefiles_dataset = data_per_image.map( - fn=ray_write_shapefiles.WriteShapefiles, fn_constructor_kwargs={"shpfile_output_dir": config.TEST_SHAPEFILE}, concurrency=2) + fn=ray_write_shapefiles.WriteShapefiles, fn_constructor_kwargs={"shpfile_output_dir": config.RAY_SHAPEFILES}, concurrency=2) print("Done writing shapefiles", shapefiles_dataset.schema()) # Once you are done you can check the output on ArcGIS (win) or else you can check in QGIS (nx) Add the image and the From 18dea967f049bb7b8d8194f7a9441e673749a8ba Mon Sep 17 00:00:00 2001 From: Kayla Hardie Date: Fri, 12 Apr 2024 22:09:11 +0000 Subject: [PATCH 21/31] More documentation --- ...unordered_shapefiles.py => compare_shapefile_features.py | 6 +++--- ray_maple_workflow.py | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) rename compare_unordered_shapefiles.py => compare_shapefile_features.py (91%) diff --git a/compare_unordered_shapefiles.py b/compare_shapefile_features.py similarity index 91% rename from compare_unordered_shapefiles.py rename to compare_shapefile_features.py index 506d09a..06924f2 100644 --- a/compare_unordered_shapefiles.py +++ b/compare_shapefile_features.py @@ -18,7 +18,7 @@ def feature_to_hashable(feature): return (geometry_repr, attributes) -def compare_unordered_shapefiles(file1, file2): +def compare_shapefile_features(file1, file2): """Compares shapefiles using sets. 
This approach doesn't work if there are duplicate features in one of the files."""
     with fiona.open(file1) as src1, fiona.open(file2) as src2:
         set1 = {feature_to_hashable(feature) for feature in src1}
         set2 = {feature_to_hashable(feature) for feature in src2}
@@ -54,7 +54,7 @@ def compare_unordered_shapefiles(file1, file2):
         print("Error: Please provide both shapefile paths.")
         sys.exit(1)
 
-    compare_unordered_shapefiles(args.file1, args.file2)
+    compare_shapefile_features(args.file1, args.file2)
 
 # Usage:
-# python compare_unordered_shapefiles.py
+# python compare_shapefile_features.py
diff --git a/ray_maple_workflow.py b/ray_maple_workflow.py
index 7ca941c..ba8c18f 100644
--- a/ray_maple_workflow.py
+++ b/ray_maple_workflow.py
@@ -113,3 +113,5 @@ def add_image_name(row: Dict[str, Any]) -> Dict[str, Any]:
 
 # Once you are done you can check the output on ArcGIS (win) or else you can check in QGIS (nx) Add the image and the
 # shp, shx, dbf as layers.
+# You can also look at compare_shapefile_features.py for how to compare the features in two shapefiles.
+# You can also use 'ogrinfo -so -al ' on the command line to examine a shapefile.

From 99f7b41c3ef5e9774bc6d017c1969b73b3ca12bf Mon Sep 17 00:00:00 2001
From: Kayla Hardie
Date: Fri, 12 Apr 2024 22:13:58 +0000
Subject: [PATCH 22/31] More comments

---
 ray_tile_and_stitch_util.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/ray_tile_and_stitch_util.py b/ray_tile_and_stitch_util.py
index 7778a2a..ced68e2 100644
--- a/ray_tile_and_stitch_util.py
+++ b/ray_tile_and_stitch_util.py
@@ -34,6 +34,7 @@ class ImageMetadata:
 class ImageTileMetadata:
     upper_left_row: float
     upper_left_col: float
+    # Indexes that are used to reconstruct image after tiling.
     id_i: int
     id_j: int
     tile_num: int
@@ -187,6 +188,17 @@ def tile_image(row: Dict[str, Any], config: MPL_Config) -> List[Dict[str, Any]]:
 
 
 def stitch_shapefile(group: pd.DataFrame):
+    """
+    Create a shapefile for each image.
+    Note that normally ray rows are dictionaries, but this is a group because we've called
+    groupby. When ray does a groupby and the rows can't be represented in numpy arrays,
+    it uses a pandas DataFrame.
+
+    Parameters
+    ----------
+    group: a pandas dataframe that has all of the shapefile information for each tile in the
+    image.
+ """ image_shapefile_results = [] temp_polygon_dict = defaultdict(dict) dict_ij = defaultdict(dict) From f43a4b6b5edf9608920c0714810658f3c26ae09d Mon Sep 17 00:00:00 2001 From: Kayla Hardie Date: Thu, 16 May 2024 23:15:38 +0000 Subject: [PATCH 23/31] Making it possible to read and write to gcs buckets when running ray_maple_workflow.py --- environment_maple.yml | 2 ++ mpl_config.py | 17 +++++++++--- ray_infer_tiles.py | 23 ++++++++++++++--- ray_maple_workflow.py | 18 ++++++++----- ray_write_shapefiles.py | 57 +++++++++++++++++++++++++++++------------ 5 files changed, 86 insertions(+), 31 deletions(-) diff --git a/environment_maple.yml b/environment_maple.yml index 221aff3..63c16a2 100644 --- a/environment_maple.yml +++ b/environment_maple.yml @@ -10,6 +10,8 @@ dependencies: - shapely - pip=23.3.1 - pip: + - gcsfs + - google-auth - tensorflow[and-cuda]==2.15.1 - opencv-python - ray diff --git a/mpl_config.py b/mpl_config.py index 31639b6..a250fa4 100644 --- a/mpl_config.py +++ b/mpl_config.py @@ -11,6 +11,9 @@ import os +import gcsfs +import google.auth + from config import Config @@ -34,12 +37,12 @@ def __init__( crop_size=200, num_gpus_per_core=1, ): - ## Do not change this section + # Do not change this section # Code depends on the relative locations indicated so should not change # Code expects some of the locations to be available when executing. # ----------------------------------------------------------------- self.ROOT_DIR = root_dir if root_dir else os.getcwd() - self.INPUT_IMAGE_DIR = self.ROOT_DIR + r"/data/input_img_local" + self.INPUT_IMAGE_DIR = self.ROOT_DIR + r"/data/input_img" self.DIVIDED_IMAGE_DIR = self.ROOT_DIR + r"/data/divided_img" self.OUTPUT_SHP_DIR = self.ROOT_DIR + r"/data/output_shp" self.FINAL_SHP_DIR = self.ROOT_DIR + r"/data/final_shp" @@ -49,7 +52,15 @@ def __init__( self.OUTPUT_IMAGE_DIR = self.ROOT_DIR + r"/data/output_img" self.WORKER_ROOT = self.ROOT_DIR + r"/data" self.MODEL_DIR = self.ROOT_DIR + r"/local_dir/datasets/logs" - self.RAY_SHAPEFILES = self.ROOT_DIR + r"/data/ray_shapefiles" + self.RAY_OUTPUT_SHAPEFILES_DIR = self.ROOT_DIR + r"/data/ray_output_shapefiles" + + self.GCP_FILESYSTEM = None + if (self.ROOT_DIR.startswith( + "gcs://") or self.ROOT_DIR.startswith("gs://")): + creds, _ = google.auth.load_credentials_from_file( + "/usr/local/google/home/kaylahardie/.config/gcloud/application_default_credentials.json", scopes=["https://www.googleapis.com/auth/cloud-platform"]) + self.GCP_FILESYSTEM = gcsfs.GCSFileSystem( + project="pdg-project-406720", token=creds) # ADDED to include inference cleaning post-processing self.CLEAN_DATA_DIR = self.ROOT_DIR + r"/data/cln_data" diff --git a/ray_infer_tiles.py b/ray_infer_tiles.py index ceb0e5a..83c87d7 100644 --- a/ray_infer_tiles.py +++ b/ray_infer_tiles.py @@ -10,6 +10,8 @@ """ from dataclasses import dataclass +import os +import tempfile from typing import Any, Dict, List import numpy as np @@ -19,11 +21,13 @@ import model as modellib from mpl_config import MPL_Config, PolygonConfig + @dataclass class ShapefileResult: polygons: np.array class_id: int + @dataclass class ShapefileResults: shapefile_results: List[ShapefileResult] @@ -44,10 +48,21 @@ def __init__( with tf.device(self.device): self.model = modellib.MaskRCNN( - mode="inference", model_dir=self.config.MODEL_DIR, config=PolygonConfig() - ) - self.model.keras_model.load_weights( - self.config.WEIGHT_PATH, by_name=True) + mode="inference", model_dir=self.config.MODEL_DIR, config=PolygonConfig()) + + if self.config.GCP_FILESYSTEM is not None: + 
# Create a temporary file because keras load weights doesn't work with a hd5 file object, + # it only works with a file path. We need the file object to do the authentication. + with tempfile.NamedTemporaryFile(suffix=".h5", delete=False) as temp_file: + # Copy the GCS file to the temporary file + config.GCP_FILESYSTEM.get(self.config.WEIGHT_PATH, temp_file.name) + + # Load weights from the temporary file + self.model.keras_model.load_weights(temp_file.name, by_name=True) + os.remove(temp_file.name) + else: + self.model.keras_model.load_weights(self.config.WEIGHT_PATH, by_name=True) + def __call__(self, row: Dict[str, Any]) -> Dict[str, Any]: diff --git a/ray_maple_workflow.py b/ray_maple_workflow.py index ba8c18f..674f345 100644 --- a/ray_maple_workflow.py +++ b/ray_maple_workflow.py @@ -27,8 +27,10 @@ import ray_tile_and_stitch_util -def create_geotiff_images_dataset(input_image_dir: str) -> ray.data.Dataset: - return ray.data.read_binary_files(input_image_dir, include_paths=True) +def create_geotiff_images_dataset(config: MPL_Config) -> ray.data.Dataset: + if config.GCP_FILESYSTEM is not None: + return ray.data.read_binary_files(config.INPUT_IMAGE_DIR + "/", filesystem=config.GCP_FILESYSTEM, include_paths=True) + return ray.data.read_binary_files(config.INPUT_IMAGE_DIR, include_paths=True) def add_image_name(row: Dict[str, Any]) -> Dict[str, Any]: @@ -37,6 +39,7 @@ def add_image_name(row: Dict[str, Any]) -> Dict[str, Any]: if __name__ == "__main__": + os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/usr/local/google/home/kaylahardie/.config/gcloud/application_default_credentials.json" tf.compat.v1.disable_eager_execution() parser = argparse.ArgumentParser( description="Extract IWPs from satellite image scenes using MAPLE." @@ -56,7 +59,9 @@ def add_image_name(row: Dict[str, Any]) -> Dict[str, Any]: required=False, default="", help="The directory path from where the workflow is running. If none is " - "provided, the current working directory will be used by the workflow.", + "provided, the current working directory will be used by the workflow. " + "If the root directory starts with gcs:// or gs:// then the workflow will " + "read and write to the google cloud storage buckets.", ) parser.add_argument( @@ -85,8 +90,7 @@ def add_image_name(row: Dict[str, Any]) -> Dict[str, Any]: ) print("0. Load geotiffs into ray dataset") - dataset = create_geotiff_images_dataset( - config.INPUT_IMAGE_DIR).map(add_image_name) + dataset = create_geotiff_images_dataset(config).map(add_image_name) print("Ray dataset schema:", dataset.schema()) print("1. Start calculating watermask") dataset_with_water_mask = dataset.map(fn=ray_image_preprocessing.cal_water_mask, @@ -108,9 +112,9 @@ def add_image_name(row: Dict[str, Any]) -> Dict[str, Any]: data_per_image.schema()) print("5. Write shapefiles") shapefiles_dataset = data_per_image.map( - fn=ray_write_shapefiles.WriteShapefiles, fn_constructor_kwargs={"shpfile_output_dir": config.RAY_SHAPEFILES}, concurrency=2) + fn=ray_write_shapefiles.WriteShapefiles, fn_constructor_kwargs={"config": config}, concurrency=2) print("Done writing shapefiles", shapefiles_dataset.schema()) - + # Once you are done you can check the output on ArcGIS (win) or else you can check in QGIS (nx) Add the image and the # shp, shx, dbf as layers. # You can also look at compare_shapefile_features.py for how to compare the features in two shapefiles. 
diff --git a/ray_write_shapefiles.py b/ray_write_shapefiles.py index 8007d5f..c4abfca 100644 --- a/ray_write_shapefiles.py +++ b/ray_write_shapefiles.py @@ -1,11 +1,14 @@ import os import shutil -import shapefile +from io import BytesIO +from typing import Any, Dict +import shapefile from osgeo import gdal, osr -import gdal_virtual_file_system as gdal_vfs from shapely.geometry import Polygon, Point -from typing import Any, Dict + +import gdal_virtual_file_system as gdal_vfs +from mpl_config import MPL_Config def delete_and_create_dir(dir: str): @@ -24,10 +27,11 @@ def delete_and_create_dir(dir: str): class WriteShapefiles: def __init__( self, - shpfile_output_dir: str + config: MPL_Config ): - self.shpfile_output_dir = shpfile_output_dir - delete_and_create_dir(self.shpfile_output_dir) + self.config = config + if self.config.GCP_FILESYSTEM is None: + delete_and_create_dir(self.config.RAY_SHAPEFILES) def get_coordinate_system_info(self, file_path: str, file_bytes: bytes): try: @@ -84,9 +88,12 @@ def write_prj_file(self, geotiff_path: str, geotiff_bytes: bytes, prj_file_path: print(wkt) if wkt is not None: # Write the WKT to a .prj file - with open(prj_file_path, "w") as prj_file: - prj_file.write(wkt) - + if self.config.GCP_FILESYSTEM is not None: + with self.config.GCP_FILESYSTEM.open(prj_file_path, "w") as prj_file: + prj_file.write(wkt) + else: + with open(prj_file_path, "w") as prj_file: + prj_file.write(wkt) print( f"Coordinate system information written to {prj_file_path}") else: @@ -95,8 +102,7 @@ def write_prj_file(self, geotiff_path: str, geotiff_bytes: bytes, prj_file_path: except Exception as e: print(f"Error when trying to write the prj file: {e}") - def write_shapefile(self, row: Dict[str, Any], shapefile_output_dir_for_image: str): - w = shapefile.Writer(shapefile_output_dir_for_image) + def write_shapefile_helper(self, row: Dict[str, Any], w): w.field("Class", "C", size=5) w.field("Sensor", "C", "10") w.field("Date", "C", "10") @@ -127,18 +133,35 @@ def write_shapefile(self, row: Dict[str, Any], shapefile_output_dir_for_image: s w.record(Class=shapefile_result.class_id, Sensor=image_name[0:4], Date=image_name[5:13], Time=image_name[13:19], CatalogID=image_name[20:36], Area=poly.area, CentroidX=centroid.x, CentroidY=centroid.y, Perimeter=poly.length, Length=length, Width=width) - w.close() + + def write_shapefile(self, row: Dict[str, Any], shapefile_output_dir_for_image): + if self.config.GCP_FILESYSTEM is not None: + with BytesIO() as shp_mem, BytesIO() as shx_mem, BytesIO() as dbf_mem: + with shapefile.Writer(shp=shp_mem, shx=shx_mem, dbf=dbf_mem) as writer: + self.write_shapefile_helper(row, writer) + writer.close() + + with self.config.GCP_FILESYSTEM.open(f"{shapefile_output_dir_for_image}.shp", 'wb') as shp_file: + shp_file.write(shp_mem.getvalue()) + with self.config.GCP_FILESYSTEM.open(f"{shapefile_output_dir_for_image}.shx", 'wb') as shx_file: + shx_file.write(shx_mem.getvalue()) + with self.config.GCP_FILESYSTEM.open(f"{shapefile_output_dir_for_image}.dbf", 'wb') as dbf_file: + dbf_file.write(dbf_mem.getvalue()) + else: + writer = shapefile.Writer(f"{shapefile_output_dir_for_image}.shp") + self.write_shapefile_helper(row, writer) + writer.close() + + + def __call__(self, row: Dict[str, Any]) -> Dict[str, Any]: image_name = row["image_name"] print("Writing shapefiles for:", image_name) shapefile_output_dir_for_image = os.path.join( - self.shpfile_output_dir, f"{image_name}.shp") + self.config.RAY_OUTPUT_SHAPEFILES_DIR, image_name) self.write_shapefile(row, 
shapefile_output_dir_for_image) - - prj_output_dir_for_image = os.path.join( - self.shpfile_output_dir, f"{image_name}.prj") self.write_prj_file( - geotiff_path=row["path"], geotiff_bytes=row["bytes"], prj_file_path=prj_output_dir_for_image) + geotiff_path=row["path"], geotiff_bytes=row["bytes"], prj_file_path=f"{shapefile_output_dir_for_image}.prj") row["shapefile_output_dir"] = shapefile_output_dir_for_image return row From 24523c6604854a52d2421fa98cea9b15da2ea7c0 Mon Sep 17 00:00:00 2001 From: Kayla Hardie Date: Thu, 16 May 2024 23:59:05 +0000 Subject: [PATCH 24/31] add application default credentials directory as a flag so it isn't hardcoded, this is needed for service account impersonation if running the code on your local computer and want it to be able to access gcs buckets --- mpl_config.py | 17 ++++++++++++----- ray_maple_workflow.py | 13 +++++++++++-- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/mpl_config.py b/mpl_config.py index a250fa4..48b5904 100644 --- a/mpl_config.py +++ b/mpl_config.py @@ -32,6 +32,7 @@ class MPL_Config(object): def __init__( self, root_dir="", + adc_dir="", weight_file="hyp_best_train_weights_final.h5", logging=True, crop_size=200, @@ -56,11 +57,17 @@ def __init__( self.GCP_FILESYSTEM = None if (self.ROOT_DIR.startswith( - "gcs://") or self.ROOT_DIR.startswith("gs://")): - creds, _ = google.auth.load_credentials_from_file( - "/usr/local/google/home/kaylahardie/.config/gcloud/application_default_credentials.json", scopes=["https://www.googleapis.com/auth/cloud-platform"]) - self.GCP_FILESYSTEM = gcsfs.GCSFileSystem( - project="pdg-project-406720", token=creds) + "gcs://") or self.ROOT_DIR.startswith("gs://")): + if adc_dir: + print("You are using application default credentials to authenticate.") + creds, _ = google.auth.load_credentials_from_file( + adc_dir, scopes=["https://www.googleapis.com/auth/cloud-platform"]) + self.GCP_FILESYSTEM = gcsfs.GCSFileSystem( + project="pdg-project-406720", token=creds) + else: + print("Please specify an application default credentials directory if you are running this code on your local computer.") + self.GCP_FILESYSTEM = gcsfs.GCSFileSystem( + project="pdg-project-406720") # ADDED to include inference cleaning post-processing self.CLEAN_DATA_DIR = self.ROOT_DIR + r"/data/cln_data" diff --git a/ray_maple_workflow.py b/ray_maple_workflow.py index 674f345..eabc6e4 100644 --- a/ray_maple_workflow.py +++ b/ray_maple_workflow.py @@ -39,7 +39,6 @@ def add_image_name(row: Dict[str, Any]) -> Dict[str, Any]: if __name__ == "__main__": - os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/usr/local/google/home/kaylahardie/.config/gcloud/application_default_credentials.json" tf.compat.v1.disable_eager_execution() parser = argparse.ArgumentParser( description="Extract IWPs from satellite image scenes using MAPLE." @@ -64,6 +63,16 @@ def add_image_name(row: Dict[str, Any]) -> Dict[str, Any]: "read and write to the google cloud storage buckets.", ) + parser.add_argument( + "--adc_dir", + required=False, + default="", + help="The directory path for application default credentials (adc). This path must be set if " + "you want to give ray access to your gcs buckets when you are running this workflow on your " + "*local computer*. It is necessary for service account impersonation, which is used to give " + "this code access to your storage bucket when running the code locally." 
+ ) + parser.add_argument( "--weight_file", required=False, @@ -86,7 +95,7 @@ def add_image_name(row: Dict[str, Any]) -> Dict[str, Any]: image_name = args.image config = MPL_Config( - args.root_dir, args.weight_file, num_gpus_per_core=args.gpus_per_core + args.root_dir, args.adc_dir, args.weight_file, num_gpus_per_core=args.gpus_per_core ) print("0. Load geotiffs into ray dataset") From 5cdf5c01094e576f702283bd277bb419a9fd0774 Mon Sep 17 00:00:00 2001 From: Kayla Hardie Date: Fri, 17 May 2024 19:39:26 +0000 Subject: [PATCH 25/31] Each run is now outputted under it's own datetime directory --- ray_write_shapefiles.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ray_write_shapefiles.py b/ray_write_shapefiles.py index c4abfca..0ab8932 100644 --- a/ray_write_shapefiles.py +++ b/ray_write_shapefiles.py @@ -1,4 +1,5 @@ import os +from datetime import datetime import shutil from io import BytesIO from typing import Any, Dict @@ -30,6 +31,7 @@ def __init__( config: MPL_Config ): self.config = config + self.current_timestamp_str = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") if self.config.GCP_FILESYSTEM is None: delete_and_create_dir(self.config.RAY_SHAPEFILES) @@ -159,7 +161,7 @@ def __call__(self, row: Dict[str, Any]) -> Dict[str, Any]: image_name = row["image_name"] print("Writing shapefiles for:", image_name) shapefile_output_dir_for_image = os.path.join( - self.config.RAY_OUTPUT_SHAPEFILES_DIR, image_name) + self.config.RAY_OUTPUT_SHAPEFILES_DIR, self.current_timestamp_str, image_name) self.write_shapefile(row, shapefile_output_dir_for_image) self.write_prj_file( geotiff_path=row["path"], geotiff_bytes=row["bytes"], prj_file_path=f"{shapefile_output_dir_for_image}.prj") From 4fa1a8f19c8ef8f7eaa0751f613376727f38310d Mon Sep 17 00:00:00 2001 From: Kayla Hardie Date: Tue, 18 Jun 2024 17:39:40 +0000 Subject: [PATCH 26/31] small fixes (trying to limit the changes on the original maple_workflow.py). Changed the input img directory back to be the input_img_local (what it was originally) --- maple_workflow.py | 3 +-- mpl_config.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/maple_workflow.py b/maple_workflow.py index 7886e29..953d57e 100644 --- a/maple_workflow.py +++ b/maple_workflow.py @@ -329,12 +329,11 @@ def stich_shapefile(config: MPL_Config, input_img_name: str): print("4. start stiching") stich_shapefile(config, image_name) process.process_shapefile(config, image_name) - print("5. finish stichting") print("5. start cleaning") inf_clean.clean_inference_shapes( config.CLEAN_DATA_DIR, config.PROJECTED_SHP_DIR, - "./data/input_bound/sample2_out_boundry.shp", + "./data/input_bound/sample2_out_boundry.shp", ) # Once you are done you can check the output on ArcGIS (win) or else you can check in QGIS (nx) Add the image and the diff --git a/mpl_config.py b/mpl_config.py index 48b5904..aadf711 100644 --- a/mpl_config.py +++ b/mpl_config.py @@ -43,7 +43,7 @@ def __init__( # Code expects some of the locations to be available when executing. 
# ----------------------------------------------------------------- self.ROOT_DIR = root_dir if root_dir else os.getcwd() - self.INPUT_IMAGE_DIR = self.ROOT_DIR + r"/data/input_img" + self.INPUT_IMAGE_DIR = self.ROOT_DIR + r"/data/input_img_local" self.DIVIDED_IMAGE_DIR = self.ROOT_DIR + r"/data/divided_img" self.OUTPUT_SHP_DIR = self.ROOT_DIR + r"/data/output_shp" self.FINAL_SHP_DIR = self.ROOT_DIR + r"/data/final_shp" From fe5caf516cfb0b7ac4111cb8f1ad9431858a3049 Mon Sep 17 00:00:00 2001 From: Kayla Hardie Date: Mon, 8 Jul 2024 19:54:32 +0000 Subject: [PATCH 27/31] addressing gugibugy's review comments --- environment_maple.yml | 10 +++- gdal_virtual_file_path.py | 39 +++++++++++++++ gdal_virtual_file_system.py | 27 ----------- ray_image_preprocessing.py | 89 +++++++++++++++------------------- ray_infer_tiles.py | 24 ++++----- ray_maple_workflow.py | 60 ++++++++++++----------- ray_tile_and_stitch_util.py | 97 +++++++++++++++++++------------------ ray_write_shapefiles.py | 86 ++++++++++++++------------------ 8 files changed, 214 insertions(+), 218 deletions(-) create mode 100644 gdal_virtual_file_path.py delete mode 100644 gdal_virtual_file_system.py diff --git a/environment_maple.yml b/environment_maple.yml index 63c16a2..7043eda 100644 --- a/environment_maple.yml +++ b/environment_maple.yml @@ -1,9 +1,15 @@ name: maple_py310_ray channels: + # conda-forge may have more up to date packages than defaults, + # it might also have some packages that defaults doesn't have. + # Originally added it here because defaults didn't have newer + # versions of gdal. + - conda-forge - defaults dependencies: - python=3.10 - - gdal + # gdal introduces with statements for file handling in 3.8 + - gdal>=3.8 - fiona - pyshp - scikit-image @@ -13,6 +19,8 @@ dependencies: - gcsfs - google-auth - tensorflow[and-cuda]==2.15.1 + # For ray pipeline monitoring + - tqdm - opencv-python - ray - pandas==2.1.4 diff --git a/gdal_virtual_file_path.py b/gdal_virtual_file_path.py new file mode 100644 index 0000000..b2b799d --- /dev/null +++ b/gdal_virtual_file_path.py @@ -0,0 +1,39 @@ +""" +Class that manages the opening and closing of a single gdal file using +gdal's virtual file system. +The pattern is to instantiate the class using a with statement to +create a virtual file path. +For example: + +with GDALVirtualFilePath(file_path, file_bytes) as virtual_file_path: + .... + +The with statement manages cleanup. The __enter__ is automatically called +when you enter the with statement block and __exit__ is automatically +called when you exit the block. + +See https://stackoverflow.com/questions/865115/how-do-i-correctly-clean-up-a-python-object +for more information. +""" +import os +from osgeo import gdal + +class GDALVirtualFilePath: + def __init__( + self, + file_path: str, + file_bytes: bytes + ): + vfs_dir_path = '/vsimem/vsidir/' + file_name = os.path.basename(file_path) + self.vfs_filename = os.path.join(vfs_dir_path, file_name) + self.file_bytes = file_bytes + + def __enter__(self) -> str: + dst = gdal.VSIFOpenL(self.vfs_filename, 'wb+') + gdal.VSIFWriteL(self.file_bytes, 1, len(self.file_bytes), dst) + gdal.VSIFCloseL(dst) + return self.vfs_filename + + def __exit__(self, exc_type, exc_value, exc_traceback): + gdal.Unlink(self.vfs_filename) diff --git a/gdal_virtual_file_system.py b/gdal_virtual_file_system.py deleted file mode 100644 index d053a8f..0000000 --- a/gdal_virtual_file_system.py +++ /dev/null @@ -1,27 +0,0 @@ -""" -Class that manages open and closing of a single gdal file. 
-The pattern is to instantiate the class, call create to open it and then -call close to close the file. -""" -import os -from osgeo import gdal - -class GDALVirtualFileSystem: - def __init__( - self, - file_path: str, - file_bytes: bytes - ): - vfs_dir_path = '/vsimem/vsidir/' - file_name = os.path.basename(file_path) - self.vfs_filename = os.path.join(vfs_dir_path, file_name) - self.file_bytes = file_bytes - - def create_virtual_file(self) -> str: - dst = gdal.VSIFOpenL(self.vfs_filename, 'wb+') - gdal.VSIFWriteL(self.file_bytes, 1, len(self.file_bytes), dst) - gdal.VSIFCloseL(dst) - return self.vfs_filename - - def close_virtual_file(self): - gdal.Unlink(self.vfs_filename) diff --git a/ray_image_preprocessing.py b/ray_image_preprocessing.py index 6409c34..f287301 100644 --- a/ray_image_preprocessing.py +++ b/ray_image_preprocessing.py @@ -17,7 +17,7 @@ from skimage.morphology import disk from mpl_config import MPL_Config -import gdal_virtual_file_system as gdal_vfs +import gdal_virtual_file_path as gdal_vfp @@ -39,7 +39,6 @@ def cal_water_mask(row: Dict[str, Any], config: MPL_Config) -> Dict[str, Any]: image_name = row["image_name"] worker_water_subroot = os.path.join(worker_water_root, image_name) - temp_water_subroot = os.path.join(temp_water_root, image_name) # Prepare to make directories to create the files try: shutil.rmtree(worker_water_subroot) @@ -47,64 +46,54 @@ def cal_water_mask(row: Dict[str, Any], config: MPL_Config) -> Dict[str, Any]: print("directory deletion failed") pass - try: - shutil.rmtree(temp_water_subroot) - except: - print("directory deletion failed") - pass - # check local storage for temporary storage Path(worker_water_subroot).mkdir(parents=True, exist_ok=True) - Path(temp_water_subroot).mkdir(parents=True, exist_ok=True) output_watermask = os.path.join( worker_water_subroot, r"%s_watermask.tif" % image_name ) - output_tif_8b_file = os.path.join( - temp_water_subroot, r"%s_8bit.tif" % image_name + + # Create a gdal virtual file path so we don't have to create + # and store a temp file to disk to then read it right after + # storing it. + output_tif_8b_virtual_file = os.path.join( + "/vsimem/vsidir", r"%s_8bit.tif" % image_name ) - nir_band = 3 # set number of NIR band + nir_band = 4 # set number of NIR band # %% Median and Otsu value = 5 - # Create virtual file system file for image to use GDAL's file apis. - vfs = gdal_vfs.GDALVirtualFileSystem( - file_path=row["path"], file_bytes=row["bytes"]) - virtual_file_path = vfs.create_virtual_file() - - # UPDATED CODE - amal 01/11/2023 - # cmd line execution thrown exceptions unable to capture - # Using gdal to execute the gdal_Translate - # output file checked against the cmd line gdal_translate - gdal.UseExceptions() # Enable errors - try: - gdal.Translate( - destName=output_tif_8b_file, - srcDS=virtual_file_path, - format="GTiff", - outputType=gdal.GDT_Byte, - ) - except RuntimeError: - print("gdal Translate failed with", gdal.GetLastErrorMsg()) - pass - - image = io.imread( - output_tif_8b_file - ) # image[rows, columns, dimensions]-> image[:,:,3] is near Infrared - nir = image[:, :, nir_band] - - bilat_img = filters.rank.median(nir, disk(value)) - - gtif = gdal.Open(virtual_file_path) - geotransform = gtif.GetGeoTransform() - sourceSR = gtif.GetProjection() - # Close the file. - gtif = None - vfs.close_virtual_file() - - x = np.shape(image)[1] - y = np.shape(image)[0] + # Create virtual file path for image to use GDAL's file apis. 
+ with gdal_vfp.GDALVirtualFilePath( + file_path=row["path"], file_bytes=row["bytes"]) as virtual_file_path: + # UPDATED CODE - amal 01/11/2023 + # cmd line execution thrown exceptions unable to capture + # Using gdal to execute the gdal_Translate + # output file checked against the cmd line gdal_translate + gdal.UseExceptions() # Enable errors + try: + gdal.Translate( + destName=output_tif_8b_virtual_file, + srcDS=virtual_file_path, + format="GTiff", + outputType=gdal.GDT_Byte, + ) + except RuntimeError: + print("gdal Translate failed with", gdal.GetLastErrorMsg()) + pass + + with gdal.Open(output_tif_8b_virtual_file) as output_tif_8b: + nir = np.array(output_tif_8b.GetRasterBand(nir_band).ReadAsArray()) + bilat_img = filters.rank.median(nir, disk(value)) + gdal.Unlink(output_tif_8b_virtual_file) + + with gdal.Open(virtual_file_path) as gtif: + geotransform = gtif.GetGeoTransform() + sourceSR = gtif.GetProjection() + + x = np.shape(nir)[1] + y = np.shape(nir)[0] # Normalize and blur before thresholding. Usually instead of normalizing # a rgb to greyscale transformation is applied. In this case, we are already @@ -131,6 +120,6 @@ def cal_water_mask(row: Dict[str, Any], config: MPL_Config) -> Dict[str, Any]: dst_ds.SetGeoTransform(geotransform) dst_ds.SetProjection(sourceSR) dst_ds.FlushCache() - dst_ds = None + del dst_ds row["mask"] = mask return row diff --git a/ray_infer_tiles.py b/ray_infer_tiles.py index 83c87d7..981a407 100644 --- a/ray_infer_tiles.py +++ b/ray_infer_tiles.py @@ -11,26 +11,18 @@ from dataclasses import dataclass import os +import random import tempfile from typing import Any, Dict, List import numpy as np -import tensorflow as tf from skimage.measure import find_contours +import ray +import tensorflow as tf import model as modellib from mpl_config import MPL_Config, PolygonConfig - - -@dataclass -class ShapefileResult: - polygons: np.array - class_id: int - - -@dataclass -class ShapefileResults: - shapefile_results: List[ShapefileResult] +from ray_tile_and_stitch_util import ShapefileResult, ShapefileResults class MaskRCNNPredictor: @@ -40,9 +32,11 @@ def __init__( ): self.config = config # Used to identify a specific predictor when mulitple predictors are - # created to run inference in parallel. The counter is also used to - # know which GPU to use when multiple are available. - self.process_counter = 1 # TODO need to fix this process_counter + # created to run inference in parallel. + # The process_counter could be assigned more optimally. We could have + # an accounting system so we optimize our gpus better. + ray_gpu_ids = ray.get_gpu_ids() + self.process_counter = random.choice(ray_gpu_ids) if ray_gpu_ids else 0 self.use_gpu = config.NUM_GPUS_PER_CORE > 0 self.device = "/gpu:%d" % self.process_counter if self.use_gpu else "/cpu:0" diff --git a/ray_maple_workflow.py b/ray_maple_workflow.py index eabc6e4..faf5e3a 100644 --- a/ray_maple_workflow.py +++ b/ray_maple_workflow.py @@ -17,12 +17,12 @@ import os from typing import Any, Dict -import tensorflow as tf import ray +import tensorflow as tf from mpl_config import MPL_Config import ray_image_preprocessing -import ray_infer_tiles as ray_inference +import ray_infer_tiles import ray_write_shapefiles import ray_tile_and_stitch_util @@ -91,6 +91,16 @@ def add_image_name(row: Dict[str, Any]) -> Dict[str, Any]: type=int ) + parser.add_argument( + "--concurrency", + required=False, + default=2, + help="Number of ray workers for each map call. 
The concurrency parameter " + "should be tuned based on the resources available in the raycluster in " + "order to make the best use of the compute resources available. ", + type=int + ) + args = parser.parse_args() image_name = args.image @@ -98,32 +108,26 @@ def add_image_name(row: Dict[str, Any]) -> Dict[str, Any]: args.root_dir, args.adc_dir, args.weight_file, num_gpus_per_core=args.gpus_per_core ) - print("0. Load geotiffs into ray dataset") - dataset = create_geotiff_images_dataset(config).map(add_image_name) - print("Ray dataset schema:", dataset.schema()) - print("1. Start calculating watermask") - dataset_with_water_mask = dataset.map(fn=ray_image_preprocessing.cal_water_mask, - fn_kwargs={"config": config}) - print("Dataset schema with water mask: ", dataset_with_water_mask.schema()) - print("2. Start tiling image") - image_tiles_dataset = dataset_with_water_mask.flat_map( - fn=ray_tile_and_stitch_util.tile_image, fn_kwargs={"config": config}) - image_tiles_dataset = image_tiles_dataset.drop_columns(["mask"]) - print("Dataset schema with image tiles: ", image_tiles_dataset.schema()) - print("3. Start inferencing") - inferenced_dataset = image_tiles_dataset.map( - fn=ray_inference.MaskRCNNPredictor, fn_constructor_kwargs={"config": config}, concurrency=2) - print("Dataset schema with inferenced tiles: ", inferenced_dataset.schema()) - print("4. Start stitching") - data_per_image = inferenced_dataset.groupby( - "image_name").map_groups(ray_tile_and_stitch_util.stitch_shapefile) - print("Dataset schema where each row is an image (result of a group by tile): ", - data_per_image.schema()) - print("5. Write shapefiles") - shapefiles_dataset = data_per_image.map( - fn=ray_write_shapefiles.WriteShapefiles, fn_constructor_kwargs={"config": config}, concurrency=2) - print("Done writing shapefiles", shapefiles_dataset.schema()) - + print("Ray pipeline starting") + ray_dataset = create_geotiff_images_dataset(config).map( + add_image_name, concurrency=args.concurrency).map( + fn=ray_image_preprocessing.cal_water_mask, + fn_kwargs={"config": config}, + concurrency=args.concurrency).flat_map( + fn=ray_tile_and_stitch_util.tile_image, + fn_kwargs={"config": config}, + concurrency=args.concurrency).drop_columns(["mask"]).map( + fn=ray_infer_tiles.MaskRCNNPredictor, + fn_constructor_kwargs={"config": config}, + concurrency=args.concurrency).groupby("image_name").map_groups( + ray_tile_and_stitch_util.stitch_shapefile, + concurrency=args.concurrency).map( + fn=ray_write_shapefiles.WriteShapefiles, + fn_constructor_kwargs={"config": config}, + concurrency=args.concurrency) + + print("Ray pipeline finished: ", ray_dataset.schema()) + # Once you are done you can check the output on ArcGIS (win) or else you can check in QGIS (nx) Add the image and the # shp, shx, dbf as layers. # You can also look at compare_shapefile_features.py for how to compare the features in two shapefiles. 
diff --git a/ray_tile_and_stitch_util.py b/ray_tile_and_stitch_util.py index ced68e2..67b3213 100644 --- a/ray_tile_and_stitch_util.py +++ b/ray_tile_and_stitch_util.py @@ -9,16 +9,15 @@ from collections import defaultdict from dataclasses import dataclass from typing import Any, Dict, List -import random import cv2 +import numpy as np +import pandas as pd from osgeo import gdal, ogr from scipy.spatial import distance from shapely.geometry import Polygon -import numpy as np -import pandas as pd -import gdal_virtual_file_system as gdal_vfs +import gdal_virtual_file_path as gdal_vfp from mpl_config import MPL_Config @@ -68,47 +67,42 @@ def tile_image(row: Dict[str, Any], config: MPL_Config) -> List[Dict[str, Any]]: config : Contains static configuration information regarding the workflow. """ - # Create virtual file system file for image to use GDAL's file apis. - vfs = gdal_vfs.GDALVirtualFileSystem( - file_path=row["path"], file_bytes=row["bytes"]) - virtual_image_file_path = vfs.create_virtual_file() - input_image_gtif = gdal.Open(virtual_image_file_path) - mask = row["mask"] - - # convert the original image into the geo cordinates for further processing using gdal - # https://gdal.org/tutorials/geotransforms_tut.html - # GT(0) x-coordinate of the upper-left corner of the upper-left pixel. - # GT(1) w-e pixel resolution / pixel width. - # GT(2) row rotation (typically zero). - # GT(3) y-coordinate of the upper-left corner of the upper-left pixel. - # GT(4) column rotation (typically zero). - # GT(5) n-s pixel resolution / pixel height (negative value for a north-up image). - - # ---------------------- crop image from the water mask---------------------- - # dot product of the mask and the orignal data before breaking it for processing - # Also band 2 3 and 4 are taken because the 4 bands cannot be processed by the NN learingin algo - # Need to make sure that the training bands are the same as the bands used for inferencing. - # - final_array_2 = input_image_gtif.GetRasterBand(2).ReadAsArray() - final_array_3 = input_image_gtif.GetRasterBand(3).ReadAsArray() - final_array_4 = input_image_gtif.GetRasterBand(4).ReadAsArray() - - final_array_2 = np.multiply(final_array_2, mask) - final_array_3 = np.multiply(final_array_3, mask) - final_array_4 = np.multiply(final_array_4, mask) - - # ulx, uly is the upper left corner - ulx, x_resolution, _, uly, _, y_resolution = input_image_gtif.GetGeoTransform() - - # ---------------------- Divide image (tile) ---------------------- - overlap_rate = 0.2 - block_size = config.CROP_SIZE - ysize = input_image_gtif.RasterYSize - xsize = input_image_gtif.RasterXSize - - # Close the file. - input_image_gtif = None - vfs.close_virtual_file() + # Create virtual file path for image to use GDAL's file apis. + with gdal_vfp.GDALVirtualFilePath( + file_path=row["path"], file_bytes=row["bytes"]) as virtual_image_file_path: + with gdal.Open(virtual_image_file_path) as input_image_gtif: + mask = row["mask"] + + # convert the original image into the geo cordinates for further processing using gdal + # https://gdal.org/tutorials/geotransforms_tut.html + # GT(0) x-coordinate of the upper-left corner of the upper-left pixel. + # GT(1) w-e pixel resolution / pixel width. + # GT(2) row rotation (typically zero). + # GT(3) y-coordinate of the upper-left corner of the upper-left pixel. + # GT(4) column rotation (typically zero). + # GT(5) n-s pixel resolution / pixel height (negative value for a north-up image). 
+ + # ---------------------- crop image from the water mask---------------------- + # dot product of the mask and the orignal data before breaking it for processing + # Also band 2 3 and 4 are taken because the 4 bands cannot be processed by the NN learingin algo + # Need to make sure that the training bands are the same as the bands used for inferencing. + # + final_array_2 = input_image_gtif.GetRasterBand(2).ReadAsArray() + final_array_3 = input_image_gtif.GetRasterBand(3).ReadAsArray() + final_array_4 = input_image_gtif.GetRasterBand(4).ReadAsArray() + + final_array_2 = np.multiply(final_array_2, mask) + final_array_3 = np.multiply(final_array_3, mask) + final_array_4 = np.multiply(final_array_4, mask) + + # ulx, uly is the upper left corner + ulx, x_resolution, _, uly, _, y_resolution = input_image_gtif.GetGeoTransform() + + # ---------------------- Divide image (tile) ---------------------- + overlap_rate = 0.2 + block_size = config.CROP_SIZE + ysize = input_image_gtif.RasterYSize + xsize = input_image_gtif.RasterXSize tile_count = 0 @@ -198,6 +192,11 @@ def stitch_shapefile(group: pd.DataFrame): ---------- group: a pandas dataframe that has all of the shapefile information for each tile in the image. + + Returns + ------- + A single row from the group with the shapefile results for all the tiles + stored in the "image_shapefile_results" column. """ image_shapefile_results = [] temp_polygon_dict = defaultdict(dict) @@ -219,6 +218,9 @@ def stitch_shapefile(group: pd.DataFrame): polygon_dict[k] = [poly_count, poly_count + v] poly_count += v + # Choosing the first row is arbitrary, we just want to gather the + # image metadata which is the same for all the rows, since we did + # a groupby by image. first_row = group.head(1) image_data_from_arbitrary_row = first_row["image_metadata"][0] size_i, size_j = image_data_from_arbitrary_row.len_y_list, image_data_from_arbitrary_row.len_x_list @@ -317,12 +319,13 @@ def component(node, neighbors=neighbors, seen=seen, see=seen.add): close_list = list(connected_components(close_list)) # --------------- create a new shp file to store --------------- - # randomly pick one of many duplications + # Pick the first one if there's duplicates. + # This is arbirtary, choosing the first instead of a random one + # helps make the program more deterministic. 
del_index_list = list() for close_possible in close_list: close_possible.sort() random_id = close_possible[0] - # random_id = random.choice(close_possible) close_possible.remove(random_id) del_index_list.extend(close_possible) diff --git a/ray_write_shapefiles.py b/ray_write_shapefiles.py index 0ab8932..2109986 100644 --- a/ray_write_shapefiles.py +++ b/ray_write_shapefiles.py @@ -1,14 +1,14 @@ -import os from datetime import datetime -import shutil from io import BytesIO +import os +import shutil from typing import Any, Dict -import shapefile from osgeo import gdal, osr from shapely.geometry import Polygon, Point +import shapefile -import gdal_virtual_file_system as gdal_vfs +import gdal_virtual_file_path as gdal_vfp from mpl_config import MPL_Config @@ -33,44 +33,35 @@ def __init__( self.config = config self.current_timestamp_str = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") if self.config.GCP_FILESYSTEM is None: - delete_and_create_dir(self.config.RAY_SHAPEFILES) + delete_and_create_dir(self.config.RAY_OUTPUT_SHAPEFILES_DIR) - def get_coordinate_system_info(self, file_path: str, file_bytes: bytes): + def __get_coordinate_system_info(self, file_path: str, file_bytes: bytes): try: # Open the dataset - # Create virtual file system file for image to use GDAL's file apis. - vfs = gdal_vfs.GDALVirtualFileSystem( - file_path, file_bytes) - virtual_file_path = vfs.create_virtual_file() - print("here is the virtual file path: ", virtual_file_path) - dataset = gdal.Open(virtual_file_path) - - # Check if the dataset is valid - if dataset is None: - print("gdal error: ", gdal.GetLastErrorMsg()) - raise Exception("Error: Unable to open the dataset") - - # Get the spatial reference + # Create virtual file path for image to use GDAL's file apis. spatial_ref = osr.SpatialReference() - - # Try to import the coordinate system from WKT - if spatial_ref.ImportFromWkt(dataset.GetProjection()) != gdal.CE_None: - raise Exception( - "Error: Unable to import coordinate system from WKT.") + with gdal_vfp.GDALVirtualFilePath( + file_path, file_bytes) as virtual_file_path: + with gdal.Open(virtual_file_path) as dataset: + # Check if the dataset is valid + if dataset is None: + print("gdal error: ", gdal.GetLastErrorMsg()) + raise Exception("Error: Unable to open the dataset") + + # Try to import the coordinate system from WKT + if spatial_ref.ImportFromWkt(dataset.GetProjection()) != gdal.CE_None: + raise Exception( + "Error: Unable to import coordinate system from WKT.") # Check if the spatial reference is valid if spatial_ref.Validate() != gdal.CE_None: raise Exception("Error: Invalid spatial reference.") # Export the spatial reference to WKT - dataset = None - vfs.close_virtual_file() return spatial_ref.ExportToWkt() except Exception as e: print(f"Error when getting the coordinate system info: {e}") - dataset = None - vfs.close_virtual_file() return None def write_prj_file(self, geotiff_path: str, geotiff_bytes: bytes, prj_file_path: str): @@ -85,9 +76,7 @@ def write_prj_file(self, geotiff_path: str, geotiff_bytes: bytes, prj_file_path: """ try: # Get the coordinate system information - wkt = self.get_coordinate_system_info(geotiff_path, geotiff_bytes) - - print(wkt) + wkt = self.__get_coordinate_system_info(geotiff_path, geotiff_bytes) if wkt is not None: # Write the WKT to a .prj file if self.config.GCP_FILESYSTEM is not None: @@ -104,22 +93,22 @@ def write_prj_file(self, geotiff_path: str, geotiff_bytes: bytes, prj_file_path: except Exception as e: print(f"Error when trying to write the prj file: {e}") - 
def write_shapefile_helper(self, row: Dict[str, Any], w): - w.field("Class", "C", size=5) - w.field("Sensor", "C", "10") - w.field("Date", "C", "10") - w.field("Time", "C", "10") - w.field("CatalogID", "C", "20") - w.field("Area", "N", decimal=3) - w.field("CentroidX", "N", decimal=3) - w.field("CentroidY", "N", decimal=3) - w.field("Perimeter", "N", decimal=3) - w.field("Length", "N", decimal=3) - w.field("Width", "N", decimal=3) + def __write_shapefile_helper(self, row: Dict[str, Any], writer: shapefile.Writer): + writer.field("Class", "C", size=5) + writer.field("Sensor", "C", "10") + writer.field("Date", "C", "10") + writer.field("Time", "C", "10") + writer.field("CatalogID", "C", "20") + writer.field("Area", "N", decimal=3) + writer.field("CentroidX", "N", decimal=3) + writer.field("CentroidY", "N", decimal=3) + writer.field("Perimeter", "N", decimal=3) + writer.field("Length", "N", decimal=3) + writer.field("Width", "N", decimal=3) image_name = row["image_name"] for shapefile_result in row["image_shapefile_results"].shapefile_results: polygons = shapefile_result.polygons - w.poly([polygons.tolist()]) + writer.poly([polygons.tolist()]) poly = Polygon(polygons) centroid = poly.centroid @@ -132,7 +121,7 @@ def write_shapefile_helper(self, row: Dict[str, Any], w): length = max(edge_length) width = min(edge_length) - w.record(Class=shapefile_result.class_id, Sensor=image_name[0:4], Date=image_name[5:13], + writer.record(Class=shapefile_result.class_id, Sensor=image_name[0:4], Date=image_name[5:13], Time=image_name[13:19], CatalogID=image_name[20:36], Area=poly.area, CentroidX=centroid.x, CentroidY=centroid.y, Perimeter=poly.length, Length=length, Width=width) @@ -140,7 +129,7 @@ def write_shapefile(self, row: Dict[str, Any], shapefile_output_dir_for_image): if self.config.GCP_FILESYSTEM is not None: with BytesIO() as shp_mem, BytesIO() as shx_mem, BytesIO() as dbf_mem: with shapefile.Writer(shp=shp_mem, shx=shx_mem, dbf=dbf_mem) as writer: - self.write_shapefile_helper(row, writer) + self.__write_shapefile_helper(row, writer) writer.close() with self.config.GCP_FILESYSTEM.open(f"{shapefile_output_dir_for_image}.shp", 'wb') as shp_file: @@ -151,12 +140,9 @@ def write_shapefile(self, row: Dict[str, Any], shapefile_output_dir_for_image): dbf_file.write(dbf_mem.getvalue()) else: writer = shapefile.Writer(f"{shapefile_output_dir_for_image}.shp") - self.write_shapefile_helper(row, writer) + self.__write_shapefile_helper(row, writer) writer.close() - - - def __call__(self, row: Dict[str, Any]) -> Dict[str, Any]: image_name = row["image_name"] print("Writing shapefiles for:", image_name) From 156d209c7695112e5b9b1d1a3335a74fd057dd09 Mon Sep 17 00:00:00 2001 From: Kayla Hardie Date: Thu, 18 Jul 2024 18:49:01 +0000 Subject: [PATCH 28/31] addressing more comments, fixing delete ray shapefile dir --- mpl_workflow_create_dir_struct.py | 2 + ray_maple_workflow.py | 72 ++++++++++++++++++++++--------- ray_write_shapefiles.py | 15 ------- 3 files changed, 54 insertions(+), 35 deletions(-) diff --git a/mpl_workflow_create_dir_struct.py b/mpl_workflow_create_dir_struct.py index 9d5f6ca..02e9f1f 100644 --- a/mpl_workflow_create_dir_struct.py +++ b/mpl_workflow_create_dir_struct.py @@ -88,6 +88,7 @@ def create_maple_dir_structure(): # ├── output_img # ├── output_shp # ├── projected_shp + # ├── ray_output_shapefiles # └── water_mask # └── temp config = MPL_Config() @@ -101,6 +102,7 @@ def create_maple_dir_structure(): create_dir_path(config.OUTPUT_IMAGE_DIR) create_dir_path(os.path.join(config.WORKER_ROOT, 
"neighbors/")) create_dir_path(os.path.join(config.WORKER_ROOT, "projected_shp/")) + create_dir_path(config.RAY_OUTPUT_SHAPEFILES_DIR) create_dir_path(config.CLEAN_DATA_DIR) diff --git a/ray_maple_workflow.py b/ray_maple_workflow.py index faf5e3a..1142846 100644 --- a/ray_maple_workflow.py +++ b/ray_maple_workflow.py @@ -38,6 +38,16 @@ def add_image_name(row: Dict[str, Any]) -> Dict[str, Any]: return row +def create_directory_if_not_exists(directory_path: str): + """Creates a directory with the specified path if it doesn't already exist.""" + + if not os.path.exists(directory_path): + os.makedirs(directory_path) + print(f"Directory created: {directory_path}") + else: + print(f"Directory already exists: {directory_path}") + + if __name__ == "__main__": tf.compat.v1.disable_eager_execution() parser = argparse.ArgumentParser( @@ -107,26 +117,48 @@ def add_image_name(row: Dict[str, Any]) -> Dict[str, Any]: config = MPL_Config( args.root_dir, args.adc_dir, args.weight_file, num_gpus_per_core=args.gpus_per_core ) - - print("Ray pipeline starting") - ray_dataset = create_geotiff_images_dataset(config).map( - add_image_name, concurrency=args.concurrency).map( - fn=ray_image_preprocessing.cal_water_mask, - fn_kwargs={"config": config}, - concurrency=args.concurrency).flat_map( - fn=ray_tile_and_stitch_util.tile_image, - fn_kwargs={"config": config}, - concurrency=args.concurrency).drop_columns(["mask"]).map( - fn=ray_infer_tiles.MaskRCNNPredictor, - fn_constructor_kwargs={"config": config}, - concurrency=args.concurrency).groupby("image_name").map_groups( - ray_tile_and_stitch_util.stitch_shapefile, - concurrency=args.concurrency).map( - fn=ray_write_shapefiles.WriteShapefiles, - fn_constructor_kwargs={"config": config}, - concurrency=args.concurrency) - - print("Ray pipeline finished: ", ray_dataset.schema()) + concurrency = args.concurrency + + print("Starting MAPLE Ray pipeline...") + print("""This pipeline will: + - load geotiffs into ray dataset + - calculate a watermask for each image + - tile each watermasked image + - perform inference on each tile + - stitch the tile inference results to get the inference results for each image + - write the inference results for each image to shapefiles + """) + + # 0. Load geotiffs into ray dataset + dataset = create_geotiff_images_dataset(config).map(add_image_name, concurrency=concurrency) + + # 1. Start calculating watermask + dataset_with_water_mask = dataset.map(fn=ray_image_preprocessing.cal_water_mask, + fn_kwargs={"config": config}, concurrency=args.concurrency) + + # 2. Start tiling image + image_tiles_dataset = dataset_with_water_mask.flat_map( + fn=ray_tile_and_stitch_util.tile_image, fn_kwargs={"config": config}, concurrency=args.concurrency) + image_tiles_dataset = image_tiles_dataset.drop_columns(["mask"]) + + # 3. Start inferencing + inferenced_dataset = image_tiles_dataset.map( + fn=ray_infer_tiles.MaskRCNNPredictor, fn_constructor_kwargs={"config": config}, concurrency=concurrency) + + # 4. Start stitching + data_per_image = inferenced_dataset.groupby( + "image_name").map_groups(ray_tile_and_stitch_util.stitch_shapefile, concurrency=args.concurrency) + + # 5. Write shapefiles + # Create the output directory if it doesn't exist. + # TODO - do something similar for GCP directories, we have the code to create local + # dirs if they don't exist (ex. mpl_workflow_create_dir_struct.py), it'd be great to + # add support for creating GCP directories if they don't exist. 
+ if config.GCP_FILESYSTEM is None: + create_directory_if_not_exists(config.RAY_OUTPUT_SHAPEFILES_DIR) + shapefiles_dataset = data_per_image.map( + fn=ray_write_shapefiles.WriteShapefiles, fn_constructor_kwargs={"config": config}, concurrency=concurrency) + print("MAPLE Ray pipeline finished, done writing shapefiles", shapefiles_dataset.schema()) # Once you are done you can check the output on ArcGIS (win) or else you can check in QGIS (nx) Add the image and the # shp, shx, dbf as layers. diff --git a/ray_write_shapefiles.py b/ray_write_shapefiles.py index 2109986..e8eccc5 100644 --- a/ray_write_shapefiles.py +++ b/ray_write_shapefiles.py @@ -12,19 +12,6 @@ from mpl_config import MPL_Config -def delete_and_create_dir(dir: str): - try: - shutil.rmtree(dir) - except Exception as e: - print(f"Error deleting directory {dir}: {e}") - - try: - os.makedirs(dir, exist_ok=True) - print(f"Created directory {dir}") - except Exception as e: - print(f"Error creating directory {dir}: {e}") - - class WriteShapefiles: def __init__( self, @@ -32,8 +19,6 @@ def __init__( ): self.config = config self.current_timestamp_str = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") - if self.config.GCP_FILESYSTEM is None: - delete_and_create_dir(self.config.RAY_OUTPUT_SHAPEFILES_DIR) def __get_coordinate_system_info(self, file_path: str, file_bytes: bytes): try: From f170b34e340e0cc2e57aca4ebbde262f1c7eb796 Mon Sep 17 00:00:00 2001 From: Kayla Hardie Date: Mon, 29 Jul 2024 01:40:58 +0000 Subject: [PATCH 29/31] Updated readme --- README.md | 9 +++++++++ ray_maple_workflow.py | 4 +++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5e5fd8c..1d3e5f6 100644 --- a/README.md +++ b/README.md @@ -70,3 +70,12 @@ For the basic vanilla configuration you only need a list of input tiff files in

Weight File

This is the [trainded weight file](https://drive.google.com/file/d/1R51e8YqTKvc_5lq7wSKEbu1G1nyV1-YZ/view?usp=drive_link) that is required for the model to do the inferencing + +

Running MAPLE using Ray.io

+In [PR #19](https://github.com/PermafrostDiscoveryGateway/MAPLE_v3/pull/19), we introduced a new MAPLE implementation that supports running MAPLE using ray.io. + +See [this comment](https://github.com/PermafrostDiscoveryGateway/MAPLE_v3/pull/19#issue-2241066807) for steps on how to run the MAPLE pipeline with your local file system. + +See [this comment](https://github.com/PermafrostDiscoveryGateway/MAPLE_v3/pull/19#issuecomment-2116407584) for steps on how to run the MAPLE pipeline with google cloud storage buckets. + +Refer to [this document](https://docs.google.com/document/d/14kru_GPsakDJFX-On1Db_NLClEFvB6oLFMQFFkNQqsA/edit?usp=sharing) for more information on ray. The [Ray Data](https://docs.google.com/document/d/14kru_GPsakDJFX-On1Db_NLClEFvB6oLFMQFFkNQqsA/edit#heading=h.6lw6d49z80gv) and [Ray Data Case Study: MAPLE](https://docs.google.com/document/d/14kru_GPsakDJFX-On1Db_NLClEFvB6oLFMQFFkNQqsA/edit#heading=h.otzb3b7a6vhk) are particularly helpful. diff --git a/ray_maple_workflow.py b/ray_maple_workflow.py index 1142846..cf3bcca 100644 --- a/ray_maple_workflow.py +++ b/ray_maple_workflow.py @@ -158,7 +158,9 @@ def create_directory_if_not_exists(directory_path: str): create_directory_if_not_exists(config.RAY_OUTPUT_SHAPEFILES_DIR) shapefiles_dataset = data_per_image.map( fn=ray_write_shapefiles.WriteShapefiles, fn_constructor_kwargs={"config": config}, concurrency=concurrency) - print("MAPLE Ray pipeline finished, done writing shapefiles", shapefiles_dataset.schema()) + # Materialize dataset so that the pipeline steps are executed. + materialized_dataset = shapefiles_dataset.materialize() + print("MAPLE Ray pipeline finished, done writing shapefiles", materialized_dataset.schema()) # Once you are done you can check the output on ArcGIS (win) or else you can check in QGIS (nx) Add the image and the # shp, shx, dbf as layers. From 9902b381c761c926a8f6a1af87b32460d8958760 Mon Sep 17 00:00:00 2001 From: Kayla Hardie Date: Mon, 29 Jul 2024 01:44:58 +0000 Subject: [PATCH 30/31] fix merge conflict with weight file --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1d3e5f6..ff89c3f 100644 --- a/README.md +++ b/README.md @@ -69,7 +69,7 @@ For the basic vanilla configuration you only need a list of input tiff files in

Weight File

-This is the [trainded weight file](https://drive.google.com/file/d/1R51e8YqTKvc_5lq7wSKEbu1G1nyV1-YZ/view?usp=drive_link) that is required for the model to do the inferencing
+This is the [trained weight file](https://uconn-my.sharepoint.com/:u:/g/personal/amal_perera_uconn_edu/EQDp2IqNtMpS0Ri7t-A_gzkBHkmA5n8Vh8flLZxcD2TfUw?e=EcrPSX) that is required for the model to do the inferencing

Running MAPLE using Ray.io

In [PR #19](https://github.com/PermafrostDiscoveryGateway/MAPLE_v3/pull/19), we introduced a new MAPLE implementation that supports running MAPLE using ray.io. From 3c6a08ae74ffe569b5c37a2677776fd7935cb372 Mon Sep 17 00:00:00 2001 From: Kayla Hardie Date: Mon, 29 Jul 2024 01:46:57 +0000 Subject: [PATCH 31/31] updating readme again --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ff89c3f..2c9ae3f 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ For the basic vanilla configuration you only need a list of input tiff files in This is the [trainded weight file](https://uconn-my.sharepoint.com/:u:/g/personal/amal_perera_uconn_edu/EQDp2IqNtMpS0Ri7t-A_gzkBHkmA5n8Vh8flLZxcD2TfUw?e=EcrPSX) that is required for the model to do the inferencing -

Running MAPLE using Ray.io

+## Running MAPLE using Ray.io In [PR #19](https://github.com/PermafrostDiscoveryGateway/MAPLE_v3/pull/19), we introduced a new MAPLE implementation that supports running MAPLE using ray.io. See [this comment](https://github.com/PermafrostDiscoveryGateway/MAPLE_v3/pull/19#issue-2241066807) for steps on how to run the MAPLE pipeline with your local file system.
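Two notes that may help when reading the Ray-related patches above.

The refactored `ray_write_shapefiles.py` opens each GeoTIFF through a `gdal_virtual_file_path.GDALVirtualFilePath` context manager instead of managing GDAL virtual files by hand. That module is not included in the patches shown here, so the snippet below is only a minimal sketch of the interface it appears to expose, reconstructed from how it is used (`with GDALVirtualFilePath(file_path, file_bytes) as virtual_file_path:`); the real implementation may differ.

```python
import os

from osgeo import gdal


class GDALVirtualFilePath:
    """Expose in-memory GeoTIFF bytes as a /vsimem/ path that gdal.Open() can read.

    Sketch only: reconstructed from usage in ray_write_shapefiles.py, not the actual module.
    """

    def __init__(self, file_path: str, file_bytes: bytes):
        # Keep only the base name so each image gets its own short virtual path.
        self.virtual_path = "/vsimem/" + os.path.basename(file_path)
        self.file_bytes = file_bytes

    def __enter__(self) -> str:
        # Register the raw bytes under the virtual path; GDAL then treats it like a file on disk.
        gdal.FileFromMemBuffer(self.virtual_path, self.file_bytes)
        return self.virtual_path

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        # Always release the in-memory file, even if the body of the `with` block raised.
        gdal.Unlink(self.virtual_path)
```

The last `ray_maple_workflow.py` change adds a `materialize()` call because Ray Data evaluates `map` / `flat_map` / `map_groups` transformations lazily: the stages only run once the dataset is consumed or materialized. The toy pipeline below is not MAPLE code; it just illustrates that behaviour with placeholder functions.

```python
from typing import Any, Dict, List

import ray


def tag_row(row: Dict[str, Any]) -> Dict[str, Any]:
    # Stand-in for per-image steps such as add_image_name or cal_water_mask.
    row["tag"] = row["id"] * 2
    return row


def fan_out(row: Dict[str, Any]) -> List[Dict[str, Any]]:
    # Stand-in for tile_image: one input row fans out into several tile rows.
    return [{"tag": row["tag"], "tile": t} for t in range(3)]


ds = ray.data.from_items([{"id": i} for i in range(4)]).map(tag_row).flat_map(fan_out)
# Nothing has executed yet; the dataset only holds an execution plan at this point.
materialized = ds.materialize()  # execution is forced here, as in the workflow's final step
print(materialized.count())  # 12 rows: 4 inputs x 3 tiles each
```

A concrete local invocation is not spelled out in the README changes; based on the `argparse` destinations visible in the diff (`root_dir`, `adc_dir`, `weight_file`, `gpus_per_core`, `concurrency`), something like `python ray_maple_workflow.py --root_dir <data_root> --weight_file <weights_file> --concurrency 2` is a reasonable guess, but the exact flag names should be checked against the script's argument parser.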