Merge pull request #10 from jmuhlich/memory-fix-2

New quantification fixes memory problems #7
labsyspharm · Apr 3, 2020 · 556d8f9 · 556d8f9
2 parents 17a3dbc + ef621d5
commit 556d8f9
Showing 1 changed file with 99 additions and 73 deletions.
diff --git a/SingleCellDataExtraction.py b/SingleCellDataExtraction.py
@@ -9,72 +9,79 @@
 import os
 import skimage.measure as measure
 from pathlib import Path
+import tracemalloc
 
 
-def MaskChannel(mask_loaded,image_loaded):
+
+def MaskChannel(mask_loaded,image_loaded_z):
     """Function for quantifying a single channel image
 
     Returns a table with CellID according to the mask and the mean pixel intensity
     for the given channel for each cell"""
-    #Perform the masking to get all measures
-    dat = measure.regionprops(mask_loaded,image_loaded)
-    #Extract the stats
-    intensity = []
-    for i in range(len(dat)):
-        intensity.append(dat[i].mean_intensity)
-    #Return the dataframe object
-    return intensity
+    dat = measure.regionprops(mask_loaded, image_loaded_z)
+    n = len(dat)
+    intensity_z = np.empty(n)
+    for i in range(n):
+        intensity_z[i] = dat[i].mean_intensity
+        # Clear reference to avoid memory leak -- see MaskIDs for explanation.
+        dat[i] = None
+    return intensity_z
 
 
 def MaskIDs(mask):
     """This function will extract the CellIDs and the XY positions for each
     cell based on that cells centroid
 
     Returns a dictionary object"""
-    #Get the CellIDs for this dataset
+
     dat = measure.regionprops(mask)
-    #Extract the CellIDs
-    labels = []
-    xcoords = []
-    ycoords = []
-    area = []
-    minor_axis_length=[]
-    major_axis_length=[]
-    eccentricity = []
-    solidity = []
-    extent=[]
-    orientation=[]
-    for i in range(len(dat)):
-        #Get cell labels
-        labels.append(dat[i].label)
-        #Get x coordinate
-        xcoords.append(dat[i].centroid[0])
-        #Get y coordinate
-        ycoords.append(dat[i].centroid[1])
-        #Get Area
-        area.append(dat[i].area)
-        #Get the major_axis_length
-        major_axis_length.append(dat[i].major_axis_length)
-        #Get the minor_axis_length
-        minor_axis_length.append(dat[i].minor_axis_length)
-        #Get the eccentricity
-        eccentricity.append(dat[i].eccentricity)
-        #Get the solidity
-        solidity.append(dat[i].solidity)
-        #Get the extent
-        extent.append(dat[i].extent)
-        #Get the extent
-        orientation.append(dat[i].orientation)
-
-    #Form a dataframe from the lists
-    IDs = {"CellID": labels, "X_position": xcoords, "Y_position": ycoords,"Area":area,\
-        "MajorAxisLength":major_axis_length,"MinorAxisLength":minor_axis_length,\
-        "Eccentricity":eccentricity,"Solidity":solidity,"Extent":extent,"Orientation":orientation}
-    #Return the IDs object
+    n = len(dat)
+
+    # Pre-allocate numpy arrays for all properties we'll calculate.
+    labels = np.empty(n, int)
+    xcoords = np.empty(n)
+    ycoords = np.empty(n)
+    area = np.empty(n, int)
+    minor_axis_length = np.empty(n)
+    major_axis_length = np.empty(n)
+    eccentricity = np.empty(n)
+    solidity = np.empty(n)
+    extent = np.empty(n)
+    orientation = np.empty(n)
+
+    for i in range(n):
+        labels[i] = dat[i].label
+        xcoords[i] = dat[i].centroid[0]
+        ycoords[i] = dat[i].centroid[1]
+        area[i] = dat[i].area
+        major_axis_length[i] = dat[i].major_axis_length
+        minor_axis_length[i] = dat[i].minor_axis_length
+        eccentricity[i] = dat[i].eccentricity
+        solidity[i] = dat[i].solidity
+        extent[i] = dat[i].extent
+        orientation[i] = dat[i].orientation
+        # By clearing the reference to each RegionProperties object, we allow it
+        # and its cache to be garbage collected immediately. Otherwise memory
+        # usage creeps up needlessly while this function is executing.
+        dat[i] = None
+
+    IDs = {
+        "CellID": labels,
+        "X_position": xcoords,
+        "Y_position": ycoords,
+        "Area": area,
+        "MajorAxisLength": major_axis_length,
+        "MinorAxisLength": minor_axis_length,
+        "Eccentricity": eccentricity,
+        "Solidity": solidity,
+        "Extent": extent,
+        "Orientation": orientation,
+    }
+
     return IDs
 
 
-def PrepareData(mask,image,channel_names):
+def PrepareData(image,z):
     """Function for preparing input for maskzstack function. Connecting function
     to use with mc micro ilastik pipeline"""
 
@@ -85,14 +92,14 @@ def PrepareData(mask,image,channel_names):
         #Check to see if the image is ome.tif(f)
         if  image.endswith(('.ome.tif','.ome.tiff')):
             #Read the image
-            image_loaded = skimage.io.imread(image,plugin='tifffile')
+            image_loaded_z = skimage.io.imread(image,img_num=z,plugin='tifffile')
             #print('OME TIF(F) found') 
         else:
             #Read the image
-            image_loaded = skimage.io.imread(image,plugin='tifffile')
+            image_loaded_z = skimage.io.imread(image,img_num=z,plugin='tifffile')
             #print('TIF(F) found')
             # Remove extra axis
-            image_loaded = image_loaded.reshape((image_loaded.shape[1],image_loaded.shape[3],image_loaded.shape[4]))
+            #image_loaded = image_loaded.reshape((image_loaded.shape[1],image_loaded.shape[3],image_loaded.shape[4]))
 
     #Check to see if image is hdf5
     elif image_path.suffix == '.h5' or image_path.suffix == '.hdf5':
@@ -108,20 +115,11 @@ def PrepareData(mask,image,channel_names):
         ###If the hdf5 is exported from ilastik fiji plugin, the order will need to be
         ###switched as above --> z_stack = np.swapaxes(z_stack,0,2) --> z_stack = np.swapaxes(z_stack,0,1)
 
-    #Read the mask
-    mask_loaded = skimage.io.imread(mask,plugin='tifffile')
-    #Read the channels names to pass to the
-    channel_names_loaded = pd.read_csv(channel_names,header=None)
-    #Add a column index for ease
-    channel_names_loaded.columns = ["marker"]
-    #Convert the channel names to a list
-    channel_names_loaded = list(channel_names_loaded.marker.values)
-
     #Return the objects
-    return mask_loaded, image_loaded, channel_names_loaded
+    return image_loaded_z
 
 
-def MaskZstack(mask_loaded,image_loaded,channel_names_loaded):
+def MaskZstack(mask_loaded,image,channel_names_loaded):
     """This function will extract the stats for each cell mask through each channel
     in the input image
 
@@ -133,9 +131,17 @@ def MaskZstack(mask_loaded,image_loaded,channel_names_loaded):
     IDs = pd.DataFrame(MaskIDs(mask_loaded))
     #Iterate through the z stack to extract intensities
     list_of_chan = []
-    for z in range(image_loaded.shape[0]):
-        #Get the z channel and the associated channel name from list of channel names
-        list_of_chan.append(MaskChannel(mask_loaded,image_loaded[z,:,:]))
+    #Get the z channel and the associated channel name from list of channel names
+    tracemalloc.start()
+    for z in range(len(channel_names_loaded)):
+        #Run the data Prep function
+        image_loaded_z = PrepareData(image,z)
+        #Use the above information to mask z stack
+        list_of_chan.append(MaskChannel(mask_loaded,image_loaded_z))
+        #Display memory monitoring --- next 3 lines can be commented
+        current, peak = tracemalloc.get_traced_memory()
+        print(f"Current memory usage is {current / 10**6}MB; Peak was {peak / 10**6}MB")
+        print("Finished "+str(z))
     #Convert the channel names list and the list of intensity values to a dictionary and combine with CellIDs and XY
     dat = pd.concat([IDs,pd.DataFrame(dict(zip(channel_names_loaded,list_of_chan)))],axis=1)
     #Get the name of the columns in the dataframe so we can reorder to histoCAT convention
@@ -162,25 +168,45 @@ def ExtractSingleCells(mask,image,channel_names,output):
 
     #Create pathlib object for output
     output = Path(output)
-    #Run the data Prep function
-    mask_loaded, image_loaded, channel_names_loaded = PrepareData(mask,image,channel_names)
-    #Use the above information to mask z stack
-    scdata = MaskZstack(mask_loaded,image_loaded,channel_names_loaded)
+
+    #Read the channels names
+    channel_names_loaded = pd.read_csv(channel_names,header=None)
+    #Add a column index for ease
+    channel_names_loaded.columns = ["marker"]
+    #Convert the channel names to a list
+    channel_names_loaded = list(channel_names_loaded.marker.values)
+    #Read the mask
+    mask_loaded = skimage.io.imread(mask,plugin='tifffile')
+
+    scdata_z = MaskZstack(mask_loaded,image,channel_names_loaded)
     #Write the singe cell data to a csv file using the image name
+
     im_full_name = os.path.basename(image)
     im_name = im_full_name.split('.')[0]
-    scdata.to_csv(str(Path(os.path.join(str(output),str(im_name+".csv")))),index=False)
+    scdata_z.to_csv(str(Path(os.path.join(str(output),str(im_name+".csv")))),index=False)
 
 
 def MultiExtractSingleCells(mask,image,channel_names,output):
     """Function for iterating over a list of z_stacks and output locations to
     export single-cell data from image masks"""
 
-    print("Extracting sinle-cell data for "+str(image)+'...')
+    print("Extracting single-cell data for "+str(image)+'...')
+    #Start memory monitoring --- next line can be commented
+    tracemalloc.start()
+    # ---
+
     #Run the ExtractSingleCells function for this image
     ExtractSingleCells(mask,image,channel_names,output)
-    #Print update
 
+    #Display memory monitoring --- next 3 lines can be commented
+    current, peak = tracemalloc.get_traced_memory()
+    print(f"Current memory usage is {current / 10**6}MB; Peak was {peak / 10**6}MB")
+    tracemalloc.stop()
+    # ---
+
+    #Print update
     im_full_name = os.path.basename(image)
     im_name = im_full_name.split('.')[0]
     print("Finished "+str(im_name))
+
+