Hmi vae init #1

Open · wants to merge 18 commits into base: main
Changes from 1 commit
Some comments added to clinical_associations_latent_dim_and_cluster_prevalence.py
shanzaayub authored Jun 5, 2023
commit 36db03a3d4808a3144878e1fbf44c4349829e00a
33 changes: 18 additions & 15 deletions hmivae/clinical_associations_latent_dim_and_cluster_prevalence.py
@@ -76,7 +76,10 @@

dfs = []

-for cvar in clinical_variables:
+# cvar is the clinical variable
+# sub_cvar are the values the clinical variable can take on, e.g. for cvar == ERStatus, sub_cvar == pos or sub_cvar == neg
+
+for cvar in clinical_variables: # using all latent dims for this pass
cvar_dfs = []

for sub_cvar in patient_latent[cvar].unique():
@@ -85,13 +88,13 @@
selected_df = patient_latent.copy()[
~patient_latent[cvar].isna()
] # drop nan values for each var
-selected_df[cvar] = list(map(int, selected_df[cvar] == sub_cvar))
+selected_df[cvar] = list(map(int, selected_df[cvar] == sub_cvar)) # map to 1 for entries that belong to this sub_cvar and 0 otherwise

X = selected_df[
latent_dim_cols
].to_numpy() # select columns corresponding to latent dims and convert to numpy
X = sm.add_constant(X) # add constant
-y = selected_df[
+y = selected_df[ # this is the 0/1 column
cvar
].to_numpy() # select the clinical variable column and convert to numpy -- no fillna(0) since all the nans should have been dropped
try:
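The hunks above show the core pattern of this script: each level of a clinical variable becomes a one-vs-rest 0/1 target, which is then regressed on the latent dimensions with statsmodels. A minimal, self-contained sketch of that pattern; the toy `ERStatus` data, sizes, and seeds here are illustrative, not taken from the repo:

```python
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Toy stand-in for patient_latent: latent dims plus one clinical variable.
rng = np.random.default_rng(0)
latent_dim_cols = ["z0", "z1", "z2"]
patient_latent = pd.DataFrame(rng.normal(size=(50, 3)), columns=latent_dim_cols)
patient_latent["ERStatus"] = rng.choice(["pos", "neg"], size=50)
patient_latent.loc[patient_latent.sample(5, random_state=0).index, "ERStatus"] = np.nan

cvar, sub_cvar = "ERStatus", "pos"
selected_df = patient_latent[~patient_latent[cvar].isna()].copy()  # drop NaNs for this variable
selected_df[cvar] = (selected_df[cvar] == sub_cvar).astype(int)    # 1 = this level, 0 = any other level

X = sm.add_constant(selected_df[latent_dim_cols].to_numpy())  # latent dims + intercept
y = selected_df[cvar].to_numpy()                              # binary target
log_reg = sm.Logit(y, X).fit(disp=0)                          # can raise on perfect separation
print(log_reg.tvalues[1:])                                    # one t-value per latent dim
```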
@@ -130,7 +133,7 @@
].to_numpy() # select columns corresponding to latent dims and convert to numpy

perf_sep_features = []
-for i in range(X.shape[1]):
+for i in range(X.shape[1]): # introduce each latent dim one at a time to see which causes issues
X_1 = X.copy()[:, 0 : i + 1]
X_1 = sm.add_constant(X_1) # add constant
try:
@@ -141,7 +144,7 @@
# print(log_reg.summary())
except Exception as e:
print(f"{cvar}:{sub_cvar} for feature {i} has exception: {e}")
-perf_sep_features.append(i)
+perf_sep_features.append(i) # store the issue-causing latent dim

# if len(perf_sep_features) == 0:
# sub_cvar_df = pd.DataFrame({})
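This pass probes for fits that fail, typically because of perfect separation: the latent dimensions are introduced one at a time, and any dimension whose addition makes `sm.Logit` raise is recorded. A small sketch of the same probing loop on synthetic `X`/`y` (illustrative only):

```python
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(0)
X = rng.normal(size=(50, 4))       # 4 latent dims
y = rng.integers(0, 2, size=50)    # binary clinical label

perf_sep_features = []
for i in range(X.shape[1]):
    X_1 = sm.add_constant(X[:, : i + 1])  # dims 0..i plus intercept
    try:
        sm.Logit(y, X_1).fit(disp=0)
    except Exception as e:                # e.g. a perfect-separation error
        print(f"feature {i} raised: {e}")
        perf_sep_features.append(i)       # remember the offending latent dim
```

Which dimension gets flagged depends on the ones already included, so the probe is order-dependent; the script simply records the indices and drops them in the next step.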
@@ -173,11 +176,11 @@
latent_dim_cols
].to_numpy() # select columns corresponding to latent dims and convert to numpy
del_inds = del_inds
-X = np.delete(X, del_inds, axis=1)
+X = np.delete(X, del_inds, axis=1) # delete the issue-causing latent dims from the full set
print(X.shape)
X = sm.add_constant(X) # add constant
try:
-log_reg = sm.Logit(y, X).fit() # fit the Logistic Regression model
+log_reg = sm.Logit(y, X).fit() # fit the Logistic Regression model on the remaining latent dims
print(
f"Completed: tvalues for {cvar}:{sub_cvar}, features till {i} -> {log_reg.tvalues}"
)
@@ -187,7 +190,7 @@

tvalues = log_reg.tvalues[1:].tolist() # + [np.nan]

-for i in del_inds:
+for i in del_inds: # for latent dims that caused issues, store their t-values as NaN so we know which ones didn't work
if i > len(tvalues):
tvalues = np.insert(tvalues, i - 1, np.nan)
else:
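After the problematic dimensions are removed and the model refit, NaNs are inserted back at their positions so the t-value vector still lines up with the original latent-dimension indices. A simplified sketch of that step, assuming `del_inds` is sorted in ascending order (the script's `i > len(tvalues)` branch guards against an out-of-range insert index); the toy data is illustrative:

```python
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(1)
X = rng.normal(size=(60, 5))     # 5 latent dims
y = rng.integers(0, 2, size=60)
del_inds = [2, 4]                # latent dims flagged as problematic above

X_reduced = np.delete(X, del_inds, axis=1)                     # drop the offending columns
log_reg = sm.Logit(y, sm.add_constant(X_reduced)).fit(disp=0)

tvalues = log_reg.tvalues[1:].tolist()   # skip the intercept
for i in del_inds:                       # put NaN back at each dropped position
    tvalues = np.insert(tvalues, i, np.nan)
print(tvalues)                           # one entry per original latent dim, NaN where dropped
```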
@@ -238,7 +241,7 @@
hi_or_low = hi_or_low.groupby([patient_col, cluster_col]).size().unstack(fill_value=0)


-hi_or_low = hi_or_low.div(hi_or_low.sum(axis=1), axis=0).fillna(0)
+hi_or_low = hi_or_low.div(hi_or_low.sum(axis=1), axis=0).fillna(0) # get the proportion of each cluster in each patient (each row sums to 1)


hi_low_cluster_variables = (
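The `groupby(...).size().unstack(...)` plus row-wise division above turns a per-cell table into per-patient cluster proportions. A toy sketch of that transformation, where `patient`/`cluster` stand in for the script's `patient_col`/`cluster_col`:

```python
import pandas as pd

# One row per cell, labelled with its patient and its cluster.
cells = pd.DataFrame({
    "patient": ["p1", "p1", "p1", "p2", "p2"],
    "cluster": [0, 0, 1, 1, 2],
})

counts = cells.groupby(["patient", "cluster"]).size().unstack(fill_value=0)  # patients x clusters
props = counts.div(counts.sum(axis=1), axis=0).fillna(0)  # each row now sums to 1
print(props)  # e.g. p1 -> [2/3, 1/3, 0], p2 -> [0, 1/2, 1/2]
```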
@@ -278,7 +281,7 @@
prop_cluster_cols
].to_numpy() # select columns corresponding to latent dims and convert to numpy
tvalues = {}
-for cluster in range(X.shape[1]):
+for cluster in range(X.shape[1]): # fit each cluster one by one since the proportions add up to 1, so a joint Logit won't work
X1 = X[:, cluster]
X1 = sm.add_constant(X1)
try:
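Because those proportions sum to 1 for every patient, the full set of columns is collinear with the intercept, so each cluster's proportion is tested in its own logistic regression rather than jointly. A sketch of that per-cluster loop on synthetic proportions (the Dirichlet draw is just a convenient way to make rows that sum to 1):

```python
import numpy as np
import statsmodels.api as sm

rng = np.random.default_rng(2)
props = rng.dirichlet(np.ones(4), size=80)   # 80 patients x 4 cluster proportions, rows sum to 1
y = rng.integers(0, 2, size=80)              # binary clinical label

tvalues = {}
for cluster in range(props.shape[1]):        # one cluster at a time
    X1 = sm.add_constant(props[:, cluster])  # single proportion column + intercept
    try:
        tvalues[cluster] = sm.Logit(y, X1).fit(disp=0).tvalues[1]
    except Exception:
        tvalues[cluster] = np.nan            # keep a placeholder if the fit fails
print(tvalues)
```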
@@ -345,18 +348,18 @@
s_cluster_prevs = {}
mask = tifffile.imread(
f"../../../data/{cohort_dirs[cohort][0]}/{sample}{cohort_dirs[cohort][1]}"
-)
+) # get dims of the image
sample_df = adata_df.copy().query("Sample_name==@sample")
for cluster in clusters:
num_cells_in_sample = Counter(sample_df.leiden.tolist())
-num_cells_in_clusters = num_cells_in_sample[cluster]
+num_cells_in_clusters = num_cells_in_sample[cluster] # get the number of cells belonging to each cluster for each image

# print(num_cells_in_clusters)
# print(mask.shape[0] , mask.shape[1])

cluster_prevalance_per_mm2 = (
num_cells_in_clusters / (mask.shape[0] * mask.shape[1])
-) * 1e6 # scale, 1 pixel == 1 micron
+) * 1e6 # 1 pixel == 1 micron; get the prevalence and scale to per mm^2

s_cluster_prevs[cluster] = cluster_prevalance_per_mm2
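Here the mask is only read for its pixel dimensions: with 1 pixel == 1 micron, `mask.shape[0] * mask.shape[1]` is the image area in µm², and multiplying by 1e6 converts the count per µm² into cells per mm². A small sketch with a stand-in mask (in the script the mask comes from `tifffile.imread`; the toy values are illustrative):

```python
from collections import Counter

import numpy as np

mask = np.zeros((1000, 800), dtype=np.uint16)   # stand-in for tifffile.imread(...); only its shape is used
cell_clusters = [0, 0, 1, 2, 1, 0]              # cluster label of every cell in this image

num_cells_in_sample = Counter(cell_clusters)
s_cluster_prevs = {}
for cluster, n_cells in num_cells_in_sample.items():
    # pixel count == area in um^2 (1 pixel per micron), and 1 mm^2 == 1e6 um^2
    s_cluster_prevs[cluster] = n_cells / (mask.shape[0] * mask.shape[1]) * 1e6
print(s_cluster_prevs)                          # cells per mm^2, keyed by cluster
```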

@@ -409,7 +412,7 @@
cvar
].to_numpy() # select the clinical variable column and convert to numpy -- no fillna(0) since all the nans should have been dropped
try:
-log_reg = sm.Logit(y, X).fit() # fit the Logistic Regression model
+log_reg = sm.Logit(y, X).fit() # fit the Logistic Regression model, all clusters together this time since there's no need to do them one by one here

sub_cvar_df["cluster"] = [c for c in cluster_cols]

@@ -421,7 +424,7 @@

cvar_dfs.append(sub_cvar_df)
except Exception as e:
-exception_variables.append((cvar, sub_cvar))
+exception_variables.append((cvar, sub_cvar)) # keep track of the exception variables, though they aren't handled further
print(f"{cvar}:{sub_cvar} had an exception occur: {e}")

full_cvar_dfs = pd.concat(cvar_dfs)
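Unlike the proportion model, the per-mm² prevalences don't sum to a constant, so all cluster columns can go into a single Logit fit, and the per-level results are then stacked with `pd.concat`. A sketch of that joint fit and the result assembly; the column names, shapes, and toy values are illustrative:

```python
import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(3)
cluster_cols = [f"cluster_{c}" for c in range(4)]
prevs = pd.DataFrame(rng.gamma(2.0, 1.0, size=(80, 4)), columns=cluster_cols)  # toy cluster prevalences
y = rng.integers(0, 2, size=80)                 # binary clinical label

cvar_dfs = []
X = sm.add_constant(prevs.to_numpy())
log_reg = sm.Logit(y, X).fit(disp=0)            # all clusters together; no sum-to-one constraint here
sub_cvar_df = pd.DataFrame({
    "cluster": cluster_cols,
    "tvalue": log_reg.tvalues[1:],              # skip the intercept
})
cvar_dfs.append(sub_cvar_df)

full_cvar_dfs = pd.concat(cvar_dfs)             # stack results across variables/levels
print(full_cvar_dfs)
```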