Skip to content

Commit

Permalink
Create Fisher_score.py
Browse files Browse the repository at this point in the history
  • Loading branch information
joanocha authored Jul 16, 2024
1 parent 923c356 commit cc03cea
Showing 1 changed file with 41 additions and 0 deletions.
41 changes: 41 additions & 0 deletions hdgp_tgp_AMY_popgen/Fisher_score.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import pandas as pd
import numpy as np
import argparse

def process_superpopulation(superpopulation):
# Read the gzipped file into a pandas DataFrame
df = pd.read_csv("genomewide.salti.lassip.hap.stats.gz", compression='gzip', delimiter='\t')

# Filter the DataFrame for the specified superpopulation
df_filtered = df[df['Superpopulation'] == superpopulation]

# Rank the 'h12' and 'h2h1' columns
df_filtered['h12_rank'] = df_filtered['h12'].rank(ascending=False)
df_filtered['h2h1_rank'] = df_filtered['h2h1'].rank(ascending=False)

# Calculate the number of windows (rows)
nwindows = df_filtered.shape[0]

# Calculate the r_h12 and r_h2h1 columns
df_filtered['r_h12'] = np.log(df_filtered['h12_rank'] / nwindows)
df_filtered['r_h2h1'] = np.log(df_filtered['h2h1_rank'] / nwindows)

# Calculate the Fscore column
df_filtered['Fscore'] = -2 * (df_filtered['r_h12'] + df_filtered['r_h2h1'])

# Save the filtered DataFrame to a gzipped TSV file
output_file_path = f'Fisher_{superpopulation}_windows.tsv.gz'
df_filtered.to_csv(output_file_path, sep='\t', index=False, compression='gzip')

return df_filtered.head()

if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Process superpopulation data.')
parser.add_argument('superpopulation', type=str, help='The superpopulation to process (e.g., WEA, AFR, CAS, EA).')

args = parser.parse_args()
superpopulation = args.superpopulation

result = process_superpopulation(superpopulation)
print(result)

0 comments on commit cc03cea

Please sign in to comment.