-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_dataframe.py
executable file
·51 lines (47 loc) · 1.96 KB
/
create_dataframe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import yaml
import pandas as pd
import numpy as np
from glob import glob
import sys
# Create the data table containing the samples, units and paths of all
# fastq files, formatted correctly. This is vital for the Snakemake
# pipeline: without it, the wildcards can't be created.
# Load the pipeline configuration from the YAML file named by the first
# command-line argument; `config` is read by the code below.
# NOTE(review): if the config only holds plain scalars/lists/dicts,
# yaml.safe_load would be the more conservative loader -- confirm.
config_path = sys.argv[1]
with open(config_path) as config_handle:
    config = yaml.load(config_handle, Loader=yaml.FullLoader)
def create_dataframe(fl, fpl, config, slice):
    """Build the sample/unit table used by the Snakemake pipeline.

    Args:
        fl: File names (without directory), aligned index-by-index with
            ``fpl``. Each name is split on ``_``; the first field becomes
            the sample and the second the unit.
        fpl: File paths, in the same order as ``fl``.
        config: Parsed configuration. Reads
            ``config['general']['paired_End']`` and
            ``config['general']['already_assembled']``.
        slice: End index applied to every path (``path[:slice]``);
            ``None`` keeps the path unchanged. The name shadows the
            builtin but is kept so existing callers keep working.

    Returns:
        A DataFrame with columns ``sample``, ``unit``, ``fq1``, ``fq2``
        and a 0..n-1 index. For paired-end data that is not already
        assembled, files ``2k`` and ``2k + 1`` form one row (``fq1`` and
        ``fq2``); otherwise every file gets its own row and ``fq2`` is NaN.

    Raises:
        ValueError: In paired-end mode when the number of files is odd.
        IndexError: If a file name has fewer than two ``_``-separated
            fields, or ``fpl`` is shorter than ``fl``.
    """
    columns = ['sample', 'unit', 'fq1', 'fq2']
    general = config['general']
    paired = general['paired_End'] and not general['already_assembled']

    # Rows are collected first and the frame is built once. Filling cells
    # through chained indexing (df.loc[i]['col'] = ...) writes to a
    # temporary object and is not guaranteed to update the frame.
    rows = []
    if paired:
        if len(fl) % 2:
            raise ValueError(
                'paired-end mode needs an even number of files, got %d'
                % len(fl))
        for first in range(0, len(fl), 2):
            fields = fl[first].split('_')
            rows.append((fields[0], fields[1],
                         fpl[first][:slice], fpl[first + 1][:slice]))
    else:
        for pos, name in enumerate(fl):
            fields = name.split('_')
            rows.append((fields[0], fields[1], fpl[pos][:slice], np.nan))

    return pd.DataFrame(rows, columns=columns, index=range(len(rows)),
                        dtype=object)
if __name__ == '__main__':
    # Collect the fastq paths, derive the matching file names and write
    # the resulting table to units.tsv (tab-separated).
    if not config['general']['already_assembled']:
        # Files are listed from the configured directory but referenced
        # under demultiplexed/ in the table.
        file_path_list = [
            'demultiplexed/' + name.split('/')[-1]
            for name in sorted(glob(config['general']['filename'] + '/*.gz'))
        ]
        path_end = -3  # Remove the .gz extension from the file paths.
    else:
        file_path_list = sorted(glob('results/assembly/*/*.fastq'))
        path_end = None  # Keep the paths unchanged.

    # Derive the names from the already-ordered paths instead of sorting
    # them separately: sorting by full path and by base name can disagree
    # when files live in different directories, which would pair a sample
    # name with another file's path.
    file_list = [path.split('/')[-1] for path in file_path_list]

    df = create_dataframe(file_list, file_path_list, config, path_end)
    df.to_csv('units.tsv', sep='\t')