-
Notifications
You must be signed in to change notification settings - Fork 0
/
Count_Kmer.py
executable file
·61 lines (51 loc) · 1.85 KB
/
Count_Kmer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/usr/bin/env python
"""
Name : Sara Nicholson
Date : March 2024
Description: Counts kmers for each fasta file in a directory
"""
from pathlib import Path
import sys
import re
import os
def count_kmer(fastq_file, kmer_length, outfile):
# initialize variable to place sequence
seq = ''
# Initialize kmer dictionary
kmer_dictionary = {}
# open file & add kmers from each sequence (seq) to kmer dictionary
with open(fastq_file, 'r') as fastq_seq:
for line in fastq_seq:
line = line.rstrip()
if re.match('^[ATGCN]+$', line):
seq += line
# set stop to length of sequence
stop = len(seq) - kmer_length + 1
for start in range(0, stop):
kmer = seq[start:start + kmer_length]
if kmer in kmer_dictionary:
# If kmer is already in dictionary, add 1 to count
kmer_dictionary[kmer] += 1
else:
# If kmer is not in dictionary, add the kmer to dictionary with count = 1
kmer_dictionary[kmer] = 1
# clear the sequence variable to add next sequence
seq = ''
# Set variable for tab seperation
sep = "\t"
# Open output file for writing
with open(outfile, 'w') as out:
for kmer in kmer_dictionary:
count = kmer_dictionary[kmer]
output = (kmer, str(count))
# format output to kmer + tab + count + newline
out.write(sep.join(output) + "\n")
if __name__ == "__main__":
# Initializations
fasta = str(sys.argv[1])
k_size = int(sys.argv[2])
# outfile = str(sys.argv[3])
for filename in os.listdir(fasta):
f = os.path.join(fasta, filename)
outfile = Path(f).stem
count_kmer(f, k_size, outfile)