-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFrequencyAnalyzer.py
85 lines (73 loc) · 3.02 KB
/
FrequencyAnalyzer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from sys import argv
from itertools import *
from decimal import *
from datetime import *
### SETTINGS ###
# Alphabet the words are built from, and the fixed length of every word.
characters = 'GATC'
wordLength = 9
# Output formatting: decimal mark used for relative frequencies and the
# column delimiter of the generated table.
decimalSeperator = ','
csvSeperator = '| '
#decimalPrecision = 15
#getcontext().prec = decimalPrecision

# Taken before the dictionary is built so the final runtime report
# includes that set-up cost.
startTime = datetime.now()
charactersSet = set(characters)

# Command line: the first argument is the output file, all following
# arguments are input files.
outputFilePath, inputFilePaths = argv[1], argv[2:]

# Every possible word of length `wordLength` over `characters`, in the
# order itertools.product yields them.
dictionary = [''.join(letters) for letters in product(characters, repeat=wordLength)]
def getLinesFromPath(path):
    """Read a text file and return its lines without line terminators.

    Args:
        path: Path of the file to read (opened in text mode with the
            platform default encoding).

    Returns:
        A list of strings, one per line, as produced by ``str.splitlines``.
        An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Use a context manager so the handle is closed even if reading fails.
    with open(path, "r") as inputFile:
        return inputFile.read().splitlines()
def calculateFrequenciesForFile(path):
    """Count how often each dictionary word occurs in one input file.

    A line counts as a valid word when it is exactly ``wordLength``
    characters long and uses only characters from ``charactersSet``.
    All other lines are ignored and reported as mismatches.

    Args:
        path: Path of the input file, one candidate word per line.

    Returns:
        A dict with the keys 'frequencyTable' (count for every word in
        ``dictionary``, zero when absent), 'totalWords' (number of valid
        lines), 'uniqueWords' (number of distinct valid words) and
        'filePath' (the given path).
    """
    startedAt = datetime.now()
    print('\n##### Calculating Frequencies for ', path, ' #####')
    lines = getLinesFromPath(path)

    # Start every possible word at zero so absent words still show up.
    counts = dict.fromkeys(dictionary, 0)
    validLines = 0
    distinctWords = 0
    for line in lines:
        if len(line) != wordLength or not (set(line) <= charactersSet):
            continue
        if counts[line] == 0:
            # First sighting of this word in the current file.
            distinctWords += 1
        counts[line] += 1
        validLines += 1

    print(validLines, 'valid lines analyzed in input file. (ignoring', len(lines) - validLines, 'mismatches)')
    print(distinctWords, 'unique words found')
    elapsed = datetime.now() - startedAt
    print('File analyzed in', str(elapsed.seconds) + 's')
    return {
        'frequencyTable': counts,
        'totalWords': validLines,
        'uniqueWords': distinctWords,
        'filePath': path,
    }
def analyzeAllFiles(paths):
    """Analyze every given input file and write the combined result table.

    Each path is processed with ``calculateFrequenciesForFile``; the
    collected results are then written to the module-level
    ``outputFilePath`` via ``writeOutputFile``. Progress and the total
    runtime since ``startTime`` are printed to stdout.

    Args:
        paths: Iterable of input file paths to analyze.
    """
    # Honour the argument instead of the global inputFilePaths, and
    # materialize it so len() works for any iterable.
    paths = list(paths)
    print('\n######## Calculating word frequencies in ', len(paths), ' input files. ########')
    resultsets = [calculateFrequenciesForFile(path) for path in paths]
    print('\n######## Finished analyzing ', len(resultsets), ' files', ' ########')
    print ('-'*80)
    print('\nStart writing output to file ', outputFilePath)
    writeOutputFile(resultsets, outputFilePath)
    print('Finished writing output file.')
    timeElapsed = datetime.now() - startTime
    print ('-'*80)
    print('\n##### Valid answer found in ', str(timeElapsed.seconds ) + 's' + ': 42 #####\n\n\n')
def writeOutputFile(results, outputPath):
    """Write the frequency tables of all analyzed files to one text table.

    Layout: a ``sep=`` hint line, then a header row (``#Word``, one
    ``abs_<path>`` column per result, one ``rel_<path>`` column per
    result), then one row per word in ``dictionary``. Columns are joined
    with ``csvSeperator``; in relative frequencies the '.' is replaced by
    ``decimalSeperator``.

    Args:
        results: Sequence of dicts with the keys 'frequencyTable',
            'totalWords' and 'filePath', as returned by
            ``calculateFrequenciesForFile``.
        outputPath: Path of the file to create or overwrite.

    Raises:
        OSError: If the output file cannot be opened or written.
    """
    start = datetime.now()

    header = ['#Word']
    header.extend('abs_' + result['filePath'] for result in results)
    header.extend('rel_' + result['filePath'] for result in results)

    # Loop-invariant per-file lookups, hoisted out of the per-word loop.
    tables = [result['frequencyTable'] for result in results]
    totals = [result['totalWords'] for result in results]

    # Write to the path that was passed in (not the global), and let the
    # context manager close the file even if a write fails.
    with open(outputPath, 'w') as outputFile:
        outputFile.write('sep=' + csvSeperator + '\n')
        outputFile.write(csvSeperator.join(header) + '\n')
        for word in dictionary:
            absolute = [table[word] for table in tables]
            # A file without any valid word has totalWords == 0; report a
            # relative frequency of 0.0 instead of dividing by zero.
            relative = [
                count / total if total else 0.0
                for count, total in zip(absolute, totals)
            ]
            cells = [word]
            cells.extend(str(count) for count in absolute)
            cells.extend(str(share).replace('.', decimalSeperator) for share in relative)
            # One write per row instead of one per cell.
            outputFile.write(csvSeperator.join(cells) + '\n')

    timeElapsed = datetime.now() - start
    print('Output file written in', str(timeElapsed.seconds) + 's')
# Script entry point: analyze the input files named on the command line
# (everything after the output path) and write the combined table.
analyzeAllFiles(inputFilePaths)