-
Notifications
You must be signed in to change notification settings - Fork 0
/
pssm_to_libsvm.py
100 lines (84 loc) · 2.99 KB
/
pssm_to_libsvm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# /**
# * @author [Rosdyana Kusuma]
# * @email [[email protected]]
# * @create date 2018-04-19 04:03:20
# * @modify date 2018-04-23 03:19:21
# * @desc [convert pssm file to libsvm format]
# */
import json
from pprint import pprint
import os
import sys
# One-letter residue codes (20 entries). Currently only referenced by a
# commented-out filter in readPSSM.
AMINO = list("ARNDCQEGHILKMFPSTWYV")
# A row of twenty '0' strings, used as the padding row placed before and
# after the PSSM rows when sliding windows are built.
zero_padding = ['0'] * 20
# Lookup table loaded once at import time from 'poslist.json' in the current
# working directory; it is indexed by protein name when samples are labelled.
# The context manager makes sure the file handle is closed right after parsing.
with open('poslist.json') as _poslist_file:
    datajson = json.load(_poslist_file)
def readPSSM(filename):
    """Read the score columns of a PSSM text file.

    The first three lines and the last six lines of the file are skipped
    (presumably the PSI-BLAST header and the trailing statistics block --
    confirm against the files in use). Every remaining line is split on
    whitespace and the 20 tokens that follow the first two tokens are kept.

    Args:
        filename: path of the PSSM file to read.

    Returns:
        A list with one entry per retained line. Each entry is a list of at
        most 20 string tokens (shorter, possibly empty, for short or blank
        lines).
    """
    # The context manager closes the file even if reading fails.
    with open(filename, "r") as pssm_file:
        rows = pssm_file.readlines()[3:-6]
    # str.split() without an argument splits on any run of whitespace and
    # ignores the trailing newline, so no manual stripping or filtering of
    # empty tokens is needed.
    return [row.split()[2:22] for row in rows]
def generateDatasetWithWindowSize(pssmFile, windowSize, resultFile):
    """Append sliding-window libsvm samples for one PSSM file to resultFile.

    The rows returned by readPSSM(pssmFile) are padded with
    (windowSize - 1) // 2 copies of zero_padding on each side. One output
    line is written per window of windowSize consecutive rows, in the form
    "<label> 1:<v> 2:<v> ... \n" with features numbered from 1 and every
    feature (including the last) followed by a space.

    The label is "1" when the position associated with the window's centre
    row is contained in datajson[<protein name>], otherwise "0". For an odd
    windowSize that position is the 1-based index of the centre row in the
    unpadded matrix.
    NOTE(review): for an even windowSize the padding is asymmetric relative
    to the window, so the labelled position is shifted by one -- confirm
    whether even sizes are meant to be supported.

    Args:
        pssmFile: path of the PSSM file; the protein name is the file's base
            name up to its first ".".
        windowSize: number of rows per window; must be >= 1.
        resultFile: path of the output file, opened in append mode.

    Raises:
        ValueError: if windowSize is smaller than 1.
        KeyError: if the protein name is not a key of datajson.
    """
    if windowSize < 1:
        raise ValueError(
            "windowSize must be a positive integer, got {}".format(windowSize))

    print(pssmFile, " is processing")
    rows = readPSSM(pssmFile)

    numOfPadding = (windowSize - 1) // 2
    padding = [zero_padding] * numOfPadding
    padded = padding + rows + padding

    # basename() keeps working for nested or absolute input directories,
    # unlike picking the second "/"-separated component of the path.
    proteinName = os.path.basename(pssmFile).split(".")[0]
    # Look the protein up before opening the output so that an unknown
    # protein does not leave an empty result file behind.
    posData = datajson[proteinName]

    length = len(padded)
    print("length of pssm list ", length)

    with open(resultFile, "a") as f_result:
        # One window per start index; the range is empty when the padded
        # matrix is shorter than the window, so nothing is written then.
        for start in range(length - windowSize + 1):
            # Row of the padded matrix the label is taken from, and the
            # position it corresponds to once the leading padding is removed.
            center = start + windowSize - 1 - numOfPadding
            position = center - numOfPadding + 1
            classType = "1" if position in posData else "0"

            features = [value
                        for row in padded[start:start + windowSize]
                        for value in row]
            f_result.write(classType + " ")
            f_result.write("".join(
                "{}:{} ".format(featureNum, value)
                for featureNum, value in enumerate(features, 1)))
            f_result.write("\n")
# Command-line entry point:
#     python pssm_to_libsvm.py <pssm_dir> <window_size>
# Every entry of <pssm_dir> is converted into "<output_dir>/<name>.libsvm",
# where <name> is the entry's file name up to its first ".".
# The guard keeps the conversion from running when the module is imported.
if __name__ == "__main__":
    if len(sys.argv) < 3:
        sys.exit("usage: {} <pssm_dir> <window_size>".format(sys.argv[0]))
    dinput = sys.argv[1]
    windowSize = int(sys.argv[2])
    # NOTE(review): the slice drops the first 7 and last 4 characters of the
    # input directory name, which looks tailored to names such as
    # "similar<X>pssm" -> "similar<X>libsvm"; confirm for other inputs.
    doutput = "similar{}libsvm".format(dinput[7:-4])
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(doutput, exist_ok=True)
    for pssmfile in os.listdir(dinput):
        inputFile = "{}/{}".format(dinput, pssmfile)
        resultFile = "{}/{}.libsvm".format(doutput, pssmfile.split(".")[0])
        generateDatasetWithWindowSize(inputFile, windowSize, resultFile)