main.py (forked from mana438/RNABERT)
import os

import numpy as np
import pandas as pd
import torch

from .dataload import DATA
from .utils import (
    get_config,
    BertModel,
    BertForMaskedLM,
)

FILE_LOC = os.path.dirname(os.path.realpath(__file__))

# Reference implementation: https://github.com/mana438/RNABERT
# Paper: https://academic.oup.com/nargab/article/4/1/lqac012/6534363
class RNABERT:
    """Wrapper around a pretrained RNABERT masked language model that turns a
    FASTA file of RNA sequences into fixed-size embedding vectors."""

    def __init__(
        self,
        batch_size=20,
        config=FILE_LOC + "/RNA_bert_config.json",
        pretrained_model=FILE_LOC + "/bert_mul_2.pth",
        device="cuda",
    ):
        self.file_location = os.path.dirname(os.path.realpath(__file__))
        self.batch_size = batch_size
        self.config = get_config(file_path=config)
        self.max_length = self.config.max_position_embeddings
        self.maskrate = 0  # mask rate 0: sequences are embedded without masked-LM corruption
        self.mag = 1
        # Hidden size is derived from the attention-head count, as in the original RNABERT config.
        self.config.hidden_size = self.config.num_attention_heads * self.config.multiple
        model = BertModel(self.config)
        self.model = BertForMaskedLM(self.config, model)
        self.model.to(device)
        print("device: ", device)
        self.device = device
        if device == "cuda":
            self.model = torch.nn.DataParallel(self.model)  # wrap for multi-GPU inference
        # map_location lets the checkpoint load even if it was saved on a different device.
        self.model.load_state_dict(torch.load(pretrained_model, map_location=device))

    def __call__(self, fasta_file):
        print("-----start-------")
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        data = DATA(config=self.config, batch=self.batch_size)
        seqs, names, dataloader = data.load_data_EMB([fasta_file])
        features = self.make_feature(self.model, dataloader, seqs)
        # Sum the per-base embeddings along the sequence axis so that each
        # sequence is represented by a single fixed-size vector.
        features = np.array([np.array(embedding).sum(0) for embedding in features])
        return pd.DataFrame(features, index=names)

    def make_feature(self, model, dataloader, seqs):
        """Run the model over the dataloader and return the per-base encoder
        embeddings for every input sequence."""
        model.eval()
        torch.backends.cudnn.benchmark = True
        encoding = []
        with torch.no_grad():  # inference only; no gradients are needed
            for batch in dataloader:
                data, label, seq_len = batch
                inputs = data.to(self.device)
                _, _, encoded_layers = model(inputs)
                encoding.append(encoded_layers.cpu().detach().numpy())
        encoding = np.concatenate(encoding, 0)
        # Drop the padded positions: keep only the embeddings of the real bases.
        embedding = []
        for e, seq in zip(encoding, seqs):
            embedding.append(e[: len(seq)].tolist())
        return embedding
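

# Example usage: a minimal sketch. It assumes a placeholder FASTA file
# "example.fasta" and that this module is executed as part of its package
# (e.g. `python -m <package>.main`) so the relative imports resolve.
if __name__ == "__main__":
    embedder = RNABERT(
        batch_size=20,
        device="cuda" if torch.cuda.is_available() else "cpu",
    )
    embeddings = embedder("example.fasta")  # DataFrame: one row per input sequence
    print(embeddings.shape)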