Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cuda 11 version update #6

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@
# python main.py -task make_data -n_cpus 2
elif args.task == 'make_data':
os.chdir(PROJECT_DIR + '/src')
os.system("python make_data.py -task df")
os.system(f"python make_data.py -task train_bert -target_summary_sent abs -n_cpus {args.n_cpus}")
os.system("python make_data.py -task df") # done
os.system(f"python make_data.py -task train_bert -target_summary_sent {args.target_summary_sent} -n_cpus {args.n_cpus}")
os.system(f"python make_data.py -task test_bert -n_cpus {args.n_cpus}")

# python main.py -task train -target_summary_sent abs -visible_gpus 0
Expand All @@ -68,7 +68,7 @@
param1 = " -ext_dropout 0.1 -lr 2e-3 -batch_size 500 -train_steps 5000 -accum_count 2 -use_interval true -warmup_steps 3000 -max_pos 512"
param2 = " -ext_dropout 0.1 -lr 2e-3 -batch_size 1000 -train_steps 5000 -accum_count 2 -use_interval true -warmup_steps 3000 -max_pos 512"
param3 = " -ext_dropout 0.1 -max_pos 512 -lr 2e-3 -warmup_steps 10000 -batch_size 3000 -accum_count 2 -train_steps 50000 -use_interval true"
do_str += param3
do_str += param1

if args.train_from is None:
os.system(f'mkdir {MODEL_DIR}/{now}')
Expand All @@ -95,7 +95,7 @@
"""
os.system(f"python train.py -task ext -mode validate -test_all True"
+ f" -model_path {MODEL_DIR}/{args.model_path}"
+ f" -bert_data_path {BERT_DATA_DIR}/valid_abs"
+ f" -bert_data_path {BERT_DATA_DIR}/valid_ext"
+ f" -result_path {RESULT_DIR}/result_{args.model_path}"
+ f" -log_file {LOG_DIR}/valid_{args.model_path}.log"
+ f" -test_batch_size 500 -batch_size 3000"
Expand Down
55 changes: 52 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,55 @@
beautifulsoup4==4.6.0
bs4==0.0.1
certifi==2021.10.8
charset-normalizer==2.0.7
click==8.0.3
colorama==0.4.4
Cython==0.29.24
dill==0.3.4
emoji==1.6.1
filelock==3.3.2
flatbuffers==2.0
gluonnlp==0.10.0
graphviz==0.8.4
idna==3.3
importlib-metadata==4.8.1
install==1.3.4
joblib==1.1.0
JPype1==1.3.0
kobert @ file:///home/eklee/PycharmProjects/summ/KoBERT
konlpy==0.5.2
kss==3.3.1.1
lxml==4.6.3
mecab-python===0.996-ko-0.9.2
multiprocess==0.70.9
numpy==1.17.2
mxnet==1.8.0.post0
nltk==3.6.5
numpy==1.21.3
oauthlib==3.1.1
onnxruntime==1.9.0
packaging==21.2
pandas==1.3.4
Pillow==8.4.0
protobuf==3.19.1
pyparsing==2.4.7
pyrouge==0.1.3
transformers==3.* #pytorch-transformers==1.2.0
PySocks==1.7.1
python-dateutil==2.8.2
pytz==2021.3
regex==2021.11.1
requests==2.26.0
requests-oauthlib==1.3.0
sacremoses==0.0.46
sentencepiece==0.1.91
six==1.16.0
tensorboardX==1.9
torch==1.1.0
tokenizers==0.7.0
torch==1.8.0+cu111
torchaudio==0.8.0
torchvision==0.9.0+cu111
tqdm==4.62.3
transformers==2.11.0
tweepy==3.10.0
typing-extensions==3.10.0.2
urllib3==1.26.7
zipp==3.6.0
39 changes: 26 additions & 13 deletions src/make_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import numpy as np
import pandas as pd
from tqdm import tqdm
from tqdm import trange
import ast
import argparse
import pickle

Expand Down Expand Up @@ -148,18 +150,14 @@ def create_json_files(df, data_type='train', target_summary_sent=None, path=''):

json_list = []
for i, row in df.iloc[start_idx:end_idx].iterrows():
original_sents_list = [preprocessing(original_sent).split() # , korean_tokenizer
for original_sent in row['article_original']]

original_sents_list = [preprocessing(original_sent).split() for original_sent in row['article_original']]
summary_sents_list = []
if target_summary_sent is not None:
if target_summary_sent == 'ext':
summary_sents = row['extractive_sents']
summary_sents = ast.literal_eval(row['extractive_sents'])
elif target_summary_sent == 'abs':
summary_sents = korean_sent_spliter(row['abstractive'])
summary_sents_list = [preprocessing(original_sent).split() # , korean_tokenizer
for original_sent in summary_sents]

summary_sents = korean_sent_spliter(row['abstractive'])
summary_sents_list = [preprocessing(original_sent).split() for original_sent in summary_sents]
json_list.append({'src': original_sents_list,
'tgt': summary_sents_list
})
Expand All @@ -176,7 +174,7 @@ def create_json_files(df, data_type='train', target_summary_sent=None, path=''):
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument("-task", default=None, type=str, choices=['df', 'train_bert', 'test_bert'])
parser.add_argument("-target_summary_sent", default='abs', type=str)
parser.add_argument("-target_summary_sent", default='ext', type=str)
parser.add_argument("-n_cpus", default='2', type=str)

args = parser.parse_args()
Expand All @@ -190,7 +188,7 @@ def create_json_files(df, data_type='train', target_summary_sent=None, path=''):
# import data
with open(f'{RAW_DATA_DIR}/train.jsonl', 'r') as json_file:
train_json_list = list(json_file)
with open(f'{RAW_DATA_DIR}/extractive_test_v2.jsonl', 'r') as json_file:
with open(f'{RAW_DATA_DIR}/test.jsonl', 'r') as json_file:
test_json_list = list(json_file)

trains = []
Expand All @@ -204,7 +202,23 @@ def create_json_files(df, data_type='train', target_summary_sent=None, path=''):

# Convert raw data to df
df = pd.DataFrame(trains)
df['extractive_sents'] = df.apply(lambda row: list(np.array(row['article_original'])[row['extractive']]) , axis=1)
import ast
#df['extractive'] = df['extractive'].apply(lambda x: json.loads(x))
#df['article_original'] = df['article_original'].apply(lambda x: ast.literal_eval(x))

new_df = []
for i in trange(len(df)):
df_len = len(df.iloc[i]['extractive'])
df_text = []
for j in range(df_len):
idx = df.iloc[i]['extractive'][j]
if idx is None:
pass
else:
df_text.append(df.iloc[i]['article_original'][idx])
new_df.append(str(df_text))
df = pd.concat([df, pd.DataFrame(new_df, columns=['extractive_sents'])], axis=1)
#df['extractive_sents'] = df.apply(lambda row: list(np.array(row['article_original'])[row['extractive']]) , axis=1)

# random split
train_df = df.sample(frac=0.95,random_state=42) #random state is a seed value
Expand All @@ -231,7 +245,6 @@ def create_json_files(df, data_type='train', target_summary_sent=None, path=''):

for data_type in ['train', 'valid']:
df = pd.read_pickle(f"{RAW_DATA_DIR}/{data_type}_df.pickle")

## make json file
# An existing file with the same name is ignored rather than overwritten, so delete the folder's contents before recreating it
json_data_dir = f"{JSON_DATA_DIR}/{data_type}_{args.target_summary_sent}"
Expand All @@ -241,7 +254,7 @@ def create_json_files(df, data_type='train', target_summary_sent=None, path=''):
os.mkdir(json_data_dir)

create_json_files(df, data_type=data_type, target_summary_sent=args.target_summary_sent, path=JSON_DATA_DIR)

## Convert json to bert.pt files
bert_data_dir = f"{BERT_DATA_DIR}/{data_type}_{args.target_summary_sent}"
if os.path.exists(bert_data_dir):
Expand Down
2 changes: 1 addition & 1 deletion src/make_submission.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
# python make_submission.py result_1209_1236_step_7000.candidate
if __name__ == '__main__':
# test set
with open(RAW_DATA_DIR + '/extractive_test_v2.jsonl', 'r') as json_file:
with open(RAW_DATA_DIR + '/test.jsonl', 'r') as json_file:
json_list = list(json_file)

tests = []
Expand Down
9 changes: 6 additions & 3 deletions src/models/data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,16 @@ def __init__(self, data=None, device=None, is_test=False):
tgt = torch.tensor(self._pad(pre_tgt, 0))

segs = torch.tensor(self._pad(pre_segs, 0))
mask_src = 1 - (src == 0)
mask_tgt = 1 - (tgt == 0)
mask_src = ~(src == 0)
mask_tgt = ~(tgt == 0)
#mask_src = 1 - (src == 0)
#mask_tgt = 1 - (tgt == 0)


clss = torch.tensor(self._pad(pre_clss, -1))
src_sent_labels = torch.tensor(self._pad(pre_src_sent_labels, 0))
mask_cls = 1 - (clss == -1)
mask_cls = ~(clss == -1)
#mask_cls = 1 - (clss == -1)
clss[clss == -1] = 0
setattr(self, 'clss', clss.to(device))
setattr(self, 'mask_cls', mask_cls.to(device))
Expand Down
2 changes: 1 addition & 1 deletion src/models/encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def forward(self, top_vecs, mask):
x = x + pos_emb

for i in range(self.num_inter_layers):
x = self.transformer_inter[i](i, x, x, 1 - mask) # all_sents * max_tokens * dim
x = self.transformer_inter[i](i, x, x, ~mask) # all_sents * max_tokens * dim

x = self.layer_norm(x)
sent_scores = self.sigmoid(self.wo(x))
Expand Down