# Copyright (c) 2020.
#
# Yannik Benz
import argparse
import logging
import os

import numpy as np
import pandas as pd
from tqdm import tqdm

from code.attacks import simple_attacks as simple


def perturb_conllu(input_file_path, attacker, perturbation_level, output_file_path):
    """
    :param input_file_path: path to the tab-separated (word<TAB>tag) input file
    :param attacker: attack to apply
    :param perturbation_level: fraction of tokens per sentence to perturb
    :param output_file_path: file to write the perturbed data to
    """
global phonetic_cache
words = set()
with open(input_file_path, encoding='utf-8', mode='r') as f:
        for line in f:
            # skip comment lines and sentence boundaries
            if line.startswith('#') or line in ('', '\n'):
                continue
            words.add(line.split('\t')[0])
    # precompute one perturbed variant per unique word
    perturbed_word_dict = dict.fromkeys(words)
    if attacker == 'phonetic':
        # batch perturbation, reusing the phonetic (g2p/p2g) cache across calls
        phonetic_perturbed_words_dict = g2pp2g.perturb_words(list(words), phonetic_cache)
        perturbed_word_dict = {**perturbed_word_dict, **phonetic_perturbed_words_dict}
else:
for word in words:
if attacker == 'viper':
perturbed_word = viper_ices.run(word, prob=perturbation_level, top_n=20)
else:
perturbed_word = simple.simple_perturb(word, attacker, perturbation_level)
perturbed_word_dict[word] = perturbed_word
with open(output_file_path, 'w', encoding='utf-8') as out_file:
with open(input_file_path, 'r', encoding='utf-8') as in_file:
outlines = []
sample_count = 0
sentence = []
for line in in_file:
                if line.startswith('#') or line == '':  # header/comment line: copy through unchanged
                    outlines.append(line)
                elif line == '\n':  # sentence boundary: perturb the buffered sentence
                    sample_count += 1
                    perturbed_words = 0
                    word_indexes = list(range(len(sentence)))
                    # aim to perturb a perturbation_level fraction of the tokens
                    perturb_target = len(sentence) * perturbation_level
                    while perturbed_words < perturb_target:
                        if len(word_indexes) < 1:
                            break
                        index = np.random.choice(word_indexes)
                        word_indexes.remove(index)
                        word = sentence[index][0]
                        perturbed_word = perturbed_word_dict.get(word)
                        if perturbed_word is None:
                            continue
                        sentence[index][0] = perturbed_word
                        # only count tokens that actually changed
                        perturbed_words += 1 if perturbed_word != word else 0
                    outlines.extend(f"{word}\t{tag}" for (word, tag) in sentence)
                    outlines.append(line)
                    sentence = []  # reset the buffer for the next sentence
else:
sentence.append((line.split('\t')))
out_file.writelines(outlines)
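
# Usage sketch for perturb_conllu (hypothetical paths; assumes the input file
# contains word<TAB>tag lines such as "dog\tNOUN", with blank lines between
# sentences):
#   perturb_conllu('data/pos/dev.txt', 'inner-swap', 0.2,
#                  'data/pos/dev_inner-swap_low.txt')
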
def perturb_series(series: pd.Series, attacker, perturbation_level=0.2):
    """
    :param series: pandas Series of sentences to perturb
    :param attacker: attack to apply
    :param perturbation_level: 0.2 low, 0.5 mid, 0.8 high
    :return: the perturbed series
    """
global phonetic_cache
if attacker == 'viper':
return series.progress_apply(viper_ices.run, prob=perturbation_level, top_n=20)
elif attacker == 'phonetic':
perturbed, phonetic_cache = g2pp2g.perturb_series(series, phonetic_cache,
perturbation_level=perturbation_level)
return perturbed
else:
return series.progress_apply(simple.simple_perturb, method=attacker, perturbation_level=perturbation_level)
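
# Usage sketch for perturb_series (hypothetical data; requires tqdm.pandas()
# to have been called so that Series.progress_apply is available):
#   s = pd.Series(["a clean sentence", "another example"])
#   perturbed = perturb_series(s, 'keyboard-typo', perturbation_level=0.5)
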
def load_pd_data(data_dir, data_split, eval_task):
    if eval_task == 'snli':
        return pd.read_csv(os.path.join(data_dir, f'{eval_task}/{data_split}.txt'), sep='\t').dropna(
            subset=['sentence1', 'sentence2'])
    elif eval_task == 'tc':
        return pd.read_csv(os.path.join(data_dir, f"{eval_task}/{data_split}.txt"))
    raise ValueError(f"no pandas loader for task: {eval_task}")
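
# Assumed on-disk layout (inferred from the loaders above):
#   <data_dir>/snli/{train,dev,test}.txt  tab-separated, with 'sentence1'/'sentence2' columns
#   <data_dir>/tc/{train,test}.txt        comma-separated, with a 'comment_text' column
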
if __name__ == '__main__':
    # init logger (basicConfig so the log.info calls below actually print)
    logging.basicConfig(level=logging.INFO)
    log = logging.getLogger()
available_methods = ["full-swap", "inner-swap", "intrude", "disemvowel",
"truncate", "keyboard-typo", "natural-typo", "segment",
"phonetic", "viper"]
tasks = ["tc", "snli", "pos", "wiki"]
levels = ["low", "mid", "high"]
SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
parser = argparse.ArgumentParser()
parser.add_argument('--task', '-t', type=str, choices=tasks, required=True, dest='task')
parser.add_argument('--method', '-m', nargs='+', required=True,
choices=available_methods + ["all"], dest='methods')
    parser.add_argument('--sample', '-s', action='store_true', dest='sample')  # type=bool would treat any non-empty string as True
parser.add_argument('--level', '-l', type=str, choices=levels, required=True, dest='level')
    parser.add_argument('--indir', '-i', type=str, required=True, dest='indir')
# parser.add_argument('--outdir', '-o', type=str, dest='outdir', default=SCRIPT_DIR)
args = parser.parse_args()
# load CLI arguments
task = args.task
methods = args.methods
    sample = args.sample  # currently unused below
level = args.level
indir = args.indir
if 'all' in methods:
methods = available_methods
if 'phonetic' in methods:
from code.models import g2pp2g
g2pp2g.setup_gpu_share_config()
if 'viper' in methods:
from code.attacks.visual import viper_ices
# init phonetic cache
phonetic_cache = {}
tqdm.pandas() # make tqdm available for pandas
if level == 'low':
pert_level = 0.2
elif level == 'mid':
pert_level = 0.5
elif level == 'high':
pert_level = 0.8
else:
        raise ValueError(f"unknown perturbation level: {level}")
save_path = os.path.join(indir, task)
for method in methods:
log.info(f"START {method}")
for split in ["train", "dev", "test"]:
log.info(f"Perturb {split} split")
            if method == 'segment' and task in ('pos', 'wiki'):
                print("Segmentation cannot be applied to token-level tasks (pos/wiki).")
elif task == 'wiki' or task == 'pos':
perturb_conllu(os.path.join(indir, task, f'{split}.txt'), method, pert_level,
os.path.join(save_path, f'{split}_{method}_{level}.txt'))
elif task == 'tc':
if split == 'dev':
continue
tc_data = load_pd_data(indir, split, task)
tc_pert_series = tc_data.apply(
lambda x: perturb_series(x, method, perturbation_level=pert_level)
if x.name == 'comment_text' else x, axis=0)
                tc_pert_series.to_csv(os.path.join(save_path, f"{split}_{method}_{level}.txt"), index=False)
            elif task == 'snli':
                snli_data = load_pd_data(indir, split, task)
                snli_pert_series = snli_data.apply(
                    lambda x: perturb_series(x, method, perturbation_level=pert_level)
                    if x.name in ["sentence1", "sentence2"] else x, axis=0)
                snli_pert_series.to_csv(os.path.join(save_path, f"{split}_{method}_{level}.txt"), index=False, sep='\t')
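
# Example invocation (hypothetical paths; --indir must contain the task's
# sub-directory, e.g. <indir>/tc/{train,test}.txt):
#   python gen_datasets.py --task tc --method keyboard-typo intrude --level low --indir data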