-
Notifications
You must be signed in to change notification settings - Fork 2
/
push_to_hf.py
148 lines (130 loc) · 4.62 KB
/
push_to_hf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
from datasets import load_dataset
# Per-dataset source column names: (utterance_col, parse_col[, locale_col]).
# A third entry means the dataset already carries its own locale column.
parsing_name_mapping = dict(
    mtop=("utterance", "dcp_form", "locale"),
    hinglish_top=("cs_query", "cs_parse"),
    top_v2=("utterance", "semantic_parse"),
    cstop=("utterance", "semantic_parse"),
    cstop_artificial=("utterance", "semantic_parse"),
)
def fix_header_factory(dataset_name=None, uniform=False, locale=None):
    """Build a per-example mapping function for ``datasets.Dataset.map``.

    Args:
        dataset_name: Key into ``parsing_name_mapping`` selecting which
            source columns hold the utterance / parse (only used when
            ``uniform`` is True).
        uniform: When True, rename the dataset-specific columns to the
            shared schema ``utterance`` / ``semantic_parse`` (plus
            ``locale`` when the dataset carries a locale column) and drop
            everything else. When False, only strip whitespace from
            column names.
        locale: Constant locale string to attach to every example as a
            ``locale`` column (e.g. "hin_en", "spa_en", "en").

    Returns:
        A function mapping one example dict to a cleaned example dict.
    """

    def fix_headers(examples):
        new_examples = {}
        if uniform:
            # hinglish_top has parallel English columns; select them when
            # building the English-side view of that dataset.
            if dataset_name == "hinglish_top" and locale == "en":
                columns = ("en_query", "en_parse")
            else:
                columns = parsing_name_mapping[dataset_name]
            mapping = {columns[0]: "utterance", columns[1]: "semantic_parse"}
            if len(columns) == 3:
                mapping[columns[2]] = "locale"
        # BUG FIX: this was `elif locale:`, which never executed when
        # uniform=True — so the constant locale column requested for the
        # uniform hinglish_top/top_v2/cstop/cstop_artificial views was
        # silently dropped. A provided locale is now always attached.
        if locale:
            new_examples["locale"] = locale
        for header in examples.keys():
            f_header = header.strip()  # source CSV headers may have stray spaces
            if not uniform:
                new_examples[f_header] = examples[header]
            elif f_header in mapping:
                new_examples[mapping[f_header]] = examples[header]
        return new_examples

    return fix_headers
langs = ["en", "de", "es", "fr", "hi", "th"]
splits = ["eval", "test", "train"]
split_dict = {}
for lang in langs:
for split in splits:
split_dict[f"{split}_{lang}"] = f"./mtop/{lang}/{split}.txt.csv"
d = load_dataset("csv", data_files=split_dict)
d.map(fix_header_factory()).push_to_hub("WillHeld/mtop")
uniform_mtop = d.map(
fix_header_factory(dataset_name="mtop", uniform=True),
remove_columns=[
column
for column in d["train_en"].column_names
if column not in ["utterance", "semantic_parse", "locale"]
],
)
splits = ["validation", "test", "train"]
split_dict = {}
for split in splits:
split_dict[f"{split}"] = f"./cstop/{split}.tsv.csv"
d = load_dataset("csv", data_files=split_dict)
d.map(fix_header_factory()).push_to_hub("WillHeld/hinglish_top")
uniform_hinglish_top = d.map(
fix_header_factory(dataset_name="hinglish_top", uniform=True, locale="hin_en"),
remove_columns=[
column
for column in d["train"].column_names
if column not in ["utterance", "semantic_parse", "locale"]
],
)
uniform_hinglish_top_en = d.map(
fix_header_factory(dataset_name="hinglish_top", uniform=True, locale="en"),
remove_columns=[
column
for column in d["train"].column_names
if column not in ["utterance", "semantic_parse", "locale"]
],
)
splits = ["eval", "test", "train"]
domains = [
"alarm",
"event",
"messaging",
"music",
"navigation",
"reminder",
"timer",
"weather",
]
split_dict = {}
for split in splits:
split_dict[split] = [f"./topv2/{domain}_{split}.tsv.csv" for domain in domains]
d = load_dataset("csv", data_files=split_dict)
d.map(fix_header_factory()).push_to_hub("WillHeld/top_v2")
uniform_top_v2 = d.map(
fix_header_factory(dataset_name="top_v2", uniform=True, locale="en"),
remove_columns=[
column
for column in d["train"].column_names
if column not in ["utterance", "semantic_parse", "locale"]
],
)
splits = ["eval", "test", "train"]
split_dict = {}
for split in splits:
split_dict[split] = f"./CSTOP/CSTOP_{split}.tsv.csv"
d = load_dataset("csv", data_files=split_dict)
d.map(fix_header_factory()).push_to_hub("WillHeld/cstop")
uniform_cstop = d.map(
fix_header_factory(dataset_name="cstop", uniform=True, locale="spa_en"),
remove_columns=[
column
for column in d["train"].column_names
if column not in ["utterance", "semantic_parse", "locale"]
],
)
print(uniform_cstop)
splits = ["eval", "test", "train"]
split_dict = {}
for split in splits:
split_dict[split] = f"./cstop_artificial/{split}.csv"
d = load_dataset("csv", data_files=split_dict)
d.map(fix_header_factory()).push_to_hub("WillHeld/cstop_artificial")
uniform_cstop_a = d.map(
fix_header_factory(dataset_name="cstop_artificial", uniform=True, locale="en"),
remove_columns=[
column
for column in d["train"].column_names
if column not in ["utterance", "semantic_parse", "locale"]
],
)
# ---- Merge every uniform-schema dataset into one DatasetDict and publish ----
# Start from MTOP (its splits are already suffixed per language) and attach
# each other dataset's splits under dataset-suffixed names.
uniform_dataset = uniform_mtop
for split in uniform_cstop:
    uniform_dataset[split + "_cstop"] = uniform_cstop[split]
# BUG FIX: this loop previously copied from uniform_hinglish_top_en, which
# both mislabeled the data and raised KeyError ("eval" is a top_v2 split
# name, not a hinglish_top one).
for split in uniform_top_v2:
    uniform_dataset[split + "_top_v2"] = uniform_top_v2[split]
for split in uniform_hinglish_top:
    uniform_dataset[split + "_hinglish_top"] = uniform_hinglish_top[split]
for split in uniform_cstop_a:
    uniform_dataset[split + "_cstop_artificial"] = uniform_cstop_a[split]
# NOTE(review): uniform_hinglish_top_en is built above but never merged or
# pushed — confirm whether the English-side view should be included here.
uniform_dataset.push_to_hub("WillHeld/uniform_top")