# arguments.py (forked from boostcampaitech2/mrc-level2-nlp-09)
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        default="klue/roberta-large",
        metadata={
            "help": "Path to pretrained model or model identifier from huggingface.co/models"
        },
    )
    config_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "Pretrained config name or path, if not the same as model_name."
        },
    )
    tokenizer_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "Pretrained tokenizer name or path, if not the same as model_name."
        },
    )

@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.
    """

    dataset_name: Optional[str] = field(
        default="../data/train_dataset",
        metadata={"help": "The name of the dataset to use."},
    )
    overwrite_cache: bool = field(
        default=False,
        metadata={"help": "Overwrite the cached training and evaluation sets."},
    )
    preprocessing_num_workers: Optional[int] = field(
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
    max_seq_length: int = field(
        default=384,
        metadata={
            "help": "The maximum total input sequence length after tokenization. Sequences longer "
            "than this will be truncated, and sequences shorter will be padded."
        },
    )
    pad_to_max_length: bool = field(
        default=False,
        metadata={
            "help": "Whether to pad all samples to `max_seq_length`. "
            "If False, samples are padded dynamically when batching, to the maximum length in the batch "
            "(which can be faster on GPU but slower on TPU)."
        },
    )
    doc_stride: int = field(
        default=128,
        metadata={
            "help": "When splitting up a long document into chunks, how much stride to take between chunks."
        },
    )
    max_answer_length: int = field(
        default=30,
        metadata={
            "help": "The maximum length of an answer that can be generated. This is needed because the start "
            "and end predictions are not conditioned on one another."
        },
    )
    eval_retrieval: bool = field(
        default=True,
        metadata={"help": "Whether to run passage retrieval using sparse embeddings."},
    )
    num_clusters: int = field(
        default=64, metadata={"help": "How many clusters to use for the FAISS index."}
    )
    top_k_retrieval: int = field(
        default=20,
        metadata={
            "help": "How many top-k passages to retrieve, ranked by similarity."
        },
    )
    ng_top_k_retrieval: int = field(
        default=5,
        metadata={
            "help": "How many top-k passages to include as negative samples."
        },
    )
    use_faiss: bool = field(
        default=False, metadata={"help": "Whether to build the index with FAISS."}
    )
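

# Usage sketch (not part of the original file): argument dataclasses like these
# are typically consumed via Hugging Face's HfArgumentParser. The hypothetical
# snippet below shows how a training script might parse them alongside
# transformers.TrainingArguments; the original repo's train script may differ.
if __name__ == "__main__":
    from transformers import HfArgumentParser, TrainingArguments

    # Parse CLI flags (e.g. --model_name_or_path, --max_seq_length 512,
    # --output_dir ./out) into the three dataclasses in one pass.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments)
    )
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
    print(model_args.model_name_or_path, data_args.top_k_retrieval)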