# test_robust1.py
# Forked from yyDing1/GNER.
import unittest
import json
from collections import defaultdict
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import nltk
from evaluate import extract_predictions, parser
# Download required NLTK tokenizer data (runs when this module is imported).
for _nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(_nltk_resource)
class TestGNERModel(unittest.TestCase):
    """Robustness checks for GNER-LLaMA-7B BIO tagging on varied inputs.

    Every test tokenizes a sentence with ``nltk.word_tokenize``, wraps the
    tokens in the GNER instruction prompt, generates a response with the
    model, and compares the labels returned by ``extract_predictions`` with a
    hand-written expectation.

    NOTE(review): the three sentence-based tests expect 9 labels. If
    ``nltk.word_tokenize`` emits the trailing "?" as a separate token and
    ``extract_predictions`` returns one label per word, the lengths will not
    match -- confirm against the tokenizer output before trusting failures.
    """

    # Hugging Face model id shared by the tokenizer and the model.
    _MODEL_NAME = "dyyyyyyyy/GNER-LLaMA-7B"
    # Marker that closes the instruction part of the prompt.
    _PROMPT_END = "[/INST]"

    @classmethod
    def setUpClass(cls):
        """Load the tokenizer and model once and define the shared prompt pieces."""
        # Load model and tokenizer
        cls.tokenizer = AutoTokenizer.from_pretrained(cls._MODEL_NAME)
        cls.model = AutoModelForCausalLM.from_pretrained(cls._MODEL_NAME, torch_dtype=torch.bfloat16)
        cls.entity_labels = [
            "genre", "rating", "review", "plot", "song", "average ratings",
            "director", "character", "trailer", "year", "actor", "title",
        ]
        cls.instruction_template = (
            "Please analyze the sentence provided, identifying the type of entity for each word on a token-by-token basis.\n"
            "Output format is: word_1(label_1), word_2(label_2), ...\n"
            "We'll use the BIO-format to label the entities, where:\n"
            "1. B- (Begin) indicates the start of a named entity.\n"
            "2. I- (Inside) is used for words within a named entity but are not the first word.\n"
            "3. O (Outside) denotes words that are not part of a named entity.\n"
        )

    @staticmethod
    def _strip_prompt(decoded):
        """Return the text following the first ``[/INST]`` marker in ``decoded``.

        If the marker is absent, the whole decoded text is returned (stripped)
        instead of slicing at an offset derived from ``str.find`` returning -1,
        which would silently drop the first characters of the output.
        """
        _, marker, completion = decoded.partition(TestGNERModel._PROMPT_END)
        if not marker:
            return decoded.strip()
        return completion.strip()

    def generate_model_output(self, sentence, instruction):
        """Generate the model's response to ``instruction``.

        Args:
            sentence: The raw input sentence. Not used by this method (the
                tokenized sentence is already embedded in ``instruction``);
                kept so existing callers keep working.
            instruction: Text placed between ``[INST]`` and ``[/INST]``.

        Returns:
            The decoded output after the ``[/INST]`` marker, stripped of
            surrounding whitespace; the full decoded text if the marker is
            missing.
        """
        input_texts = f"[INST] {instruction} [/INST]"
        inputs = self.tokenizer(input_texts, return_tensors="pt")
        outputs = self.model.generate(**inputs, max_new_tokens=640)
        response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return self._strip_prompt(response)

    def _predict_bio(self, sentence):
        """Tokenize ``sentence``, query the model, and return the extracted labels."""
        words = nltk.word_tokenize(sentence)
        instruction = (
            f"{self.instruction_template}\nUse the specific entity tags: {', '.join(self.entity_labels)} and O.\n"
            f"Sentence: {' '.join(words)}"
        )
        # Generate model response
        response = self.generate_model_output(sentence, instruction)
        # Bio-format prediction extraction
        return extract_predictions({
            "label_list": self.entity_labels,
            "instance": {"words": words},
            "prediction": response,
        })

    def test_bio_output_english(self):
        """Check the predicted labels for an English sentence."""
        sentence = "Did George Clooney make a musical in the 1980s?"
        expected_bio_format = ['O', 'B-actor', 'I-actor', 'O', 'O', 'B-genre', 'O', 'O', 'B-year']
        bio_predictions = self._predict_bio(sentence)
        self.assertEqual(bio_predictions, expected_bio_format, "BIO-format output does not match expected format for English input.")

    def test_bio_output_spanish(self):
        """Check the predicted labels for a Spanish sentence."""
        sentence = "¿George Clooney hizo un musical en los años 1980?"
        expected_bio_format = ['B-actor', 'I-actor', 'O', 'O', 'B-genre', 'O', 'O', 'O', 'B-year']
        bio_predictions = self._predict_bio(sentence)
        self.assertEqual(bio_predictions, expected_bio_format, "BIO-format output does not match expected format for Spanish input.")

    def test_bio_output_specialChars(self):
        """Check that an input made only of special characters is labelled all 'O'."""
        sentence = "!*! @^& *( *( $%$"
        expected_bio_format = ['O', 'O', 'O', 'O', 'O']
        bio_predictions = self._predict_bio(sentence)
        self.assertEqual(bio_predictions, expected_bio_format, "BIO-format output does not match expected format for Special Char input.")

    def test_bio_output_bengali(self):
        """Check that a Bengali sentence is labelled all 'O'."""
        sentence = "জর্জ ক্লুনি কি ১৯৮০ সালের দশকে কোনো মিউজিকাল বানিয়েছিলেন?"
        expected_bio_format = ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
        bio_predictions = self._predict_bio(sentence)
        self.assertEqual(bio_predictions, expected_bio_format, "BIO-format output does not match expected format for Bengali input.")
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()