gemma-2-27b-it on together

allenai · Jul 17, 2024 · 75cab09 · 75cab09
1 parent dd82749
commit 75cab09
Show file tree

Hide file tree

Showing 13 changed files with 22,695 additions and 1,777 deletions.
diff --git a/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/[email protected] b/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/[email protected]
diff --git a/leaderboard/data_dir/all_stat_wildbench.-1.json b/leaderboard/data_dir/all_stat_wildbench.-1.json
diff --git a/leaderboard/data_dir/all_stat_wildbench.100.json b/leaderboard/data_dir/all_stat_wildbench.100.json
diff --git a/leaderboard/data_dir/all_stat_wildbench.1000.json b/leaderboard/data_dir/all_stat_wildbench.1000.json
diff --git a/leaderboard/data_dir/all_stat_wildbench.1500.json b/leaderboard/data_dir/all_stat_wildbench.1500.json
diff --git a/leaderboard/data_dir/all_stat_wildbench.2000.json b/leaderboard/data_dir/all_stat_wildbench.2000.json
diff --git a/leaderboard/data_dir/all_stat_wildbench.300.json b/leaderboard/data_dir/all_stat_wildbench.300.json
diff --git a/leaderboard/data_dir/all_stat_wildbench.3000.json b/leaderboard/data_dir/all_stat_wildbench.3000.json
diff --git a/leaderboard/data_dir/all_stat_wildbench.500.json b/leaderboard/data_dir/all_stat_wildbench.500.json
diff --git a/leaderboard/data_dir/score.json b/leaderboard/data_dir/score.json
diff --git a/leaderboard/data_dir/wb_elo_results.json b/leaderboard/data_dir/wb_elo_results.json
diff --git a/scripts/[email protected] b/scripts/[email protected]
@@ -0,0 +1,24 @@
+# export TOGETHER_API_KEY=your_together_api_key
+model_name="google/gemma-2-27b-it@together"
+model_pretty_name="gemma-2-27b-it@together"
+output_dir="result_dirs/wild_bench_v2/"
+TEMP=0; TOP_P=1.0; MAX_TOKENS=4096;
+
+# shard_size should be 1024 // n_shards (here: 1024 // 8 = 128)
+n_shards=8
+shard_size=128
+start_gpu=0
+shards_dir="${output_dir}/tmp_${model_pretty_name}"
+for ((start = 0, end = (($shard_size)), gpu = $start_gpu; gpu < $n_shards+$start_gpu; start += $shard_size, end += $shard_size, gpu++)); do
+    python src/unified_infer.py \
+        --data_name wild_bench \
+        --start_index $start --end_index $end \
+        --engine together \
+        --model_name $model_name \
+        --top_p $TOP_P --temperature $TEMP \
+        --max_tokens $MAX_TOKENS \
+        --output_folder $shards_dir &
+done 
+wait 
+python src/merge_results.py $shards_dir/ $model_pretty_name
+cp $shards_dir/${model_pretty_name}.json $output_dir/${model_pretty_name}.json
diff --git a/src/unified_utils.py b/src/unified_utils.py
@@ -259,6 +259,9 @@ def wrapper(*args, **kwargs):
                             if 'blocked' in err_msg:
                                 print ('blocked output issue!')
                                 return ['Error: this query is blocked by APIs.']
+                            if "`inputs` tokens + `max_new_tokens` must be <=" in err_msg:
+                                print ('Exceeding max tokens issue! (in together.ai)')
+                                return ['']
                                 #raise e
                             print(f"Retrying for the {retried + 1} time..")
                             #if 'output blocked by content filtering policy' in err_msg.lower():
@@ -276,7 +279,7 @@ def wrapper(*args, **kwargs):
                                 return ['']
                             if '504 Gateway Time-out' in err_msg:
                                 print ('Yi issue!')
-                                return ['']
+                                return [''] 
                             print("Retry limit reached. Saving the error message and returning.")
                             print(kwargs["prompt"])
                             raise e
@@ -436,6 +439,13 @@ def together_chat_request(
     if messages is None:
         messages = [{"role":"user","content": prompt}]
     client = Together(api_key=os.environ.get("TOGETHER_API_KEY"))
+    if "gemma-2" in model:
+        max_chars = 6000*4
+        # Estimate the number of tokens by dividing the prompt length by 4 (~4 chars per token), so a 6000-token budget is a 6000*4 character cap.
+        if len(messages[0]["content"]) > max_chars:
+            print("Truncating prompt to 6000 tokens")
+            messages[0]["content"] = messages[0]["content"][:max_chars] + "... (truncated)"
+
     response = client.chat.completions.create(
         model=model,
         messages=messages,