add mistral+vllm example for llm-leaderboard
tianweidut committed Jan 31, 2024
1 parent 411c05d commit 9b16130
Showing 6 changed files with 252 additions and 62 deletions.
2 changes: 2 additions & 0 deletions example/llm-leaderboard/leaderboard.md
@@ -15,6 +15,8 @@ Current supported LLMs:
 - chatglm 6b
 - chatglm2 6b
 - aquila 7b/7b-chat
+- mistral-7b-instruct
+- mistral-8*7b-instruct
 
 ## Build Starwhale Runtime
 
26 changes: 23 additions & 3 deletions example/llm-leaderboard/src/benchmark/cmmlu.py
@@ -1,9 +1,9 @@
 from __future__ import annotations
 
 import re
+import random
 import typing as t
 
 from starwhale import dataset
 from starwhale.utils.debug import console
 from starwhale.base.uri.resource import Resource

@@ -79,11 +79,27 @@ def generate_samples_prompt(
         if few_shot <= 0:
             return ""
 
-        ds = dataset(dataset_uri)
+        # simplify samples with some fixed questions
+        samples_features = [
+            {"question": "病毒体核心的主要物质是", "a": "类脂", "b": "核酸", "c": "蛋白质", "d": "磷酸", "answer": "B"},
+            {"question": "流行病学属于什么范畴", "a": "临床医学", "b": "生物医学", "c": "基础医学", "d": "预防医学", "answer": "D"},
+            {"question": "下列选项中,属于处分行为的是", "a": "捐助行为", "b": "抛弃所有权的行为", "c": "签订货物买卖合同", "d": "委托行为", "answer": "B"},
+            {"question": "对累犯从重处罚的刑罚制度,体现了我国刑法的", "a": "罪刑法定原则", "b": "惩罚与教育相结合原则", "c": "刑法适用平等原则", "d": "罪责刑相适应原则", "answer": "D"},
+            {"question": "犯罪分子具有刑法规定的减轻处罚情节的,应当在()判处刑罚。", "a": "法定刑幅度内按照最低刑", "b": "法定最高刑以下", "c": "法定刑以下", "d": "法定刑以内", "answer": "C"},
+            {"question": "下列短语中,是定中短语的是", "a": "打扫干净", "b": "操作方法", "c": "张华同学", "d": "已经完成", "answer": "B"},
+            {"question": "在下面重叠的例子中,表示“适度、适中”意义的是", "a": "白白的", "b": "坐坐", "c": "客客气气的", "d": "散散步", "answer": "A"},
+            {"question": "“员、祖、乡、分、妊、严”中包含的自由语素是", "a": "乡、分、严", "b": "祖、分、严", "c": "祖、乡、分", "d": "员、分、妊", "answer": "A"},
+            {"question": "必然王国和自由王国是社会发展的", "a": "两条不同的道路", "b": "两种不同的理想", "c": "两种不同的状态", "d": "两种不同的选择", "answer": "C"},
+            {"question": "在垄断资本主义阶段占统治地位的资本是", "a": "工业资本", "b": "金融资本", "c": "农业资本", "d": "银行资本", "answer": "B"},
+        ]
+
+        random.shuffle(samples_features)
         samples = []
         total = 0
+        idx = 0
         for i in range(0, few_shot):
-            features = ds[f"{subject}/dev/{i}"].features
+            features = samples_features[idx]
+            idx = (idx + 1) % len(samples_features)
             question = self.generate_question(features, include_answer=True)
             total += len_tokens(question)
             if total > max_length:
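
The hunk above replaces per-subject dev-split dataset reads with a fixed, pre-shuffled question pool. A standalone sketch of the same rotate-through-a-pool pattern (the pool contents and function name are illustrative, not from the commit):

    import random

    # hypothetical miniature pool standing in for samples_features
    pool = [
        {"question": "q1", "answer": "A"},
        {"question": "q2", "answer": "B"},
        {"question": "q3", "answer": "C"},
    ]

    def pick_few_shot(few_shot: int) -> list:
        # shuffle once, then rotate with modular indexing so few_shot
        # may exceed the pool size without an IndexError
        random.shuffle(pool)
        picked, idx = [], 0
        for _ in range(few_shot):
            picked.append(pool[idx])
            idx = (idx + 1) % len(pool)
        return picked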
@@ -121,6 +137,10 @@ def _ingest_choice(self, content: str) -> str:
             if match:
                 return match.group(index)
 
+        m = re.findall(r"[ABCD]", content)
+        if len(m) >= 1:
+            return m[0]
+
         raise ValueError(f"cannot ingest ABCD choice from {content}")
 
     def calculate_score(
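
The new fallback in _ingest_choice returns the first bare A/B/C/D when the stricter regex patterns above it fail to match. A quick illustration of that behavior (the sample outputs are mine; note the heuristic can also pick up a capital letter inside an unrelated word):

    import re

    def first_choice(content: str) -> str:
        # fallback: take the first standalone A/B/C/D in the model output
        m = re.findall(r"[ABCD]", content)
        if len(m) >= 1:
            return m[0]
        raise ValueError(f"cannot ingest ABCD choice from {content}")

    assert first_choice("The answer is B.") == "B"
    assert first_choice("C, because the virion core is nucleic acid") == "C"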
119 changes: 84 additions & 35 deletions example/llm-leaderboard/src/evaluation.py
@@ -3,11 +3,12 @@
 import os
 import typing as t
 import threading
+import dataclasses
 from collections import defaultdict
 
 import numpy
 
-from starwhale import evaluation
+from starwhale import argument, evaluation
 from starwhale.utils.debug import console
 
 try:
@@ -30,27 +31,48 @@
 _g_llm = None
 _g_benchmarks: t.Dict[str, BenchmarkBase] = {}
 
-max_prompt_length = int(os.environ.get("MAX_PROMPT_LENGTH", 2048))
-max_new_tokens = int(os.environ.get("MAX_NEW_TOKENS", 256))
+
+@dataclasses.dataclass
+class ModelGenerateArguments:
+    max_prompt_length: int = dataclasses.field(
+        default=2048, metadata={"help": "max length of prompt"}
+    )
+    max_new_tokens: int = dataclasses.field(
+        default=256, metadata={"help": "max length of generated text"}
+    )
+    batch: int = dataclasses.field(
+        default=1, metadata={"help": "batch size for inference"}
+    )
+    temperature: float = dataclasses.field(
+        default=0.8, metadata={"help": "temperature"}
+    )
+    top_p: float = dataclasses.field(default=0.95, metadata={"help": "top p"})
+    tensor_parallel: int = dataclasses.field(
+        default=1, metadata={"help": "tensor parallel for vllm"}
+    )
+    max_model_len: int = dataclasses.field(
+        default=16384, metadata={"help": "max model len for vllm kv cache"}
+    )
 
 
 # TODO: support multi-gpus evaluation
 # TODO: enhance selected features
+@argument(ModelGenerateArguments)
 @evaluation.predict(
     resources={"nvidia.com/gpu": 1},
     replicas=1,
+    batch_size=32,
     auto_log=False,
 )
-def predict_question(data: dict, external: dict) -> None:
-    # dev split is used for few shot samples
-    if data.get("_hf_split", "") == "dev":
-        return
-
+def predict_question(
+    data: t.List[dict], external: dict, argument: ModelGenerateArguments
+) -> None:
     # TODO: record cpu/gpu/memory info per predict pod
     global _g_llm
 
     with threading.Lock():
         if _g_llm is None:
-            _g_llm = get_built_llm()
+            _g_llm = get_built_llm(
+                tensor_parallel=argument.tensor_parallel,
+                max_model_len=argument.max_model_len,
+            )
 
     global _g_benchmarks
     dataset_uri = external["dataset_uri"]
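
With the two MAX_* environment variables folded into ModelGenerateArguments, every generation knob travels as one typed object handed to the predict function by the @argument decorator. A minimal sketch of working with the dataclass directly (how Starwhale maps its fields to CLI flags is not shown in this diff, so only direct construction is illustrated):

    # defaults come from the dataclasses.field declarations above
    args = ModelGenerateArguments(batch=8, tensor_parallel=2)
    assert args.max_new_tokens == 256   # declared default
    assert args.max_model_len == 16384  # sized for vllm's kv cache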
@@ -59,34 +81,61 @@ def predict_question(data: dict, external: dict) -> None:
         # TODO: use dataset_info to get benchmark
         _g_benchmarks[dataset_name] = get_benchmark(dataset_name)
 
-    result = {}
     benchmark = _g_benchmarks[dataset_name]()
-    for shot, show_name in few_shot_choices.items():
-        prompt = benchmark.generate_prompt(
-            data,
-            few_shot=shot,
-            dataset_uri=dataset_uri,
-            max_length=max_prompt_length,
-            len_tokens=_g_llm.calculate_tokens_length,
-        )
-        predict_result = _g_llm.do_predict(
-            prompt,
-            benchmark_type=benchmark.get_type(),
-            max_new_tokens=max_new_tokens,
-            predict_choice_by_logits=True,
-        )
+
+    inputs = []
+    for _index, _data in zip(external["index"], data):
+        # dev split is used for few shot samples
+        if _data.get("_hf_split", "") == "dev":
+            continue
+
+        for _shot, _show_name in few_shot_choices.items():
+            _prompt = benchmark.generate_prompt(
+                _data,
+                few_shot=_shot,
+                dataset_uri=dataset_uri,
+                max_length=argument.max_prompt_length,
+                len_tokens=_g_llm.calculate_tokens_length,
+            )
+            inputs.append((_index, _show_name, _data, _prompt))
+
+    predict_results = []
+    for idx in range(0, len(inputs), argument.batch):
+        batch_prompts = [x[-1] for x in inputs[idx : idx + argument.batch]]
+
+        if _g_llm.support_batch_inference():
+            _results = _g_llm.do_predict(
+                batch_prompts,
+                benchmark_type=benchmark.get_type(),
+                max_new_tokens=argument.max_new_tokens,
+                predict_choice_by_logits=True,
+            )
+            predict_results.extend(_results)
+        else:
+            for _prompt in batch_prompts:
+                _result = _g_llm.do_predict(
+                    _prompt,
+                    benchmark_type=benchmark.get_type(),
+                    max_new_tokens=argument.max_new_tokens,
+                    predict_choice_by_logits=True,
+                )
+                predict_results.append(_result)
+
+    for (_index, _show_name, _data, _prompt), predict_result in zip(
+        inputs, predict_results
+    ):
+        score = benchmark.calculate_score(predict_result, _data)
+        console.trace(f"prompt:\n {_prompt}")
+        console.trace(f"answer: {_data['answer']}, predict: {score}")
+
+        evaluation.log(
+            category="results",
+            id=f"{benchmark.get_name()}-{_index}",
+            metrics={
+                "input": benchmark.make_input_features_display(_data),
+                "output": {_show_name: score},
+            },
+        )
-        result[show_name] = benchmark.calculate_score(predict_result, data)
-        console.trace(f"prompt:\n {prompt}")
-        console.trace(f"answer: {data['answer']}, predict: {result[show_name]}")
-
-    evaluation.log(
-        category="results",
-        id=f"{benchmark.get_name()}-{external['index']}",
-        metrics={
-            "input": benchmark.make_input_features_display(data),
-            "output": result,
-        },
-    )
 
 
 @evaluation.evaluate(needs=[predict_question], use_predict_auto_log=False)
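
The rewritten predict_question first flattens every (index, shot, data, prompt) combination into inputs, then walks that list in argument.batch-sized slices, falling back to one-by-one calls when the backend lacks batch inference. A stripped-down sketch of the chunk-then-zip flow (generic names; predict stands in for _g_llm.do_predict):

    import typing as t

    def run_batched(
        prompts: t.List[str],
        predict: t.Callable[[t.List[str]], t.List[str]],
        batch: int,
    ) -> t.List[str]:
        # one backend call per slice; results stay in prompt order,
        # so they can be zipped back onto the original inputs
        results: t.List[str] = []
        for i in range(0, len(prompts), batch):
            results.extend(predict(prompts[i : i + batch]))
        return results

    # usage: echo-predictor, batch of 2
    assert run_batched(["p1", "p2", "p3"], lambda xs: xs, 2) == ["p1", "p2", "p3"]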
2 changes: 1 addition & 1 deletion example/llm-leaderboard/src/llm/__init__.py
@@ -1,4 +1,4 @@
-from . import qwen, llama, tiger, aquila, xverse, chatglm, baichuan  # noqa: F401
+from . import qwen, llama, tiger, aquila, xverse, chatglm, baichuan, mistral  # noqa: F401
 from .base import get_llm, get_built_llm, get_supported_llm
 
 __all__ = ["get_llm", "get_supported_llm", "get_built_llm"]
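
Adding mistral to the package-level import is what makes the new backend discoverable: importing the module runs its registration side effects, after which get_llm can look it up by name. A generic sketch of that import-time registry pattern (illustrative only; the actual decorator and names live in llm/base.py, which this page does not show):

    import typing as t

    _REGISTRY: t.Dict[str, type] = {}

    def register(name: str):
        # executed when the defining module is imported
        def _deco(cls: type) -> type:
            _REGISTRY[name] = cls
            return cls
        return _deco

    @register("mistral-7b-instruct")
    class MistralLLM:
        ...

    def get_llm(name: str) -> t.Any:
        return _REGISTRY[name]()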