Skip to content

Commit

Permalink
fix bugs
Browse files Browse the repository at this point in the history
  • Loading branch information
v-leiyuxuan authored and Leavingseason committed Jul 13, 2024
1 parent db4ce32 commit cc3611c
Show file tree
Hide file tree
Showing 8 changed files with 85 additions and 38 deletions.
11 changes: 10 additions & 1 deletion RecLM-emb/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,17 @@ pip install -r requirements.txt
```

### Set OpenAI API Environment
If you want to use OpenAI API, you need to firstly run the following scripts in your console. Currently we only support the Azure OpenAI API.
If you want to use the OpenAI API, you first need to run the following commands in your console. If you are not using the Azure OpenAI API (i.e. OPENAI_API_TYPE is not "azure"), you only need to specify OPENAI_API_KEY and MODEL.

```bash
export OPENAI_API_KEY=xxx;
export OPENAI_API_BASE=https://xxx.openai.azure.com/;
export OPENAI_API_VERSION=2023-03-15-preview;
export OPENAI_API_TYPE=azure;
export MODEL=xxx;
```

We also support AzureCliCredential login:
```bash
az login
export OPENAI_API_BASE=https://xxx.openai.azure.com/;
Expand Down
8 changes: 4 additions & 4 deletions RecLM-emb/preprocess/data_process_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def parse_args():


def gen_conv(args, itemid2title, itemid2features):
max_sample_num = 18000
max_sample_num = 10000
with open(args.in_seq_data, 'r') as rd:
all_samples = rd.readlines()
if len(all_samples) > max_sample_num:
Expand Down Expand Up @@ -111,7 +111,7 @@ def gen_conv(args, itemid2title, itemid2features):


def gen_summary(args, itemid2title):
max_sample_num = 30000
max_sample_num = 10000
with open(args.in_seq_data, 'r') as rd:
all_samples = rd.readlines()
if len(all_samples) > max_sample_num:
Expand Down Expand Up @@ -167,7 +167,7 @@ def gen_query(args, itemid2title, itemid2features):
for id, title in tqdm(enumerate(itemid2title), desc='gen_query', total=len(itemid2title)):
if id==0:
continue
for _ in range(5):
for _ in range(1):
target_info = {'title': title[1]}
features = itemid2features[id] if 'description: ' not in itemid2features[id][-1][0] else itemid2features[id][:-1]

Expand Down Expand Up @@ -208,7 +208,7 @@ def gen_neg_query(args, itemid2title, itemid2features):
for id, title in tqdm(enumerate(itemid2title), desc='gen_neg_query', total=len(itemid2title)):
if id==0:
continue
for _ in range(5):
for _ in range(1):
target_info = {'title': title[1]}
features = []
for feature in itemid2features[id]:
Expand Down
46 changes: 31 additions & 15 deletions RecLM-emb/preprocess/gpt_api/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,28 +13,44 @@
from openai import OpenAI, AzureOpenAI
from azure.identity import DefaultAzureCredential, get_bearer_token_provider, AzureCliCredential

credential = AzureCliCredential()

token_provider = get_bearer_token_provider(
credential,
"https://cognitiveservices.azure.com/.default"
)

client = AzureOpenAI(
azure_endpoint=os.environ.get('OPENAI_API_BASE'),
azure_ad_token_provider=token_provider,
api_version=os.environ.get('OPENAI_API_VERSION'),
max_retries=5,
)
# Resolve OpenAI configuration from the environment. `or None` collapses
# both unset and empty-string values to None, so an empty export does not
# get passed through to the client constructors.
api_key = os.environ.get('OPENAI_API_KEY') or None
api_base = os.environ.get('OPENAI_API_BASE') or None
api_type = os.environ.get('OPENAI_API_TYPE') or None
api_version = os.environ.get('OPENAI_API_VERSION') or None

if api_key:
    # Explicit API-key authentication.
    if api_type == "azure":
        client = AzureOpenAI(
            api_key=api_key,
            api_version=api_version,
            azure_endpoint=api_base,
            max_retries=5,  # consistent with the token-provider client below
        )
    else:
        # Non-Azure endpoint: only OPENAI_API_KEY (and MODEL) are required.
        client = OpenAI(
            api_key=api_key,
            max_retries=5,
        )
else:
    # No API key supplied: fall back to Azure AD authentication via the
    # Azure CLI login (`az login`).
    credential = AzureCliCredential()

    token_provider = get_bearer_token_provider(
        credential,
        "https://cognitiveservices.azure.com/.default",
    )

    client = AzureOpenAI(
        azure_endpoint=api_base,
        azure_ad_token_provider=token_provider,
        api_version=api_version,
        max_retries=5,
    )

# Deployment/model name used for chat completions (e.g. "gpt-4", "gpt-35-turbo").
MODEL = os.environ.get('MODEL')

# Pick the tokenizer matching the model family. Guard against MODEL being
# unset (the original `MODEL.startswith(...)` raises AttributeError on None);
# default to the gpt-4 encoding in that case.
if MODEL and MODEL.startswith("gpt-3"):
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")  # gpt-3.5-turbo gpt-4 gpt-4-0314
else:
    encoding = tiktoken.encoding_for_model("gpt-4")



def call_chatgpt(prompt):
max_retry_cnt = 5
result = "NULL"
Expand Down
1 change: 1 addition & 0 deletions RecLM-emb/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,4 @@ multiprocess==0.70.16
flash-attn==2.5.9.post1
gradio==4.29.0
azure-identity==1.17.1
tiktoken==0.7.0
4 changes: 2 additions & 2 deletions RecLM-emb/shell/run_multi_node.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ if [ "$MODEl_NAME_OR_PATH" = "meta-llama/Llama-2-7b-hf" ]; then
--data_cache_dir $HOME/.cache/hf_data \
--output_dir $OUTPUT_DIR \
--model_name_or_path $MODEl_NAME_OR_PATH \
--train_data $DATA_DIR/misspell2item.jsonl,$DATA_DIR/negquery2item.jsonl,$DATA_DIR/relativequery2item.jsonl,$DATA_DIR/title2item.jsonl,$DATA_DIR/vaguequery2item.jsonl,$DATA_DIR/gpt4_data.jsonl,$DATA_DIR/item2item.jsonl,$DATA_DIR/query2item.jsonl,$DATA_DIR/queryuser2item.jsonl,$DATA_DIR/user2item.jsonl \
--train_data $DATA_DIR/gpt4_data_v2.jsonl,$DATA_DIR/misspell2item.jsonl,$DATA_DIR/negquery2item.jsonl,$DATA_DIR/relativequery2item.jsonl,$DATA_DIR/title2item.jsonl,$DATA_DIR/vaguequery2item.jsonl,$DATA_DIR/gpt4_data.jsonl,$DATA_DIR/item2item.jsonl,$DATA_DIR/query2item.jsonl,$DATA_DIR/queryuser2item.jsonl,$DATA_DIR/user2item.jsonl \
--learning_rate 3e-5 \
--num_train_epochs 3 \
--per_device_train_batch_size 2 \
Expand All @@ -64,7 +64,7 @@ else
--data_cache_dir $HOME/.cache/hf_data \
--output_dir $OUTPUT_DIR \
--model_name_or_path $MODEl_NAME_OR_PATH \
--train_data $DATA_DIR/misspell2item.jsonl,$DATA_DIR/negquery2item.jsonl,$DATA_DIR/relativequery2item.jsonl,$DATA_DIR/title2item.jsonl,$DATA_DIR/vaguequery2item.jsonl,$DATA_DIR/gpt4_data.jsonl,$DATA_DIR/item2item.jsonl,$DATA_DIR/query2item.jsonl,$DATA_DIR/queryuser2item.jsonl,$DATA_DIR/user2item.jsonl \
--train_data $DATA_DIR/gpt4_data_v2.jsonl,$DATA_DIR/misspell2item.jsonl,$DATA_DIR/negquery2item.jsonl,$DATA_DIR/relativequery2item.jsonl,$DATA_DIR/title2item.jsonl,$DATA_DIR/vaguequery2item.jsonl,$DATA_DIR/gpt4_data.jsonl,$DATA_DIR/item2item.jsonl,$DATA_DIR/query2item.jsonl,$DATA_DIR/queryuser2item.jsonl,$DATA_DIR/user2item.jsonl \
--learning_rate 3e-5 \
--num_train_epochs 3 \
--per_device_train_batch_size 2 \
Expand Down
4 changes: 2 additions & 2 deletions RecLM-emb/shell/run_single_node.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ if [ "$MODEl_NAME_OR_PATH" = "meta-llama/Llama-2-7b-hf" ]; then
--data_cache_dir $HOME/.cache/hf_data \
--output_dir $OUTPUT_DIR \
--model_name_or_path $MODEl_NAME_OR_PATH \
--train_data $DATA_DIR/misspell2item.jsonl,$DATA_DIR/negquery2item.jsonl,$DATA_DIR/relativequery2item.jsonl,$DATA_DIR/title2item.jsonl,$DATA_DIR/vaguequery2item.jsonl,$DATA_DIR/gpt4_data.jsonl,$DATA_DIR/item2item.jsonl,$DATA_DIR/query2item.jsonl,$DATA_DIR/queryuser2item.jsonl,$DATA_DIR/user2item.jsonl \
--train_data $DATA_DIR/gpt4_data_v2.jsonl,$DATA_DIR/misspell2item.jsonl,$DATA_DIR/negquery2item.jsonl,$DATA_DIR/relativequery2item.jsonl,$DATA_DIR/title2item.jsonl,$DATA_DIR/vaguequery2item.jsonl,$DATA_DIR/gpt4_data.jsonl,$DATA_DIR/item2item.jsonl,$DATA_DIR/query2item.jsonl,$DATA_DIR/queryuser2item.jsonl,$DATA_DIR/user2item.jsonl \
--learning_rate 3e-5 \
--num_train_epochs 3 \
--per_device_train_batch_size 2 \
Expand Down Expand Up @@ -65,7 +65,7 @@ else
--data_cache_dir $HOME/.cache/hf_data \
--output_dir $OUTPUT_DIR \
--model_name_or_path $MODEl_NAME_OR_PATH \
--train_data $DATA_DIR/misspell2item.jsonl,$DATA_DIR/negquery2item.jsonl,$DATA_DIR/relativequery2item.jsonl,$DATA_DIR/title2item.jsonl,$DATA_DIR/vaguequery2item.jsonl,$DATA_DIR/gpt4_data.jsonl,$DATA_DIR/item2item.jsonl,$DATA_DIR/query2item.jsonl,$DATA_DIR/queryuser2item.jsonl,$DATA_DIR/user2item.jsonl \
--train_data $DATA_DIR/gpt4_data_v2.jsonl,$DATA_DIR/misspell2item.jsonl,$DATA_DIR/negquery2item.jsonl,$DATA_DIR/relativequery2item.jsonl,$DATA_DIR/title2item.jsonl,$DATA_DIR/vaguequery2item.jsonl,$DATA_DIR/gpt4_data.jsonl,$DATA_DIR/item2item.jsonl,$DATA_DIR/query2item.jsonl,$DATA_DIR/queryuser2item.jsonl,$DATA_DIR/user2item.jsonl \
--learning_rate 3e-5 \
--num_train_epochs 3 \
--per_device_train_batch_size 2 \
Expand Down
6 changes: 5 additions & 1 deletion RecLM-emb/src/arguments.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,11 @@ class ModelArguments:
default=None, metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
)
attn_implementation: Optional[str] = field(
default="eager", metadata={"help": "The attention implementation to use: 'eager', 'sdpa', 'flash_attention_2'"}, choices=["eager", "sdpa", "flash_attention_2"]
default="eager",
metadata={
"help": "The attention implementation to use: 'eager', 'sdpa', 'flash_attention_2'",
"choices": ["eager", "sdpa", "flash_attention_2"],
},
)
torch_dtype: Optional[str] = field(
default=None,
Expand Down
43 changes: 30 additions & 13 deletions RecLM-emb/src/openai_model_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,19 +20,36 @@
MAX_RETRIES = 5
INTERVAL = 5

credential = AzureCliCredential()

token_provider = get_bearer_token_provider(
credential,
"https://cognitiveservices.azure.com/.default"
)

client = AzureOpenAI(
azure_endpoint=os.environ.get('OPENAI_API_BASE'),
azure_ad_token_provider=token_provider,
api_version=os.environ.get('OPENAI_API_VERSION'),
max_retries=MAX_RETRIES,
)
# Resolve OpenAI configuration from the environment. `or None` collapses
# both unset and empty-string values to None, so an empty export does not
# get passed through to the client constructors.
api_key = os.environ.get('OPENAI_API_KEY') or None
api_base = os.environ.get('OPENAI_API_BASE') or None
api_type = os.environ.get('OPENAI_API_TYPE') or None
api_version = os.environ.get('OPENAI_API_VERSION') or None

if api_key:
    # Explicit API-key authentication.
    if api_type == "azure":
        client = AzureOpenAI(
            api_key=api_key,
            api_version=api_version,
            azure_endpoint=api_base,
            max_retries=MAX_RETRIES,  # consistent with the token-provider client below
        )
    else:
        # Non-Azure endpoint: only OPENAI_API_KEY (and MODEL) are required.
        client = OpenAI(
            api_key=api_key,
            max_retries=MAX_RETRIES,
        )
else:
    # No API key supplied: fall back to Azure AD authentication via the
    # Azure CLI login (`az login`).
    credential = AzureCliCredential()

    token_provider = get_bearer_token_provider(
        credential,
        "https://cognitiveservices.azure.com/.default",
    )

    client = AzureOpenAI(
        azure_endpoint=api_base,
        azure_ad_token_provider=token_provider,
        api_version=api_version,
        max_retries=MAX_RETRIES,
    )

def call_openai_embedding(model, text):
for i in range(MAX_RETRIES):
Expand Down

0 comments on commit cc3611c

Please sign in to comment.