Merge dev into main #52

Merged
merged 2 commits into main from dev on Jun 19, 2024
56 changes: 26 additions & 30 deletions Knowledge_Plugin/DOKE/call_openai.py
@@ -17,6 +17,8 @@
import queue
from concurrent.futures import ThreadPoolExecutor
import multiprocessing
import openai
from openai import OpenAI

api_keys = [
"your openai keys"
@@ -44,55 +46,49 @@ def generate_Davinci(api_key, text):

def generate_chatgpt(api_key, prompt, version):
# Initialize OpenAI GPT-3 with your API key
openai.api_key = api_key
openai.api_base = "https://api.openai.com/v1"
client = OpenAI(api_key=api_key)
text = [{'role': 'user', 'content': prompt}]
if version == "0301":
model = "gpt-3.5-turbo-0301"
else:
model = "gpt-3.5-turbo"

for i in range(MAX_RETRIES):
try:
# Call the GPT-3 chat model API with a timeout
response = openai.ChatCompletion.create(
response = client.chat.completions.create(
model=model,
messages=text,
temperature=0.0,
request_timeout=30,
max_tokens=2048,
frequency_penalty=0.0,
presence_penalty=0.0
)
content = response['choices'][0]['message']['content']
content = response.choices[0].message.content.strip()
return content
except Exception as e:
print(f"{api_key}\nError occurred: {e}. Retrying...")
time.sleep(INTERVAL) # sleep between retries
time.sleep(INTERVAL)
print(f"Failed to get response for prompt: {prompt} after {MAX_RETRIES} retries.")
return "None"

def generate_gpt4(prompt):
available_configs = [
{"api_key": "your apikey", "url": "deployment url"},
]
message = [{"role": "user", "content": prompt}]
data = {
"messages": message,
"max_tokens": 2048,
"temperature": 0.,
'n': 1,
}

def generate_gpt4(api_key, prompt):
client = OpenAI(api_key=api_key)
text = [{'role': 'user', 'content': prompt}]
for _ in range(MAX_RETRIES):
try:
config = random.choice(available_configs)
headers = {'Content-Type': 'application/json', 'api-key': config["api_key"]}
response = requests.post(config["url"], json=data, headers=headers)
# print(response)
if (response.status_code == 200):
answer = response.json()["choices"][0]["message"]['content'].strip()
return answer
response = client.chat.completions.create(
model="gpt-4",
messages=text,
temperature=0.0,
max_tokens=2048,
frequency_penalty=0.0,
presence_penalty=0.0
)
content = response.choices[0].message.content.strip()
return content
except Exception as e:
print(f"Error occurred: {e}. Retrying...")
time.sleep(30) # sleep between retries

print(f"{api_key}\nError occurred: {e}. Retrying...")
time.sleep(INTERVAL)

print("out of max_retry_times")
return "Error"
@@ -154,7 +150,7 @@ def worker(i, model, version):
index, prompt = prompts_queue.get()
api_key = api_keys[i % len(api_keys)]
if model == "GPT4":
result = generate_gpt4(prompt)
result = generate_gpt4(api_key, prompt)
if model == "ChatGPT":
result = generate_chatgpt(api_key, prompt, version)
elif model == "Davinci":
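The diff above migrates `call_openai.py` from the pre-1.0 `openai` module-level API (`openai.api_key`, `openai.ChatCompletion.create`) to the 1.x client interface, and sends GPT-4 requests through the same client instead of raw `requests.post` calls against a deployment URL. A minimal sketch of the new call pattern with the same retry loop (`MAX_RETRIES`, `INTERVAL`, and the model names come from the diff; the helper name and constant values are illustrative):

```python
import time
from openai import OpenAI

MAX_RETRIES = 3   # illustrative values; the repo defines its own constants
INTERVAL = 10

def call_chat_model(api_key, prompt, model="gpt-3.5-turbo"):
    # SDK >= 1.0: build a client per key instead of setting openai.api_key globally.
    # The old request_timeout argument is gone; a timeout can be passed to OpenAI(...) instead.
    client = OpenAI(api_key=api_key)
    messages = [{"role": "user", "content": prompt}]
    for _ in range(MAX_RETRIES):
        try:
            response = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=0.0,
                max_tokens=2048,
            )
            # Responses are typed objects now, not dicts.
            return response.choices[0].message.content.strip()
        except Exception as e:
            print(f"Error occurred: {e}. Retrying...")
            time.sleep(INTERVAL)
    return "None"
```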
6 changes: 3 additions & 3 deletions Knowledge_Plugin/DOKE/config/ml1m/popneg_empty.json
@@ -1,7 +1,7 @@
{
"sequential_data_path": "../../data/ml1m/sequential_data.txt",
"candidate_data_path": "../../data/ml1m/negative_samples_pop.txt",
"meta_data_path": "../../data/ml1m/metadata.json",
"sequential_data_path": "../data/ml1m/sequential_data.txt",
"candidate_data_path": "../data/ml1m/negative_samples_pop.txt",
"meta_data_path": "../data/ml1m/metadata.json",
"topk": 200,
"max_his_len": 50,
"template": [
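The remaining config changes all follow the same pattern: data paths that pointed at `../../data/...` or `data/...` are normalized to `../data/...`, so every reference resolves to `Knowledge_Plugin/data/` when the DOKE scripts are run from `Knowledge_Plugin/DOKE/`. A rough sketch of how such a config might be read and sanity-checked, using only the standard library (`load_config` is a hypothetical helper, not code from this repo):

```python
import json
import os

def load_config(path):
    # Hypothetical helper: read a DOKE config and warn about missing data files.
    with open(path, "r", encoding="utf-8") as f:
        config = json.load(f)
    for key, value in config.items():
        if key.endswith("_path") and not os.path.exists(value):
            # Paths like "../data/ml1m/metadata.json" are relative to the
            # working directory, assumed here to be Knowledge_Plugin/DOKE/.
            print(f"warning: {key} -> {value} not found from {os.getcwd()}")
    return config

config = load_config("config/ml1m/popneg_empty.json")
print(config["topk"], config["max_his_len"])  # 200, 50
```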
6 changes: 3 additions & 3 deletions Knowledge_Plugin/DOKE/config/ml1m/popneg_feature.json
@@ -1,7 +1,7 @@
{
"sequential_data_path": "../../data/ml1m/sequential_data.txt",
"candidate_data_path": "../../data/ml1m/negative_samples_pop.txt",
"meta_data_path": "../../data/ml1m/metadata.json",
"sequential_data_path": "../data/ml1m/sequential_data.txt",
"candidate_data_path": "../data/ml1m/negative_samples_pop.txt",
"meta_data_path": "../data/ml1m/metadata.json",
"topk": 200,
"max_his_len": 50,
"template": [
8 changes: 4 additions & 4 deletions Knowledge_Plugin/DOKE/config/ml1m/popneg_global_I2I.json
@@ -1,8 +1,8 @@
{
"sequential_data_path": "../../data/ml1m/sequential_data.txt",
"candidate_data_path": "../../data/ml1m/negative_samples_pop.txt",
"meta_data_path": "../../data/ml1m/metadata.json",
"global_cf_data_path": "data/ml1m/global_CF.json",
"sequential_data_path": "../data/ml1m/sequential_data.txt",
"candidate_data_path": "../data/ml1m/negative_samples_pop.txt",
"meta_data_path": "../data/ml1m/metadata.json",
"global_cf_data_path": "../data/ml1m/global_CF.json",
"topk": 200,
"max_his_len": 50,
"template": [
@@ -2,7 +2,7 @@
"sequential_data_path": "../data/ml1m/sequential_data.txt",
"candidate_data_path": "../data/ml1m/negative_samples_pop.txt",
"meta_data_path": "../data/ml1m/metadata.json",
"cf_data_path": "data/ml1m/MF_CF_candidate_pop.json",
"cf_data_path": "../data/ml1m/MF_CF_candidate_pop.json",
"topk": 200,
"max_his_len": 50,
"template": [
8 changes: 4 additions & 4 deletions Knowledge_Plugin/DOKE/config/ml1m/popneg_his-can_I2I.json
@@ -1,8 +1,8 @@
{
"sequential_data_path": "../../data/ml1m/sequential_data.txt",
"candidate_data_path": "../../data/ml1m/negative_samples_pop.txt",
"meta_data_path": "../../data/ml1m/metadata.json",
"cf_data_path": "data/ml1m/MF_CF_candidate_pop.json",
"sequential_data_path": "../data/ml1m/sequential_data.txt",
"candidate_data_path": "../data/ml1m/negative_samples_pop.txt",
"meta_data_path": "../data/ml1m/metadata.json",
"cf_data_path": "../data/ml1m/MF_CF_candidate_pop.json",
"topk": 200,
"max_his_len": 50,
"template": [
@@ -2,7 +2,7 @@
"sequential_data_path": "../data/ml1m/sequential_data.txt",
"candidate_data_path": "../data/ml1m/negative_samples_pop.txt",
"meta_data_path": "../data/ml1m/metadata.json",
"cf_data_path": "data/ml1m/MF_CF_candidate_pop.json",
"cf_data_path": "../data/ml1m/MF_CF_candidate_pop.json",
"topk": 200,
"max_his_len": 50,
"template": [
@@ -2,7 +2,7 @@
"sequential_data_path": "../data/ml1m/sequential_data.txt",
"candidate_data_path": "../data/ml1m/negative_samples_pop.txt",
"meta_data_path": "../data/ml1m/metadata.json",
"cf_data_path": "data/ml1m/MF_CF_candidate_pop.json",
"cf_data_path": "../data/ml1m/MF_CF_candidate_pop.json",
"topk": 200,
"max_his_len": 50,
"template": [
@@ -2,7 +2,7 @@
"sequential_data_path": "../data/ml1m/sequential_data.txt",
"candidate_data_path": "../data/ml1m/negative_samples_pop.txt",
"meta_data_path": "../data/ml1m/metadata.json",
"u2i_data_path": "data/ml1m/SASRec_U2I_candidate_pop.json",
"u2i_data_path": "../data/ml1m/SASRec_U2I_candidate_pop.json",
"topk": 200,
"max_his_len": 50,
"template": [
8 changes: 4 additions & 4 deletions Knowledge_Plugin/DOKE/config/ml1m/popneg_his-can_U2I.json
@@ -1,8 +1,8 @@
{
"sequential_data_path": "../../data/ml1m/sequential_data.txt",
"candidate_data_path": "../../data/ml1m/negative_samples_pop.txt",
"meta_data_path": "../../data/ml1m/metadata.json",
"u2i_data_path": "data/ml1m/SASRec_U2I_candidate_pop.json",
"sequential_data_path": "../data/ml1m/sequential_data.txt",
"candidate_data_path": "../data/ml1m/negative_samples_pop.txt",
"meta_data_path": "../data/ml1m/metadata.json",
"u2i_data_path": "../data/ml1m/SASRec_U2I_candidate_pop.json",
"topk": 200,
"max_his_len": 50,
"template": [
@@ -2,7 +2,7 @@
"sequential_data_path": "../data/ml1m/sequential_data.txt",
"candidate_data_path": "../data/ml1m/negative_samples_pop.txt",
"meta_data_path": "../data/ml1m/metadata.json",
"u2i_data_path": "data/ml1m/SASRec_U2I_candidate_pop.json",
"u2i_data_path": "../data/ml1m/SASRec_U2I_candidate_pop.json",
"topk": 200,
"max_his_len": 50,
"template": [
@@ -2,7 +2,7 @@
"sequential_data_path": "../data/ml1m/sequential_data.txt",
"candidate_data_path": "../data/ml1m/negative_samples_pop.txt",
"meta_data_path": "../data/ml1m/metadata.json",
"u2i_data_path": "data/ml1m/SASRec_U2I_candidate_pop.json",
"u2i_data_path": "../data/ml1m/SASRec_U2I_candidate_pop.json",
"topk": 200,
"max_his_len": 50,
"template": [
@@ -2,7 +2,7 @@
"sequential_data_path": "../data/ml1m/sequential_data.txt",
"candidate_data_path": "../data/ml1m/negative_samples_pop.txt",
"meta_data_path": "../data/ml1m/metadata.json",
"u2i_data_path": "data/ml1m/SASRec_U2I_candidate_pop.json",
"u2i_data_path": "../data/ml1m/SASRec_U2I_candidate_pop.json",
"topk": 200,
"max_his_len": 50,
"template": [
2 changes: 1 addition & 1 deletion Knowledge_Plugin/DOKE/config/ml1m/popneg_his_I2I.json
@@ -2,7 +2,7 @@
"sequential_data_path": "../data/ml1m/sequential_data.txt",
"candidate_data_path": "../data/ml1m/negative_samples_pop.txt",
"meta_data_path": "../data/ml1m/metadata.json",
"cf_data_path": "data/ml1m/normalized_CF.json",
"cf_data_path": "../data/ml1m/normalized_CF.json",
"topk": 200,
"max_his_len": 50,
"template": [
4 changes: 2 additions & 2 deletions Knowledge_Plugin/DOKE/config/ml1m/popneg_his_I2I_path-I.json
@@ -2,8 +2,8 @@
"sequential_data_path": "../data/ml1m/sequential_data.txt",
"candidate_data_path": "../data/ml1m/negative_samples_pop.txt",
"meta_data_path": "../data/ml1m/metadata.json",
"cf_data_path": "data/ml1m/normalized_CF.json",
"reasoning_path_data_path": "data/ml1m/path_text_dict-I.json",
"cf_data_path": "../data/ml1m/normalized_CF.json",
"reasoning_path_data_path": "../data/ml1m/path_text_dict-I.json",
"topk": 200,
"max_his_len": 50,
"template": [
4 changes: 2 additions & 2 deletions Knowledge_Plugin/DOKE/config/ml1m/popneg_his_I2I_path-II.json
@@ -2,8 +2,8 @@
"sequential_data_path": "../data/ml1m/sequential_data.txt",
"candidate_data_path": "../data/ml1m/negative_samples_pop.txt",
"meta_data_path": "../data/ml1m/metadata.json",
"cf_data_path": "data/ml1m/normalized_CF.json",
"reasoning_path_data_path": "data/ml1m/path_text_dict-II.json",
"cf_data_path": "../data/ml1m/normalized_CF.json",
"reasoning_path_data_path": "../data/ml1m/path_text_dict-II.json",
"topk": 200,
"max_his_len": 50,
"template": [
@@ -2,8 +2,8 @@
"sequential_data_path": "../data/ml1m/sequential_data.txt",
"candidate_data_path": "../data/ml1m/negative_samples_pop.txt",
"meta_data_path": "../data/ml1m/metadata.json",
"cf_data_path": "data/ml1m/normalized_CF.json",
"reasoning_path_data_path": "data/ml1m/path_text_dict-III.json",
"cf_data_path": "../data/ml1m/normalized_CF.json",
"reasoning_path_data_path": "../data/ml1m/path_text_dict-III.json",
"topk": 200,
"max_his_len": 50,
"template": [
4 changes: 2 additions & 2 deletions Knowledge_Plugin/DOKE/config/ml1m/popneg_his_I2I_path.json
@@ -2,8 +2,8 @@
"sequential_data_path": "../data/ml1m/sequential_data.txt",
"candidate_data_path": "../data/ml1m/negative_samples_pop.txt",
"meta_data_path": "../data/ml1m/metadata.json",
"cf_data_path": "data/ml1m/normalized_CF.json",
"reasoning_path_data_path": "data/ml1m/path_text_dict.json",
"cf_data_path": "../data/ml1m/normalized_CF.json",
"reasoning_path_data_path": "../data/ml1m/path_text_dict.json",
"topk": 200,
"max_his_len": 50,
"template": [
8 changes: 4 additions & 4 deletions Knowledge_Plugin/DOKE/config/ml1m/popneg_his_U2I.json
@@ -1,8 +1,8 @@
{
"sequential_data_path": "../../data/ml1m/sequential_data.txt",
"candidate_data_path": "../../data/ml1m/negative_samples_pop.txt",
"meta_data_path": "../../data/ml1m/metadata.json",
"u2i_data_path": "data/ml1m/SASRec_U2I_pop.json",
"sequential_data_path": "../data/ml1m/sequential_data.txt",
"candidate_data_path": "../data/ml1m/negative_samples_pop.txt",
"meta_data_path": "../data/ml1m/metadata.json",
"u2i_data_path": "../data/ml1m/SASRec_U2I_pop.json",
"topk": 200,
"max_his_len": 50,
"template": [
1 change: 1 addition & 0 deletions Knowledge_Plugin/Knowledge_Extraction/extract_U2I.py
@@ -69,6 +69,7 @@ def prepare_U2I_dict(embedding, sequential_data, candidate_data):
candidate_scores = [(item2_id, score) for item2_id, score in enumerate(user_item_score[idx]) if item2_id in candidates]
U2I_candidate_dict[user] = sorted(candidate_scores, key=lambda x:-x[1])[:20]
return U2I_dict, U2I_candidate_dict

if __name__ == "__main__":
parser = argparse.ArgumentParser(description='extract CF information')
parser.add_argument('--dataset', type=str, default='steam', help='dataset')
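The change to extract_U2I.py only adds a blank line before the `__main__` guard, but the surrounding context shows how the U2I candidate knowledge is produced: for each user, scores over the candidate items are ranked and the top 20 are kept. A simplified sketch of that ranking step (`user_item_score` and `candidates` stand in for the arrays the repo derives from trained embeddings):

```python
import numpy as np

def top_candidates(user_item_score, candidates, k=20):
    # Keep only items in the candidate set, then rank by score, descending.
    candidate_scores = [
        (item_id, float(score))
        for item_id, score in enumerate(user_item_score)
        if item_id in candidates
    ]
    return sorted(candidate_scores, key=lambda x: -x[1])[:k]

# Toy usage: 10 items, 4 of which are candidates for this user.
scores = np.random.rand(10)
print(top_candidates(scores, candidates={1, 3, 5, 7}))
```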
8 changes: 4 additions & 4 deletions Knowledge_Plugin/README.md
@@ -25,10 +25,10 @@ For example:
```bash
cd Knowledge_Extraction
python extract_I2I.py \
--dataset beauty \
--dataset ml1m \
--negative_type pop
python extract_U2I.py \
--dataset beauty \
--dataset ml1m \
--negative_type pop
```

@@ -46,8 +46,8 @@ python generate_prompt.py \
--config config/ml1m/popneg_his_I2I.json \
--dataset ml1m
python call_openai.py \
--prompt out/prompts/ml1m/popneg_his_I2I_path.json \
--prompt out/prompts/ml1m/popneg_his_I2I.json \
--model ChatGPT \
--dataset ml1m
bash metric.bash out/result/ml1m/ChatGPT_popneg_his_I2I_path ml1m
bash metric.bash out/result/ml1m/ChatGPT_popneg_his_I2I ml1m
```
@@ -5,6 +5,7 @@ We need to process the following data sets separately:
+ Online Retail (https://www.kaggle.com/carrie1/ecommerce-data)

# Download
Create the raw data directory under Knowledge_Plugin/:
```bash
mkdir data/raw_data
cd data/raw_data
@@ -22,6 +23,9 @@

Run each notebook according to the dataset.

+ data_preprocess_amazon.ipynb
+ data_preprocess_ml1m.ipynb
+ data_preprocess_onlineretail.ipynb

# Result
