tune prompts for new-version LLM

- [x] tune system and tool prompt for newer GPT
- [x] modify default args to the most common values
- [x] add demonstrations with placeholder
- [x] pass all single-turn test cases
microsoft · May 11, 2024 · 1c5fdaf · 1c5fdaf
1 parent 566f185
commit 1c5fdaf
Show file tree

Hide file tree

Showing 6 changed files with 37 additions and 20 deletions.
diff --git a/InteRecAgent/app.py b/InteRecAgent/app.py
@@ -73,14 +73,17 @@
     "--demo_mode",
     type=str,
     choices=["zero", "fixed", "dynamic"],
-    default="zero",
+    default="dynamic",
     help="Directory path of demonstrations",
 )
 parser.add_argument(
-    "--demo_dir_or_file", type=str, help="Directory or file path of demonstrations"
+    "--demo_dir_or_file",
+    type=str,
+    default="./demonstration/seed_demos_placeholder.jsonl",
+    help="Directory or file path of demonstrations"
 )
 parser.add_argument(
-    "--num_demos", type=int, default=3, help="number of demos for in-context learning"
+    "--num_demos", type=int, default=5, help="number of demos for in-context learning"
 )
 
 # reflection mechanism

diff --git a/InteRecAgent/demonstration/seed_demos.jsonl b/InteRecAgent/demonstration/seed_demos.jsonl
@@ -11,3 +11,4 @@
 {"request": "I have played Rust and Portal, which game suit me most in the recommendation list?", "plan": "1. candidate store tool (store games in recommendation list as candidates); 2. ranking tool (rank the candidates according to play history); 3. map tool"}
 {"request": "I have played NBA2K, refine the recommendation list for me.", "plan": "1. candidate store tool (store games in recommendation list as candidates); 2. ranking tool (rank the candidates according to play history); 3. map tool"}
 {"request": "Which games are first-person in above list?", "plan": "1. candidate store tool (store games in above list as candidates); 2. hard condition tool (filter first-person games from candidates); 3. map tool"}
+{"request": "I want some puzzle games, such as Sudoku, do you have any suggestions?", "plan": "1. hard condition tool (filter puzzle games from candidates); 2. soft condition tool (similar to Sudoku); 3. map tool"}
diff --git a/InteRecAgent/demonstration/seed_demos_placeholder.jsonl b/InteRecAgent/demonstration/seed_demos_placeholder.jsonl
@@ -0,0 +1,14 @@
+{"request": "I want some TAG games.", "plan": "1. hard condition filter tool (for TAG games); 2. ranking tool (by popularity); 3. map tool"}
+{"request": "I have played a lot of TAG games, like GAME1, GAME2, GAME3, GAME4. GAME3 is my favorite games, but GAME1 and GAME2 make me disgusted. Can you recommend some popular games released after 2016, similar with GAME4?", "plan": "1. hard condition filter tool (released after 2016); 2. soft condition filter tool (similar with GAME4); 3. ranking tool (user history: GAME1, GAME2, GAME3, GAME4; user unwants GAME1 and GAME2); 4. map tool"}
+{"request": "I have played GAME1 and GAME2 in the past, please suggest me some games.", "plan": "1. ranking tool (user history GAME1 and GAME2, by preference); 2. map tool"}
+{"request": "I am looking for some games similar to GAME, do you have any suggestions?", "plan": "1. soft condition filter tool (similar to GAME); 2. ranking tool (by similarity); 3. map tool"}
+{"request": "Give me some TAG1 or TAG2 games, but I don't like TAG3 games.", "plan": "1. hard condition filter tool (TAG1 or TAG2 games, not TAG3 games); 2. ranking tool (by popularity); 3. map tool"}
+{"request": "I'm looking for some TAG games like GAME, but I want a cheaper one.", "plan": "1. look up tool (price of GAME); 2. hard condition filter tool (TAG games, cheaper than the price of GAME); 3. soft condition filter tool (similar to GAME); 4. ranking tool (by similarity); 5. map tool"}
+{"request": "I used to play GAME1 and GAME2, but now I feel bored of them. Please give me some new games.", "plan": "1. hard condition filter tool (released in recent years); 2. ranking tool (user has played GAME1 and GAME2, user doesn't want the two games; by preference); 3. map tool"}
+{"request": "I want some TAG games to play, but I don't like GAME1 and GAME2.", "plan": "1. hard condition filter tool (TAG games); 2. ranking tool (user unwants GAME1 and GAME2; by preference); 3. map tool"}
+{"request": "Please give me details about the game GAME.", "plan": "1. look up tool (game description about GAME)"}
+{"request": "How much does the game cost?", "plan": "1. look up tool (price of the game)"}
+{"request": "I have played GAME1 and GAME2, which game suits me most in the recommendation list?", "plan": "1. candidate store tool (store games in above recommendation list as candidates); 2. ranking tool (rank the candidates according to history: GAME1, GAME2, by preference); 3. map tool"}
+{"request": "I have played GAME, refine the recommendation list for me.", "plan": "1. candidate store tool (store games in recommendation list as candidates); 2. ranking tool (rank the candidates according to play history GAME, by preference); 3. map tool"}
+{"request": "Which games are TAG in above list?", "plan": "1. candidate store tool (store games in above list as candidates); 2. hard condition tool (filter TAG games from candidates); 3. map tool"}
+{"request": "I want some TAG games, such as GAME, do you have any suggestions?", "plan": "1. hard condition tool (filter TAG games from candidates); 2. soft condition tool (similar to GAME); 3. map tool"}
diff --git a/InteRecAgent/llm4crs/prompt/system.py b/InteRecAgent/llm4crs/prompt/system.py
@@ -130,10 +130,10 @@
 
 If human is looking up information of {item}s, such as the description of {item}s, number of {item}s, price of {item}s and so on, use the {LookUpTool}. \
 
-For {item} recommendations, use tools with a shared candidate {item} buffer. Buffer is initialized with all {item}s. Filtering tools fetch candidates from the buffer and update it. \
+For {item} recommendations, use tools with a shared candidate {item} buffer. Buffer is initialized with all {item}s. Filtering tools fetch candidates from the buffer and update it. Remember to use {HardFilterTool} before {SoftFilterTool} if both are needed. Remember to use {RankingTool} to process human's historical interactions or remove unwanted candidates. \
 Ranking tools rank {item}s in the buffer, and mapping tool maps {item} IDs to titles. \
 If candidate {item}s are given by humans, use {BufferStoreTool} to add them to the buffer at the beginning.
-Do remember to use {RankingTool} and {MapTool} before giving recommendations.
+You MUST use {RankingTool} and {MapTool} before giving recommendations.
 
 Think about whether to use tool first. If yes, make tool using plan and give the input of each tool. Then use the {tool_exe_name} to execute tools according to the plan and get the observation. \
 Only those tool names are optional when making plans: {tool_names}

diff --git a/InteRecAgent/llm4crs/prompt/tool.py b/InteRecAgent/llm4crs/prompt/tool.py
@@ -39,19 +39,18 @@
 The tool is a hard-condition {item} filtering tool. The tool is useful when human want {item}s with some hard conditions on {item} properties. \
 The input of the tool should be a one-line SQL SELECT command converted from hard conditions. Here are some rules: \
 1. {item} titles can not be used as conditions in SQL;
-2. the tool can not find similar {item}s;
-3. always use pattern match logic for columns with string type;
-4. only one {item} information table is allowed to appear in SQL command;
-5. select all {item}s that meet the conditions, do not use the LIMIT keyword;
-6. try to use OR instead of AND;
-7. use given related values for categorical columns instead of human's description.
+2. always use pattern match logic for columns with string type;
+3. only one {item} information table is allowed to appear in SQL command;
+4. select all {item}s that meet the conditions, do not use the LIMIT keyword;
+5. try to use OR instead of AND;
+6. use given related values for categorical columns instead of human's description.
 """
 
 
 SOFT_FILTER_TOOL_DESC = """
-The tool is a soft condition {item} filtering tool. \
-The tool can find similar {item}s for specific seed {item}s. \
-Never use this tool if human doesn't express to find some {item}s similar with seed {item}s. \
+The tool is a soft condition {item} filtering tool to find similar {item}s for specific seed {item}s. \
+Use this tool ONLY WHEN human explicitly want to find similar {item}s with seed {item}s. \
+The tool can not recommend {item}s based on human's history. \
 There is a similarity score threshold in the tool, only {item}s with similarity above the threshold would be kept. \
 Besides, the tool could be used to calculate the similarity scores with seed {item}s for {item}s in candidate buffer for ranking tool to refine. \
 The input of the tool should be a list of seed {item} titles/names, which should be a Python list of strings. \
@@ -64,7 +63,7 @@
 The input of the tool should be a json string, which may consist of three keys: "schema", "prefer" and "unwanted". \
 "schema" represents ranking schema, optional choices: "popularity", "similarity" and "preference", indicating rank by {item} popularity, rank by similarity, rank by human preference ("prefer" {item}s). \
 The "schema" depends on previous tool using and human preference. If "prefer" info here not empty, "preference" schema should be used. If similarity filtering tool is used before, prioritize using "similarity" except human want popular {item}s.
-"prefer" represents {item} names that human has enjoyed or human has interacted with before, which should be an array of {item} titles. Keywords: "used to do", "I like", "prefer".
+"prefer" represents {item} names that human has enjoyed or human has interacted with before (human's history), which should be an array of {item} titles. Keywords: "used to do", "I like", "prefer".
 "unwanted" represents {item} names that human doesn't like or doesn't want to see in next conversations, which should be an array of {item} titles. Keywords: "don't like", "boring", "interested in". 
 "prefer" and "unwanted" {item}s should be extracted from human request and previous conversations. Only {item} names are allowed to appear in the input. \
 The human's feedback for you recommendation in conversation history could be regard as "prefer" or "unwanted", like "I have tried those items you recommend" or "I don't like those".

diff --git a/InteRecAgent/run.sh b/InteRecAgent/run.sh
@@ -24,19 +24,19 @@ fi
 
 
 domain="game"   # item domain
-enable_shorten=1    # whether to enable shorten chat history
+enable_shorten=0    # whether to enable shorten chat history
 
 # demonstration mode. 
 # 1. zero: Zero-shot setting. No demonstration would be inserted into prompt.
 # 2. fixed: Fixed demonstrations are used for all cases. It would use the first n demonstrations in the `demo_dir_or_file`, where n is `num_demos`.
 # 3. dynamic: Retrieval the most n relevant demonstrations.
-demo_mode="zero"  # ["zero", "fixed", "dynamic"]
+demo_mode="dynamic"  # ["zero", "fixed", "dynamic"]
 
-num_demos=3 # number of demonstrations to use
+num_demos=5 # number of demonstrations to use. When demo_mode=="fixed" and num_demos<0, all demonstrations are used.
 
 # folder or file path of demonstrations. If folder, all jsonl files would be used. 
 # If demo_mode=="zero", the argument does not function.
-demo_dir_or_file="path-to-demonstrations"   
+demo_dir_or_file="./demonstration/seed_demos_placeholder.jsonl"   
 
 
 enable_reflection=0 # whether to use reflection. Reflection would increase the token usage and the response delay.
@@ -64,4 +64,4 @@ python ./app.py \
     --enable_reflection=$enable_reflection \
     --plan_first=$plan_first \
     --langchain=$langchain \
-    # --demo_dir_or_file=$demo_dir_or_file \
+    --demo_dir_or_file=$demo_dir_or_file