Address information leakage issue; delete Chinese annotations #22

Merged 1 commit on Mar 26, 2024

8 changes: 4 additions & 4 deletions InteRecAgent/demonstration/tagger.py
@@ -12,7 +12,7 @@


parser = argparse.ArgumentParser()
parser.add_argument("--demo_dir_or_file", type=str, default="/home/v-huangxu/work/gen_demos")
parser.add_argument("--demo_dir_or_file", type=str, default="./work/gen_demos")
parser.add_argument("--save", type=str, default="./tagged/")
args, _ = parser.parse_known_args()

@@ -62,10 +62,10 @@ def extract_tags(file_path):
tags = re.findall(pattern, content)
return tags

-# 示例用法
-file_path = '/home/v-huangxu/work/LLM4CRS/tagged/tag_cache.txt'
+# Example usage
+file_path = './work/LLM4CRS/tagged/tag_cache.txt'
tags = extract_tags(file_path)
-# print(tags) # 输出:['y', 'n', 'Y', ...]
+# print(tags) # Output:['y', 'n', 'Y', ...]


examples = load_examples(args.demo_dir_or_file)
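The `pattern` and `content` used by `extract_tags` are defined in lines elided from this diff. As a reading aid, here is a minimal self-contained sketch of what the function plausibly does; the regex is an assumption inferred from the `['y', 'n', 'Y', ...]` example output, not code from the repository:

```python
import re

def extract_tags(file_path):
    # Read the tag cache and return every tag label found in it.
    # Assumption: tags appear as bracketed single letters such as <y> or <n>;
    # the real pattern is defined in lines not shown in this diff.
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()
    pattern = r"<([ynYN])>"
    return re.findall(pattern, content)

# Example usage with the sanitized relative path from the diff:
# tags = extract_tags("./work/LLM4CRS/tagged/tag_cache.txt")
```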
2 changes: 1 addition & 1 deletion InteRecAgent/llm4crs/demo/base.py
@@ -94,7 +94,7 @@ def fit_domain(self, examples: List[Dict], domain: str):


if __name__ == "__main__":
selector = DemoSelector("/home/v-huangxu/work/LLM4CRS/demonstration/gen_demos/2023-06-28-08_53_56.jsonl", k=3)
selector = DemoSelector("./LLM4CRS/demonstration/gen_demos/2023-06-28-08_53_56.jsonl", k=3)
request = "I want some farming games."

demo_prompt = selector(request)
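The sanitized path above is relative, so it only resolves when the script is launched from the matching working directory. One common hardening step, shown purely as a sketch (the directory layout is an assumption), is to anchor data paths to the module's own location:

```python
from pathlib import Path

# Sketch: resolve the demo file relative to this module rather than the
# current working directory, so the relative path survives sanitization.
REPO_ROOT = Path(__file__).resolve().parents[2]  # assumed repository layout
demo_file = REPO_ROOT / "demonstration" / "gen_demos" / "2023-06-28-08_53_56.jsonl"
```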
65 changes: 4 additions & 61 deletions InteRecAgent/preprocess/movies.ipynb

Large diffs are not rendered by default.

98 changes: 2 additions & 96 deletions InteRecAgent/preprocess/prepare_amazon.ipynb

Large diffs are not rendered by default.

14 changes: 3 additions & 11 deletions InteRecAgent/preprocess/prepare_steam.ipynb
@@ -338,7 +338,7 @@
"from bs4 import BeautifulSoup \n",
" \n",
"def remove_html_tags(text): \n",
-" \"\"\"删除文本中的HTML标签\"\"\" \n",
+" \"\"\"Remove HTML tags from text\"\"\" \n",
" soup = BeautifulSoup(text, \"html.parser\") \n",
" return soup.get_text() \n",
" \n",
@@ -1374,17 +1374,9 @@
},
{
"cell_type": "code",
-"execution_count": 13,
+"execution_count": null,
"metadata": {},
-"outputs": [
-{
-"name": "stdout",
-"output_type": "stream",
-"text": [
-"Sucessfully saved into /home/v-huangxu/blob/raw_datasets/steam/chatbot/simulator_test_data_900_230816.jsonl.\n"
-]
-}
-],
+"outputs": [],
"source": [
"from typing import *\n",
"import json, pickle\n",
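Notebook outputs can leak local paths just as easily as source code, which is why the hunk above nulls the execution count and empties the outputs list. A sketch of how that cleanup could be automated with nbformat before committing (this tooling is a suggestion, not something this PR adds):

```python
import nbformat

# Sketch: strip execution counts and outputs from a notebook so that local
# paths printed during a run never reach version control.
path = "InteRecAgent/preprocess/prepare_steam.ipynb"
nb = nbformat.read(path, as_version=4)
for cell in nb.cells:
    if cell.cell_type == "code":
        cell.outputs = []
        cell.execution_count = None
nbformat.write(nb, path)
```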
138 changes: 2 additions & 136 deletions InteRecAgent/preprocess/preprocess_redial.ipynb

Large diffs are not rendered by default.

10 changes: 5 additions & 5 deletions RecExplainer/preprocess/data_preprocess_amazon.py
@@ -212,7 +212,7 @@ def get_interaction(datas):
user_seq[user].append((item, time))

for user, item_time in user_seq.items():
-item_time.sort(key=lambda x: x[1]) # 对各个数据集得单独排序
+item_time.sort(key=lambda x: x[1])
items = []
for t in item_time:
items.append(t[0])
@@ -234,15 +234,15 @@ def check_Kcore(user_items, user_core, item_core):
for item, num in item_count.items():
if num < item_core:
return user_count, item_count, False
-return user_count, item_count, True # 已经保证Kcore
+return user_count, item_count, True # Already guaranteed Kcore

def filter_Kcore(user_items, user_core, item_core):
-# 循环过滤 K-core,过滤掉不满足K-core的user和item
+# Loop filter K-core, filter out users and items that do not meet K-core
user_count, item_count, isKcore = check_Kcore(user_items, user_core, item_core)
while not isKcore:
cur_user_items = copy.deepcopy(user_items)
for user, num in user_count.items():
-if user_count[user] < user_core: # 直接把user 删除
+if user_count[user] < user_core: # Delete the user
cur_user_items.pop(user)
else:
for item in user_items[user]:
@@ -265,7 +265,7 @@ def id_map(user_items): # user_items dict
item_id = 1
final_data = {}
random_user_list = list(user_items.keys())
-random.shuffle(random_user_list) # user 随机打乱后重新编码
+random.shuffle(random_user_list) # user is shuffled and re-encoded
for user in random_user_list:
items = user_items[user]
if user not in user2id:
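`check_Kcore` and `filter_Kcore` above interact: deleting a user can push an item below its threshold and vice versa, so the filter must loop until the check passes. A compact, self-contained restatement of that loop, with the counting helper inlined (names follow the diff; the body is a sketch, not the repository's code):

```python
def filter_kcore_sketch(user_items, user_core, item_core):
    """Iteratively drop users/items until the K-core condition holds."""
    while True:
        user_count = {u: len(items) for u, items in user_items.items()}
        item_count = {}
        for items in user_items.values():
            for item in items:
                item_count[item] = item_count.get(item, 0) + 1
        if all(n >= user_core for n in user_count.values()) and \
           all(n >= item_core for n in item_count.values()):
            return user_items  # K-core is now guaranteed
        filtered = {}
        for user, items in user_items.items():
            if user_count[user] < user_core:
                continue  # delete the user outright
            kept = [i for i in items if item_count[i] >= item_core]
            if kept:
                filtered[user] = kept
        user_items = filtered
```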
2 changes: 1 addition & 1 deletion RecLM-gen/scripts/rl_merge.sh
@@ -11,7 +11,7 @@ CUDA_VISIBLE_DEVICES=8 python main.py \
--RL_actor_lora_a 2 \
--RL_critic_lora_r 4 \
--RL_critic_lora_a 2 \
---RL_load /home/lws/projects/RecLM-gen/snap/ICR_SubMovie_Title64T_0_Llama7bChat_LCT_E40_CCR2_SCG2-0.5_IDX/RL_Total_train_LM-True_VM-False_NR-20.1_SN-2_Q-False_T6_FG-True_LR-5e-06_LDO-0.0_WD-0.0_KLC-0.3_EW-0.01_RS-False_RW-True_VFC-0.1_KLT-0.05_LRP-2.0_GAMMA-0.99_GAS-4_LB-1_RA_0.5_/4800step_RL \
+--RL_load snap/ICR_SubMovie_Title64T_0_Llama7bChat_LCT_E40_CCR2_SCG2-0.5_IDX/RL_Total_train_LM-True_VM-False_NR-20.1_SN-2_Q-False_T6_FG-True_LR-5e-06_LDO-0.0_WD-0.0_KLC-0.3_EW-0.01_RS-False_RW-True_VFC-0.1_KLT-0.05_LRP-2.0_GAMMA-0.99_GAS-4_LB-1_RA_0.5_/4800step_RL \
--lm_head_full_tune \
--FA2

22 changes: 11 additions & 11 deletions RecLM-gen/unirec/asyc_server.py
@@ -163,30 +163,30 @@ async def model_runner(self):
self.queue_lock = asyncio.Lock(loop=app.loop)
self.needs_processing = asyncio.Event(loop=app.loop)
logger.info("started model runner for {}".format(self.model_name))
-# while True 无限循环,程序会处于监听状态
+# while True: Infinite loop, the program will be in a listening state
while True:
-# 等待有任务来
+# Waiting for a task to come
await self.needs_processing.wait()
self.needs_processing.clear()
-# 清空计时器
+# Clear timer
if self.needs_processing_timer is not None:
self.needs_processing_timer.cancel()
self.needs_processing_timer = None
-# 处理队列都开启锁
+# All processing queues are locked
async with self.queue_lock:
-# 如果队列不为空则设置最长等待时间
+# If the queue is not empty, set the maximum waiting time
if self.queue:
longest_wait = app.loop.time() - self.queue[0]["time"]
else: # oops
longest_wait = None
-# 日志记录启动处理,队列大小,等待时间
+# Logger start processing
logger.debug("launching processing. queue size: {}. longest wait: {}".format(len(self.queue), longest_wait))
-# 获取一个批次的数据
+# Get a batch of data
to_process = self.queue[:MAX_BATCH_SIZE]
-# 然后把这些数据从任务队列中删除
+# delete these data from the task queue
del self.queue[:len(to_process)]
self.schedule_processing_if_needed()
-# 生成批数据
+# Generate batch data
# print(to_process)
if len(to_process) == 0:
continue
Expand All @@ -196,11 +196,11 @@ async def model_runner(self):
'item_seq': torch.stack([t["item_id_list"] for t in to_process], dim=0),
}
# print(batch_data)
-# 在一个单独的线程中运行模型,然后返回结果
+# Run the model in a separate thread and return the results
scores = await app.loop.run_in_executor(
None, functools.partial(self.run_model, batch_data)
)
-# 记录结果并设置一个完成事件
+# Log the results and set a completion event
for t, s in zip(to_process, scores):
t["score"] = s
t["done_event"].set()
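The `model_runner` loop above implements dynamic batching: requests park in a queue, the runner wakes on an event, takes up to `MAX_BATCH_SIZE` tasks, and scores them in one model call off the event loop. A stripped-down sketch of the same pattern (the task-dict keys and the `run_model` signature are assumptions based on the visible lines):

```python
import asyncio

async def batch_runner(queue, needs_processing, run_model, max_batch_size=32):
    """Wake on an event, score one batch of queued tasks, signal completion."""
    loop = asyncio.get_running_loop()
    while True:
        await needs_processing.wait()        # sleep until a task arrives
        needs_processing.clear()
        to_process = queue[:max_batch_size]  # take at most one batch
        del queue[:len(to_process)]
        if not to_process:
            continue
        inputs = [t["input"] for t in to_process]
        # run the blocking model call in a thread so the loop stays responsive
        scores = await loop.run_in_executor(None, run_model, inputs)
        for task, score in zip(to_process, scores):
            task["score"] = score
            task["done_event"].set()         # wake the handler awaiting this task
```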
6 changes: 3 additions & 3 deletions RecLM-gen/utils/tools.py
@@ -223,15 +223,15 @@ def __init__(self, model_name='', port=8000) -> None:
self.client = None
self.max_wrong_time = 2
self.port = port
-self.model_name = 'gpt-3.5-turbo-1106' if 'gpt-3.5-turbo-1106' in model_name else model_name
+self.model_name = 'gpt-3.5' if 'gpt-3.5' in model_name else model_name
self.init_client()
print(f'use model of {self.model_name}')

def init_client(self):
self.client = OpenAI(
-api_key='sk-E9oyiDL777ZaNZdRrzRSPzsbvbqvhebRl2xiTheKjh6bE4Jx' if self.model_name == 'gpt-3.5-turbo-1106' else 'EMPTY',
+api_key='xxx' if self.model_name == 'gpt-3.5' else 'EMPTY',
max_retries=self.max_wrong_time,
-base_url='https://openkey.cloud/v1' if self.model_name == 'gpt-3.5-turbo-1106' else f'http://127.0.0.1:{self.port}/v1'
+base_url='https://xxx.xxx/v1' if self.model_name == 'gpt-3.5' else f'http://127.0.0.1:{self.port}/v1'
)

def call(self, content, t=0.0):
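The diff above swaps the hardcoded key and proxy endpoint for 'xxx' placeholders, which a user must still edit by hand. A common alternative, sketched here under the assumption of conventional OPENAI_API_KEY / OPENAI_BASE_URL environment variables (not something this PR configures), is to keep secrets out of source entirely:

```python
import os
from openai import OpenAI

def make_client(model_name: str, port: int = 8000) -> OpenAI:
    """Build an OpenAI client without hardcoding secrets in source control."""
    if 'gpt-3.5' in model_name:
        return OpenAI(
            api_key=os.environ["OPENAI_API_KEY"],  # fails loudly if unset
            base_url=os.environ.get("OPENAI_BASE_URL", "https://api.openai.com/v1"),
        )
    # locally served model behind an OpenAI-compatible endpoint
    return OpenAI(api_key="EMPTY", base_url=f"http://127.0.0.1:{port}/v1")
```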