-
Notifications
You must be signed in to change notification settings - Fork 5.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
目标值明明是大于0的值,但目前预测结果值为负数 #2461
Comments
最好把代码的链接也贴过来,另外训练的过程是否正常呢,Cost是否正常在下降? 另外执行训练代码会报错: import cv2 error, please install opencv-python: pip install opencv-python
I0614 11:21:08.539682 31 Util.cpp:166] commandline: --use_gpu=False
I0614 11:21:08.571806 31 GradientMachine.cpp:85] Initing parameters..
I0614 11:21:08.575495 31 GradientMachine.cpp:92] Init parameters done.
[0, 20419555, 16, '\xe4\xb8\x8a\xe6\xb5\xb7\xe5\xb8\x82|\xe4\xb8\x8a\xe6\xb5\xb7\xe5\xb8\x82', '1001', '\xe7\x99\xbd\xe9\xb9\xbf\xe5\x8e\x9f \xe7\x99\xbd\xe5\x98\x89\xe8\xbd\xa9 \xe5\xa8\xb6 \xe7\x9a\x84 \xe7\xac\xac\xe4\xb8\x83\xe4\xbb\xbb \xe8\x80\x81\xe5\xa9\x86 \xe4\xbb\x99\xe8\x8d\x89 \xe6\xb4\x9e\xe6\x88\xbf\xe8\x8a\xb1\xe7\x83\x9b \xe5\xa4\x9c \xe7\x99\xbd\xe5\x98\x89\xe8\xbd\xa9 \xe8\xb7\x91 \xe4\xba\x86', '', 3600]
Traceback (most recent call last):
File "train.py", line 157, in <module>
main()
File "train.py", line 142, in main
feature = [user_id_code, int(time_period), user_location_code, content_id_code, cate_id_code, title_code, brief_code, check_in_period_code]
NameError: global name 'user_id_code' is not defined
您好,

那就是这种情况是正常的?从使用角度来讲的话是没什么问题的?那负值代表什么意义呢?相似 or 不相似呢?
data reader 代码如下:
#!/usr/bin/python
#encoding=utf8
import sys
import os
import random
#__all__ = [
# 'train', 'test', 'get_content_word_dict', 'max_content_id', 'max_user_id',
# 'user_location_dict', 'max_category_id', 'content_info'
#]
class CategoryFeatureGenerator(object):
    """Maps arbitrary hashable keys to dense integer ids.

    Id 0 is permanently reserved for the unknown token 'unk'; every key
    registered afterwards receives the next free integer.
    """

    def __init__(self):
        # 'unk' always occupies slot 0; real keys start at 1.
        self.dic = {'unk': 0}
        self.counter = 1

    def register(self, key):
        '''
        Register record.
        '''
        # Re-registering a known key is a no-op, so ids stay stable.
        if key in self.dic:
            return
        self.dic[key] = self.counter
        self.counter += 1

    def size(self):
        # Number of distinct ids handed out so far (including 'unk').
        return len(self.dic)

    def gen(self, key):
        '''
        Generate one-hot representation for a record.
        '''
        # Unseen keys fall back to the reserved 'unk' id (0).
        return self.dic.get(key, self.dic['unk'])

    def __repr__(self):
        return '<CategoryFeatureGenerator %d>' % len(self.dic)
# One id-mapper per categorical field; shared by the reader and the trainer.
feature_fields = ['user_id', 'user_location', 'content_id', 'cate_id',
                  'word', 'check_in_period']
feature_dict = {field: CategoryFeatureGenerator() for field in feature_fields}
def __init_dataset__(path):
    """Scan the raw tab-separated dataset once and register every
    categorical value into the module-level `feature_dict` id-mappers,
    so the reader can later translate raw fields into integer codes.
    """
    with open(path, "r") as fin:
        for raw in fin:
            (user_id, time_period, user_location, app_id, content_id,
             cate_id, title, brief, quality_level, check_in_period,
             read_time) = raw.strip().split('\t')
            feature_dict['user_id'].register(user_id)
            feature_dict['content_id'].register(int(content_id))
            # A user may carry several '|'-separated locations.
            for loc in user_location.split('|'):
                feature_dict['user_location'].register(loc)
            feature_dict['cate_id'].register(cate_id)
            # Title and brief share one lower-cased word vocabulary;
            # title words are registered first, matching the original order.
            for word in title.split(' ') + brief.split(' '):
                feature_dict['word'].register(word.lower())
            feature_dict['check_in_period'].register(int(check_in_period))
class ReaderData(object):
    """Translates the raw tab-separated dataset into integer feature records.

    Each yielded record is [user_id, time_period, user_location, content_id,
    cate_id, title, brief, check_in_period] followed by the regression
    target [[read_time]].
    """

    def __init__(self, data_path, test_ratio, is_test):
        # Building the id-mappers is a side effect shared via feature_dict.
        __init_dataset__(data_path)
        self.data_path = data_path
        self.test_ratio = test_ratio
        self.is_test = is_test

    def reader_creator(self):
        """Return a generator function that yields encoded records."""
        def reader():
            rng = random.Random()
            path = self.data_path
            ratio = self.test_ratio
            want_test = self.is_test
            with open(path, "r") as fin:
                for raw in fin:
                    # One random draw per line decides train/test membership.
                    if (rng.random() < ratio) != want_test:
                        continue
                    (user_id, time_period, user_location, app_id,
                     content_id, cate_id, title, brief, quality_level,
                     check_in_period, read_time) = raw.strip().split('\t')
                    features = [
                        feature_dict['user_id'].gen(user_id),
                        int(time_period),
                        [feature_dict['user_location'].gen(loc)
                         for loc in user_location.split('|')],
                        feature_dict['content_id'].gen(int(content_id)),
                        feature_dict['cate_id'].gen(cate_id),
                        [feature_dict['word'].gen(w.lower())
                         for w in title.split(' ')],
                        [feature_dict['word'].gen(w.lower())
                         for w in brief.split(' ')],
                        feature_dict['check_in_period'].gen(
                            int(check_in_period)),
                    ]
                    yield features + [[float(read_time)]]
        return reader

    def get_content_word_dict(self):
        # Shared title/brief vocabulary -> id mapping.
        return feature_dict['word'].dic

    def user_id_len(self):
        return feature_dict['user_id'].size()

    def get_user_location_dict(self):
        return feature_dict['user_location'].dic

    def content_id_len(self):
        return feature_dict['content_id'].size()

    def category_id_len(self):
        return feature_dict['cate_id'].size()

    def check_in_period_len(self):
        return feature_dict['check_in_period'].size()
if __name__ == '__main__':
path = "./videoSample"
test_ratio = 0.1
is_test = False
trainer = ReaderData(path, test_ratio, is_test)
print trainer.user_id_len()
a = trainer.get_user_location_dict()
for no, rcd in enumerate(trainer.read()):
print no, rcd
if no > 10 : break train的代码如下:#!/usr/bin/python
#encoding=utf8
import paddle.v2 as paddle
import cPickle
import copy
from paddle.v2.dataset.video import feature_dict, ReaderData
dataset_train = ReaderData("./videoSample", 0.1, False)
def get_usr_combined_features():
    """Build the user-side tower: id + time-of-day + location -> 200-d Tanh."""
    # User id -> 32-d embedding -> 32-d fc.
    user_id_input = paddle.layer.data(
        name='user_id',
        type=paddle.data_type.integer_value(
            dataset_train.user_id_len()))
    user_id_fc = paddle.layer.fc(
        input=paddle.layer.embedding(input=user_id_input, size=32), size=32)

    # Hour of day (0-23) -> 16-d embedding -> 16-d fc.
    hour_input = paddle.layer.data(
        name='time_period',
        type=paddle.data_type.integer_value(24))
    hour_fc = paddle.layer.fc(
        input=paddle.layer.embedding(input=hour_input, size=16), size=16)

    # Multi-hot location vector -> 32-d fc.
    location_input = paddle.layer.data(
        name='user_location',
        type=paddle.data_type.sparse_binary_vector(
            len(dataset_train.get_user_location_dict())))
    location_fc = paddle.layer.fc(input=location_input, size=32)

    # Fuse the three user signals into one 200-d representation.
    return paddle.layer.fc(
        input=[user_id_fc, hour_fc, location_fc],
        size=200,
        act=paddle.activation.Tanh())
def get_content_combined_features():
    """Build the content-side tower: id + category + title + brief +
    check-in period -> 200-d Tanh representation."""
    word_dict = dataset_train.get_content_word_dict()

    # Content id -> 32-d embedding -> 32-d fc.
    content_id_input = paddle.layer.data(
        name='content_id',
        type=paddle.data_type.integer_value(
            dataset_train.content_id_len()))
    content_id_fc = paddle.layer.fc(
        input=paddle.layer.embedding(input=content_id_input, size=32),
        size=32)

    # Category id -> 16-d embedding -> 16-d fc.
    category_input = paddle.layer.data(
        name='category_id',
        type=paddle.data_type.integer_value(
            dataset_train.category_id_len()))
    category_fc = paddle.layer.fc(
        input=paddle.layer.embedding(input=category_input, size=16),
        size=16)

    # Title word sequence -> conv + pool over 32-d word embeddings.
    title_input = paddle.layer.data(
        name='title',
        type=paddle.data_type.integer_value_sequence(len(word_dict)))
    title_conv = paddle.networks.sequence_conv_pool(
        input=paddle.layer.embedding(input=title_input, size=32),
        hidden_size=32, context_len=2)

    # Brief word sequence -> conv + pool (same vocabulary as the title).
    brief_input = paddle.layer.data(
        name='brief',
        type=paddle.data_type.integer_value_sequence(len(word_dict)))
    brief_conv = paddle.networks.sequence_conv_pool(
        input=paddle.layer.embedding(input=brief_input, size=32),
        hidden_size=32, context_len=2)

    # Check-in period bucket -> 32-d embedding -> 32-d fc.
    period_input = paddle.layer.data(
        name='check_in_period',
        type=paddle.data_type.integer_value(
            dataset_train.check_in_period_len()))
    period_fc = paddle.layer.fc(
        input=paddle.layer.embedding(input=period_input, size=32), size=32)

    # Fuse all five content signals into one 200-d representation.
    return paddle.layer.fc(
        input=[content_id_fc, category_fc, title_conv, brief_conv,
               period_fc],
        size=200,
        act=paddle.activation.Tanh())
def main():
    """Train the two-tower read-time regressor for one pass, then run a
    demo inference on one hand-built sample."""
    paddle.init(use_gpu=False)
    usr_combined_features = get_usr_combined_features()
    content_combined_features = get_content_combined_features()
    # NOTE(review): cos_sim is bounded to [-1, 1], but the label below is the
    # raw read_time (e.g. 84953.0).  Regressing an unbounded target against a
    # bounded output cannot fit, which is the likely cause of the
    # "prediction is negative / far from the label" behaviour reported in
    # this issue.  Consider normalizing the label or replacing cos_sim with
    # an unbounded scoring layer.
    inference = paddle.layer.cos_sim(
        a=usr_combined_features, b=content_combined_features, size=1)
    cost = paddle.layer.mse_cost(
        input=inference,
        label=paddle.layer.data(
            name='read_time', type=paddle.data_type.dense_vector(1)))
    parameters = paddle.parameters.create(cost)
    trainer = paddle.trainer.SGD(
        cost=cost,
        parameters=parameters,
        update_equation=paddle.optimizer.Adam(learning_rate=1e-4))
    # Maps each data-layer name to its index in the reader's record
    # (see ReaderData.reader_creator for the record layout).
    feeding = {
        'user_id': 0,
        'time_period': 1,
        'user_location': 2,
        'content_id': 3,
        'category_id': 4,
        'title': 5,
        'brief': 6,
        'check_in_period': 7,
        'read_time': 8
    }

    def event_handler(event):
        # Log the training cost every 100 batches.
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 100 == 0:
                print "Pass %d Batch %d Cost %.2f" % (
                    event.pass_id, event.batch_id, event.cost)

    trainer.train(
        reader=paddle.batch(
            paddle.reader.shuffle(
                dataset_train.reader_creator(), buf_size=8192),
            batch_size=256),
        event_handler=event_handler,
        feeding=feeding,
        num_passes=1)

    # --- Demo inference on one hand-picked sample ------------------------
    # Raw field values copied from the training data.
    user_id = "CA43E1EADD90B21CA3D4775E4484DD9A"
    content_id = 20468703
    time_period = 11
    user_location = "甘肃省|兰州市"
    cate_id = "1001"
    title = "为 陈坤 曾经 秘密 生 下 儿子 如今 却 另 嫁 他人"
    brief = ""
    check_in_period = 1495144819909
    # Encode each raw field with the same feature_dict mappers that the
    # reader used during training; unseen values map to the 'unk' id 0.
    user_id_code = feature_dict['user_id'].gen(user_id)
    content_id_code = feature_dict['content_id'].gen(content_id)
    cate_id_code = feature_dict['cate_id'].gen(cate_id)
    title_code = [feature_dict['word'].gen(w.lower()) for w in title.split(' ')]
    brief_code = [feature_dict['word'].gen(w.lower()) for w in brief.split(' ')]
    check_in_period_code = feature_dict['check_in_period'].gen(int(check_in_period))
    user_location_code = [feature_dict['user_location'].gen(ul) for ul in user_location.split('|')]
    print [user_id, content_id, time_period, user_location, cate_id, title, brief, check_in_period]
    # Field order must match the `feeding` indices above (minus read_time).
    feature = [user_id_code, int(time_period), user_location_code, content_id_code, cate_id_code, title_code, brief_code, check_in_period_code]
    print feature
    # Inference takes no label, so drop 'read_time' from the feeding map.
    infer_dict = copy.copy(feeding)
    del infer_dict['read_time']
    prediction = paddle.infer(
        output_layer=inference,
        parameters=parameters,
        input=[feature],
        feeding=infer_dict)
    print prediction
# Script entry point.  (Removed a stray trailing '|' that was a table-cell
# artifact from the issue-page scrape and would be a SyntaxError.)
if __name__ == '__main__':
    main()
cos 在 [-1, 1] 区间取值,区间的两个端点代表了从 [不相关 <-- ... --> 相关],在使用中通过在验证集上 plot 一对儿样本 cos 值的分布,grid search 来确定阈值,小于阈值是不相关,大于阈值是相关。
那么问题就来了,我要预测的是阅读时长这个目标,挑选的这条数据是从训练样本中选出的,样本中的y是84953,预测出来的是-0.035左右,这样的数据是不是很不合理?是跟我目前的训练样本量太少有关吗,准备再拿正式的样本数据跑一遍看看
又用600w+的训练数据进行了一次模型训练,选了一条训练数据里面的样本,预测的结果是0.99221039,这个也跟样本本来的y值(84953.0)相差很远,我预测的是用户对当前内容的阅读时长,模型先计算内容特征与用户特征之间的cos相似度,然后再对阅读时长进行回归,理论上不该相差这么远吧,还是我理解错了? 模型计算的代码如下:具体模型训练过程的cost值如下:

I close this issue due to inactivity. please feel free to reopen it if more information is available.
从训练样本里面抽取了一条数据用训练好的模型进行预测,得到的预测值竟然是负的,跟训练数据里面的目标值相差太远了,即使是模型没训练好,也不至于是负数啊,还是说哪里使用有问题,求解。
目标值明明是>0的,感觉不科学啊,训练时计算相似度是用的cos_sim,相关计算代码如下:
The text was updated successfully, but these errors were encountered: