Change output format to ModelDirectory
Wentao Dai committed Dec 26, 2019
1 parent 2aae859 commit 59fa6c8
Showing 4 changed files with 63 additions and 47 deletions.
81 changes: 42 additions & 39 deletions azureml-designer-modules/entries/score_sar_entry.py
@@ -3,11 +3,11 @@
from enum import Enum
from pathlib import Path
import joblib
import time

from azureml.studio.core.data_frame_schema import DataFrameSchema
from azureml.studio.core.logger import module_logger as logger
from azureml.studio.core.io.data_frame_directory import load_data_frame_from_directory, save_data_frame_to_directory
from azureml.studio.core.io.model_directory import load_model_from_directory


class ScoreType(Enum):
@@ -26,26 +26,41 @@ class ItemSet(Enum):
    SCORE_ONLY = 'Items in score set'


MODEL_NAME = 'sar_model'
def joblib_loader(load_from_dir, model_spec):
    file_name = model_spec['file_name']
    with open(Path(load_from_dir) / file_name, 'rb') as fin:
        return joblib.load(fin)


def recommend_items(model, data, ranking_metric, top_k, sort_top_k, remove_seen, normalize):
    if ranking_metric == RankingMetric.RATING:
        return model.recommend_k_items(test=data, top_k=top_k, sort_top_k=sort_top_k, remove_seen=remove_seen,
                                       normalize=normalize)
    if ranking_metric == RankingMetric.SIMILARITY:
        return model.get_item_based_topk(items=data, top_k=top_k, sort_top_k=sort_top_k)
    if ranking_metric == RankingMetric.POPULARITY:
        return model.get_popularity_based_topk(top_k=top_k, sort_top_k=sort_top_k)
    raise ValueError(f"Got unexpected ranking metric: {ranking_metric}.")
class ScoreSARModule:
    def __init__(self, model, input_data):
        self._model = model
        self._input_data = input_data

    @property
    def model(self):
        return self._model

def predict_ratings(model, data, items_to_predict, normalize):
    if items_to_predict == ItemSet.TRAIN_ONLY:
        return model.predict_training_items(test=data, normalize=normalize)
    if items_to_predict == ItemSet.SCORE_ONLY:
        return model.predict(test=data, normalize=normalize)
    raise ValueError(f"Got unexpected 'items to predict': {items_to_predict}.")
    @property
    def input_data(self):
        return self._input_data

    def recommend_items(self, ranking_metric, top_k, sort_top_k, remove_seen, normalize):
        if ranking_metric == RankingMetric.RATING:
            return self.model.recommend_k_items(test=self.input_data, top_k=top_k, sort_top_k=sort_top_k,
                                                remove_seen=remove_seen, normalize=normalize)
        if ranking_metric == RankingMetric.SIMILARITY:
            return self.model.get_item_based_topk(items=self.input_data, top_k=top_k, sort_top_k=sort_top_k)
        if ranking_metric == RankingMetric.POPULARITY:
            return self.model.get_popularity_based_topk(top_k=top_k, sort_top_k=sort_top_k)
        raise ValueError(f"Got unexpected ranking metric: {ranking_metric}.")

    def predict_ratings(self, items_to_predict, normalize):
        if items_to_predict == ItemSet.TRAIN_ONLY:
            return self.model.predict_training_items(test=self.input_data, normalize=normalize)
        if items_to_predict == ItemSet.SCORE_ONLY:
            return self.model.predict(test=self.input_data, normalize=normalize)
        raise ValueError(f"Got unexpected 'items to predict': {items_to_predict}.")


if __name__ == '__main__':
@@ -75,38 +90,26 @@ def predict_ratings(model, data, items_to_predict, normalize):
    args, _ = parser.parse_known_args()

    logger.info(f"Arguments: {args}")

    with open(Path(args.trained_model) / MODEL_NAME, 'rb') as f:
        sar_model = joblib.load(f)

    dataset_to_score = load_data_frame_from_directory(args.dataset_to_score).data
    logger.debug(f"Shape of loaded DataFrame: {dataset_to_score.shape}")

    sort_top_k = strtobool(args.sort_top_k) if args.sort_top_k else None
    remove_seen_items = strtobool(args.remove_seen_items) if args.remove_seen_items else None
    normalize = strtobool(args.normalize) if args.normalize else None

    start_time = time.time()
    sar_model = load_model_from_directory(args.trained_model, model_loader=joblib_loader).data
    dataset_to_score = load_data_frame_from_directory(args.dataset_to_score).data
    logger.debug(f"Shape of loaded DataFrame: {dataset_to_score.shape}")

    score_sar_module = ScoreSARModule(model=sar_model, input_data=dataset_to_score)

    score_type = ScoreType(args.score_type)
    if score_type == ScoreType.ITEM_RECOMMENDATION:
        score_result = recommend_items(model=sar_model,
                                       data=dataset_to_score,
                                       ranking_metric=RankingMetric(args.ranking_metric),
                                       top_k=args.top_k,
                                       sort_top_k=sort_top_k,
                                       remove_seen=args.remove_seen_items,
                                       normalize=normalize)
        score_result = score_sar_module.recommend_items(ranking_metric=RankingMetric(args.ranking_metric),
                                                        top_k=args.top_k, sort_top_k=sort_top_k,
                                                        remove_seen=args.remove_seen_items, normalize=normalize)
    elif score_type == ScoreType.RATING_PREDICTION:
        score_result = predict_ratings(model=sar_model,
                                       data=dataset_to_score,
                                       items_to_predict=ItemSet(args.items_to_predict),
                                       normalize=normalize)
        score_result = score_sar_module.predict_ratings(items_to_predict=ItemSet(args.items_to_predict),
                                                        normalize=normalize)
    else:
        raise ValueError(f"Got unexpected score type: {score_type}.")

    test_time = time.time() - start_time
    logger.debug("Took {} seconds for score.\n".format(test_time))

    save_data_frame_to_directory(args.score_result, data=score_result,
                                 schema=DataFrameSchema.data_frame_to_dict(score_result))
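
Note on the loader above: load_model_from_directory hands joblib_loader the directory path together with the model_spec dict that was recorded when the model was saved, and the loader simply opens the file that the spec names. A minimal sketch of that contract, assuming the joblib_loader defined above is in scope and using only joblib and a temporary folder (the dummy object and file name are illustrative, not part of this commit):

import tempfile
from pathlib import Path

import joblib

with tempfile.TemporaryDirectory() as model_dir:
    # Write a pickled stand-in model plus the spec dict that describes it.
    with open(Path(model_dir) / 'sar_model.pkl', 'wb') as fout:
        joblib.dump({'demo': 'model'}, fout, protocol=4)
    spec = {'model_type': 'joblib', 'file_name': 'sar_model.pkl'}
    # joblib_loader only needs the directory and the spec to recover the object.
    assert joblib_loader(model_dir, spec) == {'demo': 'model'}

The 'model_type' and 'file_name' keys mirror what the train-side dumper in train_sar_entry.py below returns.
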
25 changes: 19 additions & 6 deletions azureml-designer-modules/entries/train_sar_entry.py
@@ -7,10 +7,26 @@
from reco_utils.recommender.sar import SAR

from azureml.studio.core.logger import module_logger as logger
from azureml.studio.core.utils.fileutils import ensure_folder
from azureml.studio.core.io.data_frame_directory import load_data_frame_from_directory
from azureml.studio.core.io.model_directory import save_model_to_directory


MODEL_NAME = 'sar_model'
def joblib_dumper(data, file_name=None):
    """Return a dumper to dump a model with pickle."""
    if not file_name:
        file_name = '_data.pkl'

    def model_dumper(save_to):
        full_path = Path(save_to) / file_name
        ensure_folder(Path(save_to))
        with open(full_path, 'wb') as fout:
            joblib.dump(data, fout, protocol=4)

        model_spec = {'model_type': 'joblib', 'file_name': file_name}
        return model_spec

    return model_dumper


if __name__ == '__main__':
@@ -49,8 +65,5 @@
    train_time = time.time() - start_time
    print("Took {} seconds for training.".format(train_time))

    model_dir = Path(args.output_model)
    if not model_dir.exists():
        model_dir.mkdir()
    with open(model_dir / MODEL_NAME, 'wb') as f:
        joblib.dump(model, f, protocol=4)
    save_model_to_directory(save_to=args.output_model, model_dumper=joblib_dumper(data=model))
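
Taken together, joblib_dumper here and joblib_loader in score_sar_entry.py give the two entry scripts a symmetric round trip through the ModelDirectory API used in this commit. A hedged end-to-end sketch, assuming azureml-studio-core is installed, that both helpers are in scope, and that a dummy dict stands in for the trained SAR model ('./model_dir' is an illustrative local path, not one of the module's arguments):

from azureml.studio.core.io.model_directory import (
    load_model_from_directory,
    save_model_to_directory,
)

# Train side: the dumper writes the pickle and records its file name in the model spec.
save_model_to_directory(save_to='./model_dir',
                        model_dumper=joblib_dumper(data={'demo': 'model'}, file_name='sar_model.pkl'))
# Score side: the loader opens the file named in that spec; .data is the unpickled model.
restored = load_model_from_directory('./model_dir', model_loader=joblib_loader).data
assert restored == {'demo': 'model'}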

2 changes: 1 addition & 1 deletion azureml-designer-modules/module_specs/sar_score.yaml
@@ -1,5 +1,5 @@
name: SAR Score
version: 0.0.21
version: 0.0.23
category: Experimentation
description: |
  Python SAR Recommender
2 changes: 1 addition & 1 deletion azureml-designer-modules/module_specs/sar_train.yaml
@@ -1,6 +1,6 @@
name: SAR Train
id: efd1af54-0d31-42e1-b3d5-ce3b7c538705
version: 0.0.11
version: 0.0.18
category: Experimentation
description: "SAR Train from CAT Recommender repo: https://github.com/Microsoft/Recommenders/tree/master/."
inputs:
