Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Splitter module next step #2

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 50 additions & 0 deletions azureml-designer-modules/entries/sar_score.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import argparse

from azureml.studio.core.data_frame_schema import DataFrameSchema
from azureml.studio.core.io.data_frame_directory import load_model_from_directory, \
load_data_frame_from_directory, save_data_frame_to_directory


def get_args():
    """Parse the command-line arguments of the SAR scoring entry script.

    Returns:
        argparse.Namespace with attributes ``model``, ``users``,
        ``item_count_to_recommend`` and ``output``. Unrecognized arguments
        (e.g. ones injected by the AzureML runtime) are silently ignored.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '--model',
        help='The SAR model.',
    )

    # BUG FIX: this argument receives a directory path (it is fed to
    # load_data_frame_from_directory below); the original `type=float`
    # would make argparse reject every real path.
    parser.add_argument(
        '--users', type=str,
        help='Users to recommend for.',
    )

    # BUG FIX: this value is passed as `top_k` to recommend_k_items and the
    # matching YAML spec declares it as an Int; the original `type=str`
    # delivered a string instead of an integer.
    parser.add_argument(
        '--item-count-to-recommend', type=int,
        help='Recommend this number of items.',
    )

    parser.add_argument(
        '--output',
        help='The recommended items.',
    )

    # parse_known_args (not parse_args): tolerate extra runtime-injected flags.
    known_args, _ = parser.parse_known_args()
    return known_args


# Entry point: load a trained SAR model and a users dataframe from their
# AzureML directory formats, score top-k recommendations, and persist them.
if __name__ == '__main__':
    args = get_args()

    # NOTE(review): load_model_from_directory is imported from
    # azureml.studio.core.io.data_frame_directory above -- confirm it
    # actually lives in that module and not in a model_directory module.
    model = load_model_from_directory(args.model).model
    # .data is the pandas DataFrame payload of the DataFrameDirectory.
    input_df = load_data_frame_from_directory(args.users).data

    # Recommend the top-k items for each user in the input dataframe.
    # NOTE(review): args.item_count_to_recommend is parsed with type=str in
    # get_args; recommend_k_items presumably expects an int top_k -- confirm.
    result = model.recommend_k_items(
        input_df,
        top_k=args.item_count_to_recommend,
    )

    # Save the recommendations together with a schema inferred from the
    # result dataframe itself.
    save_data_frame_to_directory(
        args.output,
        result,
        schema=DataFrameSchema.data_frame_to_dict(result),
    )
93 changes: 93 additions & 0 deletions azureml-designer-modules/entries/stratified_splitter_entry.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
import argparse

from azureml.studio.core.logger import module_logger as logger
from reco_utils.dataset.python_splitters import python_stratified_split
from azureml.studio.core.data_frame_schema import DataFrameSchema
from azureml.studio.core.io.data_frame_directory import load_data_frame_from_directory, save_data_frame_to_directory


def get_args():
    """Parse the command-line arguments of the stratified-splitter entry.

    Unrecognized arguments (e.g. ones injected by the AzureML runtime)
    are ignored. Returns an argparse.Namespace.
    """
    # TODO: this function should be replaced by a function in SDK
    # which can automatically generate the args object according to the YAML spec file.
    parser = argparse.ArgumentParser()

    # One (flag, options) entry per parameter declared in the YAML spec.
    arg_table = (
        ('--input-path', {'help': 'The input directory.'}),
        ('--ratio', {'type': float, 'help': 'A float parameter.'}),
        ('--col-user', {'type': str, 'help': 'A string parameter.'}),
        ('--col-item', {'type': str, 'help': 'A string parameter.'}),
        ('--seed', {'type': int, 'help': 'An int parameter.'}),
        ('--output-train', {'help': 'The output training data directory.'}),
        ('--output-test', {'help': 'The output test data directory.'}),
    )
    for flag, options in arg_table:
        parser.add_argument(flag, **options)

    parsed, _ = parser.parse_known_args()
    return parsed


# Entry point: load a dataframe from an AzureML DataFrameDirectory, split it
# into train/test partitions stratified per user, and save both partitions.
if __name__ == '__main__':
    args = get_args()

    # .data is the pandas DataFrame payload of the DataFrameDirectory.
    input_df = load_data_frame_from_directory(args.input_path).data

    ratio = args.ratio
    col_user = args.col_user
    col_item = args.col_item
    seed = args.seed

    # Echo every received parameter into the run logs for debuggability.
    logger.debug(f"Received parameters:")
    logger.debug(f"Input path: {args.input_path}")
    logger.debug(f"Ratio: {ratio}")
    logger.debug(f"User column: {col_user}")
    logger.debug(f"Item column: {col_item}")
    logger.debug(f"Seed: {seed}")

    logger.debug(f"Shape of loaded DataFrame: {input_df.shape}")
    logger.debug(f"Cols of DataFrame: {input_df.columns}")

    # Stratified split from reco_utils: each user's ratings are split so
    # roughly `ratio` of them land in the training partition.
    output_train, output_test = python_stratified_split(
        input_df,
        ratio=args.ratio,
        col_user=args.col_user,
        col_item=args.col_item,
        seed=args.seed,
    )

    logger.debug(f"Output data path: {args.output_train}")
    logger.debug(f"Output test path: {args.output_test}")

    # Persist each partition with a schema inferred from the partition itself.
    save_data_frame_to_directory(
        args.output_train,
        output_train,
        schema=DataFrameSchema.data_frame_to_dict(output_train),
    )
    save_data_frame_to_directory(
        args.output_test,
        output_test,
        schema=DataFrameSchema.data_frame_to_dict(output_test),
    )

39 changes: 39 additions & 0 deletions azureml-designer-modules/module_specs/sar_score.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
name: SAR Recommend
# id is not needed in the future. will be calculated from name using uuid5.
id: efd1af54-0d31-42e1-b3d5-ce3b7c538706
version: 0.0.1
category: Experimentation
description: |
  Python SAR Recommender
repo: https://github.com/Microsoft/Recommenders/tree/master/
inputs:
- name: SAR Model
  type: ModelDirectory
  description: The directory contains SAR model.
  port: true
- name: Users
  type: DataFrameDirectory
  description: Users to recommend for.
  port: true
- name: Item count to recommend
  type: Int
  description: Recommend this number of items.
  min: 1
  default: 10
outputs:
- name: Output
  type: DataFrameDirectory
  description: The recommended items.
implementation:
  container:
    conda: reco_base.yaml
    command: [
      python,
      azureml-designer-modules/entries/sar_score.py
    ]
    args: [
      --model, {inputPath: SAR Model},
      # FIX: Users is a DataFrameDirectory port, so it must be wired as a
      # directory path (inputPath), not a literal value -- consistent with
      # SAR Model above and with the stratified_splitter spec's Input path.
      --users, {inputPath: Users},
      --item-count-to-recommend, {inputValue: Item count to recommend},
      # FIX: Output is declared under `outputs`, so it must be wired with
      # outputPath (as the stratified_splitter spec does), not inputValue.
      --output, {outputPath: Output},
    ]
56 changes: 56 additions & 0 deletions azureml-designer-modules/module_specs/stratified_splitter.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Module spec for the stratified-splitter AzureML designer module.
# Wires the inputs/outputs below to the command-line flags parsed by
# azureml-designer-modules/entries/stratified_splitter_entry.py.
name: Stratified Splitter
# id is not needed in the future. will be calculated from name using uuid5.
id: efd1af54-0d31-42e1-b3d5-ce3b7c538705
version: 0.0.9
category: Experimentation
description: |
  Python stratified splitter from CAT Recommender
repo: https://github.com/Microsoft/Recommenders/tree/master/
inputs:
# port: true marks a dataset connection (passed as a directory path);
# the remaining inputs are plain parameter values.
- name: Input path
  type: DataFrameDirectory
  description: The directory contains dataframe.
  port: true
- name: Ratio
  type: Float
  optional: True
  description: |
    Ratio for splitting data.
    * If it is a single float number, it splits data into two halves and the ratio argument indicates the ratio of training data set.
    * If it is a list of float numbers, the splitter splits data into several portions corresponding to the split ratios.
    * If a list is provided and the ratios are not summed to 1, they will be normalized.
- name: User column
  type: String
  description: Column name of user IDs.
- name: Item column
  type: String
  description: Column name of item IDs.
- name: Seed
  type: Int
  min: 1
  max: 100
  default: 42
  description: Seed.
outputs:
- name: Output train data
  type: DataFrameDirectory
  description: The output directory contains a training dataframe.
- name: Output test data
  type: DataFrameDirectory
  description: The output directory contains a test dataframe.
implementation:
  container:
    conda: reco_base.yaml
    command: [
      python,
      azureml-designer-modules/entries/stratified_splitter_entry.py
    ]
    # inputPath for the dataset port, inputValue for scalar parameters,
    # outputPath for the two output directories.
    args: [
      --input-path, {inputPath: Input path},
      --ratio, {inputValue: Ratio},
      --col-user, {inputValue: User column},
      --col-item, {inputValue: Item column},
      --seed, {inputValue: Seed},
      --output-train, {outputPath: Output train data},
      --output-test, {outputPath: Output test data},
    ]
6 changes: 6 additions & 0 deletions scripts/generate_conda_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@

PIP_GPU = {"nvidia-ml-py3": "nvidia-ml-py3>=7.352.0"}
PIP_PYSPARK = {"databricks-cli": "databricks-cli==0.8.6"}
PIP_AZUREML_MODULES = {"azureml-designer-core": "azureml-designer-core==0.0.26"}

PIP_DARWIN = {
"nni": "nni==0.5.2.1.1",
Expand Down Expand Up @@ -116,6 +117,9 @@
parser.add_argument(
"--pyspark-version", help="provide specific version of PySpark to use"
)
parser.add_argument(
"--azureml-modules", action="store_true", help="run as azureml designer modules"
)
args = parser.parse_args()

# check pyspark version
Expand Down Expand Up @@ -154,6 +158,8 @@
if args.gpu:
conda_packages.update(CONDA_GPU)
pip_packages.update(PIP_GPU)
if args.azureml_modules:
pip_packages.update(PIP_AZUREML_MODULES)

# check for os platform support
if platform == 'darwin':
Expand Down