From 5f1c49d6cb086f515a06177e6827ae337de07bc7 Mon Sep 17 00:00:00 2001
From: Heyi Tang
Date: Mon, 16 Dec 2019 11:29:40 +0800
Subject: [PATCH 1/2] Add module Stratified splitter as AzureML Designer Module

---
Review notes. Text between the "---" line above and the diffstat is not part
of the commit message.

* Formatting: this series arrived with newlines and indentation collapsed.
  Line structure is restored here and every hunk was recounted against its
  header (74 and 68 added lines in this patch). Indentation was rebuilt from
  syntax (4 spaces for Python, 2-space YAML nesting), so the blob ids on the
  "index" lines have not been re-verified -- TODO confirm against the
  original commits.
* The From: headers carry no e-mail address as received; none was invented.
* stratified_splitter_entry.py parses the command line, loads a DataFrame
  from the input directory, calls python_stratified_split and saves the two
  results to the train and test output directories.
* The ratio and seed options have no argparse default, so an omitted flag
  reaches python_stratified_split as None. The spec marks Ratio as
  "optional: True"; how the splitter treats ratio=None is not visible here
  -- TODO confirm, or give the parser a default.
* The spec types Ratio as Float and the parser uses type=float, so only a
  single number is accepted, although the description also talks about a
  list of ratios.
* ratio, col_user, col_item and seed are copied to locals that are used for
  logging only; the split call reads args.* again. Both debug lines for the
  output directories use the same "Output path:" label.
* parse_known_args() is used and the list of unrecognized arguments is
  discarded, so a misspelled flag is not reported.
* reco_utils is imported by the script but is not among the pip dependencies
  in the spec -- presumably supplied elsewhere; verify.

 .../entries/stratified_splitter_entry.py      | 74 +++++++++++++++++++
 .../module_specs/stratified_splitter.yaml     | 68 +++++++++++++++++
 2 files changed, 142 insertions(+)
 create mode 100644 azureml-designer-modules/entries/stratified_splitter_entry.py
 create mode 100644 azureml-designer-modules/module_specs/stratified_splitter.yaml

diff --git a/azureml-designer-modules/entries/stratified_splitter_entry.py b/azureml-designer-modules/entries/stratified_splitter_entry.py
new file mode 100644
index 0000000000..d91f505e32
--- /dev/null
+++ b/azureml-designer-modules/entries/stratified_splitter_entry.py
@@ -0,0 +1,74 @@
+import argparse
+
+from azureml.studio.core.logger import module_logger as logger
+from reco_utils.dataset.python_splitters import python_stratified_split
+from azureml.studio.core.data_frame_schema import DataFrameSchema
+from azureml.studio.core.io.data_frame_directory import load_data_frame_from_directory, save_data_frame_to_directory
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        '--input_path',
+        help='The input directory.',
+    )
+
+    parser.add_argument(
+        '--ratio', type=float,
+        help='A float parameter.',
+    )
+
+    parser.add_argument(
+        '--col_user', type=str,
+        help='A string parameter.',
+    )
+
+    parser.add_argument(
+        '--col_item', type=str,
+        help='A string parameter.',
+    )
+
+    parser.add_argument(
+        '--seed', type=int,
+        help='An int parameter.',
+    )
+
+    parser.add_argument(
+        '--output_train',
+        help='The output training data directory.',
+    )
+    parser.add_argument(
+        '--output_test',
+        help='The output test data directory.',
+    )
+
+    args, _ = parser.parse_known_args()
+
+    input_df = load_data_frame_from_directory(args.input_path).data
+
+    #logger.info(f"Hello world from {PACKAGE_NAME} {VERSION}")
+
+    ratio = args.ratio
+    col_user = args.col_user
+    col_item = args.col_item
+    seed = args.seed
+
+    logger.debug(f"Received parameters:")
+    logger.debug(f"Ratio: {ratio}")
+    logger.debug(f"User: {col_user}")
+    logger.debug(f"Item: {col_item}")
+    logger.debug(f"Seed: {seed}")
+
+    logger.debug(f"Input path: {args.input_path}")
+    logger.debug(f"Shape of loaded DataFrame: {input_df.shape}")
+    logger.debug(f"Cols of DataFrame: {input_df.columns}")
+
+    output_train, output_test = python_stratified_split(input_df, ratio=args.ratio, col_user=args.col_user, col_item=args.col_item, seed=args.seed)
+
+    logger.debug(f"Output path: {args.output_train}")
+    logger.debug(f"Output path: {args.output_test}")
+
+    save_data_frame_to_directory(args.output_train, output_train, schema=DataFrameSchema.data_frame_to_dict(output_train))
+    save_data_frame_to_directory(args.output_test, output_test, schema=DataFrameSchema.data_frame_to_dict(output_test))
+
diff --git a/azureml-designer-modules/module_specs/stratified_splitter.yaml b/azureml-designer-modules/module_specs/stratified_splitter.yaml
new file mode 100644
index 0000000000..1de908f0be
--- /dev/null
+++ b/azureml-designer-modules/module_specs/stratified_splitter.yaml
@@ -0,0 +1,68 @@
+name: Stratified Splitter
+id: efd1af54-0d31-42e1-b3d5-ce3b7c538705
+version: 0.0.9
+category: Experimentation
+description: "Python stratified splitter from CAT Recommender repo: https://github.com/Microsoft/Recommenders/tree/master/."
+inputs:
+- name: Input path
+  type: DataFrameDirectory
+  description: The directory contains dataframe.
+  port: true
+- name: Ratio
+  type: Float
+  optional: True
+  description: >
+    Ratio for splitting data. If it is a single float number,
+    it splits data into two halves and the ratio argument indicates the ratio of
+    training data set; if it is a list of float numbers, the splitter splits
+    data into several portions corresponding to the split ratios. If a list is
+    provided and the ratios are not summed to 1, they will be normalized.
+- name: User column
+  type: String
+  description: Column name of user IDs.
+- name: Item column
+  type: String
+  description: Column name of item IDs.
+- name: Seed
+  type: Int
+  min: 1
+  max: 100
+  default: 42
+  description: Seed.
+outputs:
+- name: Output train data
+  type: DataFrameDirectory
+  description: The output directory contains a training dataframe.
+  port: true
+- name: Output test data
+  type: DataFrameDirectory
+  description: The output directory contains a test dataframe.
+  port: true
+implementation:
+  container:
+    conda:
+      name: CAT_module_environment
+      channels:
+      - defaults
+      dependencies:
+      - python=3.7
+      - pip:
+        - azureml-designer-core==0.0.26.*
+    command:
+    - python
+    - azureml-designer-modules/entries/stratified_splitter_entry.py
+    args:
+    - --input_path
+    - inputPath: Input path
+    - --ratio
+    - inputValue: Ratio
+    - --col_user
+    - inputValue: User column
+    - --col_item
+    - inputValue: Item column
+    - --seed
+    - inputValue: Seed
+    - --output_train
+    - outputPath: Output train data
+    - --output_test
+    - outputPath: Output test data
\ No newline at end of file

From a7aa2300f314a0aa4effcabff46b8cf902a79f49 Mon Sep 17 00:00:00 2001
From: Wentao Dai
Date: Tue, 17 Dec 2019 10:59:17 +0800
Subject: [PATCH 2/2] Update version and arg format

---
Review notes (not part of the commit message).

* Renames the command-line flags from underscore to hyphen form in both the
  entry script and the args list of the spec. argparse derives the attribute
  name from the long option with "-" replaced by "_", so the script's
  args.input_path, args.col_user, args.col_item, args.output_train and
  args.output_test accesses are unaffected.
* The ratio and seed flags contain no underscore and are unchanged.
* Bumps the spec version from 0.0.9 to 0.0.22 and adds
  azureml-designer-classic-modules==0.0.105 to the pip dependencies.

 .../entries/stratified_splitter_entry.py      | 10 +++++-----
 .../module_specs/stratified_splitter.yaml     | 13 +++++++------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/azureml-designer-modules/entries/stratified_splitter_entry.py b/azureml-designer-modules/entries/stratified_splitter_entry.py
index d91f505e32..73b3595bc4 100644
--- a/azureml-designer-modules/entries/stratified_splitter_entry.py
+++ b/azureml-designer-modules/entries/stratified_splitter_entry.py
@@ -10,7 +10,7 @@
     parser = argparse.ArgumentParser()
 
     parser.add_argument(
-        '--input_path',
+        '--input-path',
         help='The input directory.',
     )
 
@@ -20,12 +20,12 @@
     )
 
     parser.add_argument(
-        '--col_user', type=str,
+        '--col-user', type=str,
         help='A string parameter.',
     )
 
     parser.add_argument(
-        '--col_item', type=str,
+        '--col-item', type=str,
         help='A string parameter.',
     )
 
@@ -35,11 +35,11 @@
     )
 
     parser.add_argument(
-        '--output_train',
+        '--output-train',
         help='The output training data directory.',
     )
     parser.add_argument(
-        '--output_test',
+        '--output-test',
         help='The output test data directory.',
     )
 
diff --git a/azureml-designer-modules/module_specs/stratified_splitter.yaml b/azureml-designer-modules/module_specs/stratified_splitter.yaml
index 1de908f0be..7c8c1c1ad3 100644
--- a/azureml-designer-modules/module_specs/stratified_splitter.yaml
+++ b/azureml-designer-modules/module_specs/stratified_splitter.yaml
@@ -1,6 +1,6 @@
 name: Stratified Splitter
 id: efd1af54-0d31-42e1-b3d5-ce3b7c538705
-version: 0.0.9
+version: 0.0.22
 category: Experimentation
 description: "Python stratified splitter from CAT Recommender repo: https://github.com/Microsoft/Recommenders/tree/master/."
 inputs:
@@ -48,21 +48,22 @@ implementation:
       - python=3.7
       - pip:
         - azureml-designer-core==0.0.26.*
+        - azureml-designer-classic-modules==0.0.105
     command:
     - python
     - azureml-designer-modules/entries/stratified_splitter_entry.py
     args:
-    - --input_path
+    - --input-path
     - inputPath: Input path
     - --ratio
     - inputValue: Ratio
-    - --col_user
+    - --col-user
     - inputValue: User column
-    - --col_item
+    - --col-item
     - inputValue: Item column
     - --seed
     - inputValue: Seed
-    - --output_train
+    - --output-train
     - outputPath: Output train data
-    - --output_test
+    - --output-test
     - outputPath: Output test data
\ No newline at end of file