-
Notifications
You must be signed in to change notification settings - Fork 6.8k
a user friendly way to use g2c in module and an example of g2c #8632
Conversation
tests/python/unittest/test_module.py
Outdated
@@ -78,15 +78,15 @@ def test_module_ctx_group(): | |||
b = mx.symbol.Variable('b') | |||
c = a + b | |||
shape = (2, 5) | |||
mod1 = mx.mod.Module(c, context=[mx.cpu(0)], data_names=['a', 'b'], label_names=None, | |||
group2ctxs=[{'dev1':mx.cpu(1),'dev2':mx.cpu(2)}]) | |||
mod1 = mx.mod.Module(c, context=[mx.cpu(0), mx.cpu(1)], data_names=['a', 'b'], label_names=None, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please add test cases for all types of supported inputs
|
||
def matrix_fact_model_parallel_net(factor_size, num_hidden, max_user, max_item): | ||
# set ctx_group attribute to 'dev1' for the symbols created in this scope, | ||
# the symbols will be binded to the context that 'dev1' map to in group2ctxs |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
binded
-> bound
item_weight = mx.symbol.Variable('item_weight', stype='row_sparse') | ||
item = mx.symbol.contrib.SparseEmbedding(data=item, weight=item_weight, | ||
input_dim=max_item, output_dim=factor_size) | ||
# non-linear transformation of user features |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why not move Line 35 - Line 40 to dev2?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
src/executor/graph_executor.cc:396: Check failed: device[nid] == devid (0 vs. 1) device of same output not equal to each other
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we should spend some effort to investigate the error message before merging this
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@eric-haibin-lin @reminisce I added some debugging output to graph_executor.cc.
Here is the log from running:
python matrix_factorization_model_parallel.py
[00:53:08] src/executor/graph_executor.cc:365: args context
[00:53:08] src/executor/graph_executor.cc:384: nid: 0 ctx.dev_id 0
[00:53:08] src/executor/graph_executor.cc:384: nid: 1 ctx.dev_id 0
[00:53:08] src/executor/graph_executor.cc:384: nid: 3 ctx.dev_id 1
[00:53:08] src/executor/graph_executor.cc:384: nid: 4 ctx.dev_id 1
[00:53:08] src/executor/graph_executor.cc:384: nid: 6 ctx.dev_id 0
[00:53:08] src/executor/graph_executor.cc:384: nid: 7 ctx.dev_id 0
[00:53:08] src/executor/graph_executor.cc:384: nid: 12 ctx.dev_id 1
[00:53:08] src/executor/graph_executor.cc:386: =====================
[00:53:08] src/executor/graph_executor.cc:387: 1 num_forward_outputs
[00:53:08] src/executor/graph_executor.cc:388: 5 g.outputs.size()
[00:53:08] src/executor/graph_executor.cc:389: 7 arg_grad_ctxes.size()
[00:53:08] src/executor/graph_executor.cc:393: arg grads contexts
[00:53:08] src/executor/graph_executor.cc:397: nid 19 ctx 0
[00:53:08] src/executor/graph_executor.cc:397: nid 18 ctx 0
[00:53:08] src/executor/graph_executor.cc:397: nid 18 ctx 1
[00:53:08] src/executor/graph_executor.cc:397: nid 20 ctx 1
[00:53:08] src/executor/graph_executor.cc:399: =====================
[00:53:08] src/executor/graph_executor.cc:409: fail nid 18 ctx 1
[00:53:08] src/executor/graph_executor.cc:423: node 0 var user
[00:53:08] src/executor/graph_executor.cc:423: node 1 var user_weight
[00:53:08] src/executor/graph_executor.cc:425: node 2 _contrib_SparseEmbedding
[00:53:08] src/executor/graph_executor.cc:428: input 0 (entry id)
[00:53:08] src/executor/graph_executor.cc:428: input 1 (entry id)
[00:53:08] src/executor/graph_executor.cc:432: output 2 (entry id)
[00:53:08] src/executor/graph_executor.cc:423: node 3 var ufcweight
[00:53:08] src/executor/graph_executor.cc:423: node 4 var ufcbias
[00:53:08] src/executor/graph_executor.cc:425: node 5 FullyConnected
[00:53:08] src/executor/graph_executor.cc:428: input 2 (entry id)
[00:53:08] src/executor/graph_executor.cc:428: input 3 (entry id)
[00:53:08] src/executor/graph_executor.cc:428: input 4 (entry id)
[00:53:08] src/executor/graph_executor.cc:432: output 5 (entry id)
[00:53:08] src/executor/graph_executor.cc:423: node 6 var item
[00:53:08] src/executor/graph_executor.cc:423: node 7 var item_weight
[00:53:08] src/executor/graph_executor.cc:425: node 8 _contrib_SparseEmbedding
[00:53:08] src/executor/graph_executor.cc:428: input 6 (entry id)
[00:53:08] src/executor/graph_executor.cc:428: input 7 (entry id)
[00:53:08] src/executor/graph_executor.cc:432: output 8 (entry id)
[00:53:08] src/executor/graph_executor.cc:425: node 9 elemwise_mul
[00:53:08] src/executor/graph_executor.cc:428: input 5 (entry id)
[00:53:08] src/executor/graph_executor.cc:428: input 8 (entry id)
[00:53:08] src/executor/graph_executor.cc:432: output 9 (entry id)
[00:53:08] src/executor/graph_executor.cc:425: node 10 sum
[00:53:08] src/executor/graph_executor.cc:428: input 9 (entry id)
[00:53:08] src/executor/graph_executor.cc:432: output 10 (entry id)
[00:53:08] src/executor/graph_executor.cc:425: node 11 Flatten
[00:53:08] src/executor/graph_executor.cc:428: input 10 (entry id)
[00:53:08] src/executor/graph_executor.cc:432: output 11 (entry id)
[00:53:08] src/executor/graph_executor.cc:423: node 12 var score
[00:53:08] src/executor/graph_executor.cc:425: node 13 LinearRegressionOutput
[00:53:08] src/executor/graph_executor.cc:428: input 11 (entry id)
[00:53:08] src/executor/graph_executor.cc:428: input 12 (entry id)
[00:53:08] src/executor/graph_executor.cc:432: output 13 (entry id)
[00:53:08] src/executor/graph_executor.cc:425: node 14 _backward_LinearRegressionOutput
[00:53:08] src/executor/graph_executor.cc:428: input 12 (entry id)
[00:53:08] src/executor/graph_executor.cc:428: input 13 (entry id)
[00:53:08] src/executor/graph_executor.cc:432: output 14 (entry id)
[00:53:08] src/executor/graph_executor.cc:432: output 15 (entry id)
[00:53:08] src/executor/graph_executor.cc:425: node 15 _backward_copy
[00:53:08] src/executor/graph_executor.cc:428: input 14 (entry id)
[00:53:08] src/executor/graph_executor.cc:432: output 16 (entry id)
[00:53:08] src/executor/graph_executor.cc:425: node 16 _backward_sum
[00:53:08] src/executor/graph_executor.cc:428: input 16 (entry id)
[00:53:08] src/executor/graph_executor.cc:432: output 17 (entry id)
[00:53:08] src/executor/graph_executor.cc:425: node 17 _backward_mul
[00:53:08] src/executor/graph_executor.cc:428: input 17 (entry id)
[00:53:08] src/executor/graph_executor.cc:428: input 5 (entry id)
[00:53:08] src/executor/graph_executor.cc:428: input 8 (entry id)
[00:53:08] src/executor/graph_executor.cc:432: output 18 (entry id)
[00:53:08] src/executor/graph_executor.cc:432: output 19 (entry id)
[00:53:08] src/executor/graph_executor.cc:425: node 18 _backward_FullyConnected
[00:53:08] src/executor/graph_executor.cc:428: input 18 (entry id)
[00:53:08] src/executor/graph_executor.cc:428: input 2 (entry id)
[00:53:08] src/executor/graph_executor.cc:428: input 3 (entry id)
[00:53:08] src/executor/graph_executor.cc:432: output 20 (entry id)
[00:53:08] src/executor/graph_executor.cc:432: output 21 (entry id)
[00:53:08] src/executor/graph_executor.cc:432: output 22 (entry id)
[00:53:08] src/executor/graph_executor.cc:425: node 19 _backward_SparseEmbedding
[00:53:08] src/executor/graph_executor.cc:428: input 20 (entry id)
[00:53:08] src/executor/graph_executor.cc:428: input 0 (entry id)
[00:53:08] src/executor/graph_executor.cc:432: output 23 (entry id)
[00:53:08] src/executor/graph_executor.cc:432: output 24 (entry id)
[00:53:08] src/executor/graph_executor.cc:425: node 20 _backward_SparseEmbedding
[00:53:08] src/executor/graph_executor.cc:428: input 19 (entry id)
[00:53:08] src/executor/graph_executor.cc:428: input 6 (entry id)
[00:53:08] src/executor/graph_executor.cc:432: output 25 (entry id)
[00:53:08] src/executor/graph_executor.cc:432: output 26 (entry id)
[00:53:08] /home/hanfeng/zyh/zyhmxnet/dmlc-core/include/dmlc/./logging.h:308: [00:53:08] src/executor/graph_executor.cc:436: Check failed: device[nid] == devid (0 vs. 1) fullyconnected0_backward device of same output not equal to each other
As you can see, the contexts of node 3 var ufcweight and node 4 var ufcbias are both on dev1, but the contexts of their gradients are on dev1 and dev2, as shown by the output below "arg grads contexts":
[00:53:08] src/executor/graph_executor.cc:393: arg grads contexts
[00:53:08] src/executor/graph_executor.cc:397: nid 19 ctx 0
[00:53:08] src/executor/graph_executor.cc:397: nid 18 ctx 0
[00:53:08] src/executor/graph_executor.cc:397: nid 18 ctx 1
[00:53:08] src/executor/graph_executor.cc:397: nid 20 ctx 1
In the graph structure, node 18 is _backward_FullyConnected, so "nid 18 ctx 0" and "nid 18 ctx 1" are the gradients of ufcweight and ufcbias.
parser.add_argument('--batch-size', type=int, default=1024, | ||
help='number of examples per batch') | ||
parser.add_argument('--print-every', type=int, default=100, | ||
help='logging frequency') |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I made a typo: it should be "logging interval" instead of "logging frequency".
|
||
# construct the model | ||
net = matrix_fact_model_parallel_net(factor_size, factor_size, max_user, max_movies) | ||
a = time.time() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
remove this extra line, too?
# create kvstore | ||
kv = mx.kvstore.create('local') if num_gpus > 1 else None | ||
|
||
# initialize the module |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It would be clearer if we first create the variable group2ctxs={'dev1':mx.cpu(), 'dev2':[mx.gpu(i) for i in range(num_gpus)]}
with some documentation to explain what is going on, and then pass it to the Module constructor.
example/sparse/readme.md
Outdated
|
||
## Model Parallel | ||
|
||
The example demonstrates the basic usage of `group2ctxs` in `Module`, which allows part of model on cpu and another part of model on gpu. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
which allows one part of the model to be trained on cpu and the other on gpu.
tests/python/unittest/test_module.py
Outdated
assert np.all(mod1_input_grads[0].asnumpy() == mod2_input_grads[0].asnumpy()) | ||
assert np.all(mod1_input_grads[1].asnumpy() == mod2_input_grads[1].asnumpy()) | ||
|
||
test_module_ctx_group_impl([mx.cpu(0)], {'dev1': mx.cpu(1), 'dev2': mx.cpu(2)}) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: usually we name internal functions in test_xxx
as check_xxx
- check_module_ctx_group
Hi @ZiyueHuang |
Hi @mbaijal |
…e#8632) * a user friendly way to use g2c in module * also support g2c to be list * update * update test * g2c example * Update matrix_factorization_model_parallel.py * address comments * update * update * remove fc * debug g2c * Revert "debug g2c" This reverts commit caabdc5. * update * move g2c example to another folder * update * readme
…e#8632) * a user friendly way to use g2c in module * also support g2c to be list * update * update test * g2c example * Update matrix_factorization_model_parallel.py * address comments * update * update * remove fc * debug g2c * Revert "debug g2c" This reverts commit caabdc5. * update * move g2c example to another folder * update * readme
…e#8632) * a user friendly way to use g2c in module * also support g2c to be list * update * update test * g2c example * Update matrix_factorization_model_parallel.py * address comments * update * update * remove fc * debug g2c * Revert "debug g2c" This reverts commit caabdc5. * update * move g2c example to another folder * update * readme
…e#8632) * a user friendly way to use g2c in module * also support g2c to be list * update * update test * g2c example * Update matrix_factorization_model_parallel.py * address comments * update * update * remove fc * debug g2c * Revert "debug g2c" This reverts commit caabdc5. * update * move g2c example to another folder * update * readme
Description
Adds the interface proposed in #8539.
The change is backward compatible.
This feature was requested in #8168.
cc @eric-haibin-lin
Checklist
Essentials
make lint
)Changes
Comments