merge from master #2

Merged 5 commits on Oct 29, 2019
1 change: 0 additions & 1 deletion ci/batch/docker/Dockerfile
@@ -24,5 +24,4 @@ FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
ENV PATH /opt/conda/bin:$PATH
RUN git clone https://github.com/dmlc/gluon-nlp
WORKDIR gluon-nlp
RUN /bin/bash -c 'CONDA_ENVS_PATH=$PWD/conda CONDA_PKGS_DIRS=$PWD/conda/pkgs conda init bash && source /root/.bashrc && conda env update --prune -p conda/gpu/py3 -f env/gpu/py3.yml && source activate ./conda/gpu/py3 && pip install -v -e . && pip install awscli && python -m spacy download en && python -m spacy download de && python -m nltk.downloader all'
ADD gluon_nlp_job.sh .
2 changes: 1 addition & 1 deletion docs/api/notes/data_api.rst
@@ -158,7 +158,7 @@ lengths in the minibatch, which allows the fast tensor manipulation in GPU.

.. code:: python

->>> batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(axis=0),
+>>> batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(axis=0, pad_val=0),
>>> nlp.data.batchify.Stack())

:class:`~gluonnlp.data.batchify.Tuple` wraps multiple batchify functions and applies each input function on each input field,
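The recurring change in this PR is spelling out the padding value: in this release a bare `Pad()` pads with 0 but emits a warning, and the `filterwarnings = error` setting added to pytest.ini below turns any warning into a test failure. A minimal sketch of the explicit form (assuming only `gluonnlp` and `mxnet` are installed; the sample data is made up):

```python
import gluonnlp as nlp

# Two variable-length token-id sequences, each paired with a label.
samples = [([1, 2, 3, 4], 0), ([5, 7], 1)]

# Explicit pad_val=0 avoids the automatic-padding warning from a bare Pad().
batchify_fn = nlp.data.batchify.Tuple(
    nlp.data.batchify.Pad(axis=0, pad_val=0),  # pad the token ids
    nlp.data.batchify.Stack())                 # stack the labels

data, labels = batchify_fn(samples)
print(data)    # shape (2, 4); the short row becomes [5. 7. 0. 0.]
print(labels)  # [0. 1.]
```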
8 changes: 4 additions & 4 deletions docs/examples/machine_translation/gnmt.md
@@ -277,12 +277,12 @@ is to construct the sampler and `DataLoader`. The first step is to use the `batchify`
function, which pads and stacks sequences to form mini-batches.

```{.python .input}
-train_batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(),
-nlp.data.batchify.Pad(),
+train_batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(pad_val=0),
+nlp.data.batchify.Pad(pad_val=0),
nlp.data.batchify.Stack(dtype='float32'),
nlp.data.batchify.Stack(dtype='float32'))
-test_batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(),
-nlp.data.batchify.Pad(),
+test_batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(pad_val=0),
+nlp.data.batchify.Pad(pad_val=0),
nlp.data.batchify.Stack(dtype='float32'),
nlp.data.batchify.Stack(dtype='float32'),
nlp.data.batchify.Stack())
4 changes: 2 additions & 2 deletions docs/examples/machine_translation/transformer.md
@@ -121,8 +121,8 @@ Now, we have obtained the transformed datasets. The next step is to construct th

```{.python .input}
wmt_test_batchify_fn = nlp.data.batchify.Tuple(
-nlp.data.batchify.Pad(),
-nlp.data.batchify.Pad(),
+nlp.data.batchify.Pad(pad_val=0),
+nlp.data.batchify.Pad(pad_val=0),
nlp.data.batchify.Stack(dtype='float32'),
nlp.data.batchify.Stack(dtype='float32'),
nlp.data.batchify.Stack())
@@ -109,7 +109,7 @@ For more advanced usage examples of the DataLoader object, check out the

```{.python .input}
batch_size = 2
-dataset_batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(),
+dataset_batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(pad_val=0),
nlp.data.batchify.Stack())
data_loader = gluon.data.DataLoader(dataset,
batch_size=batch_size,
@@ -172,7 +172,7 @@ def get_dataloader():

# Construct the DataLoader Pad data, stack label and lengths
batchify_fn = nlp.data.batchify.Tuple(
-nlp.data.batchify.Pad(axis=0),
+nlp.data.batchify.Pad(axis=0, pad_val=0),
nlp.data.batchify.Stack())

# In this example, we use a FixedBucketSampler,
2 changes: 1 addition & 1 deletion docs/examples/sentiment_analysis/sentiment_analysis.md
@@ -195,7 +195,7 @@ def get_dataloader():

# Pad data, stack label and lengths
batchify_fn = nlp.data.batchify.Tuple(
-nlp.data.batchify.Pad(axis=0, ret_length=True),
+nlp.data.batchify.Pad(axis=0, pad_val=0, ret_length=True),
nlp.data.batchify.Stack(dtype='float32'))
batch_sampler = nlp.data.sampler.FixedBucketSampler(
train_data_lengths,
2 changes: 1 addition & 1 deletion env/cpu/py3-master.yml
@@ -19,7 +19,7 @@ dependencies:
- scipy=1.3.1
- pip:
- pylint-quotes==0.2.1
-- mxnet-mkl>=1.6.0b20191006
+- mxnet-cu100>=1.6.0b20191027
- sacremoses
- sentencepiece<0.2
- sphinx-autodoc-typehints==1.7.0
2 changes: 1 addition & 1 deletion env/docker/py3.yml
@@ -30,7 +30,7 @@ dependencies:
- scikit-learn=0.21.3
- pip:
- pylint-quotes<0.2
-- mxnet-cu101mkl>=1.6.0b20191006
+- mxnet-cu100>=1.6.0b20191027
- sacremoses
- sentencepiece<0.2
- https://github.com/szha/mx-theme/tarball/master
2 changes: 1 addition & 1 deletion env/gpu/py3-master.yml
@@ -29,7 +29,7 @@ dependencies:
- scipy=1.3.1
- pip:
- pylint-quotes<0.2
-- mxnet-cu101mkl>=1.6.0b20191006
+- mxnet-cu100>=1.6.0b20191027
- sacremoses
- sentencepiece<0.2
- https://github.com/szha/mx-theme/tarball/master
3 changes: 3 additions & 0 deletions pytest.ini
@@ -10,3 +10,6 @@ markers =

env =
MXNET_HOME=tests/data

+filterwarnings =
+    error
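With `filterwarnings = error`, every warning raised during the test run becomes a test failure; this is what forces the explicit `pad_val=0` arguments above and the `warnings.catch_warnings()` guards in the test changes below. For reference, a hedged sketch of the same setting with a targeted exemption (the `ignore` line is illustrative, not part of this PR):

```ini
# pytest.ini -- escalate all warnings to errors, with one example exemption.
[pytest]
filterwarnings =
    error
    ignore::DeprecationWarning:thirdparty.*
```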
8 changes: 4 additions & 4 deletions scripts/bert/finetune_icsl.py
@@ -328,10 +328,10 @@ def train(args):
dev_data_bert = dev_data.transform(idsl_transform, lazy=False)
test_data_bert = test_data.transform(idsl_transform, lazy=False)
# Construct the DataLoader
-batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(), # Subword ID
-nlp.data.batchify.Pad(), # Subword Mask
-nlp.data.batchify.Pad(), # Beginning of subword
-nlp.data.batchify.Pad(), # Tag IDs
+batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(pad_val=0), # Subword ID
+nlp.data.batchify.Pad(pad_val=0), # Subword Mask
+nlp.data.batchify.Pad(pad_val=0), # Beginning of subword
+nlp.data.batchify.Pad(pad_val=0), # Tag IDs
nlp.data.batchify.Stack(), # Intent Label
nlp.data.batchify.Stack()) # Valid Length
train_batch_sampler = nlp.data.sampler.SortedBucketSampler(
4 changes: 2 additions & 2 deletions scripts/machine_translation/dataprocessor.py
@@ -217,12 +217,12 @@ def get_dataloader(data_set, args, dataset_type,
data_lengths = get_data_lengths(data_set)

if dataset_type == 'train':
-train_batchify_fn = btf.Tuple(btf.Pad(), btf.Pad(),
+train_batchify_fn = btf.Tuple(btf.Pad(pad_val=0), btf.Pad(pad_val=0),
btf.Stack(dtype='float32'), btf.Stack(dtype='float32'))

else:
data_lengths = list(map(lambda x: x[-1], data_lengths))
-test_batchify_fn = btf.Tuple(btf.Pad(), btf.Pad(),
+test_batchify_fn = btf.Tuple(btf.Pad(pad_val=0), btf.Pad(pad_val=0),
btf.Stack(dtype='float32'), btf.Stack(dtype='float32'),
btf.Stack())

2 changes: 0 additions & 2 deletions scripts/machine_translation/dataset.py
@@ -23,10 +23,8 @@
import os
from gluonnlp.base import get_home_dir
from gluonnlp.data.translation import _TranslationDataset, _get_pair_key
-from gluonnlp.data.registry import register


-@register(segment=['train', 'val', 'test'])
class TOY(_TranslationDataset):
"""A Small Translation Dataset for Testing Scripts.

2 changes: 1 addition & 1 deletion scripts/natural_language_inference/dataset.py
@@ -66,7 +66,7 @@ def prepare_data_loader(args, dataset, vocab, test=False):
lazy=False)

# Batching
-batchify_fn = btf.Tuple(btf.Pad(), btf.Pad(), btf.Stack(dtype='int32'))
+batchify_fn = btf.Tuple(btf.Pad(pad_val=0), btf.Pad(pad_val=0), btf.Stack(dtype='int32'))
data_lengths = [max(len(d[0]), len(d[1])) for d in dataset]
batch_sampler = nlp.data.FixedBucketSampler(lengths=data_lengths,
batch_size=args.batch_size,
2 changes: 1 addition & 1 deletion scripts/parsing/parser/biaffine_parser.py
@@ -322,7 +322,7 @@ def flatten_numpy(arr):
return arc_accuracy, rel_accuracy, overall_accuracy, outputs
return outputs

-def save_parameters(self, filename):
+def save_parameters(self, filename): # pylint: disable=arguments-differ
"""Save model

Parameters
2 changes: 1 addition & 1 deletion scripts/sentiment_analysis/finetune_lm.py
@@ -182,7 +182,7 @@ def preprocess_dataset(dataset):
test_dataset, test_data_lengths = preprocess_dataset(test_dataset)

# Construct the DataLoader. Pad data and stack label
-batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(axis=0, ret_length=True),
+batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(axis=0, pad_val=0, ret_length=True),
nlp.data.batchify.Stack(dtype='float32'))
if args.bucket_type is None:
print('Bucketing strategy is not used!')
19 changes: 14 additions & 5 deletions scripts/tests/test_dataprocessor.py
@@ -17,12 +17,13 @@

"""Test DataProcessor."""


import sys
import os
-import pytest
+import warnings
import time

+import pytest

sys.path.append(os.path.join(os.path.dirname(os.path.dirname(__file__)), 'machine_translation'))

from ..machine_translation.dataprocessor import process_dataset
@@ -38,12 +39,16 @@ def test_toy():
    assert len(train_en_de) == 30
    assert len(val_en_de) == 30
    assert len(test_en_de) == 30
-    en_vocab, de_vocab = train_en_de.src_vocab, train_en_de.tgt_vocab
+    with warnings.catch_warnings():  # TODO https://github.com/dmlc/gluon-nlp/issues/978
+        warnings.simplefilter("ignore")
+        en_vocab, de_vocab = train_en_de.src_vocab, train_en_de.tgt_vocab
    assert len(en_vocab) == 358
    assert len(de_vocab) == 381
    train_de_en = TOY(segment='train', src_lang='de', tgt_lang='en',
                      root='tests/data/translation_test')
-    de_vocab, en_vocab = train_de_en.src_vocab, train_de_en.tgt_vocab
+    with warnings.catch_warnings():  # TODO https://github.com/dmlc/gluon-nlp/issues/978
+        warnings.simplefilter("ignore")
+        de_vocab, en_vocab = train_de_en.src_vocab, train_de_en.tgt_vocab
    assert len(en_vocab) == 358
    assert len(de_vocab) == 381
    for i in range(10):
@@ -62,7 +67,11 @@ def test_translation_preprocess():
    for (src_max_len, tgt_max_len) in max_lens:
        data_train = TOY('train', src_lang=src_lang, tgt_lang=tgt_lang)
        data_val = TOY('val', src_lang=src_lang, tgt_lang=tgt_lang)
-        src_vocab, tgt_vocab = data_train.src_vocab, data_train.tgt_vocab
+
+        # TODO https://github.com/dmlc/gluon-nlp/issues/978
+        with warnings.catch_warnings():
+            warnings.simplefilter("ignore")
+            src_vocab, tgt_vocab = data_train.src_vocab, data_train.tgt_vocab
        data_val_processed = process_dataset(data_val, src_vocab, tgt_vocab,
                                             src_max_len, tgt_max_len)
        for (src, tgt), (preprocessed_src, preprocessed_tgt) in zip(data_val, data_val_processed):
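Accessing the TOY vocab properties currently raises a warning (tracked in gluon-nlp issue #978), so the tests silence it per call site instead of globally. A self-contained illustration of the pattern, using a hypothetical noisy function in place of the vocab accessor:

```python
import warnings

def noisy_accessor():
    # Stand-in for a property that warns, like TOY.src_vocab above.
    warnings.warn('deprecated accessor', UserWarning)
    return 42

# Under pytest's `filterwarnings = error` a bare call would fail the test;
# the scoped guard ignores warnings only inside this block.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    assert noisy_accessor() == 42
```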
8 changes: 4 additions & 4 deletions scripts/tests/test_encoder_decoder.py
@@ -24,7 +24,7 @@


def test_gnmt_encoder():
-ctx = mx.Context.default_ctx
+ctx = mx.current_context()
for cell_type in ["lstm", "gru", "relu_rnn", "tanh_rnn"]:
for num_layers, num_bi_layers in [(2, 1), (3, 0)]:
for use_residual in [False, True]:
@@ -51,7 +51,7 @@ def test_gnmt_encoder():


def test_gnmt_encoder_decoder():
-ctx = mx.Context.default_ctx
+ctx = mx.current_context()
num_hidden = 8
encoder = GNMTEncoder(cell_type="lstm", num_layers=3, num_bi_layers=1, hidden_size=num_hidden,
dropout=0.0, use_residual=True, prefix='gnmt_encoder_')
@@ -99,7 +99,7 @@ def test_gnmt_encoder_decoder():
assert(len(additional_outputs) == 0)

def test_transformer_encoder():
-ctx = mx.Context.default_ctx
+ctx = mx.current_context()
for num_layers in range(1, 3):
for output_attention in [True, False]:
for use_residual in [False, True]:
@@ -137,7 +137,7 @@ def test_transformer_encoder():
assert(len(additional_outputs) == 0)

def test_transformer_encoder_decoder():
-ctx = mx.Context.default_ctx
+ctx = mx.current_context()
units = 16
encoder = TransformerEncoder(num_layers=3, units=units, hidden_size=32, num_heads=8, max_length=10,
dropout=0.0, use_residual=True, prefix='transformer_encoder_')
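`mx.Context.default_ctx` is an internal attribute, while `mx.current_context()` is the public accessor and follows `with`-scoped context managers, so the tests pick up whatever device the harness activates. A quick sketch (assuming only `mxnet`):

```python
import mxnet as mx

print(mx.current_context())  # cpu(0) unless a context scope is active

# current_context() tracks the innermost `with` scope.
with mx.Context('cpu', 0):
    x = mx.nd.zeros((2, 2), ctx=mx.current_context())
    print(x.context)  # cpu(0)
```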
2 changes: 1 addition & 1 deletion scripts/text_classification/fasttext_word_ngram.py
@@ -285,7 +285,7 @@ def get_dataloader(train_dataset, train_data_lengths,
""" Construct the DataLoader. Pad data, stack label and lengths"""
bucket_num, bucket_ratio = 20, 0.2
batchify_fn = gluonnlp.data.batchify.Tuple(
-gluonnlp.data.batchify.Pad(axis=0, ret_length=True),
+gluonnlp.data.batchify.Pad(axis=0, pad_val=0, ret_length=True),
gluonnlp.data.batchify.Stack(dtype='float32'))
batch_sampler = gluonnlp.data.sampler.FixedBucketSampler(
train_data_lengths,
3 changes: 1 addition & 2 deletions scripts/word_embeddings/model.py
@@ -85,8 +85,7 @@ def __init__(self, token_to_idx, output_dim, batch_size, negatives_weights,
dtype=dtype)

self.negatives_sampler = nlp.data.UnigramCandidateSampler(
-weights=negatives_weights**smoothing, shape=(batch_size, ),
-dtype='int64')
+weights=negatives_weights**smoothing, dtype='int64')

def __getitem__(self, tokens):
return self.embedding[tokens]
2 changes: 1 addition & 1 deletion src/gluonnlp/__init__.py
@@ -31,7 +31,7 @@
from . import initializer
from .vocab import Vocab

-__version__ = '0.8.1.dev'
+__version__ = '0.9.0.dev'

__all__ = ['data',
'model',
12 changes: 6 additions & 6 deletions src/gluonnlp/data/batchify/batchify.py
@@ -187,7 +187,7 @@ class Pad:
>>> a = [1, 2, 3, 4]
>>> b = [4, 5, 6]
>>> c = [8, 2]
->>> bf.Pad()([a, b, c])
+>>> bf.Pad(pad_val=0)([a, b, c])
<BLANKLINE>
[[1. 2. 3. 4.]
[4. 5. 6. 0.]
@@ -197,7 +197,7 @@ class Pad:
>>> a = [1, 2, 3, 4]
>>> b = [4, 5, 6]
>>> c = [8, 2]
->>> batch, length = bf.Pad(ret_length=True)([a, b, c])
+>>> batch, length = bf.Pad(pad_val=0, ret_length=True)([a, b, c])
>>> batch
<BLANKLINE>
[[1. 2. 3. 4.]
@@ -306,7 +306,7 @@ class Tuple:
>>> a = ([1, 2, 3, 4], 0)
>>> b = ([5, 7], 1)
>>> c = ([1, 2, 3, 4, 5, 6, 7], 0)
->>> f1, f2 = bf.Tuple(bf.Pad(), bf.Stack())([a, b])
+>>> f1, f2 = bf.Tuple(bf.Pad(pad_val=0), bf.Stack())([a, b])
>>> f1
<BLANKLINE>
[[1. 2. 3. 4.]
@@ -404,7 +404,7 @@ class Dict:
>>> a = {'data': [1, 2, 3, 4], 'label': 0}
>>> b = {'data': [5, 7], 'label': 1}
>>> c = {'data': [1, 2, 3, 4, 5, 6, 7], 'label': 0}
->>> batchify_fn = Dict({'data': Pad(), 'label': Stack()})
+>>> batchify_fn = Dict({'data': Pad(pad_val=0), 'label': Stack()})
>>> sample = batchify_fn([a, b, c])
>>> sample['data']
<BLANKLINE>
@@ -474,7 +474,7 @@ class NamedTuple:
>>> a = SampleData([1, 2, 3, 4], 0)
>>> b = SampleData([5, 7], 1)
>>> c = SampleData([1, 2, 3, 4, 5, 6, 7], 0)
->>> batchify_fn = NamedTuple(SampleData, {'data': Pad(), 'label': Stack()})
+>>> batchify_fn = NamedTuple(SampleData, {'data': Pad(pad_val=0), 'label': Stack()})
>>> sample = batchify_fn([a, b, c])
>>> sample
SampleData(data=
@@ -491,7 +491,7 @@ class NamedTuple:
[1. 2. 3. 4. 5. 6. 7.]]
<NDArray 3x7 @cpu_shared(0)>
>>> # Let's consider to use a list
->>> batchify_fn = NamedTuple(SampleData, [Pad(), Stack()])
+>>> batchify_fn = NamedTuple(SampleData, [Pad(pad_val=0), Stack()])
>>> batchify_fn([a, b, c])
SampleData(data=
[[1. 2. 3. 4. 0. 0. 0.]
22 changes: 5 additions & 17 deletions src/gluonnlp/data/candidate_sampler.py
@@ -18,9 +18,6 @@

__all__ = ['UnigramCandidateSampler']

-import functools
-import operator
-
import mxnet as mx
import numpy as np

@@ -36,20 +33,15 @@ class UnigramCandidateSampler(mx.gluon.HybridBlock):
weights : mx.nd.NDArray
Unnormalized class probabilities. Samples are drawn and returned on the
same context as weights.context.
-shape : int or tuple of int
-Shape of data to be sampled.
-TODO: Specifying the shape is only a workaround until random_like
-operators are available in mxnet
dtype : str or np.dtype, default 'float32'
Data type of the candidates. Make sure that the dtype precision is
large enough to represent the size of your weights array precisely. For
example, float32 can not distinguish 2**24 from 2**24 + 1.

"""

-def __init__(self, weights, shape, dtype='float32'):
+def __init__(self, weights, dtype='float32'):
super(UnigramCandidateSampler, self).__init__()
-self._shape = shape
self._dtype = dtype
self.N = weights.size

@@ -108,8 +100,6 @@ def hybrid_forward(self, F, candidates_like, prob, alias):
----------
candidates_like: mxnet.nd.NDArray or mxnet.sym.Symbol
This input specifies the shape of the to be sampled candidates. #
-TODO shape selection is not yet supported. Shape must be specified
-in the constructor.

Returns
-------
@@ -118,15 +108,13 @@
are sampled based on the weights specified on creation of the
UnigramCandidateSampler.
"""
-flat_shape = functools.reduce(operator.mul, self._shape)
-idx = F.random.uniform(low=0, high=self.N, shape=flat_shape,
-dtype='float64').floor()
+candidates_flat = candidates_like.reshape((-1, )).astype('float64')
+idx = F.random.uniform_like(candidates_flat, low=0, high=self.N).floor()
prob = F.gather_nd(prob, idx.reshape((1, -1)))
alias = F.gather_nd(alias, idx.reshape((1, -1)))
-where = F.random.uniform(shape=flat_shape,
-dtype='float64') < prob
+where = F.random.uniform_like(candidates_flat) < prob
hit = idx * where
alt = alias * (1 - where)
-candidates = (hit + alt).reshape(self._shape)
+candidates = (hit + alt).reshape_like(candidates_like)

return candidates.astype(self._dtype)
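With the `shape` constructor argument removed, the sampler derives its output shape from `candidates_like` at call time (`uniform_like`/`reshape_like` replace the fixed-shape workaround), so a single instance serves any batch size. A hedged usage sketch against the patched class (the weights are made up):

```python
import mxnet as mx
import gluonnlp as nlp

# Unnormalized unigram counts for a 4-token vocabulary.
weights = mx.nd.array([10., 5., 3., 1.])
sampler = nlp.data.UnigramCandidateSampler(weights=weights, dtype='int64')
sampler.initialize()  # HybridBlock constants need initializing before use

# The output shape follows candidates_like rather than a fixed shape.
candidates = sampler(mx.nd.zeros((2, 3)))
print(candidates.shape)  # (2, 3); entries drawn from {0, 1, 2, 3}
```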