Automatically generated by Mendeley Desktop 1.17.9
Any changes to this file will be lost if it is regenerated by Mendeley.
BibTeX export options can be customized via Options -> BibTeX in Mendeley Desktop
@article{Tai2015,
abstract = {Because of their superior ability to preserve sequence information over time, Long Short-Term Memory (LSTM) networks, a type of recurrent neural network with a more complex computational unit, have obtained strong results on a variety of sequence modeling tasks. The only underlying LSTM structure that has been explored so far is a linear chain. However, natural language exhibits syntactic properties that would naturally combine words to phrases. We introduce the Tree-LSTM, a generalization of LSTMs to tree-structured network topologies. Tree-LSTMs outperform all existing systems and strong LSTM baselines on two tasks: predicting the semantic relatedness of two sentences (SemEval 2014, Task 1) and sentiment classification (Stanford Sentiment Treebank).},
archivePrefix = {arXiv},
arxivId = {1503.00075},
author = {Tai, Kai Sheng and Socher, Richard and Manning, Christopher D.},
eprint = {1503.00075},
file = {:home/vasilis/Downloads/1503.00075.pdf:pdf},
isbn = {9781941643723},
journal = {Proceedings of ACL},
pages = {1556--1566},
title = {{Improved Semantic Representations From Tree-Structured Long Short-Term Memory Networks}},
url = {http://arxiv.org/abs/1503.00075},
year = {2015}
}
@article{Schmidt2006,
abstract = {Model-driven engineering technologies offer a promising approach to address the inability of third-generation languages to alleviate the complexity of platforms and express domain concepts effectively. Over the past five decades, software researchers and developers have been creating abstractions that help them program in terms of their design intent rather than the underlying computing environment—for example, CPU, memory, and network devices—and shield them from the complexities of these environments. From the early days of computing, these abstractions included both language and platform technologies. For example, early programming languages, such as assembly and Fortran, shielded developers from complexities of programming with machine code. Likewise, early operating system platforms, such as OS/360 and Unix, shielded developers from complexities of programming directly to hardware. Although these early languages and platforms raised the level of abstraction, they still had a distinct "computing-oriented" focus. In particular, they provided abstractions of the solution space—that is, the domain of computing technologies themselves—rather than abstractions of the problem space that express designs in terms of concepts in application domains, such as telecom, aerospace, healthcare, insurance, and biology. LESSONS FROM COMPUTER-AIDED SOFTWARE ENGINEERING Various past efforts have created technologies that further elevated the level of abstraction used to develop software. One prominent effort begun in the 1980s was computer-aided software engineering (CASE), which focused on developing software methods and tools that enabled developers to express their designs in terms of general-purpose graphical programming representations, such as state machines, structure diagrams, and dataflow diagrams. One goal of CASE was to enable more thorough analysis of graphical programs that incur less complexity than conventional general-purpose programming languages—for example, by avoiding memory corruption and leaks associated with languages like C. Another goal was to synthesize implementation artifacts from graphical representations to reduce the effort of manually coding, debugging, and porting programs. Although CASE attracted considerable attention in the research and trade literature, it wasn't widely adopted in practice. One problem it faced was that the general-purpose graphical language representations for writing programs in CASE tools mapped poorly onto the underlying platforms, which were largely single-node operating systems—such as DOS, OS/2, or Windows—that lacked support for important quality-of-service (QoS) properties, such as transparent distribution, fault tolerance, and security. The amount and complexity of generated code needed to compensate for the paucity of the underlying platforms was beyond the grasp of translation technologies available at the time, which made it hard to develop, debug, and evolve CASE tools and applications created with these tools. Another problem with CASE was its inability to scale to handle complex, production-scale systems in a broad range of application domains. In general, CASE tools did not support concurrent engineering, so they were limited to programs written by a single person or by a team that serialized their access to files used by these tools. 
Moreover, due to a lack of powerful common middleware platforms, CASE tools targeted proprietary execution environments, which made it hard to integrate the code they generated with other software language and platform technologies. CASE tools also didn't support many application domains effectively because their "one-size-fits-all" graphical representations were too generic and noncustomizable. As a result, CASE had relatively little impact on commercial software development during the 1980s and 1990s, focusing primarily on a few domains, such as telecom call processing, that mapped nicely onto state machine representations. To},
author = {Schmidt, Douglas C},
doi = {10.1109/MC.2006.58},
file = {:home/vasilis/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Schmidt - 2006 - Model-Driven Engineering.pdf:pdf},
issn = {0018-9162},
journal = {IEEE Computer},
number = {2},
pages = {25--31},
title = {{Model-Driven Engineering}},
url = {http://www.computer.org/portal/site/computer/menuitem.e533b16739f5...},
volume = {39},
year = {2006}
}
@article{Parisotto2017,
abstract = {Recent years have seen the proposal of a number of neural architectures for the problem of Program Induction. Given a set of input-output examples, these architectures are able to learn mappings that generalize to new test inputs. While achieving impressive results, these approaches have a number of important limitations: (a) they are computationally expensive and hard to train, (b) a model has to be trained for each task (program) separately, and (c) it is hard to interpret or verify the correctness of the learnt mapping (as it is defined by a neural network). In this paper, we propose a novel technique, Neuro-Symbolic Program Synthesis, to overcome the above-mentioned problems. Once trained, our approach can automatically construct computer programs in a domain-specific language that are consistent with a set of input-output examples provided at test time. Our method is based on two novel neural modules. The first module, called the cross correlation I/O network, given a set of input-output examples, produces a continuous representation of the set of I/O examples. The second module, the Recursive-Reverse-Recursive Neural Network (R3NN), given the continuous representation of the examples, synthesizes a program by incrementally expanding partial programs. We demonstrate the effectiveness of our approach by applying it to the rich and complex domain of regular expression based string transformations. Experiments show that the R3NN model is not only able to construct programs from new input-output examples, but it is also able to construct new programs for tasks that it had never observed before during training.},
archivePrefix = {arXiv},
arxivId = {1611.01855},
author = {Parisotto, Emilio and Mohamed, Abdel-Rahman and Singh, Rishabh and Li, Lihong and Zhou, Dengyong and Kohli, Pushmeet},
eprint = {1611.01855},
file = {:home/vasilis/Downloads/1611.01855v1.pdf:pdf},
pages = {1--14},
title = {{Neuro-Symbolic Program Synthesis}},
url = {https://www.microsoft.com/en-us/research/wp-content/uploads/2017/03/neural{\_}flashfill{\_}iclr.pdf},
year = {2017}
}
@inproceedings{Martens2011,
author = {Sutskever, Ilya and Martens, James and Hinton, Geoffrey},
booktitle = {ICML 2011},
file = {:home/vasilis/Downloads/ICML2011Sutskever{\_}524.pdf:pdf},
keywords = {LSTM,RNN},
title = {{Generating Text with Recurrent Neural Networks}},
url = {http://www.icml-2011.org/papers/524{\_}icmlpaper.pdf},
year = {2011}
}
@article{Yin2017,
abstract = {We consider the problem of parsing natural language descriptions into source code written in a general-purpose programming language like Python. Existing data-driven methods treat this problem as a language generation task without considering the underlying syntax of the target programming language. Informed by previous work in semantic parsing, in this paper we propose a novel neural architecture powered by a grammar model to explicitly capture the target syntax as prior knowledge. Experiments find this an effective way to scale up to generation of complex programs from natural language descriptions, achieving state-of-the-art results that well outperform previous code generation and semantic parsing approaches.},
archivePrefix = {arXiv},
arxivId = {1704.01696},
author = {Yin, Pengcheng and Neubig, Graham},
eprint = {1704.01696},
file = {:tmp/mozilla{\_}vasilis0/1704.01696v1.pdf:pdf},
title = {{A Syntactic Neural Model for General-Purpose Code Generation}},
url = {http://arxiv.org/abs/1704.01696},
year = {2017}
}
@article{Bahdanau2014,
abstract = {Neural machine translation is a recently proposed approach to machine translation. Unlike the traditional statistical machine translation, the neural machine translation aims at building a single neural network that can be jointly tuned to maximize the translation performance. The models proposed recently for neural machine translation often belong to a family of encoder-decoders and consists of an encoder that encodes a source sentence into a fixed-length vector from which a decoder generates a translation. In this paper, we conjecture that the use of a fixed-length vector is a bottleneck in improving the performance of this basic encoder-decoder architecture, and propose to extend this by allowing a model to automatically (soft-)search for parts of a source sentence that are relevant to predicting a target word, without having to form these parts as a hard segment explicitly. With this new approach, we achieve a translation performance comparable to the existing state-of-the-art phrase-based system on the task of English-to-French translation. Furthermore, qualitative analysis reveals that the (soft-)alignments found by the model agree well with our intuition.},
archivePrefix = {arXiv},
arxivId = {1409.0473},
author = {Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua},
eprint = {1409.0473},
file = {:home/vasilis/Downloads/bahdanau.pdf:pdf},
pages = {1--15},
title = {{Neural Machine Translation by Jointly Learning to Align and Translate}},
url = {http://arxiv.org/abs/1409.0473},
year = {2014}
}
@article{LeCun2015,
abstract = {Deep learning allows computational models that are composed of multiple processing layers to learn representations of data with multiple levels of abstraction. These methods have dramatically improved the state-of-the-art in speech recognition, visual object recognition, object detection and many other domains such as drug discovery and genomics. Deep learning discovers intricate structure in large data sets by using the backpropagation algorithm to indicate how a machine should change its internal parameters that are used to compute the representation in each layer from the representation in the previous layer. Deep convolutional nets have brought about breakthroughs in processing images, video, speech and audio, whereas recurrent nets have shone light on sequential data such as text and speech.},
author = {LeCun, Yann and Bengio, Yoshua and Hinton, Geoffrey},
doi = {10.1038/nature14539},
file = {:home/vasilis/Desktop/deep-learning-nature2015.pdf:pdf},
issn = {0028-0836},
journal = {Nature},
number = {7553},
pages = {436--444},
title = {{Deep learning}},
url = {http://dx.doi.org/10.1038/nature14539},
volume = {521},
year = {2015}
}
@article{Joulin2015,
abstract = {Despite the recent achievements in machine learning, we are still very far from achieving real artificial intelligence. In this paper, we discuss the limitations of standard deep learning approaches and show that some of these limitations can be overcome by learning how to grow the complexity of a model in a structured way. Specifically, we study the simplest sequence prediction problems that are beyond the scope of what is learnable with standard recurrent networks, algorithmically generated sequences which can only be learned by models which have the capacity to count and to memorize sequences. We show that some basic algorithms can be learned from sequential data using a recurrent network associated with a trainable memory.},
archivePrefix = {arXiv},
arxivId = {arXiv:1503.01007v4},
author = {Joulin, Armand and Mikolov, Tomas},
eprint = {arXiv:1503.01007v4},
file = {:tmp/mozilla{\_}vasilis0/1503.01007.pdf:pdf},
journal = {arXiv},
pages = {1--10},
title = {{Inferring Algorithmic Patterns with Stack-Augmented Recurrent Nets}},
year = {2015}
}
@article{Pascanu2012,
abstract = {There are two widely known issues with properly training Recurrent Neural Networks, the vanishing and the exploding gradient problems detailed in Bengio et al. (1994). In this paper we attempt to improve the understanding of the underlying issues by exploring these problems from an analytical, a geometric and a dynamical systems perspective. Our analysis is used to justify a simple yet effective solution. We propose a gradient norm clipping strategy to deal with exploding gradients and a soft constraint for the vanishing gradients problem. We validate empirically our hypothesis and proposed solutions in the experimental section.},
archivePrefix = {arXiv},
arxivId = {1211.5063},
author = {Pascanu, Razvan and Mikolov, Tomas and Bengio, Yoshua},
eprint = {1211.5063},
file = {:home/vasilis/Desktop/pascanu13.pdf:pdf},
title = {{On the difficulty of training Recurrent Neural Networks}},
url = {http://arxiv.org/abs/1211.5063},
year = {2012}
}
@article{Biermann1985,
abstract = {Ten methodologies for automatic program construction are presented, discussed and compared. Some of the techniques generate code from formal input-output specifications while others work from examples of the target behaviour or from natural language input.},
author = {Biermann, Alan W.},
doi = {10.1016/S0747-7171(85)80010-9},
file = {:home/vasilis/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Biermann - 1985 - Automatic programming A tutorial on formal methodologies.pdf:pdf},
issn = {07477171},
journal = {Journal of Symbolic Computation},
number = {2},
pages = {119--142},
title = {{Automatic programming: A tutorial on formal methodologies}},
url = {http://www.sciencedirect.com/science/article/pii/S0747717185800109},
volume = {1},
year = {1985}
}
@article{Sukhbaatar2015,
abstract = {We introduce a neural network with a recurrent attention model over a possibly large external memory. The architecture is a form of Memory Network (Weston et al., 2015) but unlike the model in that work, it is trained end-to-end, and hence requires significantly less supervision during training, making it more generally applicable in realistic settings. It can also be seen as an extension of RNNsearch to the case where multiple computational steps (hops) are performed per output symbol. The flexibility of the model allows us to apply it to tasks as diverse as (synthetic) question answering and to language modeling. For the former our approach is competitive with Memory Networks, but with less supervision. For the latter, on the Penn TreeBank and Text8 datasets our approach demonstrates comparable performance to RNNs and LSTMs. In both cases we show that the key concept of multiple computational hops yields improved results.},
archivePrefix = {arXiv},
arxivId = {1503.08895},
author = {Sukhbaatar, Sainbayar and Szlam, Arthur and Weston, Jason and Fergus, Rob},
eprint = {1503.08895},
file = {:home/vasilis/Downloads/1503.08895.pdf:pdf},
issn = {10495258},
journal = {NIPS},
pages = {1--11},
title = {{End-To-End Memory Networks}},
url = {http://arxiv.org/abs/1503.08895},
year = {2015}
}
@article{Liu2016,
archivePrefix = {arXiv},
arxivId = {arXiv:1605.05101v1},
author = {Liu, Pengfei and Qiu, Xipeng and Huang, Xuanjing},
eprint = {arXiv:1605.05101v1},
file = {:home/vasilis/.local/share/data/Mendeley Ltd./Mendeley Desktop/Downloaded/Liu, Qiu, Huang - 2016 - Recurrent Neural Network for Text Classification.pdf:pdf},
title = {{Recurrent Neural Network for Text Classification}},
year = {2016}
}
@article{Bergstra2012,
author = {Bergstra, James and Bengio, Yoshua},
file = {:home/vasilis/Downloads/bergstra12a.pdf:pdf},
journal = {Journal of Machine Learning Research},
keywords = {deep learning,global optimization,model selection,neural networks,response surface modeling},
title = {{Random Search for Hyper-Parameter Optimization}},
year = {2012}
}
@article{Graves2013,
abstract = {This paper shows how Long Short-term Memory recurrent neural networks can be used to generate complex sequences with long-range structure, simply by predicting one data point at a time. The approach is demonstrated for text (where the data are discrete) and online handwriting (where the data are real-valued). It is then extended to handwriting synthesis by allowing the network to condition its predictions on a text sequence. The resulting system is able to generate highly realistic cursive handwriting in a wide variety of styles.},
archivePrefix = {arXiv},
arxivId = {arXiv:1308.0850v5},
author = {Graves, Alex},
eprint = {arXiv:1308.0850v5},
file = {:home/vasilis/Dropbox/2015-10 Bountris - RNN source code/Resources/Generating Sequences With RNNs - Alex Graves.pdf:pdf},
journal = {arXiv},
pages = {1--43},
title = {{Generating Sequences with Recurrent Neural Networks}},
url = {http://arxiv.org/abs/1308.0850},
year = {2013}
}
@article{Sutskever,
archivePrefix = {arXiv},
arxivId = {1409.3215},
author = {Sutskever, Ilya and Vinyals, Oriol and Le, Quoc V.},
eprint = {1409.3215},
file = {:tmp/mozilla{\_}vasilis0/1409.3215v3.pdf:pdf},
journal = {NIPS},
pages = {1--9},
title = {{Sequence to sequence learning with neural networks}},
url = {http://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural},
year = {2014}
}
@article{Goodfellow2014,
archivePrefix = {arXiv},
arxivId = {arXiv:1406.2661v1},
author = {Goodfellow, Ian and Pouget-Abadie, Jean and Mirza, Mehdi and Warde-Farley, David and Ozair, Sherjil and Courville, Aaron and Bengio, Yoshua},
eprint = {arXiv:1406.2661v1},
file = {:home/vasilis/Desktop/1406.2661.pdf:pdf},
journal = {NIPS},
title = {{Generative Adversarial Nets}},
year = {2014}
}
@article{Srivastava2014,
abstract = {Deep neural nets with a large number of parameters are very powerful machine learning systems. However, overfitting is a serious problem in such networks. Large networks are also slow to use, making it difficult to deal with overfitting by combining the predictions of many different large neural nets at test time. Dropout is a technique for addressing this problem. The key idea is to randomly drop units (along with their connections) from the neural network during training. This prevents units from co-adapting too much. During training, dropout samples from an exponential number of different " thinned " networks. At test time, it is easy to approximate the effect of averaging the predictions of all these thinned networks by simply using a single unthinned network that has smaller weights. This significantly reduces overfitting and gives major improvements over other regularization methods. We show that dropout improves the performance of neural networks on supervised learning tasks in vision, speech recognition, document classification and computational biology, obtaining state-of-the-art results on many benchmark data sets.},
author = {Srivastava, Nitish and Hinton, Geoffrey and Krizhevsky, Alex and Sutskever, Ilya and Salakhutdinov, Ruslan},
file = {:home/vasilis/Downloads/ADOBE{\_}ILLUSTRATOR{\_}CC{\_}V2015{\_}MULTI{\_}WIN64-XFORCE/srivastava14a.pdf:pdf;:home/vasilis/Downloads/JMLRdropout.pdf:pdf},
issn = {15337928},
journal = {Journal of Machine Learning Research},
keywords = {deep learning,model combination,neural networks,regularization},
pages = {1929--1958},
title = {{Dropout: A Simple Way to Prevent Neural Networks from Overfitting}},
volume = {15},
year = {2014}
}
@article{Murali2017,
abstract = {We present a data-driven approach to the problem of inductive computer program synthesis. Our method learns a probabilistic model for real-world programs from a corpus of existing code. It uses this model during synthesis to automatically infer a posterior distribution over sketches, or syntactic models of the problem to be synthesized. Sketches sampled from this posterior are then used to drive combinatorial synthesis of a program in a high-level programming language. The key technical innovation of our approach --- embodied in a system called Bayou --- is utilizing user-supplied evidence as to the program's desired behavior, along with a Bayesian update, to obtain a posterior distribution over the program's true, latent specification (indicating user intent), which in turn produces a posterior over possible sketches. As we show experimentally, explicitly modeling uncertainty in specification significantly increases the accuracy of the synthesis algorithm. We evaluate Bayou's ability to synthesize Java and Android methods. We find that using just a few example API sequences to communicate user intent, Bayou can synthesize complex method bodies, some implementing tasks never encountered during training.},
archivePrefix = {arXiv},
arxivId = {1703.05698},
author = {Murali, Vijayaraghavan and Chaudhuri, Swarat and Jermaine, Chris},
eprint = {1703.05698},
file = {:home/vasilis/Downloads/1703.05698.pdf:pdf},
title = {{Bayesian Sketch Learning for Program Synthesis}},
url = {http://arxiv.org/abs/1703.05698},
year = {2017}
}
@article{SutskeverThesis,
abstract = {Recurrent Neural Networks (RNNs) are powerful sequence models that were believed to be difficult to train, and as a result they were rarely used in machine learning applications. This thesis presents methods that overcome the difficulty of training RNNs, and applications of RNNs to challenging problems. We first describe a new probabilistic sequence model that combines Restricted Boltzmann Machines and RNNs. The new model is more powerful than similar models while being less difficult to train. Next, we present a new variant of the Hessian-free (HF) optimizer and show that it can train RNNs on tasks that have extreme long-range temporal dependencies, which were previously considered to be impossibly hard. We then apply HF to character-level language modelling and get excellent results. We also apply HF to optimal control and obtain RNN control laws that can successfully operate under conditions of delayed feedback and unknown disturbances. Finally, we describe a random parameter initialization scheme that allows gradient descent with momentum to train RNNs on problems with long-term dependencies. This directly contradicts widespread beliefs about the inability of first-order methods to do so, and suggests that previous attempts at training RNNs failed partly due to flaws in the random initialization.},
author = {Sutskever, Ilya},
file = {:home/vasilis/Dropbox/2015-10 Bountris - RNN source code/Resources/Trainning RNNs - Sutskever's Thesis.pdf:pdf},
isbn = {978-0-499-22066-0},
journal = {PhD thesis},
pages = {101},
title = {{Training Recurrent Neural Networks}},
year = {2013}
}
@article{Hochreiter1997,
abstract = {Learning to store information over extended time intervals by recurrent backpropagation takes a very long time, mostly because of insufficient, decaying error backflow. We briefly review Hochreiter's (1991) analysis of this problem, then address it b...},
author = {Hochreiter, Sepp and Schmidhuber, J{\"{u}}rgen},
doi = {10.1162/neco.1997.9.8.1735},
file = {:tmp/mozilla{\_}vasilis0/Bobby{\_}paper1.pdf:pdf},
issn = {0899-7667},
journal = {Neural Computation},
number = {8},
pages = {1735--1780},
pmid = {9377276},
title = {{Long short-term memory}},
url = {http://www.mitpressjournals.org/doi/abs/10.1162/neco.1997.9.8.1735},
volume = {9},
year = {1997}
}
@article{Parnas1985,
abstract = {A former member of the SDIO Panel on Computing in Support of Battle Management explains why he believes the “Star Wars” effort will not achieve its stated goals.},
author = {Parnas, David Lorge},
doi = {10.1145/382288.382289},
issn = {01635948},
journal = {ACM SIGSOFT Software Engineering Notes},
number = {12},
pages = {15--23},
title = {{Software aspects of strategic defense systems}},
volume = {10},
year = {1985}
}