README.md

# Inflating Topic Relevance with Ideology: A Case Study of Political Ideology Bias in Social Topic Detection Models

This is a PyTorch implementation of the experiments described in our paper submitted to COLING 2020.

## Environment Setup

>pip install -r requirements.txt

## Training off-the-shelf NLP models and prediction inference

1) Pre-process the data.

2) Edit the configuration files in the folder "myallennlp/training_config" by adding the paths to the training and dev datasets:

>"train_data_path": TRAIN_FILE,
>"validation_data_path": DEV_FILE,

3) Run the training command:

>mkdir save
>allennlp train myallennlp/training_config/topical_extractor_\<model\>.jsonnet --serialization-dir save/\<model\> --include-package myallennlp

4) Run the inference command:

>python myallennlp/prediction.py <path_to_saved_model> --pred_file <path_to_test_file>
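
To run inference programmatically rather than through `myallennlp/prediction.py`, something along the following lines should work in an AllenNLP 0.x environment. This is only a minimal sketch, not the repository's own prediction script; the archive path and the example tweet are placeholders.

>import myallennlp  # registers "tweet_reader" and "topical_extractor"
>from allennlp.models.archival import load_archive
>from allennlp.data.dataset_readers import DatasetReader
>archive = load_archive("save/elmo/model.tar.gz")  # hypothetical path to a trained archive
>reader = DatasetReader.from_params(archive.config.pop("dataset_reader"))
>instance = reader.text_to_instance("example tweet text")
>output = archive.model.forward_on_instance(instance)
>print(output["label_probs"])  # probabilities for each relevance label

Building the reader from the archived config keeps the token indexers consistent with the ones used during training.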

## Training and testing our proposed adversarial approach

For ELMo+ADV and GloVe+ADV:

>python mymodel/main.py --base_model \<model\> --train_data_path <path_to_train_file> --val_data_path <path_to_dev_file> --test_data_path <path_to_test_file> --save_root <path_to_save_root> --train --test --lr 4e-4 --batch_size 64 --n_epoch 75

For BERT+ADV:

>python mymodel/main.py --base_model \<model\> --train_data_path <path_to_train_file> --val_data_path <path_to_dev_file> --test_data_path <path_to_test_file> --save_root <path_to_save_root> --train --test --lr 3e-5 --batch_size 32 --n_epoch 5

myallennlp/__init__.py

from myallennlp import models
from myallennlp.data import dataset_readers
from myallennlp import modules

myallennlp/data/dataset_readers/__init__.py

from myallennlp.data.dataset_readers import tweet_reader

myallennlp/data/dataset_readers/tweet_reader.py

from typing import Dict
import csv
import logging
import emoji
from overrides import overrides

from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import Field, TextField, LabelField, MetadataField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
from allennlp.data.tokenizers import Tokenizer, WordTokenizer, Token


logger = logging.getLogger(__name__)  # pylint: disable=invalid-name


@DatasetReader.register("tweet_reader")
class TweetReader(DatasetReader):
    """
    Reads a file containing tweets from both the conservative ("con") and liberal ("lib") groups.
    The data is formatted as CSV, one tweet instance per line, with three columns:
    "group", "tweet" and "label".

    Parameters
    ----------
    tokenizer : ``Tokenizer``, optional (default=``WordTokenizer()``)
        See :class:`Tokenizer`.
    token_indexers : ``Dict[str, TokenIndexer]``, optional (default=``{"tokens": SingleIdTokenIndexer()}``)
        See :class:`TokenIndexer`.
    """

    def __init__(self,
                 tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 lazy: bool = False) -> None:
        super().__init__(lazy)
        self._tokenizer = tokenizer or WordTokenizer()
        self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}

    @overrides
    def _read(self, file_path: str):
        # If `file_path` is a URL, redirect to the cache.
        file_path = cached_path(file_path)

        with open(file_path, 'r') as tweet_file:
            logger.info("Reading tweet instances from dataset at: %s", file_path)
            csv_reader = csv.reader(tweet_file, delimiter=',', quoting=csv.QUOTE_ALL, escapechar='\\')
            for row in csv_reader:
                if len(row) == 3:
                    _, tweet, label = row
                else:
                    # Rows with the wrong number of columns are skipped.
                    continue
                if label not in ['0', '1']:
                    # Rows without a valid binary label (e.g. annotator disagreement) are skipped.
                    continue
                yield self.text_to_instance(tweet, label)

    @overrides
    def text_to_instance(self,  # type: ignore
                         tweet: str,
                         label: str = None) -> Instance:
        # pylint: disable=arguments-differ
        fields: Dict[str, Field] = {}
        tokens = self._tokenizer.tokenize(tweet)
        fields['tokens'] = TextField(tokens, self._token_indexers)
        if label:
            fields['label'] = LabelField(label)
        return Instance(fields)
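
A quick way to sanity-check the reader, assuming the package layout above and a small CSV file in the three-column format described in the docstring (the path and the rows shown in the comments are made-up examples):

from myallennlp.data.dataset_readers.tweet_reader import TweetReader

# data/sample.csv is a hypothetical file with rows such as:
#   "con","Lower taxes will fix the economy #taxreform","1"
#   "lib","Had a great lunch today","0"
reader = TweetReader()
for instance in reader.read("data/sample.csv"):
    print(instance.fields["tokens"].tokens, instance.fields["label"].label)
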

myallennlp/models/__init__.py

from myallennlp.models import topical_extractor
from myallennlp.models import topical_extractor_bert

myallennlp/models/topical_extractor.py

from typing import Dict, List, Optional, Any

import torch

from allennlp.common.checks import check_dimensions_match
from allennlp.data import Vocabulary
from allennlp.modules import Seq2SeqEncoder, TimeDistributed, TextFieldEmbedder, FeedForward, InputVariationalDropout
from allennlp.models.model import Model
from allennlp.nn import InitializerApplicator, RegularizerApplicator
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits, replace_masked_values
from allennlp.training.metrics import F1Measure


@Model.register("topical_extractor")
class TopicalExtractor(Model):
    """
    This model predicts the relevance of a tweet to a specific topic, framed as a binary
    classification problem. The implementation is a simple Bi-LSTM model.

    Parameters
    ----------
    vocab : ``Vocabulary``, required
        A Vocabulary, required in order to compute sizes for input/output projections.
    text_field_embedder : ``TextFieldEmbedder``, required
        Used to embed the ``tokens`` ``TextField`` we get as input to the model.
    encoder : ``Seq2SeqEncoder``
        The encoder (with its own internal stacking) that we will use in between embedding tokens
        and the pooling and classification layers.
    output_feedforward : ``FeedForward``
        Used to transform the pooled tweet representation before prediction.
    output_logit : ``FeedForward``
        This feedforward network computes the output logits.
    dropout : ``float``, optional (default=0.5)
        Dropout percentage to use.
    initializer : ``InitializerApplicator``, optional (default=``InitializerApplicator()``)
        Used to initialize the model parameters.
    regularizer : ``RegularizerApplicator``, optional (default=``None``)
        If provided, will be used to calculate the regularization penalty during training.
    """
    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 output_feedforward: FeedForward,
                 output_logit: FeedForward,
                 dropout: float = 0.5,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super().__init__(vocab, regularizer)

        self._text_field_embedder = text_field_embedder
        self._encoder = encoder

        if dropout:
            self.dropout = torch.nn.Dropout(dropout)
            self.rnn_input_dropout = InputVariationalDropout(dropout)
        else:
            self.dropout = None
            self.rnn_input_dropout = None

        self._output_feedforward = output_feedforward
        self._output_logit = output_logit

        self._num_labels = vocab.get_vocab_size(namespace="labels")

        check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(),
                               "text field embedding dim", "encoder input dim")

        self._f1 = F1Measure(positive_label=vocab.get_token_index("1", namespace="labels"))
        self._loss = torch.nn.CrossEntropyLoss()

        initializer(self)

    def forward(self,  # type: ignore
                tokens: Dict[str, torch.LongTensor],
                label: torch.IntTensor = None) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        tokens : Dict[str, torch.LongTensor], required
            The output of ``TextField.as_array()``, which should typically be passed directly to a
            ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
            tensors. At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
            Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
            for the ``TokenIndexers`` when you created the ``TextField`` representing your
            sequence. The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
            which knows how to combine different word representations into a single vector per
            token in your input.
        label : torch.IntTensor, optional (default = None)
            From a ``LabelField``.

        Returns
        -------
        An output dictionary consisting of:

        label_logits : torch.FloatTensor
            A tensor of shape ``(batch_size, num_labels)`` representing unnormalised log
            probabilities of the relevance label.
        label_probs : torch.FloatTensor
            A tensor of shape ``(batch_size, num_labels)`` representing probabilities of the
            relevance label.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.
        """
        embedded_tweet = self._text_field_embedder(tokens)
        mask = get_text_field_mask(tokens).float()

        # Apply dropout to the LSTM input.
        if self.rnn_input_dropout:
            embedded_tweet = self.rnn_input_dropout(embedded_tweet)

        # Encode the tweet: (batch_size, tweet_length, hidden_dim).
        encoded_tweet = self._encoder(embedded_tweet, mask)

        # The pooling layer -- max pooling over time, ignoring masked positions.
        # (batch_size, model_dim)
        encode_max, _ = replace_masked_values(encoded_tweet, mask.unsqueeze(-1), -1e7).max(dim=1)

        output_hidden = self._output_feedforward(encode_max)
        label_logits = self._output_logit(output_hidden)
        label_probs = torch.nn.functional.softmax(label_logits, dim=-1)

        output_dict = {"label_logits": label_logits, "label_probs": label_probs}

        if label is not None:
            loss = self._loss(label_logits, label.long().view(-1))
            self._f1(label_logits, label)
            output_dict["loss"] = loss

        return output_dict

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        precision, recall, fscore = self._f1.get_metric(reset)
        return {'precision': precision,
                'recall': recall,
                'fscore': fscore}
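
The max-pooling step in `forward` relies on `replace_masked_values` pushing padded positions to a large negative value so they can never win the max. A small self-contained illustration (the tensor sizes are arbitrary):

import torch
from allennlp.nn.util import replace_masked_values

# Two tweets of lengths 3 and 2, padded to length 4, with hidden size 3.
encoded = torch.randn(2, 4, 3)
mask = torch.tensor([[1., 1., 1., 0.],
                     [1., 1., 0., 0.]])

# Padded positions are set to -1e7 before max-pooling over time,
# so the pooled vector reflects only real tokens.
pooled, _ = replace_masked_values(encoded, mask.unsqueeze(-1), -1e7).max(dim=1)
print(pooled.shape)  # torch.Size([2, 3])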