Commit

add code
MeiqiGuo committed Oct 26, 2020
1 parent 082db27 commit b533ca4
Showing 22 changed files with 1,366 additions and 0 deletions.
36 changes: 36 additions & 0 deletions README.md
@@ -0,0 +1,36 @@
# Inflating Topic Relevance with Ideology: A Case Study of Political Ideology Bias in Social Topic Detection Models

This is a PyTorch implementation of the experiments described in our paper submitted to COLING 2020.

## Environment Setup

>pip install -r requirements.txt

## Training off-the-shelf NLP models and running prediction inference

1) Pre-process the data into the CSV format expected by the dataset reader (an illustrative sample is shown after these steps);

2) Edit the configuration files in the folder "myallennlp/training_config" by adding the paths to the training and dev datasets:

>"train_data_path": TRAIN_FILE,
>"validation_data_path": DEV_FILE,

3) Run the training command:

>mkdir save
>allennlp train myallennlp/training_config/topical_extractor_\<model\>.jsonnet --serialization-dir save/\<model\> --include-package myallennlp

4) Run the inference command:

>python myallennlp/prediction.py <path_to_saved_model> --pred_file <path_to_test_file>
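
The dataset reader included in this commit (`myallennlp/data/dataset_readers/tweet_reader.py`) expects a comma-separated file with three columns per row: "group" ("con" or "lib"), "tweet", and a binary "label" ("0" or "1"). The rows below are an illustrative sketch only, not data from the paper:

>"con","an example tweet mentioning the target topic","1"
>"lib","another example tweet","0"
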
## Training and testing our proposed adversarial approach

For ELMo+ADV and GloVe+ADV:

>python mymodel/main.py --base_model \<model\> --train_data_path <path_to_train_file> --val_data_path <path_to_dev_file> --test_data_path <path_to_test_file> --save_root <path_to_save_root> --train --test --lr 4e-4 --batch_size 64 --n_epoch 75

For BERT+ADV:

>python mymodel/main.py --base_model \<model\> --train_data_path <path_to_train_file> --val_data_path <path_to_dev_file> --test_data_path <path_to_test_file> --save_root <path_to_save_root> --train --test --lr 3e-5 --batch_size 32 --n_epoch 5
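
As a concrete illustration, a hypothetical ELMo+ADV run could look like the command below; the `elmo` value for `--base_model` and all file paths are placeholders chosen for this example, not values taken from the repository:

>python mymodel/main.py --base_model elmo --train_data_path data/train.csv --val_data_path data/dev.csv --test_data_path data/test.csv --save_root save/elmo_adv --train --test --lr 4e-4 --batch_size 64 --n_epoch 75  # hypothetical model name and paths
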
Binary file added myallennlp/.DS_Store
Binary file not shown.
3 changes: 3 additions & 0 deletions myallennlp/__init__.py
@@ -0,0 +1,3 @@
from myallennlp import models
from myallennlp.data import dataset_readers
from myallennlp import modules
Binary file added myallennlp/data/.DS_Store
Binary file not shown.
1 change: 1 addition & 0 deletions myallennlp/data/__init__.py
@@ -0,0 +1 @@
from myallennlp.data.dataset_readers import tweet_reader
Empty file.
72 changes: 72 additions & 0 deletions myallennlp/data/dataset_readers/tweet_reader.py
@@ -0,0 +1,72 @@
from typing import Dict
import csv
import logging
import emoji
from overrides import overrides

from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import Field, TextField, LabelField, MetadataField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
from allennlp.data.tokenizers import Tokenizer, WordTokenizer, Token


logger = logging.getLogger(__name__) # pylint: disable=invalid-name


@DatasetReader.register("tweet_reader")
class TweetReader(DatasetReader):
"""
Reads a file containing tweets from both conservative ("con") and liberal ("lib") groups.
The data is formatted as CSV, with one tweet instance per line and three columns:
"group", "tweet" and "label".
Parameters
----------
tokenizer : ``Tokenizer``, optional (default=``WordTokenizer()``)
See :class:`Tokenizer`.
token_indexers : ``Dict[str, TokenIndexer]``, optional (default=``{"tokens": SingleIdTokenIndexer()}``)
See :class:`TokenIndexer`.
"""

def __init__(self,
tokenizer: Tokenizer = None,
token_indexers: Dict[str, TokenIndexer] = None,
lazy: bool = False) -> None:
super().__init__(lazy)
self._tokenizer = tokenizer or WordTokenizer()
self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}

@overrides
def _read(self, file_path: str):
# if `file_path` is a URL, redirect to the cache
file_path = cached_path(file_path)

with open(file_path, 'r') as tweet_file:
logger.info("Reading tweet instances from dataset at: %s", file_path)
csv_reader = csv.reader(tweet_file, delimiter=',', quoting=csv.QUOTE_ALL, escapechar='\\')
for row in csv_reader:
if len(row) == 3:
_, tweet, label = row
else:
# These are cases where the row has the wrong number of columns; we'll just skip them.
continue
if label not in ['0', '1']:
# These are cases where the label is not a valid binary label (e.g. annotator disagreement); we'll just skip them.
continue
yield self.text_to_instance(tweet, label)

@overrides
def text_to_instance(self, # type: ignore
tweet: str,
label: str = None) -> Instance:
# pylint: disable=arguments-differ
fields: Dict[str, Field] = {}
tokens = self._tokenizer.tokenize(tweet)
fields['tokens'] = TextField(tokens, self._token_indexers)
if label:
fields['label'] = LabelField(label)
return Instance(fields)



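For reference, a minimal usage sketch of this reader (it assumes an AllenNLP 0.x environment, where `WordTokenizer` and the `lazy` constructor argument are available; the file path is hypothetical):

from myallennlp.data.dataset_readers.tweet_reader import TweetReader

# Instantiate with the default WordTokenizer and single-id token indexer,
# then read a local CSV of (group, tweet, label) rows.
reader = TweetReader()
instances = list(reader.read("data/dev.csv"))  # hypothetical path

# Each Instance holds a TextField under "tokens" and, when a label is present,
# a LabelField under "label".
print(instances[0].fields["tokens"])
print(instances[0].fields["label"])
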
2 changes: 2 additions & 0 deletions myallennlp/models/__init__.py
@@ -0,0 +1,2 @@
from myallennlp.models import topical_extractor
from myallennlp.models import topical_extractor_bert
137 changes: 137 additions & 0 deletions myallennlp/models/topical_extractor.py
@@ -0,0 +1,137 @@
from typing import Dict, List, Optional, Any

import torch

from allennlp.common.checks import check_dimensions_match
from allennlp.data import Vocabulary
from allennlp.modules import Seq2SeqEncoder, TimeDistributed, TextFieldEmbedder, FeedForward, InputVariationalDropout
from allennlp.models.model import Model
from allennlp.nn import InitializerApplicator, RegularizerApplicator
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits, replace_masked_values
from allennlp.training.metrics import F1Measure


@Model.register("topical_extractor")
class TopicalExtractor(Model):
"""
This model predicts the relevance of a tweet to a specific topic, cast as a binary classification problem.
Specifically, the implementation is a simple Bi-LSTM model.
Parameters
----------
vocab : ``Vocabulary``, required
A Vocabulary, required in order to compute sizes for input/output projections.
text_field_embedder : ``TextFieldEmbedder``, required
Used to embed the ``tokens`` ``TextField`` we get as input to the model.
encoder : ``Seq2SeqEncoder``
The encoder (with its own internal stacking) that we will use in between embedding tokens
and the pooling/classification layers.
output_feedforward : ``FeedForward``
Used to transform the max-pooled tweet encoding before the output logits are computed.
output_logit : ``FeedForward``
This feedforward network computes the output logits.
dropout : ``float``, optional (default=0.5)
Dropout percentage to use.
initializer : ``InitializerApplicator``, optional (default=``InitializerApplicator()``)
Used to initialize the model parameters.
regularizer : ``RegularizerApplicator``, optional (default=``None``)
If provided, will be used to calculate the regularization penalty during training.
"""
def __init__(self, vocab: Vocabulary,
text_field_embedder: TextFieldEmbedder,
encoder: Seq2SeqEncoder,
output_feedforward: FeedForward,
output_logit: FeedForward,
dropout: float = 0.5,
initializer: InitializerApplicator = InitializerApplicator(),
regularizer: Optional[RegularizerApplicator] = None) -> None:
super().__init__(vocab, regularizer)

self._text_field_embedder = text_field_embedder
self._encoder = encoder

if dropout:
self.dropout = torch.nn.Dropout(dropout)
self.rnn_input_dropout = InputVariationalDropout(dropout)
else:
self.dropout = None
self.rnn_input_dropout = None

self._output_feedforward = output_feedforward
self._output_logit = output_logit

self._num_labels = vocab.get_vocab_size(namespace="labels")

check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(),
"text field embedding dim", "encoder input dim")

self._f1 = F1Measure(positive_label=vocab._token_to_index["labels"]["1"])
self._loss = torch.nn.CrossEntropyLoss()

initializer(self)


def forward(self, # type: ignore
tokens: Dict[str, torch.LongTensor],
label: torch.IntTensor = None) -> Dict[str, torch.Tensor]:
# pylint: disable=arguments-differ
"""
Parameters
----------
tokens : Dict[str, torch.LongTensor], required
The output of ``TextField.as_array()``, which should typically be passed directly to a
``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
tensors. At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
for the ``TokenIndexers`` when you created the ``TextField`` representing your
sequence. The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
which knows how to combine different word representations into a single vector per
token in your input.
label : torch.IntTensor, optional (default = None)
From a ``LabelField``
Returns
-------
An output dictionary consisting of:
label_logits : torch.FloatTensor
A tensor of shape ``(batch_size, num_labels)`` representing unnormalised log
probabilities of the relevance label.
label_probs : torch.FloatTensor
A tensor of shape ``(batch_size, num_labels)`` representing probabilities of the
relevance label.
loss : torch.FloatTensor, optional
A scalar loss to be optimised.
"""
embedded_tweet = self._text_field_embedder(tokens)
mask = get_text_field_mask(tokens).float()

# apply dropout for LSTM
if self.rnn_input_dropout:
embedded_tweet = self.rnn_input_dropout(embedded_tweet)

# encode tweet, (batch_size, tweet_length, hidden_dim)
encoded_tweet = self._encoder(embedded_tweet, mask)

# The pooling layer -- max pooling.
# (batch_size, model_dim)
encode_max, _ = replace_masked_values(encoded_tweet, mask.unsqueeze(-1), -1e7).max(dim=1)

output_hidden = self._output_feedforward(encode_max)
label_logits = self._output_logit(output_hidden)
label_probs = torch.nn.functional.softmax(label_logits, dim=-1)

output_dict = {"label_logits": label_logits, "label_probs": label_probs}

if label is not None:
loss = self._loss(label_logits, label.long().view(-1))
self._f1(label_logits, label)
output_dict["loss"] = loss

return output_dict

def get_metrics(self, reset: bool = False) -> Dict[str, float]:
metrics = self._f1.get_metric(reset)
return {'precision': metrics[0],
'recall': metrics[1],
'fscore': metrics[2]}

