Commit

add code
MeiqiGuo committed Oct 26, 2020
1 parent 082db27 commit b533ca4
Showing 22 changed files with 1,366 additions and 0 deletions.
36 changes: 36 additions & 0 deletions README.md
@@ -0,0 +1,36 @@
# Inflating Topic Relevance with Ideology: A Case Study of Political Ideology Bias in Social Topic Detection Models

This is a PyTorch implementation of the experiments described in our paper submitted to COLING 2020.

## Environment Setup

>pip install -r requirements.txt

## Training off-the-shelf NLP models and running prediction inference

1) Pre-process the data into the CSV format expected by the dataset reader (an illustrative sample is shown after these steps);

2) Edit the configuration files in the folder "myallennlp/training_config" by adding the paths to the training and dev datasets:

>"train_data_path": TRAIN_FILE,
>"validation_data_path": DEV_FILE,

3) Run the training command:

>mkdir save
>allennlp train myallennlp/training_config/topical_extractor_\<model\>.jsonnet --serialization-dir save/\<model\> --include-package myallennlp

4) Run the inference command:

>python myallennlp/prediction.py <path_to_saved_model> --pred_file <path_to_test_file>
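
The dataset reader included in this commit (`myallennlp/data/dataset_readers/tweet_reader.py`) expects a comma-separated file with three columns per row: "group" ("con" or "lib"), "tweet", and a binary "label" ("0" or "1"). The rows below are an illustrative sketch only, not data from the paper:

>"con","an example tweet mentioning the target topic","1"
>"lib","another example tweet","0"
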
## Training and testing our proposed adversarial approach

For ELMo+ADV and GloVe+ADV:

>python mymodel/main.py --base_model \<model\> --train_data_path <path_to_train_file> --val_data_path <path_to_dev_file> --test_data_path <path_to_test_file> --save_root <path_to_save_root> --train --test --lr 4e-4 --batch_size 64 --n_epoch 75

For BERT+ADV:

>python mymodel/main.py --base_model \<model\> --train_data_path <path_to_train_file> --val_data_path <path_to_dev_file> --test_data_path <path_to_test_file> --save_root <path_to_save_root> --train --test --lr 3e-5 --batch_size 32 --n_epoch 5
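
As a concrete illustration, a hypothetical ELMo+ADV run could look like the command below; the `elmo` value for `--base_model` and all file paths are placeholders chosen for this example, not values taken from the repository:

>python mymodel/main.py --base_model elmo --train_data_path data/train.csv --val_data_path data/dev.csv --test_data_path data/test.csv --save_root save/elmo_adv --train --test --lr 4e-4 --batch_size 64 --n_epoch 75  # hypothetical model name and paths
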
Binary file added myallennlp/.DS_Store
Binary file not shown.
3 changes: 3 additions & 0 deletions myallennlp/__init__.py
@@ -0,0 +1,3 @@
from myallennlp import models
from myallennlp.data import dataset_readers
from myallennlp import modules
Binary file added myallennlp/data/.DS_Store
Binary file not shown.
1 change: 1 addition & 0 deletions myallennlp/data/__init__.py
@@ -0,0 +1 @@
from myallennlp.data.dataset_readers import tweet_reader
Empty file.
72 changes: 72 additions & 0 deletions myallennlp/data/dataset_readers/tweet_reader.py
@@ -0,0 +1,72 @@
from typing import Dict
import csv
import logging
import emoji
from overrides import overrides

from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import Field, TextField, LabelField, MetadataField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenIndexer
from allennlp.data.tokenizers import Tokenizer, WordTokenizer, Token


logger = logging.getLogger(__name__) # pylint: disable=invalid-name


@DatasetReader.register("tweet_reader")
class TweetReader(DatasetReader):
"""
Reads a file containing tweets from both conservative ("con") and liberal ("lib") groups.
The data is formatted as CSV, with one tweet instance per line and three columns:
"group", "tweet" and "label".
Parameters
----------
tokenizer : ``Tokenizer``, optional (default=``WordTokenizer()``)
See :class:`Tokenizer`.
token_indexers : ``Dict[str, TokenIndexer]``, optional (default=``{"tokens": SingleIdTokenIndexer()}``)
See :class:`TokenIndexer`.
"""

def __init__(self,
tokenizer: Tokenizer = None,
token_indexers: Dict[str, TokenIndexer] = None,
lazy: bool = False) -> None:
super().__init__(lazy)
self._tokenizer = tokenizer or WordTokenizer()
self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}

@overrides
def _read(self, file_path: str):
# if `file_path` is a URL, redirect to the cache
file_path = cached_path(file_path)

with open(file_path, 'r') as tweet_file:
logger.info("Reading tweet instances from dataset at: %s", file_path)
csv_reader = csv.reader(tweet_file, delimiter=',', quoting=csv.QUOTE_ALL, escapechar='\\')
for row in csv_reader:
if len(row) == 3:
_, tweet, label = row
else:
# These are cases where the row has the wrong number of columns; we'll just skip them.
continue
if label not in ['0', '1']:
# These are cases where the label is not a valid binary label (e.g. annotator disagreement); we'll just skip them.
continue
yield self.text_to_instance(tweet, label)

@overrides
def text_to_instance(self, # type: ignore
tweet: str,
label: str = None) -> Instance:
# pylint: disable=arguments-differ
fields: Dict[str, Field] = {}
tokens = self._tokenizer.tokenize(tweet)
fields['tokens'] = TextField(tokens, self._token_indexers)
if label:
fields['label'] = LabelField(label)
return Instance(fields)



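For reference, a minimal usage sketch of this reader (it assumes an AllenNLP 0.x environment, where `WordTokenizer` and the `lazy` constructor argument are available; the file path is hypothetical):

from myallennlp.data.dataset_readers.tweet_reader import TweetReader

# Instantiate with the default WordTokenizer and single-id token indexer,
# then read a local CSV of (group, tweet, label) rows.
reader = TweetReader()
instances = list(reader.read("data/dev.csv"))  # hypothetical path

# Each Instance holds a TextField under "tokens" and, when a label is present,
# a LabelField under "label".
print(instances[0].fields["tokens"])
print(instances[0].fields["label"])
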
2 changes: 2 additions & 0 deletions myallennlp/models/__init__.py
@@ -0,0 +1,2 @@
from myallennlp.models import topical_extractor
from myallennlp.models import topical_extractor_bert
137 changes: 137 additions & 0 deletions myallennlp/models/topical_extractor.py
@@ -0,0 +1,137 @@
from typing import Dict, List, Optional, Any

import torch

from allennlp.common.checks import check_dimensions_match
from allennlp.data import Vocabulary
from allennlp.modules import Seq2SeqEncoder, TimeDistributed, TextFieldEmbedder, FeedForward, InputVariationalDropout
from allennlp.models.model import Model
from allennlp.nn import InitializerApplicator, RegularizerApplicator
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits, replace_masked_values
from allennlp.training.metrics import F1Measure


@Model.register("topical_extractor")
class TopicalExtractor(Model):
"""
This model predicts the relevance of a tweet to a specific topic, cast as a binary classification problem.
Specifically, the implementation is a simple Bi-LSTM model.
Parameters
----------
vocab : ``Vocabulary``, required
A Vocabulary, required in order to compute sizes for input/output projections.
text_field_embedder : ``TextFieldEmbedder``, required
Used to embed the ``tokens`` ``TextField`` we get as input to the model.
encoder : ``Seq2SeqEncoder``
The encoder (with its own internal stacking) that we will use in between embedding tokens
and the pooling/classification layers.
output_feedforward : ``FeedForward``
Used to transform the max-pooled tweet encoding before the output logits are computed.
output_logit : ``FeedForward``
This feedforward network computes the output logits.
dropout : ``float``, optional (default=0.5)
Dropout percentage to use.
initializer : ``InitializerApplicator``, optional (default=``InitializerApplicator()``)
Used to initialize the model parameters.
regularizer : ``RegularizerApplicator``, optional (default=``None``)
If provided, will be used to calculate the regularization penalty during training.
"""
def __init__(self, vocab: Vocabulary,
text_field_embedder: TextFieldEmbedder,
encoder: Seq2SeqEncoder,
output_feedforward: FeedForward,
output_logit: FeedForward,
dropout: float = 0.5,
initializer: InitializerApplicator = InitializerApplicator(),
regularizer: Optional[RegularizerApplicator] = None) -> None:
super().__init__(vocab, regularizer)

self._text_field_embedder = text_field_embedder
self._encoder = encoder

if dropout:
self.dropout = torch.nn.Dropout(dropout)
self.rnn_input_dropout = InputVariationalDropout(dropout)
else:
self.dropout = None
self.rnn_input_dropout = None

self._output_feedforward = output_feedforward
self._output_logit = output_logit

self._num_labels = vocab.get_vocab_size(namespace="labels")

check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(),
"text field embedding dim", "encoder input dim")

self._f1 = F1Measure(positive_label=vocab._token_to_index["labels"]["1"])
self._loss = torch.nn.CrossEntropyLoss()

initializer(self)


def forward(self, # type: ignore
tokens: Dict[str, torch.LongTensor],
label: torch.IntTensor = None) -> Dict[str, torch.Tensor]:
# pylint: disable=arguments-differ
"""
Parameters
----------
tokens : Dict[str, torch.LongTensor], required
The output of ``TextField.as_array()``, which should typically be passed directly to a
``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
tensors. At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
for the ``TokenIndexers`` when you created the ``TextField`` representing your
sequence. The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
which knows how to combine different word representations into a single vector per
token in your input.
label : torch.IntTensor, optional (default = None)
From a ``LabelField``
Returns
-------
An output dictionary consisting of:
label_logits : torch.FloatTensor
A tensor of shape ``(batch_size, num_labels)`` representing unnormalised log
probabilities of the relevance label.
label_probs : torch.FloatTensor
A tensor of shape ``(batch_size, num_labels)`` representing probabilities of the
relevance label.
loss : torch.FloatTensor, optional
A scalar loss to be optimised.
"""
embedded_tweet = self._text_field_embedder(tokens)
mask = get_text_field_mask(tokens).float()

# apply dropout for LSTM
if self.rnn_input_dropout:
embedded_tweet = self.rnn_input_dropout(embedded_tweet)

# encode tweet, (batch_size, tweet_length, hidden_dim)
encoded_tweet = self._encoder(embedded_tweet, mask)

# The pooling layer -- max pooling.
# (batch_size, model_dim)
encode_max, _ = replace_masked_values(encoded_tweet, mask.unsqueeze(-1), -1e7).max(dim=1)

output_hidden = self._output_feedforward(encode_max)
label_logits = self._output_logit(output_hidden)
label_probs = torch.nn.functional.softmax(label_logits, dim=-1)

output_dict = {"label_logits": label_logits, "label_probs": label_probs}

if label is not None:
loss = self._loss(label_logits, label.long().view(-1))
self._f1(label_logits, label)
output_dict["loss"] = loss

return output_dict

def get_metrics(self, reset: bool = False) -> Dict[str, float]:
metrics = self._f1.get_metric(reset)
return {'precision': metrics[0],
'recall': metrics[1],
'fscore': metrics[2]}

