comment_sentiment.py

import random as rd
import time

import nltk
from nltk.tokenize import TweetTokenizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


def get_comment_sentiment(comment, keywords, upvotes):
    tokenizer_words = TweetTokenizer()
    tokens_sentences = [tokenizer_words.tokenize(t) for t in nltk.sent_tokenize(comment)]
    sentenceStruct, rawScores = [], []
    start_time = time.time()
    for ind in range(0, len(tokens_sentences)):
        keywordDetected = False
        sentenceStruct = tokens_sentences[ind]
        sentence = " ".join(sentenceStruct)
        sentenceScore = sentiment_scores(sentence, upvotes)
        for word in sentenceStruct:
            for keyword in keywords:
                if not keywordDetected:
                    if keyword.casefold() == word.casefold():
                        rawScores += (keyword, sentenceScore)
                        keywordDetected = True
        if not keywordDetected:
            rawScores += ('sentence', sentenceScore)
    # print(time.time()-start_time)

    midScores = []
    currKey = 'placeholder'
    currScore = 0.0
    counter = 0
    # start_time = time.time()
    for ind in range(0, len(rawScores)):
        if (type(rawScores[ind]) is str) & (rawScores[ind] != 'sentence'):
            if currKey == 'placeholder':
                currKey = rawScores[ind]
                currScore += rawScores[ind + 1]
                counter += 1
            elif rawScores[ind] != currKey:
                midScores += (currKey, currScore / counter)
                currKey = rawScores[ind]
                currScore = 0.0
                currScore += rawScores[ind + 1]
                counter = 1
            else:
                currScore += rawScores[ind + 1]
                counter += 1
        elif rawScores[ind] == 'sentence':
            currScore += rawScores[ind + 1]
            counter += 1
    midScores += (currKey, currScore / counter)
    # print(time.time()-start_time)

    finalScores, coveredWords = [], []
    # start_time = time.time()
    for ind in range(0, len(midScores)):
        if (type(midScores[ind]) is str) & (midScores[ind] not in coveredWords):
            coveredWords += [midScores[ind]]
            currKey = midScores[ind]
            currScore = midScores[ind + 1]
            counter = 1
            scoreAdded = False
            for ell in range(ind + 2, len(midScores)):
                if type(midScores[ell]) is str:
                    if (midScores[ell] == currKey) & (ell != (len(midScores) - 2)):
                        currScore += midScores[ell + 1]
                        counter += 1
                    elif (midScores[ell] == currKey) & (ell == (len(midScores) - 2)):
                        currScore += midScores[ell + 1]
                        counter += 1
                        finalScores += (currKey, currScore / counter)
                        scoreAdded = True
                        break
            if scoreAdded is False:
                finalScores += (currKey, currScore / counter)
    print(time.time() - start_time)
    return finalScores


def sentiment_scores(sentence, upvotes):
    sid_obj = SentimentIntensityAnalyzer()
    sentiment_dict = sid_obj.polarity_scores(sentence)
    pos = sentiment_dict['pos']
    neu = sentiment_dict['neu']
    neg = sentiment_dict['neg']
    score = (2 * neu * upvotes) + (pos * upvotes) - (neg * upvotes)
    return score


def simplify_title(titles):
    keywords = []
    titleReferences = []
    commonWords = 'The the This this Is is A a Of of Into into For for But but And and So so There there Through through As as Like like He he She she They they Them them It it'
    start_time = time.time()
    for ind in range(0, len(titles)):
        simplified = False
        title = titles[ind].split()
        while not simplified:
            keyword = title[rd.randint(0, (len(title) - 1))]
            if keyword not in commonWords:
                simplified = True
                keywords += [keyword]
        titleReferences += (titles[ind], keyword)
    print(time.time() - start_time)
    return keywords, titleReferences


def get_scores(titles, comment, upvotes):
    keys, titleRefs = simplify_title(titles)
    scores = get_comment_sentiment(comment, keys, upvotes)
    rankings = []
    start_time = time.time()
    for ind in range(0, len(scores)):
        if type(scores[ind]) is str:
            if scores[ind] == titleRefs[ind + 1]:
                rankings += (titleRefs[ind], scores[ind + 1])
    print(time.time() - start_time)
    return rankings