diff --git a/README.md b/README.md index def2a55..58cd553 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,13 @@ # Happier Fun Tokenizer +Translated the original py2 code to py3 code with `2to3` + +``` +2to3 -w . +``` + +--- + This code implements a basic, Twitter-aware tokenizer. Originally developed by [Christopher Potts](http://web.stanford.edu/~cgpotts/) ([Happy Fun Tokenizer](http://sentiment.christopherpotts.net/code-data/happyfuntokenizing.py)) and updated by [H. Andrew Schwartz](http://www3.cs.stonybrook.edu/~has/). Shared with Christopher's permission. diff --git a/happierfuntokenizing.py b/happierfuntokenizing.py index de32177..aecdcb5 100644 --- a/happierfuntokenizing.py +++ b/happierfuntokenizing.py @@ -39,7 +39,7 @@ ###################################################################### import re -import htmlentitydefs +import html.entities ###################################################################### # The following strings are components in the regular expression @@ -164,10 +164,10 @@ def tokenize(self, s): # Try to ensure unicode: if self.use_unicode: try: - s = unicode(s) + s = str(s) except UnicodeDecodeError: s = str(s).encode('string_escape') - s = unicode(s) + s = str(s) # Fix HTML character entitites: s = self.__html2unicode(s) s = self.__removeHex(s) @@ -176,7 +176,7 @@ def tokenize(self, s): #print words #debug # Possible alter the case, but avoid changing emoticons like :D into :d: if not self.preserve_case: - words = map((lambda x : x if emoticon_re.search(x) else x.lower()), words) + words = list(map((lambda x : x if emoticon_re.search(x) else x.lower()), words)) return words @@ -188,7 +188,7 @@ def tokenize_random_tweet(self): try: import twitter except ImportError: - print "Apologies. The random tweet functionality requires the Python twitter library: http://code.google.com/p/python-twitter/" + print("Apologies. The random tweet functionality requires the Python twitter library: http://code.google.com/p/python-twitter/") from random import shuffle api = twitter.Api() tweets = api.GetPublicTimeline() @@ -211,16 +211,16 @@ def __html2unicode(self, s): entnum = ent[2:-1] try: entnum = int(entnum) - s = s.replace(ent, unichr(entnum)) + s = s.replace(ent, chr(entnum)) except: pass # Now the alpha versions: ents = set(html_entity_alpha_re.findall(s)) - ents = filter((lambda x : x != amp), ents) + ents = list(filter((lambda x : x != amp), ents)) for ent in ents: entname = ent[1:-1] try: - s = s.replace(ent, unichr(htmlentitydefs.name2codepoint[entname])) + s = s.replace(ent, chr(html.entities.name2codepoint[entname])) except: pass s = s.replace(amp, " and ") @@ -239,18 +239,18 @@ def __removeHex(self, s): import sys samples = ( - u"RT @ #happyfuncoding: this is a typical Twitter tweet :-)", - u"It's perhaps noteworthy that phone numbers like +1 (800) 123-4567, (800) 123-4567, and 123-4567 are treated as words despite their whitespace.", - u'Something about <3 pain
', - u"This is more like a Facebook message with a url: http://www.youtube.com/watch?v=dQw4w9WgXcQ, youtube.com google.com https://google.com/ ", - u"HTML entities & other Web oddities can be an ácute pain >:(", + "RT @ #happyfuncoding: this is a typical Twitter tweet :-)", + "It's perhaps noteworthy that phone numbers like +1 (800) 123-4567, (800) 123-4567, and 123-4567 are treated as words despite their whitespace.", + 'Something about <3 pain
', + "This is more like a Facebook message with a url: http://www.youtube.com/watch?v=dQw4w9WgXcQ, youtube.com google.com https://google.com/ ", + "HTML entities & other Web oddities can be an ácute pain >:(", ) if len(sys.argv) > 1 and (sys.argv[1]): samples = sys.argv[1:] for s in samples: - print "======================================================================" - print s + print("======================================================================") + print(s) tokenized = tok.tokenize(s) - print "\n".join(tokenized).encode('utf8', 'ignore') if tok.use_unicode else "\n".join(tokenized) + print("\n".join(tokenized).encode('utf8', 'ignore') if tok.use_unicode else "\n".join(tokenized))