Translated the original py2 code to py3 code with 2to3 #2

Open · wants to merge 1 commit into base: master
README.md (8 additions, 0 deletions)
@@ -1,5 +1,13 @@
 # Happier Fun Tokenizer

+Translated the original py2 code to py3 code with `2to3`
+
+```
+2to3 -w .
+```
+
+---
+
 This code implements a basic, Twitter-aware tokenizer. Originally developed by [Christopher Potts](http://web.stanford.edu/~cgpotts/)
 ([Happy Fun Tokenizer](http://sentiment.christopherpotts.net/code-data/happyfuntokenizing.py)) and updated by [H. Andrew Schwartz](http://www3.cs.stonybrook.edu/~has/). Shared with Christopher's permission.

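For readers who have not used `2to3`: it applies mechanical fixers to Python 2 source, which is exactly the kind of change shown in the diff below. The following snippet is only an illustrative sketch, not code from this repository, covering the three fixers that account for most of this PR: the print statement, `unicode()`, and lazy `map()`/`filter()`.

```python
# Illustrative sketch of the rewrites `2to3 -w .` performs in this PR;
# the variable names here are made up for the example.

words = ["Hello", ":D", "WORLD"]

# Python 2:  words = map(lambda x: x.lower(), words)   (returned a list)
# Python 3:  map() returns a lazy iterator, so 2to3 wraps it in list()
words = list(map(lambda x: x.lower(), words))

# Python 2:  s = unicode("caf\xe9");  print s
# Python 3:  str is already Unicode, and print is a function
s = str("caf\u00e9")
print(s, words)
```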
happierfuntokenizing.py (16 additions, 16 deletions)
@@ -39,7 +39,7 @@
 ######################################################################

 import re
-import htmlentitydefs
+import html.entities

 ######################################################################
 # The following strings are components in the regular expression
@@ -164,10 +164,10 @@ def tokenize(self, s):
         # Try to ensure unicode:
         if self.use_unicode:
             try:
-                s = unicode(s)
+                s = str(s)
             except UnicodeDecodeError:
                 s = str(s).encode('string_escape')
-                s = unicode(s)
+                s = str(s)
         # Fix HTML character entitites:
         s = self.__html2unicode(s)
         s = self.__removeHex(s)
@@ -176,7 +176,7 @@ def tokenize(self, s):
         #print words #debug
         # Possible alter the case, but avoid changing emoticons like :D into :d:
         if not self.preserve_case:
-            words = map((lambda x : x if emoticon_re.search(x) else x.lower()), words)
+            words = list(map((lambda x : x if emoticon_re.search(x) else x.lower()), words))

         return words

@@ -188,7 +188,7 @@ def tokenize_random_tweet(self):
         try:
             import twitter
         except ImportError:
-            print "Apologies. The random tweet functionality requires the Python twitter library: http://code.google.com/p/python-twitter/"
+            print("Apologies. The random tweet functionality requires the Python twitter library: http://code.google.com/p/python-twitter/")
         from random import shuffle
         api = twitter.Api()
         tweets = api.GetPublicTimeline()
@@ -211,16 +211,16 @@ def __html2unicode(self, s):
                 entnum = ent[2:-1]
                 try:
                     entnum = int(entnum)
-                    s = s.replace(ent, unichr(entnum))
+                    s = s.replace(ent, chr(entnum))
                 except:
                     pass
         # Now the alpha versions:
         ents = set(html_entity_alpha_re.findall(s))
-        ents = filter((lambda x : x != amp), ents)
+        ents = list(filter((lambda x : x != amp), ents))
         for ent in ents:
             entname = ent[1:-1]
             try:
-                s = s.replace(ent, unichr(htmlentitydefs.name2codepoint[entname]))
+                s = s.replace(ent, chr(html.entities.name2codepoint[entname]))
             except:
                 pass
             s = s.replace(amp, " and ")
@@ -239,18 +239,18 @@ def __removeHex(self, s):
     import sys

     samples = (
-        u"RT @ #happyfuncoding: this is a typical Twitter tweet :-)",
-        u"It's perhaps noteworthy that phone numbers like +1 (800) 123-4567, (800) 123-4567, and 123-4567 are treated as words despite their whitespace.",
-        u'Something </sarcasm> about <fails to break this up> <3 </3 <\\3 mañana vergüenza güenza création tonterías tonteréas <em class="grumpy">pain</em> <meta name="viewport" content="width=device-width"> <br />',
-        u"This is more like a Facebook message with a url: http://www.youtube.com/watch?v=dQw4w9WgXcQ, youtube.com google.com https://google.com/ ",
-        u"HTML entities &amp; other Web oddities can be an &aacute;cute <em class='grumpy'>pain</em> >:(",
+        "RT @ #happyfuncoding: this is a typical Twitter tweet :-)",
+        "It's perhaps noteworthy that phone numbers like +1 (800) 123-4567, (800) 123-4567, and 123-4567 are treated as words despite their whitespace.",
+        'Something </sarcasm> about <fails to break this up> <3 </3 <\\3 mañana vergüenza güenza création tonterías tonteréas <em class="grumpy">pain</em> <meta name="viewport" content="width=device-width"> <br />',
+        "This is more like a Facebook message with a url: http://www.youtube.com/watch?v=dQw4w9WgXcQ, youtube.com google.com https://google.com/ ",
+        "HTML entities &amp; other Web oddities can be an &aacute;cute <em class='grumpy'>pain</em> >:(",
         )

     if len(sys.argv) > 1 and (sys.argv[1]):
         samples = sys.argv[1:]

     for s in samples:
-        print "======================================================================"
-        print s
+        print("======================================================================")
+        print(s)
         tokenized = tok.tokenize(s)
-        print "\n".join(tokenized).encode('utf8', 'ignore') if tok.use_unicode else "\n".join(tokenized)
+        print("\n".join(tokenized).encode('utf8', 'ignore') if tok.use_unicode else "\n".join(tokenized))