maintenance: better syntax and simplified code (#10)
* better syntax and simplified code

* simplify syntax
adbar authored Jun 18, 2024
1 parent 1817a4c commit 1aa0c67
Showing 1 changed file with 15 additions and 12 deletions.
27 changes: 15 additions & 12 deletions py3langid/langid.py
@@ -17,6 +17,7 @@

 from base64 import b64decode
 from collections import Counter
+from operator import itemgetter
 from pathlib import Path
 from urllib.parse import parse_qs

@@ -33,6 +34,9 @@
 # affect the relative ordering of the predicted classes. It can be
 # re-enabled at runtime - see the readme.

+# quantization: faster but less precise
+DATATYPE = "uint16"
+


 def load_model(path=None):
     """
@@ -60,7 +64,7 @@ def set_languages(langs=None):
     return IDENTIFIER.set_languages(langs)


-def classify(instance, datatype='uint16'):
+def classify(instance, datatype=DATATYPE):
    """
    Convenience method using a global identifier instance with the default
    model included in langid.py. Identifies the language that a string is
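
As a point of reference, a minimal usage sketch of the module-level classify() helper whose default changes above, assuming the package's usual import name and the bundled model; the exact score depends on that model:

import py3langid as langid

# classify() now takes its default dtype from DATATYPE ("uint16"); a wider type
# such as "uint32" trades a little speed for more counting headroom on long texts
lang, score = langid.classify("Questo è un breve esempio in italiano.")
print(lang, score)   # expected language code 'it'; the score is model-dependent

lang, score = langid.classify("A short English sample.", datatype="uint32")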
@@ -198,9 +202,7 @@ def set_languages(self, langs=None):
         nb_ptc, nb_pc, nb_classes = self.__full_model

         if langs is None:
-            self.nb_classes = nb_classes
-            self.nb_ptc = nb_ptc
-            self.nb_pc = nb_pc
+            self.nb_classes, self.nb_ptc, self.nb_pc = nb_classes, nb_ptc, nb_pc

         else:
             # We were passed a restricted set of languages. Trim the arrays accordingly
@@ -209,12 +211,12 @@ def set_languages(self, langs=None):
                 if lang not in nb_classes:
                     raise ValueError(f"Unknown language code {lang}")

-            subset_mask = np.fromiter((l in langs for l in nb_classes), dtype=bool)
+            subset_mask = np.isin(nb_classes, langs)
             self.nb_classes = [c for c in nb_classes if c in langs]
             self.nb_ptc = nb_ptc[:, subset_mask]
             self.nb_pc = nb_pc[subset_mask]

-    def instance2fv(self, text, datatype='uint16'):
+    def instance2fv(self, text, datatype=DATATYPE):
         """
         Map an instance into the feature space of the trained model.
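
The replaced comprehension and np.isin build the same boolean mask; a small standalone check of that equivalence with made-up class labels:

import numpy as np

nb_classes = ["de", "en", "fr", "it"]   # made-up label list
langs = ["en", "it"]

old_mask = np.fromiter((l in langs for l in nb_classes), dtype=bool)
new_mask = np.isin(nb_classes, langs)

assert (old_mask == new_mask).all()     # both are array([False, True, False, True])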
@@ -227,11 +229,12 @@ def instance2fv(self, text, datatype='uint16'):

         # Convert the text to a sequence of ascii values and
         # Count the number of times we enter each state
-        state = 0
-        indexes = []
-        for letter in list(text):
+        state, indexes = 0, []
+        extend = indexes.extend
+
+        for letter in text:
             state = self.tk_nextmove[(state << 8) + letter]
-            indexes.extend(self.tk_output.get(state, []))
+            extend(self.tk_output.get(state, []))

         # datatype: consider that less feature counts are going to be needed
         arr = np.zeros(self.nb_numfeats, dtype=datatype)
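
Binding indexes.extend to a local name saves one attribute lookup per character in the hot loop. A toy re-creation of the pattern with placeholder transition tables (the real tk_nextmove and tk_output come from the trained model and are arrays, not the dicts used here):

# placeholder tables keyed by (state << 8) + byte, standing in for the model data
tk_nextmove = {97: 1, (1 << 8) + 98: 2}   # byte 'a' from state 0, byte 'b' from state 1
tk_output = {1: [5], 2: [5, 9]}           # feature indexes emitted when a state is entered

state, indexes = 0, []
extend = indexes.extend                   # bound once instead of looked up per character

for letter in b"ab":                      # bytes iterate as integers, like the encoded text
    state = tk_nextmove.get((state << 8) + letter, 0)
    extend(tk_output.get(state, []))

print(indexes)                            # [5, 5, 9]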
@@ -247,7 +250,7 @@ def nb_classprobs(self, fv):
         # compute the partial log-probability of the document in each class
         return pdc + self.nb_pc

-    def classify(self, text, datatype='uint16'):
+    def classify(self, text, datatype=DATATYPE):
         """
         Classify an instance.
         """
@@ -262,7 +265,7 @@ def rank(self, text):
         """
         fv = self.instance2fv(text)
         probs = self.norm_probs(self.nb_classprobs(fv))
-        return [(str(k), float(v)) for (v, k) in sorted(zip(probs, self.nb_classes), reverse=True)]
+        return sorted(zip(self.nb_classes, probs), key=itemgetter(1), reverse=True)

     def cl_path(self, path):
         """

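The updated rank() sorts the (language, probability) pairs directly by their second element. A self-contained sketch of that sorting pattern with dummy scores:

from operator import itemgetter

pairs = [("en", 0.21), ("fr", 0.05), ("it", 0.74)]   # dummy (language, probability) pairs

ranked = sorted(pairs, key=itemgetter(1), reverse=True)
print(ranked)   # [('it', 0.74), ('en', 0.21), ('fr', 0.05)]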