Merge pull request #470 from citrus-it/tell

36214035 tell() method on files opened in text mode has poor performance
omniosorg · Feb 15, 2024 · d2be472
2 parents ac092d8 + e69593f
commit d2be472
Show file tree

Hide file tree

Showing 2 changed files with 30 additions and 25 deletions.
diff --git a/src/modules/indexer.py b/src/modules/indexer.py
@@ -174,6 +174,11 @@ def __init__(
 
         self.old_out_token = None
 
+        # Handle for the output main dictionary file and
+        # the current byte offset within it.
+        self.out_main_dict_handle = None
+        self.out_main_dict_pos = 0
+
     @staticmethod
     def __decode_fmri(pfmri):
         """Turn fmris into strings correctly while writing out
@@ -393,12 +398,9 @@ def _process_fmris(self, fmris):
             self._progtrack.job_add_progress(self._progtrack.JOB_REBUILD_SEARCH)
         return removed_paths
 
-    def _write_main_dict_line(
-        self, file_handle, token, fv_fmri_pos_list_list, out_dir
-    ):
+    def _write_main_dict_line(self, token, fv_fmri_pos_list_list, out_dir):
         """Writes out the new main dictionary file and also adds the
-        token offsets to _data_token_offset. file_handle is the file
-        handle for the output main dictionary file. token is the token
+        token offsets to _data_token_offset. token is the token
         to add to the file. fv_fmri_pos_list_list is a structure of
         lists inside of lists several layers deep. The top layer is a
         list of action types. The second layer contains the keys for
@@ -418,8 +420,7 @@ def _write_main_dict_line(
             )
         self.old_out_token = token
 
-        cur_location_int = file_handle.tell()
-        cur_location = str(cur_location_int)
+        cur_location = str(self.out_main_dict_pos)
         self._data_token_offset.write_entity(token, cur_location)
 
         for at, st_list in fv_fmri_pos_list_list:
@@ -438,11 +439,18 @@ def _write_main_dict_line(
                 for fv, p_list in fv_list:
                     for p_id, m_off_set in p_list:
                         p_id = int(p_id)
-                        self._data_fmri_offsets.add_pair(p_id, cur_location_int)
-        file_handle.write(
-            self._data_main_dict.transform_main_dict_line(
-                token, fv_fmri_pos_list_list
-            )
+                        self._data_fmri_offsets.add_pair(
+                            p_id, self.out_main_dict_pos
+                        )
+        data = self._data_main_dict.transform_main_dict_line(
+            token, fv_fmri_pos_list_list
+        )
+        self.out_main_dict_handle.write(data)
+        # tell() on a text-mode file is very slow, so track the
+        # byte offset by counting the encoded length of each write.
+        # https://docs.python.org/3/library/io.html#performance
+        self.out_main_dict_pos += len(
+            data.encode(self.out_main_dict_handle.encoding)
         )
 
     @staticmethod
@@ -597,11 +605,12 @@ def _update_index(self, dicts, out_dir):
         self._data_main_dict.write_dict_file(out_dir, self.file_version_number)
         # The dictionary file's opened in append mode to avoid removing
         # the version information the search storage class added.
-        out_main_dict_handle = open(
+        self.out_main_dict_handle = open(
             os.path.join(out_dir, self._data_main_dict.get_file_name()),
             "a",
             buffering=PKG_FILE_BUFSIZ,
         )
+        self.out_main_dict_pos = self.out_main_dict_handle.tell()
 
         self._data_token_offset.open_out_file(out_dir, self.file_version_number)
 
@@ -641,10 +650,7 @@ def _update_index(self, dicts, out_dir):
                 while new_toks_available and next_new_tok < tok:
                     assert len(next_new_tok) > 0
                     self._write_main_dict_line(
-                        out_main_dict_handle,
-                        next_new_tok,
-                        new_tok_info,
-                        out_dir,
+                        next_new_tok, new_tok_info, out_dir
                     )
                     try:
                         next_new_tok, new_tok_info = next(new_toks_it)
@@ -668,18 +674,14 @@ def _update_index(self, dicts, out_dir):
                 # associated with it, write them to the file.
                 if existing_entries:
                     assert len(tok) > 0
-                    self._write_main_dict_line(
-                        out_main_dict_handle, tok, existing_entries, out_dir
-                    )
+                    self._write_main_dict_line(tok, existing_entries, out_dir)
 
             # For any new tokens which are alphabetically after the
             # last entry in the existing file, add them to the end
             # of the file.
             while new_toks_available:
                 assert len(next_new_tok) > 0
-                self._write_main_dict_line(
-                    out_main_dict_handle, next_new_tok, new_tok_info, out_dir
-                )
+                self._write_main_dict_line(next_new_tok, new_tok_info, out_dir)
                 try:
                     next_new_tok, new_tok_info = next(new_toks_it)
                 except StopIteration:
@@ -689,7 +691,7 @@ def _update_index(self, dicts, out_dir):
                 file_handle.close()
                 self._data_main_dict.close_file_handle()
 
-            out_main_dict_handle.close()
+            self.out_main_dict_handle.close()
             self._data_token_offset.close_file_handle()
             for fh in self.at_fh.values():
                 fh.close()

diff --git a/src/modules/manifest.py b/src/modules/manifest.py
@@ -1347,7 +1347,10 @@ def __handle_list(lst, cp):
                             if return_line:
                                 arg = l
                             __handle_list(inds, arg)
-            cur_pos = file_handle.tell()
+            # tell() on a text-mode file is very slow, so advance
+            # cur_pos by the encoded byte length of the current line.
+            # https://docs.python.org/3/library/io.html#performance
+            cur_pos += len(line.encode(file_handle.encoding))
             line = file_handle.readline()
         file_handle.close()
         return action_dict