From e69593f95b792f164a5e4a793a36205af5104a7e Mon Sep 17 00:00:00 2001 From: Jakub Kulik Date: Tue, 13 Feb 2024 06:41:50 -0800 Subject: [PATCH] 36214035 tell() method on files opened in text mode has poor performance --- src/modules/indexer.py | 50 +++++++++++++++++++++-------------------- src/modules/manifest.py | 5 ++++- 2 files changed, 30 insertions(+), 25 deletions(-) diff --git a/src/modules/indexer.py b/src/modules/indexer.py index 88f6db814..b7a879379 100644 --- a/src/modules/indexer.py +++ b/src/modules/indexer.py @@ -174,6 +174,11 @@ def __init__( self.old_out_token = None + # Handle for the output main dictionary file and + # the current position within. + self.out_main_dict_handle = None + self.out_main_dict_pos = 0 + @staticmethod def __decode_fmri(pfmri): """Turn fmris into strings correctly while writing out @@ -393,12 +398,9 @@ def _process_fmris(self, fmris): self._progtrack.job_add_progress(self._progtrack.JOB_REBUILD_SEARCH) return removed_paths - def _write_main_dict_line( - self, file_handle, token, fv_fmri_pos_list_list, out_dir - ): + def _write_main_dict_line(self, token, fv_fmri_pos_list_list, out_dir): """Writes out the new main dictionary file and also adds the - token offsets to _data_token_offset. file_handle is the file - handle for the output main dictionary file. token is the token + token offsets to _data_token_offset. token is the token to add to the file. fv_fmri_pos_list_list is a structure of lists inside of lists several layers deep. The top layer is a list of action types. The second layer contains the keys for @@ -418,8 +420,7 @@ def _write_main_dict_line( ) self.old_out_token = token - cur_location_int = file_handle.tell() - cur_location = str(cur_location_int) + cur_location = str(self.out_main_dict_pos) self._data_token_offset.write_entity(token, cur_location) for at, st_list in fv_fmri_pos_list_list: @@ -438,11 +439,18 @@ def _write_main_dict_line( for fv, p_list in fv_list: for p_id, m_off_set in p_list: p_id = int(p_id) - self._data_fmri_offsets.add_pair(p_id, cur_location_int) - file_handle.write( - self._data_main_dict.transform_main_dict_line( - token, fv_fmri_pos_list_list - ) + self._data_fmri_offsets.add_pair( + p_id, self.out_main_dict_pos + ) + data = self._data_main_dict.transform_main_dict_line( + token, fv_fmri_pos_list_list + ) + self.out_main_dict_handle.write(data) + # Using tell() on file objects opened in text mode + # is very slow compared to simple counting. + # https://docs.python.org/3/library/io.html#performance + self.out_main_dict_pos += len( + data.encode(self.out_main_dict_handle.encoding) ) @staticmethod @@ -597,11 +605,12 @@ def _update_index(self, dicts, out_dir): self._data_main_dict.write_dict_file(out_dir, self.file_version_number) # The dictionary file's opened in append mode to avoid removing # the version information the search storage class added. - out_main_dict_handle = open( + self.out_main_dict_handle = open( os.path.join(out_dir, self._data_main_dict.get_file_name()), "a", buffering=PKG_FILE_BUFSIZ, ) + self.out_main_dict_pos = self.out_main_dict_handle.tell() self._data_token_offset.open_out_file(out_dir, self.file_version_number) @@ -641,10 +650,7 @@ def _update_index(self, dicts, out_dir): while new_toks_available and next_new_tok < tok: assert len(next_new_tok) > 0 self._write_main_dict_line( - out_main_dict_handle, - next_new_tok, - new_tok_info, - out_dir, + next_new_tok, new_tok_info, out_dir ) try: next_new_tok, new_tok_info = next(new_toks_it) @@ -668,18 +674,14 @@ def _update_index(self, dicts, out_dir): # associated with it, write them to the file. if existing_entries: assert len(tok) > 0 - self._write_main_dict_line( - out_main_dict_handle, tok, existing_entries, out_dir - ) + self._write_main_dict_line(tok, existing_entries, out_dir) # For any new tokens which are alphabetically after the # last entry in the existing file, add them to the end # of the file. while new_toks_available: assert len(next_new_tok) > 0 - self._write_main_dict_line( - out_main_dict_handle, next_new_tok, new_tok_info, out_dir - ) + self._write_main_dict_line(next_new_tok, new_tok_info, out_dir) try: next_new_tok, new_tok_info = next(new_toks_it) except StopIteration: @@ -689,7 +691,7 @@ def _update_index(self, dicts, out_dir): file_handle.close() self._data_main_dict.close_file_handle() - out_main_dict_handle.close() + self.out_main_dict_handle.close() self._data_token_offset.close_file_handle() for fh in self.at_fh.values(): fh.close() diff --git a/src/modules/manifest.py b/src/modules/manifest.py index a652619f0..007a13554 100644 --- a/src/modules/manifest.py +++ b/src/modules/manifest.py @@ -1347,7 +1347,10 @@ def __handle_list(lst, cp): if return_line: arg = l __handle_list(inds, arg) - cur_pos = file_handle.tell() + # Using tell() on file objects opened in text mode + # is very slow compared to simple counting. + # https://docs.python.org/3/library/io.html#performance + cur_pos += len(line.encode(file_handle.encoding)) line = file_handle.readline() file_handle.close() return action_dict