Merge pull request #25 from MannLabs/development
Some speedups in the table processing
ammarcsj authored Jan 22, 2024
2 parents 55a3837 + 07c2416 commit 301ca05
Showing 17 changed files with 1,867 additions and 1,022 deletions.
2 changes: 1 addition & 1 deletion directlfq/__init__.py
@@ -2,7 +2,7 @@


__project__ = "directlfq"
__version__ = "0.2.15"
__version__ = "0.2.16"
__license__ = "Apache"
__description__ = "An open-source Python package of the AlphaPept ecosystem"
__author__ = "Mann Labs"
10 changes: 9 additions & 1 deletion directlfq/config.py
@@ -5,14 +5,15 @@ def setup_logging():
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')


##########################
LOG_PROCESSED_PROTEINS = True

def set_log_processed_proteins(log_processed_proteins = True):
    global LOG_PROCESSED_PROTEINS
    LOG_PROCESSED_PROTEINS = log_processed_proteins


##########################
PROTEIN_ID = 'protein'
QUANT_ID = 'ion'

@@ -22,3 +23,10 @@ def set_global_protein_and_ion_id(protein_id = 'protein', quant_id = 'ion'):
    PROTEIN_ID = protein_id
    QUANT_ID = quant_id

##########################
COMPILE_NORMALIZED_ION_TABLE = True

def set_compile_normalized_ion_table(compile_normalized_ion_table = True):
    global COMPILE_NORMALIZED_ION_TABLE
    COMPILE_NORMALIZED_ION_TABLE = compile_normalized_ion_table
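
The new flag follows the same global-plus-setter pattern as the existing config options. A minimal sketch of how the toggle behaves (assuming the module is importable as directlfq.config, as it is used elsewhere in the package):

import directlfq.config as config

# default: the normalized ion table is compiled and written out
print(config.COMPILE_NORMALIZED_ION_TABLE)  # True

# switch it off; downstream code checks the module-level flag
config.set_compile_normalized_ion_table(compile_normalized_ion_table=False)

if config.COMPILE_NORMALIZED_ION_TABLE:
    print("normalized ion table will be compiled")
else:
    print("normalized ion table compilation is skipped")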

7 changes: 5 additions & 2 deletions directlfq/lfq_manager.py
@@ -22,7 +22,7 @@

def run_lfq(input_file, columns_to_add = [], selected_proteins_file :str = None, mq_protein_groups_txt = None, min_nonan = 1, input_type_to_use = None, maximum_number_of_quadratic_ions_to_use_per_protein = 10,
            number_of_quadratic_samples = 50, num_cores = None, filename_suffix = "", deactivate_normalization = False, filter_dict = None, log_processed_proteins = True, protein_id = 'protein', quant_id = 'ion'
            ):
            ,compile_normalized_ion_table = True):
    """Run the directLFQ pipeline on a given input file. The input file is expected to contain ion intensities. The output is a table containing protein intensities.
    Args:
@@ -38,6 +38,7 @@ def run_lfq(input_file, columns_to_add = [], selected_proteins_file :str = None
    """
    config.set_global_protein_and_ion_id(protein_id=protein_id, quant_id=quant_id)
    config.set_log_processed_proteins(log_processed_proteins=log_processed_proteins)
    config.set_compile_normalized_ion_table(compile_normalized_ion_table= compile_normalized_ion_table)

    LOGGER.info("Starting directLFQ analysis.")
    input_file = prepare_input_filename(input_file)
@@ -62,7 +63,9 @@ def run_lfq(input_file, columns_to_add = [], selected_proteins_file :str = None
    outfile_basename = get_outfile_basename(input_file, input_type_to_use, selected_proteins_file, deactivate_normalization,filename_suffix)
    save_run_config(outfile_basename, locals())
    save_protein_df(protein_df,outfile_basename)
    save_ion_df(ion_df,outfile_basename)

    if config.COMPILE_NORMALIZED_ION_TABLE:
        save_ion_df(ion_df,outfile_basename)

    LOGGER.info("Analysis finished!")

169 changes: 108 additions & 61 deletions directlfq/protein_intensity_estimation.py
@@ -29,30 +29,71 @@ def estimate_protein_intensities(normed_df, min_nonan, num_samples_quadratic, nu
    "derives protein pseudointensities from between-sample normalized data"

    allprots = list(normed_df.index.get_level_values(0).unique())
    LOGGER.info(f"{len(allprots)} prots total")
    LOGGER.info(f"{len(allprots)} lfq-groups total")

    list_of_tuple_w_protein_profiles_and_shifted_peptides = get_list_of_tuple_w_protein_profiles_and_shifted_peptides(allprots, normed_df, num_samples_quadratic, min_nonan, num_cores)
    protein_df = get_protein_dataframe_from_list_of_protein_profiles(allprots=allprots, list_of_tuple_w_protein_profiles_and_shifted_peptides=list_of_tuple_w_protein_profiles_and_shifted_peptides, normed_df= normed_df)
    ion_df = get_ion_intensity_dataframe_from_list_of_shifted_peptides(list_of_tuple_w_protein_profiles_and_shifted_peptides, allprots)
    if config.COMPILE_NORMALIZED_ION_TABLE:
        ion_df = get_ion_intensity_dataframe_from_list_of_shifted_peptides(list_of_tuple_w_protein_profiles_and_shifted_peptides, allprots)
    else:
        ion_df = None

    return protein_df, ion_df


def get_list_of_tuple_w_protein_profiles_and_shifted_peptides(allprots, normed_df, num_samples_quadratic, min_nonan, num_cores):
    input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan = get_input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan(normed_df, allprots, num_samples_quadratic, min_nonan)

    if num_cores is not None and num_cores <=1:
        list_of_tuple_w_protein_profiles_and_shifted_peptides = get_list_with_sequential_processing(allprots, normed_df, num_samples_quadratic, min_nonan)
        list_of_tuple_w_protein_profiles_and_shifted_peptides = get_list_with_sequential_processing(input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan)
    else:
        list_of_tuple_w_protein_profiles_and_shifted_peptides = get_list_with_multiprocessing(allprots, normed_df, num_samples_quadratic, min_nonan, num_cores)
        list_of_tuple_w_protein_profiles_and_shifted_peptides = get_list_with_multiprocessing(input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan, num_cores)
    return list_of_tuple_w_protein_profiles_and_shifted_peptides

def get_list_with_sequential_processing(allprots, normed_df, num_samples_quadratic, min_nonan):
    input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan = get_input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan(normed_df, allprots, num_samples_quadratic, min_nonan)

def get_input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan(normed_df, allprots, num_samples_quadratic, min_nonan):
    list_of_normed_dfs = get_normed_dfs(normed_df)
    return zip(range(len(list_of_normed_dfs)),list_of_normed_dfs, itertools.repeat(num_samples_quadratic), itertools.repeat(min_nonan))


def get_normed_dfs(normed_df):
    protein_names = normed_df.index.get_level_values(0).to_numpy()
    ion_names = normed_df.index.get_level_values(1).to_numpy()
    normed_array = normed_df.to_numpy()
    indices_of_proteinname_switch = find_nameswitch_indices(protein_names)
    results_list = [get_subdf(normed_array, indices_of_proteinname_switch, idx, protein_names, ion_names) for idx in range(len(indices_of_proteinname_switch)-1)]

    return results_list


def find_nameswitch_indices(arr):
    change_indices = np.where(arr[:-1] != arr[1:])[0] + 1

    # Add the index 0 for the start of the first element
    start_indices = np.insert(change_indices, 0, 0)

    #Append the index of the last element
    start_indices = np.append(start_indices, len(arr))

    return start_indices


def get_subdf(normed_array, indices_of_proteinname_switch, idx, protein_names, ion_names):
    start_switch = indices_of_proteinname_switch[idx]
    end_switch = indices_of_proteinname_switch[idx+1]
    sub_array = normed_array[start_switch:end_switch]
    index_sub_array = pd.MultiIndex.from_arrays([protein_names[start_switch:end_switch], ion_names[start_switch:end_switch]], names=[config.PROTEIN_ID, config.QUANT_ID])
    return pd.DataFrame(sub_array, index = index_sub_array)




def get_list_with_sequential_processing(input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan):
    list_of_tuple_w_protein_profiles_and_shifted_peptides = list(map(lambda x : calculate_peptide_and_protein_intensities(*x), input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan))
    return list_of_tuple_w_protein_profiles_and_shifted_peptides

def get_list_with_multiprocessing(allprots, normed_df, num_samples_quadratic, min_nonan, num_cores):
def get_list_with_multiprocessing(input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan, num_cores):
    pool = get_configured_multiprocessing_pool(num_cores)
    input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan = get_input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan(normed_df, allprots, num_samples_quadratic, min_nonan)
    list_of_tuple_w_protein_profiles_and_shifted_peptides = pool.starmap(calculate_peptide_and_protein_intensities, input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan)
    pool.close()
    return list_of_tuple_w_protein_profiles_and_shifted_peptides
@@ -66,65 +107,17 @@ def get_configured_multiprocessing_pool(num_cores):
    LOGGER.info(f"using {pool._processes} processes")
    return pool

def calculate_peptide_and_protein_intensities_from_list_of_peptide_intensity_dfs(idx, list_of_peptide_intensity_dfs, num_samples_quadratic, min_nonan):
    for peptide_intensity_df in list_of_peptide_intensity_dfs:
        calculate_peptide_and_protein_intensities

def get_input_specification_tuplelist_idx__df__num_samples_quadratic__min_nonan(normed_df, allprots, num_samples_quadratic, min_nonan):
    list_of_normed_dfs = get_normed_dfs(normed_df, allprots)
    return zip(range(len(list_of_normed_dfs)),list_of_normed_dfs, itertools.repeat(num_samples_quadratic), itertools.repeat(min_nonan))




def get_normed_dfs(normed_df, allprots):
    list_of_normed_dfs = []
    for protein in allprots:
        peptide_intensity_df = pd.DataFrame(normed_df.loc[protein])#DataFrame definition to avoid pandas Series objects
        list_of_normed_dfs.append(peptide_intensity_df)

    return list_of_normed_dfs


def get_ion_intensity_dataframe_from_list_of_shifted_peptides(list_of_tuple_w_protein_profiles_and_shifted_peptides, allprots):
    ion_ints = [x[1] for x in list_of_tuple_w_protein_profiles_and_shifted_peptides]
    ion_ints = add_protein_names_to_ion_ints(ion_ints, allprots)
    ion_df = 2**pd.concat(ion_ints)
    ion_df = ion_df.replace(np.nan, 0)
    return ion_df

def add_protein_names_to_ion_ints(ion_ints, allprots):
    ion_ints = [add_protein_name_to_ion_df(ion_ints[idx], allprots[idx]) for idx in range(len(ion_ints))]
    return ion_ints

def add_protein_name_to_ion_df(ion_df, protein):
    ion_df[config.PROTEIN_ID] = protein
    ion_df = ion_df.reset_index().set_index([config.PROTEIN_ID, config.QUANT_ID])
    return ion_df


def get_protein_dataframe_from_list_of_protein_profiles(allprots, list_of_tuple_w_protein_profiles_and_shifted_peptides, normed_df):
    index_list = []
    profile_list = []

    list_of_protein_profiles = [x[0] for x in list_of_tuple_w_protein_profiles_and_shifted_peptides]

    for idx in range(len(allprots)):
        if list_of_protein_profiles[idx] is None:
            continue
        index_list.append(allprots[idx])
        profile_list.append(list_of_protein_profiles[idx])

    index_for_protein_df = pd.Index(data=index_list, name=config.PROTEIN_ID)
    protein_df = 2**pd.DataFrame(profile_list, index = index_for_protein_df, columns = normed_df.columns)
    protein_df = protein_df.replace(np.nan, 0)
    protein_df = protein_df.reset_index()
    return protein_df


def calculate_peptide_and_protein_intensities(idx,peptide_intensity_df , num_samples_quadratic, min_nonan):
def calculate_peptide_and_protein_intensities(idx, peptide_intensity_df, num_samples_quadratic, min_nonan):
    if len(peptide_intensity_df.index) > 1:
        peptide_intensity_df = ProtvalCutter(peptide_intensity_df, maximum_df_length=100).get_dataframe()

    if(idx%100 ==0) and config.LOG_PROCESSED_PROTEINS:
        LOGGER.info(f"prot {idx}")
        LOGGER.info(f"lfq-object {idx}")
    summed_pepint = np.nansum(2**peptide_intensity_df)

    if(peptide_intensity_df.shape[1]<2):
@@ -200,3 +193,57 @@ def get_dataframe(self):
    def _get_shortened_dataframe(self):
        shortened_index = self._sorted_idx[:self._maximum_df_length]
        return self._protvals_df.loc[shortened_index]




def get_ion_intensity_dataframe_from_list_of_shifted_peptides(list_of_tuple_w_protein_profiles_and_shifted_peptides, allprots):
    ion_names = []
    ion_vals = []
    protein_names = []
    column_names = list_of_tuple_w_protein_profiles_and_shifted_peptides[0][1].columns.tolist()
    for idx in range(len(list_of_tuple_w_protein_profiles_and_shifted_peptides)):
        protein_name = allprots[idx]
        ion_df = list_of_tuple_w_protein_profiles_and_shifted_peptides[idx][1]
        ion_names += ion_df.index.values.tolist()
        ion_vals.append(ion_df.to_numpy())
        protein_names.extend([protein_name]*len(ion_df.index))
    merged_ions = 2**np.concatenate(ion_vals)
    merged_ions = np.nan_to_num(merged_ions)
    ion_df = pd.DataFrame(merged_ions)
    ion_df.columns = column_names
    ion_df["ion"] = ion_names
    ion_df["protein"] = protein_names
    ion_df = ion_df.set_index(["protein", "ion"])
    return ion_df



def add_protein_names_to_ion_ints(ion_ints, allprots):
    ion_ints = [add_protein_name_to_ion_df(ion_ints[idx], allprots[idx]) for idx in range(len(ion_ints))]
    return ion_ints

def add_protein_name_to_ion_df(ion_df, protein):
    ion_df[config.PROTEIN_ID] = protein
    ion_df = ion_df.reset_index().set_index([config.PROTEIN_ID, config.QUANT_ID])
    return ion_df


def get_protein_dataframe_from_list_of_protein_profiles(allprots, list_of_tuple_w_protein_profiles_and_shifted_peptides, normed_df):
    index_list = []
    profile_list = []

    list_of_protein_profiles = [x[0] for x in list_of_tuple_w_protein_profiles_and_shifted_peptides]

    for idx in range(len(allprots)):
        if list_of_protein_profiles[idx] is None:
            continue
        index_list.append(allprots[idx])
        profile_list.append(list_of_protein_profiles[idx])

    index_for_protein_df = pd.Index(data=index_list, name=config.PROTEIN_ID)
    protein_df = 2**pd.DataFrame(profile_list, index = index_for_protein_df, columns = normed_df.columns)
    protein_df = protein_df.replace(np.nan, 0)
    protein_df = protein_df.reset_index()
    return protein_df
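
The main speedup in this file: instead of building one sub-DataFrame per protein via normed_df.loc[protein], get_normed_dfs converts the table to numpy once and find_nameswitch_indices locates the boundaries of each contiguous protein block in a single pass; get_subdf then slices the array per block. A small self-contained sketch of that boundary logic, with an example array invented for illustration:

import numpy as np

protein_names = np.array(["A", "A", "A", "B", "B", "C"])

# positions where the protein name changes, shifted by one to mark block starts
change_indices = np.where(protein_names[:-1] != protein_names[1:])[0] + 1
start_indices = np.insert(change_indices, 0, 0)               # prepend start of the first block
start_indices = np.append(start_indices, len(protein_names))  # append end of the last block

print(start_indices)  # [0 3 5 6] -> slices [0:3]="A", [3:5]="B", [5:6]="C"

# each contiguous block becomes one per-protein sub-array
blocks = [protein_names[start_indices[i]:start_indices[i + 1]]
          for i in range(len(start_indices) - 1)]
print(blocks)

Note that the block splitting assumes rows belonging to the same protein are contiguous in normed_df; otherwise a protein would be split across several sub-frames.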

2 changes: 1 addition & 1 deletion misc/bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.2.15
current_version = 0.2.16
commit = True
tag = False
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
Expand Down