From 94c02f4486004fdac1633c544bd5b483f0a938f5 Mon Sep 17 00:00:00 2001 From: John Lees Date: Mon, 14 Aug 2023 11:16:41 +0100 Subject: [PATCH 01/12] Allow overwrite if forced --- PopPUNK/assign.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 0422f1ce..10063369 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -386,7 +386,7 @@ def assign_query_hdf5(dbFuncs, readDBParams = dbFuncs['readDBParams'] getSeqsInDb = dbFuncs['getSeqsInDb'] - if ref_db == output: + if ref_db == output and overwrite == False: sys.stderr.write("--output and --ref-db must be different to " "prevent overwrite.\n") sys.exit(1) From b9f9025f5b27aa531bed49e1eabf823f40fed245 Mon Sep 17 00:00:00 2001 From: John Lees Date: Mon, 14 Aug 2023 11:30:39 +0100 Subject: [PATCH 02/12] Update error message --- PopPUNK/assign.py | 2 +- PopPUNK/network.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 10063369..54937eff 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -509,7 +509,7 @@ def assign_query_hdf5(dbFuncs, n_vertices = len(get_vertex_list(genomeNetwork, use_gpu = gpu_graph)) if n_vertices != len(rNames): - sys.stderr.write(f"There are {n_vertices} vertices in the network but {len(rNames)} reference names supplied; " + \ + sys.stderr.write(f"There are {n_vertices} vertices in the network but {len(rNames)} reference names supplied; " + \ "please check the '--model-dir' variable is pointing to the correct directory\n") if model.type == 'lineage': diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 3c9c6a4e..07bc7674 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -529,6 +529,11 @@ def network_to_edges(prev_G_fn, rlist, adding_qq_dists = False, exit(1) edge_weights = list(prev_G.ep['weight']) + if len(old_ids) != max(old_source_ids, old_target_ids) + 1: + sys.stderr.write(f"Network size {max(old_source_ids, old_target_ids) + 1} does " + f"not match rlist/qlist size {len(old_ids)}\n") + sys.exit(1) + # If appending queries to an existing network, then the recovered links can be left # unchanged, as the new IDs are the queries, and the existing sequences will not be found # in the list of IDs From 6cb3f5a08bfccbdd979b35d359ab35b950abfca9 Mon Sep 17 00:00:00 2001 From: John Lees Date: Wed, 23 Aug 2023 09:24:37 +0100 Subject: [PATCH 03/12] Rebuild hdbscan on CI --- .github/workflows/azure_ci.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/azure_ci.yml b/.github/workflows/azure_ci.yml index e2ba7539..f848e775 100644 --- a/.github/workflows/azure_ci.yml +++ b/.github/workflows/azure_ci.yml @@ -27,6 +27,10 @@ jobs: micromamba-version: '1.4.6-0' environment-file: environment.yml cache-environment: true + - name: Rebuild HDBSCAN libraries + shell: bash -l {0} + run: | + pip install --upgrade git+https://github.com/scikit-learn-contrib/hdbscan.git#egg=hdbscan - name: Install and run_test.py shell: bash -l {0} run: | From fcb6ee011c455c24a2b56eaa2875a6dba5bfe9b0 Mon Sep 17 00:00:00 2001 From: John Lees Date: Wed, 23 Aug 2023 09:32:18 +0100 Subject: [PATCH 04/12] Install hdbscan from archive --- .github/workflows/azure_ci.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/azure_ci.yml b/.github/workflows/azure_ci.yml index f848e775..1fb4b4eb 100644 --- a/.github/workflows/azure_ci.yml +++ b/.github/workflows/azure_ci.yml @@ -30,7 +30,12 @@ jobs: - name: Rebuild HDBSCAN libraries shell: bash -l {0} run: | - pip install --upgrade git+https://github.com/scikit-learn-contrib/hdbscan.git#egg=hdbscan + wget https://github.com/scikit-learn-contrib/hdbscan/archive/master.zip + unzip master.zip + rm master.zip + pushd hdbscan-master + python setup.py install + popd - name: Install and run_test.py shell: bash -l {0} run: | From 62a5fadc84815cf3f7888f385cc22f908f4d5fde Mon Sep 17 00:00:00 2001 From: John Lees Date: Thu, 24 Aug 2023 11:26:38 +0100 Subject: [PATCH 05/12] Revert CI change --- .github/workflows/azure_ci.yml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/.github/workflows/azure_ci.yml b/.github/workflows/azure_ci.yml index 1fb4b4eb..e2ba7539 100644 --- a/.github/workflows/azure_ci.yml +++ b/.github/workflows/azure_ci.yml @@ -27,15 +27,6 @@ jobs: micromamba-version: '1.4.6-0' environment-file: environment.yml cache-environment: true - - name: Rebuild HDBSCAN libraries - shell: bash -l {0} - run: | - wget https://github.com/scikit-learn-contrib/hdbscan/archive/master.zip - unzip master.zip - rm master.zip - pushd hdbscan-master - python setup.py install - popd - name: Install and run_test.py shell: bash -l {0} run: | From ea093b153a5bff6a3cd3f0075954d3f12d0fec2e Mon Sep 17 00:00:00 2001 From: John Lees Date: Fri, 25 Aug 2023 08:54:42 +0100 Subject: [PATCH 06/12] temporarily turn off mamba cache --- .github/workflows/azure_ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/azure_ci.yml b/.github/workflows/azure_ci.yml index e2ba7539..856f2ad2 100644 --- a/.github/workflows/azure_ci.yml +++ b/.github/workflows/azure_ci.yml @@ -26,7 +26,7 @@ jobs: with: micromamba-version: '1.4.6-0' environment-file: environment.yml - cache-environment: true + cache-environment: false - name: Install and run_test.py shell: bash -l {0} run: | From 2f940e8635f826fd18fd1e45528cfe6a72d4d8a0 Mon Sep 17 00:00:00 2001 From: John Lees Date: Fri, 25 Aug 2023 08:54:42 +0100 Subject: [PATCH 07/12] Revert "temporarily turn off mamba cache" This reverts commit ea093b153a5bff6a3cd3f0075954d3f12d0fec2e. --- .github/workflows/azure_ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/azure_ci.yml b/.github/workflows/azure_ci.yml index 856f2ad2..e2ba7539 100644 --- a/.github/workflows/azure_ci.yml +++ b/.github/workflows/azure_ci.yml @@ -26,7 +26,7 @@ jobs: with: micromamba-version: '1.4.6-0' environment-file: environment.yml - cache-environment: false + cache-environment: true - name: Install and run_test.py shell: bash -l {0} run: | From 7be0813275cc4db2f88ddf8b9e1a861a3aca5c29 Mon Sep 17 00:00:00 2001 From: John Lees Date: Fri, 25 Aug 2023 09:45:11 +0100 Subject: [PATCH 08/12] Need to take len of network objects --- .github/workflows/azure_ci.yml | 2 +- PopPUNK/network.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/azure_ci.yml b/.github/workflows/azure_ci.yml index e2ba7539..856f2ad2 100644 --- a/.github/workflows/azure_ci.yml +++ b/.github/workflows/azure_ci.yml @@ -26,7 +26,7 @@ jobs: with: micromamba-version: '1.4.6-0' environment-file: environment.yml - cache-environment: true + cache-environment: false - name: Install and run_test.py shell: bash -l {0} run: | diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 07bc7674..27ecb8be 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -529,8 +529,9 @@ def network_to_edges(prev_G_fn, rlist, adding_qq_dists = False, exit(1) edge_weights = list(prev_G.ep['weight']) - if len(old_ids) != max(old_source_ids, old_target_ids) + 1: - sys.stderr.write(f"Network size {max(old_source_ids, old_target_ids) + 1} does " + network_size = max(len(old_source_ids), len(old_target_ids)) + 1 + if len(old_ids) != network_size: + sys.stderr.write(f"Network size {network_size} does " f"not match rlist/qlist size {len(old_ids)}\n") sys.exit(1) From 2fef9566c7f1e0557463f7d5f2e14f3278b8bb6e Mon Sep 17 00:00:00 2001 From: John Lees Date: Fri, 25 Aug 2023 09:48:02 +0100 Subject: [PATCH 09/12] Renew mamba CI cache each day --- .github/workflows/azure_ci.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/azure_ci.yml b/.github/workflows/azure_ci.yml index 856f2ad2..dad7477d 100644 --- a/.github/workflows/azure_ci.yml +++ b/.github/workflows/azure_ci.yml @@ -21,12 +21,17 @@ jobs: uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} + - name: Get current date + id: date + run: echo "date=$(date +%Y-%m-%d)" >> "${GITHUB_OUTPUT}" - name: Install Conda environment from environment.yml uses: mamba-org/setup-micromamba@v1 with: micromamba-version: '1.4.6-0' environment-file: environment.yml - cache-environment: false + # persist on the same day. + cache-environment-key: environment-${{ steps.date.outputs.date }} + cache-downloads-key: downloads-${{ steps.date.outputs.date }} - name: Install and run_test.py shell: bash -l {0} run: | From 7131009b6e445d0f1bccb0ae5f7b84bbdd9e7517 Mon Sep 17 00:00:00 2001 From: John Lees Date: Fri, 25 Aug 2023 10:12:18 +0100 Subject: [PATCH 10/12] Extract attribute first --- PopPUNK/network.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 27ecb8be..97e6108c 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -517,10 +517,12 @@ def network_to_edges(prev_G_fn, rlist, adding_qq_dists = False, G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True) old_source_ids = G_df['source'].astype('int32').to_arrow().to_pylist() old_target_ids = G_df['destination'].astype('int32').to_arrow().to_pylist() + network_size = max(max(old_source_ids), max(old_target_ids)) + 1 else: # get the source and target nodes old_source_ids = gt.edge_endpoint_property(prev_G, prev_G.vertex_index, "source") old_target_ids = gt.edge_endpoint_property(prev_G, prev_G.vertex_index, "target") + network_size = max(max(old_source_ids.a), max(old_target_ids.a)) + 1 # get the weights if weights: if prev_G.edge_properties.keys() is None or 'weight' not in prev_G.edge_properties.keys(): @@ -529,7 +531,6 @@ def network_to_edges(prev_G_fn, rlist, adding_qq_dists = False, exit(1) edge_weights = list(prev_G.ep['weight']) - network_size = max(len(old_source_ids), len(old_target_ids)) + 1 if len(old_ids) != network_size: sys.stderr.write(f"Network size {network_size} does " f"not match rlist/qlist size {len(old_ids)}\n") From 60825f920e86987be2bff643b1ff6bce17544727 Mon Sep 17 00:00:00 2001 From: John Lees Date: Fri, 25 Aug 2023 10:34:03 +0100 Subject: [PATCH 11/12] Throw error earlier --- PopPUNK/assign.py | 3 ++- PopPUNK/network.py | 22 ++++++++++------------ 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 54937eff..669e138c 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -509,8 +509,9 @@ def assign_query_hdf5(dbFuncs, n_vertices = len(get_vertex_list(genomeNetwork, use_gpu = gpu_graph)) if n_vertices != len(rNames): - sys.stderr.write(f"There are {n_vertices} vertices in the network but {len(rNames)} reference names supplied; " + \ + sys.stderr.write(f"ERROR: There are {n_vertices} vertices in the network but {len(rNames)} reference names supplied; " + \ "please check the '--model-dir' variable is pointing to the correct directory\n") + sys.exit(1) if model.type == 'lineage': # Assign lineages by calculating query-query information diff --git a/PopPUNK/network.py b/PopPUNK/network.py index 97e6108c..1c1be972 100644 --- a/PopPUNK/network.py +++ b/PopPUNK/network.py @@ -517,12 +517,10 @@ def network_to_edges(prev_G_fn, rlist, adding_qq_dists = False, G_df.rename(columns={'src': 'source','dst': 'destination'}, inplace=True) old_source_ids = G_df['source'].astype('int32').to_arrow().to_pylist() old_target_ids = G_df['destination'].astype('int32').to_arrow().to_pylist() - network_size = max(max(old_source_ids), max(old_target_ids)) + 1 else: # get the source and target nodes old_source_ids = gt.edge_endpoint_property(prev_G, prev_G.vertex_index, "source") old_target_ids = gt.edge_endpoint_property(prev_G, prev_G.vertex_index, "target") - network_size = max(max(old_source_ids.a), max(old_target_ids.a)) + 1 # get the weights if weights: if prev_G.edge_properties.keys() is None or 'weight' not in prev_G.edge_properties.keys(): @@ -531,11 +529,6 @@ def network_to_edges(prev_G_fn, rlist, adding_qq_dists = False, exit(1) edge_weights = list(prev_G.ep['weight']) - if len(old_ids) != network_size: - sys.stderr.write(f"Network size {network_size} does " - f"not match rlist/qlist size {len(old_ids)}\n") - sys.exit(1) - # If appending queries to an existing network, then the recovered links can be left # unchanged, as the new IDs are the queries, and the existing sequences will not be found # in the list of IDs @@ -543,11 +536,16 @@ def network_to_edges(prev_G_fn, rlist, adding_qq_dists = False, source_ids = old_source_ids target_ids = old_target_ids else: - # Update IDs to new versions - old_id_indices = [rlist.index(x) for x in old_ids] - # translate to indices - source_ids = [old_id_indices[x] for x in old_source_ids] - target_ids = [old_id_indices[x] for x in old_target_ids] + try: + # Update IDs to new versions + old_id_indices = [rlist.index(x) for x in old_ids] + # translate to indices + source_ids = [old_id_indices[x] for x in old_source_ids] + target_ids = [old_id_indices[x] for x in old_target_ids] + except ValueError: + sys.stderr.write(f"Network size mismatch. Previous network nodes: {max(old_id_indices)}." + f"New network nodes: {max(old_source_ids.a)}/{max(old_target_ids.a)}\n") + sys.exit(1) # return values if weights: From 30731926168113215dc3cc289f97655e8da9696d Mon Sep 17 00:00:00 2001 From: Harry Hung <4848896+HarryHung@users.noreply.github.com> Date: Fri, 8 Sep 2023 15:21:33 +0100 Subject: [PATCH 12/12] Fix overwrite, correct error messages --- PopPUNK/assign.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/PopPUNK/assign.py b/PopPUNK/assign.py index 669e138c..dd4642a0 100644 --- a/PopPUNK/assign.py +++ b/PopPUNK/assign.py @@ -271,8 +271,8 @@ def assign_query(dbFuncs, constructDatabase = dbFuncs['constructDatabase'] readDBParams = dbFuncs['readDBParams'] - if ref_db == output: - sys.stderr.write("--output and --ref-db must be different to " + if ref_db == output and overwrite == False: + sys.stderr.write("--output and --db must be different to " "prevent overwrite.\n") sys.exit(1) @@ -387,7 +387,7 @@ def assign_query_hdf5(dbFuncs, getSeqsInDb = dbFuncs['getSeqsInDb'] if ref_db == output and overwrite == False: - sys.stderr.write("--output and --ref-db must be different to " + sys.stderr.write("--output and --db must be different to " "prevent overwrite.\n") sys.exit(1) if (update_db and not distances):