Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add plink converter function #515

Open
wants to merge 18 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
167 changes: 167 additions & 0 deletions malariagen_data/anoph/to_plink.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
from typing import Optional

import allel # type: ignore
import numpy as np
import os
import bed_reader
from dask.diagnostics import ProgressBar

from .snp_data import AnophelesSnpData
from . import base_params


class PlinkConverter(
AnophelesSnpData,
):
def __init__(
self,
**kwargs,
):
# N.B., this class is designed to work cooperatively, and
# so it's important that any remaining parameters are passed
# to the superclass constructor.
super().__init__(**kwargs)

def _create_plink_outfile(
self,
*,
results_dir,
region,
n_snps,
min_minor_ac,
thin_offset,
max_missing_an,
):
return f"{results_dir}/{region}.{n_snps}.{min_minor_ac}.{thin_offset}.{max_missing_an}"

def _biallelic_snps_to_plink(
self,
*,
results_dir,
region,
n_snps,
thin_offset,
sample_sets,
sample_query,
sample_indices,
site_mask,
min_minor_ac,
max_missing_an,
random_seed,
inline_array,
chunks,
):
# Define output file
plink_file_path = self._create_plink_outfile(
results_dir=results_dir,
region=region,
n_snps=n_snps,
thin_offset=thin_offset,
min_minor_ac=min_minor_ac,
max_missing_an=max_missing_an,
)

bed_file_path = f"{plink_file_path}.bed"
if os.path.exists(bed_file_path):
return plink_file_path
Comment on lines +65 to +66
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Slightly concerned here that if a user changes their mind about something, like what sample sets to use, then reruns this function, the new file will not be written.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Possibly consider adding an overwrite parameter which is True by default, but could be set to False to avoid recomputation.


# Get snps
ds_snps = self.biallelic_snp_calls(
region=region,
sample_sets=sample_sets,
sample_query=sample_query,
sample_indices=sample_indices,
site_mask=site_mask,
min_minor_ac=min_minor_ac,
max_missing_an=max_missing_an,
n_snps=n_snps,
thin_offset=thin_offset,
random_seed=random_seed,
inline_array=inline_array,
chunks=chunks,
)

# Set up dataset with required vars for plink conversion
ds_snps_asc = ds_snps[
[
"variant_contig",
"variant_position",
"variant_allele",
"sample_id",
"call_genotype",
]
]
Comment on lines +84 to +93
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this necessary? Could just access ds_snps directly.


# Compute gt ref counts
with self._spinner("Computing genotype ref counts"):
gt_asc = ds_snps_asc["call_genotype"].data.compute()
gn_ref = allel.GenotypeDaskArray(gt_asc).to_n_ref(fill=-127)
with ProgressBar():
gn_ref = gn_ref.compute()
Comment on lines +96 to +100
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
with self._spinner("Computing genotype ref counts"):
gt_asc = ds_snps_asc["call_genotype"].data.compute()
gn_ref = allel.GenotypeDaskArray(gt_asc).to_n_ref(fill=-127)
with ProgressBar():
gn_ref = gn_ref.compute()
with self._dask_progress("Computing genotype ref counts"):
gt_asc = ds_snps_asc["call_genotype"].data # dask array
gn_ref = allel.GenotypeDaskArray(gt_asc).to_n_ref(fill=-127)
gn_ref = gn_ref.compute()


# Ensure genotypes vary
loc_var = np.any(gn_ref != gn_ref[:, 0, np.newaxis], axis=1)

# Load final data
with ProgressBar():
ds_snps_final = ds_snps_asc[
["variant_contig", "variant_position", "variant_allele", "sample_id"]
].isel(variants=loc_var)
Comment on lines +105 to +109
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# Load final data
with ProgressBar():
ds_snps_final = ds_snps_asc[
["variant_contig", "variant_position", "variant_allele", "sample_id"]
].isel(variants=loc_var)
# Load final data
ds_snps_final = dask_compress_dataset(ds_snps_asc, loc_var, dim="variants")

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The function dask_compress_dataset() can be imported from the util module, and provides an optimised implementation of selecting from a dataset using a boolean indexer.


# Init vars for input to bed reader
gn_ref_final = gn_ref[loc_var]
val = gn_ref_final.T
alleles = ds_snps_final["variant_allele"].values
properties = {
"iid": ds_snps_final["sample_id"].values,
"chromosome": ds_snps_final["variant_contig"].values,
"bp_position": ds_snps_final["variant_position"].values,
"allele_1": alleles[:, 0],
"allele_2": alleles[:, 1],
}
Comment on lines +114 to +121
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could consider using a spinner...

Suggested change
alleles = ds_snps_final["variant_allele"].values
properties = {
"iid": ds_snps_final["sample_id"].values,
"chromosome": ds_snps_final["variant_contig"].values,
"bp_position": ds_snps_final["variant_position"].values,
"allele_1": alleles[:, 0],
"allele_2": alleles[:, 1],
}
with self._spinner("Prepare output data"):
alleles = ds_snps_final["variant_allele"].values
properties = {
"iid": ds_snps_final["sample_id"].values,
"chromosome": ds_snps_final["variant_contig"].values,
"bp_position": ds_snps_final["variant_position"].values,
"allele_1": alleles[:, 0],
"allele_2": alleles[:, 1],
}


bed_reader.to_bed(
filepath=bed_file_path,
val=val,
properties=properties,
count_A1=True,
)

print(f"PLINK files written to to: {plink_file_path}")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Remove print statement.


return plink_file_path

def biallelic_snps_to_plink(
self,
results_dir,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit, consider output_dir?

region: base_params.regions,
n_snps: base_params.n_snps,
thin_offset: base_params.thin_offset = 0,
sample_sets: Optional[base_params.sample_sets] = None,
sample_query: Optional[base_params.sample_query] = None,
sample_indices: Optional[base_params.sample_indices] = None,
site_mask: Optional[base_params.site_mask] = None,
min_minor_ac: Optional[base_params.min_minor_ac] = 0,
max_missing_an: Optional[base_params.max_missing_an] = 0,
Comment on lines +144 to +145
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider using same defaults as PCA and NJT here?

random_seed: base_params.random_seed = 42,
inline_array: base_params.inline_array = base_params.inline_array_default,
chunks: base_params.chunks = base_params.native_chunks,
):
params = dict(
results_dir=results_dir,
region=region,
n_snps=n_snps,
thin_offset=thin_offset,
sample_sets=sample_sets,
sample_query=sample_query,
sample_indices=sample_indices,
site_mask=site_mask,
min_minor_ac=min_minor_ac,
max_missing_an=max_missing_an,
random_seed=random_seed,
)

filepath = self._biallelic_snps_to_plink(
inline_array=inline_array, chunks=chunks, **params
)
return filepath
2 changes: 2 additions & 0 deletions malariagen_data/anopheles.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
from .anoph.pca import AnophelesPca
from .anoph.sample_metadata import AnophelesSampleMetadata, locate_cohorts
from .anoph.snp_data import AnophelesSnpData
from .anoph.to_plink import PlinkConverter
from .anoph.g123 import AnophelesG123Analysis
from .anoph.fst import AnophelesFstAnalysis
from .anoph.h12 import AnophelesH12Analysis
Expand Down Expand Up @@ -105,6 +106,7 @@ class AnophelesDataResource(
AnophelesG123Analysis,
AnophelesFstAnalysis,
AnophelesSnpFrequencyAnalysis,
PlinkConverter,
AnophelesPca,
AnophelesIgv,
AnophelesAimData,
Expand Down
Loading