Skip to content

Commit

Permalink
Added exclusion criteria for ENA runs with missing collection date
Browse files Browse the repository at this point in the history
Added test for collection date filter
Added command line option to disable filter criteria (other viruses)
  • Loading branch information
johausmann committed Nov 7, 2023
1 parent b652204 commit f02c5f4
Show file tree
Hide file tree
Showing 5 changed files with 98 additions and 5 deletions.
2 changes: 1 addition & 1 deletion covigator/accessor/abstract_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def _parse_country(self, sample: Union[SampleEna, SampleCovid19]):
sample.continent_alpha_2 = parsed_country.continent_alpha_2
sample.continent = parsed_country.continent

def _parse_dates(self, sample: Union[SampleEna, SampleCovid19], disable_minimum_date: bool=False):
def _parse_dates(self, sample: Union[SampleEna, SampleCovid19], disable_minimum_date: bool = False):
sample.collection_date = _parse_abstract(sample.collection_date, date.fromisoformat)
sample.first_created = _parse_abstract(sample.first_created, date.fromisoformat)
if not disable_minimum_date:
Expand Down
14 changes: 13 additions & 1 deletion covigator/accessor/ena_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ class EnaAccessor(AbstractAccessor):
]

def __init__(self, tax_id: str, host_tax_id: str, database: Database, maximum=None,
disable_minimum_date: bool=False):
disable_minimum_date: bool = False, disable_collection_date: bool = False):

super().__init__()
logger.info("Initialising ENA accessor")
Expand All @@ -92,11 +92,15 @@ def __init__(self, tax_id: str, host_tax_id: str, database: Database, maximum=No
self.disable_minimum_date = disable_minimum_date
if self.disable_minimum_date:
logger.info("Disabling minimum date filter criteria")
self.disable_collection_date_filter = disable_collection_date
if self.disable_collection_date_filter:
logger.info("Disabling empty collection date filter criteria")

self.excluded_samples_by_host_tax_id = {}
self.excluded_samples_by_fastq_ftp = 0
self.excluded_samples_by_instrument_platform = {}
self.excluded_samples_by_library_strategy = {}
self.excluded_samples_by_empty_collection_date = {}
self.excluded_existing = 0
self.included = 0
self.excluded = 0
Expand Down Expand Up @@ -191,6 +195,12 @@ def _parse_ena_run(self, run):
def _complies_with_inclusion_criteria(self, ena_run: dict):
# NOTE: this uses the original dictionary instead of the parsed SampleEna class for performance reasons
included = True
# Skip samples with an empty collection date. Note: this does not check whether the collection date is too early
if not self.disable_collection_date_filter:
collection_date = ena_run.get("collection_date")
if collection_date is None or collection_date.strip() == "":
included = False
self.excluded_samples_by_empty_collection_date += 1
# Skip host id filter if data is not available for selected Virus
if self.host_tax_id_filter:
host_tax_id = ena_run.get("host_tax_id")
Expand All @@ -212,6 +222,7 @@ def _complies_with_inclusion_criteria(self, ena_run: dict):
included = False # skips not included library strategies
self.excluded_samples_by_library_strategy[str(library_strategy)] = \
self.excluded_samples_by_library_strategy.get(str(library_strategy), 0) + 1
# Count the run as excluded when any of the criteria above failed
if not included:
self.excluded += 1
return included
Expand All @@ -224,6 +235,7 @@ def _log_results(self):
logger.info("Excluded by platform runs = {}".format(self.excluded_samples_by_instrument_platform))
logger.info("Excluded by host if runs = {}".format(self.excluded_samples_by_host_tax_id))
logger.info("Excluded by library strategy = {}".format(self.excluded_samples_by_library_strategy))
logger.info("Excluded by empty collection date = {}".format(self.excluded_samples_by_empty_collection_date))

def _write_execution_log(self, session: Session):
end_time = datetime.now()
Expand Down
9 changes: 8 additions & 1 deletion covigator/command_line.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,22 @@ def ena_accessor():
help="disable minimum date exclusion criteria",
action="store_true"
)
parser.add_argument(
"--disable-collection-date",
dest="disable_collection_date",
help="disable collection date exclusion criteria",
action="store_true"
)

args = parser.parse_args()
tax_id = args.tax_id
host_tax_id = args.host_tax_id
disable_minimum_date = args.disable_minimum_date
disable_exclusion_date = args.disable_exclusion_date
config = Configuration(verbose=True)
covigator.configuration.initialise_logs(config.logfile_accesor)
EnaAccessor(tax_id=tax_id, host_tax_id=host_tax_id, database=Database(config=config, initialize=True),
disable_minimum_date=disable_minimum_date).access()
disable_minimum_date=disable_minimum_date, disable_collection_date=disable_exclusion_date).access()


def covid19_portal_accessor():
Expand Down
5 changes: 3 additions & 2 deletions covigator/tests/unit_tests/faked_objects.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,12 @@ def __init__(self):

class FakeEnaAccessor(EnaAccessor):

def __init__(self, results, database=None, host_tax_id=HOMO_SAPIENS_TAXID, disable_minimum_date=False):
def __init__(self, results, database=None, host_tax_id=HOMO_SAPIENS_TAXID, disable_minimum_date=False,
disable_collection_date=False):
# uses an in memory database or the one provided
super().__init__(tax_id=SARS_COV_2_TAXID, host_tax_id=host_tax_id,
database=database if database else Database(test=True, config=Configuration()),
disable_minimum_date=disable_minimum_date)
disable_minimum_date=disable_minimum_date, disable_collection_date=disable_collection_date)
self.results = results

def _get_ena_runs_page(self):
Expand Down
73 changes: 73 additions & 0 deletions covigator/tests/unit_tests/test_ena_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,79 @@ def test_filtering_by_missing_fastqs(self):
self.assertEqual(ena_accessor.excluded, 2)
self.assertEqual(ena_accessor.excluded_samples_by_fastq_ftp, 2)

def test_filtering_empty_collection_date(self):
    """Check the empty-collection-date exclusion criterion and its opt-out switch.

    The same three ENA runs are fed to two accessors; only the third run has
    an empty ``collection_date``.

    * Default accessor: the run with the empty collection date is expected to
      be excluded (2 included, 1 excluded, 1 counted as excluded by empty
      collection date).
    * Accessor created with ``disable_collection_date=True``: all three runs
      are expected to be included and nothing counted as excluded.
    """

    def build_runs():
        # Return a fresh list of fresh dicts on every call so that the two
        # scenarios never share run objects with each other.
        return [
            {"run_accession": "ERR4080483",
             "scientific_name": "Severe acute respiratory syndrome coronavirus 2",
             "instrument_platform": "ILLUMINA",
             "library_strategy": "WGS",
             "fastq_ftp": "ftp.sra.ebi.ac.uk/vol1/fastq/ERR408/003/ERR4080483/ERR4080483_1.fastq.gz",
             "fastq_md5": "a91a9dfa2f7008e13a7ce9767aa9aaf3",
             "host_tax_id": "9606",
             "first_created": "2020-01-01",
             "collection_date": "2019-12-31",
             },
            {"run_accession": "ERR4080484",
             "scientific_name": "Severe acute respiratory syndrome coronavirus 2",
             "instrument_platform": "ILLUMINA",
             "library_strategy": "WGS",
             "fastq_ftp": "ftp.sra.ebi.ac.uk/vol1/fastq/ERR408/003/ERR4080483/ERR4080483_1.fastq.gz",
             "fastq_md5": "c57fef34933cbbec2e9e08867f3c664c",
             "host_tax_id": "9606",
             "first_created": "2020-01-01 14:50",
             "collection_date": "2019-12-31 12:12:12"},
            {"run_accession": "ERR4080485",
             "scientific_name": "Severe acute respiratory syndrome coronavirus 2",
             "instrument_platform": "ILLUMINA",
             "library_strategy": "WGS",
             "fastq_ftp": "ftp.sra.ebi.ac.uk/vol1/fastq/ERR408/005/ERR4080485/ERR4080485_1.fastq.gz",
             "fastq_md5": "4de269d2b5831e1c5175586af694d21e",
             "host_tax_id": "9606",
             "first_created": "",
             "collection_date": ""},
        ]

    # Test that samples with empty collection date are filtered out
    ena_accessor_empty_collection = FakeEnaAccessor(results=build_runs())
    ena_accessor_empty_collection.access()
    self.assertEqual(ena_accessor_empty_collection.included, 2)
    # compare the excluded counter, not the accessor object itself
    self.assertEqual(ena_accessor_empty_collection.excluded, 1)
    self.assertEqual(ena_accessor_empty_collection.excluded_samples_by_empty_collection_date, 1)

    # Test that collection date filter can be disabled
    ena_accessor_empty_collection_disabled = FakeEnaAccessor(
        results=build_runs(), disable_collection_date=True)
    # run and inspect the accessor that has the filter disabled; asserting on
    # the first accessor again would leave the opt-out path untested
    ena_accessor_empty_collection_disabled.access()
    self.assertEqual(ena_accessor_empty_collection_disabled.included, 3)
    self.assertEqual(ena_accessor_empty_collection_disabled.excluded, 0)
    self.assertEqual(ena_accessor_empty_collection_disabled.excluded_samples_by_empty_collection_date, 0)

def test_no_filtering(self):
ena_accessor = FakeEnaAccessor([
{"run_accession": "ERR4080483",
Expand Down

0 comments on commit f02c5f4

Please sign in to comment.