Skip to content

Commit

Permalink
Updated utils.input_validator.validate_genome_and_fetch_urls to obt…
Browse files Browse the repository at this point in the history
…ain `available_server` more explicitly. Previously, availability was judged from the HTTP response code alone, but the UCSC Genome Browser sometimes returned a normal (200) response while it was internally in an error state. With this change, the check is made explicit: the fetched HTML is searched for specific keywords that appear only on a normally functioning page, and a server is treated as available only when those keywords are found.
  • Loading branch information
akikuno committed Jan 16, 2024
1 parent b83669c commit 24a0259
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 40 deletions.
2 changes: 1 addition & 1 deletion src/DAJIN2/core/preprocess/genome_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ def fetch_seq_coordinates(genome: str, blat_url: str, seq: str) -> dict:
response = urlopen(url).read().decode("utf8").split("\n")
matches = [x for x in response if "100.0%" in x]
if not matches:
raise ValueError(f"{seq[:10]}... is not found in {genome}")
raise ValueError(f"{seq[:60]}... is not found in {genome}")
chrom, strand, start, end, _ = matches[0].split()[-5:]
return {"chrom": chrom, "strand": strand, "start": int(start), "end": int(end)}

Expand Down
26 changes: 14 additions & 12 deletions src/DAJIN2/utils/input_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,20 +97,22 @@ def exists_cached_genome(genome: str, tempdir: Path, exists_cache_control: bool)
########################################################################


def get_html(url: str) -> str:
    """Fetch ``url`` and return its body as text, or ``""`` if it cannot be fetched.

    The request uses a 10-second timeout. Network-level failures are reported
    as an empty string instead of an exception, so callers can simply test
    whether the expected content is present in the result.

    Args:
        url: Address of the page to download.

    Returns:
        The response body decoded as UTF-8 (undecodable bytes are replaced),
        or an empty string when the request fails.
    """
    try:
        with urlopen(url, timeout=10) as response:
            # The page is only searched for marker text, so a stray non-UTF-8
            # byte should not turn an otherwise healthy response into an error.
            return response.read().decode("utf-8", errors="replace")
    except OSError:
        # URLError is a subclass of OSError. Catching OSError additionally
        # covers socket timeouts and connection resets raised while reading
        # the body, which urllib does not wrap in URLError.
        return ""


def get_first_available_url(urls: list[str]) -> str | None:
return next((url for url in urls if is_webpage_available(url)), None)
def format_url(key: str, url: str, genome: str = "hg38") -> str:
    """Build the URL that is actually requested when probing a server.

    For the ``"goldenpath"`` key, the genome directory is appended to the
    base URL; for every other key the URL is returned unchanged.

    Args:
        key: Server category name (e.g. ``"blat"``, ``"das"``, ``"goldenpath"``).
        url: Base URL of the server.
        genome: Genome directory appended for ``"goldenpath"``. Defaults to
            ``"hg38"``, the value that was previously hard-coded.

    Returns:
        The URL to request for this server category.
    """
    if key != "goldenpath":
        return url
    # Drop any trailing slash first so the result never contains "//".
    return f"{url.rstrip('/')}/{genome}"


def get_first_available_url(key: str, urls: list[str]) -> str | None:
    """Return the first URL whose page contains the marker text for ``key``.

    Each candidate is fetched in order (after being passed through
    ``format_url``), and the first one whose HTML contains the expected
    marker string is returned. An HTTP 200 alone is not taken as proof that
    the server works; the marker text must be present.

    Args:
        key: Server category; one of ``"blat"``, ``"das"`` or ``"goldenpath"``.
        urls: Candidate base URLs, tried in the given order.

    Returns:
        The first URL that passes the check, or ``None`` if none does.
    """
    search_keys = {"blat": "BLAT Search Genome", "das": "GRCh38/hg38", "goldenpath": "bigZips"}
    for candidate in urls:
        if search_keys[key] in get_html(format_url(key, candidate)):
            return candidate
    return None


def fetch_xml_data(url: str) -> bytes:
Expand Down Expand Up @@ -150,11 +152,11 @@ def validate_genome_and_fetch_urls(genome: str) -> dict[str, str]:
],
"goldenpath": [
"https://hgdownload.cse.ucsc.edu/goldenPath",
"http://hgdownload-euro.soe.ucsc.edu/goldenPath",
"https://hgdownload.soe.ucsc.edu/goldenPath",
],
}

available_servers = {key: get_first_available_url(urls) for key, urls in server_lists.items()}
available_servers = {key: get_first_available_url(key, urls) for key, urls in server_lists.items()}

error_messages = {
"blat": "All UCSC blat servers are currently down. Please wait for a while and try again.",
Expand All @@ -163,7 +165,7 @@ def validate_genome_and_fetch_urls(genome: str) -> dict[str, str]:
}

for key, message in error_messages.items():
if not available_servers[key]:
if available_servers[key] is None:
raise URLError(message)

if not is_genome_in_ucsc_ids(genome, available_servers["das"]):
Expand Down
4 changes: 2 additions & 2 deletions tests/src/preprocess/test_genome_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,10 @@ def test_fetch_seq_coodinates_strand_minus():
def test_fetch_seq_coodinates_error():
genome = "mm39"
blat_url = "https://genome.ucsc.edu/cgi-bin/hgBlat"
seq = "XXXXXXXXXXXXXXXXX"
seq = "X" * 100
with pytest.raises(ValueError) as e:
genome_fetcher.fetch_seq_coordinates(genome, blat_url, seq)
assert str(e.value) == f"{seq[:10]}... is not found in {genome}"
assert str(e.value) == f"{seq[:60]}... is not found in {genome}"


@pytest.mark.slow
Expand Down
25 changes: 0 additions & 25 deletions tests/src/utils/test_validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,31 +79,6 @@ def test_fasta_without_error():
# validate URL
###############################################################################


@pytest.mark.slow
def test_available_url_pass():
assert input_validator.is_webpage_available("https://example.com") is True


@pytest.mark.slow
def test_available_url_fail():
assert input_validator.is_webpage_available("https://example_xxx.com") is False


@pytest.mark.slow
def test_get_first_available_url():
test = input_validator.get_first_available_url(["https://example_xxx.com", "https://example.com"])
answer = "https://example.com"
assert test == answer


@pytest.mark.slow
def test_get_first_available_url_not_found():
test = input_validator.get_first_available_url(["https://example_xxx.com", "https://example_yyy.com"])
answer = None
assert test == answer


@pytest.mark.slow
def test_available_genome_pass():
genome = "mm10"
Expand Down

0 comments on commit 24a0259

Please sign in to comment.