diff --git a/src/DAJIN2/core/preprocess/genome_fetcher.py b/src/DAJIN2/core/preprocess/genome_fetcher.py
index 90db0a96..0f75ec7f 100644
--- a/src/DAJIN2/core/preprocess/genome_fetcher.py
+++ b/src/DAJIN2/core/preprocess/genome_fetcher.py
@@ -8,7 +8,7 @@ def fetch_seq_coordinates(genome: str, blat_url: str, seq: str) -> dict:
response = urlopen(url).read().decode("utf8").split("\n")
matches = [x for x in response if "100.0%" in x]
if not matches:
- raise ValueError(f"{seq[:10]}... is not found in {genome}")
+ raise ValueError(f"{seq[:60]}... is not found in {genome}")
chrom, strand, start, end, _ = matches[0].split()[-5:]
return {"chrom": chrom, "strand": strand, "start": int(start), "end": int(end)}
diff --git a/src/DAJIN2/utils/input_validator.py b/src/DAJIN2/utils/input_validator.py
index a10aa5c9..df775395 100644
--- a/src/DAJIN2/utils/input_validator.py
+++ b/src/DAJIN2/utils/input_validator.py
@@ -97,20 +97,22 @@ def exists_cached_genome(genome: str, tempdir: Path, exists_cache_control: bool)
########################################################################
-def is_webpage_available(url: str) -> bool:
+def get_html(url: str) -> str:
try:
- with urlopen(url) as response:
+ with urlopen(url, timeout=10) as response:
html = response.read().decode("utf-8")
- if "TITLE" not in html:
- return True
- title = next((h for h in html.split("\n") if "<TITLE>" in h), "")
- return "Error" not in title
+ return html
except URLError:
- return False
+ return ""
-def get_first_available_url(urls: list[str]) -> str | None:
- return next((url for url in urls if is_webpage_available(url)), None)
+def format_url(key: str, url: str) -> str:
+ return url + "/hg38" if key == "goldenpath" else url
+
+
+def get_first_available_url(key: str, urls: list[str]) -> str | None:
+ search_keys = {"blat": "BLAT Search Genome", "das": "GRCh38/hg38", "goldenpath": "bigZips"}
+ return next((url for url in urls if search_keys[key] in get_html(format_url(key, url))), None)
def fetch_xml_data(url: str) -> bytes:
@@ -150,11 +152,11 @@ def validate_genome_and_fetch_urls(genome: str) -> dict[str, str]:
],
"goldenpath": [
"https://hgdownload.cse.ucsc.edu/goldenPath",
- "http://hgdownload-euro.soe.ucsc.edu/goldenPath",
+ "https://hgdownload.soe.ucsc.edu/goldenPath",
],
}
- available_servers = {key: get_first_available_url(urls) for key, urls in server_lists.items()}
+ available_servers = {key: get_first_available_url(key, urls) for key, urls in server_lists.items()}
error_messages = {
"blat": "All UCSC blat servers are currently down. Please wait for a while and try again.",
@@ -163,7 +165,7 @@ def validate_genome_and_fetch_urls(genome: str) -> dict[str, str]:
}
for key, message in error_messages.items():
- if not available_servers[key]:
+ if available_servers[key] is None:
raise URLError(message)
if not is_genome_in_ucsc_ids(genome, available_servers["das"]):
diff --git a/tests/src/preprocess/test_genome_fetcher.py b/tests/src/preprocess/test_genome_fetcher.py
index 0dae046d..1b825e23 100644
--- a/tests/src/preprocess/test_genome_fetcher.py
+++ b/tests/src/preprocess/test_genome_fetcher.py
@@ -28,10 +28,10 @@ def test_fetch_seq_coodinates_strand_minus():
def test_fetch_seq_coodinates_error():
genome = "mm39"
blat_url = "https://genome.ucsc.edu/cgi-bin/hgBlat"
- seq = "XXXXXXXXXXXXXXXXX"
+ seq = "X" * 100
with pytest.raises(ValueError) as e:
genome_fetcher.fetch_seq_coordinates(genome, blat_url, seq)
- assert str(e.value) == f"{seq[:10]}... is not found in {genome}"
+ assert str(e.value) == f"{seq[:60]}... is not found in {genome}"
@pytest.mark.slow
diff --git a/tests/src/utils/test_validator.py b/tests/src/utils/test_validator.py
index 0f8b7717..7c46003e 100644
--- a/tests/src/utils/test_validator.py
+++ b/tests/src/utils/test_validator.py
@@ -79,31 +79,6 @@ def test_fasta_without_error():
# validate URL
###############################################################################
-
-@pytest.mark.slow
-def test_available_url_pass():
- assert input_validator.is_webpage_available("https://example.com") is True
-
-
-@pytest.mark.slow
-def test_available_url_fail():
- assert input_validator.is_webpage_available("https://example_xxx.com") is False
-
-
-@pytest.mark.slow
-def test_get_first_available_url():
- test = input_validator.get_first_available_url(["https://example_xxx.com", "https://example.com"])
- answer = "https://example.com"
- assert test == answer
-
-
-@pytest.mark.slow
-def test_get_first_available_url_not_found():
- test = input_validator.get_first_available_url(["https://example_xxx.com", "https://example_yyy.com"])
- answer = None
- assert test == answer
-
-
@pytest.mark.slow
def test_available_genome_pass():
genome = "mm10"