diff --git a/src/DAJIN2/core/preprocess/genome_fetcher.py b/src/DAJIN2/core/preprocess/genome_fetcher.py index 90db0a96..0f75ec7f 100644 --- a/src/DAJIN2/core/preprocess/genome_fetcher.py +++ b/src/DAJIN2/core/preprocess/genome_fetcher.py @@ -8,7 +8,7 @@ def fetch_seq_coordinates(genome: str, blat_url: str, seq: str) -> dict: response = urlopen(url).read().decode("utf8").split("\n") matches = [x for x in response if "100.0%" in x] if not matches: - raise ValueError(f"{seq[:10]}... is not found in {genome}") + raise ValueError(f"{seq[:60]}... is not found in {genome}") chrom, strand, start, end, _ = matches[0].split()[-5:] return {"chrom": chrom, "strand": strand, "start": int(start), "end": int(end)} diff --git a/src/DAJIN2/utils/input_validator.py b/src/DAJIN2/utils/input_validator.py index a10aa5c9..df775395 100644 --- a/src/DAJIN2/utils/input_validator.py +++ b/src/DAJIN2/utils/input_validator.py @@ -97,20 +97,22 @@ def exists_cached_genome(genome: str, tempdir: Path, exists_cache_control: bool) ######################################################################## -def is_webpage_available(url: str) -> bool: +def get_html(url: str) -> str: try: - with urlopen(url) as response: + with urlopen(url, timeout=10) as response: html = response.read().decode("utf-8") - if "TITLE" not in html: - return True - title = next((h for h in html.split("\n") if "" in h), "") - return "Error" not in title + return html except URLError: - return False + return "" -def get_first_available_url(urls: list[str]) -> str | None: - return next((url for url in urls if is_webpage_available(url)), None) +def format_url(key: str, url: str) -> str: + return url + "/hg38" if key == "goldenpath" else url + + +def get_first_available_url(key: str, urls: list[str]) -> str | None: + search_keys = {"blat": "BLAT Search Genome", "das": "GRCh38/hg38", "goldenpath": "bigZips"} + return next((url for url in urls if search_keys[key] in get_html(format_url(key, url))), None) def fetch_xml_data(url: str) -> bytes: @@ -150,11 +152,11 @@ def validate_genome_and_fetch_urls(genome: str) -> dict[str, str]: ], "goldenpath": [ "https://hgdownload.cse.ucsc.edu/goldenPath", - "http://hgdownload-euro.soe.ucsc.edu/goldenPath", + "https://hgdownload.soe.ucsc.edu/goldenPath", ], } - available_servers = {key: get_first_available_url(urls) for key, urls in server_lists.items()} + available_servers = {key: get_first_available_url(key, urls) for key, urls in server_lists.items()} error_messages = { "blat": "All UCSC blat servers are currently down. Please wait for a while and try again.", @@ -163,7 +165,7 @@ def validate_genome_and_fetch_urls(genome: str) -> dict[str, str]: } for key, message in error_messages.items(): - if not available_servers[key]: + if available_servers[key] is None: raise URLError(message) if not is_genome_in_ucsc_ids(genome, available_servers["das"]): diff --git a/tests/src/preprocess/test_genome_fetcher.py b/tests/src/preprocess/test_genome_fetcher.py index 0dae046d..1b825e23 100644 --- a/tests/src/preprocess/test_genome_fetcher.py +++ b/tests/src/preprocess/test_genome_fetcher.py @@ -28,10 +28,10 @@ def test_fetch_seq_coodinates_strand_minus(): def test_fetch_seq_coodinates_error(): genome = "mm39" blat_url = "https://genome.ucsc.edu/cgi-bin/hgBlat" - seq = "XXXXXXXXXXXXXXXXX" + seq = "X" * 100 with pytest.raises(ValueError) as e: genome_fetcher.fetch_seq_coordinates(genome, blat_url, seq) - assert str(e.value) == f"{seq[:10]}... is not found in {genome}" + assert str(e.value) == f"{seq[:60]}... is not found in {genome}" @pytest.mark.slow diff --git a/tests/src/utils/test_validator.py b/tests/src/utils/test_validator.py index 0f8b7717..7c46003e 100644 --- a/tests/src/utils/test_validator.py +++ b/tests/src/utils/test_validator.py @@ -79,31 +79,6 @@ def test_fasta_without_error(): # validate URL ############################################################################### - -@pytest.mark.slow -def test_available_url_pass(): - assert input_validator.is_webpage_available("https://example.com") is True - - -@pytest.mark.slow -def test_available_url_fail(): - assert input_validator.is_webpage_available("https://example_xxx.com") is False - - -@pytest.mark.slow -def test_get_first_available_url(): - test = input_validator.get_first_available_url(["https://example_xxx.com", "https://example.com"]) - answer = "https://example.com" - assert test == answer - - -@pytest.mark.slow -def test_get_first_available_url_not_found(): - test = input_validator.get_first_available_url(["https://example_xxx.com", "https://example_yyy.com"]) - answer = None - assert test == answer - - @pytest.mark.slow def test_available_genome_pass(): genome = "mm10"