diff --git a/datashuttle/utils/data_transfer.py b/datashuttle/utils/data_transfer.py index 1350feceb..12e860daa 100644 --- a/datashuttle/utils/data_transfer.py +++ b/datashuttle/utils/data_transfer.py @@ -83,7 +83,6 @@ def build_a_list_of_all_files_and_folders_to_transfer(self) -> List[str]: self.update_list_with_non_ses_sub_level_folders( extra_folder_names, extra_filenames, sub ) - continue # Datatype (sub and ses level) -------------------------------- diff --git a/tests/ssh_test_utils.py b/tests/ssh_test_utils.py index 31aba6942..0e6b5d067 100644 --- a/tests/ssh_test_utils.py +++ b/tests/ssh_test_utils.py @@ -1,5 +1,8 @@ import builtins import copy +import stat + +import paramiko from datashuttle.utils import rclone, ssh @@ -57,7 +60,7 @@ def setup_hostkeys(project): restore_mock_input(orig_builtin) orig_getpass = copy.deepcopy(ssh.getpass.getpass) - ssh.getpass.getpass = lambda _: "password" + ssh.getpass.getpass = lambda _: "password"  # type: ignore ssh.setup_ssh_key(project.cfg, log=False) ssh.getpass.getpass = orig_getpass @@ -81,3 +84,37 @@ def build_docker_image(project): central_host_id="localhost", central_host_username="sshuser", ) + + +def sftp_recursive_file_search(sftp, path_, all_filenames): + try: + sftp.stat(path_) + except FileNotFoundError: + return + + for file_or_folder in sftp.listdir_attr(path_): + if stat.S_ISDIR(file_or_folder.st_mode): + sftp_recursive_file_search( + sftp, + path_ + "/" + file_or_folder.filename, + all_filenames, + ) + else: + all_filenames.append(path_ + "/" + file_or_folder.filename) + + +def recursive_search_central(project): + """Return a list of all file paths found under <central_path>/rawdata on central, searched recursively over SFTP.""" + with paramiko.SSHClient() as client: + ssh.connect_client(client, project.cfg) + + sftp = client.open_sftp() + + all_filenames = [] + + sftp_recursive_file_search( + sftp, + (project.cfg["central_path"] / "rawdata").as_posix(), + all_filenames, + ) + return all_filenames diff --git a/tests/tests_integration/test_ssh_file_transfer.py b/tests/tests_integration/test_ssh_file_transfer.py 
index bc8a10c07..c1b4645f3 100644 --- a/tests/tests_integration/test_ssh_file_transfer.py +++ b/tests/tests_integration/test_ssh_file_transfer.py @@ -2,7 +2,6 @@ """ import copy import shutil -import stat from pathlib import Path import pandas as pd @@ -18,61 +17,41 @@ TEST_SSH = True # TODO: base on whether docker / singularity is installed. + +PARAM_SUBS = [ + ["all"], + ["all_sub"], + ["all_non_sub"], + ["sub-001"], + ["sub-003_date-20231901"], + ["sub-002", "all_non_sub"], +] +PARAM_SES = [ + ["all"], + ["all_non_ses"], + ["all_ses"], + ["ses-001"], + ["ses-002_random-key"], + ["all_non_ses", "ses-001"], +] +PARAM_DATATYPE = [ + ["all"], + ["all_ses_level_non_datatype"], + ["all_datatype"], + ["behav"], + ["ephys"], + ["histology"], + ["funcimg"], + ["histology", "behav", "all_ses_level_non_datatype"], +] + + class TestFileTransfer: @pytest.fixture( scope="class", - params=[ - # False, - pytest.param( - True, - marks=pytest.mark.skipif( - TEST_SSH is False, reason="TEST_SSH is set to False." - ), - ), - ], ) - def project_and_test_information(self, request, tmpdir_factory): - """ - Create a project for SSH testing. Setup - the project as normal, and switch configs - to use SSH connection. - - Although SSH is used for transfer, for SSH tests, - checking the created filepaths is always - done through the local filesystem for speed - and convenience. As such, the drive that is - SSH to must also be mounted and the path - supplied to the location SSH'd to. - - For speed, create the project once, - and all files to transfer. Then in the - test function, the folder are transferred. - Partial cleanup is done in the test function - i.e. deleting the central_path to which the - items have been transferred. This is achieved - by using "class" scope. - - NOTES - ----- - - Pytest params - The `params` key sets the - `params` attribute on the pytest `request` fixture. - This attribute is used to set the `testing_ssh` variable - to `True` or `False`. 
In the first run, this is set to - `False`, meaning local filesystem tests are run. In the - second run, this is set with a pytest parameter that is - `True` (i.e. SSH tests are run) but is skipped if `TEST_SSH` - in `ssh_config` (set in conftest.py` is `False`. - - - For convenience, files are transferred - with SSH and then checked through the local filesystem - mount. This is significantly easier than checking - everything through SFTP. However, on Windows the - mounted filesystem is quite slow to update, taking - a few seconds after SSH transfer. This makes the - tests run very slowly. We can get rid - of this limitation on linux. - """ - testing_ssh = request.param + def pathtable_and_project(self, tmpdir_factory): + """ """ tmp_path = tmpdir_factory.mktemp("test") base_path = tmp_path / "test with space" @@ -82,17 +61,25 @@ def project_and_test_information(self, request, tmpdir_factory): base_path, test_project_name ) - if testing_ssh: - ssh_test_utils.build_docker_image(project) - ssh_test_utils.setup_hostkeys(project) - pathtable = get_pathtable(project.cfg["local_path"]) self.create_all_pathtable_files(pathtable) - yield [pathtable, project, testing_ssh] + yield [pathtable, project] test_utils.teardown_project(cwd, project) + @pytest.fixture( + scope="class", + ) + def ssh_setup(self, pathtable_and_project): + pathtable, project = pathtable_and_project + ssh_test_utils.build_docker_image(project) + ssh_test_utils.setup_hostkeys(project) + + project.upload_all() + + return [pathtable, project] + # ------------------------------------------------------------------------- # Utils # ------------------------------------------------------------------------- @@ -104,188 +91,168 @@ def central_from_local(self, path_): # Test File Transfer - All Options # ------------------------------------------------------------------------- - @pytest.mark.parametrize( - "sub_names", - [ - ["all"], - ["all_sub"], - ["all_non_sub"], - ["sub-001"], - ["sub-003_date-20231901"], - 
["sub-002", "all_non_sub"], - ], - ) - @pytest.mark.parametrize( - "ses_names", - [ - ["all"], - ["all_non_ses"], - ["all_ses"], - ["ses-001"], - ["ses-002_random-key"], - ["all_non_ses", "ses-001"], - ], - ) - @pytest.mark.parametrize( - "datatype", - [ - ["all"], - ["all_ses_level_non_datatype"], - ["all_datatype"], - ["behav"], - ["ephys"], - ["histology"], - ["funcimg"], - ["histology", "behav", "all_ses_level_non_datatype"], - ], - ) -# @pytest.mark.parametrize("upload_or_download", ["upload", "download"]) - def test_all_data_transfer_options( + @pytest.mark.parametrize("sub_names", PARAM_SUBS) + @pytest.mark.parametrize("ses_names", PARAM_SES) + @pytest.mark.parametrize("datatype", PARAM_DATATYPE) + @pytest.mark.parametrize("upload_or_download", ["upload", "download"]) + def test_combinations_filesystem_transfer( self, - project_and_test_information, + pathtable_and_project, sub_names, ses_names, datatype, -# upload_or_download, + upload_or_download, ): - """ - Parse the arguments to filter the pathtable, getting - the files expected to be transferred passed on the arguments - Note files in sub/ses/datatype folders must be handled - separately to those in non-sub, non-ses, non-datatype folders - - see test_utils.swap_local_and_central_paths() for the logic - on setting up and swapping local / central paths for - upload / download tests. 
- """ - pathtable, project, testing_ssh = project_and_test_information - - # transfer_function = test_utils.handle_upload_or_download( - # project, - # upload_or_download, - # swap_last_folder_only=testing_ssh, - # )[0] - - project.upload(sub_names, ses_names, datatype, init_log=False) - # transfer_function(sub_names, ses_names, datatype, init_log=False) - - # if upload_or_download == "download": - # test_utils.swap_local_and_central_paths( - # project, swap_last_folder_only=testing_ssh - # ) - - parsed_sub_names = self.parse_arguments(pathtable, sub_names, "sub") - parsed_ses_names = self.parse_arguments(pathtable, ses_names, "ses") - parsed_datatype = self.parse_arguments(pathtable, datatype, "datatype") + """ """ + pathtable, project = pathtable_and_project - # Filter pathtable to get files that were expected to be transferred - ( - sub_ses_dtype_arguments, - extra_arguments, - ) = self.make_pathtable_search_filter(parsed_sub_names, parsed_ses_names, parsed_datatype) + transfer_function = test_utils.handle_upload_or_download( + project, + upload_or_download, + swap_last_folder_only=False, + )[0] - datatype_folders = self.query_table(pathtable, sub_ses_dtype_arguments) - extra_folders = self.query_table(pathtable, extra_arguments) + transfer_function(sub_names, ses_names, datatype, init_log=False) - expected_paths = pd.concat([datatype_folders, extra_folders]) - expected_paths = expected_paths.drop_duplicates(subset="path") + if upload_or_download == "download": + test_utils.swap_local_and_central_paths( + project, swap_last_folder_only=False + ) - central_base_paths = expected_paths.base_folder.map( - lambda x: str(x).replace("local", "central") + expected_transferred_paths = self.get_expected_transferred_paths( + pathtable, sub_names, ses_names, datatype ) - expected_transferred_paths = central_base_paths / expected_paths.path - - # When transferring with SSH, there is a delay before - # filesystem catches up - # if testing_ssh: - # time.sleep(0.5) # Check what 
paths were actually moved # (through the local filesystem), and test - def sftp_recursive_search(sftp, path_, all_filenames): - try: - sftp.stat(path_) - except FileNotFoundError: - return - - for file_or_folder in sftp.listdir_attr(path_): - if stat.S_ISDIR(file_or_folder.st_mode): - sftp_recursive_search( - sftp, - path_ + "/" + file_or_folder.filename, - all_filenames, - ) - else: - all_filenames.append(path_ + "/" + file_or_folder.filename) + path_to_search = ( + self.central_from_local(project.cfg["local_path"]) / "rawdata" + ) + all_transferred = path_to_search.glob("**/*") + paths_to_transferred_files = list( + filter(Path.is_file, all_transferred) + ) - with paramiko.SSHClient() as client: - ssh.connect_client(client, project.cfg) + assert sorted(paths_to_transferred_files) == sorted( + expected_transferred_paths + ) + + # Teardown here, because we have session scope. + try: + shutil.rmtree(self.central_from_local(project.cfg["local_path"])) + except FileNotFoundError: + pass + + @pytest.mark.parametrize("sub_names", PARAM_SUBS) + @pytest.mark.parametrize("ses_names", PARAM_SES) + @pytest.mark.parametrize("datatype", PARAM_DATATYPE) + def test_combinations_ssh_transfer( + self, + ssh_setup, + sub_names, + ses_names, + datatype, + ): + """Test all sub/ses/datatype transfer combinations over an SSH connection.""" + pathtable, project = ssh_setup - sftp = client.open_sftp() + true_central_path = project.cfg["central_path"] + tmp_central_path = project.cfg["central_path"] / "tmp" + project.update_config("central_path", tmp_central_path) - all_filenames = [] + # upload into the temporary central path + project.upload(sub_names, ses_names, datatype, init_log=False) - sftp_recursive_search( - sftp, - (project.cfg["central_path"] / "rawdata").as_posix(), - all_filenames, - ) + expected_transferred_paths = self.get_expected_transferred_paths( + pathtable, sub_names, ses_names, datatype + ) - paths_to_transferred_files = [] - for path_ in all_filenames: - parts = Path(path_).parts - paths_to_transferred_files.append( - Path(*parts[parts.index("rawdata") :]) - ) + 
transferred_files = ssh_test_utils.recursive_search_central(project) - expected_transferred_paths_ = [] - for path_ in expected_transferred_paths: - parts = Path(path_).parts - expected_transferred_paths_.append( - Path(*parts[parts.index("rawdata") :]) - ) + paths_to_transferred_files = self.remove_path_before_rawdata( + transferred_files + ) - assert sorted(paths_to_transferred_files) == sorted( - expected_transferred_paths_ - ) + expected_transferred_paths_ = self.remove_path_before_rawdata( + expected_transferred_paths + ) - project.upload_all() - shutil.rmtree(project.cfg["local_path"] / "rawdata") # TOOD: var + assert sorted(paths_to_transferred_files) == sorted( + expected_transferred_paths_ + ) - breakpoint() + with paramiko.SSHClient() as client: + ssh.connect_client(client, project.cfg) + client.exec_command( + f"rm -rf {(tmp_central_path).as_posix()}" + ) # TODO: own function as need to do on teardown) true_local_path = project.cfg["local_path"] - tmp_local_path = project.cfg["local_path"] / "tmp_local" - tmp_local_path.mkdirs() + tmp_local_path = project.cfg["local_path"] / "tmp" + tmp_local_path.mkdir() project.update_config("local_path", tmp_local_path) + project.update_config("central_path", true_central_path) - project.download(sub_names, ses_names, datatype, init_log=False) # TODO: why is this connecting so many times? + project.download( + sub_names, ses_names, datatype, init_log=False + ) # TODO: why is this connecting so many times? 
[during search - make issue] - all_transferred = list((project.cfg["local_path"] / "rawdata").glob("**/*")) - all_transferred = [path_ for path_ in all_transferred if path_.is_file()] - - paths_to_transferred_files = [] - for path_ in all_transferred: # TODO: rename all filenames - parts = Path(path_).parts - paths_to_transferred_files.append( - Path(*parts[parts.index("rawdata"):]) - ) + all_transferred = list((tmp_local_path / "rawdata").glob("**/*")) + all_transferred = [ + path_ for path_ in all_transferred if path_.is_file() + ] - assert sorted(paths_to_transferred_files) == sorted(expected_transferred_paths_) + paths_to_transferred_files = self.remove_path_before_rawdata( + all_transferred + ) - shutil.rmtree(project.cfg["local_path"]) # TOOD: var + assert sorted(paths_to_transferred_files) == sorted( + expected_transferred_paths_ + ) + shutil.rmtree(tmp_local_path) project.update_config("local_path", true_local_path) - with paramiko.SSHClient() as client: - ssh.connect_client(client, project.cfg) - - client.exec_command(f"rm -rf {(project.cfg['central_path'] / 'rawdata').as_posix()}") # TODO: own function as need to do on teardown) - # --------------------------------------------------------------------------------------------------------------- # Utils # --------------------------------------------------------------------------------------------------------------- + def get_expected_transferred_paths( + self, pathtable, sub_names, ses_names, datatype + ): + """""" + parsed_sub_names = self.parse_arguments(pathtable, sub_names, "sub") + parsed_ses_names = self.parse_arguments(pathtable, ses_names, "ses") + parsed_datatype = self.parse_arguments(pathtable, datatype, "datatype") + + # Filter pathtable to get files that were expected to be transferred + ( + sub_ses_dtype_arguments, + extra_arguments, + ) = self.make_pathtable_search_filter( + parsed_sub_names, parsed_ses_names, parsed_datatype + ) + + datatype_folders = self.query_table(pathtable, 
sub_ses_dtype_arguments) + extra_folders = self.query_table(pathtable, extra_arguments) + + expected_paths = pd.concat([datatype_folders, extra_folders]) + expected_paths = expected_paths.drop_duplicates(subset="path") + + central_base_paths = expected_paths.base_folder.map( + lambda x: str(x).replace("local", "central") + ) + expected_transferred_paths = central_base_paths / expected_paths.path + + return expected_transferred_paths + + def remove_path_before_rawdata(self, list_of_paths): + cut_paths = [] + for path_ in list_of_paths: # TODO: rename all filenames + parts = Path(path_).parts + cut_paths.append(Path(*parts[parts.index("rawdata") :])) + return cut_paths + def query_table(self, pathtable, arguments): """ Search the table for arguments, return empty