-
Notifications
You must be signed in to change notification settings - Fork 40
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
✨ Add
Sftp
source and Prefect tasks (#1039)
* ✨ added sftp source file. * ✨ added sftp task file. * ✨ added salesforce flow file. * ✨ added integration test file. * ✨ added unit test file. * 📌 added paramiko package in pyproject. * 📝 updated docstrings. * ✅ updated unit test file. * 📝 updated commented code. * sftplist bug related to search patterns, solved and columns packed issue solved * 🐛 fixed a raise error bug. * ⚡️ removed unusded function. * ✅ updated integration test folder. * 📝 updated comments. * 📝 added feedback. * 📝 updated comments. * 🚧 Modified `rsa_key` description * 🔥 Removed `credentials` param * ♻️ Changed function name from `_get_file_object_file` to `_get_file_object` * ♻️ Changed function name from `_list_directory` to `_ls` * ♻️ Changed the way of handling file listing and recursive option * ✅ Cleaned up tests for SFTP * 🐛 Changed imported class name from `SftpCredentials` to `Sftp` * ♻️ Adjusted `_ls` function * ✅ Modified unit tests * ⚡️ Removed `time.sleep()` from the `sftp` task * 🐛 Added missing comma * 🚧 Added requirements * ✅ Updated tests and _ls function * 🔥 Removed integration tests for SFTP * 🎨 Formatted the code * 🎨 Added `allowlist-secret` * Update tests/unit/test_sftp.py Co-authored-by: Michał Zawadzki <[email protected]> * 🎨 Moved `pytest-mock` to dev-dependencies * removed code * 🐛 Updated `dummy_rsa_key` value * ✅ Added tests to SFTP source * 🎨 Removed extra lines * 🚧Added `noqa` * 🚧 Added `pragma: allowlist secret` * 🔥 Removed `noqa: RUF100, S608` --------- Co-authored-by: fdelgadodyvenia <[email protected]> Co-authored-by: rziemianek <[email protected]> Co-authored-by: Rafał Ziemianek <[email protected]> Co-authored-by: Michał Zawadzki <[email protected]>
- Loading branch information
1 parent
eb29241
commit 3de17df
Showing
10 changed files
with
680 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
"""Download data from a SFTP server to Azure Data Lake Storage.""" | ||
|
||
from prefect import flow | ||
from prefect.task_runners import ConcurrentTaskRunner | ||
|
||
from viadot.orchestration.prefect.tasks import df_to_adls, sftp_to_df | ||
|
||
|
||
@flow(
    name="SFTP extraction to ADLS",
    description=(
        "Extract data from a SFTP server and "
        "load it into Azure Data Lake Storage."
    ),
    retries=1,
    retry_delay_seconds=60,
    task_runner=ConcurrentTaskRunner,
)
def sftp_to_adls(
    config_key: str | None = None,
    azure_key_vault_secret: str | None = None,
    file_name: str | None = None,
    sep: str = "\t",
    columns: list[str] | None = None,
    adls_config_key: str | None = None,
    adls_azure_key_vault_secret: str | None = None,
    adls_path: str | None = None,
    adls_path_overwrite: bool = False,
) -> None:
    r"""Flow to download data from a SFTP server to Azure Data Lake.

    The flow runs two tasks in sequence: `sftp_to_df` reads the remote file into
    a data frame, and `df_to_adls` uploads that data frame to `adls_path`.

    Args:
        config_key (str, optional): The key in the viadot config holding relevant
            credentials. Defaults to None.
        azure_key_vault_secret (str, optional): The name of the Azure Key Vault
            secret where credentials are stored. Defaults to None.
        file_name (str, optional): Path to the file in SFTP server.
            Defaults to None.
        sep (str, optional): The separator to use to read the CSV file.
            Defaults to "\t".
        columns (List[str], optional): Columns to read from the file.
            Defaults to None.
        adls_config_key (str, optional): The key in the viadot config holding
            relevant credentials. Defaults to None.
        adls_azure_key_vault_secret (str, optional): The name of the Azure Key
            Vault secret containing a dictionary with ACCOUNT_NAME and Service
            Principal credentials (TENANT_ID, CLIENT_ID, CLIENT_SECRET) for the
            Azure Data Lake. Defaults to None.
        adls_path (str, optional): Azure Data Lake destination file path
            (with file name). Defaults to None.
        adls_path_overwrite (bool, optional): Whether to overwrite the file in
            ADLS. Defaults to False.
    """
    # Step 1: pull the file from the SFTP server into memory.
    data_frame = sftp_to_df(
        config_key=config_key,
        azure_key_vault_secret=azure_key_vault_secret,
        file_name=file_name,
        sep=sep,
        columns=columns,
    )

    # Step 2: upload the data frame; the task's result is passed through as-is.
    return df_to_adls(
        df=data_frame,
        path=adls_path,
        credentials_secret=adls_azure_key_vault_secret,
        config_key=adls_config_key,
        overwrite=adls_path_overwrite,
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
"""Tasks from SFTP API.""" | ||
|
||
import pandas as pd | ||
from prefect import task | ||
|
||
from viadot.orchestration.prefect.exceptions import MissingSourceCredentialsError | ||
from viadot.orchestration.prefect.utils import get_credentials | ||
from viadot.sources import Sftp | ||
|
||
|
||
@task(retries=3, log_prints=True, retry_delay_seconds=10, timeout_seconds=60 * 60)
def sftp_to_df(
    config_key: str | None = None,
    azure_key_vault_secret: str | None = None,
    file_name: str | None = None,
    sep: str = "\t",
    columns: list[str] | None = None,
) -> pd.DataFrame:
    r"""Query the SFTP server and return the file contents as a data frame.

    Args:
        config_key (str, optional): The key in the viadot config holding relevant
            credentials. Defaults to None.
        azure_key_vault_secret (str, optional): The name of the Azure Key Vault
            secret where credentials are stored. Defaults to None.
        file_name (str, optional): Path to the file in SFTP server.
            Defaults to None.
        sep (str, optional): The separator to use to read the CSV file.
            Defaults to "\t".
        columns (List[str], optional): Columns to read from the file.
            Defaults to None.

    Returns:
        pd.DataFrame: The response data as a pandas DataFrame.

    Raises:
        MissingSourceCredentialsError: If neither `config_key` nor
            `azure_key_vault_secret` is provided.
    """
    if not (azure_key_vault_secret or config_key):
        raise MissingSourceCredentialsError

    # `credentials` must be bound on every path. Previously it was assigned only
    # when `config_key` was missing, so passing `config_key` raised
    # UnboundLocalError below. When `config_key` is given, the source is expected
    # to resolve credentials from the viadot config itself.
    # NOTE(review): assumes `Sftp` accepts `credentials=None` - confirm.
    credentials = None if config_key else get_credentials(azure_key_vault_secret)

    sftp = Sftp(
        credentials=credentials,
        config_key=config_key,
    )
    sftp.get_connection()

    return sftp.to_df(file_name=file_name, sep=sep, columns=columns)
|
||
|
||
@task(retries=3, log_prints=True, retry_delay_seconds=10, timeout_seconds=60 * 60)
def sftp_list(
    config_key: str | None = None,
    azure_key_vault_secret: str | None = None,
    path: str | None = None,
    recursive: bool = False,
    matching_path: str | None = None,
) -> list[str]:
    """List files in the SFTP server.

    Args:
        config_key (str, optional): The key in the viadot config holding relevant
            credentials. Defaults to None.
        azure_key_vault_secret (str, optional): The name of the Azure Key Vault
            secret where credentials are stored. Defaults to None.
        path (str, optional): Destination path from where to get the structure.
            Defaults to None.
        recursive (bool, optional): Get the structure in deeper folders.
            Defaults to False.
        matching_path (str, optional): Filtering folders to return by a regex
            pattern. Defaults to None.

    Returns:
        files_list (list[str]): List of files in the SFTP server.

    Raises:
        MissingSourceCredentialsError: If neither `config_key` nor
            `azure_key_vault_secret` is provided.
    """
    if not (azure_key_vault_secret or config_key):
        raise MissingSourceCredentialsError

    # `credentials` must be bound on every path. Previously it was assigned only
    # when `config_key` was missing, so passing `config_key` raised
    # UnboundLocalError below. When `config_key` is given, the source is expected
    # to resolve credentials from the viadot config itself.
    # NOTE(review): assumes `Sftp` accepts `credentials=None` - confirm.
    credentials = None if config_key else get_credentials(azure_key_vault_secret)

    sftp = Sftp(
        credentials=credentials,
        config_key=config_key,
    )
    sftp.get_connection()

    return sftp.get_files_list(
        path=path, recursive=recursive, matching_path=matching_path
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.