AWSGlueJobHook updates job configuration if it exists #27893
airflow/providers/amazon/aws/hooks/glue.py

@@ -92,10 +92,38 @@ def __init__(
         kwargs["client_type"] = "glue"
         super().__init__(*args, **kwargs)

+    def create_glue_job_config(self) -> dict:
+        if self.s3_bucket is None:
+            raise ValueError("Could not initialize glue job, error: Specify Parameter `s3_bucket`")
+
+        default_command = {
+            "Name": "glueetl",
+            "ScriptLocation": self.script_location,
+        }
+        command = self.create_job_kwargs.pop("Command", default_command)
+
+        s3_log_path = f"s3://{self.s3_bucket}/{self.s3_glue_logs}{self.job_name}"
+        execution_role = self.get_iam_execution_role()
+
+        ret_config = {
+            "Name": self.job_name,
+            "Description": self.desc,
+            "LogUri": s3_log_path,
+            "Role": execution_role["Role"]["Arn"],
Review thread on the "Role": execution_role["Role"]["Arn"] line:

Comment: It is a pretty strong assumption to assume that the user wants the execution role as role, no? Should it not be specified by the user?

Reply: My bad :)
+            "ExecutionProperty": {"MaxConcurrentRuns": self.concurrent_run_limit},
+            "Command": command,
+            "MaxRetries": self.retry_limit,
+            **self.create_job_kwargs,
+        }
+
+        if hasattr(self, "num_of_dpus"):
+            ret_config["MaxCapacity"] = self.num_of_dpus
+
+        return ret_config
+
     def list_jobs(self) -> list:
         """:return: Lists of Jobs"""
-        conn = self.get_conn()
-        return conn.get_jobs()
+        return self.get_conn().get_jobs()

     def get_iam_execution_role(self) -> dict:
         """:return: iam role for job execution"""
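As a reading aid only, not part of the diff: a standalone sketch of the configuration dict that create_glue_job_config assembles, with the hook attributes replaced by plain parameters and made-up values. The parameter names and defaults below are illustrative assumptions, not the hook's actual signature.

# Illustrative sketch of the config-building logic above; all names and
# default values are placeholders, not the provider's real signature.
def sketch_glue_job_config(
    job_name,
    script_location,
    s3_bucket,
    s3_glue_logs,
    role_arn,
    desc="example job",
    concurrent_run_limit=1,
    retry_limit=0,
    num_of_dpus=None,
    create_job_kwargs=None,
):
    create_job_kwargs = dict(create_job_kwargs or {})
    # "Command" can be overridden by the caller; otherwise a glueetl default is used.
    command = create_job_kwargs.pop(
        "Command", {"Name": "glueetl", "ScriptLocation": script_location}
    )
    config = {
        "Name": job_name,
        "Description": desc,
        "LogUri": f"s3://{s3_bucket}/{s3_glue_logs}{job_name}",
        "Role": role_arn,
        "ExecutionProperty": {"MaxConcurrentRuns": concurrent_run_limit},
        "Command": command,
        "MaxRetries": retry_limit,
        **create_job_kwargs,
    }
    if num_of_dpus is not None:
        config["MaxCapacity"] = num_of_dpus
    return config

print(sketch_glue_job_config(
    "demo-job", "s3://bucket/script.py", "bucket", "logs/",
    "arn:aws:iam::123456789012:role/demo",  # made-up ARN
))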
@@ -120,14 +148,12 @@ def initialize_job(
             to run job
         :return:
         """
-        glue_client = self.get_conn()
         script_arguments = script_arguments or {}
         run_kwargs = run_kwargs or {}

         try:
-            job_name = self.get_or_create_glue_job()
-            return glue_client.start_job_run(JobName=job_name, Arguments=script_arguments, **run_kwargs)
-
+            job_name = self.create_or_update_glue_job()
+            return self.get_conn().start_job_run(JobName=job_name, Arguments=script_arguments, **run_kwargs)
         except Exception as general_error:
             self.log.error("Failed to run aws glue job, error: %s", general_error)
             raise
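A hedged usage sketch of the new flow from the caller's side; the constructor arguments shown are assumptions based on the attributes referenced in this diff (job_name, s3_bucket, script_location, create_job_kwargs), not a quoted signature.

# Assumed usage; the exact GlueJobHook constructor arguments may differ.
from airflow.providers.amazon.aws.hooks.glue import GlueJobHook

hook = GlueJobHook(
    job_name="example-etl-job",
    s3_bucket="my-bucket",
    script_location="s3://my-bucket/scripts/etl.py",
    iam_role_name="my-glue-execution-role",  # assumed parameter name
)
# After this PR, initialize_job() creates the Glue job if it is missing,
# or updates its configuration if it already exists, then starts a run.
run = hook.initialize_job(script_arguments={"--ENVIRONMENT": "dev"})
print(run["JobRunId"])  # start_job_run returns the new run's id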
@@ -140,8 +166,7 @@ def get_job_state(self, job_name: str, run_id: str) -> str:
         :param run_id: The job-run ID of the predecessor job run
         :return: State of the Glue job
         """
-        glue_client = self.get_conn()
-        job_run = glue_client.get_job_run(JobName=job_name, RunId=run_id, PredecessorsIncluded=True)
+        job_run = self.get_conn().get_job_run(JobName=job_name, RunId=run_id, PredecessorsIncluded=True)
         return job_run["JobRun"]["JobRunState"]

     def print_job_logs(
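Purely for illustration, a naive polling loop one could build on get_job_state; the provider's own job_completion helper (touched above) already covers this, and the terminal-state set here is an assumption about Glue job-run states.

import time

# Naive polling sketch on top of get_job_state(); the terminal states are assumed.
def wait_for_job_run(hook, job_name, run_id, poll_interval=30):
    terminal_states = {"SUCCEEDED", "FAILED", "STOPPED", "TIMEOUT"}
    while True:
        state = hook.get_job_state(job_name, run_id)
        if state in terminal_states:
            return state
        time.sleep(poll_interval)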
@@ -231,54 +256,53 @@ def job_completion(self, job_name: str, run_id: str, verbose: bool = False) -> d
                 next_token=next_log_token,
             )

-    def get_or_create_glue_job(self) -> str:
+    def has_job(self, job_name) -> bool:
         """
-        Creates(or just returns) and returns the Job name
-        :return:Name of the Job
+        Checks if the job already exists
+
+        :param job_name: unique job name per AWS account
+        :return: Returns True if the job already exists and False if not.
         """
-        glue_client = self.get_conn()
+        self.log.info("Checking if job already exists: %s", job_name)
+
         try:
-            get_job_response = glue_client.get_job(JobName=self.job_name)
-            self.log.info("Job Already exist. Returning Name of the job")
-            return get_job_response["Job"]["Name"]
-
-        except glue_client.exceptions.EntityNotFoundException:
-            self.log.info("Job doesn't exist. Now creating and running AWS Glue Job")
-            if self.s3_bucket is None:
-                raise AirflowException("Could not initialize glue job, error: Specify Parameter `s3_bucket`")
-            s3_log_path = f"s3://{self.s3_bucket}/{self.s3_glue_logs}{self.job_name}"
-            execution_role = self.get_iam_execution_role()
-            try:
-                default_command = {
-                    "Name": "glueetl",
-                    "ScriptLocation": self.script_location,
-                }
-                command = self.create_job_kwargs.pop("Command", default_command)
-
-                if "WorkerType" in self.create_job_kwargs and "NumberOfWorkers" in self.create_job_kwargs:
-                    create_job_response = glue_client.create_job(
-                        Name=self.job_name,
-                        Description=self.desc,
-                        LogUri=s3_log_path,
-                        Role=execution_role["Role"]["Arn"],
-                        ExecutionProperty={"MaxConcurrentRuns": self.concurrent_run_limit},
-                        Command=command,
-                        MaxRetries=self.retry_limit,
-                        **self.create_job_kwargs,
-                    )
-                else:
-                    create_job_response = glue_client.create_job(
-                        Name=self.job_name,
-                        Description=self.desc,
-                        LogUri=s3_log_path,
-                        Role=execution_role["Role"]["Arn"],
-                        ExecutionProperty={"MaxConcurrentRuns": self.concurrent_run_limit},
-                        Command=command,
-                        MaxRetries=self.retry_limit,
-                        MaxCapacity=self.num_of_dpus,
-                        **self.create_job_kwargs,
-                    )
-                return create_job_response["Name"]
-            except Exception as general_error:
-                self.log.error("Failed to create aws glue job, error: %s", general_error)
-                raise
+            self.get_conn().get_job(JobName=job_name)
+            return True
+        except self.get_conn().exceptions.EntityNotFoundException:
+            return False
+
+    def update_job(self, **job_kwargs) -> bool:
+        """
+        Updates job configurations
+
+        :param job_kwargs: Keyword args that define the configurations used for the job
+        :return: True if job was updated and false otherwise
+        """
+        job_name = job_kwargs.pop("Name")
+        current_job = self.get_conn().get_job(JobName=job_name)["Job"]
+
+        update_config = {
+            key: value for key, value in job_kwargs.items() if current_job.get(key) != job_kwargs[key]
+        }
+        if update_config != {}:
+            self.log.info("Updating job: %s", job_name)
+            self.get_conn().update_job(JobName=job_name, JobUpdate=job_kwargs)
+            self.log.info("Updated configurations: %s", update_config)
+            return True
+        else:
+            return False
+
+    def create_or_update_glue_job(self) -> str | None:
+        """
+        Creates(or updates) and returns the Job name
+        :return:Name of the Job
+        """
+        config = self.create_glue_job_config()
+
+        if self.has_job(self.job_name):
+            self.update_job(**config)
+        else:
+            self.log.info("Creating job: %s", self.job_name)
+            self.get_conn().create_job(**config)
+
+        return self.job_name
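To make update_job's change detection concrete, here is a standalone rendering of the same pattern with plain dictionaries and made-up values; no AWS calls are involved. Note that when anything differs, the hook sends the full desired configuration as JobUpdate and logs only the changed keys.

# Standalone sketch of update_job's "only call update when something changed" check.
current_job = {  # stands in for get_job(JobName=...)["Job"]
    "Description": "nightly etl",
    "MaxRetries": 0,
    "ExecutionProperty": {"MaxConcurrentRuns": 1},
}
desired = {  # stands in for the dict built by create_glue_job_config (minus "Name")
    "Description": "nightly etl",
    "MaxRetries": 2,
    "ExecutionProperty": {"MaxConcurrentRuns": 1},
}

changed = {key: value for key, value in desired.items() if current_job.get(key) != value}
if changed:
    # The hook would call update_job(JobName=..., JobUpdate=desired) here
    # and log the changed keys, e.g. {'MaxRetries': 2}.
    print("Updated configurations:", changed)
else:
    print("Nothing to update")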
Review thread on the default Command (the hardcoded "glueetl" name):

Comment: Should this name be provided by the user rather than hardcoded?

Reply: Here it is the default command that is used, but the user can specify another one with the "Command" kwarg: https://github.com/apache/airflow/blob/23d33fe4f424b0302a1637a6005185ea4139c04e/airflow/providers/amazon/aws/hooks/glue.py#L103

Reply: I see, you can ignore this comment then.
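For completeness, a hedged sketch of what the reply describes: overriding the default command by passing a "Command" entry through create_job_kwargs. The constructor arguments are assumptions for illustration; the Command fields follow the boto3 Glue create_job structure.

# Assumed usage: a custom Command passed via create_job_kwargs replaces the
# "glueetl" default shown in the diff.
from airflow.providers.amazon.aws.hooks.glue import GlueJobHook

hook = GlueJobHook(
    job_name="example-streaming-job",
    s3_bucket="my-bucket",
    iam_role_name="my-glue-execution-role",  # assumed parameter name
    create_job_kwargs={
        "Command": {
            "Name": "gluestreaming",  # instead of the default "glueetl"
            "ScriptLocation": "s3://my-bucket/scripts/stream.py",
            "PythonVersion": "3",
        },
        "GlueVersion": "3.0",
    },
)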