Commit
Modifications to run_job.sh and run_all_jobs.sh and SpanSource to allow us to run jobs remotely. Author: @jerryli9876. Fixes #79. URL: #79
1 parent b03bb86, commit ad22592
Showing 6 changed files with 226 additions and 149 deletions.
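Both scripts are driven entirely from the command line. As a hypothetical illustration (the dates, output paths, and config name below are invented, and the all-jobs script name is inferred from the commit message), invocations matching the option parsers in the diffs would look like:

ruby run_all_jobs.rb -d 2012-01-01T00:00,2012-01-02T00:00 -o /user/zipkin/reports -c my_hadoop_config
ruby run_job.rb -j WorstRuntimes -o /user/zipkin/reports/WorstRuntimes -d 2012-01-02T00:00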
@@ -0,0 +1,97 @@
#!/usr/bin/env ruby
# Runs all scalding jobs in package com.twitter.zipkin.hadoop

require 'optparse'
require 'ostruct'
require 'pp'
require 'date'

class OptparseAllJobArguments

  #
  # Return a structure describing the options.
  #
  def self.parse(args)
    # The options specified on the command line will be collected in *options*.
    # We set default values here.
    options = OpenStruct.new
    options.uses_hadoop_config = false
    options.dates = []
    options.output = ""

    opts = OptionParser.new do |opts|
      opts.banner = "Usage: run_all_jobs.rb -d DATE -o OUTPUT -c CONFIG"

      opts.separator ""
      opts.separator "Specific options:"

      opts.on("-d", "--date STARTDATE,ENDDATE", Array,
              "The DATES to run the jobs over. Expected date format is %Y-%m-%dT%H:%M") do |list|
        options.dates = list
      end

      opts.on("-o", "--output OUTPUT",
              "The OUTPUT file to write to") do |output|
        options.output = output
      end

      opts.on("-c", "--config [CONFIG]", "Optional hadoop configurations for the job.") do |config|
        options.uses_hadoop_config = true
        options.hadoop_config = config || ''
      end

      opts.separator ""
      opts.separator "Common options:"
      opts.on_tail("-h", "--help", "Show this message") do
        puts opts
        exit
      end
    end
    opts.parse!(args)
    options
  end
end

options = OptparseAllJobArguments.parse(ARGV)

date_cmd = options.dates.join(" ")
end_date_cmd = options.dates.length > 1 ? options.dates.at(1) : options.dates.at(0)
config_string = options.uses_hadoop_config ? " --config " + options.hadoop_config : ""
run_job_cmd = "ruby " + File.dirname(__FILE__) + "/run_job.rb" + config_string

puts "Run_job_command = " + run_job_cmd
puts "Date = " + date_cmd

# Runs the prep command, then all of the given job commands in parallel.
# Blocks until prep finishes, then returns the still-running job threads.
def run_jobs_in_parallel(prep, jobs)
  threads = []
  prep_thread = Thread.new(prep) { |cmd| system(cmd) }
  jobs.each do |job|
    threads << Thread.new(job) { |cmd| system(cmd) }
  end
  prep_thread.join
  threads
end

# Run the commands
system(run_job_cmd + " -j Preprocessed -p -d " + date_cmd)

jobs_set_1 = [ run_job_cmd + " -j WorstRuntimes -o " + options.output + "/WorstRuntimes -d " + end_date_cmd,
               run_job_cmd + " -j MemcacheRequest -o " + options.output + "/MemcacheRequest -d " + end_date_cmd ]

jobs_set_2 = [ run_job_cmd + " -j PopularKeys -o " + options.output + "/PopularKeys -d " + end_date_cmd,
               run_job_cmd + " -j PopularAnnotations -o " + options.output + "/PopularAnnotations -d " + end_date_cmd
               # run_job_cmd + " -j WhaleReport -o " + options.output + "/WhaleReport -d " + end_date_cmd,
             ]

jobs_set_3 = [ run_job_cmd + " -j DependencyTree -o " + options.output + "/DependencyTree -d " + end_date_cmd,
               run_job_cmd + " -j Timeouts -s \" --error_type finagle.timeout \" -o " + options.output + "/Timeouts -d " + end_date_cmd,
               run_job_cmd + " -j Timeouts -s \" --error_type finagle.retry \" -o " + options.output + "/Retries -d " + end_date_cmd ]

jobs = run_jobs_in_parallel(run_job_cmd + " -j FindNames -p -d " + end_date_cmd, jobs_set_1)
jobs += run_jobs_in_parallel(run_job_cmd + " -j FindIDtoName -p -d " + end_date_cmd, jobs_set_2)
jobs += run_jobs_in_parallel("", jobs_set_3)

jobs.each { |thread| thread.join }

puts "All jobs finished!"
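The scheduling above is a small fork-join pipeline: each call to run_jobs_in_parallel launches a prerequisite job plus a batch of independent jobs, blocks only on the prerequisite, and hands the still-running batch back to the caller, which joins everything at the end. A minimal standalone sketch of the same pattern, with echo commands standing in for the real run_job.rb invocations:

#!/usr/bin/env ruby
# Minimal sketch of the fork-join scheduling used above; the echo commands
# stand in for the real "ruby run_job.rb ..." command lines.

def run_jobs_in_parallel(prep, jobs)
  prep_thread = Thread.new { system(prep) }
  threads = jobs.map { |job| Thread.new { system(job) } }
  prep_thread.join   # block only on the prerequisite job...
  threads            # ...and return the batch, still running
end

jobs  = run_jobs_in_parallel("echo prep1", ["echo job1", "echo job2"])
jobs += run_jobs_in_parallel("echo prep2", ["echo job3"])
jobs.each(&:join)    # wait for every batch before declaring success
puts "All jobs finished!"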
@@ -0,0 +1,125 @@
#!/usr/bin/env ruby
# Script that runs a single scalding job

require 'optparse'
require 'ostruct'
require 'pp'
require 'date'

$HOST = "my.remote.host"

class OptparseJobArguments

  #
  # Return a structure describing the options.
  #
  def self.parse(args)
    # The options specified on the command line will be collected in *options*.
    # We set default values here.
    options = OpenStruct.new
    options.job = nil
    options.uses_settings = false
    options.uses_hadoop_config = false
    options.dates = []
    options.output = ""
    options.preprocessor = false

    opts = OptionParser.new do |opts|
      opts.banner = "Usage: run_job.rb -j JOB -d DATE -o OUTPUT -p -s SETTINGS -c CONFIG"

      opts.separator ""
      opts.separator "Specific options:"

      opts.on("-j", "--job JOBNAME",
              "The JOBNAME to run") do |job|
        options.job = job
      end

      opts.on("-d", "--date DATES", Array,
              "The DATES to run the job over. Expected format is %Y-%m-%dT%H:%M") do |dates|
        options.dates = dates.map { |date| DateTime.strptime(date, '%Y-%m-%dT%H:%M') }
      end

      opts.on("-o", "--output OUTPUT",
              "The OUTPUT file to write to") do |output|
        options.output = output
      end

      opts.on("-p", "--[no-]prep", "Run as preprocessor") do |v|
        options.preprocessor = v
      end

      opts.on("-s", "--settings [SETTINGS]", "Optional settings for the job") do |settings|
        options.uses_settings = true
        options.settings = settings || ''
      end

      opts.on("-c", "--config [CONFIG]", "Optional hadoop configurations for the job.") do |config|
        options.uses_hadoop_config = true
        options.hadoop_config = config || ''
      end

      opts.separator ""
      opts.separator "Common options:"
      opts.on_tail("-h", "--help", "Show this message") do
        puts opts
        exit
      end
    end
    opts.parse!(args)
    options
  end
end

options = OptparseJobArguments.parse(ARGV)
start_date = options.dates.at(0)
end_date = options.dates.length > 1 ? options.dates.at(1) : options.dates.at(0)

# Maps a time to the HDFS directory for that hour, e.g. prefix + "2012/03/07/09".
def time_to_remote_file(time, prefix)
  prefix + time.year.to_s + "/" + append_zero(time.month) + "/" + append_zero(time.day) + "/" + append_zero(time.hour)
end

# Zero-pads a single-digit number to two digits, e.g. 7 -> "07".
def append_zero(x)
  if 0 <= x && x <= 9
    "0" + x.to_s
  else
    x.to_s
  end
end

# TODO: hacky; find a more robust way to detect whether this machine can reach HDFS directly
def is_hadoop_local_machine?()
  system("hadoop dfs -test -e .")
end

# Checks (locally, or over ssh to $HOST) whether the given HDFS path exists.
def remote_file_exists?(pathname, options)
  cmd = is_hadoop_local_machine?() ? "" : "ssh -C " + $HOST + " "
  cmd += "hadoop"
  cmd += options.uses_hadoop_config ? " --config " + options.hadoop_config : ""
  cmd += " dfs -test -e " + pathname
  result = system(cmd)
  puts "In run_job, remote_file_exists for " + pathname + ": " + result.to_s
  result
end

# Truncates an ISO 8601 date string to drop the timezone offset,
# e.g. "2012-03-07T09:05:00+00:00" -> "2012-03-07T09:05:00".
def date_to_cmd(date)
  date.to_s[0..18]
end

cmd_head = File.dirname(__FILE__) + "/scald.rb --hdfs com.twitter.zipkin.hadoop."
settings_string = options.uses_settings ? " " + options.settings : ""
cmd_date = date_to_cmd(start_date) + " " + date_to_cmd(end_date)
cmd_args = options.job + settings_string + " --date " + cmd_date + " --tz UTC"

# Skip the job if its _SUCCESS marker already exists; otherwise kick off scald.rb.
if options.preprocessor
  if not remote_file_exists?(time_to_remote_file(end_date, options.job + "/") + "/_SUCCESS", options)
    cmd = cmd_head + "sources." + cmd_args
    system(cmd)
  end
else
  if not remote_file_exists?(options.output + "/_SUCCESS", options)
    cmd = cmd_head + cmd_args + " --output " + options.output
    system(cmd)
  end
end
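As an aside, the hand-rolled zero padding in time_to_remote_file and append_zero matches what a single strftime format string produces; a standalone check (the "Preprocessed/" prefix is only an illustrative value, not necessarily one the script uses):

require 'date'

# Standalone check: strftime's %m/%d/%H are already zero-padded, so one
# format string reproduces the hand-built hourly path.
t = DateTime.strptime("2012-03-07T09:05", '%Y-%m-%dT%H:%M')
by_hand      = "Preprocessed/" + t.year.to_s + "/" + format("%02d", t.month) + "/" + format("%02d", t.day) + "/" + format("%02d", t.hour)
via_strftime = t.strftime("Preprocessed/%Y/%m/%d/%H")
puts by_hand == via_strftime   # => true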