diff --git a/agent/bench-scripts/pbench_fio b/agent/bench-scripts/pbench_fio index c8e81cd4d1..16e8e6bb79 100755 --- a/agent/bench-scripts/pbench_fio +++ b/agent/bench-scripts/pbench_fio @@ -25,26 +25,32 @@ ver=2.2.5 orig_cmd="$*" # Defaults +keep_failed_tool_data="y" +tar_nonref_data="y" +postprocess_only=n +nr_samples=5 +maxstddevpct=5 # maximum allowable standard deviation in percent +max_failures=6 # after N failed attempts to hit below $maxstddevpct, move on to the nest test supported_test_types="read,write,rw,randread,randwrite,randrw" -supported_block_sizes="1,2,4,8,16,32,64,128,256,512,1024" install_only="n" config="" rate_iops="" test_types="read,randread" # default is -non- destructive block_sizes="4,64,1024" targets="/tmp/fio" -device_mode="concurrent" # can also be sequential (one job per device, once at a time) runtime=30 ramptime=5 iodepth=32 -jobs_per_dev=1 ioengine="libaio" job_mode="concurrent" # serial or concurrent -file_size="256M" +file_size="4096M" direct=1 # don't cache IO's by default sync=0 # don't sync IO's by default -clients="" # A list of hostnames (hosta,hostb,hostc) where you want uperf to run. Note: if you use this, pbench must be installed on these systems already. -tool_group=default +clients="" # A list of hostnames (hosta,hostb,hostc) where you want fio to run. Note: if you use this, pbench must be installed on these systems already. 
+tool_label_pattern="fio-" +tool_group="default" +max_key_length=20 +primary_metric="readwrite_IOPS" function fio_usage() { printf "The following options are available:\n" @@ -68,7 +74,7 @@ function fio_usage() { printf "\t\ttime in seconds to warm up test before taking measurements (default is $ramptime)\n" printf "\n" printf -- "\t-b int[,int] --block-sizes=str[,str] (default is $block_sizes)\n" - printf "\t\tone or more block sizes in KiB: %s\n" "$supported_block_sizes (default is $block_sizes)" + printf "\t\tone or more block sizes in KiB (default is $block_sizes)\n" printf "\n" printf -- "\t-s int[,int] --file-size=str[,str] (default is $file_size)\n" printf "\t\tfile sizes in MiB: %s\n" @@ -87,7 +93,7 @@ function fio_usage() { } function fio_process_options() { - opts=$(getopt -q -o jic:t:b:s:d: --longoptions "help,direct:,sync:,install,clients:,iodepth:,ioengine:,config:,jobs-per-dev:,job-mode:,rate-iops:,ramptime:,runtime:,test-types:,block-sizes:,file-size:,targets:,tool-group:" -n "getopt.sh" -- "$@"); + opts=$(getopt -q -o jic:t:b:s:d: --longoptions "help,max-stddev:,max-failures:,samples:,direct:,sync:,install,clients:,iodepth:,ioengine:,config:,jobs-per-dev:,job-mode:,rate-iops:,ramptime:,runtime:,test-types:,block-sizes:,file-size:,targets:,tool-group:" -n "getopt.sh" -- "$@"); if [ $? 
-ne 0 ]; then printf "\t${benchmark}: you specified an invalid option\n\n" fio_usage @@ -104,6 +110,27 @@ function fio_process_options() { shift; install_only="y" ;; + --max-stddev) + shift; + if [ -n "$1" ]; then + maxstddevpct="$1" + shift; + fi + ;; + --max-failures) + shift; + if [ -n "$1" ]; then + max_failures="$1" + shift; + fi + ;; + --samples) + shift; + if [ -n "$1" ]; then + nr_samples="$1" + shift; + fi + ;; --direct) shift; if [ -n "$1" ]; then @@ -245,6 +272,36 @@ function fio_install() { fi } +function print_iteration { + # printing a iteration assumes this must be a new row, so include \n first + printf "\n%28s" "$1" >>$benchmark_summary_txt_file + printf "\n%s" "$1" >>$benchmark_summary_csv_file + if [ $1 == "iteration" ]; then + # this is just a label, so no links here + printf "\n%28s %s %s" "iteration" "summary" "tools">>$benchmark_summary_html_file + else + printf "\n%28s %s %s" "$1" "summary" "tools">>$benchmark_summary_html_file + fi +} + +function print_value { + if [ -z "$2" ]; then + printf "%${spacing}s" "$1" >>$benchmark_summary_txt_file + printf "%s" ",$1,stddevpct" >>$benchmark_summary_csv_file + printf "%${spacing}s" "$1" >>$benchmark_summary_html_file + else + printf "%${spacing}s" "$1[+/-$2]" >>$benchmark_summary_txt_file + printf "%s" ",$1,$2" >>$benchmark_summary_csv_file + printf "%${spacing}s" "$1[+/-$2]" >>$benchmark_summary_html_file + fi +} + +function print_newline { + printf "\n" >>$benchmark_summary_txt_file + printf "\n" >>$benchmark_summary_csv_file + printf "\n" >>$benchmark_summary_html_file +} + # Make sure this devices exists function fio_device_check() { local devs=$1 @@ -331,6 +388,7 @@ function fio_run_job() { debug_log "fio jobfile could not be found: $fio_job_file" return fi + echo "running fio job: $fio_job_file" mkdir -p $benchmark_results_dir mkdir -p $benchmark_results_dir/clients @@ -374,7 +432,7 @@ function fio_run_job() { chmod +x $benchmark_results_dir/fio.cmd debug_log "$benchmark: Going to run 
[$bench_cmd $bench_opts $client_opts]" pushd $benchmark_results_dir >/dev/null - $benchmark_results_dir/fio.cmd >$benchmark_results_dir/result.txt + $benchmark_results_dir/fio.cmd >$benchmark_results_dir/fio-result.txt popd >/dev/null stop-tools --group=$tool_group --iteration=$iteration --dir=$benchmark_results_dir if [ ! -z "$clients" ]; then @@ -396,16 +454,20 @@ function fio_run_job() { # Run the benchmark and start/stop perf analysis tools function fio_run_benchmark() { fio_device_check "$targets" "$clients" - benchmark_summary_file="$benchmark_run_dir/summary-result.txt" - benchmark_summary_html_file="$benchmark_run_dir/summary-result.html" + benchmark_summary_txt_file="$benchmark_run_dir/summary-result.txt" + rm -f $benchmark_summary_txt_file benchmark_summary_csv_file="$benchmark_run_dir/summary-result.csv" + rm -f $benchmark_summary_csv_file + benchmark_summary_html_file="$benchmark_run_dir/summary-result.html" + rm -f $benchmark_summary_html_file + + printf "# these results generated with:\n# $script_name %s\n\n" "$orig_cmd" >$benchmark_summary_txt_file + printf "
\n# these results generated with:\n# $script_name %s\n\n" "$orig_cmd" >$benchmark_summary_html_file
+	printf "\n" >>$benchmark_summary_txt_file
+	printf "\n" >>$benchmark_summary_html_file
+
 	mkdir -p $benchmark_run_dir/.running
 	local count=1
-	printf "these results generated with:\n%s\n\n" "$orig_cmd" >$benchmark_summary_file
-	printf "%20s%20s%20s%20s%20s%20s%20s%20s%20s\n" "iteration" "rw IOPs" "rw kB/sec" "read IOPS" "read kB/sec" "read 95th-lat-ms" "write IOPS" "write kB/sec" "write 95th-lat-ms" >>$benchmark_summary_file
-	printf "
\nthese results generated with:\n%s\n\n" "$orig_cmd" >$benchmark_summary_html_file
-	printf "%20s %s %s%20s%20s%20s%20s%20s%20s%20s%20s\n" "iteration" "details" "tools" "rw IOPs" "rw kB/sec" "read IOPS" "read kB/sec" "read 95th-lat-ms" "write IOPS" "write kB/sec" "write 95th-lat-ms" >>$benchmark_summary_html_file
-	printf "%s,%s,%s,%s,%s,%s,%s,%s,%s\n" "iteration" "rw IOPs" "rw kB/sec" "read IOPS" "read kB/sec" "read 95th-lat-ms" "write IOPS" "write kB/sec" "write 95th-lat-ms" >>$benchmark_summary_csv_file
 	if [ "$job_mode" = "serial" ]; then
 		# if each target is separated by a space, there will be one job for each in next for loop
 		targets=`echo $targets | sed -e s/,/" "/g`
@@ -415,26 +477,156 @@ function fio_run_benchmark() {
 			for block_size in `echo $block_sizes | sed -e s/,/" "/g`; do
 				job_num=1
 				iteration="${count}-${test_type}-${block_size}KiB"
-				if [ "$job_mode" = "serial" ]; then
-					dev_short_name="`basename $dev`"
-					# easier to identify what job used what device when having 1 job per device
-					iteration="$iteration-${dev_short_name}"
+				iteration_dir=$benchmark_run_dir/$iteration
+				result_stddevpct=$maxstddevpct # this test case will get a "do-over" if the stddev is not low enough
+				failures=0
+				while [[ $(echo "if (${result_stddevpct} >= ${maxstddevpct}) 1 else 0" | bc) -eq 1 ]]; do
+					if [[ $failures -gt 0 ]]; then
+						echo "Restarting iteration $iteration ($count of $total_iterations)"
+						log "Restarting iteration $iteration ($count of $total_iterations)"
+					fi
+					mkdir -p $iteration_dir
+					# each attempt at a test config requires multiple samples to get stddev
+					for sample in `seq 1 $nr_samples`; do
+						if [ "$job_mode" = "serial" ]; then
+							dev_short_name="`basename $dev`"
+							# easier to identify what job used what device when having 1 job per device
+							iteration="$iteration-${dev_short_name}"
+						fi
+						benchmark_results_dir="$iteration_dir/sample$sample"
+						benchmark_tools_dir="$benchmark_results_dir/tools-$tool_group"
+						benchmark_results_dir="$iteration_dir/sample$sample"
+						if [ "$postprocess_only" != "y" ]; then
+							mkdir -p $benchmark_results_dir
+							fio_job_file="$benchmark_results_dir/fio.job"
+							fio_create_jobfile "$test_type" "$ioengine" "$block_size" "$iodepth" "$direct" "$sync" "$runtime" "$ramptime" "$file_size" "$rate_iops" "$dev" "$fio_job_file"
+							fio_run_job "$iteration" "$benchmark_results_dir" "$fio_job_file" "$clients"
+						fi
+					done
+
+					# find the keys that we will compute avg & stddev
+					# NOTE: we always choose "sample1" since it is
+					# always present and shares the same keys with
+					# every other sample
+					keys=`cat $iteration_dir/sample1/result.txt  | awk -F= '{print $1}'`
+					s_keys=""
+					key_nr=0
+					# for each key, get the average & stddev
+					for key in $keys; do
+						# the s_key is used in the summary reports to save space, it is just an abbreviated key
+						s_key=`echo $key | cut  -d- -f2-`
+						# remove the label pattern from the s_key
+						s_key=`echo $s_key | sed -e s/"$tool_label_pattern"//`
+						s_key=`echo $s_key | sed -e s/"transactions"/"trans"/`
+						# store these in reverse order as the keys and be sure to print values in reverse order later
+						#s_keys="$s_key $s_keys"
+						s_keys[$key_nr]="$s_key"
+						s_key_length=`echo $s_key | wc -m`
+						if [ $s_key_length -gt $max_key_length ]; then
+							max_key_length=$s_key_length
+						fi
+						iteration_samples=""
+						for sample in `seq 1 $nr_samples`; do
+							value=`grep -- "^$key" $iteration_dir/sample$sample/result.txt | awk -F= '{print $2}'`
+							iteration_samples="$iteration_samples $value"
+						done
+						avg_stddev_result=`avg-stddev $iteration_samples`
+						samples[$key_nr]="$iteration_samples"
+						avg[$key_nr]=`echo $avg_stddev_result | awk '{print $1}'`
+						avg[$key_nr]=`printf "%.2f" ${avg[$key_nr]}`
+						stddev[$key_nr]=`echo $avg_stddev_result | awk '{print $2}'`
+						stddevpct[$key_nr]=`echo $avg_stddev_result | awk '{print $3}'`
+						stddevpct[$key_nr]=`printf "%.1f" ${stddevpct[$key_nr]}`
+						closest[$key_nr]=`echo $avg_stddev_result | awk '{print $4}'`
+						if echo $key | grep -q "$primary_metric"; then
+							tput_index=$key_nr
+							tput_metric=$key
+						fi
+						((key_nr++))
+					done
+	
+					# create a symlink to the result dir which most accurately represents the average result
+					for sample in `seq 1 $nr_samples`; do
+						sample_dir="sample$sample"
+						if [ $sample -eq ${closest[$tput_index]} ]; then
+							msg="$tput_metric: ${samples[$tput_index]}  average: ${avg[$tput_index]} stddev: ${stddevpct[$tput_index]}%  closest-sample: $sample"
+							echo $msg | tee $iteration_dir/sample-runs-summary.txt
+							log $msg
+							pushd "$iteration_dir" >/dev/null; /bin/rm -rf reference-result; ln -sf $sample_dir reference-result; popd >/dev/null
+						else
+							# delete the tool data [and response time log for rr tests] from the other samples to save space
+							# this option is off by default
+							if [ "$keep_failed_tool_data" == "n" ]; then
+								/bin/rm -rf $iteration_dir/$sample_dir/tools-* $iteration_dir/$sample_dir/response-times.txt
+							fi
+							# since non reference-result sample data is rarely referenced, tar it up to reduce the number of files used
+							if [ "$tar_nonref_data" == "y" ]; then
+								pushd "$iteration_dir" >/dev/null; tar --create --xz --force-local --file=$sample_dir.tar.xz $sample_dir && /bin/rm -rf $sample_dir; popd >/dev/null
+							fi
+						fi
+					done
+
+					# if we did not achieve the stddevpct, then move this result out of the way and try again
+					fail=0
+					if [[ $(echo "if (${stddevpct[$tput_index]} >= ${maxstddevpct}) 1 else 0" | bc) -eq 1 ]]; then
+						fail=1
+					fi
+					if [ $fail -eq 1 ]; then
+						let failures=$failures+1
+						msg="$iteration: the percent standard deviation (${stddevpct[$tput_index]}%) was not within maximum allowed (${maxstddevpct}%)"
+						echo $msg
+						log $msg
+						msg="This iteration will be repeated until either standard deviation is below the maximum allowed, or $max_failures failed attempts."
+						echo $msg
+						log $msg
+						msg="Changing the standard deviation percent can be done with --max-stddev= and the maximum failures with --max-failures="
+						echo $msg
+						log $msg
+						# tar up the failed iteration.  We may need to look at it later, but don't waste space by keeping it uncompressed
+						# if all attempts failed, leaving no good result, leave the last attempt uncompressed
+						if [ $failures -le $max_failures ]; then
+							pushd $benchmark_run_dir >/dev/null
+							mv $iteration $iteration-fail$failures
+							tar --create --xz --force-local --file=$iteration-fail$failures.tar.xz $iteration-fail$failures &&\
+							/bin/rm -rf $iteration-fail$failures
+							popd >/dev/null
+						fi
+					fi
+					# break out of this loop only if stddev is low enough or there are too many failures
+					if [ $fail -eq 0 -o $failures -ge $max_failures ]; then
+						break
+					fi
+				done
+				spacing=`echo "$max_key_length + 1" | bc`
+				
+				((key_nr--))
+				# print the labels for this group
+				if [ "$last_test_type" != "$test_type" ]; then
+					print_newline
+					print_iteration "iteration"
+					for i in `seq 0 $key_nr`; do
+						print_value "${s_keys[$i]}"
+					done
 				fi
-				# note: there are no samples taken to produce average & stddev, so result is put directly in "reference-result"
-				benchmark_results_dir="$benchmark_run_dir/$iteration/reference-result"
-				benchmark_tools_dir="$benchmark_results_dir/tools-$tool_group"
-				fio_job_file="$benchmark_results_dir/fio.job"
-				fio_create_jobfile "$test_type" "$ioengine" "$block_size" "$iodepth" "$direct" "$sync" "$runtime" "$ramptime" "$file_size" "$rate_iops" "$dev" "$fio_job_file"
-				fio_run_job "$iteration" "$benchmark_results_dir" "$fio_job_file" "$clients"
-				let count=$count+1
+				# print the corresponding values
+				print_iteration $iteration
+				for i in `seq 0 $key_nr`; do
+					print_value "${avg[$i]}" "${stddevpct[$i]}%"
+				done
+
+				echo "Iteration $iteration complete ($count of $total_iterations), with 1 pass and $failures failures"
+				log "Iteration $iteration complete ($count of $total_iterations), with 1 pass and $failures failures"
+				last_test_type="$test_type"
+				let count=$count+1 # now we can move to the next iteration
 			done
 		done
 	done
-	printf "
" >>$benchmark_summary_html_file + printf "
\n" >>$benchmark_summary_html_file + printf "\n" >>$benchmark_summary_txt_file } function fio_print_summary() { - cat $benchmark_summary_file + cat $benchmark_summary_txt_file } fio_process_options "$@" diff --git a/agent/bench-scripts/postprocess/fio-postprocess b/agent/bench-scripts/postprocess/fio-postprocess index 69c3435763..6b30af7d10 100755 --- a/agent/bench-scripts/postprocess/fio-postprocess +++ b/agent/bench-scripts/postprocess/fio-postprocess @@ -7,7 +7,7 @@ my $dir = $ARGV[0]; my $iteration = $ARGV[1]; my $tool_group = $ARGV[2]; -open(JS, "<$dir/result.txt"); +open(JS, "<$dir/fio-result.txt"); # skip past the non json stuff while () { if (/^{/) { @@ -115,37 +115,26 @@ foreach $client_jobs ( @$fio_results{$fio_json_field} ) { # now write all of this data to various files -# only append to this file, as we write to it over multiple calls to this script -open(BENCHMARK_TXT, ">>$dir/../../summary-result.txt"); -open(BENCHMARK_HTML, ">>$dir/../../summary-result.html"); -open(BENCHMARK_CSV, ">>$dir/../../summary-result.csv"); - -# the overall benchmark summary result includes a single result for each iteration (but no more detail) -printf BENCHMARK_TXT "%20s%20.2f%20.2f%20.2f%20.2f%20.2f%20.2f%20.2f%20.2f\n", "$iteration", - $all_client_results{readwrite_iops}, $all_client_results{readwrite_bw}, - $all_client_results{read_iops}, $all_client_results{read_bw}, $all_client_results{read_95lat}/$all_client_results{read_95lat_samples}/1000, - $all_client_results{write_iops}, $all_client_results{write_bw}, $all_client_results{write_95lat}/$all_client_results{write_95lat_samples}/1000; - -printf BENCHMARK_HTML "%20s %s %s%20.2f%20.2f%20.2f%20.2f%20.2f%20.2f%20.2f%20.2f\n", "$iteration", - "details", "tools", - $all_client_results{readwrite_iops}, $all_client_results{readwrite_bw}, - $all_client_results{read_iops}, $all_client_results{read_bw}, $all_client_results{read_95lat}/$all_client_results{read_95lat_samples}/1000, - $all_client_results{write_iops}, 
$all_client_results{write_bw}, $all_client_results{write_95lat}/$all_client_results{write_95lat_samples}/1000; - -printf BENCHMARK_CSV "%s,%.2f,%.2f,%.2f,%.2f,%.2f,%.2f,%.2f,%.2f\n", "$iteration", - $all_client_results{readwrite_iops}, $all_client_results{readwrite_bw}, - $all_client_results{read_iops}, $all_client_results{read_bw}, $all_client_results{read_95lat}/$all_client_results{read_95lat_samples}/1000, - $all_client_results{write_iops}, $all_client_results{write_bw}, $all_client_results{write_95lat}/$all_client_results{write_95lat_samples}/1000; - # this file we start from scratch +open(RESULT_TXT, ">$dir/result.txt"); open(ITERATION_TXT, ">$dir/summary-result.txt"); open(ITERATION_HTML, ">$dir/summary-result.html"); + +printf RESULT_TXT "readwrite_IOPS=%.2f\n", $all_client_results{readwrite_iops}; +printf RESULT_TXT "rw_kB_sec=%.2f\n", $all_client_results{readwrite_bw}; +printf RESULT_TXT "read_IOPS=%.2f\n", $all_client_results{read_iops}; +printf RESULT_TXT "read_kB_sec=%.2f\n", $all_client_results{read_bw}; +printf RESULT_TXT "read_95th_lat_us=%.2f\n", $all_client_results{read_95lat}; +printf RESULT_TXT "write_IOPS=%.2f\n", $all_client_results{write_iops}; +printf RESULT_TXT "write_kB_sec=%.2f\n", $all_client_results{write_bw}; +printf RESULT_TXT "write_95th_lat_us=%.2f\n", $all_client_results{write_95lat}; + printf ITERATION_TXT "%20s%20s%20s%20s%20s%20s%20s%20s%20s%20s\n", "$iteration", "clients", - "readwrite-IOPS", "rw-kB/sec", + "readwrite-IOPS", "readwrite-kB/sec", "read-IOPS", "read-kB/sec", "read-95th-lat-ms", "write-IOPS", "write-kB/sec", "write-95th-lat-ms"; printf ITERATION_HTML "
%20s%20s%s%20s%20s%20s%20s%20s%20s%20s%20s\n", "$iteration", "clients", "     details",
-  "readwrite-IOPS", "rw-kB/sec",
+  "readwrite-IOPS", "readwrite-kB/sec",
   "read-IOPS", "read-kB/sec", "read-95th-lat-ms",
   "write-IOPS", "write-kB/sec", "write-95th-lat-ms";
 # the iteration result includes the summed result for the all of the clients, then per-client result
@@ -171,7 +160,7 @@ foreach $client_name (keys %all_job_results) {
 	# the client result summary file includes the summed result for the client, and then a result for each job run
 	open(CLIENT_RESULT, ">$dir/clients/$client_name/summary-result.txt") || die "could not open $dir/clients/$client_name/summary-result.txt";
 	printf CLIENT_RESULT "%20s%20s%20s%20s%20s%20s%20s%20s%20s%20s\n", "$client_name", "job_name",
-	  "rw-IOPS", "rw-kB/sec",
+	  "rw-IOPS", "readwrite-kB/sec",
 	  "read-IOPS", "read-kB/sec", "read-95th-lat-ms",
 	  "write-IOPS", "write-kB/sec", "write-95th-lat-ms";
 	printf CLIENT_RESULT "%20s%20s%20.2f%20.2f%20.2f%20.2f%20.2f%20.2f%20.2f%20.2f\n", "", "all_jobs", 
@@ -188,8 +177,6 @@ foreach $client_name (keys %all_job_results) {
 	}
 	close(CLIENT_RESULT);
 }
-close(BENCHMARK_TXT);
-close(BENCHMARK_HTML);
-close(BENCHMARK_CSV);
+close(RESULT_TXT);
 close(ITERATION_TXT);
 close(ITERATION_HTML);