################################################################
# Introduction
################################################################
This bibliography is a resource for people writing papers that refer
to the Google cluster traces. It covers papers that analyze the
traces, as well as ones that use them as inputs to other studies.

* I recommend using \usepackage{url}.
* Entries are in publication-date order, with the most recent at the top.
* BibTeX ignores stuff that is outside the entries, so text like this is safe.

The following are the RECOMMENDED CITATIONS if you just need the basics:

1. \cite{clusterdata:Wilkes2011, clusterdata:Reiss2011} for the "full"
ClusterData2011 trace.
2. \cite{clusterdata:Reiss2012b} for the first thorough analysis of it.

If you use the traces, please send a BibTeX entry that looks like one of
these to [email protected], so your paper can be added - and cited!
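
As a concrete illustration, here is a minimal LaTeX sketch of using the
recommended citations (it assumes this file sits next to your paper as
bibliography.bib; the document text and plain bibliography style are
just examples, not requirements):

  \documentclass{article}
  \usepackage{url}
  \begin{document}
  We use the Google cluster traces
  \cite{clusterdata:Wilkes2011, clusterdata:Reiss2011}, which were
  first analyzed in depth in \cite{clusterdata:Reiss2012b}.
  \bibliographystyle{plain}
  \bibliography{bibliography}
  \end{document}

Compile with latex, then bibtex, then latex twice (or just run latexmk).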
################################################################
# Trace-announcements
################################################################
These entries can be used to cite the traces themselves.
The first couple are for the May 2011 "full" trace.
@Misc{clusterdata:Wilkes2011,
author = {John Wilkes},
title = {More {Google} cluster data},
howpublished = {Google research blog},
month = Nov,
year = 2011,
note = {Posted at
\url{http://googleresearch.blogspot.com/2011/11/more-google-cluster-data.html}.},
}
@TechReport{clusterdata:Reiss2011,
author = {Charles Reiss and John Wilkes and Joseph L. Hellerstein},
title = {{Google} cluster-usage traces: format + schema},
institution = {Google Inc.},
year = 2011,
month = Nov,
type = {Technical Report},
address = {Mountain View, CA, USA},
note = {Revised 2014-11-17 for version 2.1. Posted at
\url{https://github.com/google/cluster-data}},
}
#----------------
# The next one is for the earlier "small" 7-hour trace.
# (Most people should not be using this.)
@Misc{clusterdata:Hellerstein2010,
author = {Joseph L. Hellerstein},
title = {{Google} cluster data},
howpublished = {Google research blog},
month = Jan,
year = 2010,
note = {Posted at \url{http://googleresearch.blogspot.com/2010/01/google-cluster-data.html}.},
}
#----------------
The next paper describes the policy choices and technologies used to
make the traces safe to release.
@InProceedings{clusterdata:Reiss2012,
author = {Charles Reiss and John Wilkes and Joseph L. Hellerstein},
title = {Obfuscatory obscanturism: making workload traces of
commercially-sensitive systems safe to release},
year = 2012,
booktitle = {3rd International Workshop on Cloud Management (CLOUDMAN)},
month = Apr,
publisher = {IEEE},
pages = {1279--1286},
address = {Maui, HI, USA},
abstract = {Cloud providers such as Google are interested in fostering
research on the daunting technical challenges they face in
supporting planetary-scale distributed systems, but no
academic organizations have similar scale systems on which to
experiment. Fortunately, good research can still be done using
traces of real-life production workloads, but there are risks
in releasing such data, including inadvertently disclosing
confidential or proprietary information, as happened with the
Netflix Prize data. This paper discusses these risks, and our
approach to them, which we call systematic obfuscation. It
protects proprietary and personal data while leaving it
possible to answer interesting research questions. We explain
and motivate some of the risks and concerns and propose how
they can best be mitigated, using as an example our recent
publication of a month-long trace of a production system
workload on an 11k-machine cluster.},
url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=6212064},
}
################################################################
# Trace-analysis papers
################################################################
These papers are primarily about analyzing the traces.
Order: most recent first.
If you just want one citation about the Cluster2011 trace, then
use \cite{clusterdata:Reiss2012b}.
################ 2014
@INPROCEEDINGS{clusterdata:Abdul-Rahman2014,
author = {Abdul-Rahman, Omar Arif and Aida, Kento},
title = {Towards understanding the usage behavior of {Google} cloud
users: the mice and elephants phenomenon},
booktitle = {IEEE International Conference on Cloud Computing
Technology and Science (CloudCom)},
year = 2014,
month = dec,
address = {Singapore},
pages = {272--277},
keywords = { Google trace; Workload trace analysis; User session
view; Application composition; Mass-Count disparity;
Exploratory statistical analysis; Visual analysis;
Color-schemed graphs; Coarse grain classification;
Heavy-tailed distributions; Long-tailed lognormal
distributions; Exponential distribution; Normal distribution;
Discrete modes; Large web services; Batch processing;
MapReduce computation; Human users; },
abstract = {In the era of cloud computing, users encounter the
challenging task of effectively composing and running their
applications on the cloud. In an attempt to understand user
behavior in constructing applications and interacting with
typical cloud infrastructures, we analyzed a large utilization
dataset of Google cluster. In the present paper, we consider
user behavior in composing applications from the perspective
of topology, maximum requested computational resources, and
workload type. We model user dynamic behavior around the
user's session view. Mass-Count disparity metrics are used to
investigate the characteristics of underlying statistical
models and to characterize users into distinct groups
according to their composition and behavioral classes and
patterns. The present study reveals interesting insight into
the heterogeneous structure of the Google cloud workload.},
doi = {10.1109/CloudCom.2014.75},
}
################ 2013
@inproceedings{clusterdata:Di2013,
title = {Characterizing cloud applications on a {Google} data center},
author = {Di, Sheng and Kondo, Derrick and Cappello, Franck},
booktitle = {42nd International Conference on Parallel Processing (ICPP)},
year = 2013,
month = Oct,
address = {Lyon, France},
abstract = {In this paper, we characterize Google applications,
based on a one-month Google trace with over 650k jobs running
across over 12000 heterogeneous hosts from a Google data
center. On one hand, we carefully compute the valuable
statistics about task events and resource utilization for
Google applications, based on various types of resources (such
as CPU, memory) and execution types (e.g., whether they can
run batch tasks or not). Resource utilization per application
is observed with an extremely typical Pareto principle. On the
other hand, we classify applications via a K-means clustering
algorithm with optimized number of sets, based on task events
and resource usage. The number of applications in the K-means
clustering sets follows a Pareto-similar distribution. We
believe our work is very interesting and valuable for the
further investigation of Cloud environment.},
}
################ 2012
@INPROCEEDINGS{clusterdata:Reiss2012b,
title = {Heterogeneity and dynamicity of clouds at scale: {Google}
trace analysis},
author = {Charles Reiss and Alexey Tumanov and Gregory R. Ganger and
Randy H. Katz and Michael A. Kozuch},
booktitle = {ACM Symposium on Cloud Computing (SoCC)},
year = 2012,
month = Oct,
address = {San Jose, CA, USA},
abstract = {To better understand the challenges in developing
effective cloud-based resource schedulers, we analyze the
first publicly available trace data from a sizable
multi-purpose cluster. The most notable workload
characteristic is heterogeneity: in resource types (e.g.,
cores:RAM per machine) and their usage (e.g., duration and
resources needed). Such heterogeneity reduces the
effectiveness of traditional slot- and core-based scheduling.
Furthermore, some tasks are constrained as to the kind of
machine types they can use, increasing the complexity of
resource assignment and complicating task migration. The
workload is also highly dynamic, varying over time and most
workload features, and is driven by many short jobs that
demand quick scheduling decisions. While few simplifying
assumptions apply, we find that many longer-running jobs have
relatively stable resource utilizations, which can help
adaptive resource schedulers.},
url = {http://www.pdl.cmu.edu/PDL-FTP/CloudComputing/googletrace-socc2012.pdf},
privatenote = {An earlier version of this was posted at
\url{http://www.istc-cc.cmu.edu/publications/papers/2012/ISTC-CC-TR-12-101.pdf},
and included here as clusterdata:Reiss2012a. Please use this
version instead of that.},
}
@INPROCEEDINGS{clusterdata:Liu2012,
author = {Zitao Liu and Sangyeun Cho},
title = {Characterizing machines and workloads on a {Google} cluster},
booktitle = {8th International Workshop on Scheduling and Resource
Management for Parallel and Distributed Systems (SRMPDS)},
year = 2012,
month = Sep,
address = {Pittsburgh, PA, USA},
abstract = {Cloud computing offers high scalability, flexibility and
cost-effectiveness to meet emerging computing
requirements. Understanding the characteristics of real
workloads on a large production cloud cluster benefits not
only cloud service providers but also researchers and daily
users. This paper studies a large-scale Google cluster usage
trace dataset and characterizes how the machines in the
cluster are managed and the workloads submitted during a
29-day period behave. We focus on the frequency and pattern of
machine maintenance events, job- and task-level workload
behavior, and how the overall cluster resources are utilized.},
url = {http://www.cs.pitt.edu/cast/abstract/liu-srmpds12.html},
}
@INPROCEEDINGS{clusterdata:Di2012a,
author = {Sheng Di and Derrick Kondo and Walfredo Cirne},
title = {Characterization and comparison of cloud versus {Grid} workloads},
booktitle = {International Conference on Cluster Computing (IEEE CLUSTER)},
year = 2012,
month = Sep,
pages = {230--238},
address = {Beijing, China},
abstract = {A new era of Cloud Computing has emerged, but the
characteristics of Cloud load in data centers is not perfectly
clear. Yet this characterization is critical for the design of
novel Cloud job and resource management systems. In this
paper, we comprehensively characterize the job/task load and
host load in a real-world production data center at Google
Inc. We use a detailed trace of over 25 million tasks across
over 12,500 hosts. We study the differences between a Google
data center and other Grid/HPC systems, from the perspective
of both work load (w.r.t. jobs and tasks) and host load
(w.r.t. machines). In particular, we study the job length, job
submission frequency, and the resource utilization of jobs in
the different systems, and also investigate valuable
statistics of machine's maximum load, queue state and relative
usage levels, with different job priorities and resource
attributes. We find that the Google data center exhibits finer
resource allocation with respect to CPU and memory than that
of Grid/HPC systems. Google jobs are always submitted with
much higher frequency and they are much shorter than Grid
jobs. As such, Google host load exhibits higher variance and
noise.},
keywords = {cloud computing;computer centres;grid computing;queueing
theory;resource allocation;search engines;CPU;Google data
center;cloud computing;cloud job;cloud load;data centers;grid
workloads;grid-HPC systems;host load;job length;job submission
frequency;jobs resource utilization;machine maximum load;queue
state;real-world production data center;relative usage
levels;resource allocation;resource attributes;resource
management systems;task load;Capacity
planning;Google;Joints;Load modeling;Measurement;Memory
management;Resource management;Cloud Computing;Grid
Computing;Load Characterization},
doi = {10.1109/CLUSTER.2012.35},
privatenote = {An earlier version is available at
\url{http://hal.archives-ouvertes.fr/hal-00705858}. It used
to be included here as clusterdata:Di2012.},
}
################ 2010
@Article{clusterdata:Mishra2010,
author = {Mishra, Asit K. and Hellerstein, Joseph L. and Cirne,
Walfredo and Das, Chita R.},
title = {Towards characterizing cloud backend workloads: insights
from {Google} compute clusters},
journal = {SIGMETRICS Perform. Eval. Rev.},
volume = {37},
number = {4},
month = Mar,
year = 2010,
issn = {0163-5999},
pages = {34--41},
numpages = {8},
url = {http://doi.acm.org/10.1145/1773394.1773400},
doi = {10.1145/1773394.1773400},
publisher = {ACM},
abstract = {The advent of cloud computing promises highly available,
efficient, and flexible computing services for applications
such as web search, email, voice over IP, and web search
alerts. Our experience at Google is that realizing the
promises of cloud computing requires an extremely scalable
backend consisting of many large compute clusters that are
shared by application tasks with diverse service level
requirements for throughput, latency, and jitter. These
considerations impact (a) capacity planning to determine which
machine resources must grow and by how much and (b) task
scheduling to achieve high machine utilization and to meet
service level objectives.
Both capacity planning and task scheduling require a good
understanding of task resource consumption (e.g., CPU and
memory usage). This in turn demands simple and accurate
approaches to workload classification-determining how to form
groups of tasks (workloads) with similar resource demands. One
approach to workload classification is to make each task its
own workload. However, this approach scales poorly since tens
of thousands of tasks execute daily on Google compute
clusters. Another approach to workload classification is to
view all tasks as belonging to a single
workload. Unfortunately, applying such a coarse-grain workload
classification to the diversity of tasks running on Google
compute clusters results in large variances in predicted
resource consumptions.
This paper describes an approach to workload classification
and its application to the Google Cloud Backend, arguably the
largest cloud backend on the planet. Our methodology for
workload classification consists of: (1) identifying the
workload dimensions; (2) constructing task classes using an
off-the-shelf algorithm such as k-means; (3) determining the
break points for qualitative coordinates within the workload
dimensions; and (4) merging adjacent task classes to reduce
the number of workloads. We use the foregoing, especially the
notion of qualitative coordinates, to glean several insights
about the Google Cloud Backend: (a) the duration of task
executions is bimodal in that tasks either have a short
duration or a long duration; (b) most tasks have short
durations; and (c) most resources are consumed by a few tasks
with long duration that have large demands for CPU and
memory.},
}
################################################################
# Trace-usage papers
################################################################
These entries are for papers that primarily focus on some other topic, but
use the traces as inputs, e.g., in simulations or load predictions.
Order: most recent first.
################ 2016
@INPROCEEDINGS{clusterdata:Sliwko2016,
title = {{AGOCS} – Accurate {Google} Cloud Simulator Framework},
author = {Leszek Sliwko and Vladimir Getov},
booktitle = {16th IEEE International Conference on Scalable Computing and Communications (ScalCom 2016)},
year = 2016,
month = Jul,
pages={550--558},
address = {Toulouse, France},
keywords = {cloud system; workload traces; workload simulation framework; google cluster data},
abstract = {This paper presents the Accurate Google Cloud
Simulator (AGOCS) – a novel high-fidelity Cloud workload
simulator based on parsing real workload traces, which can be
conveniently used on a desktop machine for day-to-day
research. Our simulation is based on real-world workload
traces from a Google Cluster with 12.5K nodes, over a period
of a calendar month. The framework is able to reveal very
precise and detailed parameters of the executed jobs, tasks and
nodes as well as to provide actual resource usage statistics. The
system has been implemented in Scala language with focus on
parallel execution and an easy-to-extend design concept. The
paper presents the detailed structural framework for AGOCS
and discusses our main design decisions, whilst also suggesting
alternative and possibly performance enhancing future
approaches. The framework is available via the Open Source
GitHub repository.},
url = {http://dx.doi.org/10.1109/UIC-ATC-ScalCom-CBDCom-IoP-SmartWorld.2016.10},
doi={10.1109/UIC-ATC-ScalCom-CBDCom-IoP-SmartWorld.2016.10},
}
################ 2015
@INPROCEEDINGS{clusterdata:Carvalho2015,
title = {Prediction-Based Admission Control for {IaaS} Clouds with Multiple Service Classes},
author = {Marcus Carvalho and Daniel Menasce and Francisco Brasileiro},
booktitle = {IEEE International Conference on Cloud Computing Technology and Science (CloudCom)},
year = 2015,
month = Nov,
pages={82--90},
address = {Vancouver, BC, Canada},
keywords = {admission control;cloud computing;infrastructure-as-a-service;
performance prediction;quality of service;resource management},
abstract = {There is a growing adoption of cloud computing services,
attracting users with different requirements and budgets to run
their applications in cloud infrastructures. In order to match
users' needs, cloud providers can offer multiple service
classes with different pricing and Service Level Objective (SLO)
guarantees. Admission control mechanisms can help providers to
meet target SLOs by limiting the demand at peak periods. This
paper proposes a prediction-based admission control model for
IaaS clouds with multiple service classes, aiming to maximize
request admission rates while fulfilling availability SLOs
defined for each class. We evaluate our approach with trace-driven
simulations fed with data from production systems. Our results
show that admission control can reduce SLO violations
significantly, specially in underprovisioned scenarios. Moreover,
our predictive heuristics are less sensitive to different capacity
planning and SLO decisions, as they fulfill availability SLOs for
more than 91\% of requests even in the worst case scenario, for
which only 56\% of SLOs are fulfilled by a simpler greedy heuristic
and as little as 0.2\% when admission control is not used.},
url = {http://dx.doi.org/10.1109/CloudCom.2015.16},
doi={10.1109/CloudCom.2015.16},
}
@INPROCEEDINGS{clusterdata:Ismaeel2015,
author = {Salam Ismaeel and Ali Miri},
title = {Using {ELM} Techniques to Predict Data Centre {VM} Requests},
year = 2015,
booktitle = {IEEE International Conference on Cyber Security and Cloud Computing (CSCloud)},
month = Nov,
publisher = {IEEE},
address = {New York, NY, USA},
abstract = {Data centre prediction models can be used to forecast future loads for a
given centre in terms of CPU, memory, VM requests, and other parameters.
An effective and efficient model can not only be used to optimize resource allocation,
but can also be used as part of a strategy to conserve energy, improve performance
and increase profits for both clients and service providers. In this paper, we have
developed a prediction model, which combines k-means clustering techniques and
Extreme Learning Machines (ELMs). We have shown the effectiveness of our
proposed model by using it to estimate future VM requests in a data centre based
on its historical usage. We have tested our model on real Google traces that feature
over 25 million tasks collected over a 29-day time period. Experimental results
presented show that our proposed system outperforms other models reported in the literature.},
}
@INPROCEEDINGS{clusterdata:Sirbu2015,
title = {Towards Data-Driven Autonomics in Data Centers},
author = {Alina S{\^\i}rbu and Ozalp Babaoglu},
booktitle = {International Conference on Cloud and Autonomic Computing (ICCAC)},
month = Sep,
year = 2015,
address = {Cambridge, MA, USA},
publisher = {IEEE Computer Society},
keywords = {Data science; predictive analytics; Google cluster
trace; log data analysis; failure prediction; machine learning
classification; ensemble classifier; random forest; BigQuery},
abstract = {Continued reliance on human operators for managing data
centers is a major impediment for them from ever reaching
extreme dimensions. Large computer systems in general, and
data centers in particular, will ultimately be managed using
predictive computational and executable models obtained
through data-science tools, and at that point, the
intervention of humans will be limited to setting high-level
goals and policies rather than performing low-level
operations. Data-driven autonomics, where management and
control are based on holistic predictive models that are built
and updated using generated data, opens one possible path
towards limiting the role of operators in data centers. In
this paper, we present a data-science study of a public Google
dataset collected in a 12K-node cluster with the goal of
building and evaluating a predictive model for node failures.
We use BigQuery, the big data SQL platform from the Google
Cloud suite, to process massive amounts of data and generate a
rich feature set characterizing machine state over time. We
describe how an ensemble classifier can be built out of many
Random Forest classifiers each trained on these features, to
predict if machines will fail in a future 24-hour window. Our
evaluation reveals that if we limit false positive rates to
5\%, we can achieve true positive rates between 27\% and 88\%
with precision varying between 50\% and 72\%. We discuss the
practicality of including our predictive model as the central
component of a data-driven autonomic manager and operating it
on-line with live data streams (rather than off-line on data
logs). All of the scripts used for BigQuery and
classification analyses are publicly available from the
authors' website.},
url = {http://www.cs.unibo.it/babaoglu/papers/pdf/CAC2015.pdf},
}
@inproceedings{clusterdata:Delgado2015hawk,
author = {Pamela Delgado and Florin Dinu and Anne-Marie Kermarrec and Willy Zwaenepoel},
title = {{Hawk}: hybrid datacenter scheduling},
year = {2015},
booktitle = {USENIX Annual Technical Conference (USENIX ATC)},
month = Jul,
publisher = {USENIX Association},
pages = {499--510},
address = {Santa Clara, CA, USA},
isbn = {978-1-931971-225},
url = {https://www.usenix.org/conference/atc15/technical-session/presentation/delgado},
abstract = {
This paper addresses the problem of efficient scheduling of large clusters under
high load and heterogeneous workloads. A heterogeneous workload typically
consists of many short jobs and a small number of large jobs that consume the
bulk of the cluster’s resources.
Recent work advocates distributed scheduling to overcome the limitations
of centralized schedulers for large clusters with many competing
jobs. Such distributed schedulers are inherently scalable, but may make
poor scheduling decisions because of limited visibility into the overall
resource usage in the cluster. In particular, we demonstrate that under
high load, short jobs can fare poorly with such a distributed scheduler.
We propose instead a new hybrid centralized/distributed scheduler,
called Hawk. In Hawk, long jobs are scheduled using a centralized
scheduler, while short ones are scheduled in a fully distributed
way. Moreover, a small portion of the cluster is reserved for the use of
short jobs. In order to compensate for the occasional poor decisions
made by the distributed scheduler, we propose a novel and efficient
randomized work-stealing algorithm.
We evaluate Hawk using a trace-driven simulation and a prototype
implementation in Spark. In particular, using a Google trace, we show
that under high load, compared to the purely distributed Sparrow
scheduler, Hawk improves the 50th and 90th percentile runtimes by 80\%
and 90\% for short jobs and by 35\% and 10\% for long jobs,
respectively. Measurements of a prototype implementation using Spark on
a 100-node cluster confirm the results of the simulation.},
}
################ 2014
@InProceedings{clusterdata:Iglesias2014:task-estimation,
author = {Jesus Omana Iglesias and Liam Murphy and Milan De
Cauwer and Deepak Mehta and Barry O'Sullivan},
title = {A methodology for online consolidation of tasks through
more accurate resource estimations},
year = 2014,
month = Dec,
booktitle = {IEEE/ACM Intl. Conf. on Utility and Cloud Computing (UCC)},
address = {London, UK},
abstract = {Cloud providers aim to provide computing services for a
wide range of applications, such as web applications, emails,
web searches, and map reduce jobs. These applications are
commonly scheduled to run on multi-purpose clusters that
nowadays are becoming larger and more heterogeneous. A major
challenge is to efficiently utilize the cluster's available
resources, in particular to maximize overall machine
utilization levels while minimizing application waiting
time. We studied a publicly available trace from a large
Google cluster ($\sim$12,000 machines) and observed that users
generally request more resources than required for running
their tasks, leading to low levels of utilization. In this
paper, we propose a methodology for achieving an efficient
utilization of the cluster's resources while providing the
users with fast and reliable computing services. The
methodology consists of three main modules: i) a prediction
module that forecasts the maximum resource requirement of a
task; ii) a scalable scheduling module that efficiently
allocates tasks to machines; and iii) a monitoring module that
tracks the levels of utilization of the machines and tasks. We
present results that show that the impact of more accurate
resource estimations for the scheduling of tasks can lead to
an increase in the average utilization of the cluster, a
reduction in the number of tasks being evicted, and a
reduction in task waiting time.},
keywords = {online scheduling, Cloud computing, forecasting, resource provisioning,
constraint programming},
}
@InProceedings{clusterdata:Balliu2014,
author = {Alkida Balliu and Dennis Olivetti and Ozalp Babaoglu and
Moreno Marzolla and Alina Sirbu},
title = {{BiDAl: Big Data Analyzer} for cluster traces},
year = 2014,
booktitle = {Informatik Workshop on System Software Support for Big Data (BigSys)},
month = Sep,
publisher = {GI-Edition Lecture Notes in Informatics},
abstract = { Modern data centers that provide Internet-scale
services are stadium-size structures housing tens of thousands
of heterogeneous devices (server clusters, networking
equipment, power and cooling infrastructures) that must
operate continuously and reliably. As part of their
operation, these devices produce large amounts of data in the
form of event and error logs that are essential not only for
identifying problems but also for improving data center
efficiency and management. These activities employ data
analytics and often exploit hidden statistical patterns and
correlations among different factors present in the data.
Uncovering these patterns and correlations is challenging due
to the sheer volume of data to be analyzed. This paper
presents BiDAl, a prototype ``log-data analysis framework''
that incorporates various Big Data technologies to simplify
the analysis of data traces from large clusters. BiDAl is
written in Java with a modular and extensible architecture so
that different storage backends (currently, HDFS and SQLite
are supported), as well as different analysis languages
(current implementation supports SQL, R and Hadoop MapReduce)
can be easily selected as appropriate. We present the design
of BiDAl and describe our experience using it to analyze
several public traces of Google data clusters for building a
simulation model capable of reproducing observed behavior.},
}
@inproceedings{clusterdata:Caglar2014,
title = {{iOverbook}: intelligent resource-overbooking to support
soft real-time applications in the cloud},
author = {Faruk Caglar and Aniruddha Gokhale},
booktitle = {7th IEEE International Conference on Cloud Computing (IEEE CLOUD)},
year = 2014,
month = {Jun--Jul},
address = {Anchorage, AK, USA},
abstract = { Cloud service providers (CSPs) often overbook their
resources with user applications despite having to maintain
service-level agreements with their customers. Overbooking is
attractive to CSPs because it helps to reduce power
consumption in the data center by packing more user jobs in
less number of resources while improving their
profits. Overbooking becomes feasible because user
applications tend to overestimate their resource requirements
utilizing only a fraction of the allocated
resources. Arbitrary resource overbooking ratios, however, may
be detrimental to soft real-time applications, such as airline
reservations or Netflix video streaming, which are
increasingly hosted in the cloud. The changing dynamics of the
cloud preclude an offline determination of overbooking
ratios. To address these concerns, this paper presents
iOverbook, which uses a machine learning approach to make
systematic and online determination of overbooking ratios such
that the quality of service needs of soft real-time systems
can be met while still benefiting from
overbooking. Specifically, iOverbook utilizes historic data of
tasks and host machines in the cloud to extract their resource
usage patterns and predict future resource usage along with
the expected mean performance of host machines. To evaluate
our approach, we have used a large usage trace made available
by Google of one of its production data centers. In the
context of the traces, our experiments show that iOverbook can
help CSPs improve their resource utilization by an average of
12.5\% and save 32\% power in the data center.},
url = {http://www.dre.vanderbilt.edu/~gokhale/WWW/papers/CLOUD-2014.pdf},
}
@inproceedings{clusterdata:Sebastio2014,
author = {Sebastio, Stefano and Amoretti, Michele and Lluch Lafuente, Alberto},
title = {A computational field framework for collaborative task
execution in volunteer clouds},
booktitle = {International Symposium on Software Engineering for
Adaptive and Self-Managing Systems (SEAMS)},
year = 2014,
month = Jun,
isbn = {978-1-4503-2864-7},
address = {Hyderabad, India},
pages = {105--114},
url = {http://doi.acm.org/10.1145/2593929.2593943},
doi = {10.1145/2593929.2593943},
publisher = {ACM},
keywords = {ant colony optimization, bio-inspired algorithms, cloud
computing, distributed tasks execution, peer-to-peer, self-*
systems, spatial computing, volunteer computing},
abstract = {The increasing diffusion of cloud technologies offers
new opportunities for distributed and collaborative
computing. Volunteer clouds are a prominent example, where
participants join and leave the platform and collaborate by
sharing computational resources. The high complexity, dynamism
and unpredictability of such scenarios call for decentralized
self-* approaches. We present in this paper a framework for
the design and evaluation of self-adaptive collaborative task
execution strategies in volunteer clouds. As a byproduct, we
propose a novel strategy based on the Ant Colony Optimization
paradigm, that we validate through simulation-based
statistical analysis over Google cluster data.},
}
@inproceedings{clusterdata:Breitgand2014-adaptive,
title = {An adaptive utilization accelerator for virtualized environments},
author = {Breitgand, David and Dubitzky, Zvi and Epstein, Amir and
Feder, Oshrit and Glikson, Alex and Shapira, Inbar and
Toffetti, Giovanni},
booktitle = {International Conference on Cloud Engineering (IC2E)},
pages = {165--174},
year = 2014,
month = Mar,
publisher = {IEEE},
address = {Boston, MA, USA},
abstract = { One of the key enablers of a cloud provider
competitiveness is ability to over-commit shared
infrastructure at ratios that are higher than those of other
competitors, without compromising non-functional requirements,
such as performance. A widely recognized impediment to
achieving this goal is so called ``Virtual Machines sprawl'',
a phenomenon referring to the situation when customers order
Virtual Machines (VM) on the cloud, use them extensively and
then leave them inactive for prolonged periods of time. Since
a typical cloud provisioning system treats new VM provision
requests according to the nominal virtual hardware
specification, an often occurring situation is that the
nominal resources of a cloud/pool become exhausted fast while
the physical hosts utilization remains low. We present IBM
adaPtive UtiLiSation AcceleratoR (IBM PULSAR), a cloud
resources scheduler that extends OpenStack Nova Filter
Scheduler. IBM PULSAR recognises that effective safely
attainable over-commit ratio varies with time due to
workloads' variability and dynamically adapts the effective
over-commit ratio to these changes.},
}
@ARTICLE{clusterdata:Zhang2014-Harmony,
author = {Qi Zhang and Mohamed Faten Zhani and Raouf Boutaba and
Joseph L Hellerstein},
title = {Dynamic heterogeneity-aware resource provisioning in the cloud},
journal = {IEEE Transactions on Cloud Computing (TCC)},
year = 2014,
month = Mar,
volume = 2,
number = 1,
abstract = { Data centers consume tremendous amounts of energy in
terms of power distribution and cooling. Dynamic capacity
provisioning is a promising approach for reducing energy
consumption by dynamically adjusting the number of active
machines to match resource demands. However, despite extensive
studies of the problem, existing solutions have not fully
considered the heterogeneity of both workload and machine
hardware found in production environments. In particular,
production data centers often comprise heterogeneous machines
with different capacities and energy consumption
characteristics. Meanwhile, the production cloud workloads
typically consist of diverse applications with different
priorities, performance and resource requirements. Failure to
consider the heterogeneity of both machines and workloads will
lead to both sub-optimal energy-savings and long scheduling
delays, due to incompatibility between workload requirements
and the resources offered by the provisioned machines. To
address this limitation, we present Harmony, a
Heterogeneity-Aware dynamic capacity provisioning scheme for
cloud data centers. Specifically, we first use the K-means
clustering algorithm to divide workload into distinct task
classes with similar characteristics in terms of resource and
performance requirements. Then we present a technique that
dynamically adjusts the number of machines to minimize total
energy consumption and scheduling delay. Simulations using
traces from a Google compute cluster demonstrate Harmony can
reduce energy by 28 percent compared to
heterogeneity-oblivious solutions.},
}
################ 2013
@INPROCEEDINGS{clusterdata:Di2013a,
title = {Optimization of cloud task processing with checkpoint-restart mechanism},
author = {Di, Sheng and Robert, Yves and Vivien, Fr\'ed\'eric and
Kondo, Derrick and Wang, Cho-Li and Cappello, Franck},
booktitle = {25th International Conference on High Performance
Computing, Networking, Storage and Analysis (SC)},
year = 2013,
month = Nov,
address = {Denver, CO, USA},
abstract = {In this paper, we aim at optimizing fault-tolerance
techniques based on a checkpointing/restart mechanism, in the
context of cloud computing. Our contribution is
three-fold. (1) We derive a fresh formula to compute the
optimal number of checkpoints for cloud jobs with varied
distributions of failure events. Our analysis is not only
generic with no assumption on failure probability
distribution, but also attractively simple to apply in
practice. (2) We design an adaptive algorithm to optimize the
impact of checkpointing regarding various costs like
checkpointing/restart overhead. (3) We evaluate our optimized
solution in a real cluster environment with hundreds of
virtual machines and Berkeley Lab Checkpoint/Restart
tool. Task failure events are emulated via a production trace
produced on a large-scale Google data center. Experiments
confirm that our solution is fairly suitable for Google
systems. Our optimized formula outperforms Young's formula by
3--10 percent, reducing wallclock lengths by 50--100 seconds
per job on average.},
}
@inproceedings{clusterdata:Qiang2013-anomaly,
author = {Qiang Guan and Song Fu},
title = {Adaptive Anomaly Identification by Exploring Metric
Subspace in Cloud Computing Infrastructures},
booktitle = {32nd IEEE Symposium on Reliable Distributed Systems (SRDS)},
year = 2013,
month = Sep,
pages = {205--214},
address = {Braga, Portugal},
abstract = { Cloud computing has become increasingly popular by
obviating the need for users to own and maintain complex
computing infrastructures. However, due to their inherent
complexity and large scale, production cloud computing systems
are prone to various runtime problems caused by hardware and
software faults and environmental factors. Autonomic anomaly
detection is a crucial technique for understanding emergent,
cloud-wide phenomena and self-managing cloud resources for
system-level dependability assurance. To detect anomalous
cloud behaviors, we need to monitor the cloud execution and
collect runtime cloud performance data. These data consist of
values of performance metrics for different types of failures,
which display different correlations with the performance
metrics. In this paper, we present an adaptive anomaly
identification mechanism that explores the most relevant
principal components of different failure types in cloud
computing infrastructures. It integrates the cloud performance
metric analysis with filtering techniques to achieve
automated, efficient, and accurate anomaly identification. The
proposed mechanism adapts itself by recursively learning from
the newly verified detection results to refine future
detections. We have implemented a prototype of the anomaly
identification system and conducted experiments in an
on-campus cloud computing environment and by using the Google
data center traces. Our experimental results show that our
mechanism can achieve more efficient and accurate anomaly
detection than other existing schemes.},
}
@INPROCEEDINGS{clusterdata:Zhani2013-HARMONY,
title = {{HARMONY}: dynamic heterogeneity-aware resource provisioning in the cloud},
author = {Qi Zhang and Mohamed Faten Zhani and Raouf Boutaba and
Joseph L. Hellerstein},
booktitle = {33rd International Conference on Distributed Computing Systems (ICDCS)},
year = 2013,
pages = {510--519},
month = Jul,
address = {Philadelphia, PA, USA},
abstract = { Data centers today consume tremendous amount of energy
in terms of power distribution and cooling. Dynamic capacity
provisioning is a promising approach for reducing energy
consumption by dynamically adjusting the number of active
machines to match resource demands. However, despite extensive
studies of the problem, existing solutions for dynamic
capacity provisioning have not fully considered the
heterogeneity of both workload and machine hardware found in
production environments. In particular, production data
centers often comprise several generations of machines with
different capacities, capabilities and energy consumption
characteristics. Meanwhile, the workloads running in these
data centers typically consist of a wide variety of
applications with different priorities, performance objectives
and resource requirements. Failure to consider heterogenous
characteristics will lead to both sub-optimal energy-savings
and long scheduling delays, due to incompatibility between
workload requirements and the resources offered by the
provisioned machines. To address this limitation, in this
paper we present HARMONY, a Heterogeneity-Aware Resource
Management System for dynamic capacity provisioning in cloud
computing environments. Specifically, we first use the K-means
clustering algorithm to divide the workload into distinct task
classes with similar characteristics in terms of resource and
performance requirements. Then we present a novel technique
for dynamically adjusting the number of machines of each type
to minimize total energy consumption and performance penalty
in terms of scheduling delay. Through simulations using real
traces from Google's compute clusters, we found that our
approach can improve data center energy efficiency by up to
28\% compared to heterogeneity-oblivious solutions.},
}
@INPROCEEDINGS{clusterdata:Amoretti2013,
title = {A cooperative approach for distributed task execution in autonomic clouds},
author = {Amoretti, M. and Lafuente, A.L. and Sebastio, S.},
booktitle = {21st Euromicro International Conference on Parallel,
Distributed and Network-Based Processing (PDP)},
publisher = {IEEE},
year = 2013,
month = Feb,
pages = {274--281},
abstract = {Virtualization and distributed computing are two key
pillars that guarantee scalability of applications deployed in
the Cloud. In Autonomous Cooperative Cloud-based Platforms,
autonomous computing nodes cooperate to offer a PaaS Cloud for
the deployment of user applications. Each node must allocate
the necessary resources for applications to be executed with
certain QoS guarantees. If the QoS of an application cannot be
guaranteed a node has mainly two options: to allocate more
resources (if it is possible) or to rely on the collaboration
of other nodes. Making a decision is not trivial since it
involves many factors (e.g. the cost of setting up virtual
machines, migrating applications, discovering
collaborators). In this paper we present a model of such
scenarios and experimental results validating the convenience
of cooperative strategies over selfish ones, where nodes do
not help each other. We describe the architecture of the
platform of autonomous clouds and the main features of the
model, which has been implemented and evaluated in the DEUS
discrete-event simulator. From the experimental evaluation,
based on workload data from the Google Cloud Backend, we can
conclude that (modulo our assumptions and simplifications) the
performance of a volunteer cloud can be compared to that of a
Google Cluster.},
doi = {10.1109/PDP.2013.47},
ISSN = {1066-6192},
address = {Belfast, UK},
url = {http://doi.ieeecomputersociety.org/10.1109/PDP.2013.47},
}
################ 2012
@INPROCEEDINGS{clusterdata:Di2012b,
title = {Host load prediction in a {Google} compute cloud with a {Bayesian} model},
author = {Di, Sheng and Kondo, Derrick and Cirne, Walfredo},
booktitle = {International Conference on High Performance Computing,
Networking, Storage and Analysis (SC)},
year = 2012,
month = Nov,
isbn = {978-1-4673-0804-5},
address = {Salt Lake City, UT, USA},
pages = {21:1--21:11},
abstract = {Prediction of host load in Cloud systems is critical for
achieving service-level agreements. However, accurate
prediction of host load in Clouds is extremely challenging
because it fluctuates drastically at small timescales. We
design a prediction method based on Bayes model to predict the
mean load over a long-term time interval, as well as the mean
load in consecutive future time intervals. We identify novel
predictive features of host load that capture the expectation,
predictability, trends and patterns of host load. We also
determine the most effective combinations of these features
for prediction. We evaluate our method using a detailed
one-month trace of a Google data center with thousands of
machines. Experiments show that the Bayes method achieves high
accuracy with a mean squared error of 0.0014. Moreover, the
Bayes method improves the load prediction accuracy by
5.6--50\% compared to other state-of-the-art methods based on
moving averages, auto-regression, and/or noise filters.},
url = {http://dl.acm.org/citation.cfm?id=2388996.2389025},
publisher = {IEEE Computer Society Press},
}
@INPROCEEDINGS{clusterdata:Zhang2012,
title = {Dynamic energy-aware capacity provisioning for cloud computing environments},
author = {Zhang, Qi and Zhani, Mohamed Faten and Zhang, Shuo and
Zhu, Quanyan and Boutaba, Raouf and Hellerstein, Joseph L.},
booktitle = {9th ACM International Conference on Autonomic Computing (ICAC)},
year = 2012,
month = Sep,
isbn = {978-1-4503-1520-3},
address = {San Jose, CA, USA},
pages = {145--154},
acmid = {2371562},
publisher = {ACM},
doi = {10.1145/2371536.2371562},
keywords = {cloud computing, energy management, model predictive
control, resource management},
abstract = {Data centers have recently gained significant popularity
as a cost-effective platform for hosting large-scale service
applications. While large data centers enjoy economies of
scale by amortizing initial capital investment over large
number of machines, they also incur tremendous energy cost in
terms of power distribution and cooling. An effective approach
for saving energy in data centers is to adjust dynamically the
data center capacity by turning off unused machines. However,
this dynamic capacity provisioning problem is known to be
challenging as it requires a careful understanding of the
resource demand characteristics as well as considerations to
various cost factors, including task scheduling delay, machine
reconfiguration cost and electricity price fluctuation. In
this paper, we provide a control-theoretic solution to the
dynamic capacity provisioning problem that minimizes the total
energy cost while meeting the performance objective in terms
of task scheduling delay. Specifically, we model this problem
as a constrained discrete-time optimal control problem, and
use Model Predictive Control (MPC) to find the optimal control
policy. Through extensive analysis and simulation using real
workload traces from Google's compute clusters, we show that
our proposed framework can achieve significant reduction in
energy cost, while maintaining an acceptable average
scheduling delay for individual tasks.},
}
@INPROCEEDINGS{clusterdata:Ali-Eldin2012,
title = {Efficient provisioning of bursty scientific workloads on the
cloud using adaptive elasticity control},
author = {Ahmed Ali-Eldin and Maria Kihl and Johan Tordsson and Erik Elmroth},
booktitle = {3rd Workshop on Scientific Cloud Computing (ScienceCloud)},
year = 2012,
month = Jun,
address = {Delft, The Netherlands},
isbn = {978-1-4503-1340-7},
pages = {31--40},
url = {http://dl.acm.org/citation.cfm?id=2287044},
doi = {10.1145/2287036.2287044},
publisher = {ACM},
abstract = {Elasticity is the ability of a cloud infrastructure to
dynamically change the amount of resources allocated to a
running service as load changes. We build an autonomous
elasticity controller that changes the number of virtual
machines allocated to a service based on both monitored load
changes and predictions of future load. The cloud
infrastructure is modeled as a G/G/N queue. This model is used
to construct a hybrid reactive-adaptive controller that
quickly reacts to sudden load changes, prevents premature
release of resources, takes into account the heterogeneity of
the workload, and avoids oscillations. Using simulations with
Web and cluster workload traces, we show that our proposed
controller lowers the number of delayed requests by a factor
of 70 for the Web traces and 3 for the cluster traces when
compared to a reactive controller. Our controller also
decreases the average number of queued requests by a factor of
3 for both traces, and reduces oscillations by a factor of 7
for the Web traces and 3 for the cluster traces. This comes at
the expense of between 20\% and 30\% over-provisioning, as
compared to a few percent for the reactive controller.},
}
################ 2011
@INPROCEEDINGS{clusterdata:Sharma2011,
title = {Modeling and synthesizing task placement constraints in
{Google} compute clusters},
author = {Sharma, Bikash and Chudnovsky, Victor and Hellerstein,