From bff4ae84963aa96dbf09ab60135741bb19406526 Mon Sep 17 00:00:00 2001
From: weiting-chen
Date: Tue, 21 Jun 2022 15:31:51 +0800
Subject: [PATCH] [DNM]Add monitoring sar support.

---
 tools/README.md       |  66 ++++--
 tools/monitor.py      | 146 ++++++++++++++
 tools/post_process.sh |  35 ++++
 tools/run_example.sh  |  41 ++++
 tools/template.ipynb  | 456 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 727 insertions(+), 17 deletions(-)
 create mode 100644 tools/monitor.py
 create mode 100755 tools/post_process.sh
 create mode 100755 tools/run_example.sh
 create mode 100644 tools/template.ipynb

diff --git a/tools/README.md b/tools/README.md
index 0c1d5113b..bebc6333c 100644
--- a/tools/README.md
+++ b/tools/README.md
@@ -1,73 +1,105 @@
-### Spark Eventlog Analyzer
+# Spark Eventlog Analyzer
 The pyspark script to analyze Gazelle's eventlog
 ## Prequisites
-# Jupyter Installation
+### Jupyter Installation
 ```
 pip3 install jupyter
 ```
-# Notebook Installation
+### Notebook Installation
 ```
 pip3 install notebook
 ```
-# iPyKernel
+### iPyKernel
 ```
 pip3 install ipykernel
 ```
-# FindSpark
+### FindSpark
 ```
 pip3 install findspark
 ```
-# Matplotlib
+### Matplotlib
 ```
 pip3 install matplotlib
 ```
-# Seaborn
+### Seaborn
 ```
 pip3 install seaborn
 ```
-# Pandasql
+### Pandasql
 ```
 pip3 install pandasql
 ```
-# PyHDFS
+### PyHDFS
 ```
 pip3 install pyhdfs
 ```
-# PyArrow
+### PyArrow
 ```
 pip3 install pyarrow
 ```
-### Put below two .ipynb in the jupyter root directory
+# Eventlog Analyzer Tools
+The eventlog analyzer includes sparklog.ipynb and gazelle_analysis.ipynb.
+Please put them into the Jupyter root directory.
 ## sparklog.ipynb
 sparklog.ipynb is the function definition for spark eventlog analyzer
 ## gazelle_analysis.ipynb
 gazelle_analysis is the main program to call sparklog.ipynb and load the eventlog from hdfs.
-###How it works:
+## How it works:
 Launch gazelle_analysis.ipynb as main script
-###Parameters:
+## Parameters:
 In Analysis:generate_trace_view, the url for display.
 In App_Log_Analysis:get_basic_state, the url for display.
 In App_Log_Analysis:get_app_info, the url for display.
 In show_rst function, the url for html.
 In pyhdfs, the url for HDFS hosts.
-
-
-###To run in in commandline:
+## To run in the command line:
 jupyter nbconvert --execute --to notebook --inplace --allow-errors --ExecutePreprocessor.timeout=-1 ./gazelle_analysis.ipynb --template classic
-###To convert into HTML:
+## To convert into HTML:
 jupyter nbconvert --to html ./gazelle_analysis.ipynb --output ./gazelle_analysis.html --template classic
+
+# Tools to collect sar information and generate a trace view (.json)
+You can also use the files below to collect sar information.
+This tool generates a JSON file from the collected sar data.
+After the JSON file has been generated, you can view it with Catapult.
+
+monitor.py: the main program to collect sar information. It must be started before your application and stopped after your application.
+post_process.sh: Post-processes the sar files after the monitor stops.
+run_example.sh: An example showing how to use monitor.py with your application.
+template.ipynb: A notebook template used to generate the trace view (.json).
+
+## Usage
+Before running the tool, make sure to configure the following settings.
+In monitor.py:
+clients: the nodes in your cluster.
+base_dir: the base directory name for the logs.
+local_profile_dir: the local directory where the logs are stored.
+hdfs_address: the HDFS address to which all the logs are copied.
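+
+For example, the defaults at the top of monitor.py shipped with this patch look
+like the snippet below; point the hostname and the HDFS (WebHDFS) address at
+your own cluster:
+```
+clients = ['sr124']                        # worker nodes to monitor over ssh
+home = str(Path.home())
+base_dir = 'profile'                       # base directory name for the logs
+local_profile_dir = home + "/" + base_dir  # local directory for collected logs
+hdfs_address = '10.1.0.24:50070'           # WebHDFS address used by pyhdfs
+```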
+ +In sparklog.ipynb, +Please replace sr124 to the master in your cluster and use to process the logs. +Please replace sr525 to the catapult server. + +You can check run_example.sh to see how to use the script to collect sar information. +Please add below command in your script: +``` +appid=`yarn application -list 2>&1 | tail -n 1 | awk -F"\t" '{print $1}'` +rm -f log/memory*.csv +python3 ./monitor.py start $appid +$run_your_query +python3 ./monitor.py stop $appid "spark_logs" +``` diff --git a/tools/monitor.py b/tools/monitor.py new file mode 100644 index 000000000..2816de488 --- /dev/null +++ b/tools/monitor.py @@ -0,0 +1,146 @@ +import os +import pyhdfs +import subprocess +import sys +import time + +from inspect import currentframe, getframeinfo +from pathlib import Path + +clients=['sr124'] +home = str(Path.home()) +base_dir = 'profile' +local_profile_dir=home+"/"+base_dir +hdfs_address='10.1.0.24:50070' + +def killsar(): + for l in clients: + try: + cmd="ssh "+l+" ps aux | grep -w sar | grep -v grep | tr -s ' ' | cut -d' ' -f2" + out=subprocess.check_output(cmd).decode('ascii').strip().split("\n") + for x in out: + cmd="ssh "+l+" kill "+x+" > /dev/null 2>&1" + subprocess.call(cmd,shell=True) + except Exception as e: + print(e) + pass + for l in clients: + try: + cmd="ssh "+l+" ps aux | grep -w pidstat | grep -v grep | tr -s ' ' | cut -d' ' -f2" + out=subprocess.check_output(cmd, shell=True).decode('ascii').strip().split("\n") + for x in out: + cmd="ssh "+l+" kill "+x+" > /dev/null 2>&1" + subprocess.call(cmd,shell=True) + except Exception as e: + print(e) + pass + for l in clients: + try: + cmd="ssh "+l+" ps aux | grep -w perf | grep -v grep | tr -s ' ' | cut -d' ' -f2" + out=subprocess.check_output(cmd,shell=True).decode('ascii').strip().split("\n") + except Exception as e: + print(e) + pass + +def startmonitor(appid, **kwargs): + print("[monitor.py]Starting system monitoring ...") + print(clients) + appid_profile_dir=local_profile_dir+"/"+appid + cmd="mkdir -p "+appid_profile_dir + print("Launching CMD create application id profile dir: %s" % cmd) + subprocess.call(cmd,shell=True) + + for l in clients: + cmd="ssh "+l+" date" + print(subprocess.check_output(cmd,shell=True).decode('ascii')) + + killsar() + + for l in clients: + print("[monitor.py]create profile directory") + client_profile_dir=appid_profile_dir+"/"+l + cmd="mkdir -p "+client_profile_dir + print("[monitor.py]Launching CMD create server profile dir: %s" % cmd) + subprocess.call(cmd,shell=True) + cmd="ssh "+l+" mkdir -p "+client_profile_dir + print("[monitor.py]Launching CMD create client profile dir: %s" % cmd) + subprocess.call(cmd,shell=True) + cmd="ssh "+l+" sar -o "+client_profile_dir+"/sar.bin -r -u -d -B -n DEV 1 >/dev/null 2>&1 &" + print("[monitor.py]Launching CMD create sar.bin file: %s" % cmd) + subprocess.call(cmd,shell=True) + if kwargs.get("collect_pid",False): + cmd="ssh "+l+" jps | grep CoarseGrainedExecutorBackend | head -n 1 | cut -d' ' -f 1 | xargs -I % pidstat -h -t -p % 1 > "+client_profile_dir+"/pidstat.out 2>/dev/null &" + print("Launching CMD collect pid: %s" % cmd) + subprocess.call(cmd,shell=True) + return appid_profile_dir + +def stopmonitor(appid, eventlogdir, basedif): + + appid_profile_dir=local_profile_dir+"/"+appid + cmd="mkdir -p "+appid_profile_dir + print("Launching CMD create application id profile dir: %s" % cmd) + subprocess.call(cmd,shell=True) + + killsar() + + with open("%s/starttime" % appid_profile_dir,"w") as f: + f.write("{:d}".format(int(time.time()*1000))) + + 
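+    # Gather the raw monitoring data from every client node: expand the binary
+    # sar recording (sar.bin) into per-metric text reports (memory, cpu, disk,
+    # network, paging), collect the executor thread list and sar version, and
+    # scp everything back into the local profile directory. Finally copy the
+    # Spark eventlog and the whole profile tree to HDFS for the notebooks.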
hadoophome=os.environ["HADOOP_HOME"] + userlogdir="/opt/hadoop/yarn/logs" + + for l in clients: + client_profile_dir=appid_profile_dir+"/"+l + cmd="ssh "+l+" sar -f "+client_profile_dir+"/sar.bin -r > "+client_profile_dir+"/sar_mem.sar;sar -f "+client_profile_dir+"/sar.bin -u > "+client_profile_dir+"/sar_cpu.sar;sar -f "+client_profile_dir+"/sar.bin -d > "+client_profile_dir+"/sar_disk.sar;sar -f "+client_profile_dir+"/sar.bin -n DEV > "+client_profile_dir+"/sar_nic.sar;sar -f "+client_profile_dir+"/sar.bin -B > "+client_profile_dir+"/sar_page.sar;" + print("Launching CMD: %s" % cmd) + subprocess.call(cmd,shell=True) + cmd="ssh "+l+" grep -rI xgbtck --no-filename "+userlogdir+"/"+appid+"/* | sed 's/^ //g' > "+client_profile_dir+"/xgbtck.txt" + print("Launching CMD: %s" % cmd) + subprocess.call(cmd,shell=True) + cmd="scp -r "+l+":"+client_profile_dir+" "+appid_profile_dir+"/ > /dev/null 2>&1" + print("Launching CMD: %s" % cmd) + subprocess.call(cmd, shell=True) + cmd="ssh "+l+" jps | grep CoarseGrainedExecutorBackend | head -n 2 | tail -n 1 | cut -d' ' -f 1 | xargs -I % ps -To tid p % > "+client_profile_dir+"/sched_threads.txt" + subprocess.call(cmd, shell=True) + cmd="ssh "+l+" sar -V > "+client_profile_dir+"/sarv.txt" + print("Launching CMD: %s" % cmd) + subprocess.call(cmd, shell=True) + cmd="test -f "+client_profile_dir+"/perfstat.txt && head -n 1 "+client_profile_dir+"/perfstat.txt > "+client_profile_dir+"/perfstarttime" + print("Launching CMD: %s" % cmd) + subprocess.call(cmd,shell=True) + + logfile=eventlogdir+"/"+appid + cmd="hadoop fs -copyToLocal "+logfile+" "+appid_profile_dir+"/app.log" + print("Launching CMD hadoop fs copytolocal: %s" % cmd) + subprocess.call(cmd,shell=True) + + fs = pyhdfs.HdfsClient(hosts=hdfs_address, user_name='root') + + print("Launching CMD hadoop fs -mkdir /%s" % basedif) + fs.mkdirs("/" + basedif + "/") + v=[os.path.join(dp, f) for dp, dn, fn in os.walk(os.path.expanduser(local_profile_dir+"/"+appid)) for f in fn] + for f in v: + paths=os.path.split(f) + fs.mkdirs("/"+ basedif + paths[0][len(local_profile_dir):]) + fs.copy_from_local(f,"/"+ basedif + paths[0][len(local_profile_dir):]+"/"+paths[1],overwrite=True) + + +if __name__ == '__main__': + if sys.argv[1]=="start": + startmonitor( sys.argv[2]) + elif sys.argv[1]=="stop": + import datetime + from datetime import date + basedir=base_dir+"/"+date.today().strftime("%Y_%m_%d") + stopmonitor( sys.argv[2],sys.argv[3],basedir) + + lastnightrun=["","",""] + with open("log/runs.txt") as f: + for l in f.readlines(): + x=l.strip().split(" ") + if ( x[0]=="05" and x[2]!=sys.argv[2] ) or ( len(sys.argv)==5 and x[2]==sys.argv[4] ): + lastnightrun[0]=x[0] + lastnightrun[1]=x[1] + lastnightrun[2]=x[2] + os.system(("./post_process.sh {} {} {} {}").format(date.today().strftime("%Y_%m_%d"), sys.argv[2], lastnightrun[1],lastnightrun[2])) + diff --git a/tools/post_process.sh b/tools/post_process.sh new file mode 100755 index 000000000..7506004e0 --- /dev/null +++ b/tools/post_process.sh @@ -0,0 +1,35 @@ +# /bin/sh +echo -e "Starting Post Processing ..." 
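+
+# Arguments (passed by monitor.py at the end of a run):
+#   $1 - run date (YYYY_MM_DD); the profile is read from profile/$1/$2
+#   $2 - Spark application id of this run
+#   $3 - date of an earlier reference run (optional, fills LAST_BASEDIR)
+#   $4 - application id of that earlier run (optional, fills LAST_APPID)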
+echo -e "Start notebook processing - template.ipynb" +sed 's/BASEDIR_TEMP/profile\/'$1'/g' template.ipynb > tpcxx_$1_$2.ipynb +sed -i 's/APPID_TEMP/'$2'/g' tpcxx_$1_$2.ipynb +if [ $# -eq 4 ] +then + sed -i 's/LAST_BASEDIR/profile\/'$3'/g' tpcxx_$1_$2.ipynb + sed -i 's/LAST_APPID/'$4'/g' tpcxx_$1_$2.ipynb +fi + +hadoop fs -mkdir /history +hadoop fs -cp /profile/$1/$2/app.log /history/$2 +echo -e "Finish notebook processing - template.ipynb" + +echo -e "Start notebook execution - tpcxx.ipynb" +mkdir -p html +jupyter nbconvert --execute --to notebook --inplace --allow-errors --ExecutePreprocessor.timeout=-1 ./tpcxx_$1_$2.ipynb --template classic +jupyter nbconvert --to html ./tpcxx_$1_$2.ipynb --output html/tpcxx_$1_$2.html --template classic + +#echo -e "notebook processing - tpch_summary.ipynb" +#sed 's/BASEDIR_TEMP/profile\/'$1'/g' tpch_template_summary.ipynb > tpch_summary_$1_$2.ipynb +#sed -i 's/APPID_TEMP/'$2'/g' tpch_summary_$1_$2.ipynb +#if [ $# -eq 4 ] +#then +# sed -i 's/LAST_BASEDIR/profile\/'$3'/g' tpch_summary_$1_$2.ipynb +# sed -i 's/LAST_APPID/'$4'/g' tpch_summary_$1_$2.ipynb +#fi + +#jupyter nbconvert --execute --to notebook --inplace --allow-errors --ExecutePreprocessor.timeout=-1 ./tpch_summary_$1_$2.ipynb --template classic +#jupyter nbconvert --to html --no-input ./tpch_summary_$1_$2.ipynb --output html/tpch_summary_$1_$2.html --template classic +#echo -e "Finish notebook processing - tpch.ipynb" + +#rm -rf ./tpch_summary_$1_$2.ipynb +echo -e "Finish Post Processing !!!" diff --git a/tools/run_example.sh b/tools/run_example.sh new file mode 100755 index 000000000..646339d82 --- /dev/null +++ b/tools/run_example.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +#/home/spark-sql/collect_sar.sh +eventlogdir="spark_logs" + +echo "Stoping Thrift Server ..." +./run_spark_thrift_server.sh stop +echo "Done" +sleep 20 +echo "Cleaning Cache ..." +./clean_cache.sh +echo "Done" +sleep 1 +echo "Starting Thrift Server ..." +./run_spark_thrift_server.sh start +echo "Done" +sleep 40 +echo "Start Resource Monitoring ..." +appid=`yarn application -list 2>&1 | tail -n 1 | awk -F"\t" '{print $1}'` +echo `date +'%H %Y_%m_%d'` $appid ${1} >> log/runs.txt +rm -f log/memory*.csv +python3 ./monitor.py start $appid +echo "Running TPCH Query" +./run_tpch.py 2>&1 >> tpch_query.log | tee -a tpch_query.txt +echo "Done" +sleep 1 +echo "Stop Thrift Server" +./run_spark_thrift_server.sh stop +sleep 10 +python3 ./monitor.py stop $appid "spark_logs" +echo '' > log/link.html +echo 'history event: http://10.1.0.24:18080/history/'$appid'/jobs/
' >> log/link.html +echo 'history on sr124: http://10.1.0.24:18080/history/'$appid'/jobs/
' >> log/link.html
+echo 'notebook on sr124: http://10.1.0.24:8888/notebooks/jenkins/tpcxx_'`date +'%Y_%m_%d'`'_'$appid'.ipynb
' >> log/link.html
+echo 'notebook html on sr124: http://10.1.0.24:8888/notebooks/jenkins/html/tpcxx_'`date +'%Y_%m_%d'`'_'$appid'.html
' >> log/link.html +echo 'traceview on sr124: http://10.1.0.24:1088/tracing_examples/trace_viewer.html#/tracing/test_data/'$appid'.json
' >> log/link.html + +echo "

" >> log/link.html + +echo "All Jobs Are Done." +#/home/spark-sql/stop_sar.sh diff --git a/tools/template.ipynb b/tools/template.ipynb new file mode 100644 index 000000000..87b4d351b --- /dev/null +++ b/tools/template.ipynb @@ -0,0 +1,456 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Imports" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from __future__ import nested_scopes\n", + "from IPython.core.display import display, HTML\n", + "display(HTML(\"\"))\n", + "display(HTML(''))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "logger = logging.getLogger()\n", + "logger.setLevel(logging.ERROR)\n", + "\n", + "import warnings\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import findspark\n", + "findspark.init()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "import os\n", + "import pandas\n", + "pandas.set_option('display.max_rows', None)\n", + "\n", + "import matplotlib\n", + "%matplotlib inline\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.ticker as mtick\n", + "from matplotlib import colors\n", + "from matplotlib import rcParams\n", + "rcParams['font.sans-serif'] = 'Courier New'\n", + "rcParams['font.family'] = 'Courier New'\n", + "rcParams['font.size'] = '12'\n", + "%matplotlib inline\n", + "\n", + "from IPython.display import display,HTML\n", + "import threading\n", + "import collections\n", + "\n", + "from IPython.display import display\n", + "from ipywidgets import IntProgress,Layout\n", + "import time\n", + "import threading\n", + "import gzip\n", + "import pyspark\n", + "import pyspark.sql\n", + "from pyspark.sql import SparkSession\n", + "from pyspark.sql.types import (StructType, StructField, DateType,\n", + " TimestampType, StringType, LongType, IntegerType, DoubleType,FloatType)\n", + "from pyspark.sql.functions import to_date, floor\n", + "from pyspark.ml.feature import StringIndexer, VectorAssembler\n", + "from pyspark.ml import Pipeline\n", + "from pyspark.sql.functions import lit\n", + "import datetime\n", + "import time\n", + "from pyspark.storagelevel import StorageLevel\n", + "from pyspark.sql.window import Window\n", + "from pyspark.sql.functions import rank, col\n", + "from pyspark.ml import Pipeline\n", + "import pandas\n", + "import numpy\n", + "\n", + "\n", + "import re\n", + "import math\n", + "from functools import reduce\n", + "import json\n", + "\n", + "\n", + "from pyspark.sql.types import *\n", + "from pyspark.sql import functions as F\n", + "from datetime import date" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# start analysis cluster and run" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "code_folding": [] + }, + "outputs": [], + "source": [ + "executors_per_node = 5\n", + "nodes=3\n", + "cores_per_executor=4\n", + "task_per_core=4\n", + "\n", + "\n", + "cache_size=20\n", + "total_size=100000\n", + "print('executor per node: {:d}\\nparallelism: {:d}\\nmemory: 
{:d}m\\noffheap:{:d}m'.format(executors_per_node,nodes*executors_per_node*cores_per_executor*task_per_core,int(math.floor(nodes*total_size/(nodes*executors_per_node)))-1024-int(math.floor(cache_size*1024/(nodes*executors_per_node))),int(math.floor(cache_size*1024/(nodes*executors_per_node)))))\n", + "\n", + "from pyspark import SparkConf, SparkContext\n", + "from pyspark.sql import SQLContext\n", + "conf = (SparkConf()\n", + " .set(\"spark.default.parallelism\", \"{:d}\".format(nodes*executors_per_node))\n", + " .set('spark.executor.instances', '{:d}'.format(nodes*executors_per_node))\n", + " .set('spark.sql.files.maxPartitionBytes', '256m')\n", + " .set('spark.app.name', 'pyspark_final')\n", + " .set('spark.rdd.compress', 'False')\n", + " .set('spark.serializer','org.apache.spark.serializer.KryoSerializer') \n", + " .set('spark.executor.cores','{:d}'.format(cores_per_executor))\n", + " .set('spark.sql.adaptive.coalescePartitions.initialPartitionNum','{:d}'.format(nodes*executors_per_node*cores_per_executor*task_per_core))\n", + " .set('spark.sql.adaptive.enabled',True)\n", + " .set('spark.sql.adaptive.advisoryPartitionSizeInBytes', '256m')\n", + " .set('spark.executor.memory', '{:d}m'.format(int(math.floor(nodes*total_size/(nodes*executors_per_node)))-1024-int(math.floor(cache_size*1024/(nodes*executors_per_node)))))\n", + " .set('spark.task.cpus','1')\n", + " .set('spark.driver.memory','128g')\n", + " .set('spark.sql.inMemoryColumnarStorage.compressed','False')\n", + " .set('spark.sql.inMemoryColumnarStorage.batchSize','100000')\n", + "# .set('spark.memory.storageFraction','0.8')\n", + "# .set('spark.memory.fraction','0.7')\n", + " \n", + " .set('spark.sql.execution.arrow.fallback.enabled','True')\n", + " .set('spark.sql.execution.arrow.enabled','True')\n", + " .set('spark.sql.execution.arrow.maxRecordsPerBatch','100000')\n", + " .set(\"spark.sql.repl.eagerEval.enabled\", True)\n", + " \n", + " .set('spark.memory.offHeap.enabled','True')\n", + " .set('spark.memory.offHeap.size','{:d}m'.format(int(math.floor(cache_size*1024/(nodes*executors_per_node)))))\n", + " .set('spark.executor.memoryOverhead','{:d}m'.format(int(math.floor(cache_size*1024/(nodes*executors_per_node)))+3000))\n", + " .set('spark.sql.join.preferSortMergeJoin','False')\n", + "# .set('spark.executor.extraJavaOptions',\n", + "# '-XX:+UseG1GC -XX:+PrintFlagsFinal -XX:+PrintReferenceGC -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:InitiatingHeapOccupancyPercent=35')\n", + " .set('spark.executor.extraJavaOptions',\n", + " '-XX:+UseParallelGC -XX:+UseParallelOldGC -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps')\n", + " .set('spark.dynamicAllocation.enabled', 'False')\n", + " .set('spark.shuffle.service.enabled', 'False')\n", + " .set('spark.dynamicAllocation.shuffleTracking.enabled', 'Falze')\n", + " .set(\"spark.sql.legacy.timeParserPolicy\",\"LEGACY\")\n", + "\n", + " \n", + "# .set(\"spark.sql.session.timeZone\", \"PST\")\n", + " .set(\"spark.sql.session.timeZone\", \"Etc/GMT-8\")\n", + "# .set(\"spark.driver.extraJavaOptions\",\"-Duser.timezone=UTC+8\")\n", + " \n", + " )\n", + "\n", + "sc = SparkContext(conf=conf,master='yarn')\n", + "sc.setLogLevel(\"ERROR\")\n", + "spark = SQLContext(sc)\n", + "time.sleep(10)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%html\n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "# Sparklog" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "%run ./sparklog_monitor.ipynb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import warnings\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "basedir=\"BASEDIR_TEMP\"\n", + "appid=\"APPID_TEMP\"\n", + "\n", + "last_basedir=\"LAST_BASEDIR\"\n", + "last_appid=\"LAST_APPID\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# native, partition table" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "app=Application_Run(appid,basedir=basedir)\n", + "appals=app.analysis['app']['als']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "appals.get_basic_state()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "appals.get_app_name()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#shuffle_df, dfx=appals.get_shuffle_stat()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# APP info" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "app.generate_trace_view(showemon=False,show_metric=emonmetric,disk_prefix='nvme%n1p1',nic_prefix=[\"'enp134s0f1'\",\"'eno1'\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "appals=app.analysis['app']['als']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "appals.get_app_info(disk_prefix='nvme%n1p1',nic_prefix=[\"'enp134s0f1'\",\"'eno1'\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "appals.show_critical_path_time_breakdown()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "hide_input": false, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.8" + }, + "nbTranslate": { + "displayLangs": [ + "*" + ], + "hotkey": "alt-t", + "langInMainMenu": true, + "sourceLang": "en", + "targetLang": "fr", + "useGoogleTranslate": true + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": false, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": { + "height": "197px", + 
"left": "2188px", + "top": "111px", + "width": "269px" + }, + "toc_section_display": true, + "toc_window_display": true + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}