diff --git a/tools/gazelle_analysis.ipynb b/tools/gazelle_analysis.ipynb index 839b19bea..607d84e8e 100644 --- a/tools/gazelle_analysis.ipynb +++ b/tools/gazelle_analysis.ipynb @@ -400,6 +400,22 @@ "# if you need to open the traceview from hostip instead of 127.0.0.1, hack catapult :-)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The command generate the traceview as json format. It can be open by traceviewer in https://chromium.googlesource.com/catapult.\n", + "# 1. clone catapult\n", + "# 2. copy/generate the traceview json file to a folder like /home/xxx/trace_result\n", + "# 3. cd catapult/bin/\n", + "# 4. python2.7 ./run_dev_server --no-install-hooks -d /home/xxx/trace_result -p1088\n", + "# 5. open like in browser: http://127.0.0.1:1088/tracing_examples/trace_viewer.html#/tracing/test_data/application_1647347981137_0221_traceview.json\n", + "\n", + "# if you need to open the traceview from hostip instead of 127.0.0.1, hack catapult :-)" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/tools/sparklog.ipynb b/tools/sparklog.ipynb index 9aa4219ec..753710aca 100644 --- a/tools/sparklog.ipynb +++ b/tools/sparklog.ipynb @@ -556,6 +556,8 @@ " queryid=kwargs.get('queryid',None)\n", " shownops=kwargs.get(\"shownops\",['ArrowRowToColumnarExec','ColumnarToRow','RowToArrowColumnar','ArrowColumnarToRow','Filter','HashAggregate','Project','SortAggregate','SortMergeJoin','window'])\n", " \n", + " desensitization=kwargs.get('desensitization',True)\n", + " \n", " def get_fields(colss):\n", " lvls=0\n", " colns=[]\n", @@ -590,9 +592,10 @@ " funcs[opname+str(len(columns))].extend(colns)\n", " for c in colns:\n", " if \" AS \" in c:\n", + " c=re.sub(\"#\\d+L*\",\"\",c)\n", " colname=re.search(r\" AS (.+)\",c).group(1)\n", " if colname not in columns:\n", - " columns[colname]=prefix+str(len(columns)) \n", + " columns[colname]=prefix\n", " \n", " plans=appals.queryplans.select('real_queryid','physicalPlanDescription').collect() if queryid is None else appals.queryplans.where(f\"real_queryid='{queryid}'\").select(\"physicalPlanDescription\").collect()\n", " \n", @@ -623,22 +626,29 @@ " idv=re.search(\"^\\(\\d+\\)\",l).group(0)\n", " if idv in nodes:\n", " desc=\"\"\n", - " while l!=\"\":\n", + " while l.strip()!=\"\":\n", " desc+=l+\"\\n\"\n", " idx+=1\n", " l=lines[idx]\n", + " desc=re.sub(r\"#\\d+L*\",r\"\",desc)\n", + " desc=re.sub(r\"= [^)]+\",r\"=\",desc)\n", + " desc=re.sub(r\"IN \\([^)]\\)\",r\"IN ()\",desc)\n", + " desc=re.sub(r\"In\\([^)]\\)\",r\"In()\",desc)\n", + " desc=re.sub(r\"EqualTo\\(([^,]+),[^)]+\\)\",r\"EqualTo(\\1,)\",desc)\n", + " ## add all keyword replace here\n", " nodes[idv].append(desc)\n", " tables={}\n", " columns={}\n", " functions={}\n", " for s in nodes.values():\n", - " p=re.search(r\"Scan arrow default\\.([^ ]+)\",s[0])\n", + " p=re.search(r\"Scan arrow [^.]*\\.([^ ]+)\",s[0])\n", " if p:\n", " tn=p.group(1)\n", " if not tn in tables:\n", - " tables[tn]=\"t_\"+str(len(tables))\n", - " s[0]=s[0].replace(tn,tables[tn])\n", - " s[1]=s[1].replace(tn,tables[tn])\n", + " tables[tn]=\"table\"\n", + " if desensitization:\n", + " s[0]=s[0].replace(tn,tables[tn])\n", + " s[1]=s[1].replace(tn,tables[tn])\n", " colsv=[]\n", " schema=[]\n", " for v in s[1].split(\"\\n\"):\n", @@ -652,21 +662,40 @@ " if not ct in columns:\n", " if len(cts)==2:\n", " cts[1]=cts[1]\n", - " columns[ct]=cts[1]+\"_\"+str(len(columns))\n", + " columns[ct]=cts[1]+\"_\"\n", " else:\n", - " columns[ct]=\"c_\"+str(len(columns))\n", + " columns[ct]=\"c_\"\n", + " if v.startswith(\"Location\") and desensitization:\n", + " s[1]=s[1].replace(v+\"\\n\",\"\")\n", + " \n", " get_column_names(s, \"Project\", \"Output\", \"proj_\", columns, functions)\n", " get_column_names(s, \"HashAggregate\", \"Results\", \"shagg_\", columns, functions)\n", " get_column_names(s, \"SortAggregate\", \"Results\", \"stagg_\", columns, functions)\n", - "\n", + " get_column_names(s, \"ColumnarConditionProject\", \"Arguments\", \"cproj_\", columns, functions)\n", + " get_column_names(s, \"ColumnarHashAggregate\", \"Results\", \"cshagg_\", columns, functions)\n", + " get_column_names(s, \"Window\", \"Arguments\", \"window_\", columns, functions)\n", + "\n", + " keys=[]\n", + " ckeys=list(columns.keys())\n", + " for l in range(0,len(ckeys)):\n", + " k1=ckeys[l]\n", + " for k in range(0,len(keys)):\n", + " if keys[k] in k1:\n", + " keys.insert(k,k1)\n", + " break\n", + " else:\n", + " keys.append(k1)\n", " \n", " for s in nodes.values():\n", " s[1]=html.escape(s[1])\n", - " for c,v in columns.items():\n", - " if v.startswith(\"array\") or v.startswith(\"map\") or v.startswith(\"struct\"):\n", - " s[1]=s[1].replace(c,''+html.escape(v)+\"\")\n", - " else:\n", - " s[1]=s[1].replace(c,\"\"+html.escape(v)+\"\")\n", + " if desensitization:\n", + " for c in keys:\n", + " v=columns[c]\n", + " if v.startswith(\"array\") or v.startswith(\"map\") or v.startswith(\"struct\"):\n", + " s[1]=re.sub(c, ''+html.escape(v)+\"\",s[1])\n", + " else:\n", + " s[1]=re.sub(c, \"\"+html.escape(v)+\"\",s[1])\n", + "\n", "\n", " htmls=['''''']\n", " qid=pr+1 if queryid is None else queryid\n", @@ -696,7 +725,7 @@ " colss=cols.group(1)\n", " colns=get_fields(colss)\n", " t=re.sub(\"\\[([^0-9].+)\\]\",\"\",t)\n", - " t+=\"[\"+';'.join(colns)+\"]\"\n", + " t+=\"[\"+';'.join(colns)+\"]\" \n", " if \":\" in t:\n", " lsx.append(re.sub(r'^([^:]+:)',r'\\1',t))\n", " else:\n", @@ -711,11 +740,12 @@ " functions[k]=[l for l in v if \"(\" in l]\n", " for f in functions.values():\n", " for idx in range(0,len(f)):\n", - " for c,v in columns.items():\n", + " for c in keys:\n", + " v=columns[c]\n", " if v.startswith(\"array\") or v.startswith(\"map\") or v.startswith(\"struct\"):\n", - " f[idx]=f[idx].replace(c,''+html.escape(v)+\"\")\n", + " f[idx]=re.sub(c, ''+html.escape(v)+\"\",f[idx])\n", " else:\n", - " f[idx]=f[idx].replace(c,\"\"+html.escape(v)+\"\")\n", + " f[idx]=re.sub(c, \"\"+html.escape(v)+\"\",f[idx])\n", " funchtml=\"
\"\n", " for k,v in functions.items():\n", " if shownops is not None:\n",