From fe33b549f7b2cac58d7da07b76091acd709171d3 Mon Sep 17 00:00:00 2001 From: mcarilli Date: Tue, 15 Sep 2020 10:00:06 -0600 Subject: [PATCH] Python recipe for automatic mixed precision (#1137) * fdsa * Tutorial runs * clarify one scaler per convergence run * adjust sizes, dont run illustrative sections * satisfying ocd * MORE * fdsa * details * rephrase * fix formatting * move script to recipes * hopefully moved to recipes * fdsa * add amp_tutorial to toctree * amp_tutorial -> amp_recipe * looks like backtick highlights dont render in card_description * correct path for amp_recipe.html * arch notes and saving/restoring * formatting * fdsa * Clarify autograd-autocast interaction for custom ops * touchups Co-authored-by: Brian Johnson --- _static/img/thumbnails/cropped/amp.png | Bin 0 -> 14849 bytes advanced_source/dispatcher.rst | 24 ++ recipes_source/recipes/README.txt | 4 + recipes_source/recipes/amp_recipe.py | 325 +++++++++++++++++++++++++ recipes_source/recipes_index.rst | 10 + 5 files changed, 363 insertions(+) create mode 100644 _static/img/thumbnails/cropped/amp.png create mode 100644 recipes_source/recipes/amp_recipe.py diff --git a/_static/img/thumbnails/cropped/amp.png b/_static/img/thumbnails/cropped/amp.png new file mode 100644 index 0000000000000000000000000000000000000000..a6916ce5605e99d5168c7d52aa157f913b8e7526 GIT binary patch literal 14849 zcmc(`bx<9_w=PP8TX1(taCZw%a0{}tad&qoxVyXS#wEDJ#x-o*osA^81s=ci&V5zy z&wJ}uovG^nzSX@}P0vhq_gcMbB2|^Yp`#F^z`($u%gIWq!@zt5{HGwnzPCiSSzf@v z5Z!re09@6LJSZKV9n7t4%_v>H9L*@rJgv-OU_4h3vUQwk+nuG}8cc z96&v_c*4(?6cw(ixRmr7ms3+S#X`cvg%Xx>-=1cmufcEDmn#SJpgW+*+d-ZW>5I|p ztwDf4Vt)SCOu(?OZ@|-m%e@ojmD;XI_owE+-Wb6%69#m?o!>GHL_O}lVxRjJ-v4wU zQ0!>3ZxO{vn%RS1Cu|X%9^HOBhkV6w>lry&dXAn^-4um(nPYA8E%a_2VK~#scmAot z-IMcud1+ljnul8aS;*fSxtAD!A^YyymHK;=hKkal?EZY?dFI0t_P#?v`(^Rl-zAU! 
z_`QE#PpoOTWHhf2CqGsF$zSM(^6hiJ%HZ~rBXPx0lwH5Jd#rUI@pL9KVbwTDdEK)J z3Wl>OOXTG(FxHKz>|EY;39?-evZ?p{yGmX{5s_+i)*Cen$Fk?2c}9t*I`lN&6B)r< zxXvua_t8B`rPJRTk-2l{bZ*^sV)*OyLa5+*v6lZ_W4P{x*h36UfGU4or+?vr__lv= zp@*XO=MIZ$1TR!KtZ?##Z;P60r{`KL4<^8;d*M=a2eBfl|0-l9WBbpwTjY-cgJpS@ zJe)12u3mZiw$?2IN~!fu_Kbwlz&ff_)nEsXR?LXv#De}&lX0h{pQ`ffXq<;B)~+n- zoa@;xK+hf$1&|uWpb~9SNfvQ?VUMiVH2h1mt)XR#hBHHHn$~2hzKjyjlDdtZCM_S( zx;<4(#|Gj}lE$plU_CY25lik91glWj?FQ|R$J*GI3ARD+5q&Tn$u!S1gRGQIeFcB~ z%2d~QYCWN4U;@tNH1#qKc#tex#86 z3g(nSxkheS|C89+%XuGvB}4G|%_}+}U7-B;KaxzBk_&LxKUy0d*I#)t-Bqd_w~?oN zLNr->U?2Z~4qI;qdZWrvx@qH`-~P@Gu;Mh@)Rj4g9v%Ugbhertdd8-J3QckRH9~$Teh?_>^tOa!fLyvz zn%7Kjal)H@rpg7-Dn7N15D%aQ!P9c(u0&Or==R7n5v5Rg=$rs;zvfQtL*FZ1^06VgP}toPAL zv4)PnM@Xf)ExUgsErNoqW~rsPe5~SD7QtL6Y*uF+`jHWHsL+lzOTk>x10ly zHSdM;X3_nu84||Ha#^qZDXbQ%6XWS=t`O~%-6P~4&hi~ctg@4 z{OQx+Z-PG^*^#;^YxmyD5Zp0Tzn-~kCo9a+#x#_5u4zYa*d-gK-k+|F{!3D}=hJB( zdQssu5b-bYmeA%E?0SqEO2IaR^@!LFQoTd<&t?uX{KuRVsGdb+6(A~sK7JlaV)c>n zm~#v2H@n?@1|Hm+zBw|iOUzki7sY41Z<9Q6y|aCT(Xj(Rwe2Lc1Ml2>1oY@GsN<2A z2`A-LZsTo&y_&^$N|PBoEIkl{GIabxf|C?*&(Tv>Bqvr@P&8InZe^y_* zqILo2roNU7O!3H@2#+LqpFNNo%YE*Ck2{$;gX1W2%NkM_hL4T#`X1DIuw@!$;t&5)r%f$EFfK0LGcFV<4v0G3b;rkY?shvM(M$3@>MyH0072Ik3 zT6e=7jKncTiy*rQM#r4J`MGKXyIP%s#U3gHJHPT-xcc;0O9Apq8HQRqk9w)bYEKlZ zpq~{GXS1`fM_FB(FO>Fy_p7jBld(5UuwXfY6m8`Ih1-lUWKswwI1dAF#q?fXSK({I zX)(S_O$vQsujXNfav}}8Xmwe*ljFsL!Kz6h<_-Zyvz8$_LRnZ;jpCA=Dv=K(c)wI9!?lBZQTI2IjzwAz zNbRW~f`^CtM=>YkN6v_m^Ktbu*5K{g=WWAxCoAvMPH0g7&7r~|ERF<@Y02F5Ng4ih z|4D@s4NaCPP^Ks&n`&f3uZA$w8%cITfWPx8O`_6PkMrxQ`j8S+WYAFnkOeAAmhr387U4(83?U&QTb}gB)n<^TFUs(M-H-hn9w9rUa zx8{k-Q?Yr_pV(J1I!gO z+t51cx@|zuG7iRSDwYY+^nv3XVFUqHrRg29;7!H)4y|x+;Wmv?%9PqUE`WW9(p=`> z6@!F#c)tyayV=OIG;Iq6Hf5ND2%wp)&HFoI?@M9^|6k|yAO$dE7itH)-lJl z_*D)?)j;dvA%*_?OXjJ&nsx(d|rf=4d04S>3B$aZPX&=n84qYK+ zJX2Mw~CkSnHOv!zx6d@riz};pNn`uX)R2-KE_c>u?a5fxa>+qdo3d0arU-74pbZ!HI|qfI0e!;PDn2AliUT;9e?}=w5oh8%Ogl)+#wuY zf`3b~I8?EZyePdd$e{Z4XIbR1EjtjYgMg>Lg>^|^_hXHw+_}W 
zaFcZ%Q<2vi1UiZG!Hly!ADLGq8+3QZV-Qb?{456^{^XgdtT>EzJ-PX)(qO((D;CF- z8HhvZ?7`}<;*0Wdp>*R$dSDEvKG~KB zcV$@ppz4r-<40)GQ~e#xZpoRy+qMcDd;`@zw|)FWh~Yv4)HC*p`K{`O2YJunT7-+` zjF@NCKozkRw$V&jy$r*2Tb~1dRMUXoGhi{Y{-TZQ&lTv)3Ey)0o^{QDf@wJt=)xIk zpa}kV_A7UnL3~I2V4at66AVh)xde-v-Br2de4OT!>yTf-F^|4o@Q09_8CJjmavaTS zO}#}3y-vrU)+gRuGlQ;%2y-2VapQ&{nmV)BRkloq# zp}=Ci&97tj^cIUL2Y0=E&ma!_(~M}v>(YR_NG~wRySvkX>|x>lNFqa?__EByDs}sl z7q{dO$zuB5AV#DcpCN;ahY(GYhQ9XdISKF*qm@UJVSK5$UxDk%Hg`>x^a{>w58xuA zeAZiIH_X!GNeg#z4!dIS0!#Z55_~0!Nh-)$(Bobr&<$y}Bo9aty5}>7zH@T7=yLgFCzYg zbNA_BS7PE3gZP(W;8vng5M1Hx6N6I$*TC?;5vL=Y)i{-%%bOwghw98fhx^~+oSBsl zo-^B2zI#cZyUgY#R6b28>B);t{|V<8wP?Fkd6dh^|6on@Lg3`jdG4QTO-b%Rj6A3ipdi*NEy35X+Is z2fM5W`sv<4ZhiX6cSnwTGPcHB>zGppIg|A9C<-9*TXQfOWD%LWyU_h-DfkrXXO(PW zSfX2c5cp@I7$5TF-_rZz^9W-Q@^>*gyWDDx>4d%|tNrAraP(azzdaztEV9i>P!$W@*06nx4+1 zA+tG~&kWhxC{U`GCjF?pl~KJGYhZ3pw)Ik^gxlZUIBrUp5ldG^`RB&psT1f2GwpL( zz!L=gH_!Q5D~Q*j!n66S5{;Hj`63+shdm4WGuST+t81N4^I)pi61~(~t?12Fc-Dp8 z&#Q2o!g$#n2^Us@n<#qw!C!->^A|6RhS{=s?3+!})N?qf)2n0VQlbR&kV3zkgFY280Uz^52L-nuo0UuP6XhRBN0AKo{`_2lXHcT9ty5C|k`n!| zc)}PqUEqXr)M}1R{A@I0un-YWkh)hY9%DFAEhQS?i)LuA(qA}^r8}c%NA|fw8MQDr zJBu`e+^Ns0$g;UbQb%d^dm#v?!XCF!@ru)c>tM{V^c42?`-Q8GHGvcfMMg}*hp!2K zT1?eyMf8WeL}7aTgZ46_Vsd|iA!++-(dRA*THJK!F?}W)Ldm6J2}rU)SFE5SY`xP2 zhl}aZv23_6<(GQ3VP~Y>a?TF8X`juE&eO}Jm5x0xh zj)$g_CnpW?RR|xtTAFi$a9C(Od{*@0^DKX4DC4W!^nWQ#2|hzY^7a}BdQBO&E}umC z&o2M~MO z(=RREwDA1iNR(KRDA|w8*x%0D81=NFFUZo6vrrM5pQnbKl8tb3eCkCqw(pSZ9oaP2i8X2+23uax_=>vJ5=C&mMi5%mVtJmWx(rr99rIb8}okxxibJ?aOLuOZseT88DQpF*NZC z12NVjAAhIJuUs-Yh*Z8^BaetvJIAl$(Nm{rokki?4hzOm^RvK=Z8dd|H8HD0#_V@K znF00$8J7Mm0DQ)}iSDD9I#JVnV@9;5_zCut12Py9lAGh~U@yCt9+PP@X*!Wzf0 z2*snWnAUlBEZNXyv3*@gRYqPWF;XwEjCE5y#*$UUR(zq4G``S+krz8F4aaC9Q1_dn zMU8$;XSVkT!a_qs5nI~XLpE3U^7D`1ED6g3KLrQm1qZ)rQ_d%?sw$b#YcOcUy~;d8 z89oV;t;+>4uouJbb5jl>>_iJ}isdhU!8>mrX(UcHQ%s^h=aXYC%2h!q>r+w~9{;MD znhVKlVePDZc)VHJW#|dk=sTpEWNHrPS@oNzAB$NUsfP&H+eegChUTb8M-vzc6FTqm z_D2WMQD1fkeZtUKv|loZg3Z-9QPl`0CvPg94m2prEuP9O 
zCZ9T4zWw~I=qOZXt`X5LdwL$p^2Z+_T!K09MZ#e0EtsSu22xX-#r2o2JGd~D9B}hp zrM(}py0)l1!T9yGUA9U(5Y8Fc2uq)oyc*Wp`VkvtmuN^qg}dVOmvcE6-XhPwHE2<@ zLa;8?UhGx)ZZk=D3x;%MA|~ycn$@+_yth2#4cWq>=$^Objcp`69AlUTrLTfJZdIQ3 zQj-|^Mp*8j`AU5sRvayLtYAr$2smpm!OBddV3@r~1h9uEx|s4J)c$ssfw^d&hK)Ljbg@0Fr zbA~8;>xac1db6j`&8B)jgZ5g~>J99!d{|VbTF+i1MFJ1L>xl8Y zo--IGF}yu4+O}=kGW+s)bcnz7Z=B4biELtSNK~UwIrPD-QouVkXAAg2R$;#Qw|uZO zh$2k5zt+!?`uxS-{pJStu%jI()O3=Nm?b@Qf$=A_$nZtXb7F@Bp=Q&}hls!4Bm-&p z6K%x|D~dyH)t;292GjHh+Twt!)`ZG~yq2$EwWNHG8@YH0WmWUvkO5{O{Re!TT_=P1 z1JXtqjujtkw?UU%kxc} z+r7<8z(QWCjnosrbI$duQa}0y3K5@Rc0qn?K{jO2CEZ6W*Mh3tTM+lt{fDpak1er7 zSR#D=o`E;MBK09XEWoWGDYe8f^~KgnUlwn_H!2C#L$+1DIN)N;C6#Z-lEE(&QHnLL zkM|rQq9lDD_xGE_0i>Z@i2eWK5&!C8t}ppWGneOGJNN}mmEyNKy-F{!A``?TExxG) zi>eM*1T0PTEt}v(DMpy0j}!d7J2Nf9sSGysj+wwewu&(E){eU2^Y{Da zKR#@l5rNQR0(cmh4{=12~QB`rZFhBTK+okEHp8vr*YCXOH$SlO97KgOa||BiV=-W zwQ^J_?t6Uu5?pIQo32=kjqA)@@DV);iufG7wE=t_LuO;ZMiew4Ju-nqI{z4(wdYGA zq`U`*^c7ayd%~a+mMgzx2?)GFI$lFyt<}luLe#y0fmI1Jm9NDu89TV{dnWSxj(^+> z-?7xSnf1)?2Q&X$^9q5VA5I1}rTgOg!qLa!Ezj!i9+o@GIWBPj6VLniKvGrIkbZl6 zyLvbR@1}RG2Q)4_W=@C&bYr@=!+3Wh)j})>x8qN4mM-qL5=Pi#hUkB-dLCTO{648G zoKZ~~e&oW8Rf59Fznq7Z(a4=`2!AUFNbke1#hqPt}i$?9TvY`xpxB6*Vne?8e z{3YDt@D3l06~0NqM8K2Dz6;2XvOip4U_K50r+ffARG7XuBD%^cNh9tfBjFOjq$Bet zz`&56%SnlAc&?u31~{zBuZCfvmK-7JD>GPM*(b@nVjN&lj$nVLnj6s~`B~&1`O&oZ z?c0w(Gns#v3F~o^bhYE+P{N3Oe4q0y@`%nPKD^)Jf8yE}S$NX!ey!9M_{ef_MR6OT znK+iF@#YreuI{M&M$%Anbg3fyrMoo?=}W0js@jB>uLRJlqB!LomesTg>&&|RHG$n< zl23NBhXgMrwD#(!7}E#WmT-%fS&YA$tWNmjQ9Bqz4Fu(~vJ{3?u_6NBq+CPVUBe<| z2Mv%+99Bmou#8w~Bu%A4Qc?h>AHRv>xXF1x);bkMGhuNW6FbH@tWKpDBX0SwN>= z-D;v9c{Grxo0ISD!S#j#M!C<@AcxJv^^WTwd37t1HKjn|%{MEra7X2@=@SM$Lt849 za{~So1B~#}+2jVmf zqwkxXX~MI#Tlu9u$vpbxw95JFOVIti44AP?6U>MJWi#V9g#l@f6~KJo?~j@i*t${&A-fb4P=LLTT_+BXH(C1Ht^1Gkf(l=T7HIC((Xu*|fnaq+ z?H%&~LsCnAF z{rX6Qa_XFXR8p`~wTge@uauxLzUX|KTMvlR=4!WEBW~Im;0W(A3FV7Um6@Hjj0@a> zs&IKtkUn4-4HncLqWR#ffrBtk!F0kqG7NZA#D-YejwwkO&*N%eyG*9C(y`0GkEg@# 
z5g966TA#HlSIa}kA=e8BX5g_h{!Cq{&*$g)C)-je&XL(SVT1UEOZ|%gE_5sRGMm{GaPr9Xg}t(_fx6Mw&mr z2Y_9J^;)mDHKG zVoJ?d0Sw1_F_-Jqa;LJcFDaB)Af50}rNn6gEBvGD?>}Ycpv<^!k&?! zGc+sAQC7V6Ldbeivb?pX-yx}Nga*n*w$r8b)!pALQ=3fMpC1&++%#N6dG=2-&Pf&c z3@f`*>O+>4kzCTu_g>>l)7EiKf5%*u)>>4{`5F1$H*EV4H1drA$`dg%)VV8bH*h#V ztszhJx-#81^-f+VVej?i)HCk>;d9#K<$z=bNRcv??VG!9XwkXX%3d|L-kh$GF;y~w zkTQ`tva}_j$>vR`_*z>HOgF5(jSL6VlLQ2i#I|WJr@hv=+jv^AAa;?*)J{r^SAn6? zXv4Bu9w}LYNoJNNGx~v$wCL{Fw9uAZlf{(jOWQ54K~RVpiUY0vI`4?QD|6v~m=N^~ zwmK`H#_;c+R_*lG6_60T9}XGx<+zlTiE89Y;d}GCP%Bf`s1c&k`=EU8W5*{@tFvf9 z*3L>k{8|qqOG9{Y7q?5lC*Pj6y-tQntFhQXWrt^wP?a2wW`X|P(9c~?triBJ>;v+m zjK9-Kg~o2@JgP^ZS{#i3YV)dSun4f0e;#Z@!BIjAlCkWlzUfr4n7H zy4NG+)0B;sx_Iya=2bR#>9uiODm$a_94;Bcr9Gjkd}zx(mKNQ-0K^GoYz+6>pW*_Y ziZIC~PcTV6P&4A`Kq)aUYkV4NNaDQAvb7w( ze3+28tmKEdb3nXI=Oz)&G9L8Fs*^qW%aVdcg($(+w&PvDiYIicdYrRDrNGu@cxijF z6hnRrak^V7F}DI%J#|HgkZeQ|M&UG3)krwW)NMzUhB!@SN7AWNgz|%`R!)VLfd<|0 zPZew>plPi<=_>#tC!;IINZc(t7aF~de9=j@E`Y(PRfUF`XLQv={3IF)y0)bJ8A><- zg}b>gt*k5(isaYY8zzNnP-LSf5AKZ4yoj1nef!R;kRZcB8p*y_g+DPkSsfQ)$FSHMO+^P{#jqTcrOkWwmFAIHGd$ zp-fb_v6vdgGu~BWYpLnIrpUz{Q7>p$1A z1B@D?L(@V%C`Otk`oj7Ex;4SH3w!*s8}UwWXLs{eXh4LEAmRbQ|A;_d{O^^_WaCUD z->^#++F2a;_evm`@#U6ENUyuVBsD42jPI8_vr@emb?7A&a=ku#VtKYV7~KHzc5+D; z4_v@rzX6=)DF=BDJRW3c9gtE$_X&>x)&EY0G;bI~4)oOz5d~tRbW-bMO=?zE}*1EbD4-Ot_fu4`f0QS60S;a0}W0>g65ZjkLp4SrYxq{ENmTp1~b*8F2fNvkAbe zAmgGBv}0MEnb{u>>dmV5R^cve|6vQV>w>J@FBx>AKz%o;$4M;cta!c2*KpM{bCct< zdX3xuf^t+FUq;l9)?J2@TN_t5Z2l#8EG&@{C~YB=RLgv2>J6#a#b`lg6Q6D&??yV1 z?->SfEEQ`_+wq~iZ#v$=l%H2H)ZhjN8ZhzWLhBuwM?aFV>;U-R1(W~4_WuTrga7}` z{{K{~Od|m`|kMr%1C-o z&P13{RLRnZG43u0$FtqA=Pb<%Yu`x1#MkEKayUU)eII1`+8BZ?Y}*Ds9b+8T zOSVM|Qdn>j#eRqpY~25j`KgS-IgT-<`_9dKgUn{}7n}IGdErFd^{&Au#Vw=sSTi`tTp} zuJj-BU)n|={^!j+>l8_!^6J;MakAj=hL5Emm7Dr{GKh_0t?)$ga4`in-OA5u%_{}TA_}9T5AH)xxSm= zW8v>(nTc>HApI}SK^vmW{JyBMIEv&LY_8vk%}h;|a-7p7vk<8S6y)MEqc^v}`^7Md zh?>908zBFpv>L!4E^zZB^>WBQw{zUIrTa253D`c_5Me@ErJja&EyqiXs0qD z)RE62cjL~1#(wk0zhbTI65MdH)X6NWb*dWD;-m7=^v5?2kBB!WC_UJwC7iq~pK!wK 
zr4b7cvfm=8MVmIZ)rRQ@6F-b?tR|$;BQ8OR@z>tmDRjMQpTEo9xg=>quATCksf{s@CC?A}+_%ik z6VB=)Bc~LQf#)6`#{t#}GhE-_Y=;+GIhn+hmluZ+$>xql`^iH+8`|?V;uW&Nta}gU z&*(=d66us=(Z$>C1%@v4`lb_dAcOTG_s}hzM#_;U0SI@SepQ%q#B~2OLCtL&Z!DczW-a3y!;6A`_1TCW|a|MllCN3+wu~4F6 zW~zNn^xr!4Q3|jZVAR+UU$%~T#5$5HlMNMZF1ggTt@?)F=NW8*XJ4A(x%fk#?HqBK zVL-{R9UVwx^>icHNoccrF6g&jw3e%a&PQUUtx4u7;on8F_OCL#pCr>b;@=?9@87d* z&(l+K&w|;^mw&egqNnsYMWKQ$&9vi zKA5iG4tE$s6FVjD;dH8HbSR{4oOhb@KhNZ^CO_uV=llf;kuZ%kqoq-g6_<*+a-GmS zaFXE(cCQD}5X4Y|poz2rOY*NIKuDhrjn6F30QDLLUKU4nZAY}e1TN2%A-{Xx2tEz9R6W5l!%#Az4_hj_l zXNhWg?-7-SMB79n{n%2^&nLPo^Q$+Q;@t=u^xQNo>5n|=iIO?gL@P^3M*d=H4A^d4 zUYfo??de3}SS%45V5tg#*PPICJ*Btv9i7%4AiA3~ayEvK2L-&U0sdulb8@nSn&S>i zx=;tQ$mncazxA(t5)uVdK9Pj-5Oij0+&*Bh;dZfTnh1LEzYlHEH;_NA;fcGVy(e;xWQj$x}k=M8ujlIxAdWiWz8Hd zubJq$4SeOh(=Owfj&-2-;FnmLlRee@inW)npXJ@^2x@3VJcWiRShv8si<6;dUtA4L zY&Ho&=&OpPx&QsncRj*!p^s)ak_UIS5#Q{pYc}Z*IC9v48;gv8@b858+7)?EN_MCf zCZ)?(Sws^~XvnJ>7(nkGPcBp2tPX!uC{=$|h;iG9uG%YqA-i@inKHG@V(J2%rXIwl z+`S2W>mZznkRhzyWA3US)yVctVe~doa4v)pe^*H|QR43R2{IATE9E}$^ztwgg#A0J zxf6qM9&}S3K1piR*5B}yRm3)&SYmZNtrsu0=mCF|@#H-mya&e-KLJ9H5P47C1c ziJLdpaKY_@T2|*EgzNohN&~ed2><=mwvw)sP{AGav$8wwAaQpDPM(zmLfhsl(Xm4p z_Nwu0s-Cfq-s;M{#78^PxZxFc@Y*4N0^kzLQtrJIVS!>_aykRL>f^;kD?kn=3dQ-YPr!sF1FPQD)WCV z*Zg;lBt?XTw0vWDaLv72;@oKRE05fm^~i1Z`G&;C_)bWfoLoJsQJ|b>dG9Y7Va~S0 z=g+-*-c;NV@n!~X>n9~*otZFUtpE!!XDT)uF4*}fFZa96S7D!+gQz3UsxX^?j8!&T zL9Jb&h#QuqsQbB=ecUbO)N*L%&mMDAvrn!1Y!{OsYy6DAi#WxZT@af{_tXeNa&031 z4U{3zGLzYqfJSg8dXU|HDvy7Rbz$&!%!uW+wx+L?)T{GDG2J9TSx^|AtKHA_YMG|_B_Z9B%0h^Vylz-g8-G?<iwdBa;A+dcuD-ULIv4d zn#7J$X6s}8D>u+j;Ygzj?K=l5D*s;cKX*s4Hb*{MvHwtI4O9|xe)Ff<;7yR3fI*r%;Rb!ENk41Aq$raSQXTq$5@BE9>%xCG= z%OI1!A&gK9<4u z9-BOn0BC;fRFeltUz|M519kUNCu4x$ifH+?`?k{tz;GFUN2u&t(}WgkkbN5I#7q~H zaQV_KO+K+=Dr*)>6-X84`)|Lp3Pd_F!~NElim&ox#L*aFd^&aSv}eqjDLnp&-LL;m zMQ^WH!Q=AZUbFpvhWaO;yB=L^vWc~R$c@?=4a%>PBEH71-_siGeAm0X0}eVq-uV)M ze=TGN=rVkw&+fCMa<@bLY0#RxR30Ee+hZ*bg<^wE$o|n%9>;uh(U&1kZ|}O~hk%x& 
z+7t>$d-hKjwq?{>4Y1`PNrc>wQ#n%&T2F(QK3YUD@aZwQK<_;0#Uw}WWq5U!bbxIb zF0|g0vXH#n#H+A|MVEffbn~AAMAO5FTKB9>x07Ua#>E}9{SdT$WQ^P7FaA2bQ@ZsS ziq5CrW9F|-EBJ}{sq9`i4v$m}^FkMDo|PgVZ2;=#ILsPRdKp~yP+M&wi)Pc}aUFo`oCD2lUh?OThV@xh7!Ohlcru2c_WeKkQ}sm{82Ao^Wi zLTLeL?yk;*0_n`~#O|Iv6VgXrH$0KWubLi|2fWYK%ZuJ;)6^}xXaUieGi6~^1UGAqteu1P zpL}@90O=$95{%<#_jKc1-b-J+_#r@GQ*?u;L|Q!8k%ez^p|kV1V6R`#&m$z~r5P|z zCK;92)qVcG)ekzhBJmq|#9Y3>H(n>1+Cgr(F2MApD-R=GuO$q)c;TyJx=8omyE zKL}uO&b`tAPoAS#Am6$>F_YT3$d9L8pt~QGVuW4F1NgP>;tnP&VmF?5HWb0Hxb*L* z2j{~MN)3Bzm$^Sp%q{@E0n+jtMh&p<9n$!Hx?_Atov#}{0o#D~{RV_z{dzYe@!j`G zz;UMdmS#|;S*-UZE-BO7q);5Z@G$v*$4>76^S?#MBM@nSnjG|hlrtSbVi-f4mHNV* zkTgk?H42ByFy=ZZ^@UJY?(n0M=>R4JLky-gelsB+w&?+uaUoMGa@A-ct3=fP<_RiS zpOh?CZ34W>fG)F5Bs)8E5lC`BYF5wY$0Wf87`9{Hq;bfI?NtWAk;WrK>f>lc1L8Ns zGCxlokkCRN?~tZEX3XDA;J2asa%>&k2#i7ZAjF89Ta+=wtG!Sgcc1*RkRT6iz?erM zi#Px9z9+L>pNb-V!HaH>tZH?xfo$|quge=-iRjg6gx0odq@BND7&`%xOaCWwgfK8& b?yrbNC+vqew=cv08G)R%vQ({vVbK2r7|~Lg literal 0 HcmV?d00001 diff --git a/advanced_source/dispatcher.rst b/advanced_source/dispatcher.rst index 23ba0f96be1..4f3b52fea32 100644 --- a/advanced_source/dispatcher.rst +++ b/advanced_source/dispatcher.rst @@ -105,6 +105,8 @@ speaking, the structure of your registrations will look like this: that provides implementations for all basic operators on the XLA dispatch key. +.. _autograd-support: + Adding autograd support ----------------------- @@ -299,6 +301,28 @@ the safest choice for the execution type: at::autocast::cached_cast(exec_type, t1)); } +If your custom op is :ref:`autograd-enabled`, you only need to write and register +an autocast wrapper for the same name onto which the autograd wrapper is registered. +For example, if you wanted an autocast wrapper for the ``myadd`` function shown +in the autograd section, all you'd need is + +.. 
code-block:: cpp + + Tensor myadd_autocast(const Tensor& self, const Tensor& other) { + c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::Autocast); + return myadd(at::autocast::cached_cast(<exec_type>, self), + at::autocast::cached_cast(<exec_type>, other)); + } + + TORCH_LIBRARY_IMPL(myops, Autocast, m) { + m.impl("myadd", myadd_autocast); + } + +There are no separate gymnastics to make the backward method autocast compatible. +However, the backward method defined in your custom autograd function will run in the same +dtype as autocast sets for the forward method, so you should choose a ``<exec_type>`` +suitable for both your forward and backward methods. + Batched ^^^^^^^ diff --git a/recipes_source/recipes/README.txt b/recipes_source/recipes/README.txt index f93ee92c2c6..a182b0a11c5 100644 --- a/recipes_source/recipes/README.txt +++ b/recipes_source/recipes/README.txt @@ -56,3 +56,7 @@ PyTorch Recipes 14. mobile_perf.py PyTorch Mobile Performance Recipes https://pytorch.org/tutorials/recipes/mobile_perf.html + +15. amp_recipe.py + Automatic Mixed Precision + https://pytorch.org/tutorials/recipes/amp_recipe.html diff --git a/recipes_source/recipes/amp_recipe.py b/recipes_source/recipes/amp_recipe.py new file mode 100644 index 00000000000..c1ec52a3883 --- /dev/null +++ b/recipes_source/recipes/amp_recipe.py @@ -0,0 +1,325 @@ +# -*- coding: utf-8 -*- +""" +Automatic Mixed Precision +************************* +**Author**: `Michael Carilli <https://github.com/mcarilli>`_ + +`torch.cuda.amp <https://pytorch.org/docs/stable/amp.html>`_ provides convenience methods for mixed precision, +where some operations use the ``torch.float32`` (``float``) datatype and other operations +use ``torch.float16`` (``half``). Some ops, like linear layers and convolutions, +are much faster in ``float16``. Other ops, like reductions, often require the dynamic +range of ``float32``. Mixed precision tries to match each op to its appropriate datatype, +which can reduce your network's runtime and memory footprint. 
+ +Ordinarily, "automatic mixed precision training" uses `torch.cuda.amp.autocast <https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.autocast>`_ and +`torch.cuda.amp.GradScaler <https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler>`_ together. + +This recipe measures the performance of a simple network in default precision, +then walks through adding ``autocast`` and ``GradScaler`` to run the same network in +mixed precision with improved performance. + +You may download and run this recipe as a standalone Python script. +The only requirements are PyTorch 1.6+ and a CUDA-capable GPU. + +Mixed precision primarily benefits Tensor Core-enabled architectures (Volta, Turing, Ampere). +This recipe should show significant (2-3X) speedup on those architectures. +On earlier architectures (Kepler, Maxwell, Pascal), you may observe a modest speedup. +Run ``nvidia-smi`` to display your GPU's architecture. +""" + +import torch, time, gc + +# Timing utilities +start_time = None + +def start_timer(): + global start_time + gc.collect() + torch.cuda.empty_cache() + torch.cuda.reset_max_memory_allocated() + torch.cuda.synchronize() + start_time = time.time() + +def end_timer_and_print(local_msg): + torch.cuda.synchronize() + end_time = time.time() + print("\n" + local_msg) + print("Total execution time = {:.3f} sec".format(end_time - start_time)) + print("Max memory used by tensors = {} bytes".format(torch.cuda.max_memory_allocated())) + +########################################################## +# A simple network +# ---------------- +# The following sequence of linear layers and ReLUs should show a speedup with mixed precision. 
+ +def make_model(in_size, out_size, num_layers): + layers = [] + for _ in range(num_layers - 1): + layers.append(torch.nn.Linear(in_size, in_size)) + layers.append(torch.nn.ReLU()) + layers.append(torch.nn.Linear(in_size, out_size)) + return torch.nn.Sequential(*tuple(layers)).cuda() + +########################################################## +# ``batch_size``, ``in_size``, ``out_size``, and ``num_layers`` are chosen to be large enough to saturate the GPU with work. +# Typically, mixed precision provides the greatest speedup when the GPU is saturated. +# Small networks may be CPU bound, in which case mixed precision won't improve performance. +# Sizes are also chosen such that linear layers' participating dimensions are multiples of 8, +# to permit Tensor Core usage on Tensor Core-capable GPUs (see :ref:`Troubleshooting<troubleshooting>` below). +# +# Exercise: Vary participating sizes and see how the mixed precision speedup changes. + +batch_size = 512 # Try, for example, 128, 256, 513. +in_size = 4096 +out_size = 4096 +num_layers = 3 +num_batches = 50 +epochs = 3 + +# Creates data in default precision. +# The same data is used for both default and mixed precision trials below. +# You don't need to manually change inputs' dtype when enabling mixed precision. 
+data = [torch.randn(batch_size, in_size, device="cuda") for _ in range(num_batches)] +targets = [torch.randn(batch_size, out_size, device="cuda") for _ in range(num_batches)] + +loss_fn = torch.nn.MSELoss().cuda() + +########################################################## +# Default Precision +# ----------------- +# Without ``torch.cuda.amp``, the following simple network executes all ops in default precision (``torch.float32``): + +net = make_model(in_size, out_size, num_layers) +opt = torch.optim.SGD(net.parameters(), lr=0.001) + +start_timer() +for epoch in range(epochs): + for input, target in zip(data, targets): + output = net(input) + loss = loss_fn(output, target) + loss.backward() + opt.step() + opt.zero_grad() # set_to_none=True here can modestly improve performance +end_timer_and_print("Default precision:") + +########################################################## +# Adding autocast +# --------------- +# Instances of `torch.cuda.amp.autocast <https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.autocast>`_ +# serve as context managers that allow regions of your script to run in mixed precision. +# +# In these regions, CUDA ops run in a dtype chosen by autocast +# to improve performance while maintaining accuracy. +# See the `Autocast Op Reference <https://pytorch.org/docs/stable/amp.html#autocast-op-reference>`_ +# for details on what precision autocast chooses for each op, and under what circumstances. + +for epoch in range(0): # 0 epochs, this section is for illustration only + for input, target in zip(data, targets): + # Runs the forward pass under autocast. + with torch.cuda.amp.autocast(): + output = net(input) + # output is float16 because linear layers autocast to float16. + assert output.dtype is torch.float16 + + loss = loss_fn(output, target) + # loss is float32 because mse_loss layers autocast to float32. + assert loss.dtype is torch.float32 + + # Exits autocast before backward(). + # Backward passes under autocast are not recommended. + # Backward ops run in the same dtype autocast chose for corresponding forward ops. 
+ loss.backward() + opt.step() + opt.zero_grad() # set_to_none=True here can modestly improve performance + +########################################################## +# Adding GradScaler +# ----------------- +# `Gradient scaling <https://pytorch.org/docs/stable/amp.html#gradient-scaling>`_ +# helps prevent gradients with small magnitudes from flushing to zero +# ("underflowing") when training with mixed precision. +# +# `torch.cuda.amp.GradScaler <https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler>`_ +# performs the steps of gradient scaling conveniently. + +# Constructs scaler once, at the beginning of the convergence run, using default args. +# If your network fails to converge with default GradScaler args, please file an issue. +# The same GradScaler instance should be used for the entire convergence run. +# If you perform multiple convergence runs in the same script, each run should use +# a dedicated fresh GradScaler instance. GradScaler instances are lightweight. +scaler = torch.cuda.amp.GradScaler() + +for epoch in range(0): # 0 epochs, this section is for illustration only + for input, target in zip(data, targets): + with torch.cuda.amp.autocast(): + output = net(input) + loss = loss_fn(output, target) + + # Scales loss. Calls backward() on scaled loss to create scaled gradients. + scaler.scale(loss).backward() + + # scaler.step() first unscales the gradients of the optimizer's assigned params. + # If these gradients do not contain infs or NaNs, optimizer.step() is then called, + # otherwise, optimizer.step() is skipped. + scaler.step(opt) + + # Updates the scale for next iteration. + scaler.update() + + opt.zero_grad() # set_to_none=True here can modestly improve performance + +########################################################## +# All together: "Automatic Mixed Precision" +# ------------------------------------------ +# (The following also demonstrates ``enabled``, an optional convenience argument to ``autocast`` and ``GradScaler``. +# If False, ``autocast`` and ``GradScaler``\ 's calls become no-ops. 
+# This allows switching between default precision and mixed precision without if/else statements.) + +use_amp = True + +net = make_model(in_size, out_size, num_layers) +opt = torch.optim.SGD(net.parameters(), lr=0.001) +scaler = torch.cuda.amp.GradScaler(enabled=use_amp) + +start_timer() +for epoch in range(epochs): + for input, target in zip(data, targets): + with torch.cuda.amp.autocast(enabled=use_amp): + output = net(input) + loss = loss_fn(output, target) + scaler.scale(loss).backward() + scaler.step(opt) + scaler.update() + opt.zero_grad() # set_to_none=True here can modestly improve performance +end_timer_and_print("Mixed precision:") + +########################################################## +# Inspecting/modifying gradients (e.g., clipping) +# -------------------------------------------------------- +# All gradients produced by ``scaler.scale(loss).backward()`` are scaled. If you wish to modify or inspect +# the parameters' ``.grad`` attributes between ``backward()`` and ``scaler.step(optimizer)``, you should +# unscale them first using `scaler.unscale_(optimizer) <https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler.unscale_>`_. + +for epoch in range(0): # 0 epochs, this section is for illustration only + for input, target in zip(data, targets): + with torch.cuda.amp.autocast(): + output = net(input) + loss = loss_fn(output, target) + scaler.scale(loss).backward() + + # Unscales the gradients of optimizer's assigned params in-place + scaler.unscale_(opt) + + # Since the gradients of optimizer's assigned params are now unscaled, clips as usual. + # You may use the same value for max_norm here as you would without gradient scaling. 
+ torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=0.1) + + scaler.step(opt) + scaler.update() + opt.zero_grad() # set_to_none=True here can modestly improve performance + +########################################################## +# Saving/Resuming +# ---------------- +# To save/resume Amp-enabled runs with bitwise accuracy, use +# `scaler.state_dict <https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler.state_dict>`_ and +# `scaler.load_state_dict <https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler.load_state_dict>`_. +# +# When saving, save the scaler state dict alongside the usual model and optimizer state dicts. +# Do this either at the beginning of an iteration before any forward passes, or at the end of +# an iteration after ``scaler.update()``. + +checkpoint = {"model": net.state_dict(), + "optimizer": opt.state_dict(), + "scaler": scaler.state_dict()} +# Write checkpoint as desired, e.g., +# torch.save(checkpoint, "filename") + +########################################################## +# When resuming, load the scaler state dict alongside the model and optimizer state dicts. + +# Read checkpoint as desired, e.g., +# dev = torch.cuda.current_device() +# checkpoint = torch.load("filename", +# map_location = lambda storage, loc: storage.cuda(dev)) +net.load_state_dict(checkpoint["model"]) +opt.load_state_dict(checkpoint["optimizer"]) +scaler.load_state_dict(checkpoint["scaler"]) + +########################################################## +# If a checkpoint was created from a run *without* Amp, and you want to resume training *with* Amp, +# load model and optimizer states from the checkpoint as usual. The checkpoint won't contain a saved scaler state, so +# use a fresh instance of ``GradScaler``. +# +# If a checkpoint was created from a run *with* Amp and you want to resume training *without* Amp, +# load model and optimizer states from the checkpoint as usual, and ignore the saved scaler state. 
+ +########################################################## +# Inference/Evaluation +# -------------------- +# ``autocast`` may be used by itself to wrap inference or evaluation forward passes. ``GradScaler`` is not necessary. + +########################################################## +# .. _advanced-topics: +# +# Advanced topics +# --------------- +# See the `Automatic Mixed Precision Examples <https://pytorch.org/docs/stable/notes/amp_examples.html>`_ for advanced use cases including: +# +# * Gradient accumulation +# * Gradient penalty/double backward +# * Networks with multiple models, optimizers, or losses +# * Multiple GPUs (``torch.nn.DataParallel`` or ``torch.nn.parallel.DistributedDataParallel``) +# * Custom autograd functions (subclasses of ``torch.autograd.Function``) +# +# If you perform multiple convergence runs in the same script, each run should use +# a dedicated fresh GradScaler instance. GradScaler instances are lightweight. +# +# If you're registering a custom C++ op with the dispatcher, see the +# `autocast section <https://pytorch.org/tutorials/advanced/dispatcher.html#autocast>`_ +# of the dispatcher tutorial. + +########################################################## +# .. _troubleshooting: +# +# Troubleshooting +# --------------- +# Speedup with Amp is minor +# ~~~~~~~~~~~~~~~~~~~~~~~~~ +# 1. Your network may fail to saturate the GPU(s) with work, and is therefore CPU bound. Amp's effect on GPU performance +# won't matter. +# +# * A rough rule of thumb to saturate the GPU is to increase batch and/or network size(s) +# as much as you can without running OOM. +# * Try to avoid excessive CPU-GPU synchronization (``.item()`` calls, or printing values from CUDA tensors). +# * Try to avoid sequences of many small CUDA ops (coalesce these into a few large CUDA ops if you can). +# 2. Your network may be GPU compute bound (lots of matmuls/convolutions) but your GPU does not have Tensor Cores. +# In this case a reduced speedup is expected. +# 3. Matmul dimensions are not Tensor Core-friendly. Make sure matmuls' participating sizes are multiples of 8. 
+# (For NLP models with encoders/decoders, this can be subtle. Also, convolutions used to have similar size constraints +# for Tensor Core use, but for CuDNN versions 7.3 and later, no such constraints exist. See +# `here <https://docs.nvidia.com/deeplearning/performance/index.html#optimizing-performance>`_ for guidance.) +# +# Loss is inf/NaN +# ~~~~~~~~~~~~~~~ +# First, check if your network fits an :ref:`advanced use case<advanced-topics>`. +# See also `Prefer binary_cross_entropy_with_logits over binary_cross_entropy <https://pytorch.org/docs/stable/amp.html#prefer-binary-cross-entropy-with-logits-over-binary-cross-entropy>`_. +# +# If you're confident your Amp usage is correct, you may need to file an issue, but before doing so, it's helpful to gather the following information: +# +# 1. Disable ``autocast`` or ``GradScaler`` individually (by passing ``enabled=False`` to their constructor) and see if infs/NaNs persist. +# 2. If you suspect part of your network (e.g., a complicated loss function) overflows, run that forward region in ``float32`` +# and see if infs/NaNs persist. +# `The autocast docstring <https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.autocast>`_'s last code snippet +# shows forcing a subregion to run in ``float32`` (by locally disabling autocast and casting the subregion's inputs). +# +# Type mismatch error (may manifest as CUDNN_STATUS_BAD_PARAM) +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# Autocast tries to cover all ops that benefit from or require casting. +# `Ops that receive explicit coverage <https://pytorch.org/docs/stable/amp.html#autocast-op-reference>`_ +# are chosen based on numerical properties, but also on experience. +# If you see a type mismatch error in an autocast-enabled forward region or a backward pass following that region, +# it's possible autocast missed an op. +# +# Please file an issue with the error backtrace. ``export TORCH_SHOW_CPP_STACKTRACES=1`` before running your script to provide +# fine-grained information on which backend op is failing. 
diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst index 86438135e1d..f8986363092 100644 --- a/recipes_source/recipes_index.rst +++ b/recipes_source/recipes_index.rst @@ -167,6 +167,15 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu :link: ../recipes/android_native_app_with_custom_op.html :tags: Mobile +.. Automatic Mixed Precision + +.. customcarditem:: + :header: Automatic Mixed Precision + :card_description: Use torch.cuda.amp to reduce runtime and save memory on NVIDIA GPUs. + :image: ../_static/img/thumbnails/cropped/amp.png + :link: ../recipes/recipes/amp_recipe.html + :tags: Model-Optimization + .. End of tutorial card section .. raw:: html @@ -199,6 +208,7 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu /recipes/recipes/Captum_Recipe /recipes/recipes/tensorboard_with_pytorch /recipes/recipes/dynamic_quantization + /recipes/recipes/amp_recipe /recipes/torchscript_inference /recipes/deployment_with_flask /recipes/distributed_rpc_profiling