From fe33b549f7b2cac58d7da07b76091acd709171d3 Mon Sep 17 00:00:00 2001
From: mcarilli <mcarilli@gmail.com>
Date: Tue, 15 Sep 2020 10:00:06 -0600
Subject: [PATCH] Python recipe for automatic mixed precision (#1137)

* fdsa

* Tutorial runs

* clarify one scaler per convergence run

* adjust sizes, dont run illustrative sections

* satisfying ocd

* MORE

* fdsa

* details

* rephrase

* fix formatting

* move script to recipes

* hopefully moved to recipes

* fdsa

* add amp_tutorial to toctree

* amp_tutorial -> amp_recipe

* looks like backtick highlights dont render in card_description

* correct path for amp_recipe.html

* arch notes and saving/restoring

* formatting

* fdsa

* Clarify autograd-autocast interaction for custom ops

* touchups

Co-authored-by: Brian Johnson <brianjo@fb.com>
---
 _static/img/thumbnails/cropped/amp.png | Bin 0 -> 14849 bytes
 advanced_source/dispatcher.rst         |  24 ++
 recipes_source/recipes/README.txt      |   4 +
 recipes_source/recipes/amp_recipe.py   | 325 +++++++++++++++++++++++++
 recipes_source/recipes_index.rst       |  10 +
 5 files changed, 363 insertions(+)
 create mode 100644 _static/img/thumbnails/cropped/amp.png
 create mode 100644 recipes_source/recipes/amp_recipe.py
diff --git a/_static/img/thumbnails/cropped/amp.png b/_static/img/thumbnails/cropped/amp.png
new file mode 100644
index 0000000000000000000000000000000000000000..a6916ce5605e99d5168c7d52aa157f913b8e7526
GIT binary patch
literal 14849
zcmc(`bx<9_w=PP8TX1(taCZw%a0{}tad&qoxVyXS#wEDJ#x-o*osA^81s=ci&V5zy
z&wJ}uovG^nzSX@}P0vhq_gcMbB2|^Yp`#F^z`($u%gIWq!@zt5{HGwnzPCiSSzf@v
z5Z!re09@6LJSZKV9n7t4%_v>H9L*@rJgv-OU_4h3vUQwk+nuG}8c<k1IKZU^muI>c
z96&v_c*4(?6cw(ixRmr7ms3+S#X`cvg%Xx>-=1cmufcEDmn#SJpgW+*+d-ZW>5I|p
ztwDf4Vt)SCOu(?OZ@|-m%e@ojmD;XI_owE+-Wb6%69#m?o!>GHL_O}lVxRjJ-v4wU
zQ0!>3ZxO{vn%RS1Cu|X%9^HOBhkV6w>lry&dXAn^-4um(nPYA8E%a_2VK~#scmAot
z-IMcud1+ljnul8aS;*fSxtAD!A^YyymHK;=hKkal?EZY?dFI0t_P#?v`(^Rl-zAU!
z_`QE#PpoOTWHhf2CqGsF$zSM(^6hiJ%HZ~rBXPx0lwH5Jd#rUI@pL9KVbwTDdEK)J
z3Wl>OOXTG(FxHKz>|EY;39?-evZ?p{yGmX{5s_+i)*Cen$Fk?2c}9t*I`lN&6B)r<
zxXvua_t8B`rPJRTk-2l{bZ*^sV)*OyLa5+*v6lZ_W4P{x*h36UfGU4or+?vr__lv=
zp@*XO=MIZ$1TR!KtZ?##Z;P60r{`KL4<^8;d*M=a2eBfl|0-l9WBbpwTjY-cgJpS@
zJe)12u3mZiw$?2IN~!fu_Kbwlz&ff_)nEsXR?LXv#De}&lX0h{pQ`ffXq<;B)~+n-
zoa@;xK+hf$1&|uWpb~9SNfvQ?VUMiVH2h1mt)XR#hBHHHn$~2hzKjyjlDdtZCM_S(
zx;<4(#|Gj}lE$plU_CY25lik91glWj?FQ|R$J*GI3ARD+5q&Tn$u!S1gRGQIeFcB~
z%2d~QYCWN4U;@tN<!*r5ueopS!Fd=)C94SBHk*{^JN2(B@ohOn?Uw9&_h@V`L9c(F
zf<d7kklkbkzdAqDSQR~+EmkZkI%xqK6Anbq&6E!?WT{*ZR7Vw)3>H1#qKc#tex#86
z3g(nSxkheS|C89+%XuGvB}4G|%_}+}U7-B;KaxzBk_&LxKUy0d*I#)t-Bqd_w~?oN
zLNr->U?2Z~4qI;qdZWrvx@qH`-~P@Gu;Mh@)<FM#CI6X^vAnW#R@LVyxDrmZZ8sST
z1-M$M|7vBK+Qh!xDJaiJ>Rj4g9v%Ugbhertdd8-J3QckRH9~$Teh?_>^tOa!fLyvz
zn%7Kjal)H@rpg7-Dn7N15D%aQ!P9c(u0&Or==R7n5v5Rg=$rs;zvfQtL*FZ1<C{9&
zLxlTF-7-!d)&&<Gfn_C|HPFe05cZf7Ir}l3#*WZpsKz(KLxf%*Y4Rw7W5LgDag1$d
z;*`J8`rPg;XHCV4Y`1`6dQTa9p(_p6FwX6=A^v_IqNMcIVq`~WWc~LcDJ;JLck6s9
zA}Alj|2Cu6gRMR4yj5ZsYMKK7VI@yHkDt?&a@47eLAav4ibPKwCZ_)Fij~aDE2wp?
zK$Y8hdn3m5AS$_Q%*=4hwJxbmMEMGQRn#y1yqAJ5@nRe!okS=dOG};4tFb?w|Ai3~
z;%Zy6bD~ZB#I5FATiJno%Cs1K;c|T39~)?qVg1&sXREmye7V!=TX>^06VgP}toPAL
zv4)PnM@Xf)ExUgsErNoqW~rsPe5~SD7QtL6Y*uF+`jHWHsL+lzOTk>x10<sN$&EZf
zu$)+=PO2%DX0L#Zz0lo!64kg~hsGkkodD!`($e~C4owUYl1R$fWVh|S2FktzxV>ly
zHSdM;X3_nu84||Ha#^qZDXbQ%6XWS=t`O~%-6P~4<t4KgQJqpVpKi6b95*<IW)_%7
z)LH~$g-Ec_z}6n$kNMy<5gEz`AQ{M3=k~c;7H7)ndRVPnZxUN3{6OMsB(%=|8U$+$
znOCjrEUx<)8~B6PAk!t61Tp<@%S~nn9gd&H{=-kd%L-_bDiR={&e4e`B54-abww8O
zTlv;kFtM#CO|~qehIo_gikee|a~9vh%C_N%p2rN=eJmRxmz{FyCD?Qr>&hi~ctg@4
z{OQx+Z-PG^*^#;^YxmyD5Zp0Tzn-~kCo9a+#x#_5u4zYa*d-gK-k+|F{!3D}=hJB(
zdQssu5b-bYmeA%E?0SqEO2IaR^@!LFQoTd<&t?uX{KuRVsGdb+6(A~sK7JlaV)c>n
zm~#v2H@n?@1|Hm+zBw|iOUzki7sY41Z<9Q6y|aCT(Xj(Rwe2Lc1Ml2>1oY@GsN<2A
z2`A-LZsTo&y_&^$N|PBoEIkl{GIabxf|C?*&(Tv>Bqvr@P&8InZe^<OyjeS-D>y_*
zqILo2roNU7O!3H@2#+LqpFNNo%YE*Ck2{$;gX1W2%NkM_hL4T#`X<CG|5k=Zp*QS%
z$)B>1DIuw@!$;t&5)<!WorOc=R_OIT8)c%J(fYV3W-$E10(HA$1;|LHRX?9&%fX-r
zBi-vclN_JHuj6OAhX|`TaeI8@-weW?cB{m6*o=tJ28VqYRFzlsyOH1ta~mUDJZ|m@
z!a;mOEk=!QMa5eu2TF>r%f$EFfK0LGcFV<4v0G3b;rkY?shvM(M$3@>MyH0072Ik3
zT6e=7jKncTiy*rQM#r4J`MGKXyIP%s#U3gHJHPT-xcc;0O9Apq8HQRqk9w)bYEKlZ
zpq~{GXS1`fM_FB(FO>Fy_p7jBld(5UuwXfY6m8`Ih1-lUWKswwI1dAF#q?fXSK({I
zX)(S_O$vQsujXNfav}}8Xmwe*ljFsL!Kz6h<_-Zyvz<hKVv~STiXxH`JE6mp@;TB9
zJjlglkWsNk%0abB?YHA$Xr~ESWw!KqeS6wlfd1yf;@<K_c;{BO@LTwg*aixqa_i5V
zq5KH6)BaluTXij^9CS;P*m}+gk>8$_LRnZ;jpCA=Dv=K(c)wI9!?lBZQTI2IjzwAz
zNbRW~f`^CtM=>YkN6v_m^Ktbu*5K{g=WWAxCoAvMPH0g7&7r~|ERF<@Y02F5Ng4ih
z|4D@s4NaCPP^Ks&n`&f3uZA$w8%cITfWPx8O`_6Pk<Qs`Jp+8!KLiMF-agvb#`aS1
zGAu{S{$!z3_cJ-5hA@%K_Nax)aZi@)U~i*T#;A|2qG?sqI8NdIiA9^rGb7*D9#6gb
z7AuySS@12;W_LX4lb`Kuc}10P3=mq7%A7R5ETNy^DlUZmxy@5NJNX*p`$|kL9nrpy
z^rlD9Z3sn*n(n6CXb$g(KFN?dnLAupWS4n!KJ2nx<>MrxQ`j8S+WYAFn<nC4m5PpP
zwu7W;L?=XrCF^nWPFT47WFBK{r<PcM2c+0AEsOZ1C=)nf%<xaP;}#!smM$OJ18SW3
z9vVzyc-=hZ5a1A~)=Q>kOeAAmhr387U4(83?U&QTb}gB)n<^TFUs(M-H-hn9w9rUa
zx8{k-Q?Yr_pV(J1I!g<Qei=ORMveC?Y<rHGi5L&FqOnP2|BA+MP(8^)ruw=bgi;>O
z+t51cx@|zuG7iRSDwYY+^nv3XVFUqHrRg29;7!H)4y|x+;Wmv?%9PqUE`WW9(p=`>
z<O)5G>6@!F#c)tyayV=OIG;Iq6Hf5ND2%wp)&HFoI?@M9^|6k|yAO$dE7itH)-lJl
z_*D)?)j;dvA<leag7YpgXrol$QqM^6QpYIa=OsWfbfj^D34kGpfF`I8sX7F*ss~@C
zmA7aSf1&)+%Tt*|OLr0}#S0I>%*_?OXjJ&nsx(d|rf=4d04S>3B$aZPX&=n84qYK+
zJX2<s0>Mw~CkSnHOv!zx6d@riz};pNn`uX)R2-KE_c>u?a5fxa>+qdo3d0arU<slZ
z70a8k)P{Z@jG%;|rHb{t&s|9K59ZNZZxfm?c8sd3TzpNYVBhAI$l7I*A~rO?VS|b3
zpb6yTAWFg%T52Oean>-74pbZ!HI|qfI0e!;PDn2AliUT;9e?}=w5oh8%Ogl)+#wuY
zf`3b~I8?EZyePdd$e{Z4XIbR1EjtjYgMg>Lg>^|^_h<s=dc)GmSA%s51M>XHw+_}W
zaFcZ%Q<2vi1UiZG!Hly!ADLGq8+3QZV-Qb?{456^{^XgdtT>EzJ-PX)(qO((D;CF-
z8H<U~FY&cVTc`|xhAc1JziwS}H4o#4ZYqQnRKXCU*i_7!2$L08o_2l?_6vCYyF%Jn
zxRdlpEm<xQr35=vs-m6HFAjlD8RqX}!?B*9L>hvZ?7`}<;*0Wdp>*R$dSDEvKG~KB
zcV$@ppz4r-<40)GQ~e#xZpoRy+qMcDd;`@zw|)FWh~Yv4)HC*p`K{`O2YJunT7-+`
zjF@NCKozkRw$V&jy$r*2Tb~1dRMUXoGhi{Y{-TZQ&lTv)3Ey)0o^{QDf@wJt=)xIk
zpa}kV_A7UnL3~I2V4at66AVh)xde-v-Br2de4OT!>yTf-F^|4o@Q09_8CJjmavaTS
zO}#}3<C8(y<0IcDq98Hs8iC`@X}lYJR<#ar=4)rpi5U=~B*4;?i$tPb-#M};$bxxC
zmVOx`oK#8Z7Nei_Qm0_f7q01FQ%d>y-vrU)+gRuGlQ;%2y-2VapQ&{nmV)BRkloq#
zp}=Ci&97tj^cIUL2Y0=E&ma!_(~M}v>(YR_NG~wRySvkX>|x>lNFqa?__EByDs}sl
z7q{dO$zuB5AV#DcpCN;ahY(GYhQ9XdISKF*qm@UJVSK5$UxDk%Hg`>x^a{>w58xuA
zeAZiIH_X!GNeg#z4!dIS0!#Z55_~0!Nh-)$(Bobr&<$y}Bo9aty5}><XvS(5vB&bH
z6s($qvC6|W%a)feuY2I=62g%#p)I!OW&DJWbU@u{iHO*O2ohp+Szv#VkVYLK26C<e
zf&JZIBxApF%wHVN7tyC9q8(kKN|_v4S$56(H*XclpdIyr`5%L;7J=3RJC%Mi&A$s)
zs}<(9^HR^2+*(}jPsU%{d55w8a{c~&Y-wqpJoJHw|2VOixzr-nsQQ-^!5qUDxMTg?
z#;?YZ%uafKDjOfBycxe=%+ySG$|m$5K{8IcF@bcdP6x%`VKHB>7zH@T7=yLgFCzYg
zbNA_BS7PE3gZP(W;8vng5M1Hx6N6I$*TC?;5vL=Y)i{-%%bOwghw98fhx^~+oSBsl
zo-^B2zI#cZyUgY#R6b28>B);t{|V<8wP?Fkd6dh^|6on@Lg3`jdG4QTO-<CLMNYtr
zDE&Fmq}vHkyDX*O>b%Rj6A3ipdi*<fxQ(#Jic2X3K{7X(8caTciEEco>NEy35X+Is
z2fM5W`sv<4ZhiX6cSnwTGPcHB>zGppIg|A9C<-9*TXQfOWD%LWyU_h-DfkrXXO(PW
zSfX2c5cp@I7$5TF-_rZz^9W-Q@^>*gyWDD<tIMpeo3;zPRqCqmsPUh=3D^a2<MZZ@
z8yB3HFdqM+%!dH4L3$*Gh~?IL&*HcOG^33#a`t0Z=Ni+RNiZz41(i<T+m}ULrl_~g
z^5vpb^$mZ@%Y(H)6~OSUe(p?|)Cd5Tl%E}(6gF$Kd;B#fz=Z_}qh;wqtsa!6e8IVf
zJM$=Rpy*iZE9KC`s7t<<?i%`^>x>4d%|tNrAraP(azzdaztEV9i>P!$W@*06nx4+1
zA+tG~&kWhxC{U`GCjF?pl~KJGYhZ3pw)Ik^gxlZUIBrUp5ldG^`RB&psT1f2GwpL(
zz!L=gH_!Q5D~Q*j!n66S5{;Hj`63+shdm4WGuST+t81N4^I)pi61~(~t?12Fc-Dp8
z&#Q2o!g$#n2^Us@n<#qw!C!->^A|6RhS{=s?3+!})N?qf)<hn=p%E<tES!W-ddlKA
zD*Mh0lElSA?D#HMeb`LB|1hvAR5Eugo|gQroqyd)IZ95VpWQ1cuX22ovq2aV2LGsB
zralo1Du_$MtWo?xk6x=)sj|vQlyhtW%DuC%EsYvhu)iahL{g}xF1F!ew_=l?fRJE;
z@W3SX;0*l~9O(kHUyMt+W=z-q&Ph{q^AT$jy;SZyMKOeO@OY_~qRt<&82g5!(Bhl}
z>2n0VQlbR&kV3zkgFY280Uz^52L-nuo0UuP6XhRBN0AKo{`_2lXHcT9ty5C|k`n!|
zc)}PqUEqXr)M}1R{A@I0un-YWkh)hY9%DFAEhQS?i)LuA(qA}^r8}c%NA|fw8MQDr
zJBu`e+^Ns0$g;UbQb%d^dm#v?!XCF!@ru)c>tM{V^c42?`-Q8GHGvcfMMg}*hp!2K
zT1?eyMf8WeL}7aTgZ46_Vsd|iA!++-(dRA*THJK!F?}W)Ldm6J2}rU)SFE5SY`xP2
zhl}aZv23_6<<v@q+_S7qW<#1D*-h+}kr&~-$+8M5{t;KR`vJ+Er*b%by^5gCjU9Zp
zeKxyO(xUKyLWpl&jU&;POtBl~GGtO0HC|v_aP{~H0XAcD)@7wJ+pVSumH4DZ2&nB3
z?Z2<qRIT=Im91*($ng-1=FrBgjJxcUhHN6katzaNJjNs2RVA7R4{+U)h6b<)E{%}c
zn;VoN_+_Wnz%KR;E(}oJL2|pWf%(M<i^pmx$5Uks2KTA^wpU%}p&Hv4goeQvAR@-@
zBWx_sX$MIH=jLFZcM3YyC&B{j(`M^6ueLI>(GQ3VP~Y>a?TF8X`juE&eO}Jm5x0xh
zj)$g_CnpW?RR|xtTAFi$a9C(Od{*@0^DKX4DC4W!^nWQ#2|hzY^7a}BdQBO&E}umC
z&o2M~MO<k#XxgKR!YQy+S?vSjKEY&2AnT3kn4Tuk89tIU?|A*LPz(gm$G2j$yZx19
zAz;tTnS9)<<`&)W2#fe`mAGs|pYY?=i8{Fpw(+;J7Ot-OJp^R)p)BmlMFZ;=1++O>
z(=RREwDA1iNR(KRDA|w8*x%0D81=NF<UD*6z}JEPJ)1fD@d>FUZo6vrrM5pQ<G$_=
z$&i)QieN#?QCya*oL6fjbyQD<8vW<`*)OXwGLDZEd`YKBrWh;}$-Si0zrQds<e(a6
z1#ijrOVRcX-oQ6rH2lfk!CLywfN?&TTD+TySNLHA$qutZP}r~84B^Nf283Z~pBj1k
z!4T*Wk!SMHQkuXxD3al!Y;&_s#6TR|icM@{3G-pyG}h{q2qsbg$Y)2#SncSa9zYE!
zixKCHq?YA5!EQNQfeo>nbKl8tb3eCkCqw(pSZ9oaP2i8X2+23uax_=>vJ<S^(sIvO
z<b`xqD@<M)5>5=C&mMi5%mVtJmWx(rr99rIb8}okxxibJ?aOLuOZseT88DQpF*NZC
z12NVjAAhIJuUs-Yh*Z8^BaetvJIAl$(Nm{rokki?4hzOm^RvK=Z8dd|H8HD0#_V@K
znF<a>00$8J7Mm0DQ)}iSDD9I#JVnV@9;5_zCut12Py9lAGh~U@yCt9+PP@X*!Wzf0
z2*snWnAUlBEZNXyv3*@gRYqPWF;XwEjCE5y#*$UUR(zq4G``S+krz8F4aaC9Q1_dn
zMU8$;XSVkT!a_qs5nI~XLpE3U^7D`1ED6g3KLrQm1qZ)rQ_d%?sw$b#YcOcUy~;d8
z89oV;t;+>4uouJbb5jl>>_iJ}isdhU!8>mrX(UcHQ%s^h=aXYC%2h!q>r+w~9{;MD
znhVKlVePDZc)VHJW#|dk=sTpEWNHrPS@oNzAB$NUsfP&H+eegChUTb8M-vzc6FTqm
z_D2WMQD1fkeZtUKv|l<TmMoGrUKmFei*%D@k0>oZg3Z-9QPl`0CvPg94m2prEuP9O
zCZ9T4zWw~I=qOZXt`X5LdwL$p^2Z+_T!K09MZ#e0EtsSu22xX-#r2o2JGd~D9B}hp
zrM(}py0)l1!T9yGUA9U(5Y8Fc2uq)oyc*Wp`VkvtmuN^qg}dVOmvcE6-XhPwHE2<@
zLa;8?UhGx)ZZk=D3x;%MA|~ycn$@+_yth2#4cWq>=$^Objcp`69AlUTrLTfJZdIQ3
zQj-|^Mp*8j`AU5sRvayLtYAr$2smpm!OBddV3@r~1h9uEx|s4J)c$ss<QU2@xDY|3
z0mHtCGq8#*wYOId=~42MlMSGByJ0pc=-HXVS#N^p{4Pv2et%+xm7}&q16EbxXP~5p
z?GC4Bv5t&FvGi%5lYjoPiX2HO?NP{x*>fw^d&hK)Ljbg@0<xn2?)P74l@OOCFy>Fr
zbA~8;>xac1db6<I`o}*}_I%(278>j`&8B)jgZ5g~>J99!d{|VbTF+i1MFJ1L>xl8Y
zo--IGF}yu4+O}=kGW+s)bcnz7Z=B4biELtSNK~UwIrPD-QouVkXAAg2R$;#Qw|uZO
zh$2k5zt+!?`uxS-{pJStu%jI()O3=Nm?b@Qf$=A_$nZtXb7F@Bp=Q&}hls!4Bm-&p
z6K%x|D~dyH)t;292GjHh+Twt!)`ZG~yq2$EwWNHG8@YH0WmWUvkO5{O{Re!TT_=P1
z<IrBc*WceMdRvSyQbkG4srOtlG)z73LM0)vqtRjF+ER@$_U%cD33%shu!wG=DxkJ}
zx@(WiMksfa63uDb*=81bvfJ1AN;}6KBc#t(vGDe?ZGF*`>1JXtqjujtkw?UU%kxc}
z+r7<8z(QWCjnosrbI$duQa}0y3K5@Rc0qn?K{jO2CEZ6W*Mh3tTM+lt{fDpak1er7
zSR#D=o`E;MBK09XEWoWGDYe8f^~KgnUlwn_H!2C#L$+1DIN)N;C6#Z-lEE(&QHnLL
zkM|rQq9lDD_xGE_0i>Z@i2eWK5&!C8t}ppWGneOGJNN}mmEyNKy-F{!A``?TExxG)
zi>eM*1T0PTEt}v(DMpy0j}!d7J2Nf9sSGysj<TD){}gp{>+wwewu&(E){eU2^Y{Da
zKR#@l5rNQR0(cmh4<S~PlB#l&lK(3M^q%+0@k<nu9TX!SHuzqu$@Iw_(<wqVmpLMe
z-x@DUmFhQ2jbKK+zj$&O?eMUgBMbjS1HjGE39{I)i3ukvmP4A0t5-7g_Bj-vgPin!
zZ=)N$Arx4hjyH8mh{(Oc`#rKPVXPLVP}`LXja-B<4R;~~XPDL(yd?6`wmZ6k`OPID
zPM^nuK}?J?PZMq7%O+*sW;6Q8^H7(1ipH+XE=Mw}?8~U@${ybce7Zo|SoX<pVGb2$
z+*N?RNvWmv(DxxV%>{=12~QB`rZFhBTK+okEHp8vr*YCXOH$SlO97KgOa||BiV=-W
zwQ^J_?t6Uu5?pIQo32=kjqA)@@DV);iufG7wE=t_LuO;ZMiew4Ju-nqI{z4(wdYGA
zq`U`*^c7ayd%~a+mMgzx2?)GFI$lFyt<}luLe#y0fmI1Jm9NDu89TV{dnWSxj(^+>
z-?7xSnf1)?2Q&X$^9q5VA5I1}rTgOg!qLa!Ezj!i9+o@GIWBPj6VLniKvGrIkbZl6
zyLvbR@1}RG2Q)4_W=@C&bYr@=!+3Wh)j})>x8qN4mM-qL5=Pi#hUkB-dLCTO{648G
zoKZ~~<KH;%eg1bJ+(#HbK)G`X&YhB-*v~JW11v*-%^sC!Oo&Af(O3O8ynDXvTo0T$
z$UnVV>e&oW8Rf59Fznq7Z(a4=`2!AUFNbke1#hqPt}i$?<f;~gA6;D{d1}oZY^IO2
ziKY8u`srBKWW*0NE}iUeU7WDTInhVidbSqVEsXLfd)z-i5&kry3^SL_4$7bC29(79
zTAmK?t_kd@K=RbItVzGTyD_Xt;!k!jpBW_lX&By911O2{Y$>9TvY`xpxB6*Vne?8e
z{3YDt@D3l06~0NqM8K2Dz6;2XvOip4U_K50r+ffARG7XuBD%^cNh9tfBjFOjq$Bet
zz`&56%SnlAc&?u31~{zBuZCfvmK-7JD>GPM*(b@nVjN&lj$nVLnj6s~`B~&1`O&oZ
z?c0w(Gns#v3F~o^bhYE+P{N3Oe4q0y@`%nPKD^)Jf8yE}S$NX!ey!9M_{ef_MR6OT
znK+iF@#YreuI{M&M$%Anbg3fyrMoo?=}W0js@jB>uLRJlqB!LomesTg>&&|RHG$n<
zl23NBhXgMrwD#(!7}E#WmT-%fS&YA$tWNmjQ9Bqz4Fu(~vJ{3?u_6NBq+CPVUBe<|
z2Mv%+99Bmou#8w~Bu%A4Qc<NthH$B1CA|lhU?qy^OpT8OEQ=!*A7B}<j1R<N8OCyo
z-ldQfQ^0}pRjxSI;JaRw7yt>?h>AHRv>xXF1x);bkMGhuNW6FbH@tWKpDBX0SwN>=
z-D;v9c{Grxo0ISD!S#j#M!C<@AcxJv^^WTwd37t1HKjn|%{MEra7X2@=@SM$Lt849
za{~So1B~#}+<rzPV7+x^6OR?Fr%jbS!4Q}Ev|1hbqgAUeJ)^||8W!PVlf^yv_5+a*
z<<|%yng*A$9OlI?HvZ33C==az<LAlOcV1LBU`9yQUw^!&39v*{uNByuap{%>2jVmf
zqwkxXX~MI#Tlu9u$vpbxw95JFOVIti44AP?6U>MJW<z1Z0Xq*B<E6b7m`|0d{5Uyv
zBxO<tDC4g9R}VN(_xu<gZ1_5iTus<JLFO^|5L-fWDyO%jR;m|(uA;HtO_%b*-~r|&
z`P+Gx8yxTOIX?9m;2;!_M-*^y2B>i#V9g#l@f6~KJo?~j@i<evxHCIo{+_$IdZRGQ
zfc7=$%L5k8n@%3SJ(}Lum1XV&YOWVlY*y*!Rtl88x`(r|P{Kl{%5Vzv`N)FmG$9U<
zD$Na6zKx7K)iPJ>*t${&A-fb4P=LLTT_+BXH(C1Ht^1Gkf(l=T7HIC((Xu*|fnaq+
z?H%&~<LNwOZM}6AC<KvQvjpz@ciW(a<mH7Yxrh_UXUouoIJ{~$CQ#Lyhvw>LsCnAF
z{rX6Qa_XFXR8p`~wTge@uauxLzUX|KTMvlR=4!WEBW~Im;0W(A3FV7Um6@Hjj0@a>
zs&IKtkUn4-4HncLqWR#ffrBtk!F0kqG7NZA#D-YejwwkO&*N%eyG*9C(y`0GkEg@#
z5g966TA#HlSIa}kA=e8BX5<?L#&Pdx*EqrqFp@(@aI<+Ypa8kg+S94UO|T1s3jB4o
zCX2s+#O8$F{?&QR{{FqRYarh@5^3VQO3j8XzQRQwyw9omC&mNy+0N&5-R^V$*5)Vh
zX<Y>g_h{!Cq{&*$g)C)-je&XL(SVT1UEOZ|%gE_5sRGMm{GaPr9Xg}t(_fx6Mw&mr
z2Y_9J^<BV!PcFDZ=UgROupe<hGq_1`{O+=;1fpVm{&Jti4EpXtzK(G;e#Axxy8V=g
zg80O=D1&aWM_PS#u$EqQ3a}ol+1|6)$!SXj|IM_uWU9=(z1__jFOMH57l>;)mDHKG
zVoJ?d0Sw1_F_-Jqa;LJcFDaB)Af50}rNn6gE<Dwcepnw)ATuG!Sce{;#IF$2fdanV
zCYM{uU(m~Fpg_Xg(iUq;PegwtmCwfItu!bnhiIym5_n1CM>BvGD?>}Ycpv<^!k&?!
zGc+sAQC7V6Ldbeivb?pX-yx}Nga*n*w$r8b)!pALQ=3fMpC1&++%#N6dG=2-&Pf&c
z3@f`*>O+>4kzCTu_g>>l)7EiKf5%*u)>>4{`5F1$H*EV4H1drA$`dg%)VV8bH*h#V
ztszhJx-#81^-f+VVej?i)HCk>;d9#K<$z=bNRcv??VG!9XwkXX%3d|L-kh$GF;y~w
zkTQ`tva}_j$>vR`_*z>HOgF5(jSL6VlLQ2i#I|WJr@hv=+jv^AAa;?*)J{r^SAn6?
zXv4Bu9w}LYNoJNNGx~v$wCL{Fw9uAZlf{(jOWQ54K~RVpiUY0vI`4?QD|6v~m=N^~
zwmK`H#_;c+R_*lG6_60T9}XGx<+zlTiE89Y;d}GCP%Bf`s1c&k`=EU8W5*{@tFvf9
z*3L>k{8|qqOG9{Y7q?5lC*Pj6y-tQntFhQXWrt^wP?a2wW`X|P(9c~?triBJ>;v+m
zjK9-Kg~o2@JgP^ZS{#i3<x0AOnoNzn!nGKFMbjOX?sc0mJ1GyMv%h+G#w{wJ02(rP
zTRAawL~MA?z}_52b}<dEahfQ)_Gjp7{>YV)dSun4f0e;#Z@!BIjAlCkWlzUfr4n7H
zy4NG+)0B;sx_Iya=2bR#>9uiODm$a_94;Bcr9Gjkd}zx(mKNQ-0K^GoYz+6>pW*_Y
ziZIC~PcTV6P&4A`Kq<sQ^HA9zkbkC?ok;XP9RieQL?ZX6_>)aUYkV4NNaDQAvb7w(
ze3+28tmKEdb3nXI=Oz)&G9L8Fs*^qW%aVdcg($(+w&PvDiYIicdYrRDrNGu@cxijF
z6hnRrak^V7F}DI%J#|HgkZeQ|M&UG3)krwW)NMzUhB!@SN7AWNgz|%`R!)VLfd<|0
zPZew>plPi<=_>#tC!;IINZc(t7aF~de9=j@E`Y(PRfUF`XLQv={3IF)y0)bJ8A><-
zg}b>gt*k5(isaYY8zzNnP-LSf5AKZ4yoj1nef!R;kRZcB<yMQO#nZJtFdtQ?0DvkU
z)X6~N4q98g5IU&y+^HHxQ>8p*y_g+DPkSsfQ)$FSHMO+^P{#jqTcrOkWwmFAIHGd$
zp-fb_v6vdgGu~BWYpLnIrpUz{<WmZv&8bWo<~inyx01|Pg*z5bQpKVK+jhK}uwLF;
zJ)eO*fmnKD?)sXqx_*C{b2veaTw37UYxB-RI0ZqY_F*zIzSG-<J(`tX{qTB4Dq7i$
zL!mWD!;9*ESC@4<hS|}QKKk6f?XQfhOx%2ONCmrwCx<<<{-0+yGjW-AW{>Q7>p$1A
z1B@D?L(@V%C`Otk`oj7Ex;4SH3w!*s8}UwWXLs{eXh4LEAmRbQ|A;_d{O^^_WaCUD
z->^#++F2a;_evm`@#U6ENUyuVBsD42jPI8_vr@emb?7A&a=ku#VtKYV7~KHzc5+D;
z4_v@rzX6=)DF=BDJRW3c9gtE$_X&>x)&EY0G;bI~4)oOz5d~tRbW-bMO=?zE<axA1
z!>}*1EbD4-Ot_fu4`f0QS60S;a0}W0>g65ZjkLp4SrYxq{ENmTp1~b*8F2fNvkAbe
zAmgGBv}0MEnb{u>>dmV5R^cve|6vQV>w>J@FBx>AKz%o;$4M;cta!c2*KpM{bCct<
zdX3xuf^t+FUq;l9)?J2@TN_t5Z2l#8EG&@{C~YB=RLgv2>J6#a#b`lg6Q6D&??yV1
z?->SfEEQ`_+wq~iZ#v$=l%H2H)ZhjN8ZhzWLhBuwM?aFV>;U-R1(W~4_WuTrga7}`
z{{K{~Od|m<E)|i7HW3-Ggmys;CsS2Yo|B9_5!tMS_8L#gR08qcsVj_B<hRfmt!Kwj
zOp!{EK6@qgmBJ&Mm61)9BYTqY9?+$yj_$6cDXuVC0VBZ<Qjb#)c#Gr>`|kMr%1C-o
z&P13{RLRnZG43u0$FtqA=Pb<<VVj>%Yu`x1#MkEKayUU)eII1`+8BZ?Y}*Ds9b+8T
zOSVM|Qdn><STvHu3_wq7Vbg51$sMB1{*vsK6{#5`E3DUls+S*R+ENJtem1yji}t6C
z%ZS%o>j#eRqpY~25j`KgS-IgT-<`_9dKgUn{}7n}IGdErFd^{&Au#Vw=sSTi`tTp}
zuJj-BU)n|={^!j+>l8_!^6J;MakAj=hL5Emm7Dr{GKh_0t?)<t8xgos_KkiJ;=fm0
zsJyWsS9I{h;JpA49Q2P~fd8WS0?ebsjqJ>$ga4`in-OA5u%_{}TA_}9T5AH)xxSm=
zW8v>(nTc>HApI}SK^vmW{JyBMIEv&LY_8vk%}h;|a-7p7vk<8S6y)MEqc^v}`^7Md
zh?>908zBFpv>L!4E^zZB^>W<LNPAxN?)vInMN~68LKIfaG36MfT^z(<8rnuCdZrID
z!9Je&CpxaFDmkHn#%)rhKK;%w883MA%gH)YCBagIn{u8~Aw~xSxJ1%a!KwYDvohZj
zaq$~R1BF6LIKScjP$Z_NP5oR|z&2@;gsEowk;xLb%#s?1H8BaTx*n;~*(mS}XTO9z
zCw|sw$Y*t9)eKZwIRt6S{{$cZ6=x*CmuzDF*L(tLN5xV*GLxc$2O9nZ|5qsVR!A$r
zyUCbcSM*}6ed&)<?U8TeF0gk@*P+S94fHmtO?sBw!Lm2!%@pTGChT;0MHJ%pyit7y
zE@iBRR=%ONkqk~wHFQ90;t?yWzFj)$)C}u5GVetC+kl-fkh9nH)<@=x_up9b(5`S&
zU+oU}0oTEJkTdqqzj|a$U4l72zhay4(wH&6x07W3!JRqAaR)#0LvF-EKMRxnxnLOr
zTug<=`WK42*mvcWn%uJaL|sbdQFzweSED`cwYhVC(g0}dw+Z!Th1A@bb^*c8f)YC?
zMk~lRcapeqM+#6`dk9+$U3#N;S7&2DFInA3Pa3qx%~%tboNKy`L=P`Bd{+Hx6Q$s2
zeix&So1y_mwBXIU%Mtfq1Jx8u&g*>BQw{zUIrTa253D`c_5Me@ErJja&EyqiXs0qD
z)RE62cjL~1#(wk0zhbTI65MdH)X6NWb*dWD;-m7=^v5?2kBB!WC_UJwC7iq~pK!wK
zr4b7cvfm=8MVmIZ)<FFj3a7c*MFDoMr4r!j@STcV#NZTKL7iBo1z8RF3#pym2S+#L
zU3qKeHfc`;pP~G62la69kuCrx%dcyla%n-(%9hhM8NOg4E<v*9i&lGoV!yw3CYx?q
zC(d8<>rRQ@6F-b?tR|$;BQ8OR@z>tmDRjMQpTEo9xg=>quATCksf{s@CC?A}+_%ik
z6VB=)Bc~LQf#)6`#{t#}GhE-_Y=;+GIhn+hmluZ+$>xql`^iH+8`|?V;uW&Nta}gU
z&*(=d66us=(Z$>C1%@v4`lb_dAcOTG_s}hzM#_;U0SI@SepQ%q#B~2OLCtL&Z!D<x
zMaCLEpBLhSH8y(V9X~$jsU378qnBo&SPkv+TS`I@`B`Od^npM{{`5!8rc8}x#IH^h
zhJh?3u-RuncE$Aj0QzS~A`f88)sK<*m#1CUgA0I48NW5paP&%5mB}jZPP5s!)Ff3E
z0~*klw^3{^N4#VIN`NTH3}0gz#p6}ns>czW-a3y!;6A`_1TCW|a|MllCN3+wu~4F6
zW~zNn^xr!4Q3|jZVAR+UU$%~T#5$5HlMNMZF1ggTt@?)F=NW8*XJ4A(x%fk#?HqBK
zVL-{R9UVwx^>icHNoccrF6g&jw3e%a&PQUUtx4u7;on8F_OCL#pCr>b;@=?9@87d*
z&(l+K&w|;^mw&egqNnsYM<jzGd5(3qlHTB}Dh`Y9Cc9+I$tj=M6gz@zJ>WKQ$&9vi
zKA5iG4tE$s6FVjD;dH8HbSR{4oOhb@KhNZ^CO_uV=llf;kuZ%kqoq-g6_<*+a-GmS
zaFXE(cCQD}5X4Y|poz2rOY*NIKuDhrjn6F30QDLLUKU4nZAY}e1TN2%A-<qp%&j46
z=N1Wn-K3f&rhoda<;1_$u)mM9EJZd1GTG3-tM^~B>{Xx2tEz9R6W5l!%#Az4_hj_l
zXNhWg?-7-SMB79n{n%2^&nLPo^Q$+Q;@t=u^xQNo>5n|=iIO?gL@P^3M*d=H4A^d4
zUYfo??de3}SS%45V5tg#*PPICJ*Btv<l;`*ci`$G(qr?3WC7hoKrpMVJD*MVzJFUh
zGditsh&nz=X|2Ym7t<%niQI-zpgYQ>9i7%4AiA3~ayEvK2L-&U0sdulb8@nSn&S>i
zx=;tQ$mncazxA(t5)uVdK9Pj-5Oij0+&*Bh<WTY)h&cOnZe5{7s?yvK@P$Xc2{?CL
z`y4|-T$fF<%*8P%;famm4V>;dZfTnh1LEz<g)%!oNNunYCU?iUf&yZVLG%KVU$wTB
z-j_f?N9}C$fx=<BFEY!X4>YlHEH;_NA;fcGVy(e;xWQj$x}k=M8ujlIxAdWiWz8Hd
zubJq$4SeOh(=Owfj&-2-;FnmLlRee@inW)npXJ@^2x@3VJcWiRShv8si<6;dUtA4L
zY&Ho&=&OpPx&QsncRj*!p^s)ak_UIS5#Q{pYc}Z*IC9v48;gv8@b858+7)?EN_MCf
zCZ)?(Sws^~XvnJ>7(nkGPcBp2tPX!uC{=$|h;iG9uG%YqA-i@inKHG@V(J2%rXIwl
z+`S2W>mZznkRhzyWA3US)yVctVe~doa4v)pe^*H|QR43R2{IATE9E}$^ztwgg#A0J
zxf6qM9&}S3K1piR*5B}yRm3)&SYmZNt<xcQY)1IIDsi<Z(Z?FxF~1ztdvCq;OEKHw
z_(`v4QQ`YcTAUdjKLnr_Z#2%dL+<(mDt$>rsu0=mCF|@#H-mya&e-KLJ9H5P47C1c
ziJLdpaKY_@T2|*EgzNohN&~ed2><=mwvw)sP{AGav$8wwAaQpDPM(zmLfhsl(Xm4p
z_Nwu0s-Cfq-s;M{#78^PxZxFc@Y*4N0^kzLQtrJI<zSJW5S(hRyk8WDH##}<cM(C3
zD?NU=v?HyCPJL7QqGxg(Mv8%T;yngBihB<=-H0OQL{1MBCtK}z03DOVl<t$6GZpM}
za`^MQC#ut_4)THzV0Qn}`G((w6LB8Qe87fI8u5+^ORGCt3~<NVHozp-9I;<tsPl0!
z<;;NuoFz3-8JUjUm_N}F=H6^I*i8}kZ)LZrz+kjR4rqVK)Sd?bH+sZ3tTUAK2wTw#
zXf3gpoS?cyHYiS2u*-JOr1&m4>VS!>_aykRL>f^;kD?kn=3dQ-YPr!sF1FPQD)WCV
z*Zg;lBt?XTw0vWDaLv72;@oKRE05fm^~i1Z`G&;C_)bWfoLoJsQJ|b>dG9Y7Va~S0
z=g+-*-c;NV@n!~X>n9~*otZFUtpE!!XDT)uF4*}fFZa96S7D!+gQz3UsxX^?j8!&T
zL9Jb&h#QuqsQbB=ecUbO)N*L%&mMDAvrn!1Y!{OsYy6DAi#WxZT@af{_tXeNa&031
z4U{3zGLzYqfJSg8dXU|HDvy7Rbz$&!%!uW+wx+L?)T{GDG<rs2WfjkTl6kJ<{Djx_
zwV;u>2J9TSx^|AtKHA_YMG|_B_Z9B%0h^Vylz-g8-G?&LTiwdBa;A+dcuD-ULIv4d
zn#7J$X6s}8D>u+j;Ygzj?K=l5D*s;cKX*s4Hb*{MvHwtI4O9|xe)Ff<;7yR3fI<bg
zp$erU{5xBheOlx$9?VOL%mOxe%rdPT&%<r_=A#@CU1VejpE)<K<jqndT%II4_~+fC
zsTWckDhreiNo4Ij;S7nb`uRoA_LdBp>*r%;Rb!ENk41Aq$raSQXTq$5@BE9>%xCG=
z%O<sa_qTdnoEt+pzW)0!HQ-VbR@q{SYc<`Em~oF?6e%~sUVRi78!s>I1!A&gK9<4u
z9-BOn0BC;fRFeltUz|M519kUNCu4x$ifH+?`?k{tz;GFUN2u&t(}WgkkbN5I#7q~H
zaQV_KO+K+=Dr*)>6-X84`)|Lp3Pd_F!~NElim&ox#L*aFd^&aSv}eqjDLnp&-LL;m
zMQ^WH!Q=AZUbFpvhWaO;yB=L^vWc~R$c@?=4a%>PBEH71-_siGeAm0X0}eVq-uV)M
ze=TGN=rVkw&+fCMa<@bLY0#RxR30Ee+hZ*bg<^wE$o|n%9>;uh(U&1kZ|}O~hk%x&
z+7t>$d-hKjwq?{>4Y1`PNrc>wQ#n%&T2F(QK3YUD@aZwQK<_;0#Uw}WWq5U!bbxIb
zF0|g0vXH#n#H+A|MVEffbn~AAMAO5FTKB9>x07Ua#>E}9{SdT$WQ^P7FaA2bQ@ZsS
ziq5CrW9F|-EBJ}{sq9`i4v$m}^FkMDo|PgVZ2;=#ILsPRdKp<N_F(o3^zA)A4J&2P
z=0)|+>~yP+M&wi)Pc}aUFo`oCD2lUh?OThV@xh7!Ohlcru2c_WeKkQ}sm{82Ao^Wi
zLTLeL?yk;*0<?egQWq#co;{1M5IbN){*^9E%{vU6K<9+gQfk^)APqye?NHqH(Z@gI
zjd>_n`~#O|Iv6VgX<l>rH$0KWubLi|2fWYK%ZuJ;)6^}xXaUieGi6~^1UGAqteu1P
zpL}@90O=$95{%<#_jKc1-b-J+_#r@GQ*?u;L|Q!8k%ez^p|kV1V6R`#&m$z~r5P|z
zCK;92)qVcG)ekzhBJmq|#9Y3>H(n><Ot+&8B}51X&XmXPysEy(W$2(T*oQzL#r~$P
zgI{Z)<BV|&c)s*hb3wUh^K3Ui@zW^lR>1+Cgr(F2MApD-R=GuO$q)c;TyJx=8omyE
zKL}uO&b`tAPoAS#Am6$>F_YT3$d9L8pt~QGVuW4F1NgP>;tnP&VmF?5HWb0Hxb*L*
z2j{~MN)3Bzm$^Sp%q{@E0n+jtMh&p<9n$!Hx?_Atov#}{0o#D~{RV_z{dzYe@!j`G
zz;UMdmS#|;S*-UZE-BO7q);5Z@G$v*$4>76^S?#MBM@nSnjG|hlrtSbVi-f4mHNV*
zkTgk?H42ByFy=ZZ^@UJY?(n0M=>R4JLky-gelsB+w&?+uaUoMGa@A-ct3=fP<_RiS
zpOh?CZ34W>fG)F5Bs)8E5lC`BYF5wY$0Wf87`9{Hq;bfI?NtWAk;WrK>f>lc1L8Ns
zGCxlokkCRN?~tZEX3XDA;J2asa%>&k2#i7ZAjF89Ta+=wtG!Sgcc1*RkRT6iz?erM
zi#Px9z9+L>pNb-V!HaH>tZH?xfo$|quge=-iRjg6gx0odq@BND7&`%xOaCWwgfK8&
b?yrbNC+vqew=cv08G)R%vQ({vVbK2r7|~Lg

literal 0
HcmV?d00001

diff --git a/advanced_source/dispatcher.rst b/advanced_source/dispatcher.rst
index 23ba0f96be1..4f3b52fea32 100644
--- a/advanced_source/dispatcher.rst
+++ b/advanced_source/dispatcher.rst
@@ -105,6 +105,8 @@ speaking, the structure of your registrations will look like this:
     that provides implementations for all basic operators on the XLA dispatch
     key.
 
+.. _autograd-support:
+
 Adding autograd support
 -----------------------
 
@@ -299,6 +301,28 @@ the safest choice for the execution type:
                                   at::autocast::cached_cast(exec_type, t1));
     }
 
+If your custom op is :ref:`autograd-enabled<autograd-support>`, you only need to write and register
+an autocast wrapper for the same name onto which the autograd wrapper is registered.
+For example, if you wanted an autocast wrapper for the ``myadd`` function shown
+in the autograd section, all you'd need is
+
+.. code-block:: cpp
+
+    Tensor myadd_autocast(const Tensor& self, const Tensor& other) {
+      c10::impl::ExcludeDispatchKeyGuard no_autocast(c10::DispatchKey::Autocast);
+      return myadd(at::autocast::cached_cast(<desired dtype>, self),
+                   at::autocast::cached_cast(<desired dtype>, other));
+    }
+
+    TORCH_LIBRARY_IMPL(myops, Autocast, m) {
+      m.impl("myadd", myadd_autocast);
+    }
+
+No separate gymnastics are needed to make the backward method autocast-compatible.
+However, the backward method defined in your custom autograd function will run in the same
+dtype as autocast sets for the forward method, so you should choose a ``<desired dtype>``
+suitable for both your forward and backward methods.
+
 Batched
 ^^^^^^^
 
diff --git a/recipes_source/recipes/README.txt b/recipes_source/recipes/README.txt
index f93ee92c2c6..a182b0a11c5 100644
--- a/recipes_source/recipes/README.txt
+++ b/recipes_source/recipes/README.txt
@@ -56,3 +56,7 @@ PyTorch Recipes
 14. mobile_perf.py
          PyTorch Mobile Performance Recipes
          https://pytorch.org/tutorials/recipes/mobile_perf.html
+
+15. amp_recipe.py
+         Automatic Mixed Precision
+         https://pytorch.org/tutorials/recipes/amp_recipe.html
diff --git a/recipes_source/recipes/amp_recipe.py b/recipes_source/recipes/amp_recipe.py
new file mode 100644
index 00000000000..c1ec52a3883
--- /dev/null
+++ b/recipes_source/recipes/amp_recipe.py
@@ -0,0 +1,325 @@
+# -*- coding: utf-8 -*-
+"""
+Automatic Mixed Precision
+*************************
+**Author**: `Michael Carilli <https://github.com/mcarilli>`_
+
+`torch.cuda.amp <https://pytorch.org/docs/stable/amp.html>`_ provides convenience methods for mixed precision,
+where some operations use the ``torch.float32`` (``float``) datatype and other operations
+use ``torch.float16`` (``half``). Some ops, like linear layers and convolutions,
+are much faster in ``float16``. Other ops, like reductions, often require the dynamic
+range of ``float32``.  Mixed precision tries to match each op to its appropriate datatype,
+which can reduce your network's runtime and memory footprint.
+
+Ordinarily, "automatic mixed precision training" uses `torch.cuda.amp.autocast <https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.autocast>`_ and
+`torch.cuda.amp.GradScaler <https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler>`_ together.
+
+This recipe measures the performance of a simple network in default precision,
+then walks through adding ``autocast`` and ``GradScaler`` to run the same network in
+mixed precision with improved performance.
+
+You may download and run this recipe as a standalone Python script.
+The only requirements are PyTorch 1.6+ and a CUDA-capable GPU.
+
+Mixed precision primarily benefits Tensor Core-enabled architectures (Volta, Turing, Ampere).
+This recipe should show significant (2-3X) speedup on those architectures.
+On earlier architectures (Kepler, Maxwell, Pascal), you may observe a modest speedup.
+Run ``nvidia-smi`` to display your GPU's architecture.
+"""
+
+import torch, time, gc
+
+# Timing utilities
+start_time = None
+
+def start_timer():
+    global start_time
+    gc.collect()
+    torch.cuda.empty_cache()
+    torch.cuda.reset_max_memory_allocated()
+    torch.cuda.synchronize()
+    start_time = time.time()
+
+def end_timer_and_print(local_msg):
+    torch.cuda.synchronize()
+    end_time = time.time()
+    print("\n" + local_msg)
+    print("Total execution time = {:.3f} sec".format(end_time - start_time))
+    print("Max memory used by tensors = {} bytes".format(torch.cuda.max_memory_allocated()))
+
+##########################################################
+# A simple network
+# ----------------
+# The following sequence of linear layers and ReLUs should show a speedup with mixed precision.
+
+def make_model(in_size, out_size, num_layers):
+    layers = []
+    for _ in range(num_layers - 1):
+        layers.append(torch.nn.Linear(in_size, in_size))
+        layers.append(torch.nn.ReLU())
+    layers.append(torch.nn.Linear(in_size, out_size))
+    return torch.nn.Sequential(*tuple(layers)).cuda()
+
+##########################################################
+# ``batch_size``, ``in_size``, ``out_size``, and ``num_layers`` are chosen to be large enough to saturate the GPU with work.
+# Typically, mixed precision provides the greatest speedup when the GPU is saturated.
+# Small networks may be CPU bound, in which case mixed precision won't improve performance.
+# Sizes are also chosen such that linear layers' participating dimensions are multiples of 8,
+# to permit Tensor Core usage on Tensor Core-capable GPUs (see :ref:`Troubleshooting<troubleshooting>` below).
+#
+# Exercise: Vary participating sizes and see how the mixed precision speedup changes.
+
+batch_size = 512 # Try, for example, 128, 256, 513.
+in_size = 4096
+out_size = 4096
+num_layers = 3
+num_batches = 50
+epochs = 3
+
+# Creates data in default precision.
+# The same data is used for both default and mixed precision trials below.
+# You don't need to manually change inputs' dtype when enabling mixed precision.
+data = [torch.randn(batch_size, in_size, device="cuda") for _ in range(num_batches)]
+targets = [torch.randn(batch_size, out_size, device="cuda") for _ in range(num_batches)]
+
+loss_fn = torch.nn.MSELoss().cuda()
+
+##########################################################
+# Default Precision
+# -----------------
+# Without ``torch.cuda.amp``, the following simple network executes all ops in default precision (``torch.float32``):
+
+net = make_model(in_size, out_size, num_layers)
+opt = torch.optim.SGD(net.parameters(), lr=0.001)
+
+start_timer()
+for epoch in range(epochs):
+    for input, target in zip(data, targets):
+        output = net(input)
+        loss = loss_fn(output, target)
+        loss.backward()
+        opt.step()
+        opt.zero_grad() # set_to_none=True here can modestly improve performance
+end_timer_and_print("Default precision:")
+
+##########################################################
+# Adding autocast
+# ---------------
+# Instances of `torch.cuda.amp.autocast <https://pytorch.org/docs/stable/amp.html#autocasting>`_
+# serve as context managers that allow regions of your script to run in mixed precision.
+#
+# In these regions, CUDA ops run in a dtype chosen by autocast
+# to improve performance while maintaining accuracy.
+# See the `Autocast Op Reference <https://pytorch.org/docs/stable/amp.html#autocast-op-reference>`_
+# for details on what precision autocast chooses for each op, and under what circumstances.
+
+for epoch in range(0): # 0 epochs, this section is for illustration only
+    for input, target in zip(data, targets):
+        # Runs the forward pass under autocast.
+        with torch.cuda.amp.autocast():
+            output = net(input)
+            # output is float16 because linear layers autocast to float16.
+            assert output.dtype is torch.float16
+
+            loss = loss_fn(output, target)
+            # loss is float32 because the mse_loss op autocasts to float32.
+            assert loss.dtype is torch.float32
+
+        # Exits autocast before backward().
+        # Backward passes under autocast are not recommended.
+        # Backward ops run in the same dtype autocast chose for corresponding forward ops.
+        loss.backward()
+        opt.step()
+        opt.zero_grad() # set_to_none=True here can modestly improve performance
+
+##########################################################
+# Adding GradScaler
+# -----------------
+# `Gradient scaling <https://pytorch.org/docs/stable/amp.html#gradient-scaling>`_
+# helps prevent gradients with small magnitudes from flushing to zero
+# ("underflowing") when training with mixed precision.
+#
+# `torch.cuda.amp.GradScaler <https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler>`_
+# performs the steps of gradient scaling conveniently.
+
+# Constructs scaler once, at the beginning of the convergence run, using default args.
+# If your network fails to converge with default GradScaler args, please file an issue.
+# The same GradScaler instance should be used for the entire convergence run.
+# If you perform multiple convergence runs in the same script, each run should use
+# a dedicated fresh GradScaler instance.  GradScaler instances are lightweight.
+scaler = torch.cuda.amp.GradScaler()
+
+for epoch in range(0): # 0 epochs, this section is for illustration only
+    for input, target in zip(data, targets):
+        with torch.cuda.amp.autocast():
+            output = net(input)
+            loss = loss_fn(output, target)
+
+        # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
+        scaler.scale(loss).backward()
+
+        # scaler.step() first unscales the gradients of the optimizer's assigned params.
+        # If these gradients do not contain infs or NaNs, optimizer.step() is then called;
+        # otherwise, optimizer.step() is skipped.
+        scaler.step(opt)
+
+        # Updates the scale for next iteration.
+        scaler.update()
+
+        opt.zero_grad() # set_to_none=True here can modestly improve performance
+
+##########################################################
+# All together: "Automatic Mixed Precision"
+# ------------------------------------------
+# (The following also demonstrates ``enabled``, an optional convenience argument to ``autocast`` and ``GradScaler``.
+# If False, ``autocast`` and ``GradScaler``\ 's calls become no-ops.
+# This allows switching between default precision and mixed precision without if/else statements.)
+
+use_amp = True
+
+net = make_model(in_size, out_size, num_layers)
+opt = torch.optim.SGD(net.parameters(), lr=0.001)
+scaler = torch.cuda.amp.GradScaler(enabled=use_amp)
+
+start_timer()
+for epoch in range(epochs):
+    for input, target in zip(data, targets):
+        with torch.cuda.amp.autocast(enabled=use_amp):
+            output = net(input)
+            loss = loss_fn(output, target)
+        scaler.scale(loss).backward()
+        scaler.step(opt)
+        scaler.update()
+        opt.zero_grad() # set_to_none=True here can modestly improve performance
+end_timer_and_print("Mixed precision:")
+
+##########################################################
+# Inspecting/modifying gradients (e.g., clipping)
+# --------------------------------------------------------
+# All gradients produced by ``scaler.scale(loss).backward()`` are scaled.  If you wish to modify or inspect
+# the parameters' ``.grad`` attributes between ``backward()`` and ``scaler.step(optimizer)``, you should
+# unscale them first using `scaler.unscale_(optimizer) <https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler.unscale_>`_.
+
+for epoch in range(0): # range(0) never iterates: this section is for illustration only
+    for input, target in zip(data, targets):
+        with torch.cuda.amp.autocast():
+            output = net(input)
+            loss = loss_fn(output, target)
+        scaler.scale(loss).backward()  # the .grad attributes produced here are still scaled
+
+        # Unscale the gradients of the optimizer's assigned params in-place.
+        scaler.unscale_(opt)
+
+        # The gradients of the optimizer's assigned params are now unscaled, so clip as usual.
+        # You may use the same value for max_norm here as you would without gradient scaling.
+        torch.nn.utils.clip_grad_norm_(net.parameters(), max_norm=0.1)
+
+        scaler.step(opt)
+        scaler.update()
+        opt.zero_grad() # set_to_none=True here can modestly improve performance
+
+##########################################################
+# Saving/Resuming
+# ----------------
+# To save/resume Amp-enabled runs with bitwise accuracy, use
+# `scaler.state_dict <https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler.state_dict>`_ and
+# `scaler.load_state_dict <https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler.load_state_dict>`_.
+#
+# When saving, save the scaler state dict alongside the usual model and optimizer state dicts.
+# Do this either at the beginning of an iteration before any forward passes, or at the end of
+# an iteration after ``scaler.update()``.
+
+checkpoint = {"model": net.state_dict(),
+              "optimizer": opt.state_dict(),
+              "scaler": scaler.state_dict()}  # scaler state is saved alongside model and optimizer
+# Write the checkpoint as desired, e.g.,
+# torch.save(checkpoint, "filename")
+
+##########################################################
+# When resuming, load the scaler state dict alongside the model and optimizer state dicts.
+
+# Read the checkpoint as desired, e.g.,
+# dev = torch.cuda.current_device()
+# checkpoint = torch.load("filename",
+#                         map_location = lambda storage, loc: storage.cuda(dev))
+net.load_state_dict(checkpoint["model"])  # torch.load above is commented out, so this reuses the in-memory dict
+opt.load_state_dict(checkpoint["optimizer"])
+scaler.load_state_dict(checkpoint["scaler"])  # ignore the saved scaler state when resuming without Amp
+
+##########################################################
+# If a checkpoint was created from a run *without* Amp, and you want to resume training *with* Amp,
+# load model and optimizer states from the checkpoint as usual.  The checkpoint won't contain a saved scaler state, so
+# use a fresh instance of ``GradScaler``.
+#
+# If a checkpoint was created from a run *with* Amp and you want to resume training *without* Amp,
+# load model and optimizer states from the checkpoint as usual, and ignore the saved scaler state.
+
+##########################################################
+# Inference/Evaluation
+# --------------------
+# ``autocast`` may be used by itself to wrap inference or evaluation forward passes. ``GradScaler`` is not necessary.
+
+##########################################################
+# .. _advanced-topics:
+#
+# Advanced topics
+# ---------------
+# See the `Automatic Mixed Precision Examples <https://pytorch.org/docs/stable/notes/amp_examples.html>`_ for advanced use cases including:
+#
+# * Gradient accumulation
+# * Gradient penalty/double backward
+# * Networks with multiple models, optimizers, or losses
+# * Multiple GPUs (``torch.nn.DataParallel`` or ``torch.nn.parallel.DistributedDataParallel``)
+# * Custom autograd functions (subclasses of ``torch.autograd.Function``)
+#
+# If you perform multiple convergence runs in the same script, each run should use
+# a dedicated fresh GradScaler instance.  GradScaler instances are lightweight.
+#
+# If you're registering a custom C++ op with the dispatcher, see the
+# `autocast section <https://pytorch.org/tutorials/advanced/dispatcher.html#autocast>`_
+# of the dispatcher tutorial.
+
+##########################################################
+# .. _troubleshooting:
+#
+# Troubleshooting
+# ---------------
+# Speedup with Amp is minor
+# ~~~~~~~~~~~~~~~~~~~~~~~~~
+# 1. Your network may fail to saturate the GPU(s) with work, and is therefore CPU bound. Amp's effect on GPU performance
+#    won't matter.
+#
+#    * A rough rule of thumb to saturate the GPU is to increase batch and/or network size(s)
+#      as much as you can without running out of memory (OOM).
+#    * Try to avoid excessive CPU-GPU synchronization (``.item()`` calls, or printing values from CUDA tensors).
+#    * Try to avoid sequences of many small CUDA ops (coalesce these into a few large CUDA ops if you can).
+# 2. Your network may be GPU compute bound (lots of matmuls/convolutions) but your GPU does not have Tensor Cores.
+#    In this case a reduced speedup is expected.
+# 3. Matmul dimensions are not Tensor Core-friendly.  Make sure matmuls' participating sizes are multiples of 8.
+#    (For NLP models with encoders/decoders, this can be subtle.  Also, convolutions used to have similar size constraints
+#    for Tensor Core use, but for cuDNN versions 7.3 and later, no such constraints exist.  See
+#    `here <https://github.com/NVIDIA/apex/issues/221#issuecomment-478084841>`_ for guidance.)
+#
+# Loss is inf/NaN
+# ~~~~~~~~~~~~~~~
+# First, check if your network fits an :ref:`advanced use case<advanced-topics>`.
+# See also `Prefer binary_cross_entropy_with_logits over binary_cross_entropy <https://pytorch.org/docs/stable/amp.html#prefer-binary-cross-entropy-with-logits-over-binary-cross-entropy>`_.
+#
+# If you're confident your Amp usage is correct, you may need to file an issue, but before doing so, it's helpful to gather the following information:
+#
+# 1. Disable ``autocast`` or ``GradScaler`` individually (by passing ``enabled=False`` to their constructor) and see if infs/NaNs persist.
+# 2. If you suspect part of your network (e.g., a complicated loss function) overflows, run that forward region in ``float32``
+#    and see if infs/NaNs persist.
+#    `The autocast docstring <https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.autocast>`_'s last code snippet
+#    shows forcing a subregion to run in ``float32`` (by locally disabling autocast and casting the subregion's inputs).
+#
+# Type mismatch error (may manifest as CUDNN_STATUS_BAD_PARAM)
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# Autocast tries to cover all ops that benefit from or require casting.
+# `Ops that receive explicit coverage <https://pytorch.org/docs/stable/amp.html#autocast-op-reference>`_
+# are chosen based on numerical properties, but also on experience.
+# If you see a type mismatch error in an autocast-enabled forward region or a backward pass following that region,
+# it's possible autocast missed an op.
+#
+# Please file an issue with the error backtrace.  Run ``export TORCH_SHOW_CPP_STACKTRACES=1`` before running your script to provide
+# fine-grained information on which backend op is failing.
diff --git a/recipes_source/recipes_index.rst b/recipes_source/recipes_index.rst
index 86438135e1d..f8986363092 100644
--- a/recipes_source/recipes_index.rst
+++ b/recipes_source/recipes_index.rst
@@ -167,6 +167,15 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu
    :link: ../recipes/android_native_app_with_custom_op.html
    :tags: Mobile
 
+.. Automatic Mixed Precision
+
+.. customcarditem::
+   :header: Automatic Mixed Precision
+   :card_description: Use torch.cuda.amp to reduce runtime and save memory on NVIDIA GPUs.
+   :image: ../_static/img/thumbnails/cropped/amp.png
+   :link: ../recipes/recipes/amp_recipe.html
+   :tags: Model-Optimization
+
 .. End of tutorial card section
 
 .. raw:: html
@@ -199,6 +208,7 @@ Recipes are bite-sized, actionable examples of how to use specific PyTorch featu
    /recipes/recipes/Captum_Recipe
    /recipes/recipes/tensorboard_with_pytorch
    /recipes/recipes/dynamic_quantization
+   /recipes/recipes/amp_recipe
    /recipes/torchscript_inference
    /recipes/deployment_with_flask
    /recipes/distributed_rpc_profiling