From 5ab05e42c980467ccfbd4cce6057bf2a03e5de55 Mon Sep 17 00:00:00 2001
From: Sergei Lebedev
Date: Mon, 25 Sep 2023 14:44:27 +0100
Subject: [PATCH] MAINT Clean up leftover `Array = Any` aliases in
 jax/_src/**.py

I had to revert to using `Any` for `RaggedAxis.ragged_axes` because pytype
found more latent type errors, which require an understanding of raggedness
and dynamic-shape internals to fix properly.
---
 jax/BUILD                             |   1 +
 jax/_src/interpreters/batching.py     |  10 +-
 jax/_src/lax/ann.py                   |   5 +-
 jax/_src/lax/control_flow/for_loop.py |   2 +-
 jax/_src/lax/control_flow/loops.py    |   2 +-
 jax/_src/lax/convolution.py           |  22 ++--
 jax/_src/lax/windowed_reductions.py   |   5 +-
 jax/_src/nn/functions.py              | 165 +++++++++++++++-----------
 jax/_src/nn/initializers.py           |  11 +-
 jax/_src/ops/scatter.py               |  24 ++--
 jax/_src/scipy/optimize/_lbfgs.py     |   5 +-
 jax/_src/state/types.py               |   3 +-
 12 files changed, 138 insertions(+), 117 deletions(-)

diff --git a/jax/BUILD b/jax/BUILD
index cad6d7d9acb4..06be18b9e98a 100644
--- a/jax/BUILD
+++ b/jax/BUILD
@@ -666,6 +666,7 @@ pytype_strict_library(
         ":core",
         ":effects",
         ":pretty_printer",
+        ":typing",
         ":util",
     ],
 )
diff --git a/jax/_src/interpreters/batching.py b/jax/_src/interpreters/batching.py
index 84e616a62d63..843f29fe395f 100644
--- a/jax/_src/interpreters/batching.py
+++ b/jax/_src/interpreters/batching.py
@@ -33,12 +33,12 @@
 from jax._src.interpreters import partial_eval as pe
 from jax._src.tree_util import (tree_unflatten, tree_flatten,
                                 register_pytree_node)
+from jax._src.typing import Array
 from jax._src.util import (unzip2, unzip3, safe_map, safe_zip, split_list,
                            canonicalize_axis, moveaxis, as_hashable_function,
                            curry, memoize, weakref_lru_cache)


-Array = Any

 map, unsafe_map = safe_map, map
 zip, unsafe_zip = safe_zip, zip
@@ -116,7 +116,7 @@ class RaggedAxis:
   # For each axis, we store its index and the corresponding segment lengths.
   # For example, the jumble i:(Fin 3) => f32[lens1.i, 7, lens2.i]
   # would be represented with ragged_axes = [(1, lens1), (3, lens2)]
-  ragged_axes: tuple[tuple[int, Array], ...]
+  ragged_axes: tuple[tuple[int, Any], ...]
@property def size(self): @@ -148,8 +148,10 @@ def _sorted_ragged_axis(stacked_axis, ragged_axes): return RaggedAxis(stacked_axis, tuple(sorted(ragged_axes, key=lambda p: p[0]))) def make_batch_axis( - ndim: int, stacked_axis: int, ragged_axes: list[tuple[int, Array]] - ) -> int | RaggedAxis: + ndim: int, + stacked_axis: int, + ragged_axes: list[tuple[int, Array | core.Var]], +) -> int | RaggedAxis: if ragged_axes: canonical = [(canonicalize_axis(ax, ndim), sz) for ax, sz in ragged_axes] return _sorted_ragged_axis(canonicalize_axis(stacked_axis, ndim), canonical) diff --git a/jax/_src/lax/ann.py b/jax/_src/lax/ann.py index ac1b5aec103c..22abf2c6b0a4 100644 --- a/jax/_src/lax/ann.py +++ b/jax/_src/lax/ann.py @@ -70,7 +70,6 @@ def pmap_mips(qy, db, db_offset, db_size, k, recall_target): """ from functools import partial -from typing import Any import numpy as np @@ -88,9 +87,7 @@ def pmap_mips(qy, db, db_offset, db_size, k, recall_target): from jax._src.lib.mlir import ir from jax._src.lib.mlir.dialects import func from jax._src.lib.mlir.dialects import hlo - - -Array = Any +from jax._src.typing import Array def approx_max_k(operand: Array, diff --git a/jax/_src/lax/control_flow/for_loop.py b/jax/_src/lax/control_flow/for_loop.py index 4a93b8b68ba2..d4635a8660b7 100644 --- a/jax/_src/lax/control_flow/for_loop.py +++ b/jax/_src/lax/control_flow/for_loop.py @@ -39,6 +39,7 @@ from jax._src.state import primitives as state_primitives from jax._src.state import utils as state_utils from jax._src.state import types as state_types +from jax._src.typing import Array from jax._src.util import (partition_list, merge_lists, safe_map, safe_zip, split_list, split_dict) from jax._src.lax.control_flow import loops @@ -53,7 +54,6 @@ S = TypeVar('S') T = TypeVar('T') class Ref(Generic[T]): pass -Array = Any ref_set = state_primitives.ref_set ref_get = state_primitives.ref_get diff --git a/jax/_src/lax/control_flow/loops.py b/jax/_src/lax/control_flow/loops.py index bf118b83c817..823d7c7e2933 100644 --- a/jax/_src/lax/control_flow/loops.py +++ b/jax/_src/lax/control_flow/loops.py @@ -52,6 +52,7 @@ from jax._src.state import discharge as state_discharge from jax._src.numpy.ufuncs import logaddexp from jax._src.traceback_util import api_boundary +from jax._src.typing import Array from jax._src.util import (partition_list, safe_map, safe_zip, split_list, unzip2, weakref_lru_cache, merge_lists) import numpy as np @@ -64,7 +65,6 @@ zip = safe_zip T = TypeVar('T') -Array = Any BooleanNumeric = Any # A bool, or a Boolean array. ### Helper functions diff --git a/jax/_src/lax/convolution.py b/jax/_src/lax/convolution.py index b533508dc3dd..0aed39ce9533 100644 --- a/jax/_src/lax/convolution.py +++ b/jax/_src/lax/convolution.py @@ -12,11 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import builtins from collections.abc import Sequence from functools import partial import operator -from typing import Any, NamedTuple, Optional, Union +from typing import NamedTuple, Optional, Union import numpy as np @@ -28,14 +27,9 @@ from jax._src.interpreters import mlir from jax._src.lax import lax from jax._src.lib.mlir.dialects import hlo +from jax._src.typing import Array, DTypeLike -_max = builtins.max - -Array = Any -DType = Any -Shape = core.Shape - class ConvDimensionNumbers(NamedTuple): """Describes batch, spatial, and feature dimensions of a convolution. 
@@ -62,7 +56,7 @@ def conv_general_dilated( dimension_numbers: ConvGeneralDilatedDimensionNumbers = None, feature_group_count: int = 1, batch_group_count: int = 1, precision: lax.PrecisionLike = None, - preferred_element_type: Optional[DType] = None) -> Array: + preferred_element_type: Optional[DTypeLike] = None) -> Array: """General n-dimensional convolution operator, with optional dilation. Wraps XLA's `Conv @@ -174,7 +168,7 @@ def conv_general_dilated( def conv(lhs: Array, rhs: Array, window_strides: Sequence[int], padding: str, precision: lax.PrecisionLike = None, - preferred_element_type: Optional[DType] = None) -> Array: + preferred_element_type: Optional[DTypeLike] = None) -> Array: """Convenience wrapper around `conv_general_dilated`. Args: @@ -204,7 +198,7 @@ def conv_with_general_padding(lhs: Array, rhs: Array, lhs_dilation: Optional[Sequence[int]], rhs_dilation: Optional[Sequence[int]], precision: lax.PrecisionLike = None, - preferred_element_type: Optional[DType] = None) -> Array: + preferred_element_type: Optional[DTypeLike] = None) -> Array: """Convenience wrapper around `conv_general_dilated`. Args: @@ -256,7 +250,7 @@ def _conv_transpose_padding(k, s, padding): else: pad_a = int(np.ceil(pad_len / 2)) elif padding == 'VALID': - pad_len = k + s - 2 + _max(k - s, 0) + pad_len = k + s - 2 + max(k - s, 0) pad_a = k - 1 else: raise ValueError('Padding mode must be `SAME` or `VALID`.') @@ -277,7 +271,7 @@ def conv_transpose(lhs: Array, rhs: Array, strides: Sequence[int], dimension_numbers: ConvGeneralDilatedDimensionNumbers = None, transpose_kernel: bool = False, precision: lax.PrecisionLike = None, - preferred_element_type: Optional[DType] = None) -> Array: + preferred_element_type: Optional[DTypeLike] = None) -> Array: """Convenience wrapper for calculating the N-d convolution "transpose". 
This function directly calculates a fractionally strided conv rather than @@ -343,7 +337,7 @@ def conv_transpose(lhs: Array, rhs: Array, strides: Sequence[int], if transpose_kernel: # flip spatial dims and swap input / output channel axes rhs = _flip_axes(rhs, np.array(dn.rhs_spec)[2:]) - rhs = np.swapaxes(rhs, dn.rhs_spec[0], dn.rhs_spec[1]) + rhs = rhs.swapaxes(dn.rhs_spec[0], dn.rhs_spec[1]) return conv_general_dilated(lhs, rhs, one, pads, strides, rhs_dilation, dn, precision=precision, preferred_element_type=preferred_element_type) diff --git a/jax/_src/lax/windowed_reductions.py b/jax/_src/lax/windowed_reductions.py index 1ac143b23692..e7fe2a604fd7 100644 --- a/jax/_src/lax/windowed_reductions.py +++ b/jax/_src/lax/windowed_reductions.py @@ -14,7 +14,7 @@ from collections.abc import Sequence from functools import partial -from typing import Any, Callable, Optional, Union +from typing import Callable, Optional, Union import warnings import numpy as np @@ -36,12 +36,11 @@ from jax._src.lib.mlir import ir from jax._src.lib.mlir.dialects import hlo from jax._src.numpy.ufuncs import logaddexp +from jax._src.typing import Array map = util.safe_map zip = util.safe_zip -Array = Any - def reduce_window(operand, init_value, computation: Callable, window_dimensions: core.Shape, window_strides: Sequence[int], diff --git a/jax/_src/nn/functions.py b/jax/_src/nn/functions.py index 69b05d4b0496..62b87412e8cb 100644 --- a/jax/_src/nn/functions.py +++ b/jax/_src/nn/functions.py @@ -28,15 +28,16 @@ from jax._src import dtypes from jax._src import util from jax._src.core import AxisName +from jax._src.numpy import util as numpy_util +from jax._src.typing import Array, ArrayLike from jax._src.ops.special import logsumexp as _logsumexp -Array = Any # activations @custom_jvp @jax.jit -def relu(x: Array) -> Array: +def relu(x: ArrayLike) -> Array: r"""Rectified linear unit activation function. Computes the element-wise function: @@ -72,7 +73,7 @@ def relu(x: Array) -> Array: relu.defjvps(lambda g, ans, x: lax.select(x > 0, g, lax.full_like(g, 0))) @jax.jit -def softplus(x: Array) -> Array: +def softplus(x: ArrayLike) -> Array: r"""Softplus activation function. Computes the element-wise function @@ -86,7 +87,7 @@ def softplus(x: Array) -> Array: return jnp.logaddexp(x, 0) @jax.jit -def soft_sign(x: Array) -> Array: +def soft_sign(x: ArrayLike) -> Array: r"""Soft-sign activation function. Computes the element-wise function @@ -97,10 +98,12 @@ def soft_sign(x: Array) -> Array: Args: x : input array """ - return x / (jnp.abs(x) + 1) + numpy_util.check_arraylike("soft_sign", x) + x_arr = jnp.asarray(x) + return x_arr / (jnp.abs(x_arr) + 1) @jax.jit -def sigmoid(x: Array) -> Array: +def sigmoid(x: ArrayLike) -> Array: r"""Sigmoid activation function. Computes the element-wise function: @@ -121,7 +124,7 @@ def sigmoid(x: Array) -> Array: return lax.logistic(x) @jax.jit -def silu(x: Array) -> Array: +def silu(x: ArrayLike) -> Array: r"""SiLU (a.k.a. swish) activation function. Computes the element-wise function: @@ -140,12 +143,14 @@ def silu(x: Array) -> Array: See also: :func:`sigmoid` """ - return x * sigmoid(x) + numpy_util.check_arraylike("silu", x) + x_arr = jnp.asarray(x) + return x_arr * sigmoid(x_arr) swish = silu @jax.jit -def log_sigmoid(x: Array) -> Array: +def log_sigmoid(x: ArrayLike) -> Array: r"""Log-sigmoid activation function. 
Computes the element-wise function: @@ -162,10 +167,12 @@ def log_sigmoid(x: Array) -> Array: See also: :func:`sigmoid` """ - return -softplus(-x) + numpy_util.check_arraylike("log_sigmoid", x) + x_arr = jnp.asarray(x) + return -softplus(-x_arr) @jax.jit -def elu(x: Array, alpha: Array = 1.0) -> Array: +def elu(x: ArrayLike, alpha: ArrayLike = 1.0) -> Array: r"""Exponential linear unit activation function. Computes the element-wise function: @@ -186,11 +193,14 @@ def elu(x: Array, alpha: Array = 1.0) -> Array: See also: :func:`selu` """ - safe_x = jnp.where(x > 0, 0., x) - return jnp.where(x > 0, x, alpha * jnp.expm1(safe_x)) + numpy_util.check_arraylike("elu", x) + x_arr = jnp.asarray(x) + return jnp.where(x_arr > 0, + x_arr, + alpha * jnp.expm1(jnp.where(x_arr > 0, 0., x_arr))) @jax.jit -def leaky_relu(x: Array, negative_slope: Array = 1e-2) -> Array: +def leaky_relu(x: ArrayLike, negative_slope: ArrayLike = 1e-2) -> Array: r"""Leaky rectified linear unit activation function. Computes the element-wise function: @@ -213,10 +223,12 @@ def leaky_relu(x: Array, negative_slope: Array = 1e-2) -> Array: See also: :func:`relu` """ - return jnp.where(x >= 0, x, negative_slope * x) + numpy_util.check_arraylike("leaky_relu", x) + x_arr = jnp.asarray(x) + return jnp.where(x_arr >= 0, x_arr, negative_slope * x_arr) @jax.jit -def hard_tanh(x: Array) -> Array: +def hard_tanh(x: ArrayLike) -> Array: r"""Hard :math:`\mathrm{tanh}` activation function. Computes the element-wise function: @@ -234,10 +246,12 @@ def hard_tanh(x: Array) -> Array: Returns: An array. """ - return jnp.where(x > 1, 1, jnp.where(x < -1, -1, x)) + numpy_util.check_arraylike("hard_tanh", x) + x_arr = jnp.asarray(x) + return jnp.where(x_arr > 1, 1, jnp.where(x_arr < -1, -1, x_arr)) @jax.jit -def celu(x: Array, alpha: Array = 1.0) -> Array: +def celu(x: ArrayLike, alpha: ArrayLike = 1.0) -> Array: r"""Continuously-differentiable exponential linear unit activation. Computes the element-wise function: @@ -262,7 +276,7 @@ def celu(x: Array, alpha: Array = 1.0) -> Array: return jnp.maximum(x, 0.0) + alpha * jnp.expm1(jnp.minimum(x, 0.0) / alpha) @jax.jit -def selu(x: Array) -> Array: +def selu(x: ArrayLike) -> Array: r"""Scaled exponential linear unit activation. Computes the element-wise function: @@ -295,7 +309,7 @@ def selu(x: Array) -> Array: # TODO(phawkins): this jit was found to change numerics in a test. Debug this. # @partial(jax.jit, static_argnames=("approximate",)) -def gelu(x: Array, approximate: bool = True) -> Array: +def gelu(x: ArrayLike, approximate: bool = True) -> Array: r"""Gaussian error linear unit activation function. If ``approximate=False``, computes the element-wise function: @@ -317,20 +331,18 @@ def gelu(x: Array, approximate: bool = True) -> Array: x : input array approximate: whether to use the approximate or exact formulation. """ - - # Promote to nearest float-like dtype. 
- x = x.astype(dtypes.to_inexact_dtype(x.dtype)) + [x_arr] = numpy_util.promote_args_inexact("gelu", x) if approximate: - sqrt_2_over_pi = np.sqrt(2 / np.pi).astype(x.dtype) - cdf = 0.5 * (1.0 + jnp.tanh(sqrt_2_over_pi * (x + 0.044715 * (x ** 3)))) - return x * cdf + sqrt_2_over_pi = np.sqrt(2 / np.pi).astype(x_arr.dtype) + cdf = 0.5 * (1.0 + jnp.tanh(sqrt_2_over_pi * (x_arr + 0.044715 * (x_arr ** 3)))) + return x_arr * cdf else: - sqrt_2 = np.sqrt(2).astype(x.dtype) - return jnp.array(x * (lax.erf(x / sqrt_2) + 1) / 2, dtype=x.dtype) + sqrt_2 = np.sqrt(2).astype(x_arr.dtype) + return jnp.array(x_arr * (lax.erf(x_arr / sqrt_2) + 1) / 2, dtype=x_arr.dtype) @partial(jax.jit, static_argnames=("axis",)) -def glu(x: Array, axis: int = -1) -> Array: +def glu(x: ArrayLike, axis: int = -1) -> Array: r"""Gated linear unit activation function. Computes the function: @@ -353,9 +365,11 @@ def glu(x: Array, axis: int = -1) -> Array: See also: :func:`sigmoid` """ - size = x.shape[axis] + numpy_util.check_arraylike("glu", x) + x_arr = jnp.asarray(x) + size = x_arr.shape[axis] assert size % 2 == 0, "axis size must be divisible by 2" - x1, x2 = jnp.split(x, 2, axis) + x1, x2 = jnp.split(x_arr, 2, axis) return x1 * sigmoid(x2) # other functions @@ -364,10 +378,10 @@ def glu(x: Array, axis: int = -1) -> Array: @partial(jax.jit, static_argnames=("axis",)) -def log_softmax(x: Array, +def log_softmax(x: ArrayLike, axis: Optional[Union[int, tuple[int, ...]]] = -1, - where: Optional[Array] = None, - initial: Optional[Array] = None) -> Array: + where: Optional[ArrayLike] = None, + initial: Optional[ArrayLike] = None) -> Array: r"""Log-Softmax function. Computes the logarithm of the :code:`softmax` function, which rescales @@ -391,8 +405,10 @@ def log_softmax(x: Array, See also: :func:`softmax` """ - x_max = jnp.max(x, axis, where=where, initial=initial, keepdims=True) - shifted = x - lax.stop_gradient(x_max) + numpy_util.check_arraylike("log_softmax", x) + x_arr = jnp.asarray(x) + x_max = jnp.max(x_arr, axis, where=where, initial=initial, keepdims=True) + shifted = x_arr - lax.stop_gradient(x_max) shifted_logsumexp = jnp.log( jnp.sum(jnp.exp(shifted), axis, where=where, keepdims=True)) result = shifted - shifted_logsumexp @@ -403,10 +419,10 @@ def log_softmax(x: Array, # TODO(phawkins): this jit was found to change numerics in a test. Debug this. #@partial(jax.jit, static_argnames=("axis",)) -def softmax(x: Array, +def softmax(x: ArrayLike, axis: Optional[Union[int, tuple[int, ...]]] = -1, - where: Optional[Array] = None, - initial: Optional[Array] = None) -> Array: + where: Optional[ArrayLike] = None, + initial: Optional[ArrayLike] = None) -> Array: r"""Softmax function. Computes the function which rescales elements to the range :math:`[0, 1]` @@ -431,17 +447,20 @@ def softmax(x: Array, :func:`log_softmax` """ if jax.config.jax_softmax_custom_jvp: - return _softmax(x, axis, where, initial) + # mypy is confused by the `functools.partial` application in the definition + # of `_softmax` and incorrectly concludes that `_softmax` returns + # `ReturnValue` -- the unsubstituted type parameter of `custom_jvp`. 
+ return _softmax(x, axis, where, initial) # type: ignore[return-value] else: return _softmax_deprecated(x, axis, where, initial) # TODO(mattjj): replace softmax with _softmax when deprecation flag is removed @partial(jax.custom_jvp, nondiff_argnums=(1,)) def _softmax( - x, + x: ArrayLike, axis: Optional[Union[int, tuple[int, ...]]] = -1, - where: Optional[Array] = None, - initial: Optional[Array] = None) -> Array: + where: Optional[ArrayLike] = None, + initial: Optional[ArrayLike] = None) -> Array: x_max = jnp.max(x, axis, where=where, initial=initial, keepdims=True) unnormalized = jnp.exp(x - x_max) result = unnormalized / jnp.sum(unnormalized, axis, where=where, keepdims=True) @@ -455,7 +474,11 @@ def _softmax_jvp(axis, primals, tangents): y = _softmax(x, axis, where, initial) return y, y * (x_dot - (y * x_dot).sum(axis, where=where, keepdims=True)) -def _softmax_deprecated(x, axis, where, initial): +def _softmax_deprecated( + x: ArrayLike, + axis: Optional[Union[int, tuple[int, ...]]] = -1, + where: Optional[ArrayLike] = None, + initial: Optional[ArrayLike] = None) -> Array: x_max = jnp.max(x, axis, where=where, initial=initial, keepdims=True) unnormalized = jnp.exp(x - lax.stop_gradient(x_max)) result = unnormalized / jnp.sum(unnormalized, axis, where=where, keepdims=True) @@ -465,13 +488,15 @@ def _softmax_deprecated(x, axis, where, initial): @partial(jax.jit, static_argnames=("axis",)) -def standardize(x: Array, +def standardize(x: ArrayLike, axis: Optional[Union[int, tuple[int, ...]]] = -1, - mean: Optional[Array] = None, - variance: Optional[Array] = None, - epsilon: Array = 1e-5, - where: Optional[Array] = None) -> Array: + mean: Optional[ArrayLike] = None, + variance: Optional[ArrayLike] = None, + epsilon: ArrayLike = 1e-5, + where: Optional[ArrayLike] = None) -> Array: r"""Normalizes an array by subtracting ``mean`` and dividing by :math:`\sqrt{\mathrm{variance}}`.""" + numpy_util.check_arraylike("standardize", x) + numpy_util.check_arraylike_or_none("standardize", mean, variance, where) if mean is None: mean = jnp.mean(x, axis, keepdims=True, where=where) if variance is None: @@ -481,43 +506,45 @@ def standardize(x: Array, # when used in neural network normalization layers variance = jnp.mean( jnp.square(x), axis, keepdims=True, where=where) - jnp.square(mean) - return (x - mean) * lax.rsqrt(variance + epsilon) + return jnp.subtract(x, jnp.asarray(mean)) * lax.rsqrt(jnp.asarray(variance) + epsilon) -def normalize(x: Array, - axis: Optional[Union[int, tuple[int, ...]]] = -1, - mean: Optional[Array] = None, - variance: Optional[Array] = None, - epsilon: Array = 1e-5, - where: Optional[Array] = None) -> Array: +def normalize(x: ArrayLike, + axis: Optional[Union[int, tuple[int, ...]]] = -1, + mean: Optional[ArrayLike] = None, + variance: Optional[ArrayLike] = None, + epsilon: ArrayLike = 1e-5, + where: Optional[ArrayLike] = None) -> Array: r"""Normalizes an array by subtracting ``mean`` and dividing by :math:`\sqrt{\mathrm{variance}}`.""" warnings.warn("jax.nn.normalize will be deprecated. Use jax.nn.standardize instead.", DeprecationWarning) return standardize(x, axis, mean, variance, epsilon, where) +# TODO(slebedev): Change the type of `x` to `ArrayLike`. 
@partial(jax.jit, static_argnames=("num_classes", "dtype", "axis")) -def _one_hot(x: Array, num_classes: int, *, +def _one_hot(x: Any, num_classes: int, *, dtype: Any, axis: Union[int, AxisName]) -> Array: num_classes = core.concrete_dim_or_error( num_classes, "The error arose in jax.nn.one_hot argument `num_classes`.") dtype = dtypes.canonicalize_dtype(dtype) - x = jnp.asarray(x) + x_arr = jnp.asarray(x) try: - output_pos_axis = util.canonicalize_axis(axis, x.ndim + 1) + output_pos_axis = util.canonicalize_axis(axis, x_arr.ndim + 1) except TypeError: axis_size = lax.psum(1, axis) if num_classes != axis_size: raise ValueError(f"Expected num_classes to match the size of axis {axis}, " f"but {num_classes} != {axis_size}") from None axis_idx = lax.axis_index(axis) - return jnp.asarray(x == axis_idx, dtype=dtype) + return jnp.asarray(x_arr == axis_idx, dtype=dtype) axis = operator.index(axis) # type: ignore[arg-type] - lhs = lax.expand_dims(x, (axis,)) - rhs_shape = [1] * x.ndim + lhs = lax.expand_dims(x_arr, (axis,)) + rhs_shape = [1] * x_arr.ndim rhs_shape.insert(output_pos_axis, num_classes) - rhs = lax.broadcasted_iota(x.dtype, rhs_shape, output_pos_axis) + rhs = lax.broadcasted_iota(x_arr.dtype, rhs_shape, output_pos_axis) return jnp.asarray(lhs == rhs, dtype=dtype) -def one_hot(x: Array, num_classes: int, *, +# TODO(slebedev): Change the type of `x` to `ArrayLike`. +def one_hot(x: Any, num_classes: int, *, dtype: Any = jnp.float_, axis: Union[int, AxisName] = -1) -> Array: """One-hot encodes the given indices. @@ -550,7 +577,7 @@ def one_hot(x: Array, num_classes: int, *, @jax.custom_jvp @jax.jit -def relu6(x: Array) -> Array: +def relu6(x: ArrayLike) -> Array: r"""Rectified Linear Unit 6 activation function. Computes the element-wise function @@ -582,7 +609,7 @@ def relu6(x: Array) -> Array: lax.select((x > 0) & (x < 6), g, lax.full_like(g, 0))) @jax.jit -def hard_sigmoid(x: Array) -> Array: +def hard_sigmoid(x: ArrayLike) -> Array: r"""Hard Sigmoid activation function. Computes the element-wise function @@ -602,7 +629,7 @@ def hard_sigmoid(x: Array) -> Array: return relu6(x + 3.) / 6. @jax.jit -def hard_silu(x: Array) -> Array: +def hard_silu(x: ArrayLike) -> Array: r"""Hard SiLU (swish) activation function Computes the element-wise function @@ -622,6 +649,8 @@ def hard_silu(x: Array) -> Array: See also: :func:`hard_sigmoid` """ - return x * hard_sigmoid(x) + numpy_util.check_arraylike("hard_silu", x) + x_arr = jnp.asarray(x) + return x_arr * hard_sigmoid(x_arr) hard_swish = hard_silu diff --git a/jax/_src/nn/initializers.py b/jax/_src/nn/initializers.py index add524b73b0b..297c330773a8 100644 --- a/jax/_src/nn/initializers.py +++ b/jax/_src/nn/initializers.py @@ -23,18 +23,17 @@ import numpy as np -import jax import jax.numpy as jnp from jax import lax from jax import random from jax._src import core from jax._src import dtypes +from jax._src.typing import Array, ArrayLike from jax._src.util import set_module export = set_module('jax.nn.initializers') -KeyArray = jax.Array -Array = Any +KeyArray = Array # TODO: Import or define these to match # https://github.com/numpy/numpy/blob/main/numpy/typing/_dtype_like.py. DTypeLikeFloat = Any @@ -48,7 +47,7 @@ class Initializer(Protocol): def __call__(key: KeyArray, shape: core.Shape, dtype: DTypeLikeInexact = jnp.float_) -> Array: - ... 
+ raise NotImplementedError @export def zeros(key: KeyArray, @@ -82,7 +81,7 @@ def ones(key: KeyArray, return jnp.ones(shape, dtypes.canonicalize_dtype(dtype)) @export -def constant(value: Array, +def constant(value: ArrayLike, dtype: DTypeLikeInexact = jnp.float_ ) -> Initializer: """Builds an initializer that returns arrays full of a constant ``value``. @@ -240,7 +239,7 @@ def _complex_uniform(key: KeyArray, theta = 2 * jnp.pi * random.uniform(key_theta, shape, real_dtype).astype(dtype) return r * jnp.exp(1j * theta) -def _complex_truncated_normal(key: KeyArray, upper: Array, +def _complex_truncated_normal(key: KeyArray, upper: ArrayLike, shape: Union[Sequence[int], core.NamedShape], dtype: DTypeLikeInexact) -> Array: """ diff --git a/jax/_src/ops/scatter.py b/jax/_src/ops/scatter.py index 0fa832aac155..8101199906f6 100644 --- a/jax/_src/ops/scatter.py +++ b/jax/_src/ops/scatter.py @@ -16,7 +16,7 @@ from collections.abc import Sequence import sys -from typing import Any, Callable, Optional, Union +from typing import Callable, Optional, Union import warnings import numpy as np @@ -31,9 +31,9 @@ from jax._src.numpy import lax_numpy as jnp from jax._src.numpy import reductions from jax._src.numpy.util import check_arraylike, promote_dtypes +from jax._src.typing import Array, ArrayLike -Array = Any if sys.version_info >= (3, 10): from types import EllipsisType SingleIndex = Union[None, int, slice, Sequence[int], Array, EllipsisType] @@ -154,8 +154,8 @@ def _get_identity(op, dtype): def _segment_update(name: str, - data: Array, - segment_ids: Array, + data: ArrayLike, + segment_ids: ArrayLike, scatter_op: Callable, num_segments: Optional[int] = None, indices_are_sorted: bool = False, @@ -195,8 +195,8 @@ def _segment_update(name: str, return reducer(out, axis=0).astype(dtype) -def segment_sum(data: Array, - segment_ids: Array, +def segment_sum(data: ArrayLike, + segment_ids: ArrayLike, num_segments: Optional[int] = None, indices_are_sorted: bool = False, unique_indices: bool = False, @@ -250,8 +250,8 @@ def segment_sum(data: Array, indices_are_sorted, unique_indices, bucket_size, reductions.sum, mode=mode) -def segment_prod(data: Array, - segment_ids: Array, +def segment_prod(data: ArrayLike, + segment_ids: ArrayLike, num_segments: Optional[int] = None, indices_are_sorted: bool = False, unique_indices: bool = False, @@ -306,8 +306,8 @@ def segment_prod(data: Array, indices_are_sorted, unique_indices, bucket_size, reductions.prod, mode=mode) -def segment_max(data: Array, - segment_ids: Array, +def segment_max(data: ArrayLike, + segment_ids: ArrayLike, num_segments: Optional[int] = None, indices_are_sorted: bool = False, unique_indices: bool = False, @@ -361,8 +361,8 @@ def segment_max(data: Array, indices_are_sorted, unique_indices, bucket_size, reductions.max, mode=mode) -def segment_min(data: Array, - segment_ids: Array, +def segment_min(data: ArrayLike, + segment_ids: ArrayLike, num_segments: Optional[int] = None, indices_are_sorted: bool = False, unique_indices: bool = False, diff --git a/jax/_src/scipy/optimize/_lbfgs.py b/jax/_src/scipy/optimize/_lbfgs.py index 44862eee146b..b50c54ec1700 100644 --- a/jax/_src/scipy/optimize/_lbfgs.py +++ b/jax/_src/scipy/optimize/_lbfgs.py @@ -12,18 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""The Limited-Memory Broyden-Fletcher-Goldfarb-Shanno minimization algorithm.""" -from typing import Any, Callable, NamedTuple, Optional, Union +from typing import Callable, NamedTuple, Optional, Union from functools import partial import jax import jax.numpy as jnp from jax import lax from jax._src.scipy.optimize.line_search import line_search +from jax._src.typing import Array + _dot = partial(jnp.dot, precision=lax.Precision.HIGHEST) -Array = Any class LBFGSResults(NamedTuple): """Results from L-BFGS optimization diff --git a/jax/_src/state/types.py b/jax/_src/state/types.py index be6c39177e8a..e7845f41a615 100644 --- a/jax/_src/state/types.py +++ b/jax/_src/state/types.py @@ -23,14 +23,13 @@ from jax._src import effects from jax._src import pretty_printer as pp from jax._src.util import safe_map, safe_zip +from jax._src.typing import Array ## JAX utilities map, unsafe_map = safe_map, map zip, unsafe_zip = safe_zip, zip -Array = Any - _ref_effect_color = pp.Color.GREEN class RefEffect(effects.JaxprInputEffect):