From a0a563ee0e14cf366fed6f847ccc056649526c99 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Wed, 31 Jul 2024 03:05:37 +0000
Subject: [PATCH] [Bugfix][TPU] Set readonly=True for non-root devices

---
 vllm/worker/tpu_worker.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py
index 17fa5c35457c2..90c86d4e6c59d 100644
--- a/vllm/worker/tpu_worker.py
+++ b/vllm/worker/tpu_worker.py
@@ -104,7 +104,10 @@ def init_device(self) -> None:
         # Use persistent cache to avoid XLA recompilation.
         # NOTE(woosuk): This does not completely eliminate the recompilation
         # overhead because dynamo does not cache the compiled results.
-        xr.initialize_cache(envs.VLLM_XLA_CACHE_PATH, readonly=False)
+        # NOTE(woosuk): Only the driver worker opens the cache writable
+        # (readonly=False); others open it read-only to avoid race conditions.
+        xr.initialize_cache(envs.VLLM_XLA_CACHE_PATH,
+                            readonly=not self.is_driver_worker)
 
     def load_model(self):
         self.model_runner.load_model()