diff --git a/filament/backend/CMakeLists.txt b/filament/backend/CMakeLists.txt
index 6be8a3959ec4..cef040982288 100644
--- a/filament/backend/CMakeLists.txt
+++ b/filament/backend/CMakeLists.txt
@@ -318,6 +318,8 @@ if (FILAMENT_SUPPORTS_WEBGPU)
         src/webgpu/WebGPURenderPrimitive.h
         src/webgpu/WebGPURenderTarget.cpp
         src/webgpu/WebGPURenderTarget.h
+        src/webgpu/WebGPUStagePool.cpp
+        src/webgpu/WebGPUStagePool.h
         src/webgpu/WebGPUStrings.h
         src/webgpu/WebGPUSwapChain.cpp
         src/webgpu/WebGPUSwapChain.h
diff --git a/filament/backend/src/webgpu/WebGPUBufferBase.cpp b/filament/backend/src/webgpu/WebGPUBufferBase.cpp
index 9bfa56a53028..4e038a0078a1 100644
--- a/filament/backend/src/webgpu/WebGPUBufferBase.cpp
+++ b/filament/backend/src/webgpu/WebGPUBufferBase.cpp
@@ -18,6 +18,7 @@
 
 #include "WebGPUConstants.h"
 #include "WebGPUQueueManager.h"
+#include "WebGPUStagePool.h"
 
 #include "DriverBase.h"
 #include
@@ -65,7 +66,7 @@ WebGPUBufferBase::WebGPUBufferBase(wgpu::Device const& device, const wgpu::Buffe
 // of 4 by padding with zeros.
 void WebGPUBufferBase::updateGPUBuffer(BufferDescriptor const& bufferDescriptor,
         const uint32_t byteOffset, wgpu::Device const& device,
-        WebGPUQueueManager* const webGPUQueueManager) {
+        WebGPUQueueManager* const webGPUQueueManager, WebGPUStagePool* const webGPUStagePool) {
     FILAMENT_CHECK_PRECONDITION(bufferDescriptor.buffer)
             << "updateGPUBuffer called with a null buffer";
     FILAMENT_CHECK_PRECONDITION(bufferDescriptor.size + byteOffset <= mBuffer.GetSize())
@@ -85,15 +86,12 @@ void WebGPUBufferBase::updateGPUBuffer(BufferDescriptor const& bufferDescriptor,
 
     const size_t stagingBufferSize = remainder == 0 ? bufferDescriptor.size
                                                     : mainBulk + FILAMENT_WEBGPU_BUFFER_SIZE_MODULUS;
-    // create a staging buffer
-    wgpu::BufferDescriptor descriptor{
-        .label = "Filament WebGPU Staging Buffer",
-        .usage = wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc,
-        .size = stagingBufferSize,
-        .mappedAtCreation = true };
-    wgpu::Buffer stagingBuffer = device.CreateBuffer(&descriptor);
+    wgpu::Buffer stagingBuffer = webGPUStagePool->acquireBuffer(stagingBufferSize,
+            webGPUQueueManager->getLatestSubmissionState());
 
     void* mappedRange = stagingBuffer.GetMappedRange();
+    assert_invariant(mappedRange);
+
     memcpy(mappedRange, bufferDescriptor.buffer, bufferDescriptor.size);
 
     // Make sure the padded memory is set to 0 to have deterministic behaviors
@@ -106,7 +104,9 @@ void WebGPUBufferBase::updateGPUBuffer(BufferDescriptor const& bufferDescriptor,
 
     // Copy the staging buffer contents to the destination buffer.
     webGPUQueueManager->getCommandEncoder().CopyBufferToBuffer(stagingBuffer, 0, mBuffer,
-            byteOffset, stagingBufferSize);
+            byteOffset,
+            remainder == 0 ? bufferDescriptor.size
+                           : mainBulk + FILAMENT_WEBGPU_BUFFER_SIZE_MODULUS);
 }
 
 } // namespace filament::backend
diff --git a/filament/backend/src/webgpu/WebGPUBufferBase.h b/filament/backend/src/webgpu/WebGPUBufferBase.h
index c637ca04f51a..e9a5c978e164 100644
--- a/filament/backend/src/webgpu/WebGPUBufferBase.h
+++ b/filament/backend/src/webgpu/WebGPUBufferBase.h
@@ -25,6 +25,7 @@ namespace filament::backend {
 
 class BufferDescriptor;
 class WebGPUQueueManager;
+class WebGPUStagePool;
 
 /**
  * A base class for WebGPU buffer objects, providing common functionality for creating and
@@ -40,7 +41,7 @@ class WebGPUBufferBase /* intended to be extended */ {
      * ensures the calls happen in the expected sequence.
     */
    void updateGPUBuffer(BufferDescriptor const&, uint32_t byteOffset, wgpu::Device const& device,
-            WebGPUQueueManager* const webGPUQueueManager);
+            WebGPUQueueManager* const webGPUQueueManager, WebGPUStagePool* const webGPUStagePool);
 
    [[nodiscard]] wgpu::Buffer const& getBuffer() const { return mBuffer; }
 
diff --git a/filament/backend/src/webgpu/WebGPUDriver.cpp b/filament/backend/src/webgpu/WebGPUDriver.cpp
index ccb8c187db92..e45ecec0a4cc 100644
--- a/filament/backend/src/webgpu/WebGPUDriver.cpp
+++ b/filament/backend/src/webgpu/WebGPUDriver.cpp
@@ -107,6 +107,7 @@ WebGPUDriver::WebGPUDriver(WebGPUPlatform& platform,
       mAdapter{ mPlatform.requestAdapter(nullptr) },
       mDevice{ mPlatform.requestDevice(mAdapter) },
       mQueueManager{ mDevice },
+      mStagePool{ mDevice },
       mPipelineLayoutCache{ mDevice },
       mPipelineCache{ mDevice },
       mRenderPassMipmapGenerator{ mDevice, &mQueueManager },
@@ -177,6 +178,9 @@ void WebGPUDriver::endFrame(const uint32_t /* frameId */) {
     for (size_t i = 0; i < MAX_DESCRIPTOR_SET_COUNT; i++) {
         mCurrentDescriptorSets[i] = {};
     }
+
+    // Garbage collection (if necessary)
+    mStagePool.gc();
 }
 
 // If a command encoder is in flight then the encoder is finished and submitted to the GPU queue.
@@ -851,7 +855,7 @@ void WebGPUDriver::updateIndexBuffer(Handle<HwIndexBuffer> indexBufferHandle,
     // draw calls are made.
     flush();
     handleCast<WebGPUIndexBuffer>(indexBufferHandle)
-            ->updateGPUBuffer(bufferDescriptor, byteOffset, mDevice, &mQueueManager);
+            ->updateGPUBuffer(bufferDescriptor, byteOffset, mDevice, &mQueueManager, &mStagePool);
     scheduleDestroy(std::move(bufferDescriptor));
 }
 
@@ -862,14 +866,14 @@ void WebGPUDriver::updateBufferObject(Handle<HwBufferObject> bufferObjectHandle,
     // draw calls are made.
     flush();
     handleCast<WebGPUBufferObject>(bufferObjectHandle)
-            ->updateGPUBuffer(bufferDescriptor, byteOffset, mDevice, &mQueueManager);
+            ->updateGPUBuffer(bufferDescriptor, byteOffset, mDevice, &mQueueManager, &mStagePool);
     scheduleDestroy(std::move(bufferDescriptor));
 }
 
 void WebGPUDriver::updateBufferObjectUnsynchronized(Handle<HwBufferObject> bufferObjectHandle,
         BufferDescriptor&& bufferDescriptor, const uint32_t byteOffset) {
     handleCast<WebGPUBufferObject>(bufferObjectHandle)
-            ->updateGPUBuffer(bufferDescriptor, byteOffset, mDevice, &mQueueManager);
+            ->updateGPUBuffer(bufferDescriptor, byteOffset, mDevice, &mQueueManager, &mStagePool);
     scheduleDestroy(std::move(bufferDescriptor));
 }
 
diff --git a/filament/backend/src/webgpu/WebGPUDriver.h b/filament/backend/src/webgpu/WebGPUDriver.h
index 3a8ffcaa3ded..915a435095f2 100644
--- a/filament/backend/src/webgpu/WebGPUDriver.h
+++ b/filament/backend/src/webgpu/WebGPUDriver.h
@@ -25,6 +25,7 @@
 #include "webgpu/WebGPUPipelineLayoutCache.h"
 #include "webgpu/WebGPURenderPassMipmapGenerator.h"
 #include "webgpu/WebGPUQueueManager.h"
+#include "webgpu/WebGPUStagePool.h"
 #include "webgpu/utils/AsyncTaskCounter.h"
 
 #include
@@ -81,6 +82,7 @@ class WebGPUDriver final : public DriverBase {
     wgpu::Device mDevice = nullptr;
     wgpu::Limits mDeviceLimits = {};
     WebGPUQueueManager mQueueManager;
+    WebGPUStagePool mStagePool;
     void* mNativeWindow = nullptr;
     WebGPUSwapChain* mSwapChain = nullptr;
     uint64_t mNextFakeHandle = 1;
diff --git a/filament/backend/src/webgpu/WebGPUQueueManager.cpp b/filament/backend/src/webgpu/WebGPUQueueManager.cpp
index 3fd9b9633e42..57edf5e1636c 100644
--- a/filament/backend/src/webgpu/WebGPUQueueManager.cpp
+++ b/filament/backend/src/webgpu/WebGPUQueueManager.cpp
@@ -64,6 +64,7 @@ wgpu::CommandEncoder WebGPUQueueManager::getCommandEncoder() {
         };
         mCommandEncoder = mDevice.CreateCommandEncoder(&commandEncoderDescriptor);
         ASSERT_POSTCONDITION(mCommandEncoder, "Failed to create command encoder.");
+        mLatestSubmissionState = std::make_shared<WebGPUSubmissionState>();
     }
     return mCommandEncoder;
 }
@@ -94,8 +95,6 @@ void WebGPUQueueManager::submit() {
         return;
     }
 
-    mLatestSubmissionState = std::make_shared<WebGPUSubmissionState>();
-
     wgpu::CommandBufferDescriptor commandBufferDescriptor{
         .label = "Filament Command Buffer",
     };
diff --git a/filament/backend/src/webgpu/WebGPUStagePool.cpp b/filament/backend/src/webgpu/WebGPUStagePool.cpp
new file mode 100644
index 000000000000..c2ab8d9e8a2b
--- /dev/null
+++ b/filament/backend/src/webgpu/WebGPUStagePool.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2025 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "WebGPUStagePool.h"
+
+#include "WebGPUConstants.h"
+#include "WebGPUQueueManager.h"
+
+namespace filament::backend {
+
+WebGPUStagePool::WebGPUStagePool(wgpu::Device const& device) : mDevice(device) {}
+
+WebGPUStagePool::~WebGPUStagePool() = default;
+
+wgpu::Buffer WebGPUStagePool::acquireBuffer(size_t requiredSize,
+        std::shared_ptr<WebGPUSubmissionState> submissionState) {
+    wgpu::Buffer buffer;
+    {
+        std::lock_guard lock(mMutex);
+        auto iter = mBuffers.lower_bound(requiredSize);
+        if (iter != mBuffers.end()) {
+            buffer = iter->second;
+            mBuffers.erase(iter);
+        }
+    }
+    if (!buffer.Get()) {
+        buffer = createNewBuffer(requiredSize);
+    }
+    mInProgress.push_back({submissionState, buffer});
+    return buffer;
+}
+
+void WebGPUStagePool::recycleBuffer(wgpu::Buffer buffer) {
+    struct UserData final {
+        wgpu::Buffer buffer;
+        WebGPUStagePool* webGPUStagePool;
+    };
+    auto userData =
+            std::make_unique<UserData>(UserData{ .buffer = buffer, .webGPUStagePool = this });
+    buffer.MapAsync(wgpu::MapMode::Write, 0, buffer.GetSize(), wgpu::CallbackMode::AllowSpontaneous,
+            [data = std::move(userData)](wgpu::MapAsyncStatus status, const char* message) {
+                if (UTILS_LIKELY(status == wgpu::MapAsyncStatus::Success)) {
+                    if (!data->webGPUStagePool) {
+                        return;
+                    }
+                    std::lock_guard lock(data->webGPUStagePool->mMutex);
+                    data->webGPUStagePool->mBuffers.insert(
+                            { data->buffer.GetSize(), data->buffer });
+                } else {
+                    FWGPU_LOGE << "Failed to MapAsync when recycling staging buffer: " << message;
+                }
+            });
+}
+
+void WebGPUStagePool::gc() {
+    // We found that MapAsync would sometimes lead to GetMappedRange returning nullptr if the
+    // command using that staging buffer has not finished executing, so here we only recycle those
+    // buffers that are not still being used by any command
+    std::vector<std::pair<std::shared_ptr<WebGPUSubmissionState>, wgpu::Buffer>> stillInProgress;
+    for (auto& [st, buffer]: mInProgress) {
+        if (st->getStatus() == FenceStatus::CONDITION_SATISFIED) {
+            recycleBuffer(buffer);
+        } else {
+            stillInProgress.push_back({st, buffer});
+        }
+    }
+    std::swap(mInProgress, stillInProgress);
+}
+
+wgpu::Buffer WebGPUStagePool::createNewBuffer(size_t bufferSize) {
+    wgpu::BufferDescriptor descriptor{
+        .label = "Filament WebGPU Staging Buffer",
+        .usage = wgpu::BufferUsage::MapWrite | wgpu::BufferUsage::CopySrc,
+        .size = bufferSize,
+        .mappedAtCreation = true };
+    return mDevice.CreateBuffer(&descriptor);
+}
+
+} // namespace filament::backend
diff --git a/filament/backend/src/webgpu/WebGPUStagePool.h b/filament/backend/src/webgpu/WebGPUStagePool.h
new file mode 100644
index 000000000000..f878bd555ced
--- /dev/null
+++ b/filament/backend/src/webgpu/WebGPUStagePool.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (C) 2025 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef TNT_FILAMENT_BACKEND_WEBGPUSTAGEPOOL_H
+#define TNT_FILAMENT_BACKEND_WEBGPUSTAGEPOOL_H
+
+#include
+
+#include
+#include
+
+namespace filament::backend {
+
+struct WebGPUSubmissionState;
+
+class WebGPUStagePool {
+public:
+    WebGPUStagePool(wgpu::Device const& device);
+    ~WebGPUStagePool();
+
+    wgpu::Buffer acquireBuffer(size_t requiredSize,
+            std::shared_ptr<WebGPUSubmissionState> submissionState);
+    void recycleBuffer(wgpu::Buffer buffer);
+    void gc();
+
+private:
+    wgpu::Buffer createNewBuffer(size_t bufferSize);
+    std::multimap<size_t, wgpu::Buffer> mBuffers;
+    std::vector<std::pair<std::shared_ptr<WebGPUSubmissionState>, wgpu::Buffer>> mInProgress;
+    std::mutex mMutex;
+
+    wgpu::Device mDevice;
+};
+
+} // namespace filament::backend
+
+#endif // TNT_FILAMENT_BACKEND_WEBGPUSTAGEPOOL_H
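
Outside the diff, here is a minimal, hypothetical sketch of the lifecycle this change introduces: WebGPUBufferBase::updateGPUBuffer acquires a pre-mapped staging buffer from WebGPUStagePool, selected by size; the buffer sits on an in-flight list tied to the queue's latest submission state; and WebGPUDriver::endFrame calls gc(), which recycles only buffers whose submission has completed (the real recycle path also re-maps the buffer with MapAsync so GetMappedRange is valid on reuse). The names below (FakeBuffer, FakeSubmission, StagePoolSketch) are illustrative stand-ins, not Filament or Dawn APIs, and the sketch uses only the C++ standard library.

// Hypothetical, standard-library-only sketch of the pooling scheme in this patch.
// FakeBuffer and FakeSubmission stand in for wgpu::Buffer and WebGPUSubmissionState.
#include <cstddef>
#include <cstdio>
#include <map>
#include <memory>
#include <utility>
#include <vector>

struct FakeSubmission {
    bool done = false;  // set once the GPU work that used the buffer has finished
};

struct FakeBuffer {
    std::size_t size = 0;
};

class StagePoolSketch {
public:
    // Hand out the smallest free buffer that is at least requiredSize, or create a new one,
    // and remember which submission must complete before the buffer may be recycled.
    std::shared_ptr<FakeBuffer> acquire(std::size_t requiredSize,
            std::shared_ptr<FakeSubmission> submission) {
        std::shared_ptr<FakeBuffer> buffer;
        auto iter = mFree.lower_bound(requiredSize);
        if (iter != mFree.end()) {
            buffer = iter->second;
            mFree.erase(iter);
        } else {
            buffer = std::make_shared<FakeBuffer>();
            buffer->size = requiredSize;
        }
        mInFlight.push_back({ std::move(submission), buffer });
        return buffer;
    }

    // Called once per frame: only buffers whose submission has completed return to the free map.
    void gc() {
        std::vector<std::pair<std::shared_ptr<FakeSubmission>, std::shared_ptr<FakeBuffer>>> still;
        for (auto& [submission, buffer] : mInFlight) {
            if (submission->done) {
                mFree.insert({ buffer->size, buffer });  // the real code re-maps the buffer here
            } else {
                still.push_back({ submission, buffer });
            }
        }
        mInFlight.swap(still);
    }

private:
    std::multimap<std::size_t, std::shared_ptr<FakeBuffer>> mFree;  // free buffers keyed by size
    std::vector<std::pair<std::shared_ptr<FakeSubmission>, std::shared_ptr<FakeBuffer>>> mInFlight;
};

int main() {
    StagePoolSketch pool;
    auto submission = std::make_shared<FakeSubmission>();
    pool.acquire(256, submission);                      // pool empty: creates a 256-byte buffer
    pool.gc();                                          // submission still pending: nothing recycled
    submission->done = true;
    pool.gc();                                          // now the buffer returns to the free map
    auto b = pool.acquire(100, std::make_shared<FakeSubmission>());
    std::printf("reused buffer size: %zu\n", b->size);  // prints 256: the pooled buffer was reused
    return 0;
}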