|
150 | 150 | " super().__init__()\n", |
151 | 151 | " self.num_experts_per_tok = cfg[\"num_experts_per_tok\"]\n", |
152 | 152 | " self.num_experts = cfg[\"num_experts\"]\n", |
| 153 | + " self.emb_dim = cfg[\"emb_dim\"]\n", |
153 | 154 | " self.gate = nn.Linear(cfg[\"emb_dim\"], cfg[\"num_experts\"], bias=False, dtype=cfg[\"dtype\"])\n", |
154 | 155 | "\n", |
155 | | - " # meta device to reduce memory pressure when initializing the model before loading weights\n", |
156 | | - " meta_device = torch.device(\"meta\")\n", |
157 | | - " self.fc1 = nn.ModuleList([\n", |
158 | | - " nn.Linear(\n", |
159 | | - " cfg[\"emb_dim\"], cfg[\"moe_intermediate_size\"],\n", |
160 | | - " bias=False, dtype=cfg[\"dtype\"], device=meta_device)\n", |
161 | | - " for _ in range(cfg[\"num_experts\"])]\n", |
162 | | - " )\n", |
163 | | - " self.fc2 = nn.ModuleList([\n", |
164 | | - " nn.Linear(\n", |
165 | | - " cfg[\"emb_dim\"], cfg[\"moe_intermediate_size\"],\n", |
166 | | - " bias=False, dtype=cfg[\"dtype\"], device=meta_device\n", |
167 | | - " )\n", |
168 | | - " for _ in range(cfg[\"num_experts\"])]\n", |
169 | | - " )\n", |
170 | | - " self.fc3 = nn.ModuleList([\n", |
171 | | - " nn.Linear(\n", |
172 | | - " cfg[\"moe_intermediate_size\"], cfg[\"emb_dim\"],\n", |
173 | | - " bias=False, dtype=cfg[\"dtype\"], device=meta_device\n", |
174 | | - " )\n", |
175 | | - " for _ in range(cfg[\"num_experts\"])]\n", |
176 | | - " )\n", |
| 156 | + " self.fc1 = nn.ModuleList([nn.Linear(cfg[\"emb_dim\"], cfg[\"moe_intermediate_size\"], bias=False, dtype=cfg[\"dtype\"])\n", |
| 157 | + " for _ in range(cfg[\"num_experts\"])])\n", |
| 158 | + " self.fc2 = nn.ModuleList([nn.Linear(cfg[\"emb_dim\"], cfg[\"moe_intermediate_size\"], bias=False, dtype=cfg[\"dtype\"])\n", |
| 159 | + " for _ in range(cfg[\"num_experts\"])])\n", |
| 160 | + " self.fc3 = nn.ModuleList([nn.Linear(cfg[\"moe_intermediate_size\"], cfg[\"emb_dim\"], bias=False, dtype=cfg[\"dtype\"])\n", |
| 161 | + " for _ in range(cfg[\"num_experts\"])])\n", |
177 | 162 | "\n", |
178 | 163 | " def forward(self, x):\n", |
179 | | - " b, seq_len, embed_dim = x.shape\n", |
180 | 164 | " scores = self.gate(x) # (b, seq_len, num_experts)\n", |
181 | 165 | " topk_scores, topk_indices = torch.topk(scores, self.num_experts_per_tok, dim=-1)\n", |
182 | 166 | " topk_probs = torch.softmax(topk_scores, dim=-1)\n", |
183 | | - " \n", |
184 | | - " expert_outputs = []\n", |
185 | | - " for e in range(self.num_experts):\n", |
186 | | - " hidden = torch.nn.functional.silu(self.fc1[e](x)) * self.fc2[e](x)\n", |
187 | | - " out = self.fc3[e](hidden)\n", |
188 | | - " expert_outputs.append(out.unsqueeze(-2))\n", |
189 | | - " expert_outputs = torch.cat(expert_outputs, dim=-2) # (b, t, num_experts, emb_dim)\n", |
190 | | - "\n", |
191 | | - " gating_probs = torch.zeros_like(scores)\n", |
192 | | - "\n", |
193 | | - " for i in range(self.num_experts_per_tok):\n", |
194 | | - " indices = topk_indices[..., i:i+1]\n", |
195 | | - " prob = topk_probs[..., i:i+1]\n", |
196 | | - " gating_probs.scatter_(dim=-1, index=indices, src=prob)\n", |
197 | | - " gating_probs = gating_probs.unsqueeze(-1) # (b, t, num_experts, 1)\n", |
198 | | - " \n", |
199 | | - " # Weighted sum over experts\n", |
200 | | - " y = (gating_probs * expert_outputs).sum(dim=-2)\n", |
201 | | - " return y\n", |
202 | | - "\n", |
203 | | - "\n", |
204 | | - " # For some reason, the version below is slower than the naive version\n", |
205 | | - " # above that computes all experts, even the unused ones\n", |
206 | | - "\n", |
207 | | - " # def forward(self, x):\n", |
208 | | - " # scores = self.gate(x) # (b, seq_len, num_experts)\n", |
209 | | - " # topk_scores, topk_indices = torch.topk(scores, self.num_experts_per_tok, dim=-1)\n", |
210 | | - " # topk_probs = torch.softmax(topk_scores, dim=-1)\n", |
211 | | - " # y = torch.zeros_like(x)\n", |
212 | | - " #\n", |
213 | | - " # for i in range(self.num_experts_per_tok):\n", |
214 | | - " # # expert_indices is (b, seq_len) with values in [0, num_experts)\n", |
215 | | - " # expert_indices = topk_indices[..., i]\n", |
216 | | - " # prob = topk_probs[..., i].unsqueeze(-1) # (b, seq_len, 1)\n", |
217 | | - " #\n", |
218 | | - " # # For each expert, process only the tokens assigned to it\n", |
219 | | - " # for e in range(self.num_experts):\n", |
220 | | - " # mask = (expert_indices == e) # (b, seq_len) boolean mask\n", |
221 | | - " # if mask.any():\n", |
222 | | - " # selected = x[mask] # (num_tokens_e, emb_dim)\n", |
223 | | - " # out = self.fc3[e](torch.nn.functional.silu(self.fc1[e](selected)) * self.fc2[e](selected))\n", |
224 | | - " # y[mask] += prob[mask] * out\n", |
225 | | - " # return y" |
| 167 | + "\n", |
| 168 | + " batch, seq_len, _ = x.shape\n", |
| 169 | + " x_flat = x.reshape(batch * seq_len, -1)\n", |
| 170 | + " out_flat = torch.zeros(batch * seq_len, self.emb_dim, device=x.device, dtype=x.dtype)\n", |
| 171 | + "\n", |
| 172 | + " topk_indices_flat = topk_indices.reshape(-1, self.num_experts_per_tok)\n", |
| 173 | + " topk_probs_flat = topk_probs.reshape(-1, self.num_experts_per_tok)\n", |
| 174 | + "\n", |
| 175 | + " unique_experts = torch.unique(topk_indices_flat)\n", |
| 176 | + "\n", |
| 177 | + " for expert_id_tensor in unique_experts:\n", |
| 178 | + " expert_id = int(expert_id_tensor.item())\n", |
| 179 | + " mask = topk_indices_flat == expert_id\n", |
| 180 | + " if not mask.any():\n", |
| 181 | + " continue\n", |
| 182 | + "\n", |
| 183 | + " token_mask = mask.any(dim=-1)\n", |
| 184 | + " selected_idx = token_mask.nonzero(as_tuple=False).squeeze(-1)\n", |
| 185 | + " if selected_idx.numel() == 0:\n", |
| 186 | + " continue\n", |
| 187 | + "\n", |
| 188 | + " expert_input = x_flat.index_select(0, selected_idx)\n", |
| 189 | + " hidden = torch.nn.functional.silu(self.fc1[expert_id](expert_input)) * self.fc2[expert_id](expert_input)\n", |
| 190 | + " expert_out = self.fc3[expert_id](hidden)\n", |
| 191 | + "\n", |
| 192 | + " mask_selected = mask[selected_idx]\n", |
| 193 | + " slot_indices = mask_selected.int().argmax(dim=-1, keepdim=True)\n", |
| 194 | + " selected_probs = torch.gather(topk_probs_flat.index_select(0, selected_idx), dim=-1, index=slot_indices).squeeze(-1)\n", |
| 195 | + "\n", |
| 196 | + " out_flat.index_add_(0, selected_idx, expert_out * selected_probs.unsqueeze(-1))\n", |
| 197 | + "\n", |
| 198 | + " return out_flat.reshape(batch, seq_len, self.emb_dim)" |
226 | 199 | ] |
227 | 200 | }, |
228 | 201 | { |
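Note on the rewritten forward pass in the hunk above: the routed version should be numerically equivalent to the dense version it replaces, since each token's output is still the gating-probability-weighted sum of its top-k experts; the only change is that each expert now processes just the tokens routed to it. Below is a minimal sketch (not part of the diff) for checking that equivalence. It assumes the class shown above is named MoEFeedForward (the actual class name is not visible in this diff) and uses small, arbitrary config values.

import torch

cfg = {
    "emb_dim": 32,
    "moe_intermediate_size": 64,
    "num_experts": 4,
    "num_experts_per_tok": 2,
    "dtype": torch.float32,
}

torch.manual_seed(0)
moe = MoEFeedForward(cfg)  # assumed class name for the module defined above
x = torch.randn(2, 5, cfg["emb_dim"])

# Dense reference: run every expert and weight by the scattered gating probabilities,
# mirroring the implementation that this PR removes.
scores = moe.gate(x)
topk_scores, topk_indices = torch.topk(scores, cfg["num_experts_per_tok"], dim=-1)
topk_probs = torch.softmax(topk_scores, dim=-1)
gating = torch.zeros_like(scores).scatter(-1, topk_indices, topk_probs)
dense = sum(
    gating[..., e:e+1]
    * moe.fc3[e](torch.nn.functional.silu(moe.fc1[e](x)) * moe.fc2[e](x))
    for e in range(cfg["num_experts"])
)

routed = moe(x)
print(torch.allclose(dense, routed, atol=1e-5))  # should print True up to float tolerance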
|
829 | 802 | " )\n", |
830 | 803 | "\n", |
831 | 804 | " # Feedforward weights\n", |
832 | | - " if \"num_experts\" in param_config:\n", |
| 805 | + " if \"num_experts\" in param_config and param_config[\"num_experts\"] > 0:\n", |
833 | 806 | " # Load router (gating) weights\n", |
834 | 807 | " block.ff.gate.weight = assign(\n", |
835 | 808 | " block.ff.gate.weight,\n", |
|
854 | 827 | " params[f\"{prefix}.down_proj.weight\"],\n", |
855 | 828 | " f\"{prefix}.down_proj.weight\"\n", |
856 | 829 | " )\n", |
857 | | - " # After assigning weights, move the expert layers from meta to CPU\n", |
858 | | - " block.ff.fc1[e] = block.ff.fc1[e].to(\"cpu\")\n", |
859 | | - " block.ff.fc2[e] = block.ff.fc2[e].to(\"cpu\")\n", |
860 | | - " block.ff.fc3[e] = block.ff.fc3[e].to(\"cpu\")\n", |
861 | 830 | "\n", |
862 | 831 | " else:\n", |
863 | 832 | " block.ff.fc1.weight = assign(\n", |
|