
Commit 1a8ad24

NirSonnenschein, loadams, and sfc-gh-truwase authored
fix issues raised by Coverity scans (#7431)
This commit combines fixes for 37 potential code issues found in Coverity scans. The issues include, but are not limited to, potential access to uninitialized variables and dead or redundant code. We understand that reviewing such a commit can be difficult and will be happy to help with any questions or changes required. --------- Signed-off-by: Nir Sonnenschein <[email protected]> Co-authored-by: Logan Adams <[email protected]> Co-authored-by: Olatunji Ruwase <[email protected]>
1 parent 0e51e09 commit 1a8ad24
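The most common pattern in this batch is a local variable that is assigned on only some control-flow paths and then read unconditionally, which is the "potential access to uninitialized variables" the message refers to. A minimal, hypothetical sketch of the pattern and the usual shape of the fix (illustrative only, not code from this repository):

    def scale(mode, value):
        # Flagged pattern: `factor` used to be assigned only inside the if/elif
        # branches, so an unexpected `mode` made the final line read an unbound
        # local. Giving every path a definite assignment removes the warning.
        if mode == "train":
            factor = 2.0
        elif mode == "eval":
            factor = 1.0
        else:
            factor = 0.0  # explicit default instead of falling through unassigned
        return value * factor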

File tree

24 files changed: +101 −163 lines changed


accelerator/real_accelerator.py

Lines changed: 8 additions & 2 deletions
@@ -67,7 +67,7 @@ def get_accelerator():
                    f"XPU_Accelerator requires intel_extension_for_pytorch, which is not installed on this system.")
        elif accelerator_name == "xpu.external":
            try:
-                import intel_extension_for_deepspeed  # noqa: F401 # type: ignore
+                from intel_extension_for_deepspeed import XPU_Accelerator  # noqa: F401 # type: ignore
            except ImportError as e:
                raise ValueError(
                    f"XPU_Accelerator external requires intel_extension_for_deepspeed, which is not installed on this system."
@@ -224,6 +224,12 @@ def get_accelerator():
        ds_accelerator = CPU_Accelerator()
    elif accelerator_name == "xpu.external":
        # XPU_Accelerator is already imported in detection stage
+        try:
+            from intel_extension_for_deepspeed import XPU_Accelerator  # noqa: F811
+        except ImportError as e:
+            raise ValueError(
+                f"XPU_Accelerator external requires intel_extension_for_deepspeed, which is not installed on this system."
+            )
        ds_accelerator = XPU_Accelerator()
    elif accelerator_name == "xpu":
        from .xpu_accelerator import XPU_Accelerator
@@ -258,7 +264,7 @@ def get_accelerator():
 def set_accelerator(accel_obj):
     global ds_accelerator
     _validate_accelerator(accel_obj)
-    if accel_logger is not None:
+    if accel_logger is not None and accel_obj is not None:
         accel_logger.info(f"Setting ds_accelerator to {accel_obj._name} (model specified)")
     ds_accelerator = accel_obj
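The change above stops relying on an import performed earlier in the detection stage and instead re-imports XPU_Accelerator, wrapped in the same try/except, at the point where it is instantiated. A minimal sketch of that guarded-import pattern, using a placeholder package name rather than the real intel_extension_for_deepspeed dependency:

    def load_external_accelerator():
        # Import the optional dependency at the point of use and turn a missing
        # package into a clear configuration error; `external_pkg` and
        # `ExternalAccelerator` are stand-in names for this sketch.
        try:
            from external_pkg import ExternalAccelerator
        except ImportError:
            raise ValueError("external accelerator requires external_pkg, which is not installed on this system.")
        return ExternalAccelerator()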

deepspeed/autotuning/autotuner.py

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@ def __init__(self, args, active_resources):
        if not os.path.exists(self.results_dir):
            try:
                os.makedirs(self.results_dir, exist_ok=True)
-                logger.info(f"Created autotuning results directory: {self.exps_dir}")
+                logger.info(f"Created autotuning results directory: {self.results_dir}")
            except:
                logger.error(
                    f"Failed to create {self.results_dir}, please check results_dir in the autotuning config file is accessible by all the nodes in the job."

deepspeed/autotuning/constants.py

Lines changed: 1 addition & 1 deletion
@@ -144,7 +144,7 @@
    "zero_optimization": {
        "stage": 3
    },
-    "memory_break_down": False
+    "memory_breakdown": False
 }

 DEFAULT_TUNING_SPACE_ZERO_0 = {"zero_optimization": {"stage": 0}}

deepspeed/comm/ccl.py

Lines changed: 5 additions & 20 deletions
@@ -77,27 +77,12 @@ def run_collective(self, name, **kwargs):
        return CCLHandler(self.ccl_comm_op)

    def all_reduce(self, tensor, op=ReduceOp.SUM, group=None, async_op=False):
-        use_caching = False
-        if use_caching:
-            match_id = f"{tensor.size()}-{op}"
-            name = "all_reduce_caching"
-            if name in self.available_coll:
-                group = self.get_all_ranks_from_group(group)
-                return self.ccl_comm_op.all_reduce_caching(tensor, op, match_id, group, async_op)
-            else:
-                return self.run_collective(name=name,
-                                           tensor=tensor,
-                                           op=op,
-                                           match_id=match_id,
-                                           group=group,
-                                           async_op=async_op)
+        name = "all_reduce"
+        if name in self.available_coll:
+            group = self.get_all_ranks_from_group(group)
+            return self.ccl_comm_op.all_reduce(tensor, op, group, async_op)
        else:
-            name = "all_reduce"
-            if name in self.available_coll:
-                group = self.get_all_ranks_from_group(group)
-                return self.ccl_comm_op.all_reduce(tensor, op, group, async_op)
-            else:
-                return self.run_collective(name=name, tensor=tensor, op=op, group=group, async_op=async_op)
+            return self.run_collective(name=name, tensor=tensor, op=op, group=group, async_op=async_op)

    def inference_all_reduce(self, tensor, op=ReduceOp.SUM, group=None):
        name = "inference_all_reduce"
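The deleted branch was guarded by a hard-coded use_caching = False, so Coverity flags it as dead code; what remains is a plain capability check. A rough, self-contained sketch of that dispatch shape, with a placeholder backend object rather than the real ccl_comm_op API:

    class CollectiveDispatcher:
        def __init__(self, backend, available_coll):
            self.backend = backend                # object exposing native collectives
            self.available_coll = available_coll  # names of collectives it implements

        def all_reduce(self, tensor, op, group=None, async_op=False):
            # Use the backend's native kernel when it exists, otherwise fall back
            # to the generic run_collective() path.
            if "all_reduce" in self.available_coll:
                return self.backend.all_reduce(tensor, op, group, async_op)
            return self.run_collective("all_reduce", tensor=tensor, op=op, group=group, async_op=async_op)

        def run_collective(self, name, **kwargs):
            raise NotImplementedError("generic fallback path in the real implementation")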

deepspeed/compile/passes/offload_activation.py

Lines changed: 1 addition & 1 deletion
@@ -101,7 +101,7 @@ def reload_activation_bwd(graph: Graph, graph_id: int, graph_order: List[int], m
        with graph.inserting_after(reload_node):
            wait_node = graph.create_node('call_function',
                                          torch.ops.dc.wait_reload.default, (reload_node, graph_id, val_id), {},
-                                          name=f"wait_copy_{node.name}_{val_id}")
+                                          name=f"wait_copy_{reload_node.name}_{val_id}")

        # replace all uses of node with wait_node
        users = {}

deepspeed/compression/helper.py

Lines changed: 1 addition & 1 deletion
@@ -137,7 +137,7 @@ def module_replacement(model, module_name, compression_technique=None, mpu=None)
    else:
        new_module = None

-    if compression_technique is not None:
+    if compression_technique is not None and new_module is not None:
        for k, v in compression_technique.items():
            if k == SPARSE_PRUNING:
                if v[SPARSE_PRUNING_ENABLED]:

deepspeed/inference/v2/model_implementations/sharding/qkv.py

Lines changed: 4 additions & 3 deletions
@@ -37,12 +37,16 @@ def shard_qkv_param(param: torch.Tensor,
    if n_heads_kv is not None and n_heads_q is None:
        raise ValueError("n_heads_kv should not be passed without n_heads_q")

+    if param is None:
+        raise ValueError("param should not be None")
    if n_heads_q is None:
        # Guaranteed to be in MHA
        if param.shape[0] // 3 % head_size != 0:
            raise ValueError("MHA param shape is not correct")
        n_heads_q = param.shape[0] // head_size // 3
        mha_sharding = True
+    elif n_heads_kv is None:
+        mha_sharding = True
    else:
        mha_sharding = n_heads_q == n_heads_kv

@@ -73,9 +77,6 @@ def shard_qkv_param(param: torch.Tensor,
    else:
        even_kv_sharding = n_heads_kv >= num_shards

-    if param is None:
-        return None
-
    q_param = param[:head_size * n_heads_q]
    kv_param = param[head_size * n_heads_q:]
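The param is None guard used to sit below the first access to param.shape, so it could never prevent the failure it was meant to catch; the fix hoists the validation to the top of the function and raises instead of silently returning None. The same validate-before-use ordering in isolation (hypothetical helper with a simplified signature):

    def shard_param(param, head_size):
        # The guard runs before the first attribute access on `param`, so it can
        # actually catch the case Coverity flagged.
        if param is None:
            raise ValueError("param should not be None")
        if param.shape[0] % head_size != 0:
            raise ValueError("param shape is not a multiple of head_size")
        return param.reshape(-1, head_size)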

deepspeed/inference/v2/ragged/sequence_descriptor.py

Lines changed: 2 additions & 2 deletions
@@ -122,9 +122,9 @@ def __init__(self,

        self._seen_tokens = 0
        self._in_flight_tokens = 0
+        assert kv_cache_ids_shadow is not None  # add check before use

-        self._num_allocation_groups = tuple(kv_cache_ids_shadow.shape[0]
-                                            for kv_cache_ids_shadow in kv_cache_ids_shadow)
+        self._num_allocation_groups = tuple(kv_cache_id.shape[0] for kv_cache_id in kv_cache_ids_shadow)
        self._blocks_per_allocation_group = tuple(
            torch.zeros(num_groups, dtype=torch.int32, device="cpu") for num_groups in self._num_allocation_groups)
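The original generator expression reused kv_cache_ids_shadow as both the iterable and the loop variable, shadowing the argument being iterated; the fix renames the loop variable and asserts the argument is non-None before use. The same pattern in isolation (a hypothetical standalone function, not the real constructor):

    def allocation_group_sizes(kv_cache_ids_shadow):
        # Check before use, and give the loop variable its own name so it no
        # longer shadows the sequence it iterates over.
        assert kv_cache_ids_shadow is not None
        return tuple(kv_cache_id.shape[0] for kv_cache_id in kv_cache_ids_shadow)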

deepspeed/module_inject/containers/megatron_gpt.py

Lines changed: 2 additions & 0 deletions
@@ -73,6 +73,8 @@ def attention(self, enable_training=False):
                attention = self.client_module.attention
            else:
                attention = self.client_module.self_attention
+        else:
+            return None

        return attention.query_key_value.weight, \
               attention.query_key_value.bias, \
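Without the added else branch, attention is never bound when the outer condition fails, and the return statement below would raise UnboundLocalError; returning None makes that path explicit. A compressed, hypothetical sketch of the same shape (stand-in attribute checks, not the real container logic):

    def get_attention(client_module):
        # Every path now produces a value; callers can test for None instead of
        # tripping over an unbound local on unexpected module layouts.
        if hasattr(client_module, "attention"):
            attention = client_module.attention
        elif hasattr(client_module, "self_attention"):
            attention = client_module.self_attention
        else:
            return None
        return attention.query_key_value.weight, attention.query_key_value.bias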

deepspeed/module_inject/replace_module.py

Lines changed: 14 additions & 14 deletions
@@ -93,8 +93,10 @@ def replace_attn(child, policy):
            return child
        if len(policy_attn) == 5:
            qkvw, attn_ow, attn_ob, hidden_size, heads = policy_attn
+            qw, kw, vw = torch.empty(0), torch.empty(0), torch.empty(0)
        else:
            qw, kw, vw, attn_ow, attn_ob, hidden_size, heads = policy_attn
+            qkvw = torch.empty(0)

        config = transformer_inference.DeepSpeedInferenceConfig(
            hidden_size=hidden_size,
@@ -113,11 +115,15 @@ def transpose(data):
            return data

        if len(policy_attn) == 5:
+            assert qkvw is not None and qkvw.data is not None, "qkvw can't be None"
            attn_module.attn_qkvw.data = transpose(qkvw.data)
        else:
            attn_module.attn_qkvw = None
+            assert qw is not None and qw.data is not None, "qw can't be None"
            attn_module.attn_qw.data = transpose(qw.data)
+            assert kw is not None and kw.data is not None, "kw can't be None"
            attn_module.attn_kw.data = transpose(kw.data)
+            assert vw is not None and vw.data is not None, "vw can't be None"
            attn_module.attn_vw.data = transpose(vw.data)

        attn_module.attn_qkvb = None
@@ -316,21 +322,15 @@ def replace_wo_policy(module, all_reduce_linears, prefix="", state_dict=None):
        return _autotp._replace_module(module)

    def replace_fn(child, _policy, layer_id=0, prefix="", state_dict=None):
-        training = False  # todo: refactor this part to go in the config
-        if training:
-            # copy relevant state from child -> new module
-            new_module = replace_with_policy(child, _policy, config.triangular_masking)
-
+        # copy relevant state from child -> new module
+        if not is_autotp_training_mode() and config.replace_with_kernel_inject:
+            new_module = replace_with_policy(child,
+                                             _policy,
+                                             config.triangular_masking,
+                                             inference=True,
+                                             layer_id=layer_id)
        else:
-            # copy relevant state from child -> new module
-            if not is_autotp_training_mode() and config.replace_with_kernel_inject:
-                new_module = replace_with_policy(child,
-                                                 _policy,
-                                                 config.triangular_masking,
-                                                 inference=True,
-                                                 layer_id=layer_id)
-            else:
-                new_module = replace_wo_policy(child, _policy, prefix=prefix, state_dict=state_dict)
+            new_module = replace_wo_policy(child, _policy, prefix=prefix, state_dict=state_dict)

        return new_module
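The first hunk above gives both unpacking branches a defined value for all of qkvw, qw, kw and vw, using torch.empty(0) as a sentinel for the set a given policy does not provide, so the later transpose calls can never touch an unbound name. A condensed sketch of that pattern (placeholder policy tuple, not the real policy API):

    import torch

    def unpack_policy_attn(policy_attn):
        # Both branches bind the full set of names, so code that later branches on
        # len(policy_attn) never reads an uninitialized local.
        if len(policy_attn) == 5:
            qkvw, attn_ow, attn_ob, hidden_size, heads = policy_attn
            qw, kw, vw = torch.empty(0), torch.empty(0), torch.empty(0)
        else:
            qw, kw, vw, attn_ow, attn_ob, hidden_size, heads = policy_attn
            qkvw = torch.empty(0)
        return qkvw, qw, kw, vw, attn_ow, attn_ob, hidden_size, heads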

0 commit comments
