
Commit 1a8ad24

NirSonnenschein, loadams, and sfc-gh-truwase authored
fix issues raised by Coverity scans (#7431)
This commit combines fixes for 37 potential code issues found in Coverity scans. The issues include, but are not limited to, potential access to uninitialized variables and dead or redundant code. We understand that reviewing such a commit can be difficult and will be happy to help with any questions or changes required. --------- Signed-off-by: Nir Sonnenschein <[email protected]> Co-authored-by: Logan Adams <[email protected]> Co-authored-by: Olatunji Ruwase <[email protected]>
1 parent 0e51e09 commit 1a8ad24
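The most common pattern in this batch is a local variable that is assigned on only some control-flow paths and then read unconditionally, which is the "potential access to uninitialized variables" the message refers to. A minimal, hypothetical sketch of the pattern and the usual shape of the fix (illustrative only, not code from this repository):

    def scale(mode, value):
        # Flagged pattern: `factor` used to be assigned only inside the if/elif
        # branches, so an unexpected `mode` made the final line read an unbound
        # local. Giving every path a definite assignment removes the warning.
        if mode == "train":
            factor = 2.0
        elif mode == "eval":
            factor = 1.0
        else:
            factor = 0.0  # explicit default instead of falling through unassigned
        return value * factor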

File tree

24 files changed: +101 −163 lines changed


accelerator/real_accelerator.py

Lines changed: 8 additions & 2 deletions
@@ -67,7 +67,7 @@ def get_accelerator():
                    f"XPU_Accelerator requires intel_extension_for_pytorch, which is not installed on this system.")
        elif accelerator_name == "xpu.external":
            try:
-                import intel_extension_for_deepspeed  # noqa: F401 # type: ignore
+                from intel_extension_for_deepspeed import XPU_Accelerator  # noqa: F401 # type: ignore
            except ImportError as e:
                raise ValueError(
                    f"XPU_Accelerator external requires intel_extension_for_deepspeed, which is not installed on this system."
@@ -224,6 +224,12 @@ def get_accelerator():
        ds_accelerator = CPU_Accelerator()
    elif accelerator_name == "xpu.external":
        # XPU_Accelerator is already imported in detection stage
+        try:
+            from intel_extension_for_deepspeed import XPU_Accelerator  # noqa: F811
+        except ImportError as e:
+            raise ValueError(
+                f"XPU_Accelerator external requires intel_extension_for_deepspeed, which is not installed on this system."
+            )
        ds_accelerator = XPU_Accelerator()
    elif accelerator_name == "xpu":
        from .xpu_accelerator import XPU_Accelerator
@@ -258,7 +264,7 @@ def get_accelerator():
 def set_accelerator(accel_obj):
     global ds_accelerator
     _validate_accelerator(accel_obj)
-    if accel_logger is not None:
+    if accel_logger is not None and accel_obj is not None:
         accel_logger.info(f"Setting ds_accelerator to {accel_obj._name} (model specified)")
     ds_accelerator = accel_obj
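The change above stops relying on an import performed earlier in the detection stage and instead re-imports XPU_Accelerator, wrapped in the same try/except, at the point where it is instantiated. A minimal sketch of that guarded-import pattern, using a placeholder package name rather than the real intel_extension_for_deepspeed dependency:

    def load_external_accelerator():
        # Import the optional dependency at the point of use and turn a missing
        # package into a clear configuration error; `external_pkg` and
        # `ExternalAccelerator` are stand-in names for this sketch.
        try:
            from external_pkg import ExternalAccelerator
        except ImportError:
            raise ValueError("external accelerator requires external_pkg, which is not installed on this system.")
        return ExternalAccelerator()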

deepspeed/autotuning/autotuner.py

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@ def __init__(self, args, active_resources):
        if not os.path.exists(self.results_dir):
            try:
                os.makedirs(self.results_dir, exist_ok=True)
-                logger.info(f"Created autotuning results directory: {self.exps_dir}")
+                logger.info(f"Created autotuning results directory: {self.results_dir}")
            except:
                logger.error(
                    f"Failed to create {self.results_dir}, please check results_dir in the autotuning config file is accessible by all the nodes in the job."

deepspeed/autotuning/constants.py

Lines changed: 1 addition & 1 deletion
@@ -144,7 +144,7 @@
    "zero_optimization": {
        "stage": 3
    },
-    "memory_break_down": False
+    "memory_breakdown": False
 }

 DEFAULT_TUNING_SPACE_ZERO_0 = {"zero_optimization": {"stage": 0}}

deepspeed/comm/ccl.py

Lines changed: 5 additions & 20 deletions
@@ -77,27 +77,12 @@ def run_collective(self, name, **kwargs):
        return CCLHandler(self.ccl_comm_op)

    def all_reduce(self, tensor, op=ReduceOp.SUM, group=None, async_op=False):
-        use_caching = False
-        if use_caching:
-            match_id = f"{tensor.size()}-{op}"
-            name = "all_reduce_caching"
-            if name in self.available_coll:
-                group = self.get_all_ranks_from_group(group)
-                return self.ccl_comm_op.all_reduce_caching(tensor, op, match_id, group, async_op)
-            else:
-                return self.run_collective(name=name,
-                                           tensor=tensor,
-                                           op=op,
-                                           match_id=match_id,
-                                           group=group,
-                                           async_op=async_op)
+        name = "all_reduce"
+        if name in self.available_coll:
+            group = self.get_all_ranks_from_group(group)
+            return self.ccl_comm_op.all_reduce(tensor, op, group, async_op)
        else:
-            name = "all_reduce"
-            if name in self.available_coll:
-                group = self.get_all_ranks_from_group(group)
-                return self.ccl_comm_op.all_reduce(tensor, op, group, async_op)
-            else:
-                return self.run_collective(name=name, tensor=tensor, op=op, group=group, async_op=async_op)
+            return self.run_collective(name=name, tensor=tensor, op=op, group=group, async_op=async_op)

    def inference_all_reduce(self, tensor, op=ReduceOp.SUM, group=None):
        name = "inference_all_reduce"
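The deleted branch was guarded by a hard-coded use_caching = False, so Coverity flags it as dead code; what remains is a plain capability check. A rough, self-contained sketch of that dispatch shape, with a placeholder backend object rather than the real ccl_comm_op API:

    class CollectiveDispatcher:
        def __init__(self, backend, available_coll):
            self.backend = backend                # object exposing native collectives
            self.available_coll = available_coll  # names of collectives it implements

        def all_reduce(self, tensor, op, group=None, async_op=False):
            # Use the backend's native kernel when it exists, otherwise fall back
            # to the generic run_collective() path.
            if "all_reduce" in self.available_coll:
                return self.backend.all_reduce(tensor, op, group, async_op)
            return self.run_collective("all_reduce", tensor=tensor, op=op, group=group, async_op=async_op)

        def run_collective(self, name, **kwargs):
            raise NotImplementedError("generic fallback path in the real implementation")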

deepspeed/compile/passes/offload_activation.py

Lines changed: 1 addition & 1 deletion
@@ -101,7 +101,7 @@ def reload_activation_bwd(graph: Graph, graph_id: int, graph_order: List[int], m
        with graph.inserting_after(reload_node):
            wait_node = graph.create_node('call_function',
                                          torch.ops.dc.wait_reload.default, (reload_node, graph_id, val_id), {},
-                                          name=f"wait_copy_{node.name}_{val_id}")
+                                          name=f"wait_copy_{reload_node.name}_{val_id}")

        # replace all uses of node with wait_node
        users = {}

deepspeed/compression/helper.py

Lines changed: 1 addition & 1 deletion
@@ -137,7 +137,7 @@ def module_replacement(model, module_name, compression_technique=None, mpu=None)
    else:
        new_module = None

-    if compression_technique is not None:
+    if compression_technique is not None and new_module is not None:
        for k, v in compression_technique.items():
            if k == SPARSE_PRUNING:
                if v[SPARSE_PRUNING_ENABLED]:

deepspeed/inference/v2/model_implementations/sharding/qkv.py

Lines changed: 4 additions & 3 deletions
@@ -37,12 +37,16 @@ def shard_qkv_param(param: torch.Tensor,
    if n_heads_kv is not None and n_heads_q is None:
        raise ValueError("n_heads_kv should not be passed without n_heads_q")

+    if param is None:
+        raise ValueError("param should not be None")
    if n_heads_q is None:
        # Guaranteed to be in MHA
        if param.shape[0] // 3 % head_size != 0:
            raise ValueError("MHA param shape is not correct")
        n_heads_q = param.shape[0] // head_size // 3
        mha_sharding = True
+    elif n_heads_kv is None:
+        mha_sharding = True
    else:
        mha_sharding = n_heads_q == n_heads_kv

@@ -73,9 +77,6 @@ def shard_qkv_param(param: torch.Tensor,
    else:
        even_kv_sharding = n_heads_kv >= num_shards

-    if param is None:
-        return None
-
    q_param = param[:head_size * n_heads_q]
    kv_param = param[head_size * n_heads_q:]
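The param is None guard used to sit below the first access to param.shape, so it could never prevent the failure it was meant to catch; the fix hoists the validation to the top of the function and raises instead of silently returning None. The same validate-before-use ordering in isolation (hypothetical helper with a simplified signature):

    def shard_param(param, head_size):
        # The guard runs before the first attribute access on `param`, so it can
        # actually catch the case Coverity flagged.
        if param is None:
            raise ValueError("param should not be None")
        if param.shape[0] % head_size != 0:
            raise ValueError("param shape is not a multiple of head_size")
        return param.reshape(-1, head_size)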

deepspeed/inference/v2/ragged/sequence_descriptor.py

Lines changed: 2 additions & 2 deletions
@@ -122,9 +122,9 @@ def __init__(self,

        self._seen_tokens = 0
        self._in_flight_tokens = 0
+        assert kv_cache_ids_shadow is not None  # add check before use

-        self._num_allocation_groups = tuple(kv_cache_ids_shadow.shape[0]
-                                            for kv_cache_ids_shadow in kv_cache_ids_shadow)
+        self._num_allocation_groups = tuple(kv_cache_id.shape[0] for kv_cache_id in kv_cache_ids_shadow)
        self._blocks_per_allocation_group = tuple(
            torch.zeros(num_groups, dtype=torch.int32, device="cpu") for num_groups in self._num_allocation_groups)
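The original generator expression reused kv_cache_ids_shadow as both the iterable and the loop variable, shadowing the argument being iterated; the fix renames the loop variable and asserts the argument is non-None before use. The same pattern in isolation (a hypothetical standalone function, not the real constructor):

    def allocation_group_sizes(kv_cache_ids_shadow):
        # Check before use, and give the loop variable its own name so it no
        # longer shadows the sequence it iterates over.
        assert kv_cache_ids_shadow is not None
        return tuple(kv_cache_id.shape[0] for kv_cache_id in kv_cache_ids_shadow)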

deepspeed/module_inject/containers/megatron_gpt.py

Lines changed: 2 additions & 0 deletions
@@ -73,6 +73,8 @@ def attention(self, enable_training=False):
                attention = self.client_module.attention
            else:
                attention = self.client_module.self_attention
+        else:
+            return None

        return attention.query_key_value.weight, \
               attention.query_key_value.bias, \
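Without the added else branch, attention is never bound when the outer condition fails, and the return statement below would raise UnboundLocalError; returning None makes that path explicit. A compressed, hypothetical sketch of the same shape (stand-in attribute checks, not the real container logic):

    def get_attention(client_module):
        # Every path now produces a value; callers can test for None instead of
        # tripping over an unbound local on unexpected module layouts.
        if hasattr(client_module, "attention"):
            attention = client_module.attention
        elif hasattr(client_module, "self_attention"):
            attention = client_module.self_attention
        else:
            return None
        return attention.query_key_value.weight, attention.query_key_value.bias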

deepspeed/module_inject/replace_module.py

Lines changed: 14 additions & 14 deletions
@@ -93,8 +93,10 @@ def replace_attn(child, policy):
            return child
        if len(policy_attn) == 5:
            qkvw, attn_ow, attn_ob, hidden_size, heads = policy_attn
+            qw, kw, vw = torch.empty(0), torch.empty(0), torch.empty(0)
        else:
            qw, kw, vw, attn_ow, attn_ob, hidden_size, heads = policy_attn
+            qkvw = torch.empty(0)

        config = transformer_inference.DeepSpeedInferenceConfig(
            hidden_size=hidden_size,
@@ -113,11 +115,15 @@ def transpose(data):
            return data

        if len(policy_attn) == 5:
+            assert qkvw is not None and qkvw.data is not None, "qkvw can't be None"
            attn_module.attn_qkvw.data = transpose(qkvw.data)
        else:
            attn_module.attn_qkvw = None
+            assert qw is not None and qw.data is not None, "qw can't be None"
            attn_module.attn_qw.data = transpose(qw.data)
+            assert kw is not None and kw.data is not None, "kw can't be None"
            attn_module.attn_kw.data = transpose(kw.data)
+            assert vw is not None and vw.data is not None, "vw can't be None"
            attn_module.attn_vw.data = transpose(vw.data)

        attn_module.attn_qkvb = None
@@ -316,21 +322,15 @@ def replace_wo_policy(module, all_reduce_linears, prefix="", state_dict=None):
        return _autotp._replace_module(module)

    def replace_fn(child, _policy, layer_id=0, prefix="", state_dict=None):
-        training = False  # todo: refactor this part to go in the config
-        if training:
-            # copy relevant state from child -> new module
-            new_module = replace_with_policy(child, _policy, config.triangular_masking)
-
+        # copy relevant state from child -> new module
+        if not is_autotp_training_mode() and config.replace_with_kernel_inject:
+            new_module = replace_with_policy(child,
+                                             _policy,
+                                             config.triangular_masking,
+                                             inference=True,
+                                             layer_id=layer_id)
        else:
-            # copy relevant state from child -> new module
-            if not is_autotp_training_mode() and config.replace_with_kernel_inject:
-                new_module = replace_with_policy(child,
-                                                 _policy,
-                                                 config.triangular_masking,
-                                                 inference=True,
-                                                 layer_id=layer_id)
-            else:
-                new_module = replace_wo_policy(child, _policy, prefix=prefix, state_dict=state_dict)
+            new_module = replace_wo_policy(child, _policy, prefix=prefix, state_dict=state_dict)

        return new_module
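The first hunk above gives both unpacking branches a defined value for all of qkvw, qw, kw and vw, using torch.empty(0) as a sentinel for the set a given policy does not provide, so the later transpose calls can never touch an unbound name. A condensed sketch of that pattern (placeholder policy tuple, not the real policy API):

    import torch

    def unpack_policy_attn(policy_attn):
        # Both branches bind the full set of names, so code that later branches on
        # len(policy_attn) never reads an uninitialized local.
        if len(policy_attn) == 5:
            qkvw, attn_ow, attn_ob, hidden_size, heads = policy_attn
            qw, kw, vw = torch.empty(0), torch.empty(0), torch.empty(0)
        else:
            qw, kw, vw, attn_ow, attn_ob, hidden_size, heads = policy_attn
            qkvw = torch.empty(0)
        return qkvw, qw, kw, vw, attn_ow, attn_ob, hidden_size, heads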

0 commit comments
