
Commit 738a0c1

update step logic
1 parent 0109255 commit 738a0c1

File tree: 1 file changed (+5 −6 lines)


pytorch_accelerated/trainer.py

Lines changed: 5 additions & 6 deletions
@@ -428,7 +428,7 @@ def train(
         :param num_epochs: the number of epochs to train for
         :param eval_dataset: the dataset to use during evaluation epochs, if this is not provided, evaluation is skipped.
         :param per_device_batch_size: the batch size to use per device
-        :param max_num_train_steps: the maximum number of steps across all processes to train for. If provided, this will override num_epochs
+        :param max_num_train_steps: the maximum number of steps across all processes to train for. If both max_num_train_steps and num_epochs are provided, the smaller of the two limits is used.
         :param gradient_accumulation_steps: accumulate gradients to the specified number of steps to simulate a bigger batch size. By default, this is set to ``1``
         :param gradient_clip_value: if specified, the gradients of the model's parameters will be clipped to the range ``[-gradient_clip_value, gradient_clip_value]``
         :param create_scheduler_fn: a function which accepts an optimizer as an argument and returns a learning rate scheduler
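The revised docstring pins down what happens when both limits are given: training stops at whichever is reached first. A minimal sketch of that semantics with made-up numbers (in practice the trainer derives num_update_steps_per_epoch from the dataset and batch size):

    # Hypothetical values; only the min() relationship is the point here.
    num_epochs = 10
    num_update_steps_per_epoch = 100  # derived by the trainer in practice
    max_num_train_steps = 250

    # Training performs the smaller of the two step budgets.
    effective_updates = min(num_epochs * num_update_steps_per_epoch, max_num_train_steps)
    assert effective_updates == 250  # the step cap wins in this example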
@@ -808,9 +808,10 @@ def _run_train_epoch(self, train_dl):
             self,
         )
 
-        # updates across all processes
+        # max steps across all processes
         max_total_update_steps = self.run_config.max_num_train_steps
-
+
+        # updates across all processes
         updates_completed = (
             self.run_history.current_epoch - 1
         ) * self.run_config.num_update_steps_per_epoch
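This hunk splits the comments so each sits above the quantity it describes: max_total_update_steps is the configured cap, while updates_completed counts optimizer updates already performed in earlier epochs. A self-contained sketch of that bookkeeping, assuming a 1-based current_epoch as the current_epoch - 1 term implies:

    # Assumed values for illustration.
    current_epoch = 3                  # the epoch about to run (1-based)
    num_update_steps_per_epoch = 100

    # Updates performed in the epochs that have already finished.
    updates_completed = (current_epoch - 1) * num_update_steps_per_epoch
    assert updates_completed == 200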
@@ -850,11 +851,9 @@ def _run_train_epoch(self, train_dl):
                 + (step + 1) // self.run_config.gradient_accumulation_steps
             )
 
-            global_process_updates = process_updates * self._accelerator.num_processes
-
             if (
                 self.run_config.max_num_train_steps is not None
-                and global_process_updates >= max_total_update_steps
+                and process_updates >= max_total_update_steps
             ):
                 reached_max_steps = True
                 # Synchronize reached_max_steps across processes
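The final hunk drops the multiplication by self._accelerator.num_processes. A plausible reading, consistent with the comment changes above, is that optimizer updates happen in lockstep on every process, so the per-process counter already equals the global update count, and scaling it by the process count would trip the limit too early. A hedged sketch of the resulting check, mirroring the diff's names inside a simplified batch loop with assumed values:

    # Simplified batch loop; all values are assumptions for illustration.
    gradient_accumulation_steps = 4
    max_total_update_steps = 250
    updates_completed = 200
    reached_max_steps = False

    for step in range(400):  # hypothetical number of batches this epoch
        # One optimizer update per `gradient_accumulation_steps` batches.
        process_updates = updates_completed + (step + 1) // gradient_accumulation_steps
        if (
            max_total_update_steps is not None
            and process_updates >= max_total_update_steps
        ):
            reached_max_steps = True
            break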
