
Commit bab411b

Counting bug fix (#69)
* add debug configs
* update step logic
1 parent 296b9f2 commit bab411b

File tree

2 files changed: +35 -6 lines changed


.vscode/launch.json

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "Python Debugger: Current File",
+            "type": "debugpy",
+            "request": "launch",
+            "program": "${file}",
+            "console": "integratedTerminal",
+            "justMyCode": false
+        },
+        {
+            "name": "Python Debugger: accelerate",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "accelerate.commands.launch",
+            "args": [
+                // "--config_file",
+                // "PATH/TO/accelerate_config.yaml",
+                // "PATH/TO/train.py",
+
+            ],
+            "console": "integratedTerminal",
+            "justMyCode": false
+        },
+    ]
+}
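
The second configuration attaches the debugger to a script launched through Hugging Face accelerate by targeting the accelerate.commands.launch module, and "justMyCode": false lets the debugger step into installed library code. The commented-out entries mark where the launch arguments go; a minimal sketch of a filled-in "args" array, using hypothetical paths that are not part of this commit:

"args": [
    "--config_file",
    "accelerate_config.yaml",
    "train.py"
],

Arguments placed after the script path are forwarded to the training script itself, as with accelerate launch on the command line.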

pytorch_accelerated/trainer.py

Lines changed: 5 additions & 6 deletions
@@ -428,7 +428,7 @@ def train(
         :param num_epochs: the number of epochs to train for
         :param eval_dataset: the dataset to use during evaluation epochs, if this is not provided, evaluation is skipped.
         :param per_device_batch_size: the batch size to use per device
-        :param max_num_train_steps: the maximum number of steps across all processes to train for. If provided, this will override num_epochs
+        :param max_num_train_steps: the maximum number of steps across all processes to train for. If both max_num_train_steps and num_epochs are provided, the smaller of the two limits is used.
         :param gradient_accumulation_steps: accumulate gradients to the specified number of steps to simulate a bigger batch size. By default, this is set to ``1``
         :param gradient_clip_value: if specified, the gradients of the model's parameters will be clipped to the range ``[-gradient_clip_value, gradient_clip_value]``
         :param create_scheduler_fn: a function which accepts an optimizer as an argument and returns a learning rate scheduler
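
To make the new wording concrete, a small worked example with hypothetical numbers (not taken from this commit): with 3 epochs at 100 optimizer updates per epoch, the epoch limit alone would allow 300 updates, so a max_num_train_steps of 250 stops training first.

# Hypothetical values illustrating the "smaller of the two limits" behaviour
num_epochs = 3
num_update_steps_per_epoch = 100
max_num_train_steps = 250

effective_updates = min(num_epochs * num_update_steps_per_epoch, max_num_train_steps)
assert effective_updates == 250  # the step limit is reached before the epoch limit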
@@ -808,9 +808,10 @@ def _run_train_epoch(self, train_dl):
             self,
         )
 
-        # updates across all processes
+        # max steps across all processes
         max_total_update_steps = self.run_config.max_num_train_steps
-
+
+        # updates across all processes
         updates_completed = (
             self.run_history.current_epoch - 1
         ) * self.run_config.num_update_steps_per_epoch
@@ -850,11 +851,9 @@ def _run_train_epoch(self, train_dl):
                 + (step + 1) // self.run_config.gradient_accumulation_steps
             )
 
-            global_process_updates = process_updates * self._accelerator.num_processes
-
             if (
                 self.run_config.max_num_train_steps is not None
-                and global_process_updates >= max_total_update_steps
+                and process_updates >= max_total_update_steps
             ):
                 reached_max_steps = True
                 # Synchronize reached_max_steps across processes
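
Pieced together from the two hunks above, the corrected counting logic behaves roughly like the standalone sketch below. The parameter names mirror the run_config and run_history attributes used in the diff, but the function itself is simplified scaffolding for illustration, not the library's actual training loop.

def reached_step_limit(current_epoch, step, num_update_steps_per_epoch,
                       gradient_accumulation_steps, max_num_train_steps):
    """Return True once max_num_train_steps optimizer updates have been performed.

    step is the zero-based batch index within the current epoch; an optimizer
    update happens once every gradient_accumulation_steps batches.
    """
    if max_num_train_steps is None:
        return False

    # updates completed in previous epochs
    updates_completed = (current_epoch - 1) * num_update_steps_per_epoch

    # updates completed so far, including the current epoch
    process_updates = updates_completed + (step + 1) // gradient_accumulation_steps

    # Before this commit, the count was scaled by the number of processes before
    # the comparison; the fix compares process_updates directly against the limit.
    return process_updates >= max_num_train_steps


# e.g. epoch 2, 50 updates per epoch, no gradient accumulation, limit of 60 updates:
assert reached_step_limit(2, 9, 50, 1, 60) is True   # 50 + 10 >= 60
assert reached_step_limit(2, 8, 50, 1, 60) is False  # 50 + 9 < 60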

0 commit comments
