@@ -41,6 +41,7 @@ import (
4141 "vitess.io/vitess/go/mysql/sqlerror"
4242 "vitess.io/vitess/go/sqlescape"
4343 "vitess.io/vitess/go/sqltypes"
44+ "vitess.io/vitess/go/stats"
4445 "vitess.io/vitess/go/textutil"
4546 "vitess.io/vitess/go/timer"
4647 "vitess.io/vitess/go/vt/binlog/binlogplayer"
7576 ErrMigrationNotFound = errors .New ("migration not found" )
7677)
7778
79+ var (
80+ staleMigrationMinutesStats = stats .NewGauge ("OnlineDDLStaleMigrationMinutes" , "longest stale migration in minutes" )
81+ )
82+
7883var (
7984 // fixCompletedTimestampDone fixes a nil `completed_timestamp` columns, see
8085 // https://github.com/vitessio/vitess/issues/13927
@@ -115,7 +120,8 @@ func registerOnlineDDLFlags(fs *pflag.FlagSet) {
115120
116121const (
117122 maxPasswordLength = 32 // MySQL's *replication* password may not exceed 32 characters
118- staleMigrationMinutes = 180
123+ staleMigrationFailMinutes = 180
124+ staleMigrationWarningMinutes = 5
119125 progressPctStarted float64 = 0
120126 progressPctFull float64 = 100.0
121127 etaSecondsUnknown = - 1
@@ -3121,7 +3127,7 @@ func (e *Executor) reviewRunningMigrations(ctx context.Context) (countRunnning i
31213127 if _ , ok := e .vreplicationLastError [uuid ]; ! ok {
31223128 e .vreplicationLastError [uuid ] = vterrors .NewLastError (
31233129 fmt .Sprintf ("Online DDL migration %v" , uuid ),
3124- staleMigrationMinutes * time .Minute ,
3130+ staleMigrationFailMinutes * time .Minute ,
31253131 )
31263132 }
31273133 lastError := e .vreplicationLastError [uuid ]
@@ -3251,14 +3257,53 @@ func (e *Executor) reviewRunningMigrations(ctx context.Context) (countRunnning i
32513257 return countRunnning , cancellable , nil
32523258}
32533259
3260+ // monitorStaleMigrations checks for stale migrations, i.e. migrations that are in 'running' state
3261+ // but have not updated their liveness timestamp in past X minutes. It updates the stats
3262+ // staleMigrationMinutesStats with the maximum number of stale minutes found, and logs a warning
3263+ // for each stale migration found.
3264+ func (e * Executor ) monitorStaleMigrations (ctx context.Context ) error {
3265+ e .migrationMutex .Lock ()
3266+ defer e .migrationMutex .Unlock ()
3267+
3268+ var maxStaleMinutes int64
3269+
3270+ query , err := sqlparser .ParseAndBind (sqlSelectStaleMigrations ,
3271+ sqltypes .Int64BindVariable (staleMigrationWarningMinutes ),
3272+ )
3273+ if err != nil {
3274+ return err
3275+ }
3276+ r , err := e .execQuery (ctx , query )
3277+ if err != nil {
3278+ return err
3279+ }
3280+ for _ , row := range r .Named ().Rows {
3281+ uuid := row ["migration_uuid" ].ToString ()
3282+ staleMinutes := row .AsInt64 ("stale_minutes" , 0 )
3283+
3284+ onlineDDL , row , err := e .readMigration (ctx , uuid )
3285+ if err != nil {
3286+ return err
3287+ }
3288+ livenessTimestamp := row .AsString ("liveness_timestamp" , "" )
3289+ message := fmt .Sprintf ("stale migration %s: found running but indicates no liveness for %v minutes, since %v" , onlineDDL .UUID , staleMinutes , livenessTimestamp )
3290+ log .Warning ("warnStaleMigrations: %s" , message )
3291+
3292+ maxStaleMinutes = max (maxStaleMinutes , staleMinutes )
3293+ }
3294+ staleMigrationMinutesStats .Set (maxStaleMinutes )
3295+
3296+ return nil
3297+ }
3298+
32543299// reviewStaleMigrations marks as 'failed' migrations whose status is 'running' but which have
32553300// shown no liveness in past X minutes. It also attempts to terminate them
32563301func (e * Executor ) reviewStaleMigrations (ctx context.Context ) error {
32573302 e .migrationMutex .Lock ()
32583303 defer e .migrationMutex .Unlock ()
32593304
32603305 query , err := sqlparser .ParseAndBind (sqlSelectStaleMigrations ,
3261- sqltypes .Int64BindVariable (staleMigrationMinutes ),
3306+ sqltypes .Int64BindVariable (staleMigrationFailMinutes ),
32623307 )
32633308 if err != nil {
32643309 return err
@@ -3275,7 +3320,7 @@ func (e *Executor) reviewStaleMigrations(ctx context.Context) error {
32753320 return err
32763321 }
32773322 log .Infof ("reviewStaleMigrations: stale migration found: %s" , onlineDDL .UUID )
3278- message := fmt .Sprintf ("stale migration %s: found running but indicates no liveness in the past %v minutes" , onlineDDL .UUID , staleMigrationMinutes )
3323+ message := fmt .Sprintf ("stale migration %s: found running but indicates no liveness in the past %v minutes" , onlineDDL .UUID , staleMigrationFailMinutes )
32793324 if onlineDDL .TabletAlias != e .TabletAliasString () {
32803325 // This means another tablet started the migration, and the migration has failed due to the tablet failure (e.g. primary failover)
32813326 if err := e .updateTabletFailure (ctx , onlineDDL .UUID ); err != nil {
@@ -3510,6 +3555,9 @@ func (e *Executor) onMigrationCheckTick() {
35103555 } else if err := e .cancelMigrations (ctx , cancellable , false ); err != nil {
35113556 log .Error (err )
35123557 }
3558+ if err := e .monitorStaleMigrations (ctx ); err != nil {
3559+ log .Error (err )
3560+ }
35133561 if err := e .reviewStaleMigrations (ctx ); err != nil {
35143562 log .Error (err )
35153563 }
0 commit comments