Skip to content

Commit 4fbab4f

Browse files
authored
Online DDL metrics: OnlineDDLStaleMigrationMinutes (#18417)
Signed-off-by: Shlomi Noach <[email protected]>
1 parent 826d78d commit 4fbab4f

File tree

2 files changed

+54
-5
lines changed

2 files changed

+54
-5
lines changed

go/vt/vttablet/onlineddl/executor.go

Lines changed: 52 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ import (
4141
"vitess.io/vitess/go/mysql/sqlerror"
4242
"vitess.io/vitess/go/sqlescape"
4343
"vitess.io/vitess/go/sqltypes"
44+
"vitess.io/vitess/go/stats"
4445
"vitess.io/vitess/go/textutil"
4546
"vitess.io/vitess/go/timer"
4647
"vitess.io/vitess/go/vt/binlog/binlogplayer"
@@ -75,6 +76,10 @@ var (
7576
ErrMigrationNotFound = errors.New("migration not found")
7677
)
7778

79+
var (
80+
staleMigrationMinutesStats = stats.NewGauge("OnlineDDLStaleMigrationMinutes", "longest stale migration in minutes")
81+
)
82+
7883
var (
7984
// fixCompletedTimestampDone fixes a nil `completed_timestamp` columns, see
8085
// https://github.com/vitessio/vitess/issues/13927
@@ -115,7 +120,8 @@ func registerOnlineDDLFlags(fs *pflag.FlagSet) {
115120

116121
const (
117122
maxPasswordLength = 32 // MySQL's *replication* password may not exceed 32 characters
118-
staleMigrationMinutes = 180
123+
staleMigrationFailMinutes = 180
124+
staleMigrationWarningMinutes = 5
119125
progressPctStarted float64 = 0
120126
progressPctFull float64 = 100.0
121127
etaSecondsUnknown = -1
@@ -3121,7 +3127,7 @@ func (e *Executor) reviewRunningMigrations(ctx context.Context) (countRunnning i
31213127
if _, ok := e.vreplicationLastError[uuid]; !ok {
31223128
e.vreplicationLastError[uuid] = vterrors.NewLastError(
31233129
fmt.Sprintf("Online DDL migration %v", uuid),
3124-
staleMigrationMinutes*time.Minute,
3130+
staleMigrationFailMinutes*time.Minute,
31253131
)
31263132
}
31273133
lastError := e.vreplicationLastError[uuid]
@@ -3251,14 +3257,53 @@ func (e *Executor) reviewRunningMigrations(ctx context.Context) (countRunnning i
32513257
return countRunnning, cancellable, nil
32523258
}
32533259

3260+
// monitorStaleMigrations checks for stale migrations, i.e. migrations that are in 'running' state
3261+
// but have not updated their liveness timestamp in past X minutes. It updates the stats
3262+
// staleMigrationMinutesStats with the maximum number of stale minutes found, and logs a warning
3263+
// for each stale migration found.
3264+
func (e *Executor) monitorStaleMigrations(ctx context.Context) error {
3265+
e.migrationMutex.Lock()
3266+
defer e.migrationMutex.Unlock()
3267+
3268+
var maxStaleMinutes int64
3269+
3270+
query, err := sqlparser.ParseAndBind(sqlSelectStaleMigrations,
3271+
sqltypes.Int64BindVariable(staleMigrationWarningMinutes),
3272+
)
3273+
if err != nil {
3274+
return err
3275+
}
3276+
r, err := e.execQuery(ctx, query)
3277+
if err != nil {
3278+
return err
3279+
}
3280+
for _, row := range r.Named().Rows {
3281+
uuid := row["migration_uuid"].ToString()
3282+
staleMinutes := row.AsInt64("stale_minutes", 0)
3283+
3284+
onlineDDL, row, err := e.readMigration(ctx, uuid)
3285+
if err != nil {
3286+
return err
3287+
}
3288+
livenessTimestamp := row.AsString("liveness_timestamp", "")
3289+
message := fmt.Sprintf("stale migration %s: found running but indicates no liveness for %v minutes, since %v", onlineDDL.UUID, staleMinutes, livenessTimestamp)
3290+
log.Warning("warnStaleMigrations: %s", message)
3291+
3292+
maxStaleMinutes = max(maxStaleMinutes, staleMinutes)
3293+
}
3294+
staleMigrationMinutesStats.Set(maxStaleMinutes)
3295+
3296+
return nil
3297+
}
3298+
32543299
// reviewStaleMigrations marks as 'failed' migrations whose status is 'running' but which have
32553300
// shown no liveness in past X minutes. It also attempts to terminate them
32563301
func (e *Executor) reviewStaleMigrations(ctx context.Context) error {
32573302
e.migrationMutex.Lock()
32583303
defer e.migrationMutex.Unlock()
32593304

32603305
query, err := sqlparser.ParseAndBind(sqlSelectStaleMigrations,
3261-
sqltypes.Int64BindVariable(staleMigrationMinutes),
3306+
sqltypes.Int64BindVariable(staleMigrationFailMinutes),
32623307
)
32633308
if err != nil {
32643309
return err
@@ -3275,7 +3320,7 @@ func (e *Executor) reviewStaleMigrations(ctx context.Context) error {
32753320
return err
32763321
}
32773322
log.Infof("reviewStaleMigrations: stale migration found: %s", onlineDDL.UUID)
3278-
message := fmt.Sprintf("stale migration %s: found running but indicates no liveness in the past %v minutes", onlineDDL.UUID, staleMigrationMinutes)
3323+
message := fmt.Sprintf("stale migration %s: found running but indicates no liveness in the past %v minutes", onlineDDL.UUID, staleMigrationFailMinutes)
32793324
if onlineDDL.TabletAlias != e.TabletAliasString() {
32803325
// This means another tablet started the migration, and the migration has failed due to the tablet failure (e.g. primary failover)
32813326
if err := e.updateTabletFailure(ctx, onlineDDL.UUID); err != nil {
@@ -3510,6 +3555,9 @@ func (e *Executor) onMigrationCheckTick() {
35103555
} else if err := e.cancelMigrations(ctx, cancellable, false); err != nil {
35113556
log.Error(err)
35123557
}
3558+
if err := e.monitorStaleMigrations(ctx); err != nil {
3559+
log.Error(err)
3560+
}
35133561
if err := e.reviewStaleMigrations(ctx); err != nil {
35143562
log.Error(err)
35153563
}

go/vt/vttablet/onlineddl/schema.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -357,7 +357,8 @@ const (
357357
LIMIT 1
358358
`
359359
sqlSelectStaleMigrations = `SELECT
360-
migration_uuid
360+
migration_uuid,
361+
timestampdiff(minute, liveness_timestamp, now()) as stale_minutes
361362
FROM _vt.schema_migrations
362363
WHERE
363364
migration_status='running'

0 commit comments

Comments
 (0)