Skip to content

Commit 80fccbc

Browse files
vtorc: rename ReplicationAnalysis -> DetectionAnalysis (#18615)
Signed-off-by: Tim Vaillancourt <[email protected]>
1 parent 57a5620 commit 80fccbc

File tree

10 files changed

+141
-130
lines changed

10 files changed

+141
-130
lines changed

changelog/23.0/23.0.0/summary.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
- [Aggregated Discovery Metrics HTTP API removed](#aggregated-discovery-metrics-api-removed)
2020
- [Dynamic control of `EmergencyReparentShard`-based recoveries](#vtorc-dynamic-ers-disabled)
2121
- [Recovery stats to include keyspace/shard](#recoveries-stats-keyspace-shard)
22+
- [`/api/replication-analysis` HTTP API deprecation](#replication-analysis-api-deprecation)
2223
- **[VTTablet](#minor-changes-vttablet)**
2324
- [CLI Flags](#flags-vttablet)
2425
- [Managed MySQL configuration defaults to caching-sha2-password](#mysql-caching-sha2-password)
@@ -117,6 +118,10 @@ The following recovery-related stats now include labels for keyspaces and shards
117118

118119
Prior to this release, only the recovery "type" was included in labels.
119120

121+
#### <a id="replication-analysis-api-deprecation"/>`/api/replication-analysis` HTTP API deprecation</a>
122+
123+
The `/api/replication-analysis` HTTP API endpoint is now deprecated and is replaced with `/api/detection-analysis`, which currently returns the same response format.
124+
120125
### <a id="minor-changes-vttablet"/>VTTablet</a>
121126

122127
#### <a id="flags-vttablet"/>CLI Flags</a>

go/vt/vtorc/inst/analysis.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -81,12 +81,12 @@ const (
8181
// Key of this map is an InstanceAnalysis.String()
8282
type PeerAnalysisMap map[string]int
8383

84-
type ReplicationAnalysisHints struct {
84+
type DetectionAnalysisHints struct {
8585
AuditAnalysis bool
8686
}
8787

88-
// ReplicationAnalysis notes analysis on replication chain status, per instance
89-
type ReplicationAnalysis struct {
88+
// DetectionAnalysis represents an analysis of a detected problem.
89+
type DetectionAnalysis struct {
9090
AnalyzedInstanceAlias string
9191
AnalyzedInstancePrimaryAlias string
9292
TabletType topodatapb.TabletType
@@ -139,11 +139,11 @@ type ReplicationAnalysis struct {
139139
IsDiskStalled bool
140140
}
141141

142-
func (replicationAnalysis *ReplicationAnalysis) MarshalJSON() ([]byte, error) {
142+
func (detectionAnalysis *DetectionAnalysis) MarshalJSON() ([]byte, error) {
143143
i := struct {
144-
ReplicationAnalysis
144+
DetectionAnalysis
145145
}{}
146-
i.ReplicationAnalysis = *replicationAnalysis
146+
i.DetectionAnalysis = *detectionAnalysis
147147

148148
return json.Marshal(i)
149149
}

go/vt/vtorc/inst/analysis_dao.go

Lines changed: 44 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -57,17 +57,17 @@ type clusterAnalysis struct {
5757
durability policy.Durabler
5858
}
5959

60-
// GetReplicationAnalysis will check for replication problems (dead primary; unreachable primary; etc)
61-
func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAnalysisHints) ([]*ReplicationAnalysis, error) {
62-
var result []*ReplicationAnalysis
63-
appendAnalysis := func(analysis *ReplicationAnalysis) {
60+
// GetDetectionAnalysis will check for detected problems (dead primary; unreachable primary; etc)
61+
func GetDetectionAnalysis(keyspace string, shard string, hints *DetectionAnalysisHints) ([]*DetectionAnalysis, error) {
62+
var result []*DetectionAnalysis
63+
appendAnalysis := func(analysis *DetectionAnalysis) {
6464
if analysis.Analysis == NoProblem && len(analysis.StructureAnalysis) == 0 {
6565
return
6666
}
6767
result = append(result, analysis)
6868
}
6969

70-
// TODO(sougou); deprecate ReduceReplicationAnalysisCount
70+
// TODO(sougou); deprecate ReduceDetectionAnalysisCount
7171
args := sqlutils.Args(config.GetReasonableReplicationLagSeconds(), ValidSecondsFromSeenToLastAttemptedCheck(), config.GetReasonableReplicationLagSeconds(), keyspace, shard)
7272
query := `SELECT
7373
vitess_tablet.info AS tablet_info,
@@ -280,7 +280,7 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
280280

281281
clusters := make(map[string]*clusterAnalysis)
282282
err := db.Db.QueryVTOrc(query, args, func(m sqlutils.RowMap) error {
283-
a := &ReplicationAnalysis{
283+
a := &DetectionAnalysis{
284284
Analysis: NoProblem,
285285
}
286286

@@ -407,121 +407,122 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
407407
return nil
408408
}
409409
isInvalid := m.GetBool("is_invalid")
410-
if a.IsClusterPrimary && isInvalid {
410+
switch {
411+
case a.IsClusterPrimary && isInvalid:
411412
a.Analysis = InvalidPrimary
412413
a.Description = "VTOrc hasn't been able to reach the primary even once since restart/shutdown"
413-
} else if isInvalid {
414+
case isInvalid:
414415
a.Analysis = InvalidReplica
415416
a.Description = "VTOrc hasn't been able to reach the replica even once since restart/shutdown"
416-
} else if a.IsClusterPrimary && !a.LastCheckValid && a.IsDiskStalled {
417+
case a.IsClusterPrimary && !a.LastCheckValid && a.IsDiskStalled:
417418
a.Analysis = PrimaryDiskStalled
418419
a.Description = "Primary has a stalled disk"
419420
ca.hasShardWideAction = true
420-
} else if a.IsClusterPrimary && !a.LastCheckValid && a.CountReplicas == 0 {
421+
case a.IsClusterPrimary && !a.LastCheckValid && a.CountReplicas == 0:
421422
a.Analysis = DeadPrimaryWithoutReplicas
422423
a.Description = "Primary cannot be reached by vtorc and has no replica"
423424
ca.hasShardWideAction = true
424425
//
425-
} else if a.IsClusterPrimary && !a.LastCheckValid && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 {
426+
case a.IsClusterPrimary && !a.LastCheckValid && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0:
426427
a.Analysis = DeadPrimary
427428
a.Description = "Primary cannot be reached by vtorc and none of its replicas is replicating"
428429
ca.hasShardWideAction = true
429430
//
430-
} else if a.IsClusterPrimary && !a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas == 0 && a.CountValidReplicatingReplicas == 0 {
431+
case a.IsClusterPrimary && !a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas == 0 && a.CountValidReplicatingReplicas == 0:
431432
a.Analysis = DeadPrimaryAndReplicas
432433
a.Description = "Primary cannot be reached by vtorc and none of its replicas is replicating"
433434
ca.hasShardWideAction = true
434435
//
435-
} else if a.IsClusterPrimary && !a.LastCheckValid && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0 {
436+
case a.IsClusterPrimary && !a.LastCheckValid && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0:
436437
a.Analysis = DeadPrimaryAndSomeReplicas
437438
a.Description = "Primary cannot be reached by vtorc; some of its replicas are unreachable and none of its reachable replicas is replicating"
438439
ca.hasShardWideAction = true
439440
//
440-
} else if a.IsClusterPrimary && !a.IsPrimary {
441+
case a.IsClusterPrimary && !a.IsPrimary:
441442
a.Analysis = PrimaryHasPrimary
442443
a.Description = "Primary is replicating from somewhere else"
443444
ca.hasShardWideAction = true
444445
//
445-
} else if a.IsClusterPrimary && a.IsReadOnly {
446+
case a.IsClusterPrimary && a.IsReadOnly:
446447
a.Analysis = PrimaryIsReadOnly
447448
a.Description = "Primary is read-only"
448449
//
449-
} else if a.IsClusterPrimary && policy.SemiSyncAckers(ca.durability, tablet) != 0 && !a.SemiSyncPrimaryEnabled {
450+
case a.IsClusterPrimary && policy.SemiSyncAckers(ca.durability, tablet) != 0 && !a.SemiSyncPrimaryEnabled:
450451
a.Analysis = PrimarySemiSyncMustBeSet
451452
a.Description = "Primary semi-sync must be set"
452453
//
453-
} else if a.IsClusterPrimary && policy.SemiSyncAckers(ca.durability, tablet) == 0 && a.SemiSyncPrimaryEnabled {
454+
case a.IsClusterPrimary && policy.SemiSyncAckers(ca.durability, tablet) == 0 && a.SemiSyncPrimaryEnabled:
454455
a.Analysis = PrimarySemiSyncMustNotBeSet
455456
a.Description = "Primary semi-sync must not be set"
456457
//
457-
} else if a.IsClusterPrimary && a.CurrentTabletType != topodatapb.TabletType_UNKNOWN && a.CurrentTabletType != topodatapb.TabletType_PRIMARY {
458+
case a.IsClusterPrimary && a.CurrentTabletType != topodatapb.TabletType_UNKNOWN && a.CurrentTabletType != topodatapb.TabletType_PRIMARY:
458459
a.Analysis = PrimaryCurrentTypeMismatch
459460
a.Description = "Primary tablet's current type is not PRIMARY"
460-
} else if topo.IsReplicaType(a.TabletType) && a.ErrantGTID != "" {
461+
case topo.IsReplicaType(a.TabletType) && a.ErrantGTID != "":
461462
a.Analysis = ErrantGTIDDetected
462463
a.Description = "Tablet has errant GTIDs"
463-
} else if topo.IsReplicaType(a.TabletType) && ca.primaryAlias == "" && a.ShardPrimaryTermTimestamp.IsZero() {
464+
case topo.IsReplicaType(a.TabletType) && ca.primaryAlias == "" && a.ShardPrimaryTermTimestamp.IsZero():
464465
// ClusterHasNoPrimary should only be detected when the shard record doesn't have any primary term start time specified either.
465466
a.Analysis = ClusterHasNoPrimary
466467
a.Description = "Cluster has no primary"
467468
ca.hasShardWideAction = true
468-
} else if topo.IsReplicaType(a.TabletType) && ca.primaryAlias == "" && !a.ShardPrimaryTermTimestamp.IsZero() {
469+
case topo.IsReplicaType(a.TabletType) && ca.primaryAlias == "" && !a.ShardPrimaryTermTimestamp.IsZero():
469470
// If there are no primary tablets, but the shard primary start time isn't empty, then we know
470471
// the primary tablet was deleted.
471472
a.Analysis = PrimaryTabletDeleted
472473
a.Description = "Primary tablet has been deleted"
473474
ca.hasShardWideAction = true
474-
} else if a.IsPrimary && a.SemiSyncBlocked && a.CountSemiSyncReplicasEnabled >= a.SemiSyncPrimaryWaitForReplicaCount {
475+
case a.IsPrimary && a.SemiSyncBlocked && a.CountSemiSyncReplicasEnabled >= a.SemiSyncPrimaryWaitForReplicaCount:
475476
// The primary is reporting that semi-sync monitor is blocked on writes.
476477
// There are enough replicas configured to send semi-sync ACKs such that the primary shouldn't be blocked.
477478
// There is some network disruption in progress. We should run an ERS.
478479
a.Analysis = PrimarySemiSyncBlocked
479480
a.Description = "Writes seem to be blocked on semi-sync acks on the primary, even though sufficient replicas are configured to send ACKs"
480481
ca.hasShardWideAction = true
481-
} else if topo.IsReplicaType(a.TabletType) && !a.IsReadOnly {
482+
case topo.IsReplicaType(a.TabletType) && !a.IsReadOnly:
482483
a.Analysis = ReplicaIsWritable
483484
a.Description = "Replica is writable"
484485
//
485-
} else if topo.IsReplicaType(a.TabletType) && a.IsPrimary {
486+
case topo.IsReplicaType(a.TabletType) && a.IsPrimary:
486487
a.Analysis = NotConnectedToPrimary
487488
a.Description = "Not connected to the primary"
488489
//
489-
} else if topo.IsReplicaType(a.TabletType) && !a.IsPrimary && math.Round(a.HeartbeatInterval*2) != float64(a.ReplicaNetTimeout) {
490+
case topo.IsReplicaType(a.TabletType) && !a.IsPrimary && math.Round(a.HeartbeatInterval*2) != float64(a.ReplicaNetTimeout):
490491
a.Analysis = ReplicaMisconfigured
491492
a.Description = "Replica has been misconfigured"
492493
//
493-
} else if topo.IsReplicaType(a.TabletType) && !a.IsPrimary && ca.primaryAlias != "" && a.AnalyzedInstancePrimaryAlias != ca.primaryAlias {
494+
case topo.IsReplicaType(a.TabletType) && !a.IsPrimary && ca.primaryAlias != "" && a.AnalyzedInstancePrimaryAlias != ca.primaryAlias:
494495
a.Analysis = ConnectedToWrongPrimary
495496
a.Description = "Connected to wrong primary"
496497
//
497-
} else if topo.IsReplicaType(a.TabletType) && !a.IsPrimary && a.ReplicationStopped {
498+
case topo.IsReplicaType(a.TabletType) && !a.IsPrimary && a.ReplicationStopped:
498499
a.Analysis = ReplicationStopped
499500
a.Description = "Replication is stopped"
500501
//
501-
} else if topo.IsReplicaType(a.TabletType) && !a.IsPrimary && policy.IsReplicaSemiSync(ca.durability, primaryTablet, tablet) && !a.SemiSyncReplicaEnabled {
502+
case topo.IsReplicaType(a.TabletType) && !a.IsPrimary && policy.IsReplicaSemiSync(ca.durability, primaryTablet, tablet) && !a.SemiSyncReplicaEnabled:
502503
a.Analysis = ReplicaSemiSyncMustBeSet
503504
a.Description = "Replica semi-sync must be set"
504505
//
505-
} else if topo.IsReplicaType(a.TabletType) && !a.IsPrimary && !policy.IsReplicaSemiSync(ca.durability, primaryTablet, tablet) && a.SemiSyncReplicaEnabled {
506+
case topo.IsReplicaType(a.TabletType) && !a.IsPrimary && !policy.IsReplicaSemiSync(ca.durability, primaryTablet, tablet) && a.SemiSyncReplicaEnabled:
506507
a.Analysis = ReplicaSemiSyncMustNotBeSet
507508
a.Description = "Replica semi-sync must not be set"
508509
//
509510
// TODO(sougou): Events below here are either ignored or not possible.
510-
} else if a.IsPrimary && !a.LastCheckValid && a.CountLaggingReplicas == a.CountReplicas && a.CountDelayedReplicas < a.CountReplicas && a.CountValidReplicatingReplicas > 0 {
511+
case a.IsPrimary && !a.LastCheckValid && a.CountLaggingReplicas == a.CountReplicas && a.CountDelayedReplicas < a.CountReplicas && a.CountValidReplicatingReplicas > 0:
511512
a.Analysis = UnreachablePrimaryWithLaggingReplicas
512513
a.Description = "Primary cannot be reached by vtorc and all of its replicas are lagging"
513514
//
514-
} else if a.IsPrimary && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == a.CountValidReplicas {
515+
case a.IsPrimary && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == a.CountValidReplicas:
515516
// partial success is here to reduce noise
516517
a.Analysis = UnreachablePrimary
517518
a.Description = "Primary cannot be reached by vtorc but all of its replicas seem to be replicating; possibly a network/host issue"
518519
//
519-
} else if a.IsPrimary && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 && a.CountValidReplicatingReplicas < a.CountValidReplicas {
520+
case a.IsPrimary && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 && a.CountValidReplicatingReplicas < a.CountValidReplicas:
520521
// partial success is here to reduce noise
521522
a.Analysis = UnreachablePrimaryWithBrokenReplicas
522523
a.Description = "Primary cannot be reached by vtorc but it has (some, but not all) replicating replicas; possibly a network/host issue"
523524
//
524-
} else if a.IsPrimary && a.SemiSyncPrimaryEnabled && a.SemiSyncPrimaryStatus && a.SemiSyncPrimaryWaitForReplicaCount > 0 && a.SemiSyncPrimaryClients < a.SemiSyncPrimaryWaitForReplicaCount {
525+
case a.IsPrimary && a.SemiSyncPrimaryEnabled && a.SemiSyncPrimaryStatus && a.SemiSyncPrimaryWaitForReplicaCount > 0 && a.SemiSyncPrimaryClients < a.SemiSyncPrimaryWaitForReplicaCount:
525526
if isStaleBinlogCoordinates {
526527
a.Analysis = LockedSemiSyncPrimary
527528
a.Description = "Semi sync primary is locked since it doesn't get enough replica acknowledgements"
@@ -530,26 +531,26 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
530531
a.Description = "Semi sync primary seems to be locked, more samplings needed to validate"
531532
}
532533
//
533-
} else if a.IsPrimary && a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 {
534+
case a.IsPrimary && a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0:
534535
a.Analysis = PrimarySingleReplicaNotReplicating
535536
a.Description = "Primary is reachable but its single replica is not replicating"
536-
} else if a.IsPrimary && a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == 0 {
537+
case a.IsPrimary && a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == 0:
537538
a.Analysis = PrimarySingleReplicaDead
538539
a.Description = "Primary is reachable but its single replica is dead"
539540
//
540-
} else if a.IsPrimary && a.LastCheckValid && a.CountReplicas > 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 {
541+
case a.IsPrimary && a.LastCheckValid && a.CountReplicas > 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0:
541542
a.Analysis = AllPrimaryReplicasNotReplicating
542543
a.Description = "Primary is reachable but none of its replicas is replicating"
543544
//
544-
} else if a.IsPrimary && a.LastCheckValid && a.CountReplicas > 1 && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0 {
545+
case a.IsPrimary && a.LastCheckValid && a.CountReplicas > 1 && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0:
545546
a.Analysis = AllPrimaryReplicasNotReplicatingOrDead
546547
a.Description = "Primary is reachable but none of its replicas is replicating"
547548
//
549+
// case a.IsPrimary && a.CountReplicas == 0:
550+
// a.Analysis = PrimaryWithoutReplicas
551+
// a.Description = "Primary has no replicas"
552+
// }
548553
}
549-
// else if a.IsPrimary && a.CountReplicas == 0 {
550-
// a.Analysis = PrimaryWithoutReplicas
551-
// a.Description = "Primary has no replicas"
552-
// }
553554

554555
{
555556
// Moving on to structure analysis
@@ -606,12 +607,12 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
606607
if err != nil {
607608
log.Error(err)
608609
}
609-
// TODO: result, err = getConcensusReplicationAnalysis(result)
610+
// TODO: result, err = getConcensusDetectionAnalysis(result)
610611
return result, err
611612
}
612613

613614
// postProcessAnalyses is used to update different analyses based on the information gleaned from looking at all the analyses together instead of individual data.
614-
func postProcessAnalyses(result []*ReplicationAnalysis, clusters map[string]*clusterAnalysis) []*ReplicationAnalysis {
615+
func postProcessAnalyses(result []*DetectionAnalysis, clusters map[string]*clusterAnalysis) []*DetectionAnalysis {
615616
for {
616617
// Store whether we have changed the result of detection analysis or not.
617618
resultChanged := false

0 commit comments

Comments
 (0)