@@ -57,17 +57,17 @@ type clusterAnalysis struct {
5757 durability policy.Durabler
5858}
5959
60- // GetReplicationAnalysis will check for replication problems (dead primary; unreachable primary; etc)
61- func GetReplicationAnalysis (keyspace string , shard string , hints * ReplicationAnalysisHints ) ([]* ReplicationAnalysis , error ) {
62- var result []* ReplicationAnalysis
63- appendAnalysis := func (analysis * ReplicationAnalysis ) {
60+ // GetDetectionAnalysis will check for detected problems (dead primary; unreachable primary; etc)
61+ func GetDetectionAnalysis (keyspace string , shard string , hints * DetectionAnalysisHints ) ([]* DetectionAnalysis , error ) {
62+ var result []* DetectionAnalysis
63+ appendAnalysis := func (analysis * DetectionAnalysis ) {
6464 if analysis .Analysis == NoProblem && len (analysis .StructureAnalysis ) == 0 {
6565 return
6666 }
6767 result = append (result , analysis )
6868 }
6969
70- // TODO(sougou); deprecate ReduceReplicationAnalysisCount
70+ // TODO(sougou): deprecate ReduceDetectionAnalysisCount
7171 args := sqlutils .Args (config .GetReasonableReplicationLagSeconds (), ValidSecondsFromSeenToLastAttemptedCheck (), config .GetReasonableReplicationLagSeconds (), keyspace , shard )
7272 query := `SELECT
7373 vitess_tablet.info AS tablet_info,
@@ -280,7 +280,7 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
280280
281281 clusters := make (map [string ]* clusterAnalysis )
282282 err := db .Db .QueryVTOrc (query , args , func (m sqlutils.RowMap ) error {
283- a := & ReplicationAnalysis {
283+ a := & DetectionAnalysis {
284284 Analysis : NoProblem ,
285285 }
286286
@@ -407,121 +407,122 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
407407 return nil
408408 }
409409 isInvalid := m .GetBool ("is_invalid" )
410- if a .IsClusterPrimary && isInvalid {
410+ switch {
411+ case a .IsClusterPrimary && isInvalid :
411412 a .Analysis = InvalidPrimary
412413 a .Description = "VTOrc hasn't been able to reach the primary even once since restart/shutdown"
413- } else if isInvalid {
414+ case isInvalid :
414415 a .Analysis = InvalidReplica
415416 a .Description = "VTOrc hasn't been able to reach the replica even once since restart/shutdown"
416- } else if a .IsClusterPrimary && ! a .LastCheckValid && a .IsDiskStalled {
417+ case a .IsClusterPrimary && ! a .LastCheckValid && a .IsDiskStalled :
417418 a .Analysis = PrimaryDiskStalled
418419 a .Description = "Primary has a stalled disk"
419420 ca .hasShardWideAction = true
420- } else if a .IsClusterPrimary && ! a .LastCheckValid && a .CountReplicas == 0 {
421+ case a .IsClusterPrimary && ! a .LastCheckValid && a .CountReplicas == 0 :
421422 a .Analysis = DeadPrimaryWithoutReplicas
422423 a .Description = "Primary cannot be reached by vtorc and has no replica"
423424 ca .hasShardWideAction = true
424425 //
425- } else if a .IsClusterPrimary && ! a .LastCheckValid && a .CountValidReplicas == a .CountReplicas && a .CountValidReplicatingReplicas == 0 {
426+ case a .IsClusterPrimary && ! a .LastCheckValid && a .CountValidReplicas == a .CountReplicas && a .CountValidReplicatingReplicas == 0 :
426427 a .Analysis = DeadPrimary
427428 a .Description = "Primary cannot be reached by vtorc and none of its replicas is replicating"
428429 ca .hasShardWideAction = true
429430 //
430- } else if a .IsClusterPrimary && ! a .LastCheckValid && a .CountReplicas > 0 && a .CountValidReplicas == 0 && a .CountValidReplicatingReplicas == 0 {
431+ case a .IsClusterPrimary && ! a .LastCheckValid && a .CountReplicas > 0 && a .CountValidReplicas == 0 && a .CountValidReplicatingReplicas == 0 :
431432 a .Analysis = DeadPrimaryAndReplicas
432433 a .Description = "Primary cannot be reached by vtorc and none of its replicas is replicating"
433434 ca .hasShardWideAction = true
434435 //
435- } else if a .IsClusterPrimary && ! a .LastCheckValid && a .CountValidReplicas < a .CountReplicas && a .CountValidReplicas > 0 && a .CountValidReplicatingReplicas == 0 {
436+ case a .IsClusterPrimary && ! a .LastCheckValid && a .CountValidReplicas < a .CountReplicas && a .CountValidReplicas > 0 && a .CountValidReplicatingReplicas == 0 :
436437 a .Analysis = DeadPrimaryAndSomeReplicas
437438 a .Description = "Primary cannot be reached by vtorc; some of its replicas are unreachable and none of its reachable replicas is replicating"
438439 ca .hasShardWideAction = true
439440 //
440- } else if a .IsClusterPrimary && ! a .IsPrimary {
441+ case a .IsClusterPrimary && ! a .IsPrimary :
441442 a .Analysis = PrimaryHasPrimary
442443 a .Description = "Primary is replicating from somewhere else"
443444 ca .hasShardWideAction = true
444445 //
445- } else if a .IsClusterPrimary && a .IsReadOnly {
446+ case a .IsClusterPrimary && a .IsReadOnly :
446447 a .Analysis = PrimaryIsReadOnly
447448 a .Description = "Primary is read-only"
448449 //
449- } else if a .IsClusterPrimary && policy .SemiSyncAckers (ca .durability , tablet ) != 0 && ! a .SemiSyncPrimaryEnabled {
450+ case a .IsClusterPrimary && policy .SemiSyncAckers (ca .durability , tablet ) != 0 && ! a .SemiSyncPrimaryEnabled :
450451 a .Analysis = PrimarySemiSyncMustBeSet
451452 a .Description = "Primary semi-sync must be set"
452453 //
453- } else if a .IsClusterPrimary && policy .SemiSyncAckers (ca .durability , tablet ) == 0 && a .SemiSyncPrimaryEnabled {
454+ case a .IsClusterPrimary && policy .SemiSyncAckers (ca .durability , tablet ) == 0 && a .SemiSyncPrimaryEnabled :
454455 a .Analysis = PrimarySemiSyncMustNotBeSet
455456 a .Description = "Primary semi-sync must not be set"
456457 //
457- } else if a .IsClusterPrimary && a .CurrentTabletType != topodatapb .TabletType_UNKNOWN && a .CurrentTabletType != topodatapb .TabletType_PRIMARY {
458+ case a .IsClusterPrimary && a .CurrentTabletType != topodatapb .TabletType_UNKNOWN && a .CurrentTabletType != topodatapb .TabletType_PRIMARY :
458459 a .Analysis = PrimaryCurrentTypeMismatch
459460 a .Description = "Primary tablet's current type is not PRIMARY"
460- } else if topo .IsReplicaType (a .TabletType ) && a .ErrantGTID != "" {
461+ case topo .IsReplicaType (a .TabletType ) && a .ErrantGTID != "" :
461462 a .Analysis = ErrantGTIDDetected
462463 a .Description = "Tablet has errant GTIDs"
463- } else if topo .IsReplicaType (a .TabletType ) && ca .primaryAlias == "" && a .ShardPrimaryTermTimestamp .IsZero () {
464+ case topo .IsReplicaType (a .TabletType ) && ca .primaryAlias == "" && a .ShardPrimaryTermTimestamp .IsZero ():
464465 // ClusterHasNoPrimary should only be detected when the shard record doesn't have any primary term start time specified either.
465466 a .Analysis = ClusterHasNoPrimary
466467 a .Description = "Cluster has no primary"
467468 ca .hasShardWideAction = true
468- } else if topo .IsReplicaType (a .TabletType ) && ca .primaryAlias == "" && ! a .ShardPrimaryTermTimestamp .IsZero () {
469+ case topo .IsReplicaType (a .TabletType ) && ca .primaryAlias == "" && ! a .ShardPrimaryTermTimestamp .IsZero ():
469470 // If there are no primary tablets, but the shard primary start time isn't empty, then we know
470471 // the primary tablet was deleted.
471472 a .Analysis = PrimaryTabletDeleted
472473 a .Description = "Primary tablet has been deleted"
473474 ca .hasShardWideAction = true
474- } else if a .IsPrimary && a .SemiSyncBlocked && a .CountSemiSyncReplicasEnabled >= a .SemiSyncPrimaryWaitForReplicaCount {
475+ case a .IsPrimary && a .SemiSyncBlocked && a .CountSemiSyncReplicasEnabled >= a .SemiSyncPrimaryWaitForReplicaCount :
475476 // The primary is reporting that semi-sync monitor is blocked on writes.
476477 // There are enough replicas configured to send semi-sync ACKs such that the primary shouldn't be blocked.
476477 // There is some network disruption in progress. We should run an ERS.
478479 a .Analysis = PrimarySemiSyncBlocked
479480 a .Description = "Writes seem to be blocked on semi-sync acks on the primary, even though sufficient replicas are configured to send ACKs"
480481 ca .hasShardWideAction = true
481- } else if topo .IsReplicaType (a .TabletType ) && ! a .IsReadOnly {
482+ case topo .IsReplicaType (a .TabletType ) && ! a .IsReadOnly :
482483 a .Analysis = ReplicaIsWritable
483484 a .Description = "Replica is writable"
484485 //
485- } else if topo .IsReplicaType (a .TabletType ) && a .IsPrimary {
486+ case topo .IsReplicaType (a .TabletType ) && a .IsPrimary :
486487 a .Analysis = NotConnectedToPrimary
487488 a .Description = "Not connected to the primary"
488489 //
489- } else if topo .IsReplicaType (a .TabletType ) && ! a .IsPrimary && math .Round (a .HeartbeatInterval * 2 ) != float64 (a .ReplicaNetTimeout ) {
490+ case topo .IsReplicaType (a .TabletType ) && ! a .IsPrimary && math .Round (a .HeartbeatInterval * 2 ) != float64 (a .ReplicaNetTimeout ):
490491 a .Analysis = ReplicaMisconfigured
491492 a .Description = "Replica has been misconfigured"
492493 //
493- } else if topo .IsReplicaType (a .TabletType ) && ! a .IsPrimary && ca .primaryAlias != "" && a .AnalyzedInstancePrimaryAlias != ca .primaryAlias {
494+ case topo .IsReplicaType (a .TabletType ) && ! a .IsPrimary && ca .primaryAlias != "" && a .AnalyzedInstancePrimaryAlias != ca .primaryAlias :
494495 a .Analysis = ConnectedToWrongPrimary
495496 a .Description = "Connected to wrong primary"
496497 //
497- } else if topo .IsReplicaType (a .TabletType ) && ! a .IsPrimary && a .ReplicationStopped {
498+ case topo .IsReplicaType (a .TabletType ) && ! a .IsPrimary && a .ReplicationStopped :
498499 a .Analysis = ReplicationStopped
499500 a .Description = "Replication is stopped"
500501 //
501- } else if topo .IsReplicaType (a .TabletType ) && ! a .IsPrimary && policy .IsReplicaSemiSync (ca .durability , primaryTablet , tablet ) && ! a .SemiSyncReplicaEnabled {
502+ case topo .IsReplicaType (a .TabletType ) && ! a .IsPrimary && policy .IsReplicaSemiSync (ca .durability , primaryTablet , tablet ) && ! a .SemiSyncReplicaEnabled :
502503 a .Analysis = ReplicaSemiSyncMustBeSet
503504 a .Description = "Replica semi-sync must be set"
504505 //
505- } else if topo .IsReplicaType (a .TabletType ) && ! a .IsPrimary && ! policy .IsReplicaSemiSync (ca .durability , primaryTablet , tablet ) && a .SemiSyncReplicaEnabled {
506+ case topo .IsReplicaType (a .TabletType ) && ! a .IsPrimary && ! policy .IsReplicaSemiSync (ca .durability , primaryTablet , tablet ) && a .SemiSyncReplicaEnabled :
506507 a .Analysis = ReplicaSemiSyncMustNotBeSet
507508 a .Description = "Replica semi-sync must not be set"
508509 //
509510 // TODO(sougou): Events below here are either ignored or not possible.
510- } else if a .IsPrimary && ! a .LastCheckValid && a .CountLaggingReplicas == a .CountReplicas && a .CountDelayedReplicas < a .CountReplicas && a .CountValidReplicatingReplicas > 0 {
511+ case a .IsPrimary && ! a .LastCheckValid && a .CountLaggingReplicas == a .CountReplicas && a .CountDelayedReplicas < a .CountReplicas && a .CountValidReplicatingReplicas > 0 :
511512 a .Analysis = UnreachablePrimaryWithLaggingReplicas
512513 a .Description = "Primary cannot be reached by vtorc and all of its replicas are lagging"
513514 //
514- } else if a .IsPrimary && ! a .LastCheckValid && ! a .LastCheckPartialSuccess && a .CountValidReplicas > 0 && a .CountValidReplicatingReplicas == a .CountValidReplicas {
515+ case a .IsPrimary && ! a .LastCheckValid && ! a .LastCheckPartialSuccess && a .CountValidReplicas > 0 && a .CountValidReplicatingReplicas == a .CountValidReplicas :
515516 // partial success is here to reduce noise
516517 a .Analysis = UnreachablePrimary
517518 a .Description = "Primary cannot be reached by vtorc but all of its replicas seem to be replicating; possibly a network/host issue"
518519 //
519- } else if a .IsPrimary && ! a .LastCheckValid && ! a .LastCheckPartialSuccess && a .CountValidReplicas > 0 && a .CountValidReplicatingReplicas > 0 && a .CountValidReplicatingReplicas < a .CountValidReplicas {
520+ case a .IsPrimary && ! a .LastCheckValid && ! a .LastCheckPartialSuccess && a .CountValidReplicas > 0 && a .CountValidReplicatingReplicas > 0 && a .CountValidReplicatingReplicas < a .CountValidReplicas :
520521 // partial success is here to reduce noise
521522 a .Analysis = UnreachablePrimaryWithBrokenReplicas
522523 a .Description = "Primary cannot be reached by vtorc but it has (some, but not all) replicating replicas; possibly a network/host issue"
523524 //
524- } else if a .IsPrimary && a .SemiSyncPrimaryEnabled && a .SemiSyncPrimaryStatus && a .SemiSyncPrimaryWaitForReplicaCount > 0 && a .SemiSyncPrimaryClients < a .SemiSyncPrimaryWaitForReplicaCount {
525+ case a .IsPrimary && a .SemiSyncPrimaryEnabled && a .SemiSyncPrimaryStatus && a .SemiSyncPrimaryWaitForReplicaCount > 0 && a .SemiSyncPrimaryClients < a .SemiSyncPrimaryWaitForReplicaCount :
525526 if isStaleBinlogCoordinates {
526527 a .Analysis = LockedSemiSyncPrimary
527528 a .Description = "Semi sync primary is locked since it doesn't get enough replica acknowledgements"
@@ -530,26 +531,26 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
530531 a .Description = "Semi sync primary seems to be locked, more samplings needed to validate"
531532 }
532533 //
533- } else if a .IsPrimary && a .LastCheckValid && a .CountReplicas == 1 && a .CountValidReplicas == a .CountReplicas && a .CountValidReplicatingReplicas == 0 {
534+ case a .IsPrimary && a .LastCheckValid && a .CountReplicas == 1 && a .CountValidReplicas == a .CountReplicas && a .CountValidReplicatingReplicas == 0 :
534535 a .Analysis = PrimarySingleReplicaNotReplicating
535536 a .Description = "Primary is reachable but its single replica is not replicating"
536- } else if a .IsPrimary && a .LastCheckValid && a .CountReplicas == 1 && a .CountValidReplicas == 0 {
537+ case a .IsPrimary && a .LastCheckValid && a .CountReplicas == 1 && a .CountValidReplicas == 0 :
537538 a .Analysis = PrimarySingleReplicaDead
538539 a .Description = "Primary is reachable but its single replica is dead"
539540 //
540- } else if a .IsPrimary && a .LastCheckValid && a .CountReplicas > 1 && a .CountValidReplicas == a .CountReplicas && a .CountValidReplicatingReplicas == 0 {
541+ case a .IsPrimary && a .LastCheckValid && a .CountReplicas > 1 && a .CountValidReplicas == a .CountReplicas && a .CountValidReplicatingReplicas == 0 :
541542 a .Analysis = AllPrimaryReplicasNotReplicating
542543 a .Description = "Primary is reachable but none of its replicas is replicating"
543544 //
544- } else if a .IsPrimary && a .LastCheckValid && a .CountReplicas > 1 && a .CountValidReplicas < a .CountReplicas && a .CountValidReplicas > 0 && a .CountValidReplicatingReplicas == 0 {
545+ case a .IsPrimary && a .LastCheckValid && a .CountReplicas > 1 && a .CountValidReplicas < a .CountReplicas && a .CountValidReplicas > 0 && a .CountValidReplicatingReplicas == 0 :
545546 a .Analysis = AllPrimaryReplicasNotReplicatingOrDead
546547 a .Description = "Primary is reachable but none of its replicas is replicating"
547548 //
549+ // case a.IsPrimary && a.CountReplicas == 0:
550+ // a.Analysis = PrimaryWithoutReplicas
551+ // a.Description = "Primary has no replicas"
552+ // }
548553 }
549- // else if a.IsPrimary && a.CountReplicas == 0 {
550- // a.Analysis = PrimaryWithoutReplicas
551- // a.Description = "Primary has no replicas"
552- // }
553554
554555 {
555556 // Moving on to structure analysis
@@ -606,12 +607,12 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
606607 if err != nil {
607608 log .Error (err )
608609 }
609- // TODO: result, err = getConcensusReplicationAnalysis (result)
610+ // TODO: result, err = getConcensusDetectionAnalysis (result)
610611 return result , err
611612}
612613
613614// postProcessAnalyses is used to update different analyses based on the information gleaned from looking at all the analyses together instead of individual data.
614- func postProcessAnalyses (result []* ReplicationAnalysis , clusters map [string ]* clusterAnalysis ) []* ReplicationAnalysis {
615+ func postProcessAnalyses (result []* DetectionAnalysis , clusters map [string ]* clusterAnalysis ) []* DetectionAnalysis {
615616 for {
616617 // Store whether we have changed the result of replication analysis or not.
617618 resultChanged := false
0 commit comments