Skip to content

Commit 5f1c42d

Browse files
vtorc: allow recoveries to be disabled from startup (vitessio#18005) (#687)
1 parent 7a5f16f commit 5f1c42d

File tree

4 files changed

+20
-0
lines changed

4 files changed

+20
-0
lines changed

go/flags/endtoend/vtorc.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ vtorc \
1717

1818
Flags:
1919
--allow-emergency-reparent Whether VTOrc should be allowed to run emergency reparent operation when it detects a dead primary (default true)
20+
--allow-recovery Whether VTOrc should be allowed to run recovery actions (default true)
2021
--alsologtostderr log to standard error as well as files
2122
--audit-file-location string File location where the audit logs are to be stored
2223
--audit-purge-duration duration Duration for which audit logs are held before being purged. Should be in multiples of days (default 168h0m0s)

go/test/endtoend/vtorc/readtopologyinstance/main_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ func TestReadTopologyInstanceBufferable(t *testing.T) {
5757
"--topo_global_root", clusterInfo.ClusterInstance.VtctlProcess.TopoGlobalRoot,
5858
}
5959
servenv.ParseFlags("vtorc")
60+
config.Config.AllowRecovery = true
6061
config.Config.RecoveryPeriodBlockSeconds = 1
6162
config.Config.InstancePollSeconds = 1
6263
config.MarkConfigurationLoaded()

go/vt/vtorc/config/config.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ var (
6262
tolerableReplicationLag = 0 * time.Second
6363
topoInformationRefreshDuration = 15 * time.Second
6464
recoveryPollDuration = 1 * time.Second
65+
allowRecovery = true
6566
ersEnabled = true
6667
convertTabletsWithErrantGTIDs = false
6768
)
@@ -83,6 +84,7 @@ func RegisterFlags(fs *pflag.FlagSet) {
8384
fs.DurationVar(&tolerableReplicationLag, "tolerable-replication-lag", tolerableReplicationLag, "Amount of replication lag that is considered acceptable for a tablet to be eligible for promotion when Vitess makes the choice of a new primary in PRS")
8485
fs.DurationVar(&topoInformationRefreshDuration, "topo-information-refresh-duration", topoInformationRefreshDuration, "Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topology server")
8586
fs.DurationVar(&recoveryPollDuration, "recovery-poll-duration", recoveryPollDuration, "Timer duration on which VTOrc polls its database to run a recovery")
87+
fs.BoolVar(&allowRecovery, "allow-recovery", allowRecovery, "Whether VTOrc should be allowed to run recovery actions")
8688
fs.BoolVar(&ersEnabled, "allow-emergency-reparent", ersEnabled, "Whether VTOrc should be allowed to run emergency reparent operation when it detects a dead primary")
8789
fs.BoolVar(&convertTabletsWithErrantGTIDs, "change-tablets-with-errant-gtid-to-drained", convertTabletsWithErrantGTIDs, "Whether VTOrc should be changing the type of tablets with errant GTIDs to DRAINED")
8890
}
@@ -106,6 +108,7 @@ type Configuration struct {
106108
WaitReplicasTimeoutSeconds int // Timeout on amount of time to wait for the replicas in case of ERS. Should be a small value because we should fail-fast. Should not be larger than LockTimeout since that is the total time we use for an ERS.
107109
TolerableReplicationLagSeconds int // Amount of replication lag that is considered acceptable for a tablet to be eligible for promotion when Vitess makes the choice of a new primary in PRS.
108110
TopoInformationRefreshSeconds int // Timer duration on which VTOrc refreshes the keyspace and vttablet records from the topo-server.
111+
AllowRecovery bool // Allow recoveries.
109112
RecoveryPollSeconds int // Timer duration on which VTOrc recovery analysis runs
110113
}
111114

@@ -137,6 +140,7 @@ func UpdateConfigValuesFromFlags() {
137140
Config.WaitReplicasTimeoutSeconds = int(waitReplicasTimeout / time.Second)
138141
Config.TolerableReplicationLagSeconds = int(tolerableReplicationLag / time.Second)
139142
Config.TopoInformationRefreshSeconds = int(topoInformationRefreshDuration / time.Second)
143+
Config.AllowRecovery = allowRecovery
140144
Config.RecoveryPollSeconds = int(recoveryPollDuration / time.Second)
141145
}
142146

@@ -150,6 +154,11 @@ func SetERSEnabled(val bool) {
150154
ersEnabled = val
151155
}
152156

157+
// GetAllowRecovery reports whether VTOrc is allowed to run recovery actions.
// It returns the package-level value bound to the --allow-recovery flag
// (default true), not the Config.AllowRecovery field, so assigning to
// Config.AllowRecovery directly does not change what this function returns.
158+
func GetAllowRecovery() bool {
159+
return allowRecovery
160+
}
161+
153162
// ConvertTabletWithErrantGTIDs reports whether VTOrc is allowed to change the tablet type of tablets with errant GTIDs to DRAINED.
154163
func ConvertTabletWithErrantGTIDs() bool {
155164
return convertTabletsWithErrantGTIDs
@@ -181,6 +190,7 @@ func newConfiguration() *Configuration {
181190
PreventCrossDataCenterPrimaryFailover: false,
182191
WaitReplicasTimeoutSeconds: 30,
183192
TopoInformationRefreshSeconds: 15,
193+
AllowRecovery: true,
184194
RecoveryPollSeconds: 1,
185195
}
186196
}

go/vt/vtorc/logic/vtorc.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -355,6 +355,14 @@ func ContinuousDiscovery() {
355355
checkAndRecoverWaitPeriod := 3 * instancePollSecondsDuration()
356356
recentDiscoveryOperationKeys = cache.New(instancePollSecondsDuration(), time.Second)
357357

358+
if !config.GetAllowRecovery() {
359+
log.Info("--allow-recovery is set to 'false', disabling recovery actions")
360+
if err := DisableRecovery(); err != nil {
361+
log.Errorf("failed to disable recoveries: %+v", err)
362+
return
363+
}
364+
}
365+
358366
go handleDiscoveryRequests()
359367

360368
healthTick := time.Tick(config.HealthPollSeconds * time.Second)

0 commit comments

Comments
 (0)