Skip to content
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions .chloggen/supervisor-uid-mismatch.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Use this changelog template to create an entry for release notes.

# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: enhancement

# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
component: opampsupervisor

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: Report the reception of an unexpected UID during bootstrapping

# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
issues: [29864]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext:

# If your change doesn't affect end users or the exported elements of any package,
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
# Optional: The change log or logs in which this entry should be included.
# e.g. '[user]' or '[user, api]'
# Include 'user' if the change is relevant to end users.
# Include 'api' if there is a change to a library API.
# Default: '[user]'
change_logs: []
107 changes: 107 additions & 0 deletions cmd/opampsupervisor/e2e_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ import (
"text/template"
"time"

"go.opentelemetry.io/collector/pdata/ptrace"

"github.com/open-telemetry/opentelemetry-collector-contrib/testbed/testbed"

"github.com/google/uuid"
"github.com/knadh/koanf/parsers/yaml"
"github.com/knadh/koanf/providers/file"
Expand Down Expand Up @@ -1868,3 +1872,106 @@ func findRandomPort() (int, error) {

return port, nil
}

// TestSupervisorEmitBootstrapTelemetry verifies that the Supervisor exports
// its own telemetry while bootstrapping the Collector. It starts a mock
// OTLP/HTTP backend, points the Supervisor's telemetry endpoint at it, and
// expects exactly one "GetBootstrapInfo" span with an Ok status reported under
// the "opamp-supervisor" service name.
func TestSupervisorEmitBootstrapTelemetry(t *testing.T) {
	agentDescription := atomic.Value{}

	// Load the Supervisor config so we can get the location of
	// the Collector that will be run.
	var cfg config.Supervisor
	cfgFile := getSupervisorConfig(t, "nocap", map[string]string{})
	k := koanf.New("::")
	err := k.Load(file.Provider(cfgFile.Name()), yaml.Parser())
	require.NoError(t, err)
	err = k.UnmarshalWithConf("", &cfg, koanf.UnmarshalConf{
		Tag: "mapstructure",
	})
	require.NoError(t, err)

	// Get the binary name and version from the Collector binary
	// using the `components` command that prints a YAML-encoded
	// map of information about the Collector build. Some of this
	// information will be used as defaults for the telemetry
	// attributes.
	agentPath := cfg.Agent.Executable
	componentsInfo, err := exec.Command(agentPath, "components").Output()
	require.NoError(t, err)
	k = koanf.New("::")
	err = k.Load(rawbytes.Provider(componentsInfo), yaml.Parser())
	require.NoError(t, err)
	buildinfo := k.StringMap("buildinfo")
	command := buildinfo["command"]
	version := buildinfo["version"]

	server := newOpAMPServer(
		t,
		defaultConnectingHandler,
		types.ConnectionCallbacks{
			OnMessage: func(_ context.Context, _ types.Connection, message *protobufs.AgentToServer) *protobufs.ServerToAgent {
				if message.AgentDescription != nil {
					agentDescription.Store(message.AgentDescription)
				}

				return &protobufs.ServerToAgent{}
			},
		})

	// Use a free port for the mock telemetry backend instead of a fixed one,
	// so the test does not fail when the default OTLP/HTTP port is already
	// in use on the machine running the tests.
	outputPath := filepath.Join(t.TempDir(), "output.txt")
	telemetryPort, err := findRandomPort()
	require.NoError(t, err)
	backend := testbed.NewOTLPHTTPDataReceiver(telemetryPort)
	mockBackend := testbed.NewMockBackend(outputPath, backend)
	mockBackend.EnableRecording()
	defer mockBackend.Stop()
	require.NoError(t, mockBackend.Start())

	s := newSupervisor(t,
		"emit_telemetry",
		map[string]string{
			"url":          server.addr,
			"telemetryUrl": fmt.Sprintf("localhost:%d", telemetryPort),
		},
	)

	require.NoError(t, s.Start())
	defer s.Shutdown()

	waitForSupervisorConnection(server.supervisorConnected, true)

	require.Eventually(t, func() bool {
		ad, ok := agentDescription.Load().(*protobufs.AgentDescription)
		if !ok {
			return false
		}

		var agentName, agentVersion string
		identAttr := ad.IdentifyingAttributes
		for _, attr := range identAttr {
			switch attr.Key {
			case semconv.AttributeServiceName:
				agentName = attr.Value.GetStringValue()
			case semconv.AttributeServiceVersion:
				agentVersion = attr.Value.GetStringValue()
			}
		}

		// By default, the Collector should report its name and version
		// from the component.BuildInfo struct built into the Collector
		// binary.
		return agentName == command && agentVersion == version
	}, 5*time.Second, 250*time.Millisecond)

	require.EventuallyWithT(t, func(collect *assert.CollectT) {
		require.Len(collect, mockBackend.ReceivedTraces, 1)
	}, 10*time.Second, 250*time.Millisecond)

	// Exactly one resource, reported under the Supervisor's service name.
	resourceSpans := mockBackend.ReceivedTraces[0].ResourceSpans()
	require.Equal(t, 1, resourceSpans.Len())
	gotServiceName, ok := resourceSpans.At(0).Resource().Attributes().Get(semconv.AttributeServiceName)
	require.True(t, ok)
	require.Equal(t, "opamp-supervisor", gotServiceName.Str())

	// Exactly one span, covering the bootstrap step, that finished successfully.
	scopeSpans := resourceSpans.At(0).ScopeSpans()
	require.Equal(t, 1, scopeSpans.Len())
	spans := scopeSpans.At(0).Spans()
	require.Equal(t, 1, spans.Len())
	bootstrapSpan := spans.At(0)
	require.Equal(t, "GetBootstrapInfo", bootstrapSpan.Name())
	require.Equal(t, ptrace.StatusCodeOk, bootstrapSpan.Status().Code())
}
21 changes: 20 additions & 1 deletion cmd/opampsupervisor/examples/supervisor_darwin.yaml
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
server:
endpoint: wss://127.0.0.1:4320/v1/opamp
endpoint: ws://127.0.0.1:4320/v1/opamp
tls:
# Disable verification to test locally.
# Don't do this in production.
insecure_skip_verify: true
# For more TLS settings see config/configtls.ClientConfig
insecure: true
#cert_file: /path/to/opamp-go/internal/certs/server_certs/server.cert.pem

capabilities:
reports_effective_config: true
Expand All @@ -20,3 +22,20 @@ agent:

storage:
directory: .

telemetry:
traces:
processors:
- simple:
exporter:
otlp:
protocol: http/protobuf
endpoint: http://localhost:4318
logs:
level: debug
processors:
- simple:
exporter:
otlp:
protocol: http/protobuf
endpoint: http://localhost:4318
8 changes: 4 additions & 4 deletions cmd/opampsupervisor/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ require (
go.opentelemetry.io/collector/service v0.122.0
go.opentelemetry.io/contrib/bridges/otelzap v0.10.0
go.opentelemetry.io/contrib/otelconf v0.15.0
go.opentelemetry.io/otel v1.35.0
go.opentelemetry.io/otel/log v0.11.0
go.opentelemetry.io/otel/trace v1.35.0
go.uber.org/goleak v1.3.0
go.uber.org/multierr v1.11.0
go.uber.org/zap v1.27.0
Expand Down Expand Up @@ -185,7 +187,6 @@ require (
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect
go.opentelemetry.io/contrib/propagators/b3 v1.35.0 // indirect
go.opentelemetry.io/contrib/zpages v0.60.0 // indirect
go.opentelemetry.io/otel v1.35.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploggrpc v0.11.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlplog/otlploghttp v0.11.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.35.0 // indirect
Expand All @@ -201,14 +202,13 @@ require (
go.opentelemetry.io/otel/sdk v1.35.0 // indirect
go.opentelemetry.io/otel/sdk/log v0.11.0 // indirect
go.opentelemetry.io/otel/sdk/metric v1.35.0 // indirect
go.opentelemetry.io/otel/trace v1.35.0 // indirect
go.opentelemetry.io/proto/otlp v1.5.0 // indirect
golang.org/x/exp v0.0.0-20250210185358-939b2ce775ac // indirect
golang.org/x/net v0.37.0 // indirect
golang.org/x/text v0.23.0 // indirect
gonum.org/v1/gonum v0.15.1 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20250303144028-a0af3efb3deb // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20250313205543-e70fdf4c4cb4 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20250313205543-e70fdf4c4cb4 // indirect
google.golang.org/grpc v1.71.0 // indirect
gopkg.in/yaml.v2 v2.4.0 // indirect
)
Expand Down
8 changes: 4 additions & 4 deletions cmd/opampsupervisor/go.sum

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

57 changes: 51 additions & 6 deletions cmd/opampsupervisor/supervisor/supervisor.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,9 @@ import (
semconv "go.opentelemetry.io/collector/semconv/v1.21.0"
"go.opentelemetry.io/contrib/bridges/otelzap"
telemetryconfig "go.opentelemetry.io/contrib/otelconf/v0.3.0"
"go.opentelemetry.io/otel/codes"
"go.opentelemetry.io/otel/log"
"go.opentelemetry.io/otel/trace"
"go.uber.org/multierr"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
Expand All @@ -66,6 +68,8 @@ var (

lastRecvRemoteConfigFile = "last_recv_remote_config.dat"
lastRecvOwnTelemetryConfigFile = "last_recv_own_telemetry_config.dat"

errNonMatchingInstanceUID = errors.New("received collector instance UID does not match expected UID set by the supervisor")
)

const (
Expand Down Expand Up @@ -387,18 +391,23 @@ func (s *Supervisor) createTemplates() error {
// shuts down the Collector. This only needs to happen
// once per Collector binary.
func (s *Supervisor) getBootstrapInfo() (err error) {
_, span := s.getTracer().Start(context.Background(), "GetBootstrapInfo")
defer span.End()
s.opampServerPort, err = s.getSupervisorOpAMPServerPort()
if err != nil {
span.SetStatus(codes.Error, fmt.Sprintf("Could not get supervisor opamp service port: %v", err))
return err
}

bootstrapConfig, err := s.composeNoopConfig()
if err != nil {
span.SetStatus(codes.Error, fmt.Sprintf("Could not compose noop config config: %v", err))
return err
}

err = os.WriteFile(s.agentConfigFilePath(), bootstrapConfig, 0o600)
if err != nil {
span.SetStatus(codes.Error, fmt.Sprintf("Failed to write agent config: %v", err))
return fmt.Errorf("failed to write agent config: %w", err)
}

Expand Down Expand Up @@ -429,13 +438,13 @@ func (s *Supervisor) getBootstrapInfo() (err error) {

for _, attr := range identAttr {
if attr.Key == semconv.AttributeServiceInstanceID {
// TODO: Consider whether to attempt restarting the Collector.
// https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/29864
if attr.Value.GetStringValue() != s.persistentState.InstanceID.String() {
done <- fmt.Errorf(
"the Collector's instance ID (%s) does not match with the instance ID set by the Supervisor (%s)",
"the Collector's instance ID (%s) does not match with the instance ID set by the Supervisor (%s): %w",
attr.Value.GetStringValue(),
s.persistentState.InstanceID.String())
s.persistentState.InstanceID.String(),
errNonMatchingInstanceUID,
)
return response
}
instanceIDSeen = true
Expand Down Expand Up @@ -480,6 +489,7 @@ func (s *Supervisor) getBootstrapInfo() (err error) {
},
}.toServerSettings())
if err != nil {
span.SetStatus(codes.Error, fmt.Sprintf("Could not start OpAMP server: %v", err))
return err
}

Expand All @@ -496,10 +506,12 @@ func (s *Supervisor) getBootstrapInfo() (err error) {
"--config", s.agentConfigFilePath(),
)
if err != nil {
span.SetStatus(codes.Error, fmt.Sprintf("Could not start Agent: %v", err))
return err
}

if err = cmd.Start(context.Background()); err != nil {
span.SetStatus(codes.Error, fmt.Sprintf("Could not start Agent: %v", err))
return err
}

Expand All @@ -512,11 +524,39 @@ func (s *Supervisor) getBootstrapInfo() (err error) {
select {
case <-time.After(s.config.Agent.BootstrapTimeout):
if connected.Load() {
return errors.New("collector connected but never responded with an AgentDescription message")
msg := "collector connected but never responded with an AgentDescription message"
span.SetStatus(codes.Error, msg)
return errors.New(msg)
} else {
return errors.New("collector's OpAMP client never connected to the Supervisor")
msg := "collector's OpAMP client never connected to the Supervisor"
span.SetStatus(codes.Error, msg)
return errors.New(msg)
}
case err = <-done:
if errors.Is(err, errNonMatchingInstanceUID) {
// try to report the issue to the OpAMP server
if startOpAMPErr := s.startOpAMPClient(); startOpAMPErr == nil {
defer func(s *Supervisor) {
if stopErr := s.stopOpAMPClient(); stopErr != nil {
s.telemetrySettings.Logger.Error("Could not stop OpAmp client", zap.Error(stopErr))
}
}(s)
if healthErr := s.opampClient.SetHealth(&protobufs.ComponentHealth{
Healthy: false,
LastError: err.Error(),
}); healthErr != nil {
s.telemetrySettings.Logger.Error("Could not report health to OpAMP server", zap.Error(healthErr))
}
} else {
s.telemetrySettings.Logger.Error("Could not start OpAMP client to report health to server", zap.Error(startOpAMPErr))
}
}
if err != nil {
s.telemetrySettings.Logger.Error("Could not complete bootstrap", zap.Error(err))
span.SetStatus(codes.Error, err.Error())
} else {
span.SetStatus(codes.Ok, "")
}
return err
}
}
Expand Down Expand Up @@ -1641,6 +1681,11 @@ func (s *Supervisor) findRandomPort() (int, error) {
return port, nil
}

// getTracer returns a Tracer obtained from the TracerProvider in the
// Supervisor's telemetry settings, using the opampsupervisor package path as
// the instrumentation scope name.
func (s *Supervisor) getTracer() trace.Tracer {
	const scopeName = "github.com/open-telemetry/opentelemetry-collector-contrib/cmd/opampsupervisor"
	return s.telemetrySettings.TracerProvider.Tracer(scopeName)
}

// The default koanf behavior is to override lists in the config.
// Instead, we provide this function, which merges the source and destination config's
// extension lists by concatenating the two.
Expand Down
Loading