Scaling

If the load on the data collector (Grafana Agent or OpenTelemetry Collector) is small, we recommend processing all necessary telemetry signals in the same process. For example, a single collector can process all of the incoming metrics, logs, and traces.

When telemetry volume increases, you should consider ways to scale the data collector.

This section guides you through our recommendations for scaling the data collector when sampling is enabled.

For Application Observability, we recommend sampling at the data collector after metrics generation, so that all traces are available to generate accurate metrics. The sampling strategy applies to traces, and only sampled traces are sent to the backend.

This pipeline makes the data collector stateful. In the context of tracing, a stateful component is a component that needs to aggregate certain spans to work correctly. The pipeline contains the following stateful components:

  • The span metrics connector needs all spans with the same service.name to be processed by the same collector
  • The service graph connector needs to pair each “client” span with its corresponding “server” span to calculate metrics such as span duration
  • The tail sampling processor needs all spans with the same trace ID to be processed by the same collector (see the routing sketch after this list)
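
Whether these requirements are met depends on how the load-balancing exporter routes spans. A minimal sketch, assuming the OpenTelemetry Collector loadbalancingexporter with illustrative hostnames: the default traceID routing key keeps every span of a trace on the same second-layer collector, which satisfies the tail sampling processor and the service graph connector; routing by service is an alternative when the second layer only generates span metrics.

yaml
exporters:
  loadbalancing:
    # traceID (the default) sends all spans that share a trace ID to the same
    # backend; "service" groups spans by service.name instead.
    routing_key: traceID
    protocol:
      otlp:
    resolver:
      static:
        hostnames: # illustrative second-layer backends
          - collector-1:4317
          - collector-2:4317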

To scale this pipeline, we recommend deploying two layers of collectors. The first layer enriches data, exports application metrics and logs to the backends, and load balances traces using the otelcol.exporter.loadbalancing component for Grafana Agent or the loadbalancingexporter for the OpenTelemetry Collector. The second layer performs metrics generation and sampling, and exports the sampled traces and generated metrics.

Sampling scaled architecture

To differentiate the series generated by different collectors in the second layer, we recommend adding an additional collector_id label. Cardinality issues caused by the collector_id label can be addressed with Adaptive Metrics.
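
The second-layer configurations below read this value from a POD_UID environment variable. One way to provide it, assuming the collectors run as Kubernetes pods, is the downward API; this container spec fragment is a sketch, not part of the official setup:

yaml
# Container spec fragment: expose the pod's UID as POD_UID so the exporter
# can attach it as the collector_id external label.
env:
  - name: POD_UID
    valueFrom:
      fieldRef:
        fieldPath: metadata.uid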

To view the Grafana Agent configuration for the first layer, select the river tab below. To view the OpenTelemetry Collector configuration for the first layer, select the yaml tab below.

river
otelcol.receiver.otlp "default" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.receiver.otlp/

	// configures the default grpc endpoint "0.0.0.0:4317"
	grpc { }
	// configures the default http/protobuf endpoint "0.0.0.0:4318"
	http { }

	output {
		metrics = [otelcol.processor.resourcedetection.default.input]
		logs    = [otelcol.processor.resourcedetection.default.input]
		traces  = [otelcol.processor.resourcedetection.default.input]
	}
}

otelcol.processor.transform "add_resource_attributes_as_metric_attributes" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.processor.transform/
	error_mode = "ignore"

	metric_statements {
		context    = "datapoint"
		statements = [
			"set(attributes[\"deployment.environment\"], resource.attributes[\"deployment.environment\"])",
			"set(attributes[\"service.version\"], resource.attributes[\"service.version\"])",
		]
	}

	output {
		metrics = [otelcol.processor.batch.default.input]
	}
}

otelcol.processor.batch "default" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.processor.batch/
	output {
		metrics = [otelcol.exporter.prometheus.grafana_cloud_prometheus.input]
		logs    = [otelcol.exporter.loki.grafana_cloud_loki.input]
		traces  = [otelcol.exporter.loadbalancing.default.input]
	}
}

otelcol.processor.resourcedetection "default" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.processor.resourcedetection/
	detectors = ["env", "system"]

	system {
		hostname_sources = ["os"]
	}

	output {
		metrics = [otelcol.processor.transform.add_resource_attributes_as_metric_attributes.input]
		logs    = [otelcol.processor.batch.default.input]
		traces  = [
			otelcol.processor.batch.default.input,
			otelcol.connector.host_info.default.input,
		]
	}
}

otelcol.connector.host_info "default" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.connector.host_info/
	host_identifiers = ["host.name"]

	output {
		metrics = [otelcol.processor.batch.default.input]
	}
}

otelcol.exporter.loki "grafana_cloud_loki" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.exporter.loki/
	forward_to = [loki.write.grafana_cloud_loki.receiver]
}

otelcol.exporter.prometheus "grafana_cloud_prometheus" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.exporter.prometheus/
	add_metric_suffixes = false
	forward_to          = [prometheus.remote_write.grafana_cloud_prometheus.receiver]
}

prometheus.remote_write "grafana_cloud_prometheus" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/prometheus.remote_write/
	endpoint {
		url = env("GRAFANA_CLOUD_PROMETHEUS_URL")

		basic_auth {
			username = env("GRAFANA_CLOUD_PROMETHEUS_USERNAME")
			password = env("GRAFANA_CLOUD_API_KEY")
		}
	}
}

loki.write "grafana_cloud_loki" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/loki.write/
	endpoint {
		url = env("GRAFANA_CLOUD_LOKI_URL")

		basic_auth {
			username = env("GRAFANA_CLOUD_LOKI_USERNAME")
			password = env("GRAFANA_CLOUD_API_KEY")
		}
	}
}

otelcol.exporter.loadbalancing "default" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.exporter.loadbalancing/
	resolver {
		static {
			hostnames = ["collector-1:4317", "collector-2:4317", "collector-3:4317"]
		}
	}

	protocol {
		otlp {
			client { }
		}
	}
}
yaml
# Tested with OpenTelemetry Collector Contrib v0.92.0
receivers:
  otlp:
    # https://github.com/open-telemetry/opentelemetry-collector/tree/main/receiver/otlpreceiver
    protocols:
      grpc:
      http:
  hostmetrics:
    # Optional. Host Metrics Receiver added as an example of Infra Monitoring capabilities of the OpenTelemetry Collector
    # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/hostmetricsreceiver
    scrapers:
      load:
      memory:

processors:
  batch:
  # https://github.com/open-telemetry/opentelemetry-collector/tree/main/processor/batchprocessor
  resourcedetection:
    # Enriches telemetry data with resource information from the host
    # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/resourcedetectionprocessor
    detectors: ["env", "system"]
    override: false
  transform/add_resource_attributes_as_metric_attributes:
    # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/transformprocessor
    error_mode: ignore
    metric_statements:
      - context: datapoint
        statements:
          - set(attributes["deployment.environment"], resource.attributes["deployment.environment"])
          - set(attributes["service.version"], resource.attributes["service.version"])

exporters:
  loadbalancing:
    # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/exporter/loadbalancingexporter
    protocol:
      otlp:
    resolver:
      static:
        hostnames:
          - collector-1:4317
          - collector-2:4317
          - collector-3:4317
  loki/grafana_cloud_logs:
    # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/exporter/lokiexporter
    endpoint: "${env:GRAFANA_CLOUD_LOKI_URL}"
    auth:
      authenticator: basicauth/grafana_cloud_logs

  prometheusremotewrite/grafana_cloud_metrics:
    # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/exporter/prometheusremotewriteexporter
    endpoint: "${env:GRAFANA_CLOUD_PROMETHEUS_URL}"
    auth:
      authenticator: basicauth/grafana_cloud_metrics
    add_metric_suffixes: false

extensions:
  basicauth/grafana_cloud_metrics:
    client_auth:
      username: "${env:GRAFANA_CLOUD_PROMETHEUS_USERNAME}"
      password: "${env:GRAFANA_CLOUD_API_KEY}"
  basicauth/grafana_cloud_logs:
    client_auth:
      username: "${env:GRAFANA_CLOUD_LOKI_USERNAME}"
      password: "${env:GRAFANA_CLOUD_API_KEY}"

service:
  extensions: [basicauth/grafana_cloud_metrics, basicauth/grafana_cloud_logs]
  pipelines:
    traces:
      receivers: [otlp]
      processors: [resourcedetection, batch]
      exporters: [loadbalancing]
    metrics:
      receivers: [otlp, hostmetrics]
      processors:
        [
          resourcedetection,
          transform/add_resource_attributes_as_metric_attributes,
          batch,
        ]
      exporters: [prometheusremotewrite/grafana_cloud_metrics]
    logs:
      receivers: [otlp]
      processors: [resourcedetection, batch]
      exporters: [loki/grafana_cloud_logs]

The first-layer collector is stateless. Scaling a stateless collector is straightforward: an off-the-shelf layer 4 load balancer is sufficient.
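
For example, assuming the first layer runs on Kubernetes, a Service of type LoadBalancer can front the collectors’ OTLP gRPC endpoint; the names below are illustrative:

yaml
# Hypothetical layer 4 load balancer in front of the stateless first layer.
apiVersion: v1
kind: Service
metadata:
  name: otel-gateway
spec:
  type: LoadBalancer # provisions an L4 load balancer on most cloud providers
  selector:
    app: otel-collector-layer1 # must match the first-layer collector pods
  ports:
    - name: otlp-grpc
      port: 4317
      targetPort: 4317
      protocol: TCP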

The load-balancing exporter supports three resolvers: static, dns, and kubernetes.

  • static: A static list of backends is provided in the configuration. This is suitable when the set of backends is fixed and scaling isn’t expected.
  • dns: A hostname is provided as a parameter, which the resolver periodically queries to discover IPs and update the load-balancer ring. When multiple instances are used, they can momentarily have different views of the system while they sync after a refresh. This can result in some spans for the same trace ID being sent to multiple hosts. Determine whether this is acceptable for your system, and use a longer refresh interval to reduce the effect of being out of sync.
  • kubernetes: A resolver that implements a watcher using the Kubernetes API to get notified when the list of pods backing a service changes. This reduces the time during which cluster views differ between instances, making it a better choice than the dns resolver when running on Kubernetes (see the sketch after this list).
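
The configurations in this guide use the static resolver. As a sketch, the dns and kubernetes resolvers would look roughly like this; the headless service name and refresh interval are illustrative assumptions:

yaml
exporters:
  loadbalancing:
    protocol:
      otlp:
    resolver:
      # Configure exactly one resolver.
      dns:
        # A headless service that resolves to the second-layer collector pods.
        hostname: collector-headless.observability.svc.cluster.local
        port: 4317
        interval: 30s # a longer interval reduces time spent out of sync
      # k8s:
      #   # Watches the endpoints of service "collector-headless" in
      #   # namespace "observability" for changes.
      #   service: collector-headless.observability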

To view the Grafana Agent configuration for the second layer, select the river tab below. To view the OpenTelemetry Collector configuration for the second layer, select the yaml tab below.

river
otelcol.receiver.otlp "default" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.receiver.otlp/

	// configures the default grpc endpoint "0.0.0.0:4317"
	grpc { }

	output {
		traces = [
			otelcol.processor.tail_sampling.default.input,
			otelcol.processor.transform.drop_unneeded_resource_attributes.input,
		]
	}
}

otelcol.connector.spanmetrics "default" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.connector.spanmetrics/

	dimension {
		name = "service.namespace"
	}

	dimension {
		name = "service.version"
	}

	dimension {
		name = "deployment.environment"
	}

	histogram {
		explicit {
			buckets = ["0s", "0.005s", "0.01s", "0.025s", "0.05s", "0.075s", "0.1s", "0.25s", "0.5s", "0.75s", "1s", "2.5s", "5s", "7.5s", "10s"]
		}
		unit = "s"
	}

	namespace = "traces.spanmetrics"

	output {
		metrics = [otelcol.processor.filter.drop_unneeded_span_metrics.input]
	}
}

otelcol.processor.transform "use_grafana_metric_names" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.processor.transform/
	error_mode = "ignore"

	metric_statements {
		context    = "metric"
		statements = [
			"set(name, \"traces.spanmetrics.latency\") where name == \"traces.spanmetrics.duration\"",
			"set(name, \"traces.spanmetrics.calls.total\") where name == \"traces.spanmetrics.calls\"",
		]
	}

	output {
		metrics = [otelcol.processor.batch.default.input]
	}
}

otelcol.processor.filter "drop_unneeded_span_metrics" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.processor.filter/
	error_mode = "ignore"

	metrics {
		datapoint = [
			"IsMatch(metric.name, \"traces.spanmetrics.calls|traces.spanmetrics.duration\") and IsMatch(attributes[\"span.kind\"], \"SPAN_KIND_INTERNAL|SPAN_KIND_CLIENT|SPAN_KIND_PRODUCER\")",
		]
	}

	output {
		metrics = [otelcol.processor.transform.use_grafana_metric_names.input]
	}
}

otelcol.processor.transform "drop_unneeded_resource_attributes" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.processor.transform/
	error_mode = "ignore"

	trace_statements {
		context    = "resource"
		statements = [
			"delete_key(attributes, \"k8s.pod.start_time\")",
			"delete_key(attributes, \"os.description\")",
			"delete_key(attributes, \"os.type\")",
			"delete_key(attributes, \"process.command_args\")",
			"delete_key(attributes, \"process.executable.path\")",
			"delete_key(attributes, \"process.pid\")",
			"delete_key(attributes, \"process.runtime.description\")",
			"delete_key(attributes, \"process.runtime.name\")",
			"delete_key(attributes, \"process.runtime.version\")",
		]
	}

	output {
		traces = [
			otelcol.connector.servicegraph.default.input,
			otelcol.connector.spanmetrics.default.input,
		]
	}
}

otelcol.connector.servicegraph "default" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.connector.servicegraph/
	dimensions = [
		"service.namespace",
		"service.version",
		"deployment.environment",
	]
	latency_histogram_buckets = ["0s", "0.005s", "0.01s", "0.025s", "0.05s", "0.075s", "0.1s", "0.25s", "0.5s", "0.75s", "1s", "2.5s", "5s", "7.5s", "10s"]

	store {
		ttl = "2s"
	}

	output {
		metrics = [otelcol.processor.batch.default.input]
	}
}

otelcol.processor.batch "default" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.processor.batch/
	output {
		metrics = [otelcol.exporter.prometheus.grafana_cloud_prometheus.input]
		traces  = [otelcol.exporter.otlp.grafana_cloud_tempo.input]
	}
}

otelcol.processor.tail_sampling "default" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.processor.tail_sampling/
	// Example: keep all traces that take more than 5000 ms
	policy {
		name    = "all_traces_above_5000ms"
		type    = "latency"
		latency = {
			threshold_ms = 5000,
		}
	}

	output {
		traces = [otelcol.processor.batch.default.input]
	}
}

otelcol.exporter.prometheus "grafana_cloud_prometheus" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.exporter.prometheus/
	add_metric_suffixes = false
	forward_to          = [prometheus.remote_write.grafana_cloud_prometheus.receiver]
}

prometheus.remote_write "grafana_cloud_prometheus" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/prometheus.remote_write/
	endpoint {
		url = env("GRAFANA_CLOUD_PROMETHEUS_URL")

		basic_auth {
			username = env("GRAFANA_CLOUD_PROMETHEUS_USERNAME")
			password = env("GRAFANA_CLOUD_API_KEY")
		}

		queue_config {
			retry_on_http_429 = false
		}
	}

	external_labels {
		// add collector_id label to differentiate series generated by different agents
		collector_id = env("POD_UID")
	}
}

otelcol.exporter.otlp "grafana_cloud_tempo" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.exporter.otlp/
	client {
		endpoint = env("GRAFANA_CLOUD_TEMPO_ENDPOINT")
		auth     = otelcol.auth.basic.grafana_cloud_tempo.handler
	}
}

otelcol.auth.basic "grafana_cloud_tempo" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.auth.basic/
	username = env("GRAFANA_CLOUD_TEMPO_USERNAME")
	password = env("GRAFANA_CLOUD_API_KEY")
}
yaml
# Tested with OpenTelemetry Collector Contrib v0.92.0
receivers:
  otlp:
    # https://github.com/open-telemetry/opentelemetry-collector/tree/main/receiver/otlpreceiver
    protocols:
      grpc:

processors:
  batch:
  # https://github.com/open-telemetry/opentelemetry-collector/tree/main/processor/batchprocessor
  filter/drop_unneeded_span_metrics:
    # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/filterprocessor
    error_mode: ignore
    metrics:
      datapoint:
        - 'IsMatch(metric.name, "traces.spanmetrics.calls|traces.spanmetrics.duration") and IsMatch(attributes["span.kind"], "SPAN_KIND_INTERNAL|SPAN_KIND_CLIENT|SPAN_KIND_PRODUCER")'
  transform/drop_unneeded_resource_attributes:
    # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/transformprocessor
    error_mode: ignore
    trace_statements:
      - context: resource
        statements:
          - delete_key(attributes, "k8s.pod.start_time")
          - delete_key(attributes, "os.description")
          - delete_key(attributes, "os.type")
          - delete_key(attributes, "process.command_args")
          - delete_key(attributes, "process.executable.path")
          - delete_key(attributes, "process.pid")
          - delete_key(attributes, "process.runtime.description")
          - delete_key(attributes, "process.runtime.name")
          - delete_key(attributes, "process.runtime.version")
  transform/use_grafana_metric_names:
    # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/transformprocessor
    error_mode: ignore
    metric_statements:
      - context: metric
        statements:
          - set(name, "traces.spanmetrics.latency") where name == "traces.spanmetrics.duration"
          - set(name, "traces.spanmetrics.calls.total") where name == "traces.spanmetrics.calls"
  tail_sampling:
    # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/tailsamplingprocessor
    policies:
      # Example: keep all traces that take more than 5000 ms
      [
        {
          name: all_traces_above_5000ms,
          type: latency,
          latency: { threshold_ms: 5000 },
        },
      ]

connectors:
  servicegraph:
    # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/connector/servicegraphconnector
    dimensions:
      - service.namespace
      - service.version
      - deployment.environment

  spanmetrics:
    # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/connector/spanmetricsconnector
    namespace: traces.spanmetrics
    histogram:
      unit: s
    dimensions:
      - name: service.namespace
      - name: service.version
      - name: deployment.environment

exporters:
  otlp/grafana_cloud_traces:
    # https://github.com/open-telemetry/opentelemetry-collector/tree/main/exporter/otlpexporter
    endpoint: "${env:GRAFANA_CLOUD_TEMPO_ENDPOINT}"
    auth:
      authenticator: basicauth/grafana_cloud_traces

  prometheusremotewrite/grafana_cloud_metrics:
    # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/exporter/prometheusremotewriteexporter
    endpoint: "${env:GRAFANA_CLOUD_PROMETHEUS_URL}"
    auth:
      authenticator: basicauth/grafana_cloud_metrics
    add_metric_suffixes: false
    external_labels:
      # add collector_id label to differentiate series generated by different collectors
      collector_id: "${env:POD_UID}"

extensions:
  basicauth/grafana_cloud_traces:
    # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/extension/basicauthextension
    client_auth:
      username: "${env:GRAFANA_CLOUD_TEMPO_USERNAME}"
      password: "${env:GRAFANA_CLOUD_API_KEY}"
  basicauth/grafana_cloud_metrics:
    client_auth:
      username: "${env:GRAFANA_CLOUD_PROMETHEUS_USERNAME}"
      password: "${env:GRAFANA_CLOUD_API_KEY}"

service:
  extensions: [basicauth/grafana_cloud_traces, basicauth/grafana_cloud_metrics]
  pipelines:
    traces:
      receivers: [otlp]
      processors: [transform/drop_unneeded_resource_attributes]
      exporters: [servicegraph, spanmetrics]
    traces/grafana_cloud_traces:
      receivers: [otlp]
      processors: [tail_sampling, batch]
      exporters: [otlp/grafana_cloud_traces]
    metrics/spanmetrics:
      receivers: [spanmetrics]
      processors:
        [
          filter/drop_unneeded_span_metrics,
          transform/use_grafana_metric_names,
          batch,
        ]
      exporters: [prometheusremotewrite/grafana_cloud_metrics]
    metrics/servicegraph:
      receivers: [servicegraph]
      processors: [batch]
      exporters: [prometheusremotewrite/grafana_cloud_metrics]