Menu
Grafana Cloud

Tail sampling

Overview

With a tail sampling strategy, the decision to sample the trace is made considering all or most of the spans. For example, tail sampling is a good option to sample only traces that have errors or traces with long request duration.

Tail sampling is more complex to configure, implement, and maintain but is the recommended sampling strategy for large systems with a high telemetry volume.

The key points to keep in mind when choosing tail sampling:

  • While the system grows and changes over time the tail sampling strategy should adapt to keep the volume of sampled data balanced
  • The components that implement the tail sampling strategy must be stateful, they should hold data in memory for some time and use it to make a sampling decision

For Application Observability, we recommend sampling at the data collector after metrics generation so that all traces are available to generate accurate metrics. If the metrics are generated from sampled traces, their values will be affected by the sampling.

Prerequisites

  1. Head sampling should not already be implemented at the application level.
  2. Use Grafana Agent or OpenTelemetry Collector to collect traces from the application, generate metrics from traces, and apply sampling.
  3. Send all traces to the data collector to generate accurate metrics.

In Application Observability on Grafana Cloud:

  1. Disable metrics generation in the configuration.
  2. Choose target_info (OTel Collector, Grafana Agent) metric name in the configuration.

Configuration

The collector receives all traces, generates metrics, and sends metrics to Grafana Cloud Prometheus. In parallel, the collector applies a tail sampling strategy to the traces and sends sampled data to Grafana Cloud Tempo.

To view the Grafana Agent configuration for tail sampling, select the river tab below. To view the OpenTelemetry Collector configuration for tail sampling, select the yaml tab below.

river
otelcol.receiver.otlp "default" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.receiver.otlp/

	// configures the default grpc endpoint "0.0.0.0:4317"
	grpc { }
	// configures the default http/protobuf endpoint "0.0.0.0:4318"
	http { }

	output {
		metrics = [otelcol.processor.transform.add_resource_attributes_as_metric_attributes.input]
		logs    = [otelcol.processor.batch.default.input]
		traces  = [
			otelcol.processor.tail_sampling.default.input,
			otelcol.processor.transform.drop_unneeded_resource_attributes.input,
			otelcol.connector.servicegraph.default.input,
		]
	}
}

otelcol.processor.transform "add_resource_attributes_as_metric_attributes" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.processor.transform/
	error_mode = "ignore"

	metric_statements {
		context    = "datapoint"
		statements = [
			"set(attributes[\"deployment.environment\"], resource.attributes[\"deployment.environment\"])",
			"set(attributes[\"service.version\"], resource.attributes[\"service.version\"])",
		]
	}

	output {
		metrics = [otelcol.processor.batch.default.input]
	}
}

otelcol.connector.spanmetrics "default" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.connector.spanmetrics/

	dimension {
		name = "service.namespace"
	}

	dimension {
		name = "service.version"
	}

	dimension {
		name = "deployment.environment"
	}

	histogram {
		explicit {
			buckets = ["0s", "0.005s", "0.01s", "0.025s", "0.05s", "0.075s", "0.1s", "0.25s", "0.5s", "0.75s", "1s", "2.5s", "5s", "7.5s", "10s"]
		}
		unit = "s"
	}

	namespace = "traces.spanmetrics"

	output {
		metrics = [otelcol.processor.transform.use_grafana_metric_names.input]
	}
}

otelcol.processor.transform "use_grafana_metric_names" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.processor.transform/
	error_mode = "ignore"

	metric_statements {
		context    = "metric"
		statements = [
			"set(name, \"traces.spanmetrics.latency\") where name == \"traces.spanmetrics.duration\"",
			"set(name, \"traces.spanmetrics.calls.total\") where name == \"traces.spanmetrics.calls\"",
		]
	}

	output {
		metrics = [otelcol.processor.filter.drop_unneeded_span_metrics.input]
	}
}

otelcol.processor.filter "drop_unneeded_span_metrics" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.processor.filter/
	error_mode = "ignore"

	metrics {
		datapoint = [
			"IsMatch(metric.name, \"traces.spanmetrics.calls|traces.spanmetrics.duration\") and IsMatch(attributes[\"span.kind\"], \"SPAN_KIND_INTERNAL|SPAN_KIND_CLIENT|SPAN_KIND_PRODUCER\")",
		]
	}

	output {
		metrics = [otelcol.processor.batch.default.input]
	}
}

otelcol.processor.transform "drop_unneeded_resource_attributes" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.processor.transform/
	error_mode = "ignore"

	trace_statements {
		context    = "resource"
		statements = [
			"delete_key(attributes, \"k8s.pod.start_time\")",
			"delete_key(attributes, \"os.description\")",
			"delete_key(attributes, \"os.type\")",
			"delete_key(attributes, \"process.command_args\")",
			"delete_key(attributes, \"process.executable.path\")",
			"delete_key(attributes, \"process.pid\")",
			"delete_key(attributes, \"process.runtime.description\")",
			"delete_key(attributes, \"process.runtime.name\")",
			"delete_key(attributes, \"process.runtime.version\")",
		]
	}

	output {
		traces = [otelcol.connector.spanmetrics.default.input]
	}
}

otelcol.connector.servicegraph "default" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.connector.servicegraph/
	dimensions = [
		"service.namespace",
		"service.version",
		"deployment.environment",
	]
	latency_histogram_buckets = ["0s", "0.005s", "0.01s", "0.025s", "0.05s", "0.075s", "0.1s", "0.25s", "0.5s", "0.75s", "1s", "2.5s", "5s", "7.5s", "10s"]

	store {
		ttl = "2s"
	}

	output {
		metrics = [otelcol.processor.batch.default.input]
	}
}

otelcol.processor.batch "default" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.processor.batch/
	output {
		metrics = [otelcol.exporter.prometheus.grafana_cloud_prometheus.input]
		logs    = [otelcol.exporter.loki.grafana_cloud_loki.input]
		traces  = [otelcol.exporter.otlp.grafana_cloud_tempo.input]
	}
}

otelcol.processor.tail_sampling "default" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.processor.tail_sampling/
	// Examples: keep all traces that take more than 5000 ms
	policy {
		name    = "all_traces_above_5000ms"
		type    = "latency"
		latency = {
			threshold_ms = 5000,
		}
	}

	output {
		traces = [otelcol.processor.batch.default.input]
	}
}

otelcol.exporter.loki "grafana_cloud_loki" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.exporter.loki/
	forward_to = [loki.write.grafana_cloud_loki.receiver]
}

otelcol.exporter.prometheus "grafana_cloud_prometheus" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.exporter.prometheus/
	add_metric_suffixes = false
	forward_to          = [prometheus.remote_write.grafana_cloud_prometheus.receiver]
}

prometheus.remote_write "grafana_cloud_prometheus" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/prometheus.remote_write/
	endpoint {
		url = env("GRAFANA_CLOUD_PROMETHEUS_URL")

		basic_auth {
			username = env("GRAFANA_CLOUD_PROMETHEUS_USERNAME")
			password = env("GRAFANA_CLOUD_API_KEY")
		}
	}
}

loki.write "grafana_cloud_loki" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/loki.write/
	endpoint {
		url = env("GRAFANA_CLOUD_LOKI_URL")

		basic_auth {
			username = env("GRAFANA_CLOUD_LOKI_USERNAME")
			password = env("GRAFANA_CLOUD_API_KEY")
		}
	}
}

otelcol.exporter.otlp "grafana_cloud_tempo" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.exporter.otlp/
	client {
		endpoint = env("GRAFANA_CLOUD_TEMPO_ENDPOINT")
		auth     = otelcol.auth.basic.grafana_cloud_tempo.handler
	}
}

otelcol.auth.basic "grafana_cloud_tempo" {
	// https://grafana.com/docs/agent/latest/flow/reference/components/otelcol.auth.basic/
	username = env("GRAFANA_CLOUD_TEMPO_USERNAME")
	password = env("GRAFANA_CLOUD_API_KEY")
}
yaml
# Tested with OpenTelemetry Collector Contrib v0.92.0
receivers:
  otlp:
    # https://github.com/open-telemetry/opentelemetry-collector/tree/main/receiver/otlpreceiver
    protocols:
      grpc:
      http:
  hostmetrics:
    # Optional. Host Metrics Receiver added as an example of Infra Monitoring capabilities of the OpenTelemetry Collector
    # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/hostmetricsreceiver
    scrapers:
      load:
      memory:

processors:
  batch:
    # https://github.com/open-telemetry/opentelemetry-collector/tree/main/processor/batchprocessor
  resourcedetection:
    # Enriches telemetry data with resource information from the host
    # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/resourcedetectionprocessor
    detectors: ["env", "system"]
    override: false
  transform/add_resource_attributes_as_metric_attributes:
    # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/transformprocessor
    error_mode: ignore
    metric_statements:
      - context: datapoint
        statements:
          - set(attributes["deployment.environment"], resource.attributes["deployment.environment"])
          - set(attributes["service.version"], resource.attributes["service.version"])
  filter/drop_unneeded_span_metrics:
    # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/filterprocessor
    error_mode: ignore
    metrics:
      datapoint:
        - 'IsMatch(metric.name, "traces.spanmetrics.calls|traces.spanmetrics.duration") and IsMatch(attributes["span.kind"], "SPAN_KIND_INTERNAL|SPAN_KIND_CLIENT|SPAN_KIND_PRODUCER")'
  transform/drop_unneeded_resource_attributes:
    # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/transformprocessor
    error_mode: ignore
    trace_statements:
      - context: resource
        statements:
          - delete_key(attributes, "k8s.pod.start_time")
          - delete_key(attributes, "os.description")
          - delete_key(attributes, "os.type")
          - delete_key(attributes, "process.command_args")
          - delete_key(attributes, "process.executable.path")
          - delete_key(attributes, "process.pid")
          - delete_key(attributes, "process.runtime.description")
          - delete_key(attributes, "process.runtime.name")
          - delete_key(attributes, "process.runtime.version")
  transform/use_grafana_metric_names:
    # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/transformprocessor
    error_mode: ignore
    metric_statements:
      - context: metric
        statements:
          - set(name, "traces.spanmetrics.latency") where name == "traces.spanmetrics.duration"
          - set(name, "traces.spanmetrics.calls.total") where name == "traces.spanmetrics.calls"
  tail_sampling:
    # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/tailsamplingprocessor
    policies:
      # Examples: keep all traces that take more than 5000 ms
      [
        {
          name: all_traces_above_5000ms,
          type: latency,
          latency: { threshold_ms: 5000 },
        },
      ]

connectors:
  servicegraph:
    # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/connector/servicegraphconnector
    dimensions:
      - service.namespace
      - service.version
      - deployment.environment

  spanmetrics:
    # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/connector/spanmetricsconnector
    namespace: traces.spanmetrics
    histogram:
      unit: s
    dimensions:
      - name: service.namespace
      - name: service.version
      - name: deployment.environment

exporters:
  otlp/grafana_cloud_traces:
    # https://github.com/open-telemetry/opentelemetry-collector/tree/main/exporter/otlpexporter
    endpoint: "${env:GRAFANA_CLOUD_TEMPO_ENDPOINT}"
    auth:
      authenticator: basicauth/grafana_cloud_traces

  loki/grafana_cloud_logs:
    # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/exporter/lokiexporter
    endpoint: "${env:GRAFANA_CLOUD_LOKI_URL}"
    auth:
      authenticator: basicauth/grafana_cloud_logs

  prometheusremotewrite/grafana_cloud_metrics:
    # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/exporter/prometheusremotewriteexporter
    endpoint: "${env:GRAFANA_CLOUD_PROMETHEUS_URL}"
    auth:
      authenticator: basicauth/grafana_cloud_metrics
    add_metric_suffixes: false

extensions:
  basicauth/grafana_cloud_traces:
    # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/extension/basicauthextension
    client_auth:
      username: "${env:GRAFANA_CLOUD_TEMPO_USERNAME}"
      password: "${env:GRAFANA_CLOUD_API_KEY}"
  basicauth/grafana_cloud_metrics:
    client_auth:
      username: "${env:GRAFANA_CLOUD_PROMETHEUS_USERNAME}"
      password: "${env:GRAFANA_CLOUD_API_KEY}"
  basicauth/grafana_cloud_logs:
    client_auth:
      username: "${env:GRAFANA_CLOUD_LOKI_USERNAME}"
      password: "${env:GRAFANA_CLOUD_API_KEY}"

service:
  extensions:
    [
      basicauth/grafana_cloud_traces,
      basicauth/grafana_cloud_metrics,
      basicauth/grafana_cloud_logs,
    ]
  pipelines:
    traces:
      receivers: [otlp]
      processors:
        [resourcedetection, transform/drop_unneeded_resource_attributes]
      exporters: [servicegraph, spanmetrics]
    traces/grafana_cloud_traces:
      receivers: [otlp]
      processors: [resourcedetection, tail_sampling, batch]
      exporters: [otlp/grafana_cloud_traces]
    metrics:
      receivers: [otlp, hostmetrics]
      processors:
        [
          resourcedetection,
          transform/add_resource_attributes_as_metric_attributes,
          batch,
        ]
      exporters: [prometheusremotewrite/grafana_cloud_metrics]
    metrics/spanmetrics:
      receivers: [spanmetrics]
      processors:
        [
          filter/drop_unneeded_span_metrics,
          transform/use_grafana_metric_names,
          batch,
        ]
      exporters: [prometheusremotewrite/grafana_cloud_metrics]
    metrics/servicegraph:
      receivers: [servicegraph]
      processors: [batch]
      exporters: [prometheusremotewrite/grafana_cloud_metrics]
    logs:
      receivers: [otlp]
      processors: [resourcedetection, batch]
      exporters: [loki/grafana_cloud_logs]