
Merge remote-tracking branch 'upstream/master' into crd

Signed-off-by: Goutham Veeramachaneni <cs14btech11014@iith.ac.in>
Goutham Veeramachaneni committed on 2017-07-25 14:27:28 +05:30
19 changed files with 381 additions and 55 deletions

View File

@@ -138,6 +138,7 @@ Specification of the desired behavior of the Prometheus cluster. More info: http
| imagePullSecrets | An optional list of references to secrets in the same namespace to use for pulling Prometheus and Alertmanager images from registries. See http://kubernetes.io/docs/user-guide/images#specifying-imagepullsecrets-on-a-pod | [][v1.LocalObjectReference](https://kubernetes.io/docs/api-reference/v1.6/#localobjectreference-v1-core) | false |
| replicas | Number of instances to deploy for a Prometheus deployment. | *int32 | false |
| retention | Time duration Prometheus shall retain data for. | string | false |
| evaluationInterval | Interval between consecutive evaluations. | string | false |
| externalLabels | The labels to add to any time series or alerts when communicating with external systems (federation, remote storage, Alertmanager). | map[string]string | false |
| externalUrl | The external URL under which the Prometheus instances will be available. It is needed to generate correct URLs and is required if Prometheus is not served from the root of a DNS name. | string | false |
| routePrefix | The route prefix Prometheus registers HTTP handlers for. This is useful if, when using ExternalURL, a proxy rewrites the HTTP routes of a request and the ExternalURL is still valid, but the server serves requests under a different route prefix, for example with `kubectl proxy`. | string | false |
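
For illustration, a minimal `Prometheus` manifest exercising some of these fields might look like the sketch below. The API group/version and all values are assumptions for the example, not part of this change; `evaluationInterval` is the field added here.

```yaml
apiVersion: monitoring.coreos.com/v1alpha1   # assumed group/version for this era of the operator
kind: Prometheus
metadata:
  name: example
  namespace: monitoring
spec:
  replicas: 2                       # *int32, optional
  retention: 24h                    # how long Prometheus retains data
  evaluationInterval: 30s           # new field: interval between consecutive rule evaluations
  externalLabels:
    cluster: example-cluster        # attached to series/alerts sent to external systems
  externalUrl: http://example.com/prometheus
  routePrefix: /prometheus
```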

View File

@@ -20,6 +20,7 @@ import (
"fmt"
"net"
"net/http"
"net/http/pprof"
"os"
"os/signal"
"syscall"
@@ -100,6 +101,11 @@ func Main() int {
po.RegisterMetrics(r)
ao.RegisterMetrics(r)
mux.Handle("/metrics", promhttp.HandlerFor(r, promhttp.HandlerOpts{}))
mux.Handle("/debug/pprof/", http.HandlerFunc(pprof.Index))
mux.Handle("/debug/pprof/cmdline", http.HandlerFunc(pprof.Cmdline))
mux.Handle("/debug/pprof/profile", http.HandlerFunc(pprof.Profile))
mux.Handle("/debug/pprof/symbol", http.HandlerFunc(pprof.Symbol))
mux.Handle("/debug/pprof/trace", http.HandlerFunc(pprof.Trace))
conf, err := k8sutil.NewClusterConfig(cfg.Host, cfg.TLSInsecure, &cfg.TLSConfig)
if err != nil {
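
With these handlers registered, Go's standard profiling endpoints are served from the operator's HTTP listener. A sketch of how one might pull profiles, assuming the operator is reachable on `localhost:8080` (the address is an assumption, e.g. via `kubectl port-forward`):

```sh
# 30s CPU profile, opened directly in the pprof tool
go tool pprof http://localhost:8080/debug/pprof/profile

# heap profile via the index handler
go tool pprof http://localhost:8080/debug/pprof/heap

# 5s execution trace
curl -o trace.out 'http://localhost:8080/debug/pprof/trace?seconds=5'
go tool trace trace.out
```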

View File

@@ -0,0 +1,3 @@
generate:
@echo ">> Compiling assets and generating Kubernetes manifests"
@hack/scripts/generate-manifests.sh

View File

@@ -0,0 +1,27 @@
# Developing Alerts and Dashboards
`kube-prometheus` ships with a set of default alerting rules and dashboards. At some point one might want to extend them. This document explains the workflow for adding alerting rules and dashboards.
For both the Prometheus alerting rules and the Grafana dashboards, Kubernetes `ConfigMap`s are generated from content in the `assets/` directory.
The source of truth for the alerts and dashboards is the set of files in the `assets/` directory. The respective files have to be changed there, and then the `make generate` target is executed to re-generate the Kubernetes manifests.
## Alerts
The `ConfigMap` that is generated and holds the alerting rule files can be found in `manifests/prometheus/prometheus-k8s-rules.yaml`.
It is generated from all the `*.rules` files in the `assets/prometheus/rules/` directory.
To extend the alerting rules, add a new `.rules` file to the `assets/prometheus/rules/` directory and re-generate the manifests; see the example rule file below. To modify existing rules, edit the respective `.rules` file and re-generate the manifests.
Then the generated manifest can be applied against a Kubernetes cluster.
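
As a sketch, a hypothetical `assets/prometheus/rules/example.rules` file could look like the following (Prometheus 1.x rule syntax, matching the format used elsewhere in this repository; the rule itself is only an example):

```
ALERT ExampleHighErrorRate
  IF rate(http_requests_total{code=~"5.."}[5m]) > 0.1
  FOR 10m
  LABELS { severity = "warning" }
  ANNOTATIONS {
    summary = "High 5xx rate",
    description = "More than 10% of requests are failing with 5xx responses.",
  }
```

After adding the file, run `make generate` and apply the re-generated `manifests/prometheus/prometheus-k8s-rules.yaml`.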
## Dashboards
The `ConfigMap` that is generated and holds the dashboard definitions can be found in `manifests/grafana/grafana-dashboards.yaml`.
As Grafana's support for applying dashboards from files is limited, a sidecar (called "grafana-watcher") was implemented. It watches the dashboard definitions provided through the `ConfigMap` and ensures that Grafana's SQLite database is in sync with them.
To edit or create a dashboard, log in to Grafana, then modify and save the dashboard. Download the dashboard definition through `Share` -> `Export` -> `Save to file`, move the file to `assets/grafana/`, and re-generate the manifests.
Then the generated manifest can be applied against a Kubernetes cluster.
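
A sketch of the full dashboard workflow (file name and namespace are examples):

```sh
mv ~/Downloads/my-dashboard.json assets/grafana/
make generate
kubectl --namespace monitoring apply -f manifests/grafana/grafana-dashboards.yaml
```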

View File

@@ -27,6 +27,8 @@ kctl apply -f manifests/node-exporter
kctl apply -f manifests/kube-state-metrics
kctl apply -f manifests/grafana/grafana-credentials.yaml
kctl apply -f manifests/grafana
kctl apply -f manifests/prometheus/
find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! -name prometheus-k8s-role-bindings.yaml -exec kubectl --namespace "$NAMESPACE" apply -f {} \;
kubectl apply -f manifests/prometheus/prometheus-k8s-roles.yaml
kubectl apply -f manifests/prometheus/prometheus-k8s-role-bindings.yaml
kctl apply -f manifests/alertmanager/

View File

@@ -15,7 +15,9 @@ kctl() {
kctl delete -f manifests/node-exporter
kctl delete -f manifests/kube-state-metrics
kctl delete -f manifests/grafana
kctl delete -f manifests/prometheus
find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! -name prometheus-k8s-role-bindings.yaml -exec kubectl --namespace "$NAMESPACE" delete -f {} \;
kubectl delete -f manifests/prometheus/prometheus-k8s-roles.yaml
kubectl delete -f manifests/prometheus/prometheus-k8s-role-bindings.yaml
kctl delete -f manifests/alertmanager
# Hack: wait a bit to let the controller delete the deployed Prometheus server.

View File

@@ -1,12 +0,0 @@
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
name: prometheus
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus
subjects:
- kind: ServiceAccount
name: prometheus-k8s
namespace: monitoring

View File

@@ -1,18 +0,0 @@
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
name: prometheus
rules:
- apiGroups: [""]
resources:
- nodes
- services
- endpoints
- pods
verbs: ["get", "list", "watch"]
- apiGroups: [""]
resources:
- configmaps
verbs: ["get"]
- nonResourceURLs: ["/metrics"]
verbs: ["get"]

View File

@@ -0,0 +1,54 @@
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: RoleBinding
metadata:
name: prometheus-k8s
namespace: monitoring
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: prometheus-k8s
subjects:
- kind: ServiceAccount
name: prometheus-k8s
namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: RoleBinding
metadata:
name: prometheus-k8s
namespace: kube-system
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: prometheus-k8s
subjects:
- kind: ServiceAccount
name: prometheus-k8s
namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: RoleBinding
metadata:
name: prometheus-k8s
namespace: default
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: prometheus-k8s
subjects:
- kind: ServiceAccount
name: prometheus-k8s
namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
name: prometheus-k8s
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus-k8s
subjects:
- kind: ServiceAccount
name: prometheus-k8s
namespace: monitoring

View File

@@ -0,0 +1,50 @@
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: Role
metadata:
name: prometheus-k8s
namespace: monitoring
rules:
- apiGroups: [""]
resources:
- nodes
- services
- endpoints
- pods
verbs: ["get", "list", "watch"]
- apiGroups: [""]
resources:
- configmaps
verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: Role
metadata:
name: prometheus-k8s
namespace: kube-system
rules:
- apiGroups: [""]
resources:
- services
- endpoints
- pods
verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: Role
metadata:
name: prometheus-k8s
namespace: default
rules:
- apiGroups: [""]
resources:
- services
- endpoints
verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
name: prometheus-k8s
rules:
- nonResourceURLs: ["/metrics"]
verbs: ["get"]

View File

@@ -0,0 +1,5 @@
FROM quay.io/prometheus/busybox:latest
ADD main /bin/main
ENTRYPOINT ["/bin/main"]

View File

@@ -0,0 +1,3 @@
all:
CGO_ENABLED=0 go build --installsuffix cgo main.go
docker build -t quay.io/coreos/prometheus-alertmanager-test-webhook .

BIN
example/alertmanger-webhook/main Executable file

Binary file not shown.

View File

@@ -0,0 +1,12 @@
package main
import (
"fmt"
"net/http"
)
func main() {
http.ListenAndServe(":5001", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
fmt.Println("Alertmanager Notification Payload Received")
}))
}
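
For a quick local check of this receiver, one might run it and send any HTTP request; the server logs a fixed line per request regardless of the payload (a sketch, not part of the commit):

```sh
go run main.go &
curl -X POST -d '{"alerts":[]}' http://localhost:5001/
# stdout: Alertmanager Notification Payload Received
```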

View File

@@ -66,6 +66,8 @@ type PrometheusSpec struct {
Replicas *int32 `json:"replicas,omitempty"`
// Time duration Prometheus shall retain data for.
Retention string `json:"retention,omitempty"`
// Interval between consecutive evaluations.
EvaluationInterval string `json:"evaluationInterval,omitempty"`
// The labels to add to any time series or alerts when communicating with
// external systems (federation, remote storage, Alertmanager).
ExternalLabels map[string]string `json:"externalLabels,omitempty"`

View File

@@ -63,10 +63,15 @@ func generateConfig(p *v1alpha1.Prometheus, mons map[string]*v1alpha1.ServiceMon
cfg := yaml.MapSlice{}
evaluationInterval := "30s"
if p.Spec.EvaluationInterval != "" {
evaluationInterval = p.Spec.EvaluationInterval
}
cfg = append(cfg, yaml.MapItem{
Key: "global",
Value: yaml.MapSlice{
{Key: "evaluation_interval", Value: "30s"},
{Key: "evaluation_interval", Value: evaluationInterval},
{Key: "scrape_interval", Value: "30s"},
{Key: "external_labels", Value: stringMapToMapSlice(p.Spec.ExternalLabels)},
},
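
With this change, the `global` section of the generated Prometheus configuration would look roughly like the following when `spec.evaluationInterval` is set to `1m` (a sketch of the rendered output; the external label value is an example):

```yaml
global:
  evaluation_interval: 1m    # from spec.evaluationInterval; defaults to 30s when unset
  scrape_interval: 30s
  external_labels:
    cluster: example
```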

View File

@@ -37,10 +37,12 @@ TF_VAR_tectonic_dns_name="${CLUSTER}"
TECTONIC_INSTALLER_DIR=/go/src/github.com/coreos/tectonic-installer
PO_DIR=/go/src/github.com/coreos/prometheus-operator
KUBECONFIG="${PO_DIR}/build/${CLUSTER}/generated/auth/kubeconfig"
TECTONIC_INSTALLER="quay.io/coreos/tectonic-installer:master"
mkdir -p build/${CLUSTER}
cp ${WORKSPACE}/scripts/jenkins/kubernetes-vanilla.tfvars build/${CLUSTER}/terraform.tfvars
docker pull $TECTONIC_INSTALLER
docker run \
--rm \
-v $PWD/build/:$TECTONIC_INSTALLER_DIR/build/ \
@@ -54,7 +56,7 @@ docker run \
-e TF_VAR_tectonic_cluster_name=${TF_VAR_tectonic_cluster_name} \
-e TF_VAR_tectonic_dns_name=${TF_VAR_tectonic_dns_name} \
-w $TECTONIC_INSTALLER_DIR \
quay.io/coreos/tectonic-installer:master \
$TECTONIC_INSTALLER \
/bin/bash -c "touch license secret && make plan && make apply"
docker build \

View File

@@ -17,12 +17,16 @@ package e2e
import (
"fmt"
"strconv"
"strings"
"testing"
"time"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/util/intstr"
"k8s.io/client-go/pkg/api/v1"
"k8s.io/client-go/pkg/apis/extensions/v1beta1"
"github.com/coreos/prometheus-operator/pkg/client/monitoring/v1alpha1"
testFramework "github.com/coreos/prometheus-operator/test/framework"
)
@@ -130,17 +134,8 @@ func TestMeshInitialization(t *testing.T) {
ns := ctx.CreateNamespace(t, framework.KubeClient)
ctx.SetupPrometheusRBAC(t, ns, framework.KubeClient)
var amountAlertmanagers int32 = 3
alertmanager := &v1alpha1.Alertmanager{
ObjectMeta: metav1.ObjectMeta{
Name: "test",
},
Spec: v1alpha1.AlertmanagerSpec{
Replicas: &amountAlertmanagers,
Version: "v0.7.1",
},
}
amClusterSize := 3
alertmanager := framework.MakeBasicAlertmanager("test", int32(amClusterSize))
alertmanagerService := framework.MakeAlertmanagerService(alertmanager.Name, "alertmanager-service", v1.ServiceTypeClusterIP)
if err := framework.CreateAlertmanagerAndWaitUntilReady(ns, alertmanager); err != nil {
@@ -151,9 +146,9 @@ func TestMeshInitialization(t *testing.T) {
t.Fatal(err)
}
for i := 0; i < int(amountAlertmanagers); i++ {
for i := 0; i < amClusterSize; i++ {
name := "alertmanager-" + alertmanager.Name + "-" + strconv.Itoa(i)
if err := framework.WaitForAlertmanagerInitializedMesh(ns, name, int(amountAlertmanagers)); err != nil {
if err := framework.WaitForAlertmanagerInitializedMesh(ns, name, amClusterSize); err != nil {
t.Fatal(err)
}
}
@@ -231,3 +226,188 @@ receivers:
t.Fatal(err)
}
}
func TestAlertmanagerZeroDowntimeRollingDeployment(t *testing.T) {
t.Parallel()
ctx := framework.NewTestCtx(t)
defer ctx.Cleanup(t)
ns := ctx.CreateNamespace(t, framework.KubeClient)
ctx.SetupPrometheusRBAC(t, ns, framework.KubeClient)
whReplicas := int32(1)
whdpl := &v1beta1.Deployment{
ObjectMeta: metav1.ObjectMeta{
Name: "alertmanager-webhook",
},
Spec: v1beta1.DeploymentSpec{
Replicas: &whReplicas,
Template: v1.PodTemplateSpec{
ObjectMeta: metav1.ObjectMeta{
Labels: map[string]string{
"app": "alertmanager-webhook",
},
},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Name: "webhook-server",
Image: "quay.io/coreos/prometheus-alertmanager-test-webhook",
Ports: []v1.ContainerPort{
{
Name: "web",
ContainerPort: 5001,
},
},
},
},
},
},
},
}
whsvc := &v1.Service{
ObjectMeta: metav1.ObjectMeta{
Name: "alertmanager-webhook",
},
Spec: v1.ServiceSpec{
Type: v1.ServiceTypeClusterIP,
Ports: []v1.ServicePort{
v1.ServicePort{
Name: "web",
Port: 5001,
TargetPort: intstr.FromString("web"),
},
},
Selector: map[string]string{
"app": "alertmanager-webhook",
},
},
}
if err := testFramework.CreateDeployment(framework.KubeClient, ns, whdpl); err != nil {
t.Fatal(err)
}
if _, err := testFramework.CreateServiceAndWaitUntilReady(framework.KubeClient, ns, whsvc); err != nil {
t.Fatal(err)
}
err := testFramework.WaitForPodsReady(framework.KubeClient, ns, time.Minute*5, 1,
metav1.ListOptions{
LabelSelector: fields.SelectorFromSet(fields.Set(map[string]string{
"app": "alertmanager-webhook",
})).String(),
},
)
if err != nil {
t.Fatal(err)
}
alertmanager := framework.MakeBasicAlertmanager("rolling-deploy", 2)
alertmanager.Spec.Version = "v0.7.0"
amsvc := framework.MakeAlertmanagerService(alertmanager.Name, "test", v1.ServiceTypeClusterIP)
amcfg := &v1.Secret{
ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("alertmanager-%s", alertmanager.Name),
},
Data: map[string][]byte{
"alertmanager.yaml": []byte(fmt.Sprintf(`
global:
resolve_timeout: 5m
route:
group_by: ['alertname']
group_wait: 10s
group_interval: 10s
repeat_interval: 1h
receiver: 'webhook'
receivers:
- name: 'webhook'
webhook_configs:
- url: 'http://%s.%s.svc:5001/'
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'dev', 'instance']
`, whsvc.Name, ns)),
},
}
if _, err := framework.KubeClient.CoreV1().Secrets(ns).Create(amcfg); err != nil {
t.Fatal(err)
}
if _, err := framework.MonClient.Alertmanagers(ns).Create(alertmanager); err != nil {
t.Fatal(err)
}
if _, err := testFramework.CreateServiceAndWaitUntilReady(framework.KubeClient, ns, amsvc); err != nil {
t.Fatal(err)
}
p := framework.MakeBasicPrometheus(ns, "test", "test", 3)
p.Spec.EvaluationInterval = "100ms"
framework.AddAlertingToPrometheus(p, ns, alertmanager.Name)
alertRule := &v1.ConfigMap{
ObjectMeta: metav1.ObjectMeta{
Name: fmt.Sprintf("prometheus-%s-rules", p.Name),
Labels: map[string]string{
"role": "rulefile",
},
},
Data: map[string]string{
"alerting.rules": `
ALERT Test
IF vector(1)
`,
},
}
if _, err := framework.KubeClient.CoreV1().ConfigMaps(ns).Create(alertRule); err != nil {
t.Fatal(err)
}
if err := framework.CreatePrometheusAndWaitUntilReady(ns, p); err != nil {
t.Fatal(err)
}
time.Sleep(1 * time.Minute)
opts := metav1.ListOptions{
LabelSelector: fields.SelectorFromSet(fields.Set(map[string]string{
"app": "alertmanager-webhook",
})).String(),
}
pl, err := framework.KubeClient.Core().Pods(ns).List(opts)
if err != nil {
t.Fatal(err)
}
if len(pl.Items) != 1 {
t.Fatalf("Expected one webhook pod, but got %d", len(pl.Items))
}
podName := pl.Items[0].Name
logs, err := testFramework.GetLogs(framework.KubeClient, ns, podName, "webhook-server")
if err != nil {
t.Fatal(err)
}
c := strings.Count(logs, "Alertmanager Notification Payload Received")
if c != 1 {
t.Fatalf("One notification expected, but %d received.\n\n%s", c, logs)
}
alertmanager.Spec.Version = "v0.7.1"
if _, err := framework.MonClient.Alertmanagers(ns).Update(alertmanager); err != nil {
t.Fatal(err)
}
time.Sleep(1 * time.Minute)
logs, err = testFramework.GetLogs(framework.KubeClient, ns, podName, "webhook-server")
if err != nil {
t.Fatal(err)
}
c = strings.Count(logs, "Alertmanager Notification Payload Received")
if c != 1 {
t.Fatalf("Only one notification expected, but %d received after rolling update of Alertmanager cluster.\n\n%s", c, logs)
}
}

View File

@@ -123,17 +123,19 @@ func (f *Framework) CreateAlertmanagerAndWaitUntilReady(ns string, a *v1alpha1.A
return errors.Wrap(err, fmt.Sprintf("creating alertmanager %v failed", a.Name))
}
err = WaitForPodsReady(
return f.WaitForAlertmanagerReady(ns, a.Name, int(*a.Spec.Replicas))
}
func (f *Framework) WaitForAlertmanagerReady(ns, name string, replicas int) error {
err := WaitForPodsReady(
f.KubeClient,
ns,
5*time.Minute,
int(*a.Spec.Replicas),
alertmanager.ListOptions(a.Name),
replicas,
alertmanager.ListOptions(name),
)
if err != nil {
return errors.Wrap(err, fmt.Sprintf("failed to create an Alertmanager cluster (%s) with %d instances", a.Name, a.Spec.Replicas))
}
return nil
return errors.Wrap(err, fmt.Sprintf("failed to create an Alertmanager cluster (%s) with %d instances", name, replicas))
}
func (f *Framework) UpdateAlertmanagerAndWaitUntilReady(ns string, a *v1alpha1.Alertmanager) error {