diff --git a/.editorconfig b/.editorconfig deleted file mode 100644 index 179650b2..00000000 --- a/.editorconfig +++ /dev/null @@ -1,14 +0,0 @@ -root = true - -[*] -end_of_line = lf -insert_final_newline = true -charset = utf-8 - -[*.{css,html,js,md,rb,sh,yaml,yml}] -indent_style = space -indent_size = 2 - -[Makefile] -indent_style = tab - diff --git a/.gitattributes b/.gitattributes deleted file mode 100644 index b26127f8..00000000 --- a/.gitattributes +++ /dev/null @@ -1,2 +0,0 @@ -static/bootstrap-3.4.1/* linguist-vendored -static/font-awesome-4.7.0/* linguist-vendored diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml deleted file mode 100644 index d5738395..00000000 --- a/.github/ISSUE_TEMPLATE/config.yml +++ /dev/null @@ -1,9 +0,0 @@ -blank_issues_enabled: true -contact_links: - - name: Prometheus Community Support - url: https://prometheus.io/community/ - about: If you need help or support, please request help here. - - name: Commercial Support & Training - url: https://prometheus.io/support-training/ - about: If you want commercial support or training, vendors are listed here. - - name: Blank issue template diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md deleted file mode 100644 index 644450b4..00000000 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ /dev/null @@ -1,7 +0,0 @@ - diff --git a/.github/dependabot.yml b/.github/dependabot.yml deleted file mode 100644 index fcf4bc0a..00000000 --- a/.github/dependabot.yml +++ /dev/null @@ -1,8 +0,0 @@ ---- -version: 2 -updates: - - package-ecosystem: 'bundler' - directory: '/' - open-pull-requests-limit: 20 - schedule: - interval: 'monthly' diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 632a81e3..00000000 --- a/.gitignore +++ /dev/null @@ -1,18 +0,0 @@ -# For projects using nanoc (http://nanoc.ws/) - -# Default location for output, needs to match output_dir's value found in config.yaml -output/ - -# Temporary file directory -tmp/ -downloads/ - -# Crash Log -crash.log - -# macOS artifacts -.DS_Store - -# Ruby artifacts -.bundle/ -vendor/ diff --git a/.gitpod.yml b/.gitpod.yml deleted file mode 100644 index cc3765da..00000000 --- a/.gitpod.yml +++ /dev/null @@ -1,8 +0,0 @@ -# Gitpod config, https://www.gitpod.io/docs/config-gitpod-file. - -tasks: - - init: >- - rvm install `cat .ruby-version` && - make bundle && - make build - command: echo "Site files are under ./output" diff --git a/.ruby-version b/.ruby-version deleted file mode 100644 index a423d421..00000000 --- a/.ruby-version +++ /dev/null @@ -1 +0,0 @@ -3.4.2 \ No newline at end of file diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md deleted file mode 100644 index d325872b..00000000 --- a/CODE_OF_CONDUCT.md +++ /dev/null @@ -1,3 +0,0 @@ -# Prometheus Community Code of Conduct - -Prometheus follows the [CNCF Code of Conduct](https://github.com/cncf/foundation/blob/main/code-of-conduct.md). diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md deleted file mode 100644 index f440aadd..00000000 --- a/CONTRIBUTING.md +++ /dev/null @@ -1,19 +0,0 @@ -# Contributing - -Prometheus uses GitHub to manage reviews of pull requests. - -* Every commit *MUST* be signed off to agree to the [Developer Certificate of Origin (DCO)](https://developercertificate.org/). [How does it work?](https://github.com/dcoapp/app#how-it-works) -** There is a [browser extension](https://github.com/scottrigby/dco-gh-ui) if you're making edits - from the GitHub UI. 
- -* If you have a trivial fix or improvement, go ahead and create a pull request, - addressing (with `@...`) the maintainer of this repository (see - [MAINTAINERS.md](MAINTAINERS.md)) in the description of the pull request. - -* If you plan to do something more involved, first discuss your ideas - on our [mailing list](https://groups.google.com/forum/?fromgroups#!forum/prometheus-developers). - This will avoid unnecessary work and surely give you and us a good deal - of inspiration. - -* Generally, the documents you see on are sourced from this repository. The only exception are the documentation pages that are versioned per Prometheus release. Those are sourced in [the docs directory of the prometheus/prometheus repository](https://github.com/prometheus/prometheus/tree/main/docs) for the related version. - diff --git a/Gemfile b/Gemfile deleted file mode 100644 index 1ae0db35..00000000 --- a/Gemfile +++ /dev/null @@ -1,19 +0,0 @@ -source 'https://rubygems.org' - -gem 'nanoc', '~> 4.13' -gem 'adsf' -gem 'kramdown' -gem 'guard-nanoc' -gem 'guard-livereload' -gem 'nokogiri' -gem 'redcarpet' -gem 'pygments.rb' -gem 'builder' -gem 'semverse' -gem 'rb-inotify', :require => false -gem 'rb-fsevent', :require => false -gem 'rb-fchange', :require => false - -group :test do - gem 'rspec' -end diff --git a/Gemfile.lock b/Gemfile.lock deleted file mode 100644 index 77cc1642..00000000 --- a/Gemfile.lock +++ /dev/null @@ -1,168 +0,0 @@ -GEM - remote: https://rubygems.org/ - specs: - addressable (2.8.7) - public_suffix (>= 2.0.2, < 7.0) - adsf (1.5.2) - rack (>= 1.0.0, < 4.0.0) - rackup (~> 2.1) - base64 (0.2.0) - builder (3.3.0) - coderay (1.1.3) - colored (1.2) - concurrent-ruby (1.3.5) - cri (2.15.12) - ddmetrics (1.1.0) - ddplugin (1.0.3) - diff-lcs (1.6.0) - em-websocket (0.5.2) - eventmachine (>= 0.12.9) - http_parser.rb (~> 0.6.0) - eventmachine (1.2.7) - ffi (1.17.1) - formatador (0.2.5) - guard (2.16.2) - formatador (>= 0.2.4) - listen (>= 2.7, < 4.0) - lumberjack (>= 1.0.12, < 2.0) - nenv (~> 0.1) - notiffany (~> 0.0) - pry (>= 0.9.12) - shellany (~> 0.0) - thor (>= 0.18.1) - guard-compat (1.2.1) - guard-livereload (2.5.2) - em-websocket (~> 0.5) - guard (~> 2.8) - guard-compat (~> 1.0) - multi_json (~> 1.8) - guard-nanoc (2.1.9) - guard (~> 2.8) - guard-compat (~> 1.0) - nanoc-cli (~> 4.11, >= 4.11.14) - nanoc-core (~> 4.11, >= 4.11.14) - http_parser.rb (0.6.0) - immutable-ruby (0.2.0) - concurrent-ruby (~> 1.1) - sorted_set (~> 1.0) - json_schema (0.21.0) - kramdown (2.5.1) - rexml (>= 3.3.9) - listen (3.4.1) - rb-fsevent (~> 0.10, >= 0.10.3) - rb-inotify (~> 0.9, >= 0.9.10) - lumberjack (1.2.8) - memo_wise (1.11.0) - method_source (1.0.0) - mini_portile2 (2.8.8) - multi_json (1.15.0) - nanoc (4.13.3) - addressable (~> 2.5) - colored (~> 1.2) - nanoc-checking (~> 1.0, >= 1.0.2) - nanoc-cli (= 4.13.3) - nanoc-core (= 4.13.3) - nanoc-deploying (~> 1.0) - parallel (~> 1.12) - tty-command (~> 0.8) - tty-which (~> 0.4) - nanoc-checking (1.0.5) - nanoc-cli (~> 4.12, >= 4.12.5) - nanoc-core (~> 4.12, >= 4.12.5) - nanoc-cli (4.13.3) - cri (~> 2.15) - diff-lcs (~> 1.3) - nanoc-core (= 4.13.3) - pry - zeitwerk (~> 2.1) - nanoc-core (4.13.3) - base64 (~> 0.2) - concurrent-ruby (~> 1.1) - ddmetrics (~> 1.0) - ddplugin (~> 1.0) - immutable-ruby (~> 0.1) - json_schema (~> 0.19) - memo_wise (~> 1.5) - slow_enumerator_tools (~> 1.0) - tty-platform (~> 0.2) - zeitwerk (~> 2.1) - nanoc-deploying (1.0.2) - nanoc-checking (~> 1.0) - nanoc-cli (~> 4.11, >= 4.11.15) - nanoc-core (~> 4.11, >= 4.11.15) - nenv 
(0.3.0) - nokogiri (1.18.8) - mini_portile2 (~> 2.8.2) - racc (~> 1.4) - notiffany (0.1.3) - nenv (~> 0.1) - shellany (~> 0.0) - parallel (1.26.3) - pastel (0.8.0) - tty-color (~> 0.5) - pry (0.14.0) - coderay (~> 1.1) - method_source (~> 1.0) - public_suffix (6.0.1) - pygments.rb (3.0.0) - racc (1.8.1) - rack (3.1.12) - rackup (2.2.1) - rack (>= 3) - rb-fchange (0.0.6) - ffi - rb-fsevent (0.11.2) - rb-inotify (0.11.1) - ffi (~> 1.0) - rbtree (0.4.6) - redcarpet (3.6.1) - rexml (3.4.1) - rspec (3.13.0) - rspec-core (~> 3.13.0) - rspec-expectations (~> 3.13.0) - rspec-mocks (~> 3.13.0) - rspec-core (3.13.3) - rspec-support (~> 3.13.0) - rspec-expectations (3.13.3) - diff-lcs (>= 1.2.0, < 2.0) - rspec-support (~> 3.13.0) - rspec-mocks (3.13.2) - diff-lcs (>= 1.2.0, < 2.0) - rspec-support (~> 3.13.0) - rspec-support (3.13.2) - semverse (3.0.2) - set (1.1.1) - shellany (0.0.1) - slow_enumerator_tools (1.1.0) - sorted_set (1.0.3) - rbtree - set (~> 1.0) - thor (1.1.0) - tty-color (0.6.0) - tty-command (0.10.1) - pastel (~> 0.8) - tty-platform (0.3.0) - tty-which (0.5.0) - zeitwerk (2.7.2) - -PLATFORMS - ruby - -DEPENDENCIES - adsf - builder - guard-livereload - guard-nanoc - kramdown - nanoc (~> 4.13) - nokogiri - pygments.rb - rb-fchange - rb-fsevent - rb-inotify - redcarpet - rspec - semverse - -BUNDLED WITH - 2.6.5 diff --git a/Guardfile b/Guardfile deleted file mode 100644 index f67a4916..00000000 --- a/Guardfile +++ /dev/null @@ -1,12 +0,0 @@ -# A sample Guardfile -# More info at https://github.com/guard/guard#readme - -guard 'nanoc' do - watch('nanoc.yaml') # Change this to config.yaml if you use the old config file name - watch('Rules') - watch(%r{^(content|layouts|lib|static)/.*$}) -end - -guard 'livereload' do - watch(%r{output/.+}) -end diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 261eeb9e..00000000 --- a/LICENSE +++ /dev/null @@ -1,201 +0,0 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. 
- - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. 
You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. 
In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. diff --git a/MAINTAINERS.md b/MAINTAINERS.md deleted file mode 100644 index cb4c39d0..00000000 --- a/MAINTAINERS.md +++ /dev/null @@ -1,17 +0,0 @@ -Documentation that refers to a specific project within the Prometheus ecosystem -is maintained by the maintainers of the respective project. For example, refer -to the maintainers specified in Alertmanager's -[MAINTAINERS.md](https://github.com/prometheus/alertmanager/blob/main/MAINTAINERS.md) -file for documentation about the Alertmanager. - -Note that the documentation for the Prometheus server is located in the -[prometheus/prometheus -repository](https://github.com/prometheus/prometheus/tree/main/docs) itself. - -For anything that is not documentation for a specific project, refer to the -following maintainers with their focus areas: - -* Julius Volz @juliusv: Web design, static site - generator. -* Richard Hartmann @RichiH: Everything - else. 
diff --git a/Makefile b/Makefile deleted file mode 100644 index f8cb59e4..00000000 --- a/Makefile +++ /dev/null @@ -1,36 +0,0 @@ -NANOC = bundle exec nanoc -GUARD = bundle exec guard -DOWNLOADS := prometheus alertmanager blackbox_exporter consul_exporter graphite_exporter memcached_exporter mysqld_exporter node_exporter promlens pushgateway statsd_exporter - -build: clean downloads compile - -bundle: - bundle config build.nokogiri --use-system-libraries - bundle config set path vendor - bundle install - -clean: - rm -rf output downloads - -compile: - $(NANOC) - -downloads: $(DOWNLOADS:%=downloads/%/repo.json) $(DOWNLOADS:%=downloads/%/releases.json) - -downloads/%/repo.json: - @mkdir -p $(dir $@) - @echo "curl -sf -H 'Accept: application/vnd.github.v3+json' https://api.github.com/repos/prometheus/$* > $@" - @curl -sf -H 'Accept: application/vnd.github.v3+json' $(GITHUB_AUTHENTICATION) https://api.github.com/repos/prometheus/$* > $@ - -downloads/%/releases.json: - @mkdir -p $(dir $@) - @echo "curl -sf -H 'Accept: application/vnd.github.v3+json' https://api.github.com/repos/prometheus/$*/releases > $@" - @curl -sf -H 'Accept: application/vnd.github.v3+json' $(GITHUB_AUTHENTICATION) https://api.github.com/repos/prometheus/$*/releases > $@ - -guard: - $(GUARD) - -serve: - $(NANOC) view - -.PHONY: build bundle clean compile downloads serve diff --git a/NOTICE b/NOTICE deleted file mode 100644 index ddf1a4d8..00000000 --- a/NOTICE +++ /dev/null @@ -1,18 +0,0 @@ -Prometheus documentation: content and static site generator -Copyright 2014-2018 The Prometheus Authors - -This product includes software developed at -SoundCloud Ltd. (http://soundcloud.com/). - - -The following components are included in this product: - -Bootstrap -http://getbootstrap.com -Copyright 2011-2014 Twitter, Inc. -Licensed under the MIT License - -Font Awesome -http://fontawesome.io -Copyright by @davegandy -License - http://fontawesome.io/license (Font: SIL OFL 1.1, CSS: MIT License) diff --git a/README.md b/README.md deleted file mode 100644 index b4a17c25..00000000 --- a/README.md +++ /dev/null @@ -1,74 +0,0 @@ -# Prometheus Documentation - -This repository contains both the content and the static-site generator code for the -Prometheus documentation site. - -## Contributing Changes - -See [`CONTRIBUTING.md`](CONTRIBUTING.md) for general instructions for new Prometheus contributors. - -The main documentation contents of this website are located in the [`content/docs`](content/docs) directory. - -Documentation concerning the Prometheus server is [maintained in the Prometheus server repository](https://github.com/prometheus/prometheus/tree/main/docs) and cloned into the website at build time. - -As a guideline, please keep the documentation generally applicable and avoid use-case-specific changes. - -## Prerequisites - -You need to have a working Ruby environment set up (including [bundler](https://bundler.io/)) -and then install the necessary gems: - -```bash -make bundle -``` - -## Building - -To generate the static site, run: - -```bash -make build -``` - -The resulting static site will be stored in the `output` directory. - -Optionally, you can use an API token to avoid rate limits on the API. You can get an API token from https://github.com/settings/tokens/new. 
-```bash -export GITHUB_AUTHENTICATION='-u user:token' -``` - -## Development Server - -To run a local server that displays the generated site, run: - -```bash -# Rebuild the site whenever relevant files change: -make guard -# Start the local development server in a separate shell: -make serve -``` - -You should now be able to view the generated site at -[http://localhost:3000/](http://localhost:3000). - -## Automatic Deployment - -This site is automatically deployed using [Netlify](https://www.netlify.com/). - -If you have the prerequisite access rights, you can view the Netlify settings here: - -* GitHub webhook notifying Netlify of branch changes: https://github.com/prometheus/docs/settings/hooks -* Netlify project: https://app.netlify.com/sites/prometheus-docs - -Changes to the `main` branch are deployed to the main site at https://prometheus.io. - -Netlify also creates preview deploys for every pull request. To view these for a PR where all checks have passed: - -1. In the CI section of the PR, click on "Show all checks". -2. On the "deploy/netlify" entry, click on "Details" to view the preview site for the PR. - -You may have to wait a while for the "deploy/netlify" check to appear after creating or updating the PR, even if the other checks have already passed. - -## License - -Apache License 2.0, see [LICENSE](LICENSE). diff --git a/Rules b/Rules deleted file mode 100644 index c67cff2e..00000000 --- a/Rules +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env ruby - -# A few helpful tips about the Rules file: -# -# * The string given to #compile and #route are matching patterns for -# identifiers--not for paths. Therefore, you can’t match on extension. -# -# * The order of rules is important: for each item, only the first matching -# rule is applied. -# -# * Item identifiers start and end with a slash (e.g. “/about/” for the file -# “content/about.html”). To select all children, grandchildren, … of an -# item, use the pattern “/about/*/”; “/about/*” will also select the parent, -# because “*” matches zero or more characters. - -passthrough '/assets/**' -passthrough '/_redirects' -passthrough '*/images/*' - -# RSS Feed -compile '/blog/feed/' do - filter :erb -end - -route '/blog/feed/' do - '/blog/feed.xml' -end - -compile '*' do - filter :erb if item[:extension] == 'html' - - if item[:extension] == 'md' - filter :redcarpet, options: {filter_html: true, autolink: true, no_intraemphasis: true, fenced_code_blocks: true, gh_blockcode: true, tables: true}, renderer_options: {with_toc_data: true} - filter :normalize_links, item[:repo_docs] if item[:repo_docs] - filter :version_warning, item[:repo_docs] if item[:repo_docs] - filter :add_anchors - filter :bootstrappify - filter :admonition - filter :colorize_syntax, :default_colorizer => :pygmentsrb - filter :config_linker if item[:title] == 'Configuration' - filter :toc, style: item[:toc] - if item[:kind] == 'article' - layout 'blog' - else - # TODO(mr): separate layout selection from Markdown handling - layout item[:layout] || 'docs' - end - elsif item[:extension] == 'css' - # don’t filter stylesheets - elsif item.binary? - # don’t filter binary items - elsif item[:layout] - layout item[:layout] - else - layout 'default' - end -end - -route '/blog/' do - '/blog/index.html' -end - -# Transform /blog/--
- to -# /blog///
/. -route '/blog/*' do - y, m, d, slug = /([0-9]+)\-([0-9]+)\-([0-9]+)\-([^\/]+)/.match(item.identifier).captures - - "/blog/#{y}/#{m}/#{d}/#{slug}/index.html" -end - -route '*' do - if item[:extension] == 'css' - # Write item with identifier /foo/ to /foo.css - item.identifier.chop + '.css' - elsif item.binary? - # Write item with identifier /foo.dat to /foo.dat - item.identifier.to_s - else - # Write item with identifier /foo/ to /foo/index.html - item.identifier + 'index.html' - end -end - -layout '*', :erb diff --git a/SECURITY.md b/SECURITY.md deleted file mode 100644 index fed02d85..00000000 --- a/SECURITY.md +++ /dev/null @@ -1,6 +0,0 @@ -# Reporting a security issue - -The Prometheus security policy, including how to report vulnerabilities, can be -found here: - - diff --git a/content/_redirects b/content/_redirects deleted file mode 100644 index e6ada913..00000000 --- a/content/_redirects +++ /dev/null @@ -1,61 +0,0 @@ -# Redirects for old site structure. -/docs/introduction/getting_started/ /docs/prometheus/latest/getting_started/ -/docs/introduction/install/ /docs/prometheus/latest/installation/ -/docs/operating/configuration/ /docs/prometheus/latest/configuration/configuration/ -/docs/operating/federation/ /docs/prometheus/latest/federation/ -/docs/operating/storage/ /docs/prometheus/latest/storage/ -/docs/querying/api/ /docs/prometheus/latest/querying/api/ -/docs/querying/basics/ /docs/prometheus/latest/querying/basics/ -/docs/querying/examples/ /docs/prometheus/latest/querying/examples/ -/docs/querying/functions/ /docs/prometheus/latest/querying/functions/ -/docs/querying/operators/ /docs/prometheus/latest/querying/operators/ -/docs/querying/rules/ /docs/prometheus/latest/configuration/recording_rules/ -/docs/visualization/template_examples/ /docs/prometheus/latest/configuration/template_examples/ -/docs/visualization/template_reference/ /docs/prometheus/latest/configuration/template_reference/ -/docs/alerting/overview/ /docs/alerting/latest/overview/ -/docs/alerting/alertmanager/ /docs/alerting/latest/alertmanager/ -/docs/alerting/configuration/ /docs/alerting/latest/configuration/ -/docs/alerting/clients/ /docs/alerting/latest/clients/ -/docs/alerting/notifications/ /docs/alerting/latest/notifications/ -/docs/alerting/notification_examples/ /docs/alerting/latest/notification_examples/ -/docs/alerting/management_api/ /docs/alerting/latest/management_api/ - -# Moved PRW spec from Concepts to a new section. -/docs/concepts/remote_write_spec /docs/specs/prw/remote_write_spec -# Moved PRW into separate dir. -/docs/specs/remote_write_spec /docs/specs/prw/remote_write_spec -/docs/specs/remote_write_spec_2_0 /docs/specs/prw/remote_write_spec_2_0 - -# Redirect for HTTP SD docs, which briefly lived in the wrong category / repo. -/docs/instrumenting/http_sd/ /docs/prometheus/latest/http_sd/ - -# Redirect for "disabled_features", which is now called "feature_flags". -/docs/prometheus/latest/disabled_features/ /docs/prometheus/latest/feature_flags/ - -# Redirects for sections. -# TODO(ts): Auto-generate from menu. -/docs/ /docs/introduction/overview/ 302! -/docs/introduction/ /docs/introduction/overview/ 302! -/docs/concepts/ /docs/concepts/data_model/ 302! -/docs/specs/ /docs/specs/remote_write_spec/ 302! -/docs/prometheus/ /docs/prometheus/latest/getting_started/ 302! -/docs/alerting/ /docs/alerting/latest/overview/ 302! -/docs/visualization/ /docs/visualization/browser/ 302! -/docs/instrumenting/ /docs/instrumenting/clientlibs/ 302! -/docs/operating/ /docs/operating/security/ 302! 
-/docs/alerting/ /docs/alerting/latest/overview/ 302! -/docs/practices/ /docs/practices/naming/ 302! -/docs/guides/ /docs/guides/basic-auth/ 302! - -# Redirects for index.hml pages. -/:foo/index.html /:foo/ 302! -/:foo/:bar/index.html /:foo/:bar/ 302! -/:foo/:bar/:baz/index.html /:foo/:bar/:baz/ 302! -/:foo/:bar/:baz/:qux/index.html /:foo/:bar/:baz/:qux/ 302! -/:foo/:bar/:baz/:qux/:quux/index.html /:foo/:bar/:baz/:qux/:quux/ 302! - -# Blog post duplicated -/blog/2021/05/14/prometheus-conformance-results/ /blog/2021/10/14/prometheus-conformance-results/ 302! - -# Custom 404 page for all nonexistent assets -/* /docs/404/ 404 diff --git a/content/blog/2015-04-24-prometheus-monitring-spreads-through-the-internet.md b/content/blog/2015-04-24-prometheus-monitring-spreads-through-the-internet.md deleted file mode 100644 index da87125e..00000000 --- a/content/blog/2015-04-24-prometheus-monitring-spreads-through-the-internet.md +++ /dev/null @@ -1,66 +0,0 @@ ---- -title: Prometheus Monitoring Spreads through the Internet -created_at: 2015-04-24 -kind: article -author_name: Brian Brazil ---- - -It has been almost three months since we publicly announced Prometheus version -0.10.0, and we're now at version 0.13.1. - -[SoundCloud's announcement blog post](https://developers.soundcloud.com/blog/prometheus-monitoring-at-soundcloud) -remains the best overview of the key components of Prometheus, but there has -been a lot of other online activity around Prometheus. This post will let you -catch up on anything you missed. - -In the future, we will use this blog to publish more articles and announcements -to help you get the most out of Prometheus. - - - -## Using Prometheus - -Posts on how to use Prometheus comprise the majority of online content. Here -are the ones we're aware of with the part of the ecosystem they cover: - -* Container Exporter: [Monitor Docker Containers with Prometheus](https://5pi.de/2015/01/26/monitor-docker-containers-with-prometheus/) -* HAProxy: [HAProxy Monitoring with Prometheus](http://www.boxever.com/haproxy-monitoring-with-prometheus) -* Java Client: [Easy Java Instrumentation with Prometheus](http://www.boxever.com/easy-java-instrumentation-with-prometheus) -* Java Client and Labels: [The Power of Multi-Dimensional Labels in Prometheus](http://www.boxever.com/the-power-of-multi-dimensional-labels-in-prometheus) -* Node Exporter: [Monitoring your Machines with Prometheus](http://www.boxever.com/monitoring-your-machines-with-prometheus) -* JMX Exporter: [Cassandra Consoles with JMX and Prometheus](http://www.boxever.com/cassandra-consoles-with-jmx-and-prometheus) -* Python Client and Node Exporter Textfile Collector: [Monitoring Python Batch Jobs](http://www.boxever.com/monitoring-python-batch-jobs) -* Mesos Exporter: [Monitoring Mesos tasks with Prometheus](http://www.antonlindstrom.com/2015/02/24/monitoring-mesos-tasks-with-prometheus.html) -* Synapse: [Monitoring Synapse Metrics with Prometheus](http://matrix.org/blog/2015/04/23/monitoring-synapse-metrics-with-prometheus/) - -## Articles - -These articles look at how Prometheus fits into the broader picture of keeping services up and running: - -* [Prometheus: A Next-Generation Monitoring System](http://www.boxever.com/prometheus-a-next-generation-monitoring-system) -* [SoundCloud’s Prometheus: A Monitoring System and Time Series Database Suited for Containers](http://thenewstack.io/soundclouds-prometheus-monitoring-system-time-series-database-suited-containers/) -* [Docker Monitoring Continued: Prometheus and 
Sysdig](http://rancher.com/docker-monitoring-continued-prometheus-and-sysdig/) - -## Philosophy - -Monitoring isn't just about the technical details. How it affects the design of -your systems, operations, and human factors are important too: - -* [Push vs Pull for Monitoring](http://www.boxever.com/push-vs-pull-for-monitoring) -* [Systems Monitoring with Prometheus](http://www.slideshare.net/brianbrazil/devops-ireland-systems-monitoring-with-prometheus) -* [Monitoring your Python with Prometheus](http://www.slideshare.net/brianbrazil/python-ireland-monitoring-your-python-with-prometheus) - -The comments on the [Hacker News post](https://news.ycombinator.com/item?id=8995696) about Prometheus are also insightful. - -## Non-English - -Several posts have appeared in languages beyond English: - -* Japanese how-to about installing Prometheus on CentOS: [データ可視化アプリの新星、PrometheusをCentOSにインストールする方法](http://y-ken.hatenablog.com/entry/how-to-install-prometheus) -* Japanese in-depth tutorial: [【入門】PrometheusでサーバやDockerコンテナのリソース監視](http://pocketstudio.jp/log3/2015/02/11/what_is_prometheus_monitoring/) -* Japanese overview: [Prometheus: Go言語で書かれたモニタリングシステム](http://wazanova.jp/items/1672) -* Russian podcast that mentions Prometheus: [RWPOD 04 выпуск 03 сезона](http://www.rwpod.com/posts/2015/02/02/podcast-03-04.html) - -## Closing - -Finally, I'd like to share how to run [Prometheus on a Raspberry Pi](https://5pi.de/2015/02/10/prometheus-on-raspberry-pi/). diff --git a/content/blog/2015-06-01-advanced-service-discovery.md b/content/blog/2015-06-01-advanced-service-discovery.md deleted file mode 100644 index cb36cc93..00000000 --- a/content/blog/2015-06-01-advanced-service-discovery.md +++ /dev/null @@ -1,344 +0,0 @@ ---- -title: Advanced Service Discovery in Prometheus 0.14.0 -created_at: 2015-06-01 -kind: article -author_name: Fabian Reinartz, Julius Volz ---- - -This week we released Prometheus v0.14.0 — a version with many long-awaited additions -and improvements. - -On the user side, Prometheus now supports new service discovery mechanisms. In -addition to DNS-SRV records, it now supports [Consul](https://www.consul.io) -out of the box, and a file-based interface allows you to connect your own -discovery mechanisms. Over time, we plan to add other common service discovery -mechanisms to Prometheus. - -Aside from many smaller fixes and improvements, you can now also reload your configuration during -runtime by sending a `SIGHUP` to the Prometheus process. For a full list of changes, check the -[changelog for this release](https://github.com/prometheus/prometheus/blob/main/CHANGELOG.md#0140--2015-06-01). - -In this blog post, we will take a closer look at the built-in service discovery mechanisms and provide -some practical examples. As an additional resource, see -[Prometheus's configuration documentation](/docs/operating/configuration). - - - -## Prometheus and targets - -For a proper understanding of this blog post, we first need to take a look at how -Prometheus labels targets. - -There are various places in the configuration file where target labels may be -set. They are applied in the following order, with later stages overwriting any -labels set by an earlier stage: - -1. Global labels, which are assigned to every target scraped by the Prometheus instance. -2. The `job` label, which is configured as a default value for each scrape configuration. -3. Labels that are set per target group within a scrape configuration. -4. 
Advanced label manipulation via [_relabeling_](/docs/operating/configuration/#relabel_config). - -Each stage overwrites any colliding labels from the earlier stages. Eventually, we have a flat -set of labels that describe a single target. Those labels are then attached to every time series that -is scraped from this target. - -Note: Internally, even the address of a target is stored in a special -`__address__` label. This can be useful during advanced label manipulation -(relabeling), as we will see later. Labels starting with `__` do not appear in -the final time series. - - -## Scrape configurations and relabeling - -Aside from moving from an ASCII protocol buffer format to YAML, a fundamental change to -Prometheus's configuration is the change from per-job configurations to more generalized scrape -configurations. While the two are almost equivalent for simple setups, scrape configurations -allow for greater flexibility in more advanced use cases. - -Each scrape configuration defines a job name which serves as a default value for the -`job` label. The `job` label can then be redefined for entire target groups or individual targets. -For example, we can define two target groups, each of which defines targets for one job. -To scrape them with the same parameters, we can configure them as follows: - -``` -scrape_configs: -- job_name: 'overwritten-default' - - scrape_interval: 10s - scrape_timeout: 5s - - target_groups: - - targets: ['10.1.200.130:5051', '10.1.200.134:5051'] - labels: - job: 'job1' - - - targets: ['10.1.200.130:6220', '10.1.200.134:6221'] - labels: - job: 'job2' -``` - -Through a mechanism named [_relabeling_](http://prometheus.io/docs/operating/configuration/#relabel_config), -any label can be removed, created, or modified on a per-target level. This -enables fine-grained labeling that can also take into account metadata coming -from the service discovery. Relabeling is the last stage of label assignment -and overwrites any labels previously set. - -Relabeling works as follows: - -- A list of source labels is defined. -- For each target, the values of those labels are concatenated with a separator. -- A regular expression is matched against the resulting string. -- A new value based on those matches is assigned to another label. - -Multiple relabeling rules can be defined for each scrape configuration. A simple one -that squashes two labels into one, looks as follows: - -``` -relabel_configs: -- source_labels: ['label_a', 'label_b'] - separator: ';' - regex: '(.*);(.*)' - replacement: '${1}-${2}' - target_label: 'label_c' -``` - -This rule transforms a target with the label set: - -``` -{ - "job": "job1", - "label_a": "foo", - "label_b": "bar" -} -``` -...into a target with the label set: - -``` -{ - "job": "job1", - "label_a": "foo", - "label_b": "bar", - "label_c": "foo-bar" -} -``` - -You could then also remove the source labels in an additional relabeling step. - -You can read more about relabeling and how you can use it to filter targets in the -[configuration documentation](/docs/operating/configuration#relabel_config). - -Over the next sections, we will see how you can leverage relabeling when using service discovery. - - -## Discovery with DNS-SRV records - -Since the beginning, Prometheus has supported target discovery via DNS-SRV records. 
-The respective configuration looked like this: - -``` -job { - name: "api-server" - sd_name: "telemetry.eu-west.api.srv.example.org" - metrics_path: "/metrics" -} -``` - -Prometheus 0.14.0 allows you to specify multiple SRV records to be queried in a -single scrape configuration, and also provides service-discovery-specific meta -information that is helpful during the relabeling phase. - -When querying the DNS-SRV records, a label named `__meta_dns_name` is -attached to each target. Its value is set to the SRV record name for which it was -returned. If we have structured SRV record names like `telemetry...srv.example.org`, -we can extract relevant labels from it those names: - -``` -scrape_configs: -- job_name: 'myjob' - - dns_sd_configs: - - names: - - 'telemetry.eu-west.api.srv.example.org' - - 'telemetry.us-west.api.srv.example.org' - - 'telemetry.eu-west.auth.srv.example.org' - - 'telemetry.us-east.auth.srv.example.org' - - relabel_configs: - - source_labels: ['__meta_dns_name'] - regex: 'telemetry\.(.+?)\..+?\.srv\.example\.org' - target_label: 'zone' - replacement: '$1' - - source_labels: ['__meta_dns_name'] - regex: 'telemetry\..+?\.(.+?)\.srv\.example\.org' - target_label: 'job' - replacement: '$1' -``` - -This will attach the `zone` and `job` label to each target based on the SRV record -it came from. - - -## Discovery with Consul - -Service discovery via Consul is now supported natively. It can be configured by defining -access parameters for our Consul agent and a list of Consul services for which we want -to query targets. - -The tags of each Consul node are concatenated by a configurable separator and exposed -through the `__meta_consul_tags` label. Various other Consul-specific meta -labels are also provided. - -Scraping all instances for a list of given services can be achieved with a simple -`consul_sd_config` and relabeling rules: - -``` -scrape_configs: -- job_name: 'overwritten-default' - - consul_sd_configs: - - server: '127.0.0.1:5361' - services: ['auth', 'api', 'load-balancer', 'postgres'] - - relabel_configs: - - source_labels: ['__meta_consul_service'] - regex: '(.*)' - target_label: 'job' - replacement: '$1' - - source_labels: ['__meta_consul_node'] - regex: '(.*)' - target_label: 'instance' - replacement: '$1' - - source_labels: ['__meta_consul_tags'] - regex: ',(production|canary),' - target_label: 'group' - replacement: '$1' -``` - -This discovers the given services from the local Consul agent. -As a result, we get metrics for four jobs (`auth`, `api`, `load-balancer`, and `postgres`). If a node -has the `production` or `canary` Consul tag, a respective `group` label is assigned to the target. -Each target's `instance` label is set to the node name provided by Consul. - -A full documentation of all configuration parameters for service discovery via Consul -can be found on the [Prometheus website](/docs/operating/configuration#relabel_config). - - -## Custom service discovery - -Finally, we added a file-based interface to integrate your custom service discovery or other common mechanisms -that are not yet supported out of the box. - -With this mechanism, Prometheus watches a set of directories or files which define target groups. -Whenever any of those files changes, a list of target groups is read from the files and scrape targets -are extracted. -It's now our job to write a small bridge program that runs as Prometheus's side-kick. 
-It retrieves changes from an arbitrary service discovery mechanism and writes the target information -to the watched files as lists of target groups. - -These files can either be in YAML: - -``` -- targets: ['10.11.150.1:7870', '10.11.150.4:7870'] - labels: - job: 'mysql' - -- targets: ['10.11.122.11:6001', '10.11.122.15:6002'] - labels: - job: 'postgres' -``` - -...or in JSON format: - -``` -[ - { - "targets": ["10.11.150.1:7870", "10.11.150.4:7870"], - "labels": { - "job": "mysql" - } - }, - { - "targets": ["10.11.122.11:6001", "10.11.122.15:6002"], - "labels": { - "job": "postgres" - } - } -] -``` - -We now configure Prometheus to watch the `tgroups/` directory in its working directory -for all `.json` files: - -``` -scrape_configs: -- job_name: 'overwritten-default' - - file_sd_configs: - - names: ['tgroups/*.json'] -``` - -What's missing now is a program that writes files to this directory. For the sake of this example, -let's assume we have all our instances for different jobs in a single denormalized MySQL table. -(Hint: you probably don't want to do service discovery this way.) - -Every 30 seconds, we read all instances from the MySQL table and write the -resulting target groups into a JSON file. Note that we do not have to keep -state whether or not any targets or their labels have changed. Prometheus will -automatically detect changes and applies them to targets without interrupting -their scrape cycles. - -``` -import os, time, json - -from itertools import groupby -from MySQLdb import connect - - -def refresh(cur): - # Fetch all rows. - cur.execute("SELECT address, job, zone FROM instances") - - tgs = [] - # Group all instances by their job and zone values. - for key, vals in groupby(cur.fetchall(), key=lambda r: (r[1], r[2])): - tgs.append({ - 'labels': dict(zip(['job', 'zone'], key)), - 'targets': [t[0] for t in vals], - }) - - # Persist the target groups to disk as JSON file. - with open('tgroups/target_groups.json.new', 'w') as f: - json.dump(tgs, f) - f.flush() - os.fsync(f.fileno()) - - os.rename('tgroups/target_groups.json.new', 'tgroups/target_groups.json') - - -if __name__ == '__main__': - while True: - with connect('localhost', 'root', '', 'test') as cur: - refresh(cur) - time.sleep(30) -``` - -While Prometheus will not apply any malformed changes to files, it is considered best practice to -update your files atomically via renaming, as we do in our example. -It is also recommended to split larger amounts of target groups into several files based on -logical grouping. - - -## Conclusion - -With DNS-SRV records and Consul, two major service discovery methods are now -natively supported by Prometheus. We've seen that relabeling is a powerful -approach to make use of metadata provided by service discovery mechanisms. - -Make sure to take a look at the new [configuration documentation](/docs/operating/configuration/) -to upgrade your Prometheus setup to the new release and find out about other configuration options, -such as basic HTTP authentication and target filtering via relabeling. - -We provide a [migration tool](https://github.com/prometheus/migrate/releases) that upgrades -your existing configuration files to the new YAML format. -For smaller configurations we recommend a manual upgrade to get familiar with the new format and -to preserve comments. 
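
One of those options, target filtering via relabeling, is worth a quick sketch since it is mentioned but not shown above. The rule below is illustrative only and uses present-day configuration syntax (where relabel regexes are fully anchored); the `staging` Consul tag and the job name are made up for the example. A `drop` action removes any discovered target whose concatenated tag list matches the pattern before it is ever scraped:

```
scrape_configs:
- job_name: 'consul-filtered'

  consul_sd_configs:
  - server: '127.0.0.1:5361'
    services: ['auth', 'api', 'load-balancer', 'postgres']

  relabel_configs:
  # Drop targets carrying the (hypothetical) "staging" Consul tag.
  # Tags arrive as one comma-delimited string in __meta_consul_tags, and the
  # regex must match the whole string, hence the surrounding .* parts.
  - source_labels: ['__meta_consul_tags']
    regex: '.*,staging,.*'
    action: 'drop'
```

Targets that survive this step then continue through the usual label assignment stages described earlier.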
diff --git a/content/blog/2015-06-18-practical-anomaly-detection.md b/content/blog/2015-06-18-practical-anomaly-detection.md deleted file mode 100644 index 790b44ba..00000000 --- a/content/blog/2015-06-18-practical-anomaly-detection.md +++ /dev/null @@ -1,132 +0,0 @@ ---- -title: Practical Anomaly Detection -created_at: 2015-06-18 -kind: article -author_name: Brian Brazil ---- - -In his *[Open Letter To Monitoring/Metrics/Alerting Companies](http://www.kitchensoap.com/2015/05/01/openlettertomonitoringproducts/)*, -John Allspaw asserts that attempting "to detect anomalies perfectly, at the right time, is not possible". - -I have seen several attempts by talented engineers to build systems to -automatically detect and diagnose problems based on time series data. While it -is certainly possible to get a demonstration working, the data always turned -out to be too noisy to make this approach work for anything but the simplest of -real-world systems. - -All hope is not lost though. There are many common anomalies which you can -detect and handle with custom-built rules. The Prometheus [query -language](/docs/prometheus/latest/querying/basics/) gives you the tools to discover -these anomalies while avoiding false positives. - - - -## Building a query - -A common problem within a service is when a small number of servers are not -performing as well as the rest, such as responding with increased latency. - -Let us say that we have a metric `instance:latency_seconds:mean5m` representing the -average query latency for each instance of a service, calculated via a -[recording rule](/docs/prometheus/latest/configuration/recording_rules/) from a -[Summary](/docs/concepts/metric_types/#summary) metric. - -A simple way to start would be to look for instances with a latency -more than two standard deviations above the mean: - -``` - instance:latency_seconds:mean5m -> on (job) group_left() - ( - avg by (job)(instance:latency_seconds:mean5m) - + on (job) - 2 * stddev by (job)(instance:latency_seconds:mean5m) - ) -``` - -You try this out and discover that there are false positives when -the latencies are very tightly clustered. So you add a requirement -that the instance latency also has to be 20% above the average: - -``` - ( - instance:latency_seconds:mean5m - > on (job) group_left() - ( - avg by (job)(instance:latency_seconds:mean5m) - + on (job) - 2 * stddev by (job)(instance:latency_seconds:mean5m) - ) - ) -> on (job) group_left() - 1.2 * avg by (job)(instance:latency_seconds:mean5m) -``` - -Finally, you find that false positives tend to happen at low traffic levels. -You add a requirement for there to be enough traffic for 1 query per second to -be going to each instance. You create an alert definition for all of this: - -```yaml -groups: -- name: Practical Anomaly Detection - rules: - - alert: InstanceLatencyOutlier - expr: > - ( - ( - instance:latency_seconds:mean5m - > on (job) group_left() - ( - avg by (job)(instance:latency_seconds:mean5m) - + on (job) - 2 * stddev by (job)(instance:latency_seconds:mean5m) - ) - ) - > on (job) group_left() - 1.2 * avg by (job)(instance:latency_seconds:mean5m) - and on (job) - avg by (job)(instance:latency_seconds_count:rate5m) - > - 1 - ) - for: 30m -``` - -## Automatic actions - -The above alert can feed into the -[Alertmanager](/docs/alerting/alertmanager/), and from there to -your chat, ticketing, or paging systems. 
After a while you might discover that the -usual cause of the alert is something that there is not a proper fix for, but there is an -automated action such as a restart, reboot, or machine replacement that resolves -the issue. - -Rather than having humans handle this repetitive task, one option is to -get the Alertmanager to send the alert to a web service that will perform -the action with appropriate throttling and safety features. - -The [generic webhook](/docs/alerting/alertmanager/#generic-webhook) -sends alert notifications to an HTTP endpoint of your choice. A simple Alertmanager -configuration that uses it could look like this: - -``` -# A simple notification configuration which only sends alert notifications to -# an external webhook. -receivers: -- name: restart_webhook - webhook_configs: - url: "http://example.org/my/hook" - -route: - receiver: restart_webhook -``` - -## Summary - -The Prometheus query language allows for rich processing of your monitoring -data. This lets you to create alerts with good signal-to-noise ratios, and the -Alertmanager's generic webhook support can trigger automatic remediations. -This all combines to enable oncall engineers to focus on problems where they can -have the most impact. - -When defining alerts for your services, see also our [alerting best practices](/docs/practices/alerting/). diff --git a/content/blog/2015-06-24-monitoring-dreamhack.md b/content/blog/2015-06-24-monitoring-dreamhack.md deleted file mode 100644 index e8c9e449..00000000 --- a/content/blog/2015-06-24-monitoring-dreamhack.md +++ /dev/null @@ -1,249 +0,0 @@ ---- -title: Monitoring DreamHack - the World's Largest Digital Festival -created_at: 2015-06-24 -kind: article -author_name: Christian Svensson (DreamHack Network Team) ---- - -*Editor's note: This article is a guest post written by a Prometheus user.* - -**If you are operating the network for 10,000's of demanding gamers, you need to -really know what is going on inside your network. Oh, and everything needs to be -built from scratch in just five days.** - -If you have never heard about [DreamHack](http://www.dreamhack.se/) before, here -is the pitch: Bring 20,000 people together and have the majority of them bring -their own computer. Mix in professional gaming (eSports), programming contests, -and live music concerts. The result is the world's largest festival dedicated -solely to everything digital. - -To make such an event possible, there needs to be a lot of infrastructure in -place. Ordinary infrastructures of this size take months to build, but the crew -at DreamHack builds everything from scratch in just five days. This of course -includes stuff like configuring network switches, but also building the -electricity distribution, setting up stores for food and drinks, and even -building the actual tables. - -The team that builds and operates everything related to the network is -officially called the Network team, but we usually refer to ourselves as *tech* -or *dhtech*. This post is going to focus on the work of dhtech and how we used -Prometheus during DreamHack Summer 2015 to try to kick our monitoring up another -notch. - - - -## The equipment -Turns out that to build a highly performant network for 10,000+ -computers, you need at least the same number of network ports. In our case these -come in the form of ~400 Cisco 2950 switches. We call these the access switches. -These are everywhere in the venue where participants will be seated with their -computers. 
- -[![Access switches](https://c1.staticflickr.com/9/8487/8206439882_4739d39a9c_c.jpg)](https://www.flickr.com/photos/dreamhack/8206439882) -
*Dutifully standing in line, the access switches are ready to greet the -DreamHackers with high-speed connectivity.*
- -Obviously just connecting all these computers to a switch is not enough. That -switch needs to be connected to the other switches as well. This is where the -distribution switches (or dist switches) come into play. These are switches that -take the hundreds of links from all access switches and aggregate them into -more manageable 10-Gbit/s high-capacity fibre. The dist switches are then -further aggregated into our core, where the traffic is routed to its -destination. - -On top of all of this, we operate our own WiFi networks, DNS/DHCP servers, and -other infrastructure. When completed, our core looks something like the image -below. - -[![Network planning map](/assets/dh_network_planning_map.png)](/assets/dh_network_planning_map.png) -
*The planning map for the distribution and core layers. The core is -clearly visible in "Hall D"*
- -All in all this is becoming a lengthy list of stuff to monitor, so let's get to -the reason you're here: How do we make sure we know what's going on? - -## Introducing: dhmon -dhmon is the collective name of the systems that not only -monitor the network, but also allow other teams to collect metrics on whatever -they want. - -Since the network needs to be built in five days, it's essential that the -monitoring systems are easy to set up and keep in sync if we need to do last -minute infrastructural changes (like adding or removing devices). When we start -to build the network, we need monitoring as soon as possible to be able to -discover any problems with the equipment or other issues we hadn't foreseen. - -In the past we have tried to use a mix of commonly available software such as -Cacti, SNMPc, and Opsview among others. While these have worked they have focused on -being closed systems and only provided the bare minimum. A few years back a few -people from the team said "Enough, we can do better ourselves!" and started -writing a custom monitoring solution. - -At the time the options were limited. Over the years the system went from using -Graphite (scalability issues), a custom Cassandra store (high complexity), and -InfluxDB (immature software) to finally land on using Prometheus. I first -learned about Prometheus back in 2014 when I met Julius Volz and I had been -eager to try it ever since. This summer we finally replaced the custom -InfluxDB-based metrics store that we had written with Prometheus. Spoiler: We're -not going back. - -## The architecture -The monitoring solution consists of three layers: -collection, storage, presentation. Our most critical collectors are -snmpcollector (SNMP) and ipplan-pinger (ICMP), closely followed by dhcpinfo -(DHCP lease stats). We also have some scripts that dump stats about other -systems into [node_exporter](https://github.com/prometheus/node_exporter)'s -textfile collector. - -[![dhmon Architecture](/assets/dh_dhmon_architecture.png)](/assets/dh_dhmon_architecture.png) -
*The current architecture plan of dhmon as of Summer 2015*
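
As an aside on the textfile collector mentioned above: those scripts simply write small files in the Prometheus text exposition format into a directory that node_exporter reads on every scrape. A hypothetical example (the metric name and file name are invented for illustration) could be a file `dhtech.prom` containing:

    # HELP dh_generator_fuel_percent Remaining fuel in the backup generator.
    # TYPE dh_generator_fuel_percent gauge
    dh_generator_fuel_percent 87
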
- -We use Prometheus as a central timeseries storage and querying engine, but we -also use Redis and memcached to export snapshot views of binary information that -we collect but cannot store in Prometheus in any sensible way, or when we need -to access very fresh data. - -One such case is in our presentation layer. We use our dhmap web application to -get an overview of the overall health of the access switches. In order to be -effective at resolving errors, we need a latency of ~10 seconds from data -collection to presentation. Our goal is to have fixed the problem before the -customer notices, or at least before they have walked over to the support people -to report an issue. For this reason, we have been using memcached since the -beginning to access the latest snapshot of the network. - -We continued to use memcached this year for our low-latency data, while using -Prometheus for everything that's historical or not as latency-sensitive. This -decision was made simply because we were unsure how Prometheus would perform at -very short sampling intervals. In the end, we found no reason for why we can't -use Prometheus for this data as well - we will definitely try to replace our -memcached with Prometheus at the next DreamHack. - -[![dhmon Visualization](/assets/dh_dhmon_visualization.png)](/assets/dh_dhmon_visualization.png) -
*The overview of our access layer visualized by dhmon*
- -## Prometheus setup -The block that so far has been referred to as *Prometheus* -really consists of three products: -[Prometheus](https://github.com/prometheus/prometheus), -[PromDash](https://github.com/prometheus/promdash), and -[Alertmanager](https://github.com/prometheus/alertmanager). The setup is fairly -basic and all three components are running on the same host. Everything is -served by an Apache web server that just acts as a reverse proxy. - - ProxyPass /prometheus http://localhost:9090/prometheus - ProxyPass /alertmanager http://localhost:9093/alertmanager - ProxyPass /dash http://localhost:3000/dash - -## Exploring the network -Prometheus has a powerful querying engine that allows -you to do pretty cool things with the streaming information collected from all -over your network. However, sometimes the queries need to process too much data -to finish within a reasonable amount of time. This happened to us when we wanted -to graph the top 5 utilized links out of ~18,000 in total. While the query -worked, it would take roughly the amount of time we set our timeout limit to, -meaning it was both slow and flaky. We decided to use Prometheus' [recording -rules](/docs/prometheus/latest/configuration/recording_rules/) for precomputing heavy queries. - - precomputed_link_utilization_percent = rate(ifHCOutOctets{layer!='access'}[10m])*8/1000/1000 - / on (device,interface,alias) - ifHighSpeed{layer!='access'} - -After this, running `topk(5, precomputed_link_utilization_percent)` was -blazingly fast. - -## Being reactive: alerting -So at this stage we had something we could query for -the state of the network. Since we are humans, we don't want to spend our time -running queries all the time to see if things are still running as they should, -so obviously we need alerting. - -For example: we know that all our access switches use GigabitEthernet0/2 as an -uplink. Sometimes when the network cables have been in storage for too long they -oxidize and are not able to negotiate the full 1000 Mbps that we want. - -The negotiated speed of a network port can be found in the SNMP OID -`IF-MIB::ifHighSpeed`. People familiar with SNMP will however recognize that -this OID is indexed by an arbitrary interface index. To make any sense of this -index, we need to cross-reference it with data from SNMP OID `IF-MIB::ifDescr` -to retrieve the actual interface name. - -Fortunately, our snmpcollector supports this kind of cross-referencing while -generating Prometheus metrics. This allows us in a simple way to not only query -data, but also define useful alerts. In our setup we configured the SNMP -collection to annotate any metric under the `IF-MIB::ifTable` and -`IF-MIB::ifXTable` OIDs with `ifDescr`. This will come in handy now when we need -to specify that we are only interested in the `GigabitEthernet0/2` port and no -other interface. - -Let's have a look at what such an alert definition looks like. - - ALERT BadUplinkOnAccessSwitch - IF ifHighSpeed{layer='access', interface='GigabitEthernet0/2'} < 1000 FOR 2m - SUMMARY "Interface linking at {{$value}} Mbps" - DESCRIPTION "Interface {{$labels.interface}} on {{$labels.device}} linking at {{$value}} Mbps" - -Done! Now we will get an alert if a switch's uplink suddenly links at a -non-optimal speed. 
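
As a quick sanity check, the alert expression can also be run ad hoc in the expression browser, for example to count how many access switches are currently linking below 1000 Mbps (a sketch using the same metrics as above):

    count(ifHighSpeed{layer='access', interface='GigabitEthernet0/2'} < 1000)
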

Let's also look at what an alert for an almost full DHCP scope looks like:

    ALERT DhcpScopeAlmostFull
      IF ceil((dhcp_leases_current_count / dhcp_leases_max_count)*100) > 90 FOR 2m
      SUMMARY "DHCP scope {{$labels.network}} is almost full"
      DESCRIPTION "DHCP scope {{$labels.network}} is {{$value}}% full"

We found the syntax for defining alerts easy to read and understand, even with no previous experience with Prometheus or time series databases.

[![Prometheus alerts for DreamHack](/assets/dh_prometheus_alerts.png)](/assets/dh_prometheus_alerts.png)
*Oops! Turns out we have some bad uplinks, better run out and fix it!*

## Being proactive: dashboards

While alerting is an essential part of monitoring, sometimes you just want to have a good overview of the health of your network. To achieve this we used [PromDash](/docs/introduction/glossary/#promdash). Every time someone asked us something about the network, we crafted a query to get the answer and saved it as a dashboard widget. The most interesting ones were then added to an overview dashboard that we proudly displayed.

[![dhmon Dashboard](/assets/dh_dhmon_dashboard.png)](/assets/dh_dhmon_dashboard.png)
*The DreamHack Overview dashboard powered by PromDash*
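
To give a flavour of the widget queries behind it, a query in the same vein as the ones above (a sketch, not one of the actual dashboard definitions) could plot the total traffic leaving all non-access ports in bits per second:

    sum(rate(ifHCOutOctets{layer!='access'}[5m]) * 8)
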
- -## The future -While changing an integral part of any system is a complex job and -we're happy that we managed to integrate Prometheus in just one event, there are -without a doubt a lot of areas to improve. Some areas are pretty basic: using -more precomputed metrics to improve performance, adding more alerts, and tuning -the ones we have. Another area is to make it easier for operators: creating an -alert dashboard suitable for our network operations center (NOC), figuring out -if we want to page the people on-call, or just let the NOC escalate alerts. - -Some bigger features we're planning on adding: syslog analysis (we have a lot of -syslog!), alerts from our intrusion detection systems, integrating with our -Puppet setup, and also integrating more across the different teams at DreamHack. -We managed to create a proof-of-concept where we got data from one of the -electrical current sensors into our monitoring, making it easy to see if a -device is faulty or if it simply doesn't have any electricity anymore. We're -also working on integrating with the point-of-sale systems that are used in the -stores at the event. Who doesn't want to graph the sales of ice cream? - -Finally, not all services that the team operates are on-site, and some even run -24/7 after the event. We want to monitor these services with Prometheus as well, -and in the long run when Prometheus gets support for federation, utilize the -off-site Prometheus to replicate the metrics from the event Prometheus. - -## Closing words -We're really excited about Prometheus and how easy it makes -setting up scalable monitoring and alerting from scratch. - -A huge shout-out to everyone that helped us in `#prometheus` on -FreeNode during the event. Special thanks to Brian -Brazil, Fabian Reinartz and Julius Volz. Thanks for helping us even in the cases -where it was obvious that we hadn't read the documentation thoroughly enough. - -Finally, dhmon is all open-source, so head over to https://github.com/dhtech/ -and have a look if you're interested. If you feel like you would like to be a -part of this, just head over to `#dreamhack` on -[QuakeNet](https://www.quakenet.org/) and have a chat with us. Who knows, maybe -you will help us build the next DreamHack? diff --git a/content/blog/2015-08-17-service-discovery-with-etcd.md b/content/blog/2015-08-17-service-discovery-with-etcd.md deleted file mode 100644 index b963d5aa..00000000 --- a/content/blog/2015-08-17-service-discovery-with-etcd.md +++ /dev/null @@ -1,321 +0,0 @@ ---- -title: Custom service discovery with etcd -created_at: 2015-08-17 -kind: article -author_name: Fabian Reinartz ---- - -In a [previous post](/blog/2015/06/01/advanced-service-discovery/) we -introduced numerous new ways of doing service discovery in Prometheus. -Since then a lot has happened. We improved the internal implementation and -received fantastic contributions from our community, adding support for -service discovery with Kubernetes and Marathon. They will become available -with the release of version 0.16. - -We also touched on the topic of [custom service discovery](/blog/2015/06/01/advanced-service-discovery/#custom-service-discovery). - -Not every type of service discovery is generic enough to be directly included -in Prometheus. Chances are your organisation has a proprietary -system in place and you just have to make it work with Prometheus. -This does not mean that you cannot enjoy the benefits of automatically -discovering new monitoring targets. 
- -In this post we will implement a small utility program that connects a custom -service discovery approach based on [etcd](https://coreos.com/etcd/), the -highly consistent distributed key-value store, to Prometheus. - - - -## Targets in etcd and Prometheus - -Our fictional service discovery system stores services and their -instances under a well-defined key schema: - -``` -/services// = -``` - -Prometheus should now automatically add and remove targets for all existing -services as they come and go. -We can integrate with Prometheus's file-based service discovery, which -monitors a set of files that describe targets as lists of target groups in -JSON format. - -A single target group consists of a list of addresses associated with a set of -labels. Those labels are attached to all time series retrieved from those -targets. -One example target group extracted from our service discovery in etcd could -look like this: - -``` -{ - "targets": ["10.0.33.1:54423", "10.0.34.12:32535"], - "labels": { - "job": "node_exporter" - } -} -``` - -## The program - -What we need is a small program that connects to the etcd cluster and performs -a lookup of all services found in the `/services` path and writes them out into -a file of target groups. - -Let's get started with some plumbing. Our tool has two flags: the etcd server -to connect to and the file to which the target groups are written. Internally, -the services are represented as a map from service names to instances. -Instances are a map from the instance identifier in the etcd path to its -address. - -``` -const servicesPrefix = "/services" - -type ( - instances map[string]string - services map[string]instances -) - -var ( - etcdServer = flag.String("server", "http://127.0.0.1:4001", "etcd server to connect to") - targetFile = flag.String("target-file", "tgroups.json", "the file that contains the target groups") -) -``` - -Our `main` function parses the flags and initializes our object holding the -current services. We then connect to the etcd server and do a recursive read -of the `/services` path. -We receive the subtree for the given path as a result and call `srvs.handle`, -which recursively performs the `srvs.update` method for each node in the -subtree. The `update` method modifies the state of our `srvs` object to be -aligned with the state of our subtree in etcd. -Finally, we call `srvs.persist` which transforms the `srvs` object into a list -of target groups and writes them out to the file specified by the -`-target-file` flag. - -``` -func main() { - flag.Parse() - - var ( - client = etcd.NewClient([]string{*etcdServer}) - srvs = services{} - ) - - // Retrieve the subtree of the /services path. - res, err := client.Get(servicesPrefix, false, true) - if err != nil { - log.Fatalf("Error on initial retrieval: %s", err) - } - srvs.handle(res.Node, srvs.update) - srvs.persist() -} -``` - -Let's assume we have this as a working implementation. We could now run this -tool every 30 seconds to have a mostly accurate view of the current targets in -our service discovery. - -But can we do better? - -The answer is _yes_. etcd provides watches, which let us listen for updates on -any path and its sub-paths. With that, we are informed about changes -immediately and can apply them immediately. We also don't have to work through -the whole `/services` subtree again and again, which can become important for -a large number of services and instances. - -We extend our `main` function as follows: - -``` -func main() { - // ... 
- - updates := make(chan *etcd.Response) - - // Start recursively watching for updates. - go func() { - _, err := client.Watch(servicesPrefix, 0, true, updates, nil) - if err != nil { - log.Errorln(err) - } - }() - - // Apply updates sent on the channel. - for res := range updates { - log.Infoln(res.Action, res.Node.Key, res.Node.Value) - - handler := srvs.update - if res.Action == "delete" { - handler = srvs.delete - } - srvs.handle(res.Node, handler) - srvs.persist() - } -} -``` - -We start a goroutine that recursively watches for changes to entries in -`/services`. It blocks forever and sends all changes to the `updates` channel. -We then read the updates from the channel and apply it as before. In case an -instance or entire service disappears however, we call `srvs.handle` using the -`srvs.delete` method instead. - -We finish each update by another call to `srvs.persist` to write out the -changes to the file Prometheus is watching. - -### Modification methods - -So far so good – conceptually this works. What remains are the `update` and -`delete` handler methods as well as the `persist` method. - -`update` and `delete` are invoked by the `handle` method which simply calls -them for each node in a subtree, given that the path is valid: - -``` -var pathPat = regexp.MustCompile(`/services/([^/]+)(?:/(\d+))?`) - -func (srvs services) handle(node *etcd.Node, handler func(*etcd.Node)) { - if pathPat.MatchString(node.Key) { - handler(node) - } else { - log.Warnf("unhandled key %q", node.Key) - } - - if node.Dir { - for _, n := range node.Nodes { - srvs.handle(n, handler) - } - } -} -``` - -#### `update` - -The update methods alters the state of our `services` object -based on the node which was updated in etcd. - -``` -func (srvs services) update(node *etcd.Node) { - match := pathPat.FindStringSubmatch(node.Key) - // Creating a new job directory does not require any action. - if match[2] == "" { - return - } - srv := match[1] - instanceID := match[2] - - // We received an update for an instance. - insts, ok := srvs[srv] - if !ok { - insts = instances{} - srvs[srv] = insts - } - insts[instanceID] = node.Value -} -``` - -#### `delete` - -The delete methods removes instances or entire jobs from our `services` -object depending on which node was deleted from etcd. - -``` -func (srvs services) delete(node *etcd.Node) { - match := pathPat.FindStringSubmatch(node.Key) - srv := match[1] - instanceID := match[2] - - // Deletion of an entire service. - if instanceID == "" { - delete(srvs, srv) - return - } - - // Delete a single instance from the service. - delete(srvs[srv], instanceID) -} -``` - -#### `persist` - -The persist method transforms the state of our `services` object into a list of `TargetGroup`s. It then writes this list into the `-target-file` in JSON -format. - -``` -type TargetGroup struct { - Targets []string `json:"targets,omitempty"` - Labels map[string]string `json:"labels,omitempty"` -} - -func (srvs services) persist() { - var tgroups []*TargetGroup - // Write files for current services. 
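	// Each service becomes one target group: its instance addresses are used
	// as the targets, and the service name is attached as the "job" label.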
- for job, instances := range srvs { - var targets []string - for _, addr := range instances { - targets = append(targets, addr) - } - - tgroups = append(tgroups, &TargetGroup{ - Targets: targets, - Labels: map[string]string{"job": job}, - }) - } - - content, err := json.Marshal(tgroups) - if err != nil { - log.Errorln(err) - return - } - - f, err := create(*targetFile) - if err != nil { - log.Errorln(err) - return - } - defer f.Close() - - if _, err := f.Write(content); err != nil { - log.Errorln(err) - } -} -``` - -## Taking it live - -All done, so how do we run this? - -We simply start our tool with a configured output file: - -``` -./etcd_sd -target-file /etc/prometheus/tgroups.json -``` - -Then we configure Prometheus with file based service discovery -using the same file. The simplest possible configuration looks like this: - -``` -scrape_configs: -- job_name: 'default' # Will be overwritten by job label of target groups. - file_sd_configs: - - names: ['/etc/prometheus/tgroups.json'] -``` - -And that's it. Now our Prometheus stays in sync with services and their -instances entering and leaving our service discovery with etcd. - -## Conclusion - -If Prometheus does not ship with native support for the service discovery of -your organisation, don't despair. Using a small utility program you can easily -bridge the gap and profit from seamless updates to the monitored targets. -Thus, you can remove changes to the monitoring configuration from your -deployment equation. - -A big thanks to our contributors [Jimmy Dyson](https://twitter.com/jimmidyson) -and [Robert Jacob](https://twitter.com/xperimental) for adding native support -for [Kubernetes](http://kubernetes.io/) and [Marathon](https://mesosphere.github.io/marathon/). -Also check out [Keegan C Smith's](https://twitter.com/keegan_csmith) take on [EC2 service discovery](https://github.com/keegancsmith/prometheus-ec2-discovery) based on files. - -You can find the [full source of this blog post on GitHub](https://github.com/fabxc/prom_sd_example/tree/master/etcd_simple). - diff --git a/content/blog/2016-01-26-one-year-of-open-prometheus-development.md b/content/blog/2016-01-26-one-year-of-open-prometheus-development.md deleted file mode 100644 index ab639462..00000000 --- a/content/blog/2016-01-26-one-year-of-open-prometheus-development.md +++ /dev/null @@ -1,151 +0,0 @@ ---- -title: One Year of Open Prometheus Development -created_at: 2016-01-26 -kind: article -author_name: Julius Volz ---- - -## The beginning - -A year ago today, we officially announced Prometheus to the wider world. This -is a great opportunity for us to look back and share some of the wonderful -things that have happened to the project since then. But first, let's start at -the beginning. - -Although we had already started Prometheus as an open-source project on GitHub in -2012, we didn't make noise about it at first. We wanted to give the project -time to mature and be able to experiment without friction. Prometheus was -gradually introduced for production monitoring at -[SoundCloud](https://soundcloud.com/) in 2013 and then saw more and more -usage within the company, as well as some early adoption by our friends at -Docker and Boxever in 2014. Over the years, Prometheus was growing more and -more mature and although it was already solving people's monitoring problems, -it was still unknown to the wider public. - - - -## Going public - -Everything changed for us a year ago, in January of 2015. 
After more than two -years of development and internal usage, we felt that Prometheus was ready for -a wider audience and decided to go fully public with our official [announcement -blog post](https://developers.soundcloud.com/blog/prometheus-monitoring-at-soundcloud), -a [website](https://prometheus.io/), and a series of -[related](http://www.boxever.com/tags/monitoring) -[posts](http://5pi.de/2015/01/26/monitor-docker-containers-with-prometheus/). -We already received a good deal of attention during the first week after the -announcement, but nothing could prepare us for what happened a week later: -someone unknown to us (hello there, -[jjwiseman](https://news.ycombinator.com/user?id=jjwiseman)!) had submitted -[the Prometheus website](https://prometheus.io/) to Hacker News and somehow their -post had made it [all the way to the top](https://news.ycombinator.com/item?id=8995696). - -This is when things started going slightly crazy in a good way. We saw a sharp -rise in contributors, mailing list questions, GitHub issues, IRC visitors, -requests for conference and meetup talks, and increasing buzz on the net in -general. Since the beginning, we have been very lucky about the quality of our -newly expanded community: The kind of people who were attracted to Prometheus -also turned out to be very competent, constructive, and high-quality -contributors and users. The ideal open-source scenario of receiving a lot of -value back from the community was a reality pretty much from day one. - -What does all that Hacker News buzz look like in terms of GitHub stars? Try and -see if you can find the exact moment in this graph (ironically, a Gnuplot and -not Prometheus graph) when we went out of "dark mode" and got hit by Hacker -News: - -[![Prometheus GitHub stars](/assets/prometheus_github_stars.png)](/assets/prometheus_github_stars.png) - -This attention also put us in the 4th place of GitHub's trending repositories -worldwide: - -[![Prometheus trending on GitHub](/assets/prometheus_github_trending.png)](/assets/prometheus_github_trending.png) - -## After the first wave - -After those first weeks, the initial onslaught of incoming communication cooled -down a bit, but we were and still are receiving constantly growing adoption. - -To give you an idea of the ecosystem, we now have: - -- 33 repositories in our GitHub organization -- ~4800 total GitHub stars -- 200+ contributors -- 2300+ pull requests (60+ open) -- 1100+ issues (300+ open) -- 150+ people in our IRC channel (`#prometheus` on FreeNode) -- 250+ people on the mailing list who have created 300+ threads -- 20+ Prometheus-related talks and workshops -- 100+ articles and blog posts - -Besides countless smaller features and bug fixes to existing projects, the -community has contributed many projects of their own to the Prometheus -ecosystem. Most of them are exporters that translate metrics from existing -systems into Prometheus's data model, but there have also been important -additions to Prometheus itself, such as service discovery mechanisms for -[Kubernetes](http://kubernetes.io/), -[Marathon](https://mesosphere.github.io/marathon/) and -[EC2](http://aws.amazon.com/ec2/). - -Shortly after making more noise about Prometheus, we also found one contributor -([Fabian](https://github.com/fabxc)) so outstanding that he ended up joining -SoundCloud to work on Prometheus. 
He has since become the most active developer -on the project and we have him to thank for major new features such as -generalized service discovery support, runtime-reloadable configurations, new -powerful query language features, a custom-built query parser, and so much -more. He is currently working on the new beta rewrite of the -[Alertmanager](https://github.com/prometheus/alertmanager). - -Finally, we have been honored to be recognized and adopted by major players in -the industry. [Google](https://www.google.com) is now instrumenting its open-source -container management system [Kubernetes](http://kubernetes.io/) natively with -Prometheus metrics. [CoreOS](https://coreos.com/) is picking it up for -[etcd](https://coreos.com/etcd/)'s monitoring as well. [DigitalOcean](https://www.digitalocean.com/) is betting on Prometheus for their -internal monitoring. By now, the list of companies using Prometheus in one way -or another has become too long to mention all of them: -[Google](https://www.google.com), -[CoreOS](https://coreos.com/), [Docker](https://docker.com), -[Boxever](http://www.boxever.com/), -[DigitalOcean](https://www.digitalocean.com/), [Financial Times](http://www.ft.com/), -[Improbable](http://improbable.io/), [KPMG](https://www.kpmg.com), and many more. -Even the world's largest digital festival, -[DreamHack](https://www.dreamhack.se), has [used -Prometheus](/blog/2015/06/24/monitoring-dreamhack/) to keep -tabs on their network infrastructure in 2015, and -[FOSDEM](https://fosdem.org/2016/) will do so in 2016. - -The widely popular dashboard builder [Grafana](http://grafana.org/) also added -native Prometheus backend support in [version -2.5](http://grafana.org/blog/2015/10/28/Grafana-2-5-Released.html). Since -people all around the world are already using and loving Grafana, we are going -to focus on improving Grafana's Prometheus integration and will invest -less energy in our own dashboard builder -[PromDash](https://github.com/prometheus/promdash) in the future. - -With the Prometheus ecosystem continuing to grow, the first users have started -asking about commercial support. While Prometheus will always remain an -independent open source project, one of our core contributors ([Brian -Brazil](https://github.com/brian-brazil)) has recently founded his own company, -[Robust Perception](https://www.robustperception.io/), which provides support -and consulting services around Prometheus and monitoring in general. - -On a lighter note, 2015 has also been the year in which Brian proved Prometheus's query -language to be Turing complete by implementing -[Conway's Game of Life in PromQL](https://www.robustperception.io/conways-life-in-prometheus/). - -## The road ahead - -Both personally and technically, we are really excited about what has happened -last year in Prometheus-land. We love the opportunity to provide the world with -a powerful new approach to monitoring, especially one that is much better -suited towards modern cloud- and container-based infrastructures than -traditional solutions. We are also very grateful to all contributors and -hope to continuously improve Prometheus for everyone. - -Although Prometheus is relatively mature by now, we have a list of major goals -we want to tackle in 2016. The highlights will be polishing the new -Alertmanager rewrite, supporting full read and write integration for external -long-term storage, as well as eventually releasing a stable 1.0 version of the -Prometheus server itself. - -Stay tuned! 
diff --git a/content/blog/2016-03-03-custom-alertmanager-templates.md b/content/blog/2016-03-03-custom-alertmanager-templates.md deleted file mode 100644 index c1ae33cc..00000000 --- a/content/blog/2016-03-03-custom-alertmanager-templates.md +++ /dev/null @@ -1,128 +0,0 @@ ---- -title: Custom Alertmanager Templates -created_at: 2016-03-03 -kind: article -author_name: Fabian Reinartz ---- - -The Alertmanager handles alerts sent by Prometheus servers and sends -notifications about them to different receivers based on their labels. - -A receiver can be one of many different integrations such as PagerDuty, Slack, -email, or a custom integration via the generic webhook interface (for example [JIRA](https://github.com/fabxc/jiralerts)). - -## Templates - -The messages sent to receivers are constructed via templates. -Alertmanager comes with default templates but also allows defining custom -ones. - -In this blog post, we will walk through a simple customization of Slack -notifications. - -We use this simple Alertmanager configuration that sends all alerts to Slack: - -```yaml -global: - slack_api_url: '' - -route: - receiver: 'slack-notifications' - # All alerts in a notification have the same value for these labels. - group_by: [alertname, datacenter, app] - -receivers: -- name: 'slack-notifications' - slack_configs: - - channel: '#alerts' -``` - -By default, a Slack message sent by Alertmanager looks like this: - -![](/assets/blog/2016-03-03/slack_alert_before.png) - -It shows us that there is one firing alert, followed by the label values of -the alert grouping (alertname, datacenter, app) and further label values the -alerts have in common (critical). - - - -## Customize - -If you have alerts, you should also have documentation on how to handle them – -a runbook. A good approach to that is having a wiki that has a section for -each app you are running with a page for each alert. - -Suppose we have such a wiki running at `https://internal.myorg.net/wiki/alerts`. -Now we want links to these runbooks shown in our Slack messages. - -In our template, we need access to the "alertname" and the "app" label. Since -these are labels we group alerts by, they are available in the `GroupLabels` -map of our templating data. - -We can directly add custom templating to our Alertmanager's [Slack configuration](/docs/alerting/configuration/#slack-receiver-slack_config) -that is used for the `text` section of our Slack message. -The [templating language](https://godoc.org/text/template) is the one provided -by the Go programming language. - -```yaml -global: - slack_api_url: '' - -route: -- receiver: 'slack-notifications' - group_by: [alertname, datacenter, app] - -receivers: -- name: 'slack-notifications' - slack_configs: - - channel: '#alerts' - # Template for the text field in Slack messages. - text: 'https://internal.myorg.net/wiki/alerts/{{ .GroupLabels.app }}/{{ .GroupLabels.alertname }}' -``` - -We reload our Alertmanager by sending a `SIGHUP` or restart it to load the -changed configuration. Done. - -Our Slack notifications now look like this: - -![](/assets/blog/2016-03-03/slack_alert_after.png) - -### Template files - -Alternatively, we can also provide a file containing named templates, which -are then loaded by Alertmanager. This is especially helpful for more complex -templates that span many lines. 
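
For example, a named template that iterates over all alerts in a notification and prints one line per alert might look roughly like this (a sketch assuming the standard notification template data; the template name and label names are only illustrative):

```
{{ define "slack.myorg.alertlist" }}{{ range .Alerts }}{{ .Labels.alertname }} on {{ .Labels.instance }}
{{ end }}{{ end }}
```
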
- -We create a file `/etc/alertmanager/templates/myorg.tmpl` and create a -template in it named "slack.myorg.text": - -``` -{{ define "slack.myorg.text" }}https://internal.myorg.net/wiki/alerts/{{ .GroupLabels.app }}/{{ .GroupLabels.alertname }}{{ end}} -``` - -Our configuration now loads the template with the given name for the "text" -field and we provide a path to our custom template file: - -```yaml -global: - slack_api_url: '' - -route: -- receiver: 'slack-notifications' - group_by: [alertname, datacenter, app] - -receivers: -- name: 'slack-notifications' - slack_configs: - - channel: '#alerts' - text: '{{ template "slack.myorg.text" . }}' - -templates: -- '/etc/alertmanager/templates/myorg.tmpl' -``` - -We reload our Alertmanager by sending a `SIGHUP` or restart it to load the -changed configuration and the new template file. Done. - -To test and iterate on your Prometheus Alertmanager notification templates for Slack you can use the following [tool](https://juliusv.com/promslack/). diff --git a/content/blog/2016-03-23-interview-with-life360.md b/content/blog/2016-03-23-interview-with-life360.md deleted file mode 100644 index f4eae4da..00000000 --- a/content/blog/2016-03-23-interview-with-life360.md +++ /dev/null @@ -1,110 +0,0 @@ ---- -title: Interview with Life360 -created_at: 2016-03-23 -kind: article -author_name: Brian Brazil ---- - -*This is the first in a series of interviews with users of Prometheus, allowing -them to share their experiences of evaluating and using Prometheus. Our first -interview is with Daniel from Life360.* - -## Can you tell us about yourself and what Life360 does? - -I’m Daniel Ben Yosef, a.k.a, dby, and I’m an Infrastructure Engineer for -[Life360](https://www.life360.com/), and before that, I’ve held systems -engineering roles for the past 9 years. - -Life360 creates technology that helps families stay connected, we’re the Family -Network app for families. We’re quite busy handling these families - at peak -we serve 700k requests per minute for 70 million registered families. - -[](https://www.life360.com/) - -We manage around 20 services in production, mostly handling location requests -from mobile clients (Android, iOS, and Windows Phone), spanning over 150+ -instances at peak. Redundancy and high-availability are our goals and we strive -to maintain 100% uptime whenever possible because families trust us to be -available. - -We hold user data in both our MySQL multi-master cluster and in our 12-node -Cassandra ring which holds around 4TB of data at any given time. We have -services written in Go, Python, PHP, as well as plans to introduce Java to our -stack. We use Consul for service discovery, and of course our Prometheus setup -is integrated with it. - - - -## What was your pre-Prometheus monitoring experience? - -Our monitoring setup, before we switched to Prometheus, included many -components such as: - - * Copperegg (now Idera) - * Graphite + Statsd + Grafana - * Sensu - * AWS Cloudwatch - -We primarily use MySQL, NSQ and HAProxy and we found that all of the monitoring -solutions mentioned above were very partial, and required a lot of -customization to actually get all working together. - -## Why did you decide to look at Prometheus? - -We had a few reasons for switching to Prometheus, one of which is that we -simply needed better monitoring. 
- -Prometheus has been known to us for a while, and we have been tracking it and -reading about the active development, and at a point (a few months back) we -decided to start evaluating it for production use. - -The PoC results were incredible. The monitoring coverage of MySQL was amazing, -and we also loved the JMX monitoring for Cassandra, which had been sorely -lacking in the past. - -[![Cassandra Client Dashboard](/assets/blog/2016-03-23/cx_client.png)](/assets/blog/2016-03-23/cx_client.png) - -## How did you transition? - -We started with a relatively small box (4GB of memory) as an initial point. It -was effective for a small number of services, but not for our full monitoring -needs. - -We also initially deployed with Docker, but slowly transitioned to its own box -on an r3.2xl instance (60GB ram), and that holds all of our service monitoring -needs with 30 days of in-memory data. - -We slowly started introducing all of our hosts with the Node Exporter and built -Grafana graphs, up to the point where we had total service coverage. - -We were also currently looking at InfluxDB for long term storage, but due to -[recent developments](https://influxdata.com/blog/update-on-influxdb-clustering-high-availability-and-monetization/), -this may no longer be a viable option. - -We then added exporters for MySQL, Node, Cloudwatch, HAProxy, JMX, NSQ (with a -bit of our own code), Redis and Blackbox (with our own contribution to add -authentication headers). - -[![NSQ Overview Dashboard](/assets/blog/2016-03-23/nsq_overview.png)](/assets/blog/2016-03-23/nsq_overview.png) - - -## What improvements have you seen since switching? - -The visibility and instrumentation gain was the first thing we saw. Right -before switching, we started experiencing Graphite’s scalability issues, and -having an in-place replacement for Graphite so stakeholders can continue to use -Grafana as a monitoring tool was extremely valuable to us. Nowadays, we are -focusing on taking all that data and use it to detect anomalies, which will -eventually become alerts in the Alert Manager. - - -## What do you think the future holds for Life360 and Prometheus? - -We currently have one of our projects instrumented directly with a Prometheus -client, a Python-based service. As we build out new services, Prometheus is -becoming our go-to for instrumentation, and will help us gain extremely -meaningful alerts and stats about our infrastructure. - -We look forward to growing with the project and keep contributing. - -*Thank you Daniel! The source for Life360's dashboards is shared on [Github](https://github.com/life360/prometheus-grafana-dashboards).* diff --git a/content/blog/2016-05-01-interview-with-showmax.md b/content/blog/2016-05-01-interview-with-showmax.md deleted file mode 100644 index c9f38ed0..00000000 --- a/content/blog/2016-05-01-interview-with-showmax.md +++ /dev/null @@ -1,148 +0,0 @@ ---- -title: Interview with ShowMax -created_at: 2016-05-01 -kind: article -author_name: Brian Brazil ---- - -*This is the second in a series of interviews with users of Prometheus, allowing -them to share their experiences of evaluating and using Prometheus.* - -## Can you tell us about yourself and what ShowMax does? - -I’m Antonin Kral, and I’m leading research and architecture for -[ShowMax](http://www.showmax.com). Before that, I’ve held architectural and CTO -roles for the past 12 years. - -ShowMax is a subscription video on demand service that launched in South Africa -in 2015. 
We’ve got an extensive content catalogue with more than 20,000 -episodes of TV shows and movies. Our service is currently available in 65 -countries worldwide. While better known rivals are skirmishing in America and -Europe, ShowMax is battling a more difficult problem: how do you binge-watch -in a barely connected village in sub-Saharan Africa? Already 35% of video -around the world is streamed, but there are still so many places the revolution -has left untouched. - -![ShowMax logo](/assets/blog/2016-05-01/showmax-logo.png) - -We are managing about 50 services running mostly on private clusters built -around CoreOS. They are primarily handling API requests from our clients -(Android, iOS, AppleTV, JavaScript, Samsung TV, LG TV etc), while some of them -are used internally. One of the biggest internal pipelines is video encoding -which can occupy 400+ physical servers when handling large ingestion batches. - -The majority of our back-end services are written in Ruby, Go or Python. We use -EventMachine when writing apps in Ruby (Goliath on MRI, Puma on JRuby). Go is -typically used in apps that require large throughput and don’t have so much -business logic. We’re very happy with Falcon for services written in Python. -Data is stored in PostgreSQL and ElasticSearch clusters. We use etcd and custom -tooling for configuring Varnishes for routing requests. - - - -## What was your pre-Prometheus monitoring experience? - -The primary use-cases for monitoring systems are: - -* Active monitoring and probing (via Icinga) -* Metrics acquisition and creation of alerts based on these metrics (now Prometheus) -* Log acquisition from backend services -* Event and log acquisition from apps - -The last two use-cases are handled via our logging infrastructure. It consists -of a collector running in the service container, which is listening on local -Unix socket. The socket is used by apps to send messages to the outside world. -Messages are transferred via RabbitMQ servers to consumers. Consumers are -custom written or hekad based. One of the main message flows is going towards -the service ElasticSearch cluster, which makes logs accessible for Kibana and -ad-hoc searches. We also save all processed events to GlusterFS for archival -purposes and/or further processing. - -We used to run two metric acquisition pipelines in parallel. The first is based -on Collectd + StatsD + Graphite + Grafana and the other using Collectd + -OpenTSDB. We have struggled considerably with both pipelines. We had to deal -with either the I/O hungriness of Graphite, or the complexity and inadequate -tooling around OpenTSDB. - - -## Why did you decide to look at Prometheus? - -After learning from our problems with the previous monitoring system, we looked -for a replacement. Only a few solutions made it to our shortlist. Prometheus -was one of the first, as Jiri Brunclik, our head of Operations at the time, had -received a personal recommendation about the system from former colleagues at -Google. - -The proof of concept went great. We got a working system very quickly. We also -evaluated InfluxDB as a main system as well as a long-term storage for -Prometheus. But due to recent developments, this may no longer be a viable -option for us. - - -## How did you transition? - -We initially started with LXC containers on one of our service servers, but -quickly moved towards a dedicated server from Hetzner, where we host the -majority of our services. 
We’re using PX70-SSD, which is Intel® Xeon® E3-1270 -v3 Quad-Core Haswell with 32GB RAM, so we have plenty of power to run -Prometheus. SSDs allow us to have retention set to 120 days. Our logging -infrastructure is built around getting logs locally (receiving them on Unix -socket) and then pushing them towards the various workers. - -![Diagram of ShowMax logging infrastructure. Shows flow of log messages from the source via processors to various consumers.](/assets/blog/2016-05-01/Loggin_infrastructure.png) - -Having this infrastructure available made pushing metrics a logical choice -(especially in pre-Prometheus times). On the other side, Prometheus is -primarily designed around the paradigm of scraping metrics. We wanted to stay -consistent and push all metrics towards Prometheus initially. We have created a -Go daemon called prometheus-pusher. It’s responsible for scraping metrics from -local exporters and pushing them towards the Pushgateway. Pushing metrics has -some positive aspects (e.g. simplified service discovery) but also quite a few -drawbacks (e.g. making it hard to distinguish between a network partition vs. a -crashed service). We made Prometheus-pusher available on -[GitHub](https://github.com/ShowMax/prometheus-pusher), so you can try it -yourself. - -![Grafana dashboard showing April 5th 2016 log processors traffic.](/assets/blog/2016-05-01/log_processors.png) - -The next step was for us to figure out what to use for managing dashboards and -graphs. We liked the Grafana integration, but didn’t really like how Grafana -manages dashboard configurations. We are running Grafana in a Docker -container, so any changes should be kept out of the container. Another problem -was the lack of change tracking in Grafana. - -We have thus decided to write a generator which takes YAML maintained within -git and generates JSON configs for Grafana dashboards. It is furthermore able to -deploy dashboards to Grafana started in a fresh container without the need for -persisting changes made into the container. This provides you with automation, -repeatability, and auditing. - -We are pleased to announce that this tool is also now available under an Apache -2.0 license on [GitHub](https://github.com/ShowMax/grafana-dashboards-generator). - - -## What improvements have you seen since switching? - -An improvement which we saw immediately was the stability of Prometheus. We -were fighting with stability and scalability of Graphite prior to this, so -getting that sorted was a great win for us. Furthermore the speed and stability -of Prometheus made access to metrics very easy for developers. Prometheus is -really helping us to embrace the DevOps culture. - -Tomas Cerevka, one of our backend developers, was testing a new version of the -service using JRuby. He needed a quick peek into the heap consumption of that -particular service. He was able to get that information in a snap. For us, -this speed is essential. - -![Heap size consumed by JRuby worker during troubleshooting memory issues on JVM.](/assets/blog/2016-05-01/ui_fragments-heap-zoom.png) - -## What do you think the future holds for ShowMax and Prometheus? - -Prometheus has become an integral part of monitoring in ShowMax and it is going -to be with us for the foreseeable future. We have replaced our whole metric -storage with Prometheus, but the ingestion chain remains push based. We are -thus thinking about following Prometheus best practices and switching to a pull -model. - -We’ve also already played with alerts. 
We want to spend more time on this topic -and come up with increasingly sophisticated alert rules. diff --git a/content/blog/2016-05-08-when-to-use-varbit-chunks.md b/content/blog/2016-05-08-when-to-use-varbit-chunks.md deleted file mode 100644 index 5b6c950e..00000000 --- a/content/blog/2016-05-08-when-to-use-varbit-chunks.md +++ /dev/null @@ -1,155 +0,0 @@ ---- -title: When (not) to use varbit chunks -created_at: 2016-05-08 -kind: article -author_name: Björn “Beorn” Rabenstein ---- - -The embedded time series database (TSDB) of the Prometheus server organizes the -raw sample data of each time series in chunks of constant 1024 bytes size. In -addition to the raw sample data, a chunk contains some meta-data, which allows -the selection of a different encoding for each chunk. The most fundamental -distinction is the encoding version. You select the version for newly created -chunks via the command line flag `-storage.local.chunk-encoding-version`. Up to -now, there were only two supported versions: 0 for the original delta encoding, -and 1 for the improved double-delta encoding. With release -[0.18.0](https://github.com/prometheus/prometheus/releases/tag/0.18.0), we -added version 2, which is another variety of double-delta encoding. We call it -_varbit encoding_ because it involves a variable bit-width per sample within -the chunk. While version 1 is superior to version 0 in almost every aspect, -there is a real trade-off between version 1 and 2. This blog post will help you -to make that decision. Version 1 remains the default encoding, so if you want -to try out version 2 after reading this article, you have to select it -explicitly via the command line flag. There is no harm in switching back and -forth, but note that existing chunks will not change their encoding version -once they have been created. However, these chunks will gradually be phased out -according to the configured retention time and will thus be replaced by chunks -with the encoding specified in the command-line flag. - - - -## What is varbit encoding? - -From the beginning, we designed the chunked sample storage for easy addition of -new encodings. When Facebook published a -[paper on their in-memory TSDB Gorilla](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf), -we were intrigued by a number of similarities between the independently -developed approaches of Gorilla and Prometheus. However, there were also many -fundamental differences, which we studied in detail, wondering if we could get -some inspiration from Gorilla to improve Prometheus. - -On the rare occasion of a free weekend ahead of me, I decided to give it a -try. In a coding spree, I implemented what would later (after a considerable -amount of testing and debugging) become the varbit encoding. - -In a future blog post, I will describe the technical details of the -encoding. For now, you only need to know a few characteristics for your -decision between the new varbit encoding and the traditional double-delta -encoding. (I will call the latter just “double-delta encoding” from now on but -note that the varbit encoding also uses double deltas, just in a different -way.) - -## What are the advantages of varbit encoding? - -In short: It offers a way better compression ratio. While the double-delta -encoding needs about 3.3 bytes per sample for real-life data sets, the varbit -encoding went as far down as 1.28 bytes per sample on a typical large -production server at SoundCloud. 
That's almost three times more space efficient -(and even slightly better than the 1.37 bytes per sample reported for Gorilla – -but take that with a grain of salt as the typical data set at SoundCloud might -look different from the typical data set at Facebook). - -Now think of the implications: Three times more samples in RAM, three times -more samples on disk, only a third of disk ops, and since disk ops are -currently the bottleneck for ingestion speed, it will also allow ingestion to -be three times faster. In fact, the recently reported new ingestion record of -800,000 samples per second was only possible with varbit chunks – and with an -SSD, obviously. With spinning disks, the bottleneck is reached far earlier, and -thus the 3x gain matters even more. - -All of this sounds too good to be true… - -## So where is the catch? - -For one, the varbit encoding is more complex. The computational cost to encode -and decode values is therefore somewhat increased, which fundamentally affects -everything that writes or reads sample data. Luckily, it is only a proportional -increase of something that usually contributes only a small part to the total -cost of an operation. - -Another property of the varbit encoding is potentially way more relevant: -samples in varbit chunks can only be accessed sequentially, while samples in -double-delta encoded chunks are randomly accessible by index. Since writes in -Prometheus are append-only, the different access patterns only affect reading -of sample data. The practical impact depends heavily on the nature of the -originating PromQL query. - -A pretty harmless case is the retrieval of all samples within a time -interval. This happens when evaluating a range selector or rendering a -dashboard with a resolution similar to the scrape frequency. The Prometheus -storage engine needs to find the starting point of the interval. With -double-delta chunks, it can perform a binary search, while it has to scan -sequentially through a varbit chunk. However, once the starting point is found, -all remaining samples in the interval need to be decoded sequentially anyway, -which is only slightly more expensive with the varbit encoding. - -The trade-off is different for retrieving a small number of non-adjacent -samples from a chunk, or for plainly retrieving a single sample in a so-called -instant query. Potentially, the storage engine has to iterate through a lot of -samples to find the few samples to be returned. Fortunately, the most common -source of instant queries are rule evaluations referring to the latest sample -in each involved time series. Not completely by coincidence, I recently -improved the retrieval of the latest sample of a time series. Essentially, the -last sample added to a time series is cached now. A query that needs only the -most recent sample of a time series doesn't even hit the chunk layer anymore, -and the chunk encoding is irrelevant in that case. - -Even if an instant query refers to a sample in the past and therefore has to -hit the chunk layer, most likely other parts of the query, like the index -lookup, will dominate the total query time. But there are real-life queries -where the sequential access pattern required by varbit chunks will start to -matter a lot. - -## What is the worst-case query for varbit chunks? - -The worst case for varbit chunks is if you need just one sample from somewhere -in the middle of _each_ chunk of a very long time series. Unfortunately, there -is a real use-case for that. 
Let's assume a time series compresses nicely -enough to make each chunk last for about eight hours. That's about three chunks -a day, or about 100 chunks a month. If you have a dashboard that displays the -time series in question for the last month with a resolution of 100 data -points, the dashboard will execute a query that retrieves a single sample from -100 different chunks. Even then, the differences between chunk encodings will -be dominated by other parts of the query execution time. Depending on -circumstances, my guess would be that the query might take 50ms with -double-delta encoding and 100ms with varbit encoding. - -However, if your dashboard query doesn't only touch a single time series but -aggregates over thousands of time series, the number of chunks to access -multiplies accordingly, and the overhead of the sequential scan will become -dominant. (Such queries are frowned upon, and we usually recommend to use a -[recording rule](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/#recording-rules) -for queries of that kind that are used frequently, e.g. in a dashboard.) But -with the double-delta encoding, the query time might still have been -acceptable, let's say around one second. After the switch to varbit encoding, -the same query might last tens of seconds, which is clearly not what you want -for a dashboard. - -## What are the rules of thumb? - -To put it as simply as possible: If you are neither limited on disk capacity -nor on disk ops, don't worry and stick with the default of the classical -double-delta encoding. - -However, if you would like a longer retention time or if you are currently -bottle-necked on disk ops, I invite you to play with the new varbit -encoding. Start your Prometheus server with -`-storage.local.chunk-encoding-version=2` and wait for a while until you have -enough new chunks with varbit encoding to vet the effects. If you see queries -that are becoming unacceptably slow, check if you can use -[recording rules](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/#recording-rules) -to speed them up. Most likely, those queries will gain a lot from that even -with the old double-delta encoding. - -If you are interested in how the varbit encoding works behind the scenes, stay -tuned for another blog post in the not too distant future. diff --git a/content/blog/2016-05-09-prometheus-to-join-the-cloud-native-computing-foundation.md b/content/blog/2016-05-09-prometheus-to-join-the-cloud-native-computing-foundation.md deleted file mode 100644 index 5cc61514..00000000 --- a/content/blog/2016-05-09-prometheus-to-join-the-cloud-native-computing-foundation.md +++ /dev/null @@ -1,30 +0,0 @@ ---- -title: Prometheus to Join the Cloud Native Computing Foundation -created_at: 2016-05-09 -kind: article -author_name: Julius Volz on behalf of the Prometheus core developers ---- - -Since the inception of Prometheus, we have been looking for a sustainable -governance model for the project that is independent of any single company. -Recently, we have been in discussions with the newly formed [Cloud Native -Computing Foundation](https://cncf.io/) (CNCF), which is backed by Google, -CoreOS, Docker, Weaveworks, Mesosphere, and [other leading infrastructure -companies](https://cncf.io/about/members). 
- -Today, we are excited to announce that the CNCF's Technical Oversight Committee -[voted unanimously](http://lists.cncf.io/pipermail/cncf-toc/2016-May/000198.html) to -accept Prometheus as a second hosted project after Kubernetes! You can find -more information about these plans in the -[official press release by the CNCF](https://cncf.io/news/news/2016/05/cloud-native-computing-foundation-accepts-prometheus-second-hosted-project). - -By joining the CNCF, we hope to establish a clear and sustainable project -governance model, as well as benefit from the resources, infrastructure, and -advice that the independent foundation provides to its members. - -We think that the CNCF and Prometheus are an ideal thematic match, as both -focus on bringing about a modern vision of the cloud. - -In the following months, we will be working with the CNCF on finalizing the -project governance structure. We will report back when there are more details -to announce. diff --git a/content/blog/2016-07-18-prometheus-1-0-released.md b/content/blog/2016-07-18-prometheus-1-0-released.md deleted file mode 100644 index 18d50437..00000000 --- a/content/blog/2016-07-18-prometheus-1-0-released.md +++ /dev/null @@ -1,57 +0,0 @@ ---- -title: Prometheus reaches 1.0 -created_at: 2016-07-18 -kind: article -author_name: Fabian Reinartz on behalf of the Prometheus team ---- - -In January, we published a blog post on [Prometheus’s first year of public existence](https://prometheus.io/blog/2016/01/26/one-year-of-open-prometheus-development/), summarizing what has been an amazing journey for us, and hopefully an innovative and useful monitoring solution for you. -Since then, [Prometheus has also joined the Cloud Native Computing Foundation](https://prometheus.io/blog/2016/05/09/prometheus-to-join-the-cloud-native-computing-foundation/), where we are in good company, as the second charter project after [Kubernetes](http://kubernetes.io/). - -Our recent work has focused on delivering a stable API and user interface, marked by version 1.0 of Prometheus. -We’re thrilled to announce that we’ve reached this goal, and [Prometheus 1.0 is available today](https://github.com/prometheus/prometheus/releases/tag/v1.0.0). - -## What does 1.0 mean for you? - -If you have been using Prometheus for a while, you may have noticed that the rate and impact of breaking changes significantly decreased over the past year. -In the same spirit, reaching 1.0 means that subsequent 1.x releases will remain API stable. Upgrades won’t break programs built atop the Prometheus API, and updates won’t require storage re-initialization or deployment changes. Custom dashboards and alerts will remain intact across 1.x version updates as well. -We’re confident Prometheus 1.0 is a solid monitoring solution. Now that the Prometheus server has reached a stable API state, other modules will follow it to their own stable version 1.0 releases over time. - -### Fine print - -So what does API stability mean? Prometheus has a large surface area and some parts are certainly more mature than others. 
-There are two simple categories, _stable_ and _unstable_: - -Stable as of v1.0 and throughout the 1.x series: - -* The query language and data model -* Alerting and recording rules -* The ingestion exposition formats -* Configuration flag names -* HTTP API (used by dashboards and UIs) -* Configuration file format (minus the non-stable service discovery integrations, see below) -* Alerting integration with Alertmanager 0.1+ for the foreseeable future -* Console template syntax and semantics - -Unstable and may change within 1.x: - -* The remote storage integrations (InfluxDB, OpenTSDB, Graphite) are still experimental and will at some point be removed in favor of a generic, more sophisticated API that allows storing samples in arbitrary storage systems. -* Several service discovery integrations are new and need to keep up with fast evolving systems. Hence, integrations with Kubernetes, Marathon, Azure, and EC2 remain in beta status and are subject to change. However, changes will be clearly announced. -* Exact flag meanings may change as necessary. However, changes will never cause the server to not start with previous flag configurations. -* Go APIs of packages that are part of the server. -* HTML generated by the web UI. -* The metrics in the `/metrics` endpoint of Prometheus itself. -* Exact on-disk format. Potential changes however, will be forward compatible and transparently handled by Prometheus. - -## So Prometheus is complete now? - -Absolutely not. We have a long roadmap ahead of us, full of great features to implement. Prometheus will not stay in 1.x for years to come. The infrastructure space is evolving rapidly and we fully intend for Prometheus to evolve with it. -This means that we will remain willing to question what we did in the past and are open to leave behind things that have lost relevance. There will be new major versions of Prometheus to facilitate future plans like persistent long-term storage, newer iterations of Alertmanager, internal storage improvements, and many things we don’t even know about yet. - -## Closing thoughts - -We want to thank our fantastic community for field testing new versions, filing bug reports, contributing code, helping out other community members, and shaping Prometheus by participating in countless productive discussions. -In the end, you are the ones who make Prometheus successful. - -Thank you, and keep up the great work! - diff --git a/content/blog/2016-07-23-pull-does-not-scale-or-does-it.md b/content/blog/2016-07-23-pull-does-not-scale-or-does-it.md deleted file mode 100644 index f954972a..00000000 --- a/content/blog/2016-07-23-pull-does-not-scale-or-does-it.md +++ /dev/null @@ -1,181 +0,0 @@ ---- -title: Pull doesn't scale - or does it? -created_at: 2016-07-23 -kind: article -author_name: Julius Volz ---- - -Let's talk about a particularly persistent myth. Whenever there is a discussion -about monitoring systems and Prometheus's pull-based metrics collection -approach comes up, someone inevitably chimes in about how a pull-based approach -just “fundamentally doesn't scale”. The given reasons are often vague or only -apply to systems that are fundamentally different from Prometheus. In fact, -having worked with pull-based monitoring at the largest scales, this claim runs -counter to our own operational experience. - -We already have an FAQ entry about -[why Prometheus chooses pull over push](/docs/introduction/faq/#why-do-you-pull-rather-than-push), -but it does not focus specifically on scaling aspects. 
Let's have a closer look -at the usual misconceptions around this claim and analyze whether and how they -would apply to Prometheus. - -## Prometheus is not Nagios - -When people think of a monitoring system that actively pulls, they often think -of Nagios. Nagios has a reputation of not scaling well, in part due to spawning -subprocesses for active checks that can run arbitrary actions on the Nagios -host in order to determine the health of a certain host or service. This sort -of check architecture indeed does not scale well, as the central Nagios host -quickly gets overwhelmed. As a result, people usually configure checks to only -be executed every couple of minutes, or they run into more serious problems. - -However, Prometheus takes a fundamentally different approach altogether. -Instead of executing check scripts, it only collects time series data from a -set of instrumented targets over the network. For each target, the Prometheus -server simply fetches the current state of all metrics of that target over HTTP -(in a highly parallel way, using goroutines) and has no other execution -overhead that would be pull-related. This brings us to the next point: - -## It doesn't matter who initiates the connection - -For scaling purposes, it doesn't matter who initiates the TCP connection over -which metrics are then transferred. Either way you do it, the effort for -establishing a connection is small compared to the metrics payload and other -required work. - -But a push-based approach could use UDP and avoid connection establishment -altogether, you say! True, but the TCP/HTTP overhead in Prometheus is still -negligible compared to the other work that the Prometheus server has to do to -ingest data (especially persisting time series data on disk). To put some -numbers behind this: a single big Prometheus server can easily store millions -of time series, with a record of 800,000 incoming samples per second (as -measured with real production metrics data at SoundCloud). Given a 10-seconds -scrape interval and 700 time series per host, this allows you to monitor over -10,000 machines from a single Prometheus server. The scaling bottleneck here -has never been related to pulling metrics, but usually to the speed at which -the Prometheus server can ingest the data into memory and then sustainably -persist and expire data on disk/SSD. - -Also, although networks are pretty reliable these days, using a TCP-based pull -approach makes sure that metrics data arrives reliably, or that the monitoring -system at least knows immediately when the metrics transfer fails due to a -broken network. - -## Prometheus is not an event-based system - -Some monitoring systems are event-based. That is, they report each individual -event (an HTTP request, an exception, you name it) to a central monitoring -system immediately as it happens. This central system then either aggregates -the events into metrics (StatsD is the prime example of this) or stores events -individually for later processing (the ELK stack is an example of that). In -such a system, pulling would be problematic indeed: the instrumented service -would have to buffer events between pulls, and the pulls would have to happen -incredibly frequently in order to simulate the same “liveness” of the -push-based approach and not overwhelm event buffers. - -However, again, Prometheus is not an event-based monitoring system. You do not -send raw events to Prometheus, nor can it store them. 
Prometheus is in the -business of collecting aggregated time series data. That means that it's only -interested in regularly collecting the current *state* of a given set of -metrics, not the underlying events that led to the generation of those metrics. -For example, an instrumented service would not send a message about each HTTP -request to Prometheus as it is handled, but would simply count up those -requests in memory. This can happen hundreds of thousands of times per second -without causing any monitoring traffic. Prometheus then simply asks the service -instance every 15 or 30 seconds (or whatever you configure) about the current -counter value and stores that value together with the scrape timestamp as a -sample. Other metric types, such as gauges, histograms, and summaries, are -handled similarly. The resulting monitoring traffic is low, and the pull-based -approach also does not create problems in this case. - -## But now my monitoring needs to know about my service instances! - -With a pull-based approach, your monitoring system needs to know which service -instances exist and how to connect to them. Some people are worried about the -extra configuration this requires on the part of the monitoring system and see -this as an operational scalability problem. - -We would argue that you cannot escape this configuration effort for -serious monitoring setups in any case: if your monitoring system doesn't know -what the world *should* look like and which monitored service instances -*should* be there, how would it be able to tell when an instance just never -reports in, is down due to an outage, or really is no longer meant to exist? -This is only acceptable if you never care about the health of individual -instances at all, like when you only run ephemeral workers where it is -sufficient for a large-enough number of them to report in some result. Most -environments are not exclusively like that. - -If the monitoring system needs to know the desired state of the world anyway, -then a push-based approach actually requires *more* configuration in total. Not -only does your monitoring system need to know what service instances should -exist, but your service instances now also need to know how to reach your -monitoring system. A pull approach not only requires less configuration, -it also makes your monitoring setup more flexible. With pull, you can just run -a copy of production monitoring on your laptop to experiment with it. It also -allows you just fetch metrics with some other tool or inspect metrics endpoints -manually. To get high availability, pull allows you to just run two identically -configured Prometheus servers in parallel. And lastly, if you have to move the -endpoint under which your monitoring is reachable, a pull approach does not -require you to reconfigure all of your metrics sources. - -On a practical front, Prometheus makes it easy to configure the desired state -of the world with its built-in support for a wide variety of service discovery -mechanisms for cloud providers and container-scheduling systems: Consul, -Marathon, Kubernetes, EC2, DNS-based SD, Azure, Zookeeper Serversets, and more. -Prometheus also allows you to plug in your own custom mechanism if needed. -In a microservice world or any multi-tiered architecture, it is also -fundamentally an advantage if your monitoring system uses the same method to -discover targets to monitor as your service instances use to discover their -backends. 
This way you can be sure that you are monitoring the same targets -that are serving production traffic and you have only one discovery mechanism -to maintain. - -## Accidentally DDoS-ing your monitoring - -Whether you pull or push, any time-series database will fall over if you send -it more samples than it can handle. However, in our experience it's slightly -more likely for a push-based approach to accidentally bring down your -monitoring. If the control over what metrics get ingested from which instances -is not centralized (in your monitoring system), then you run into the danger of -experimental or rogue jobs suddenly pushing lots of garbage data into your -production monitoring and bringing it down. There are still plenty of ways -this can happen with a pull-based approach (which only controls where to pull -metrics from, but not the size and nature of the metrics payloads), but the -risk is lower. More importantly, such incidents can be mitigated at a central -point. - -## Real-world proof - -Besides the fact that Prometheus is already being used to monitor very large -setups in the real world (like using it to [monitor millions of machines at -DigitalOcean](https://promcon.io/2016-berlin/talks/scaling-to-a-million-machines-with-prometheus/)), -there are other prominent examples of pull-based monitoring being used -successfully in the largest possible environments. Prometheus was inspired by -Google's Borgmon, which was (and partially still is) used within Google to -monitor all its critical production services using a pull-based approach. Any -scaling issues we encountered with Borgmon at Google were not due to its pull -approach either. If a pull-based approach scales to a global environment with -many tens of datacenters and millions of machines, you can hardly say that pull -doesn't scale. - -## But there are other problems with pull! - -There are indeed setups that are hard to monitor with a pull-based approach. -A prominent example is when you have many endpoints scattered around the -world which are not directly reachable due to firewalls or complicated -networking setups, and where it's infeasible to run a Prometheus server -directly in each of the network segments. This is not quite the environment for -which Prometheus was built, although workarounds are often possible ([via the -Pushgateway or restructuring your setup](/docs/practices/pushing/)). In any -case, these remaining concerns about pull-based monitoring are usually not -scaling-related, but due to network operation difficulties around opening TCP -connections. - -## All good then? - -This article addresses the most common scalability concerns around a pull-based -monitoring approach. With Prometheus and other pull-based systems being used -successfully in very large environments and the pull aspect not posing a -bottleneck in reality, the result should be clear: the “pull doesn't scale” -argument is not a real concern. We hope that future debates will focus on -aspects that matter more than this red herring. diff --git a/content/blog/2016-09-04-promcon-2016-its-a-wrap.md b/content/blog/2016-09-04-promcon-2016-its-a-wrap.md deleted file mode 100644 index 760d91d6..00000000 --- a/content/blog/2016-09-04-promcon-2016-its-a-wrap.md +++ /dev/null @@ -1,152 +0,0 @@ ---- -title: PromCon 2016 - It's a wrap!
-created_at: 2016-09-04 -kind: article -author_name: Julius Volz ---- - -## What happened - -Last week, eighty Prometheus users and developers from around the world came -together for two days in Berlin for the first-ever conference about the -Prometheus monitoring system: [PromCon 2016](https://promcon.io/). The goal of -this conference was to exchange knowledge, best practices, and experience -gained using Prometheus. We also wanted to grow the community and help people -build professional connections around service monitoring. Here are some -impressions from the first morning: - - - - - - - - - -At PromCon, speakers from a variety of large and small companies talked about -how they were using Prometheus or are building solutions around it. For example, -[DigitalOcean](https://www.digitalocean.com/) spoke about their challenges of -using Prometheus at massive scale, while -[ShuttleCloud](https://www.shuttlecloud.com/) explained how it was a great fit -for monitoring their small startup. Our furthest-traveled speaker came all the -way from Tokyo to present how [LINE](https://linecorp.com/en/) is monitoring -their systems using Prometheus. [Weaveworks](https://www.weave.works/) -explained how they built a scalable multi-tenant version of Prometheus. - - - - - - - - - - - - - -Several Prometheus core developers also talked about the design decisions -behind the monitoring system, presented upcoming features, or shared best -practices. On a lighter note, two lightning talks explained the correct plural -of Prometheus, as well as an implementation of [Conway's Game of Life](https://en.wikipedia.org/wiki/Conway%27s_Game_of_Life) -in the Prometheus query language. - - - - - - - -To see the entire program, have a look at [the schedule](https://promcon.io/schedule). - -In the breaks between talks, there was a lot of fun (and food) to be had: - - - - -After wrapping up talks on the first evening, we enjoyed a warm summer night -with food and drinks on Gendarmenmarkt, one of Berlin's nicest plazas. This -gave people a chance to mingle even more and exchange thoughts and ideas around -Prometheus. - - - - -Overall, we were blown away by the quality of talks, the wide diversity of use -cases, as well as the friendly community coming together in this way and -forming new connections! - -## Talk recordings - -At PromCon 2016, we made it a priority to record all talks professionally. -Especially for a small conference like this, recording and sharing talks was -important, as it dramatically increases the reach of the talks and helps -Prometheus users and developers around the world to participate and learn. - -Today, we are pleased to announce that all talk recordings are now ready and -publicly available. You can enjoy them [in this Youtube playlist](https://www.youtube.com/playlist?list=PLoz-W_CUquUlCq-Q0hy53TolAhaED9vmU)! - -## Reception - -The feedback we got from speakers and attendees at PromCon 2016 was incredibly -encouraging and positive. A lot of people loved the friendly community feeling -of the conference, but also learned a lot from the focused talks and -interesting conversations. Here is what some attendees had to say: - - - - - - - - - - - - - - - - -Overall, we were very happy with how PromCon turned out - no event is perfect, -but for a small community conference organized in free time, it exceeded most -people's expectations. - -## Thanks - -PromCon 2016 would not have been possible without the help of its sponsors, -speakers, attendees, and organizers. Thanks so much to all of you! 
Our Diamond -and Platinum sponsors deserve a special mention at this point, since they did -the most to support us and made all the food, drinks, video recordings, and -swag possible: - -

[Diamond sponsor logos]

[Platinum sponsor logos]
- - -We would also like to thank Google for hosting the conference at their office -in Berlin! - -## Outlook - -If PromCon 2016 went so well, when will the next one happen? - - - - -The answer is that we don't know for sure yet. This first PromCon was organized -entirely in people's free time, with most of it handled by one person. This -will surely have to change, especially as we also expect a next Prometheus -conference to be much larger (even this year, the limited tickets sold out -within seconds). In the next months, we will discuss within the community what we -want PromCon to be, who should run it, and where it should take place. Perhaps -there is even space for multiple Prometheus conferences around the world. We will -report back when we know more. Stay tuned! diff --git a/content/blog/2016-09-07-interview-with-shuttlecloud.md b/content/blog/2016-09-07-interview-with-shuttlecloud.md deleted file mode 100644 index 82a6d710..00000000 --- a/content/blog/2016-09-07-interview-with-shuttlecloud.md +++ /dev/null @@ -1,114 +0,0 @@ ---- -title: Interview with ShuttleCloud -created_at: 2016-09-07 -kind: article -author_name: Brian Brazil ---- - -*Continuing our series of interviews with users of Prometheus, ShuttleCloud talks about how they began using Prometheus. Ignacio from ShuttleCloud also explained how [Prometheus Is Good for Your Small Startup](https://www.youtube.com/watch?v=gMHa4Yh8avk) at PromCon 2016.* - -## What does ShuttleCloud do? - -ShuttleCloud is the world’s most scalable email and contacts data importing system. We help some of the leading email and address book providers, including Google and Comcast, increase user growth and engagement by automating the switching experience through data import. - -By integrating our API into their offerings, our customers allow their users to easily migrate their email and contacts from one participating provider to another, reducing the friction users face when switching to a new provider. The 24/7 email providers supported include all major US internet service providers: Comcast, Time Warner Cable, AT&T, Verizon, and more. - -By offering end users a simple path for migrating their emails (while keeping complete control over the import tool’s UI), our customers dramatically improve user activation and onboarding. - -![ShuttleCloud's integration with Gmail](/assets/blog/2016-09-07/gmail-integration.png) -***ShuttleCloud’s [integration](https://support.google.com/mail/answer/164640?hl=en) with Google’s Gmail Platform.*** *Gmail has imported data for 3 million users with our API.* - - -ShuttleCloud’s technology encrypts all the data required to process an import, in addition to following the most secure standards (SSL, oAuth) to ensure the confidentiality and integrity of API requests. Our technology allows us to guarantee our platform’s high availability, with up to 99.5% uptime assurances. - -![ShuttleCloud by Numbers](/assets/blog/2016-09-07/shuttlecloud-numbers.png) - -## What was your pre-Prometheus monitoring experience? - -In the beginning, a proper monitoring system for our infrastructure was not one of our main priorities. We didn’t have as many projects and instances as we currently have, so we worked with other simple systems to alert us if anything was not working properly and get it under control. - - * We had a set of automatic scripts to monitor most of the operational metrics for the machines. These were cron-based and executed, using Ansible from a centralized machine. 
The alerts were emails sent directly to the entire development team. - * We trusted Pingdom for external blackbox monitoring and checking that all our frontends were up. They provided an easy interface and alerting system in case any of our external services were not reachable. - -Fortunately, big customers arrived, and the SLAs started to be more demanding. Therefore, we needed something else to measure how we were performing and to ensure that we were complying with all SLAs. One of the features we required was to have accurate stats about our performance and business metrics (i.e., how many migrations finished correctly), so reporting was more on our minds than monitoring. - -We developed the following system: - -![Initial Shuttlecloud System](/assets/blog/2016-09-07/Prometheus-System-1.jpg) - - * The source of all necessary data is a status database in a CouchDB. There, each document represents one status of an operation. This information is processed by the Status Importer and stored in a relational manner in a MySQL database. - - * A component gathers data from that database, with the information aggregated and post-processed into several views. - * One of the views is the email report, which we needed for reporting purposes. This is sent via email. - * The other view pushes data to a dashboard, where it can be easily controlled. The dashboard service we used was external. We trusted Ducksboard, not only because the dashboards were easy to set up and looked beautiful, but also because they provided automatic alerts if a threshold was reached. - -With all that in place, it didn’t take us long to realize that we would need a proper metrics, monitoring, and alerting system as the number of projects started to increase. - -Some drawbacks of the systems we had at that time were: - - * No centralized monitoring system. Each metric type had a different one: - * System metrics → Scripts run by Ansible. - * Business metrics → Ducksboard and email reports. - * Blackbox metrics → Pingdom. - * No standard alerting system. Each metric type had different alerts (email, push notification, and so on). - * Some business metrics had no alerts. These were reviewed manually. - -## Why did you decide to look at Prometheus? - -We analyzed several monitoring and alerting systems. We were eager to get our hands dirty and check whether a solution would succeed or fail. The system we decided to put to the test was Prometheus, for the following reasons: - - * First of all, you don’t have to define a fixed metric system to start working with it; metrics can be added or changed in the future. This provides valuable flexibility when you don’t know all of the metrics you want to monitor yet. - * If you know anything about Prometheus, you know that metrics can have labels that abstract away the fact that different time series are involved. This, together with its query language, provided even more flexibility and a powerful tool. For example, we can have the same metric defined for different environments or projects and get a specific time series or aggregate certain metrics with the appropriate labels: - * `http_requests_total{job="my_super_app_1",environment="staging"}` - the time series corresponding to the staging environment for the app "my\_super\_app_1". - * `http_requests_total{job="my_super_app_1"}` - the time series for all environments for the app "my\_super\_app\_1". - * `http_requests_total{environment="staging"}` - the time series for all staging environments for all jobs.
- * Prometheus supports a DNS service for service discovery. We happened to already have an internal DNS service. - * There is no need to install any external services (unlike Sensu, for example, which needs a data-storage service like Redis and a message bus like RabbitMQ). This might not be a deal breaker, but it definitely makes the test easier to perform, deploy, and maintain. - * Prometheus is quite easy to install, as you only need to download an executable Go file. The Docker container also works well and it is easy to start. - -## How do you use Prometheus? - -Initially we were only using some metrics provided out of the box by the [node_exporter](https://github.com/prometheus/node_exporter), including: - - * hard drive usage. - * memory usage. - * if an instance is up or down. - -Our internal DNS service is integrated to be used for service discovery, so every new instance is automatically monitored. - -Some of the metrics we used, which were not provided by the node_exporter by default, were exported using the [node_exporter textfile collector](https://github.com/prometheus/node_exporter#textfile-collector) feature. The first alerts we declared on the Prometheus Alertmanager were mainly related to the operational metrics mentioned above. - -We later developed an operation exporter that allowed us to know the status of the system almost in real time. It exposed business metrics, namely the statuses of all operations, the number of incoming migrations, the number of finished migrations, and the number of errors. We could aggregate these on the Prometheus side and let it calculate different rates. - -We decided to export and monitor the following metrics: - - * `operation_requests_total` - * `operation_statuses_total` - * `operation_errors_total` - -![Shuttlecloud Prometheus System](/assets/blog/2016-09-07/Prometheus-System-2.jpg) - -We have most of our services duplicated in two Google Cloud Platform availability zones. That includes the monitoring system. It’s straightforward to have more than one operation exporter in two or more different zones, as Prometheus can aggregate the data from all of them and make one metric (i.e., the maximum of all). We currently don’t have Prometheus or the Alertmanager in HA — only a metamonitoring instance — but we are working on it. - -For external blackbox monitoring, we use the Prometheus [Blackbox Exporter](https://github.com/prometheus/blackbox_exporter). Apart from checking if our external frontends are up, it is especially useful for having metrics for SSL certificates’ expiration dates. It even checks the whole chain of certificates. Kudos to Robust Perception for explaining it perfectly in their [blogpost](https://www.robustperception.io/get-alerted-before-your-ssl-certificates-expire/). - -We set up some charts in Grafana for visual monitoring in some dashboards, and the integration with Prometheus was trivial. The query language used to define the charts is the same as in Prometheus, which simplified their creation a lot. - -We also integrated Prometheus with Pagerduty and created a schedule of people on-call for the critical alerts. For those alerts that were not considered critical, we only sent an email. - - -## How does Prometheus make things better for you? - -We can't compare Prometheus with our previous solution because we didn’t have one, but we can talk about what features of Prometheus are highlights for us: - - * It has very few maintenance requirements. - * It’s efficient: one machine can handle monitoring the whole cluster. 
- * The community is friendly—both dev and users. Moreover, [Brian’s blog](https://www.robustperception.io/blog/) is a very good resource. - * It has no third-party requirements; it’s just the server and the exporters. (No RabbitMQ or Redis needs to be maintained.) - * Deployment of Go applications is a breeze. - -## What do you think the future holds for ShuttleCloud and Prometheus? - -We’re very happy with Prometheus, but new exporters are always welcome (Celery or Spark, for example). - -One question that we face every time we add a new alarm is: how do we test that the alarm works as expected? It would be nice to have a way to inject fake metrics in order to raise an alarm, to test it. diff --git a/content/blog/2016-09-14-interview-with-digitalocean.md b/content/blog/2016-09-14-interview-with-digitalocean.md deleted file mode 100644 index dd4a8221..00000000 --- a/content/blog/2016-09-14-interview-with-digitalocean.md +++ /dev/null @@ -1,94 +0,0 @@ ---- -title: Interview with DigitalOcean -created_at: 2016-09-14 -kind: article -author_name: Brian Brazil ---- - -*Next in our series of interviews with users of Prometheus, DigitalOcean talks -about how they use Prometheus. Carlos Amedee also talked about [the social -aspects of the rollout](https://www.youtube.com/watch?v=ieo3lGBHcy8) at PromCon -2016.* - -## Can you tell us about yourself and what DigitalOcean does? - -My name is Ian Hansen and I work on the platform metrics team. -[DigitalOcean](https://www.digitalocean.com/) provides simple cloud computing. -To date, we’ve created 20 million Droplets (SSD cloud servers) across 13 -regions. We also recently released a new Block Storage product. - -![DigitalOcean logo](/assets/blog/2016-09-14/DO_Logo_Horizontal_Blue-3db19536.png) - -## What was your pre-Prometheus monitoring experience? - -Before Prometheus, we were running [Graphite](https://graphiteapp.org/) and -[OpenTSDB](http://opentsdb.net/). Graphite was used for smaller-scale -applications and OpenTSDB was used for collecting metrics from all of our -physical servers via [Collectd](https://collectd.org/). -[Nagios](https://www.nagios.org/) would pull these databases to trigger alerts. -We do still use Graphite but we no longer run OpenTSDB. - -## Why did you decide to look at Prometheus? - -I was frustrated with OpenTSDB because I was responsible for keeping the -cluster online, but found it difficult to guard against metric storms. -Sometimes a team would launch a new (very chatty) service that would impact the -total capacity of the cluster and hurt my SLAs. - -We are able to blacklist/whitelist new metrics coming in to OpenTSDB, but -didn’t have a great way to guard against chatty services except for -organizational process (which was hard to change/enforce). Other teams were -frustrated with the query language and the visualization tools available at the -time. I was chatting with Julius Volz about push vs pull metric systems and was -sold in wanting to try Prometheus when I saw that I would really be in control -of my SLA when I get to determine what I’m pulling and how frequently. Plus, I -really really liked the query language. - -## How did you transition? - -We were gathering metrics via Collectd sending to OpenTSDB. Installing the -[Node Exporter](https://github.com/prometheus/node_exporter) in parallel with -our already running Collectd setup allowed us to start experimenting with -Prometheus. We also created a custom exporter to expose Droplet metrics. 
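A custom exporter like the Droplet one mentioned above is typically just a small Go program that implements the Go client's `prometheus.Collector` interface and serves the result over HTTP. The following is a generic sketch with invented metric and label names and a made-up port, not DigitalOcean's actual exporter:

```go
package main

// Generic sketch of a custom exporter: not DigitalOcean's real code.
// Metric name, labels, values, and port below are invented for illustration.

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

type dropletCollector struct {
	running *prometheus.Desc
}

func newDropletCollector() *dropletCollector {
	return &dropletCollector{
		running: prometheus.NewDesc(
			"droplets_running", // hypothetical metric name
			"Number of running Droplets per region.",
			[]string{"region"}, nil,
		),
	}
}

func (c *dropletCollector) Describe(ch chan<- *prometheus.Desc) {
	ch <- c.running
}

func (c *dropletCollector) Collect(ch chan<- prometheus.Metric) {
	// A real exporter would query an internal API here; static values stand in.
	for region, n := range map[string]float64{"nyc3": 12345, "ams2": 6789} {
		ch <- prometheus.MustNewConstMetric(c.running, prometheus.GaugeValue, n, region)
	}
}

func main() {
	prometheus.MustRegister(newDropletCollector())
	// Prometheus scrapes this endpoint like any other target.
	http.Handle("/metrics", promhttp.Handler())
	log.Fatal(http.ListenAndServe(":9101", nil))
}
```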
Soon, -we had feature parity with our OpenTSDB service and started turning off -Collectd and then turned off the OpenTSDB cluster. - -People really liked Prometheus and the visualization tools that came with it. -Suddenly, my small metrics team had a backlog that we couldn’t get to fast -enough to make people happy, and instead of providing and maintaining -Prometheus for people’s services, we looked at creating tooling to make it as -easy as possible for other teams to run their own Prometheus servers and to -also run the common exporters we use at the company. - -Some teams have started using Alertmanager, but we still have a concept of -pulling Prometheus from our existing monitoring tools. - -## What improvements have you seen since switching? - -We’ve improved our insights on hypervisor machines. The data we could get out -of Collectd and Node Exporter is about the same, but it’s much easier for our -team of golang developers to create a new custom exporter that exposes data -specific to the services we run on each hypervisor. - -We’re exposing better application metrics. It’s easier to learn and teach how -to create a Prometheus metric that can be aggregated correctly later. With -Graphite it’s easy to create a metric that can’t be aggregated in a certain way -later because the dot-separated-name wasn’t structured right. - -Creating alerts is much quicker and simpler than what we had before, plus in a -language that is familiar. This has empowered teams to create better alerting -for the services they know and understand because they can iterate quickly. - -## What do you think the future holds for DigitalOcean and Prometheus? - -We’re continuing to look at how to make collecting metrics as easy as possible -for teams at DigitalOcean. Right now teams are running their own Prometheus -servers for the things they care about, which allowed us to gain observability -we otherwise wouldn’t have had as quickly. But, not every team should have to -know how to run Prometheus. We’re looking at what we can do to make Prometheus -as automatic as possible so that teams can just concentrate on what queries and -alerts they want on their services and databases. - -We also created [Vulcan](https://github.com/digitalocean/vulcan) so that we -have long-term data storage, while retaining the Prometheus Query Language that -we have built tooling around and trained people how to use. diff --git a/content/blog/2016-09-21-interview-with-compose.md b/content/blog/2016-09-21-interview-with-compose.md deleted file mode 100644 index 86de2b95..00000000 --- a/content/blog/2016-09-21-interview-with-compose.md +++ /dev/null @@ -1,139 +0,0 @@ ---- -title: Interview with Compose -created_at: 2016-09-21 -kind: article -author_name: Brian Brazil ---- - -*Continuing our series of interviews with users of Prometheus, Compose talks -about their monitoring journey from Graphite and InfluxDB to Prometheus.* - - -## Can you tell us about yourself and what Compose does? - -[Compose](https://www.compose.com/) delivers production-ready database clusters -as a service to developers around the world. An app developer can come to us -and in a few clicks have a multi-host, highly available, automatically backed -up and secure database ready in minutes. Those database deployments then -autoscale up as demand increases so a developer can spend their time on -building their great apps, not on running their database. - -We have tens of clusters of hosts across at least two regions in each of AWS, -Google Cloud Platform and SoftLayer. 
Each cluster spans availability zones -where supported and is home to around 1000 highly-available database -deployments in their own private networks. More regions and providers are in -the works. - - -## What was your pre-Prometheus monitoring experience? - -Before Prometheus, a number of different metrics systems were tried. The first -system we tried was [Graphite](https://graphiteapp.org/), which worked pretty -well initially, but the sheer volume of different metrics we had to store, -combined with the way Whisper files are stored and accessed on disk, quickly -overloaded our systems. While we were aware that Graphite could be scaled -horizontally relatively easily, it would have been an expensive cluster. -[InfluxDB](https://www.influxdata.com/) looked more promising so we started -trying out the early-ish versions of that and it seemed to work well for a good -while. Goodbye Graphite. - -The earlier versions of InfluxDB had some issues with data corruption -occasionally. We semi-regularly had to purge all of our metrics. It wasn’t a -devastating loss for us normally, but it was irritating. The continued promises -of features that never materialised frankly wore on us. - - -## Why did you decide to look at Prometheus? - -It seemed to combine better efficiency with simpler operations than other -options. - -Pull-based metric gathering puzzled us at first, but we soon realised the -benefits. Initially it seemed like it could be far too heavyweight to scale -well in our environment where we often have several hundred containers with -their own metrics on each host, but by combining it with Telegraf, we can -arrange to have each host export metrics for all its containers (as well as its -overall resource metrics) via a single Prometheus scrape target. - - -## How did you transition? - -We are a Chef shop so we spun up a largish instance with a big EBS volume and -then reached right for a [community chef -cookbook](https://github.com/rayrod2030/chef-prometheus) for Prometheus. - -With Prometheus up on a host, we wrote a small Ruby script that uses the Chef -API to query for all our hosts, and write out a Prometheus target config file. -We use this file with a `file_sd_config` to ensure all hosts are discovered and -scraped as soon as they register with Chef. Thanks to Prometheus’ open -ecosystem, we were able to use Telegraf out of the box with a simple config to -export host-level metrics directly. - -We were testing how far a single Prometheus would scale and waiting for it to -fall over. It didn’t! In fact it handled the load of host-level metrics scraped -every 15 seconds for around 450 hosts across our newer infrastructure with very -little resource usage. - -We have a lot of containers on each host so we were expecting to have to start -to shard Prometheus once we added all memory usage metrics from those too, but -Prometheus just kept on going without any drama and still without getting too -close to saturating its resources. We currently monitor over 400,000 distinct -metrics every 15 seconds for around 40,000 containers on 450 hosts with a -single m4.xlarge prometheus instance with 1TB of storage. You can see our host -dashboard for this host below. Disk IO on the 1TB gp2 SSD EBS volume will -probably be the limiting factor eventually. Our initial guess is well -over-provisioned for now, but we are growing fast in both metrics gathered and -hosts/containers to monitor. 
- -![Prometheus Host Dashboard](/assets/blog/2016-09-21/compose-host-dashboard.png) - -At this point the Prometheus server we’d thrown up to test with was vastly more -reliable than the InfluxDB cluster we had doing the same job before, so we did -some basic work to make it less of a single-point-of-failure. We added another -identical node scraping all the same targets, then added a simple failover -scheme with keepalived + DNS updates. This was now more highly available than -our previous system so we switched our customer-facing graphs to use Prometheus -and tore down the old system. - - -![Prometheus-powered memory metrics for PostgresSQL containers in our app](/assets/blog/2016-09-21/compose-memory-stats.png) - -## What improvements have you seen since switching? - -Our previous monitoring setup was unreliable and difficult to manage. With -Prometheus we have a system that’s working well for graphing lots of metrics, -and we have team members suddenly excited about new ways to use it rather than -wary of touching the metrics system we used before. - -The cluster is simpler too, with just two identical nodes. As we grow, we know -we’ll have to shard the work across more Prometheus hosts and have considered a -few ways to do this. - -## What do you think the future holds for Compose and Prometheus? - -Right now we have only replicated the metrics we already gathered in previous -systems - basic memory usage for customer containers as well as host-level -resource usage for our own operations. The next logical step is enabling the -database teams to push metrics to the local Telegraf instance from inside the -DB containers so we can record database-level stats too without increasing -number of targets to scrape. - -We also have several other systems that we want to get into Prometheus to get -better visibility. We run our apps on Mesos and have integrated basic Docker -container metrics already, which is better than previously, but we also want to -have more of the infrastructure components in the Mesos cluster recording to -the central Prometheus so we can have centralised dashboards showing all -elements of supporting system health from load balancers right down to app -metrics. - -Eventually we will need to shard Prometheus. We already split customer -deployments among many smaller clusters for a variety of reasons so the one -logical option would be to move to a smaller Prometheus server (or a pair for -redundancy) per cluster rather than a single global one. - -For most reporting needs this is not a big issue as we usually don’t need -hosts/containers from different clusters in the same dashboard, but we may keep -a small global cluster with much longer retention and just a modest number of -down-sampled and aggregated metrics from each cluster’s Prometheus using -Recording Rules. - diff --git a/content/blog/2016-10-12-interview-with-justwatch.md b/content/blog/2016-10-12-interview-with-justwatch.md deleted file mode 100644 index 00835e8c..00000000 --- a/content/blog/2016-10-12-interview-with-justwatch.md +++ /dev/null @@ -1,141 +0,0 @@ ---- -title: Interview with JustWatch -created_at: 2016-10-12 -kind: article -author_name: Brian Brazil ---- - -*Continuing our series of interviews with users of Prometheus, JustWatch talks -about how they established their monitoring.* - -## Can you tell us about yourself and what JustWatch does? 
- -For consumers, [JustWatch](https://www.justwatch.com) is a streaming search -engine that helps to find out where to watch movies and TV shows legally online -and in theaters. You can search movie content across all major streaming -providers like Netflix, HBO, Amazon Video, iTunes, Google Play, and many others -in 17 countries. - -For our clients like movie studios or Video on Demand providers, we are an -international movie marketing company that collects anonymized data about -purchase behavior and movie taste of fans worldwide from our consumer apps. We -help studios to advertise their content to the right audience and make digital -video advertising a lot more efficient in minimizing waste coverage. - -![JustWatch logo](/assets/blog/2016-10-12/JW_logo_long_black.jpg) - -Since our launch in 2014 we went from zero to one of the 20k largest websites -internationally without spending a single dollar on marketing - becoming the -largest streaming search engine worldwide in under two years. Currently, with -an engineering team of just 10, we build and operate a fully dockerized stack -of about 50 micro- and macro-services, running mostly on -[Kubernetes](https://kubernetes.io). - -## What was your pre-Prometheus monitoring experience? - -At prior companies many of us worked with most of the open-source monitoring -products there are. We have quite some experience working with -[Nagios](https://www.nagios.org/), [Icinga](https://www.icinga.org/), -[Zabbix](http://www.zabbix.com/), -[Monit](https://mmonit.com/monit/documentation/), -[Munin](http://munin-monitoring.org/), [Graphite](https://graphiteapp.org/) and -a few other systems. At one company I helped build a distributed Nagios setup -with Puppet. This setup was nice, since new services automatically showed up in -the system, but taking instances out was still painful. As soon as you have -some variance in your systems, the host- and service-based monitoring suites -just don’t fit well. The label-based approach Prometheus took was -something I always wanted to have, but didn’t find before. - -## Why did you decide to look at Prometheus? - -At JustWatch the public Prometheus announcement hit exactly the right time. We -mostly had blackbox monitoring for the first few months of the company - -[CloudWatch](https://aws.amazon.com/cloudwatch/) for some of the most important -internal metrics, combined with external services like -[Pingdom](https://www.pingdom.com/) for detecting site-wide outages. Also, none -of the classical host-based solutions satisfied us. In a world of containers -and microservices, host-based tools like Icinga, -[Thruk](https://www.thruk.org/) or Zabbix felt antiquated and not ready for the -job. When we started to investigate whitebox monitoring, some of us luckily -attended the Golang Meetup where Julius and Björn announced Prometheus. We -quickly set up a Prometheus server and started to instrument our Go services -(we use almost only Go for the backend). It was amazing how easy that was - the -design felt cloud- and service-oriented as a first principle and -never got in the way. - -## How did you transition? - -Transitioning wasn't that hard, as, timing-wise, we were lucky enough to go from -no relevant monitoring directly to Prometheus. - -The transition to Prometheus was mostly including the Go client into our apps -and wrapping the HTTP handlers.
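To give an idea of what that wrapping looks like, here is a generic sketch using today's client_golang helpers (the handler, route, and port are invented; this is not JustWatch's actual code):

```go
package main

// Minimal sketch of instrumenting a Go HTTP service with the official client.
// Handler name, route, and port are illustrative only.

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

var httpRequests = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Name: "http_requests_total",
		Help: "Total HTTP requests by status code and method.",
	},
	[]string{"code", "method"},
)

func searchHandler(w http.ResponseWriter, r *http.Request) {
	w.Write([]byte("results"))
}

func main() {
	prometheus.MustRegister(httpRequests)

	// Wrap the application handler so every request increments the counter.
	http.Handle("/search", promhttp.InstrumentHandlerCounter(
		httpRequests, http.HandlerFunc(searchHandler),
	))

	// Expose the metrics endpoint for Prometheus to scrape.
	http.Handle("/metrics", promhttp.Handler())
	log.Fatal(http.ListenAndServe(":8080", nil))
}
```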
We also wrote and deployed several exporters, -including the [node_exporter](https://github.com/prometheus/node_exporter) and -several exporters for cloud provider APIs. In our experience monitoring and -alerting is a project that is never finished, but the bulk of the work was done -within a few weeks as a side project. - -Since the deployment of Prometheus we tend to look into metrics whenever we -miss something or when we are designing new services from scratch. - -It took some time to fully grasp the elegance of PromQL and labels concept -fully, but the effort really paid off. - - -## What improvements have you seen since switching? - -Prometheus enlightened us by making it incredibly easy to reap the benefits -from whitebox monitoring and label-based canary deployments. The out-of-the-box -metrics for many Golang aspects (HTTP Handler, Go Runtime) helped us to get to -a return on investment very quickly - goroutine metrics alone saved the day -multiple times. The only monitoring component we actually liked before - -[Grafana](http://grafana.org/) - feels like a natural fit for Prometheus and -has allowed us to create some very helpful dashboards. We appreciated that -Prometheus didn't try to reinvent the wheel but rather fit in perfectly with -the best solution out there. Another huge improvement on predecessors was -Prometheus's focus on actually getting the math right (percentiles, etc.). In -other systems, we were never quite sure if the operations offered made sense. -Especially percentiles are such a natural and necessary way of reasoning about -microservice performance that it felt great that they get first class -treatment. - -![Database Dashboard](/assets/blog/2016-10-12/prometheus-dashboard-db.jpg) - -The integrated service discovery makes it super easy to manage the scrape -targets. For Kubernetes, everything just works out-of-the-box. For some other -systems not running on Kubernetes yet, we use a -[Consul-based](https://www.consul.io/) approach. All it takes to get an -application monitored by Prometheus is to add the client, expose `/metrics` and -set one simple annotation on the Container/Pod. This low coupling takes out a -lot of friction between development and operations - a lot of services are -built well orchestrated from the beginning, because it's simple and fun. - -The combination of time-series and clever functions make for awesome alerting -super-powers. Aggregations that run on the server and treating both -time-series, combinations of them and even functions on those combinations as -first-class citizens makes alerting a breeze - often times after the fact. - -## What do you think the future holds for JustWatch and Prometheus? - -While we value very much that Prometheus doesn't focus on being shiny but on -actually working and delivering value while being reasonably easy to deploy and -operate - especially the Alertmanager leaves a lot to be desired yet. Just some -simple improvements like simplified interactive alert building and editing in -the frontend would go a long way in working with alerts being even simpler. - -We are really looking forward to the ongoing improvements in the storage layer, -including remote storage. We also hope for some of the approaches taken in -[Project Prism](https://github.com/weaveworks/prism) and -[Vulcan](https://github.com/digitalocean/vulcan) to be backported to core -Prometheus. 
The most interesting topics for us right now are GCE Service -Discovery, easier scaling, and much longer retention periods (even at the cost -of colder storage and much longer query times for older events). - -We are also looking forward to use Prometheus for more non-technical -departments as well. We’d like to cover most of our KPIs with Prometheus to -allow everyone to create beautiful dashboards, as well as alerts. We're -currently even planning to abuse the awesome alert engine for a new, internal -business project as well - stay tuned! - diff --git a/content/blog/2016-11-16-interview-with-canonical.md b/content/blog/2016-11-16-interview-with-canonical.md deleted file mode 100644 index adcbc328..00000000 --- a/content/blog/2016-11-16-interview-with-canonical.md +++ /dev/null @@ -1,79 +0,0 @@ ---- -title: Interview with Canonical -created_at: 2016-11-16 -kind: article -author_name: Brian Brazil ---- - -*Continuing our series of interviews with users of Prometheus, Canonical talks -about how they are transitioning to Prometheus.* - -## Can you tell us about yourself and what Canonical does? - -[Canonical](http://www.canonical.com/) is probably best known as the company -that sponsors Ubuntu Linux. We also produce or contribute to a number of other -open-source projects including MAAS, Juju, and OpenStack, and provide -commercial support for these products. Ubuntu powers the majority of OpenStack -deployments, with 55% of production clouds and [58% of large cloud -deployments]( -https://www.openstack.org/assets/survey/April-2016-User-Survey-Report.pdf#page=47). - -My group, BootStack, is our fully managed private cloud service. We build and -operate OpenStack clouds for Canonical customers. - -## What was your pre-Prometheus monitoring experience? - -We’d used a combination of [Nagios](https://www.nagios.org/), -[Graphite](https://graphite.readthedocs.io/en/latest/)/[statsd](https://github.com/etsy/statsd), -and in-house [Django](https://www.djangoproject.com/) apps. These did not offer -us the level of flexibility and reporting that we need in both our internal and -customer cloud environments. - -## Why did you decide to look at Prometheus? - -We’d evaluated a few alternatives, including -[InfluxDB](https://github.com/influxdata/influxdb) and extending our use of -Graphite, but our first experiences with Prometheus proved it to have the -combination of simplicity and power that we were looking for. We especially -appreciate the convenience of labels, the simple HTTP protocol, and the out of -box [timeseries alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/). The -potential with Prometheus to replace 2 different tools (alerting and trending) -with one is particularly appealing. - -Also, several of our staff have prior experience with Borgmon from their time -at Google which greatly added to our interest! - -## How did you transition? - -We are still in the process of transitioning, we expect this will take some -time due to the number of custom checks we currently use in our existing -systems that will need to be re-implemented in Prometheus. The most useful -resource has been the [prometheus.io](https://prometheus.io/) site documentation. - -It took us a while to choose an exporter. We originally went with -[collectd](https://collectd.org/) but ran into limitations with this. 
We’re -working on writing an -[openstack-exporter](https://github.com/CanonicalLtd/prometheus-openstack-exporter) -now and were a bit surprised to find there is no good, working example of how to -write an exporter from scratch. - -Some challenges we’ve run into are: no downsampling support, no long-term -storage solution (yet), and we were surprised by the default two-week retention -period. There's currently no tie-in with Juju, but [we’re working on it]( -https://launchpad.net/prometheus-registration)! - -## What improvements have you seen since switching? - -Once we got the hang of exporters, we found they were very easy to write and -have given us very useful metrics. For example, we are developing an -openstack-exporter for our cloud environments. We’ve also seen very quick -cross-team adoption from our DevOps and WebOps groups and developers. We don’t -yet have alerting in place but expect to see a lot more to come once we get to -this phase of the transition. - -## What do you think the future holds for Canonical and Prometheus? - -We expect Prometheus to be a significant part of our monitoring and reporting -infrastructure, providing the metrics gathering and storage for numerous -current and future systems. We see it potentially replacing Nagios for -alerting. diff --git a/content/blog/2017-02-20-interview-with-weaveworks.md b/content/blog/2017-02-20-interview-with-weaveworks.md deleted file mode 100644 index e77c0087..00000000 --- a/content/blog/2017-02-20-interview-with-weaveworks.md +++ /dev/null @@ -1,110 +0,0 @@ ---- -title: Interview with Weaveworks -created_at: 2017-02-20 -kind: article -author_name: Brian Brazil ---- - -*Continuing our series of interviews with users of Prometheus, Tom Wilkie from -Weaveworks talks about how they chose Prometheus and are now building on it.* - - -## Can you tell us about Weaveworks? - -[Weaveworks](https://www.weave.works/) offers [Weave -Cloud](https://www.weave.works/solution/cloud/), a service which -"operationalizes" microservices through a combination of open source projects -and software as a service. - -Weave Cloud consists of: - - * Visualisation with [Weave Scope](https://github.com/weaveworks/scope) - * Continuous Deployment with [Weave Flux](https://github.com/weaveworks/flux) - * Networking with [Weave Net](https://github.com/weaveworks/weave), the container SDN - * [Monitoring with Weave Cortex](https://www.weave.works/guides/cloud-guide-part-3-monitor-prometheus-monitoring/), our open source, distributed Prometheus-as-a-Service. - -You can try Weave Cloud [free for 60 days](https://cloud.weave.works/signup). -For the latest on our products check out our [blog](https://www.weave.works/blog/), [Twitter](https://twitter.com/weaveworks), or [Slack](https://weave-community.slack.com/) ([invite](https://weaveworks.github.io/community-slack/)). - -## What was your pre-Prometheus monitoring experience? - -Weave Cloud was a clean-slate implementation, and as such there was no previous -monitoring system. In previous lives the team had used the typical tools such -as Munin and Nagios. Weave Cloud started life as a multitenant, hosted -version of Scope. Scope includes basic monitoring for things like CPU and -memory usage, so I guess you could say we used that. But we needed something -to monitor Scope itself... - -## Why did you decide to look at Prometheus? - -We've got a bunch of ex-Google SREs on staff, so there was plenty of experience -with Borgmon, and an ex-SoundClouder with experience of Prometheus.
We built -the service on Kubernetes and were looking for something that would "fit" with -its dynamically scheduled nature - so Prometheus was a no-brainer. We've even -written a series of blog posts of which [why Prometheus and Kubernetes work together -so well](https://www.weave.works/prometheus-kubernetes-perfect-match/) is the first. - -## How did you transition? - -When we started with Prometheus the Kubernetes service discovery was still just -a PR and as such there were few docs. We ran a custom build for a while and -kinda just muddled along, working it out for ourselves. Eventually we gave a -talk at the [London Prometheus meetup](https://www.meetup.com/Prometheus-London/) on [our experience](http://www.slideshare.net/weaveworks/kubernetes-and-prometheus) and published a -[series](https://www.weave.works/prometheus-kubernetes-deploying/) of [blog](https://www.weave.works/prometheus-and-kubernetes-monitoring-your-applications/) [posts](https://www.weave.works/monitoring-kubernetes-infrastructure/). - -We've tried pretty much every different option for running Prometheus. We -started off building our own container images with embedded config, running -them all together in a single Pod alongside Grafana and Alert Manager. We used -ephemeral, in-Pod storage for time series data. We then broke this up into -different Pods so we didn't have to restart Prometheus (and lose history) -whenever we changed our dashboards. More recently we've moved to using -upstream images and storing the config in a Kubernetes config map - which gets -updated by our CI system whenever we change it. We use a small sidecar -container in the Prometheus Pod to watch the config file and ping Prometheus -when it changes. This means we don't have to restart Prometheus very often, -can get away without doing anything fancy for storage, and don't lose history. - -Still the problem of periodically losing Prometheus history haunted us, and the -available solutions such as Kubernetes volumes or periodic S3 backups all had -their downsides. Along with our fantastic experience using Prometheus to -monitor the Scope service, this motivated us to build a cloud-native, -distributed version of Prometheus - one which could be upgraded, shuffled -around and survive host failures without losing history. And that’s how Weave -Cortex was born. - -## What improvements have you seen since switching? - -Ignoring Cortex for a second, we were particularly excited to see the -introduction of the HA Alert Manager; although mainly because it was one of the -[first non-Weaveworks projects to use Weave Mesh](https://www.weave.works/weave-mesh-prometheus-alertmanager/), -our gossip and coordination layer. - -I was also particularly keen on the version two Kubernetes service discovery -changes by Fabian - this solved an acute problem we were having with monitoring -our Consul Pods, where we needed to scrape multiple ports on the same Pod. - -And I'd be remiss if I didn't mention the remote write feature (something I -worked on myself). With this, Prometheus forms a key component of Weave Cortex -itself, scraping targets and sending samples to us. - -## What do you think the future holds for Weaveworks and Prometheus? - -For me the immediate future is Weave Cortex, Weaveworks' Prometheus as a -Service. We use it extensively internally, and are starting to achieve pretty -good query performance out of it. 
It's running in production with real users -right now, and shortly we'll be introducing support for alerting and achieve -feature parity with upstream Prometheus. From there we'll enter a beta -programme of stabilization before general availability in the middle of the -year. - -As part of Cortex, we've developed an intelligent Prometheus expression -browser, with autocompletion for PromQL and Jupyter-esque notebooks. We're -looking forward to getting this in front of more people and eventually open -sourcing it. - -I've also got a little side project called -[Loki](https://github.com/weaveworks-experiments/loki), which brings Prometheus -service discovery and scraping to OpenTracing, and makes distributed tracing -easy and robust. I'll be giving a [talk about this at KubeCon/CNCFCon -Berlin](https://cloudnativeeu2017.sched.com/event/9Tbt/loki-an-opensource-zipkin-prometheus-mashup-written-in-go-tom-wilkie-software-engineer) -at the end of March. diff --git a/content/blog/2017-04-06-interview-with-europace.md b/content/blog/2017-04-06-interview-with-europace.md deleted file mode 100644 index 27e6de19..00000000 --- a/content/blog/2017-04-06-interview-with-europace.md +++ /dev/null @@ -1,73 +0,0 @@ ---- -title: Interview with Europace -created_at: 2017-04-06 -kind: article -author_name: Brian Brazil ---- - -*Continuing our series of interviews with users of Prometheus, Tobias Gesellchen from -Europace talks about how they discovered Prometheus.* - - -## Can you tell us about Europace does? - -[Europace AG](https://www.europace.de/) develops and operates the web-based -EUROPACE financial marketplace, which is Germany’s largest platform for -mortgages, building finance products and personal loans. A fully integrated -system links about 400 partners – banks, insurers and financial product -distributors. Several thousand users execute some 35,000 transactions worth a -total of up to €4 billion on EUROPACE every month. Our engineers regularly -blog at [http://tech.europace.de/](http://tech.europace.de/) and -[@EuropaceTech](https://twitter.com/europacetech). - -## What was your pre-Prometheus monitoring experience? - -[Nagios](https://www.nagios.org/)/[Icinga](https://www.icinga.com/) are still -in use for other projects, but with the growing number of services and higher -demand for flexibility we looked for other solutions. Due to Nagios and Icinga -being more centrally maintained, Prometheus matched our aim to have the full -DevOps stack in our team and move specific responsibilities from our -infrastructure team to the project members. - -## Why did you decide to look at Prometheus? - -Through our activities in the [Docker Berlin -community](https://www.meetup.com/Docker-Berlin/) we had been in contact with -[SoundCloud](https://soundcloud.com/) and [Julius -Volz](https://twitter.com/juliusvolz), who gave us a good overview. The -combination of flexible Docker containers with the highly flexible label-based -concept convinced us give Prometheus a try. The Prometheus setup was easy -enough, and the Alertmanager worked for our needs, so that we didn’t see any -reason to try alternatives. Even our little pull requests to improve the -integration in a Docker environment and with messaging tools had been merged -very quickly. Over time, we added several exporters and Grafana to the stack. -We never looked back or searched for alternatives. - -![Grafana dashboard for Docker Registry](/assets/blog/2017-04-06/europace_grafana_1.png) - -## How did you transition? 
- -Our team introduced Prometheus in a new project, so the transition didn’t -happen in our team. Other teams started by adding Prometheus side by side to -existing solutions and then migrated the metrics collectors step by step. -Custom exporters and other temporary services helped during the migration. -Grafana existed already, so we didn’t have to consider another dashboard. Some -projects still use both Icinga and Prometheus in parallel. - -## What improvements have you seen since switching? - -We had issues using Icinga due to scalability - several teams maintaining a -centrally managed solution didn’t work well. Using the Prometheus stack along -with the Alertmanager decoupled our teams and projects. The Alertmanager is -now able to be deployed in a [high availability -mode](https://github.com/prometheus/alertmanager#high-availability), which is a -great improvement to the heart of our monitoring infrastructure. - -## What do you think the future holds for Europace and Prometheus? - -Other teams in our company have gradually adopted Prometheus in their projects. -We expect that more projects will introduce Prometheus along with the -Alertmanager and slowly replace Icinga. With the inherent flexibility of -Prometheus we expect that it will scale with our needs and that we won’t have -issues adapting it to future requirements. - diff --git a/content/blog/2017-04-10-promehteus-20-sneak-peak.md b/content/blog/2017-04-10-promehteus-20-sneak-peak.md deleted file mode 100644 index 400c9fa8..00000000 --- a/content/blog/2017-04-10-promehteus-20-sneak-peak.md +++ /dev/null @@ -1,35 +0,0 @@ ---- -title: Sneak Peak of Prometheus 2.0 -created_at: 2017-04-10 -kind: article -author_name: Fabian Reinartz ---- - -In July 2016 Prometheus reached a big milestone with its 1.0 release. Since then, plenty of new features like new service discovery integrations and our experimental remote APIs have been added. -We also realized that new developments in the infrastructure space, in particular [Kubernetes](https://kubernetes.io), allowed monitored environments to become significantly more dynamic. Unsurprisingly, this also brings new challenges to Prometheus and we identified performance bottlenecks in its storage layer. - -Over the past few months we have been designing and implementing a new storage concept that addresses those bottlenecks and shows considerable performance improvements overall. It also paves the way to add features such as hot backups. - -The changes are so fundamental that it will trigger a new major release: Prometheus 2.0. -Important features and changes beyond the storage are planned before its stable release. However, today we are releasing an early alpha of Prometheus 2.0 to kick off the stabilization process of the new storage. - -[Release tarballs](https://github.com/prometheus/prometheus/releases/tag/v2.0.0-alpha.0) and [Docker containers](https://quay.io/repository/prometheus/prometheus?tab=tags) are now available. -If you are interested in the new mechanics of the storage, make sure to read [the deep-dive blog post](https://fabxc.org/blog/2017-04-10-writing-a-tsdb/) looking under the hood. - -This version does not work with old storage data and should not replace existing production deployments. To run it, the data directory must be empty and all existing storage flags except for `-storage.local.retention` have to be removed. 
- -For example; before: - -``` -./prometheus -storage.local.retention=200h -storage.local.memory-chunks=1000000 -storage.local.max-chunks-to-persist=500000 -storage.local.chunk-encoding=2 -config.file=/etc/prometheus.yaml -``` - -after: - -``` -./prometheus -storage.local.retention=200h -config.file=/etc/prometheus.yaml -``` - -This is a very early version and crashes, data corruption, and bugs in general should be expected. Help us move towards a stable release by submitting them to [our issue tracker](https://github.com/prometheus/prometheus/issues). - -The experimental remote storage APIs are disabled in this alpha release. Scraping targets exposing timestamps, such as federated Prometheus servers, does not yet work. The storage format is breaking and will break again between subsequent alpha releases. We plan to document an upgrade path from 1.0 to 2.0 once we are approaching a stable release. diff --git a/content/blog/2017-05-17-interview-with-iadvize.md b/content/blog/2017-05-17-interview-with-iadvize.md deleted file mode 100644 index d4461d06..00000000 --- a/content/blog/2017-05-17-interview-with-iadvize.md +++ /dev/null @@ -1,149 +0,0 @@ ---- -title: Interview with iAdvize -created_at: 2017-05-17 -kind: article -author_name: Brian Brazil ---- - -*Continuing our series of interviews with users of Prometheus, Laurent -COMMARIEU from iAdvize talks about how they replaced their legacy Nagios and -Centreon monitoring with Prometheus.* - - -## Can you tell us about iAdvize does? - -I am Laurent COMMARIEU, a system engineer at iAdvize. I work within the 60 -person R&D department in a team of 5 system engineers. Our job is mainly to -ensure that applications, services and the underlying system are up and -running. We are working with developers to ensure the easiest path for their -code to production, and provide the necessary feedback at every step. That’s -where monitoring is important. - -iAdvize is a full stack conversational commerce platform. We provide an easy -way for a brand to centrally interact with their customers, no matter the -communication channel (chat, call, video, Facebook Pages, Facebook Messenger, -Twitter, Instagram, WhatsApp, SMS, etc...). Our customers work in [ecommerce, -banks, travel, fashion, etc. in 40 -countries](http://www.iadvize.com/en/customers/). We are an international -company of 200 employees with offices in France, UK, Germany, Spain and Italy. -We raised $16 Million in 2015. - -## What was your pre-Prometheus monitoring experience? - -I joined iAdvize in February 2016. Previously I worked in companies specialized -in network and application monitoring. We were working with opensource software -like [Nagios](https://www.nagios.org/), [Cacti](http://www.cacti.net/), -[Centreon](https://www.centreon.com/), [Zabbix](http://www.zabbix.com/), -[OpenNMS](https://www.opennms.org/en), etc. and some non-free ones like [HP -NNM](https://saas.hpe.com/en-us/software/network-node-manager-i-network-management-software), -[IBM Netcool -suite](http://www-03.ibm.com/software/products/en/netcool-network-management), -[BMC Patrol](http://www.bmc.com/it-solutions/brands/patrol-proactivenet.html), -etc. - -iAdvize used to delegate monitoring to an external provider. They ensured 24/7 -monitoring using Nagios and Centreon. This toolset was working fine with the -legacy static architecture (barebone servers, no VMs, no containers). To -complete this monitoring stack, we also use [Pingdom](https://www.pingdom.com/). 
- -With the moving our monolithic application towards a Microservices architecture -(using Docker) and our will to move our current workload to an infrastructure -cloud provider we needed to have more control and flexibility on monitoring. At -the same time, iAdvize recruited 3 people, which grew the infrastructure team -from 2 to 5. With the old system it took at least a few days or a week to add -some new metrics into Centreon and had a real cost (time and money). - - -## Why did you decide to look at Prometheus? - -We knew Nagios and the like were not a good choice. Prometheus was the rising -star at the time and we decided to PoC it. [Sensu](https://sensuapp.org/) was -also on the list at the beginning but Prometheus seemed more promising for our -use cases. - -We needed something able to integrate with Consul, our service discovery -system. Our micro services already had a /health route; adding a /metrics -endpoint was simple. For about every tool we used, an exporter was available -(MySQL, Memcached, Redis, nginx, FPM, etc.). - -On paper it looked good. - -![One of iAdvize's Grafana dashboards](/assets/blog/2017-05-17/iadvize-dashboard-1.png) - -## How did you transition? - -First of all, we had to convince the developers team (40 people) that -Prometheus was the right tool for the job and that they had to add an exporter -to their apps. So we did a little demo on RabbitMQ, we installed a RabbitMQ -exporter and built a simple [Grafana](https://grafana.com/) dashboard to -display usage metrics to developers. A Python script was written to create some -queue and publish/consume messages. - -They were quite impressed to see queues and the messages appear in real time. -Before that, developers didn't have access to any monitoring data. Centreon was -restricted by our infrastructure provider. Today, Grafana is available to -everyone at iAdvize, using the Google Auth integration to authenticate. There -are 78 active accounts on it (from dev teams to the CEO). - -After we started monitoring existing services with Consul and cAdvisor, we -monitored the actual presence of the containers. They were monitored using -Pingdom checks but it wasn't enough. - -We developed a few custom exporters in Go to scrape some business metrics from -our databases (MySQL and Redis). - -Soon enough, we were able to replace all the legacy monitoring by Prometheus. - -![One of iAdvize's Grafana dashboards](/assets/blog/2017-05-17/iadvize-dashboard-2.png) - -## What improvements have you seen since switching? - -Business metrics became very popular and during sales periods everyone is -connected to Grafana to see if we're gonna beat some record. We monitor the -number of simultaneous conversations, routing errors, agents connected, the -number of visitors loading the iAdvize tag, calls on our API gateway, etc. - -We worked for a month to optimize our MySQL servers with analysis based on the -[Newrelic exporter](https://github.com/jfindley/newrelic_exporter) and [Percona -dashboard for grafana] (https://github.com/percona/grafana-dashboards). It was -a real success, allowing us to discover inefficiencies and perform -optimisations that cut database size by 45% and peak latency by 75%. - -There are a lot to say. We know if a AMQP queue has no consumer or if it is -Filling abnormally. We know when a container restarts. - -The visibility is just awesome. - -That was just for the legacy platform. - -More and more micro services are going to be deployed in the cloud and -Prometheus is used to monitor them. 
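To illustrate the kind of configuration this involves (the job name and Consul address below are placeholders, not iAdvize's actual setup), a Consul-based scrape config that keeps only services tagged for metrics, as described next, looks roughly like:

```yaml
scrape_configs:
  - job_name: 'consul-services'     # hypothetical job name
    consul_sd_configs:
      - server: 'localhost:8500'    # local Consul agent
    relabel_configs:
      # Keep only services registered with a "metrics=true" tag;
      # __meta_consul_tags joins all tags with commas.
      - source_labels: ['__meta_consul_tags']
        regex: '.*,metrics=true,.*'
        action: keep
```
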
We are using Consul to register the -services and Prometheus to discover the metrics routes. Everything works like a -charm and we are able to build a Grafana dashboard with a lot of critical -business, application and system metrics. - -We are building a scalable architecture to deploy our services with -[Nomad](https://www.nomadproject.io/). Nomad registers healthy services in -Consul and with some tags relabeling we are able to filter those with a tag -name "metrics=true". It offers to us a huge gain in time to deploy the -monitoring. We have nothing to do ^^. - -We also use the EC2 service discovery. It's really useful with auto-scaling -groups. We scale and recycle instances and it's already monitored. No more -waiting for our external infrastructure provider to notice what happens in -production. - -We use alertmanager to send some alerts by SMS or in to our -[Flowdock](https://www.flowdock.com/). - -## What do you think the future holds for iAdvize and Prometheus? - - -* We are waiting for a simple way to add a long term scalable storage for our - capacity planning. -* We have a dream that one day, our auto-scaling will be triggered by - Prometheus alerting. We want to build an autonomous system base on response - time and business metrics. -* I used to work with [Netuitive](http://www.netuitive.com/), it had a great - anomaly detection feature with automatic correlation. It would be great to - have some in Prometheus. diff --git a/content/blog/2017-06-14-interview-with-latelier-animation.md b/content/blog/2017-06-14-interview-with-latelier-animation.md deleted file mode 100644 index 44d5acb5..00000000 --- a/content/blog/2017-06-14-interview-with-latelier-animation.md +++ /dev/null @@ -1,111 +0,0 @@ ---- -title: Interview with L’Atelier Animation -created_at: 2017-06-14 -kind: article -author_name: Brian Brazil ---- - -*Continuing our series of interviews with users of Prometheus, Philippe Panaite -and Barthelemy Stevens from L’Atelier Animation talk about how they switched -their animation studio from a mix of Nagios, Graphite and InfluxDB to -Prometheus.* - -## Can you tell us about yourself and what L’Atelier Animation does? - -[L’Atelier Animation](http://www.latelieranimation.com/) is a 3D animation studio based in -the beautiful city of Montreal Canada. Our first feature film -["Ballerina"](http://www.imdb.com/title/tt2261287/combined) (also known as -"Leap") was released worldwide in 2017, US release is expected later this year. - -We’re currently hard at work on an animated TV series and on our second feature -film. -  -Our infrastructure consists of around 300 render blades, 150 workstations and -twenty various servers. With the exception of a couple of Macs, everything runs -on Linux ([CentOS](https://www.centos.org/)) and not a single Windows machine.   - -  -## What was your pre-Prometheus monitoring experience? -  -At first we went with a mix of [Nagios](https://www.nagios.org/), -[Graphite](https://graphiteapp.org/), and -[InfluxDB](https://www.influxdata.com). The initial setup was “ok” but nothing -special and over complicated (too many moving parts).   - - -## Why did you decide to look at Prometheus? 
-  -When we switched all of our services to CentOS 7, we looked at new monitoring -solutions and Prometheus came up for many reasons, but most importantly: - - * Node Exporter: With its customization capabilities, we can fetch any data from clients - * SNMP support: Removes the need for a 3rd party SNMP service - * Alerting system: ByeBye Nagios - * [Grafana](https://grafana.com/) support - - -## How did you transition? - -When we finished our first film we had a bit of a downtime so it was a perfect -opportunity for our IT department to make big changes. We decided to flush our -whole monitoring system as it was not as good as we wanted.   - -One of the most important part is to monitor networking equipment so we started -by configuring [snmp_exporter](https://github.com/prometheus/snmp_exporter/) to -fetch data from one of our switches. The calls to NetSNMP that the exporter -makes are different under CentOS so we had to re-compile some of the binaries, -we did encounter small hiccups here and there but with the help of Brian Brazil -from [Robust Perception](https://www.robustperception.io/), we got everything -sorted out quickly. Once we got snmp_exporter working, we were able to easily -add new devices and fetch SNMP data. We now have our core network monitored in -Grafana (including 13 switches, 10 VLANs). - -![Switch metrics from SNMP data](/assets/blog/2017-06-14/switches.png) - -After that we configured -[node_exporter](https://github.com/prometheus/node_exporter/) as we required -analytics on workstations, render blades and servers. In our field, when a CPU -is not at 100% it’s a problem, we want to use all the power we can so in the -end temperature is more critical. Plus, we need as much uptime as possible so -all our stations have email alerts setup via Prometheus’s -[Alertmanager](https://prometheus.io/docs/alerting/alertmanager/) so we’re -aware when anything is down. - -![Dashboard for one workstation](/assets/blog/2017-06-14/workstation.png) - -Our specific needs require us to monitor custom data from clients, it’s made -easy through the use of node_exporter’s [textfile -collector](https://github.com/prometheus/node_exporter#textfile-collector) -function. A cronjob outputs specific data from any given tool into a -pre-formatted text file in a format readable by Prometheus.   - -Since all the data is available through the HTTP protocol, we wrote a -[Python](https://www.python.org/) script to fetch data from Prometheus. We -store it in a [MySQL](https://www.mysql.com/) database accessed via a web -application that creates a live floor map. This allows us to know with a simple -mouse over which user is seated where with what type of hardware. We also -created another page with user’s picture & department information, it helps -new employees know who’s their neighbour. The website is still an ongoing -project so please don’t judge the look, we’re sysadmins after all not web -designers :-) - -![Floormap with workstation detail](/assets/blog/2017-06-14/floormap.png) - - -## What improvements have you seen since switching? - -It gave us an opportunity to change the way we monitor everything in the studio -and inspired us to create a new custom floor map with all the data which has -been initially fetched by Prometheus. The setup is a lot simpler with one -service to rule them all. - - -## What do you think the future holds for L’Atelier Animation and Prometheus? - -We’re currently in the process of integrating software licenses usage with -Prometheus. 
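To sketch how such data can be exposed through the node_exporter textfile collector mentioned above (the metric and label names are made up for illustration), a cron job could drop a file like this into the collector's directory:

```
# HELP license_in_use Hypothetical gauge: 1 if this workstation currently holds a license seat.
# TYPE license_in_use gauge
license_in_use{software="maya",workstation="ws042"} 1
```
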
The information will give artists a good idea of whom is using what -and where. - -We will continue to customize and add new stuff to Prometheus by user demand -and since we work with artists, we know there will be plenty :-) With SNMP and -the node_exporter’s custom text file inputs, the possibilities are endless... diff --git a/content/blog/2017-06-21-prometheus-20-alpha3-new-rule-format.md b/content/blog/2017-06-21-prometheus-20-alpha3-new-rule-format.md deleted file mode 100644 index 7e0cc6e8..00000000 --- a/content/blog/2017-06-21-prometheus-20-alpha3-new-rule-format.md +++ /dev/null @@ -1,76 +0,0 @@ ---- - title: Prometheus 2.0 Alpha.3 with New Rule Format - created_at: 2017-06-22 - kind: article - author_name: Goutham Veeramachaneni ---- - -Today we release the third alpha version of Prometheus 2.0. Aside from a variety of bug fixes in the new storage layer, it contains a few planned breaking changes. - -## Flag Changes - -First, we moved to a new flag library, which uses the more common double-dash `--` prefix for flags instead of the single dash Prometheus used so far. Deployments have to be adapted accordingly. -Additionally, some flags were removed with this alpha. The full list since Prometheus 1.0.0 is: - -* `web.telemetry-path` -* All `storage.remote.*` flags -* All `storage.local.*` flags -* `query.staleness-delta` -* `alertmanager.url` - - -## Recording Rules changes - -Alerting and recording rules are one of the critical features of Prometheus. But they also come with a few design issues and missing features, namely: - -* All rules ran with the same interval. We could have some heavy rules that are better off being run at a 10-minute interval and some rules that could be run at 15-second intervals. - -* All rules were evaluated concurrently, which is actually Prometheus’ oldest [open bug](https://github.com/prometheus/prometheus/blob/main/rules/manager.go#L267). This has a couple of issues, the obvious one being that the load spikes every eval interval if you have a lot of rules. The other being that rules that depend on each other might be fed outdated data. For example: - -``` -instance:network_bytes:rate1m = sum by(instance) (rate(network_bytes_total[1m])) - -ALERT HighNetworkTraffic - IF instance:network_bytes:rate1m > 10e6 - FOR 5m -``` - - -Here we are alerting over `instance:network_bytes:rate1m`, but `instance:network_bytes:rate1m` is itself being generated by another rule. We can get expected results only if the alert `HighNetworkTraffic` is run after the current value for `instance:network_bytes:rate1m` gets recorded. - -* Rules and alerts required users to learn yet another DSL. - -To solve the issues above, grouping of rules has been [proposed long back](https://github.com/prometheus/prometheus/issues/1095) but has only recently been implemented [as a part of Prometheus 2.0](https://github.com/prometheus/prometheus/pull/2842). As part of this implementation we have also moved the rules to the well-known YAML format, which also makes it easier to generate alerting rules based on common patterns in users’ environments. - -Here’s how the new format looks: - -```yaml -groups: -- name: my-group-name - interval: 30s # defaults to global interval - rules: - - record: instance:errors:rate5m - expr: rate(errors_total[5m]) - - record: instance:requests:rate5m - expr: rate(requests_total[5m]) - - alert: HighErrors - # Expressions remain PromQL as before and can be spread over - # multiple lines via YAML’s multi-line strings. 
- expr: | - sum without(instance) (instance:errors:rate5m) - / - sum without(instance) (instance:requests:rate5m) - for: 5m - labels: - severity: critical - annotations: - description: "stuff's happening with {{ $labels.service }}" -``` - -The rules in each group are executed sequentially and you can have an evaluation interval per group. - -As this change is breaking, we are going to release it with the 2.0 release and have added a command to promtool for the migration: `promtool update rules ` -The converted files have the `.yml` suffix appended and the `rule_files` clause in your Prometheus configuration has to be adapted. - - -Help us moving towards the Prometheus 2.0 stable release by testing this new alpha version! You can report bugs on our [issue tracker](https://github.com/prometheus/prometheus/issues) and provide general feedback via our [community channels](https://prometheus.io/community/). diff --git a/content/blog/2017-09-04-promcon-2017-recap.md b/content/blog/2017-09-04-promcon-2017-recap.md deleted file mode 100644 index 0cd80113..00000000 --- a/content/blog/2017-09-04-promcon-2017-recap.md +++ /dev/null @@ -1,128 +0,0 @@ ---- -title: PromCon 2017 Recap -created_at: 2017-09-04 -kind: article -author_name: Julius Volz ---- - -## What happened - -Two weeks ago, Prometheus users and developers from all over the world came together in Munich for [PromCon 2017](https://promcon.io/2017-munich/), the second conference around the Prometheus monitoring system. The purpose of this event was to exchange knowledge and best practices and build professional connections around monitoring with Prometheus. Google's Munich office offered us a much larger space this year, which allowed us to grow from 80 to 220 attendees while still selling out! - -Take a look at the recap video to get an impression of the event: - - - - - -At PromCon, speakers from a variety of organizations talked about how they are using Prometheus and building solutions around it. For example, [Cloudflare](https://cloudflare.com/) and [DigitalOcean](https://www.digitalocean.com/) both explained how they use Prometheus to monitor their large-scale networks and datacenters: - - - - - - - -Speakers from [Grafana](https://grafana.com/) and [InfluxData](https://www.influxdata.com/) brought us up to date with new features and Prometheus integrations: - - - - - - - -Several Prometheus core developers also spoke about best practices and new features and developments in Prometheus: - - - - - - - - - - - - - - - - - - - -To see the entire program, have a look at [the schedule](https://promcon.io/2017-munich/schedule). - -In the breaks and after-parties, we had a lot of fun: - - - - - - - - - - -...and one lucky participant finally had her childhood dream come true: - - - - -## Talk recordings - -Today, we are pleased to announce that all talk recordings are now ready and -publicly available. You can enjoy them [in this YouTube playlist](https://www.youtube.com/playlist?list=PLoz-W_CUquUlnvoEBbqChb7A0ZEZsWSXt)! - -## Reception - -Again, we received incredibly positive and encouraging feedback from speakers and attendees this year. Here is what some had to say: - - - - - - - - - - - - - - - - -While there were things going wrong here and there, we think that the event turned out super well overall for a community-organized conference, and we're happy to see that our attendees felt similarly! - -## Thanks - -PromCon 2017 would not have been possible without the help of its sponsors, -speakers, attendees, and organizers. 
Thanks so much to all of you! Our Diamond -and Venue sponsors deserve a special mention at this point, since they did -the most to support us and made all the food, drinks, video recordings, and -swag possible: - -

[Diamond sponsor logos]

[Venue sponsor logos]
- - -We would also like to thank [all our other sponsors](https://promcon.io/2017-munich/#our-sponsors)! - -A special thank you to the [Cloud Native Computing Foundation](https://cncf.io) for helping us handle financials and the registration system! - -## Outlook - -With PromCon 2017, the Prometheus community organized its second successful Prometheus conference. Since all attendees really appreciated the community character of the event, we definitely aim to keep this special feeling for PromCon in the future. PromCon 2017 was still organized mainly in people's free time, but we started distributing the work load over more people this year. For the next iterations of PromCon, we still need to discuss how to make this community-organized model more sustainable. We don't know yet when, where, and how PromCon 2018 will happen, but we will report back when we do, and we hope to welcome you back! - -Stay tuned! diff --git a/content/blog/2017-11-08-announcing-prometheus-2-0.md b/content/blog/2017-11-08-announcing-prometheus-2-0.md deleted file mode 100644 index e0c4983f..00000000 --- a/content/blog/2017-11-08-announcing-prometheus-2-0.md +++ /dev/null @@ -1,63 +0,0 @@ ---- -title: Announcing Prometheus 2.0 -created_at: 2017-11-08 -kind: article -author_name: Fabian Reinartz on behalf of the Prometheus team ---- - -## Announcing Prometheus 2.0 - -Nearly one and a half years ago, we released Prometheus 1.0 into the wild. The release marked a significant milestone for the project. We had reached a broad set of features that make up Prometheus' simple yet extremely powerful monitoring philosophy. - -Since then we added and improved on various service discovery integrations, extended PromQL, and experimented with a first iteration on remote APIs to enable pluggable long-term storage solutions. - -But what else has changed to merit a new major release? - - - -## Prometheus 2.0 - -Prometheus has a simple and robust operational model that our users quickly learn to love. Yet, the infrastructure space did not stand still and projects like Kubernetes and Mesos rapidly change how software is being deployed and managed. Monitored environments have become increasingly more dynamic. - -More and more we felt the strain this put on Prometheus' performance. The storage subsystem required careful configuration for the expected load. Prometheus 1.6 greatly alleviated this pain with its auto-tuning capabilities. Nonetheless, our users were bound to hit some inevitable hard-limits. - -### Storage - -In early 2017, things started moving under the hood. What first began as an experiment for a new, more performant time series database quickly got confirmed in practical benchmarks. -Over the past six months we have been busy stabilizing this work as an [independent time series database](https://www.youtube.com/watch?v=b_pEevMAC3I&list=PLoz-W_CUquUlnvoEBbqChb7A0ZEZsWSXt&index=29) and re-integrating this into Prometheus itself. -The result is a significantly better performing Prometheus 2.0 with improvements along virtually all dimensions. Query latency is more consistent and it especially scales better in the face of high series churn. 
Resource consumption, as measured in different real-world production scenarios, also decreased significantly: - -* **CPU usage** reduced to **20% - 40%** compared to Prometheus 1.8 -* **Disk space usage** reduced to **33% - 50%** compared to Prometheus 1.8 -* **Disk I/O** without much query load is usually **<1%** on average - -![Prometheus 1.8 vs 2.0 resource comparison](/assets/blog/2017-11-08/resource-comparison.png) - -It is also well-equipped to handle the increasingly dynamic characteristics of modern computing environments for years to come. - -### Staleness handling - -Additionally, many small and big changes have happened to make the Prometheus experience more consistent and intuitive. The most notable one is [staleness handling](https://www.youtube.com/watch?v=GcTzd2CLH7I&list=PLoz-W_CUquUlnvoEBbqChb7A0ZEZsWSXt&index=32), which was one of the oldest and most requested roadmap items. With the new improvements, disappearing monitoring targets or series from those targets are now explicitly tracked, which reduces querying artefacts and increases alerting responsiveness. - -### Other improvements - -Prometheus 2.0 also comes with built-in support for [snapshot backups of the entire database](https://www.youtube.com/watch?v=15uc8oTMgPY). - -We also migrated our recording and alerting rules from a custom format to the ubiquitous YAML format. This makes it easier to integrate with configuration management and templating. - -A lot of additional smaller changes and cleanups happened. Check the [Prometheus 1.x to 2.0](/docs/prometheus/latest/migration/) migration guide for a full overview of changes and how to adapt your setup to them. But do not worry, Prometheus 2 is still the Prometheus you have learned to love — just a lot faster and even easier to operate and use. - -## What's next - -The new storage subsystem is designed to be accessible and extensible. This goes for new features directly integrated into Prometheus as well as custom tools that can be built on top of it. -The simple and open storage format and library also allows users to easily build custom extensions like dynamic retention policies. This enables the storage layer to meet a wide array of requirements without drawing complexity into Prometheus itself; allowing it to focus on its core goals. - -The remote APIs will continue to evolve to satisfy requirements for long-term storage without sacrificing Prometheus' model of reliability through simplicity. - -## Try it out! - -You can try out Prometheus 2.0 as usual by downloading our [official binaries](https://prometheus.io/download/#prometheus) and [container images](https://quay.io/repository/prometheus/prometheus?tab=tags). See the [Getting started](/docs/prometheus/latest/getting_started/) page for a tutorial on how to get up and running with Prometheus. - -If you are upgrading from Prometheus 1.x, check our [migration guide](/docs/prometheus/2.0/migration/) to learn about adjustments that you will have to make and how to use the remote APIs to [read data from old Prometheus servers](https://www.robustperception.io/accessing-data-from-prometheus-1-x-in-prometheus-2-0/) during the migration period. - -Finally, we would like to thank all our users who extensively tested the pre-releases and helped us in debugging issues. This huge milestone would not have been possible without you! 
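As a rough sketch of the remote-read migration path mentioned above (the address and port are purely illustrative), the Prometheus 2.0 configuration gains a stanza such as:

```yaml
# Points the new 2.0 server at an old 1.x server kept running during the migration.
remote_read:
  - url: "http://localhost:9094/api/v1/read"
```
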
diff --git a/content/blog/2017-11-30-prometheus-at-cloudnativecon.md b/content/blog/2017-11-30-prometheus-at-cloudnativecon.md deleted file mode 100644 index d33cc785..00000000 --- a/content/blog/2017-11-30-prometheus-at-cloudnativecon.md +++ /dev/null @@ -1,57 +0,0 @@ ---- -title: Prometheus at CloudNativeCon 2017 -created_at: 2017-11-29 -kind: article -author_name: Tom Wilkie on behalf of the Prometheus team ---- - -## Prometheus at CloudNativeCon 2017 - -Wednesday 6th December is Prometheus Day at CloudNativeCon Austin, and we’ve got -a fantastic lineup of talks and events for you. Go to the Prometheus Salon for -hands on advice on how best to monitor Kubernetes, attend a series of talks on -various aspects of Prometheus and then hang out with some of the Prometheus developers at the CNCF -booth, all followed by the Prometheus Happy Hour. Read on for more details... - - - -## [Prometheus Salon](https://kccncna17.sched.com/event/Cs4d/prometheus-salon-hosted-by-frederic-branczyk-coreos-bob-cotton-freshtracksio-goutham-veeramanchaneni-tom-wilkie-kausal) (11:10am - 12:30pm) - -Featuring talks from Prometheus developers, this salon will include an introduction -to Prometheus, a closer look at how you can use Prometheus to monitor your Kubernetes -cluster, and a discussion of the new features in Prometheus 2.0. The session will -include hands-on access to a live Prometheus and Kubernetes cluster, allowing you -to experiment with PromQL queries to gain deeper insights into your Kubernetes clusters. - -## [The RED Method: How To Instrument Your Services](https://kccncna17.sched.com/event/CU8K/the-red-method-how-to-instrument-your-services-b-tom-wilkie-kausal) (2:45pm - 3:20pm) - -Tom from Kausal will cover patterns of application instrumentation, where and when they are applicable, and how they can be implemented with Prometheus. Covering Google’s Four Golden Signals, the RED Method and the USE Method, this talk will also discuss why consistency is an important approach for reducing cognitive load. Finally it will discuss the limitations of these approaches and what can be done to overcome them. - -## Meet the Maintainers (3:30pm) - -Prometheus developers Frederic Branczyk ([@brancz](https://github.com/brancz)), -Goutham Veeramachaneni ([@Gouthamve](https://github.com/gouthamve)) and Tom Wilkie -([@tomwilkie](https://github.com/tomwilkie)) will be at the 'Meet the Maintainers' -lounges in the Sponsor Showcase to field all your Prometheus-related questions. -Come say hi! - -## [A Practical Guide to Prometheus for App Developers](https://kccncna17.sched.com/event/CU5y/a-practical-guide-to-prometheus-for-app-developers-b-ilya-dmitrichenko-weaveworks) (Wed 4:25pm - 5:00pm) - -This talk will first outline how Weaveworks runs production cloud-native apps on Kubernetes, -uses Prometheus for monitoring, and open-source tools they have -built to implement continuous delivery. Ilya will go on to explain step-by-step -how simple it is to instrument an app, using a very generic Node.js app as reference. - -## ["If you Don’t Monitor your Infrastructure, you Don’t Own it!” Regain Control Thanks to Prometheus](https://kccncna17.sched.com/event/CU5w/if-you-dont-monitor-your-infrastructure-you-dont-own-it-regain-control-thanks-to-prometheus-i-etienne-coutaud-guillaume-lefevre-octo-technology) (4:25pm - 5:00pm) - -The French FedEx company uses Prometheus to monitor their infrastructure. 
This talk -will discuss the use of Prometheus in production, why they chose Prometheus, how they -integrated it, configured it and what kind of insights they extracted from the -whole infrastructure. In addition, this talk will discuss how Prometheus changed -their way of working, how they implemented self-healing based on Prometheus, how they -configured systemd to trigger AlertManager API, integration with slack and other cool stuff. - -## [Prometheus Happy Hour](http://kubecon.freshtracks.io/) (8-10pm) - -After such a busy day, we'll be off to the Westin rooftop at Azul to soak up the -Austin skyline, drink in hand! Go and [reserve your spot now](http://kubecon.freshtracks.io/). diff --git a/content/blog/2018-02-08-interview-with-scalefastr.md b/content/blog/2018-02-08-interview-with-scalefastr.md deleted file mode 100644 index 3a751cfb..00000000 --- a/content/blog/2018-02-08-interview-with-scalefastr.md +++ /dev/null @@ -1,170 +0,0 @@ ---- -title: Interview with Scalefastr -created_at: 2018-02-08 -kind: article -author_name: Brian Brazil ---- - -*Continuing our series of interviews with users of Prometheus, Kevin Burton -from Scalefastr talks about how they are using Prometheus.* - -## Can you tell us about yourself and what Scalefastr does? - -My name is Kevin Burton and I’m the CEO of -[Scalefastr](https://www.scalefastr.io/). My background is in distributed -systems and I’ve previously ran Datastreamer, a company that built a petabyte -scale distributed social media crawler and search engine. - -At Datastreamer we ran into scalability issues regarding our infrastructure and -built out a high performance cluster based on Debian, Elasticsearch, Cassandra, -and Kubernetes. - -We found that many of our customers were also struggling with their -infrastructure and I was amazed at how much they were paying for hosting large -amounts of content on AWS and Google Cloud. - -We continually evaluated what it costs to run in the cloud and for us our -hosting costs would have been about 5-10x what we currently pay. - -We made the decision to launch a new cloud platform based on Open Source and -cloud native technologies like Kubernetes, Prometheus, Elasticsearch, -Cassandra, Grafana, Etcd, etc. - -We’re currently hosting a few customers in the petabyte scale and are soft -launching our new platform this month. - - -## What was your pre-Prometheus monitoring experience? - -At Datastreamer we found that metrics were key to our ability to iterate -quickly. The observability into our platform became something we embraced and -we integrated tools like [Dropwizard -Metrics](http://metrics.dropwizard.io/4.0.0/) to make it easy to develop -analytics for our platform. - -We built a platform based on KairosDB, Grafana, and our own (simple) -visualization engine which worked out really well for quite a long time. - -They key problem we saw with KairosDB was the rate of adoption and customer -demand for Prometheus. - -Additionally, what’s nice about Prometheus is the support for exporters -implemented by either the projects themselves or the community. - -With KairosDB we would often struggle to build out our own exporters. The -chance that an exporter for KairosDB already existing was rather low compared -to Prometheus. - -For example, there is CollectD support for KairosDB but it’s not supported very -well in Debian and there are practical bugs with CollectD that prevent it from -working reliability in production. 
- -With Prometheus you can get up and running pretty quickly (the system is rather -easy to install), and the chance that you have an exporter ready for your -platform is pretty high. - -Additionally, we’re expecting customer applications to start standardizing on -Prometheus metrics once there are hosted platforms like Scalefastr which -integrate it as a standardized and supported product. - -Having visibility into your application performance is critical and the high -scalability of Prometheus is necessary to make that happen. - - -## Why did you decide to look at Prometheus? - -We were initially curious how other people were monitoring their Kubernetes and -container applications. - -One of the main challenges of containers is the fact that they can come and go -quickly leaving behind both log and metric data that needs to be analyzed. - -It became clear that we should investigate Prometheus as our analytics backend -once we saw that people were successfully using Prometheus in production along -with a container-first architecture - as well as the support for exporters and -dashboards. - - -![One of Scalefastr's Grafana dashboards](/assets/blog/2018-02-08/dashboard.png) - -## How did you transition? - -The transition was somewhat painless for us since Scalefastr is a greenfield -environment. - -The architecture for the most part is new with very few limiting factors. - -Our main goal is to deploy on bare metal but build cloud features on top of -existing and standardized hardware. - -The idea is to have all analytics in our cluster backed by Prometheus. - -We provide customers with their own “management” infrastructure which includes -Prometheus, Grafana, Elasticsearch, and Kibana as well as a Kubernetes control -plane. We orchestrate this system with Ansible which handles initial machine -setup (ssh, core Debian packages, etc.) and baseline configuration. - -We then deploy Prometheus, all the required exporters for the customer -configuration, and additionally dashboards for Grafana. - -One thing we found to be somewhat problematic is that a few dashboards on -Grafana.com were written for Prometheus 1.x and did not port cleanly to 2.x. -It turns out that there are only a few functions not present in the 2.x series -and many of them just need a small tweak here and there. Additionally, some -of the dashboards were written for an earlier version of Grafana. - -To help solve that we announced a project this week to [standardize and improve -dashboards for -Prometheus](https://www.scalefastr.io/single-post/2018/01/26/Scalefastr-Grafana-Dashboards-for-Prometheus-20-and-Grafana) -for tools like Cassandra, Elasticsearch, the OS, but also Prometheus itself. -We open sourced this and [published it to -Github](https://github.com/scalefastr/scalefastr-prometheus-grafana-dashboards) -last week. - -We’re hoping this makes it easy for other people to migrate to Prometheus. - -One thing we want to improve is to automatically sync it with our Grafana -backend but also to upload these dashboards to Grafana.com. - -We also published our Prometheus configuration so that the labels work -correctly with our Grafana templates. This allows you to have a pull down menu -to select more specific metrics like cluster name, instance name, etc. - -![Using template variables in Grafana dashboards](/assets/blog/2018-02-08/templates.png) - - -## What improvements have you seen since switching? - -The ease of deployment, high performance, and standardized exporters made it -easy for us to switch. 
Additionally, the fact that the backend is fairly easy -to configure (basically, just the daemon itself) and there aren’t many moving -parts made it an easy decision. - -## What do you think the future holds for Scalefastr and Prometheus? - -Right now we’re deploying Elasticsearch and Cassandra directly on bare metal. -We’re working to run these in containers directly on top of Kubernetes and -working toward using the Container Storage Interface (CSI) to make this -possible. - -Before we can do this we need to get Prometheus service discovery working and -this is something we haven’t played with yet. Currently we deploy and -configure Prometheus via Ansible but clearly this won’t scale (or even work) -with Kubernetes since containers can come and go as our workload changes. - -We’re also working on improving the standard dashboards and alerting. One of -the features we would like to add (maybe as a container) is support for -alerting based on holts winters forecasting. - -This would essentially allow us to predict severe performance issues before -they happen. Rather than waiting for something to fail (like running out of -disk space) until we take action to correct it. - -To a certain extent Kubernetes helps with this issue since we can just add -nodes to the cluster based on a watermark. Once resource utilization is too -high we can just auto-scale. - -We’re very excited about the future of Prometheus especially now that we’re -moving forward on the 2.x series and the fact that CNCF collaboration seems to -be moving forward nicely. - diff --git a/content/blog/2018-03-16-interview-with-datawire.md b/content/blog/2018-03-16-interview-with-datawire.md deleted file mode 100644 index 17536448..00000000 --- a/content/blog/2018-03-16-interview-with-datawire.md +++ /dev/null @@ -1,82 +0,0 @@ ---- -title: Interview with Datawire -created_at: 2018-03-16 -kind: article -author_name: Brian Brazil ---- - -*Continuing our series of interviews with users of Prometheus, Richard Li -from Datawire talks about how they transitioned to Prometheus.* - -## Can you tell us about yourself and what Datawire does? - -At Datawire, we make open source tools that help developers code faster on -Kubernetes. Our projects include [Telepresence](https://www.telepresence.io/), -for local development of Kubernetes services; -[Ambassador](https://www.getambassador.io/), a Kubernetes-native API Gateway -built on the [Envoy Proxy](https://www.envoyproxy.io/); and -[Forge](https://forge.sh/), a build/deployment system. - -We run a number of mission critical cloud services in Kubernetes in AWS to -support our open source efforts. These services support use cases such as -dynamically provisioning dozens of Kubernetes clusters a day, which are then -used by our automated test infrastructure. - -## What was your pre-Prometheus monitoring experience? - -We used AWS CloudWatch. This was easy to set up, but we found that as we -adopted a more distributed development model (microservices), we wanted more -flexibility and control. For example, we wanted each team to be able to -customize their monitoring on an as-needed basis, without requiring operational -help. - - -## Why did you decide to look at Prometheus? - -We had two main requirements. The first was that we wanted every engineer here -to be able to have operational control and visibility into their service(s). 
-Our development model is highly decentralized by design, and we try to avoid -situations where an engineer needs to wait on a different engineer in order to -get something done. For monitoring, we wanted our engineers to be able to have -a lot of flexibility and control over their metrics infrastructure. Our second -requirement was a strong ecosystem. A strong ecosystem generally means -established (and documented) best practices, continued development, and lots of -people who can help if you get stuck. - - -Prometheus, and in particular, the [Prometheus -Operator](https://github.com/coreos/prometheus-operator), fit our requirements. -With the Prometheus Operator, each developer can create their own Prometheus -instance as needed, without help from operations (no bottleneck!). We are also -members of the [CNCF](https://www.cncf.io/) with a lot of experience with the -Kubernetes and Envoy communities, so looking at another CNCF community in -Prometheus was a natural fit. - - -![Datawire's Ambassador dashboards](/assets/blog/2018-03-16/dashboard.png) - -## How did you transition? - -We knew we wanted to start by integrating Prometheus with our API Gateway. Our -API Gateway uses Envoy for proxying, and Envoy automatically emits metrics -using the statsd protocol. We installed the Prometheus Operator (some detailed -notes [here](https://www.datawire.io/faster/ambassador-prometheus/)) and configured it to start collecting stats -from Envoy. We also set up a Grafana dashboard [based on some -work](https://grafana.com/dashboards/4698/) from another Ambassador contributor. - - -## What improvements have you seen since switching? - -Our engineers now have visibility into L7 traffic. We also are able to use -Prometheus to compare latency and throughput for our canary deployments to give -us more confidence that new versions of our services don’t cause performance -regressions. - -## What do you think the future holds for Datawire and Prometheus? - -Using the Prometheus Operator is still a bit complicated. We need to figure out -operational best practices for our service teams (when do you deploy a -Prometheus?). We’ll then need to educate our engineers on these best practices -and train them on how to configure the Operator to meet their needs. We expect -this will be an area of some experimentation as we figure out what works and -what doesn’t work. diff --git a/content/blog/2018-07-05-implementing-custom-sd.md b/content/blog/2018-07-05-implementing-custom-sd.md deleted file mode 100644 index b7b32d55..00000000 --- a/content/blog/2018-07-05-implementing-custom-sd.md +++ /dev/null @@ -1,190 +0,0 @@ ---- -title: Implementing Custom Service Discovery -created_at: 2018-07-05 -kind: article -author_name: Callum Styan ---- - -## Implementing Custom Service Discovery - -Prometheus contains built in integrations for many service discovery (SD) systems such as Consul, -Kubernetes, and public cloud providers such as Azure. However, we can’t provide integration -implementations for every service discovery option out there. The Prometheus team is already stretched -thin supporting the current set of SD integrations, so maintaining an integration for every possible SD -option isn’t feasible. In many cases the current SD implementations have been contributed by people -outside the team and then not maintained or tested well. We want to commit to only providing direct -integration with service discovery mechanisms that we know we can maintain, and that work as intended. 
-For this reason, there is currently a moratorium on new SD integrations. - -However, we know there is still a desire to be able to integrate with other SD mechanisms, such as -Docker Swarm. Recently a small code change plus an example was committed to the documentation -[directory](https://github.com/prometheus/prometheus/tree/main/documentation/examples/custom-sd) -within the Prometheus repository for implementing a custom service discovery integration without having -to merge it into the main Prometheus binary. The code change allows us to make use of the internal -Discovery Manager code to write another executable that interacts with a new SD mechanism and outputs -a file that is compatible with Prometheus' file\_sd. By co-locating Prometheus and our new executable -we can configure Prometheus to read the file\_sd-compatible output of our executable, and therefore -scrape targets from that service discovery mechanism. In the future this will enable us to move SD -integrations out of the main Prometheus binary, as well as to move stable SD integrations that make -use of the adapter into the Prometheus -[discovery](https://github.com/prometheus/prometheus/tree/main/discovery) package. - -Integrations using file_sd, such as those that are implemented with the adapter code, are listed -[here](https://prometheus.io/docs/operating/integrations/#file-service-discovery). - -Let’s take a look at the example code. - -## Adapter -First we have the file -[adapter.go](https://github.com/prometheus/prometheus/blob/main/documentation/examples/custom-sd/adapter/adapter.go). -You can just copy this file for your custom SD implementation, but it's useful to understand what's -happening here. - - // Adapter runs an unknown service discovery implementation and converts its target groups - // to JSON and writes to a file for file_sd. - type Adapter struct { - ctx context.Context - disc discovery.Discoverer - groups map[string]*customSD - manager *discovery.Manager - output string - name string - logger log.Logger - } - - // Run starts a Discovery Manager and the custom service discovery implementation. - func (a *Adapter) Run() { - go a.manager.Run() - a.manager.StartCustomProvider(a.ctx, a.name, a.disc) - go a.runCustomSD(a.ctx) - } - - -The adapter makes use of `discovery.Manager` to actually start our custom SD provider’s Run function in -a goroutine. Manager has a channel that our custom SD will send updates to. These updates contain the -SD targets. The groups field contains all the targets and labels our custom SD executable knows about -from our SD mechanism. - - type customSD struct { - Targets []string `json:"targets"` - Labels map[string]string `json:"labels"` - } - -This `customSD` struct exists mostly to help us convert the internal Prometheus `targetgroup.Group` -struct into JSON for the file\_sd format. - -When running, the adapter will listen on a channel for updates from our custom SD implementation. -Upon receiving an update, it will parse the targetgroup.Groups into another `map[string]*customSD`, -and compare it with what’s stored in the `groups` field of Adapter. If the two are different, we assign -the new groups to the Adapter struct, and write them as JSON to the output file. Note that this -implementation assumes that each update sent by the SD implementation down the channel contains -the full list of all target groups the SD knows about. - -## Custom SD Implementation - -Now we want to actually use the Adapter to implement our own custom SD. 
A full working example is in -the same examples directory -[here](https://github.com/prometheus/prometheus/blob/main/documentation/examples/custom-sd/adapter-usage/main.go). - -Here you can see that we’re importing the adapter code -`"github.com/prometheus/prometheus/documentation/examples/custom-sd/adapter"` as well as some other -Prometheus libraries. In order to write a custom SD we need an implementation of the Discoverer interface. - - // Discoverer provides information about target groups. It maintains a set - // of sources from which TargetGroups can originate. Whenever a discovery provider - // detects a potential change, it sends the TargetGroup through its channel. - // - // Discoverer does not know if an actual change happened. - // It does guarantee that it sends the new TargetGroup whenever a change happens. - // - // Discoverers should initially send a full set of all discoverable TargetGroups. - type Discoverer interface { - // Run hands a channel to the discovery provider(consul,dns etc) through which it can send - // updated target groups. - // Must returns if the context gets canceled. It should not close the update - // channel on returning. - Run(ctx context.Context, up chan<- []*targetgroup.Group) - } - -We really just have to implement one function, `Run(ctx context.Context, up chan<- []*targetgroup.Group)`. -This is the function the manager within the Adapter code will call within a goroutine. The Run function -makes use of a context to know when to exit, and is passed a channel for sending it's updates of target groups. - -Looking at the [Run](https://github.com/prometheus/prometheus/blob/main/documentation/examples/custom-sd/adapter-usage/main.go#L153-L211) -function within the provided example, we can see a few key things happening that we would need to do -in an implementation for another SD. We periodically make calls, in this case to Consul (for the sake -of this example, assume there isn’t already a built-in Consul SD implementation), and convert the -response to a set of `targetgroup.Group` structs. Because of the way Consul works, we have to first make -a call to get all known services, and then another call per service to get information about all the -backing instances. - -Note the comment above the loop that’s calling out to Consul for each service: - - // Note that we treat errors when querying specific consul services as fatal for for this - // iteration of the time.Tick loop. It's better to have some stale targets than an incomplete - // list of targets simply because there may have been a timeout. If the service is actually - // gone as far as consul is concerned, that will be picked up during the next iteration of - // the outer loop. - -With this we’re saying that if we can’t get information for all of the targets, it’s better to not -send any update at all than to send an incomplete update. We’d rather have a list of stale targets -for a small period of time and guard against false positives due to things like momentary network -issues, process restarts, or HTTP timeouts. If we do happen to get a response from Consul about every -target, we send all those targets on the channel. There is also a helper function `parseServiceNodes` -that takes the Consul response for an individual service and creates a target group from the backing -nodes with labels. - -## Using the current example - -Before starting to write your own custom SD implementation it’s probably a good idea to run the current -example after having a look at the code. 
For the sake of simplicity, I usually run both Consul and -Prometheus as Docker containers via docker-compose when working with the example code. - -`docker-compose.yml` - - version: '2' - services: - consul: - image: consul:latest - container_name: consul - ports: - - 8300:8300 - - 8500:8500 - volumes: - - ${PWD}/consul.json:/consul/config/consul.json - prometheus: - image: prom/prometheus:latest - container_name: prometheus - volumes: - - ./prometheus.yml:/etc/prometheus/prometheus.yml - ports: - - 9090:9090 - -`consul.json` - - { - "service": { - "name": "prometheus", - "port": 9090, - "checks": [ - { - "id": "metrics", - "name": "Prometheus Server Metrics", - "http": "http://prometheus:9090/metrics", - "interval": "10s" - } - ] - - } - } - -If we start both containers via docker-compose and then run the example main.go, we’ll query the Consul -HTTP API at localhost:8500, and the file_sd compatible file will be written as custom_sd.json. We could -configure Prometheus to pick up this file via the file_sd config: - - scrape_configs: - - job_name: "custom-sd" - scrape_interval: "15s" - file_sd_configs: - - files: - - /path/to/custom_sd.json diff --git a/content/blog/2018-08-09-prometheus-graduates-within-cncf.md b/content/blog/2018-08-09-prometheus-graduates-within-cncf.md deleted file mode 100644 index 45b5759b..00000000 --- a/content/blog/2018-08-09-prometheus-graduates-within-cncf.md +++ /dev/null @@ -1,27 +0,0 @@ ---- -title: Prometheus Graduates Within CNCF -created_at: 2018-08-09 -kind: article -author_name: Richard Hartmann ---- - -We are happy to announce that as of today, Prometheus graduates within the [CNCF](https://www.cncf.io/). - -Prometheus is the second project ever to make it to this tier. -By graduating Prometheus, CNCF shows that it's confident in our code and feature velocity, our maturity and stability, and our governance and community processes. -This also acts as an external verification of quality for anyone in internal discussions around choice of monitoring tool. - -Since reaching incubation level, a lot of things happened; some of which stand out: - -* We completely rewrote our storage back-end to support high churn in services -* We had a large push towards stability, especially with 2.3.2 -* We started a documentation push with a special focus on making Prometheus adoption and joining the community easier - -Especially the last point is important as we currently enter our fourth phase of adoption. These phases were adoption by - -1. Monitoring-centric users actively looking for the very best in monitoring -2. Hyperscale users facing a monitoring landscape which couldn't keep up with their scale -3. Companies from small to Fortune 50 redoing their monitoring infrastructure -4. Users lacking funding and/or resources to focus on monitoring, but hearing about the benefits of Prometheus from various places - -Looking into the future, we anticipate even wider adoption and remain committed to handling tomorrow's scale, today. 
diff --git a/content/blog/2018-08-23-interview-with-presslabs.md b/content/blog/2018-08-23-interview-with-presslabs.md deleted file mode 100644 index eb2545c7..00000000 --- a/content/blog/2018-08-23-interview-with-presslabs.md +++ /dev/null @@ -1,102 +0,0 @@ ---- -title: Interview with Presslabs -created_at: 2018-08-23 -kind: article -author_name: Brian Brazil ---- - -*Continuing our series of interviews with users of Prometheus, Mile Rosu -from Presslabs talks about their monitoring journey.* - -## Can you tell us about yourself and what Presslabs does? - -[Presslabs](https://www.presslabs.com/) is a high-performance managed WordPress -hosting platform targeted at publishers, Enterprise brands and digital agencies -which seek to offer a seamless experience to their website visitors, 100% of -the time. - -Recently, we have developed an innovative component to our core -product—WordPress Business Intelligence. Users can now get real—time, -actionable data in a comprehensive dashboard to support a short -issue-to-deployment process and continuous improvement of their sites. - -We support the seamless delivery of up to 2 billion pageviews per month, on a -fleet of 100 machines entirely dedicated to managed WordPress hosting for -demanding customers. - -We’re currently on our mission to bring the best experience to WordPress -publishers around the world. In this journey, Kubernetes facilitates our route -to an upcoming standard in high availability WordPress hosting infrastructure. - -## What was your pre-Prometheus monitoring experience? - -We started building our WordPress hosting platform back in 2009. At the time, -we were using Munin, an open-source system, network and infrastructure -monitoring that performed all the operations we needed: exposing, collecting, -aggregating, alerting and visualizing metrics. Although it performed well, -collecting once every minute and aggregating once every 5 minutes was too slow -for us, thus the output it generated wasn’t enough to properly analyze events -on our platform. - -Graphite was our second choice on the list, which solved the time challenge -addressed by Munin. We added collectd in to the mix to expose metrics, and used -Graphite to collect and aggregate it. - -Then we made Viz, a tool we’ve written in JavaScript & Python for visualisation -and alerting. However, we stopped actively using this service because -maintaining it was a lot of work, which Grafana substituted very well, since -its first version. - -![Presslab's Viz](/assets/blog/2018-08-23/viz-metrics.jpg) - -Since the second half of 2017, our Presslabs platform entered a large-scale -transition phase. One of the major changes was our migration to Kubernetes -which implied the need for a highly performing monitoring system. That’s when -we got our minds set on Prometheus which we’re using every since and plan to -integrate it across all our services on the new platform as a central piece for -extracting and exposing metrics. - -## Why did you decide to look at Prometheus? - -We started considering Prometheus in 2014 at Velocity Europe Barcelona after -speaking to a team of engineers at Soundcloud. The benefits they exposed were -compelling enough for us to give Prometheus a try. - -## How did you transition? - -We’re still in the transition process, thus we run in parallel the two -systems—Prometheus and the Graphite-collectd combo. For the client dashboard -and our core services we use Prometheus, yet, for the client sites we still use -Graphite-collectd. 
On top of both there is a Grafana for visualization. - - -![Presslab's Redis Grafana dashboards](/assets/blog/2018-08-23/prometheus-redis.jpg) - -The Prometheus docs, Github issues and the source-code were the go-to resources -for integrating Prometheus; of course, StackOverflow added some spice to the -process, which satisfied a lot of our curiosities. - -The only problem with Prometheus is that we can’t get long-term storage for -certain metrics. Our hosting infrastructure platform needs to store usage -metrics such as pageviews for at least a year. However, the Prometheus -landscape has improved a lot since we’re using it and we still have to test -possible solutions. - -## What improvements have you seen since switching? - -Since switching to Prometheus, we’ve noticed a significant decrease in resource -usage, compared to any other alternative we’ve used before. Moreover, it’s easy -to install since the auto-integration with Kubernetes saves a lot of time. - -## What do you think the future holds for Presslabs and Prometheus? - -We have big plans with Prometheus as we’re working on replacing the Prometheus -Helm chart we use right now with the Prometheus Operator on our new -infrastructure. The implementation will provide a segregation of the platform -customers as we are going to allocate a dedicated Prometheus server for a -limited number of websites. We’re already working on that as part of our effort -of Kubernetizing WordPress. - -We are also working on exporting WordPress metrics in the Prometheus format. -Grafana is here to stay, as it goes hand in hand with Prometheus to solve the -visualisation need. diff --git a/content/blog/2019-01-28-subquery-support.md b/content/blog/2019-01-28-subquery-support.md deleted file mode 100644 index 0fda3241..00000000 --- a/content/blog/2019-01-28-subquery-support.md +++ /dev/null @@ -1,61 +0,0 @@ ---- -title: Subquery Support -created_at: 2019-01-28 -kind: article -author_name: Ganesh Vernekar ---- - -## Introduction - -As the title suggests, a subquery is a part of a query, and allows you to do a range query within a query, which was not possible before. It has been a long-standing feature request: [prometheus/prometheus/1227](https://github.com/prometheus/prometheus/issues/1227). - -The [pull request](https://github.com/prometheus/prometheus/pull/4831) for subquery support was recently merged into Prometheus and will be available in Prometheus 2.7. Let’s learn more about it below. - -## Motivation - -Sometimes, there are cases when you want to spot a problem using `rate` with lower resolution/range (e.g. `5m`) while aggregating this data for higher range (e.g. `max_over_time` for `1h`). - -Previously, the above was not possible for a single *PromQL* query. If you wanted to have a range selection on a query for your alerting rules or graphing, it would require you to have a recording rule based on that query, and perform range selection on the metrics created by the recording rules. Example: `max_over_time(rate(my_counter_total[5m])[1h])`. - -When you want some quick results on data spanning days or weeks, it can be quite a bit of a wait until you have enough data in your recording rules before it can be used. Forgetting to add recording rules can be frustrating. And it would be tedious to create a recording rule for each step of a query. - -With subquery support, all the waiting and frustration is taken care of. 
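To make that concrete, the recording-rule workaround for the example above would have looked roughly like the rule file below (the group and rule names are made up), with the aggregation then written against the recorded series as `max_over_time(my_counter:rate5m[1h])`:

    groups:
      - name: example
        rules:
          - record: my_counter:rate5m
            expr: rate(my_counter_total[5m])

With subqueries, the same intent becomes a single expression, `max_over_time(rate(my_counter_total[5m])[1h:])`, with no need to wait for recorded data to accumulate.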
- -## Subqueries - -A subquery is similar to a [/api/v1/query_range](https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries) API call, but embedded within an instant query. The result of a subquery is a range vector. - -The Prometheus team arrived at a consensus for the syntax of subqueries at the Prometheus Dev Summit 2018 held in Munich. These are the [notes of the summit on subquery support](https://docs.google.com/document/d/1-C5PycocOZEVIPrmM1hn8fBelShqtqiAmFptoG4yK70/edit#heading=h.q32gdnoqz8t0), and a brief [design doc for the syntax](https://docs.google.com/document/d/1P_G87zN88YvmMr4iwLWygChMTZhai1L7S_c0awu1CAE/edit?usp=sharing) used for implementing subquery support. - - '[' ':' [ ] ']' [ offset ] - -* `` is equivalent to `query` field in `/query_range` API. -* `` and `offset ` is similar to a range selector. -* `` is optional, which is equivalent to `step` in `/query_range` API. - -When the resolution is not specified, the global evaluation interval is taken as the default resolution for the subquery. Also, the step of the subquery is aligned independently, and does not depend on the parent query's evaluation time. - -## Examples - -The subquery inside the `min_over_time` function returns the 5-minute rate of the `http_requests_total` metric for the past 30 minutes, at a resolution of 1 minute. This would be equivalent to a `/query_range` API call with `query=rate(http_requests_total[5m]), end=, start=-30m, step=1m`, and taking the min of all received values. - - min_over_time( rate(http_requests_total[5m])[30m:1m] ) - -Breakdown: - -* `rate(http_requests_total[5m])[30m:1m]` is the subquery, where `rate(http_requests_total[5m])` is the query to be executed. -* `rate(http_requests_total[5m])` is executed from `start=-30m` to `end=`, at a resolution of `1m`. Note that `start` time is aligned independently with step of `1m` (aligned steps are `0m 1m 2m 3m ...`). -* Finally the result of all the evaluations above are passed to `min_over_time()`. - -Below is an example of a nested subquery, and usage of default resolution. The innermost subquery gets the rate of `distance_covered_meters_total` over a range of time. We use that to get `deriv()` of the rates, again for a range of time. And finally take the max of all the derivatives. -Note that the `` time for the innermost subquery is relative to the evaluation time of the outer subquery on `deriv()`. - - max_over_time( deriv( rate(distance_covered_meters_total[1m])[5m:1m] )[10m:] ) - -In most cases you would require the default evaluation interval, which is the interval at which rules are evaluated by default. Custom resolutions will be helpful in cases where you want to compute less/more frequently, e.g. expensive queries which you might want to compute less frequently. - -## Epilogue - -Though subqueries are very convenient to use in place of recording rules, using them unnecessarily has performance implications. Heavy subqueries should eventually be converted to recording rules for efficiency. - -It is also not recommended to have subqueries inside a recording rule. Rather create more recording rules if you do need to use subqueries in a recording rule. 
diff --git a/content/blog/2019-02-06-interview-with-hostinger.md b/content/blog/2019-02-06-interview-with-hostinger.md deleted file mode 100644 index bd5f6c9e..00000000 --- a/content/blog/2019-02-06-interview-with-hostinger.md +++ /dev/null @@ -1,117 +0,0 @@ ---- -title: Interview with Hostinger -created_at: 2019-02-06 -kind: article -author_name: Brian Brazil ---- - -*Continuing our series of interviews with users of Prometheus, Donatas Abraitis -from Hostinger talks about their monitoring journey.* - -## Can you tell us about yourself and what Hostinger does? - -I’m Donatas Abraitis, a systems engineer at -[Hostinger](https://www.hostinger.com/). Hostinger is a hosting company as -the name implies. We have around 30 million clients since 2004 including -the [000webhost.com](https://www.000webhost.com/) project - free web hosting provider. - -## What was your pre-Prometheus monitoring experience? - -When Hostinger was quite a small company, only Nagios, Cacti, and Ganglia -existed at that time in the market as open source monitoring tools. This is -like telling young people what a floppy drive is, but Nagios and Cacti are -still in development cycle today. - -Even though no automation tools existed. Bash + Perl did the job. If you want -to scale your team and yourself, automation should never be ignored. No -automation - more human manual work involved. - -At that time there were around 150 physical servers. To compare, till this day -we have around 2000 servers including VMs and physical boxes. - -For networking gear, SNMP is still widely used. With the rise of "white box" -switches SNMP becomes less necessary, as regular tools can be installed. - -Instead of SNMP, you can run _node\_exporter_, or any other exporter inside the -switch to expose whatever metrics you need with the human-readable format. -Beautiful is better than ugly, right? - -We use CumulusOS which is in our case mostly x86 thus there is absolutely no -problem to run any kind of Linux stuff. - -## Why did you decide to look at Prometheus? - -In 2015 when we started automating everything that could be automated, -we introduced Prometheus to the ecosystem. In the beginning we had a single -monitoring box where Alertmanager, Pushgateway, Grafana, Graylog, and rsyslogd -were running. - -We also evaluated TICK (Telegraf/InfluxDB/Chronograf/Kapacitor) stack as well, -but we were not happy with them because of limited functionality at that time -and Prometheus looked in many ways simpler and more mature to implement. - -## How did you transition? - -During the transition period from the old monitoring stack (NCG - -Nagios/Cacti/Ganglia) we used both systems and finally, we rely only on -Prometheus. - - -We have about 25 community metric exporters + some custom written like -_lxc\_exporter_ in our fleet. Mostly we expose custom business-related metrics -using textfile collector. - - -## What improvements have you seen since switching? - -The new setup improved our time resolution from 5 minutes to 15 seconds, which -allows us to have fine-grained and quite deep analysis. Even Mean Time To -Detect(MTTD) was reduced by a factor of 4. - - -## What do you think the future holds for Hostinger and Prometheus? - -As we have grown our infrastructure N times since 2015 the main -bottleneck became Prometheus and Alertmanager. Our Prometheus eats about ~2TB -of disk space. Hence, if we restart or change the node under the maintenance we -miss monitoring data for a while. 
Currently we run Prometheus version 2.4.2, -but in the near future we have a plan to upgrade to 2.6. Especially we are -interested in -[performance](https://www.robustperception.io/new-features-in-prometheus-2-6-0) -and WAL related stuff features. Prometheus restart takes about 10-15 minutes. -Not acceptable. Another problem is that if a single location is down we miss -monitoring data as well. Thus we decided by implementing highly available -monitoring infrastructure: two Prometheus nodes, two Alertmanagers in separate -continents. - - -Our main visualization tool is Grafana. It's critically important that Grafana -could query the backup Prometheus node if the primary is down. This is easy as -that - put HAProxy in front and accept connections locally. - - -Another problem: how can we prevent users (developers and other internal staff) -from abusing dashboards overloading Prometheus nodes. - -Or the backup node if the primary is down - [thundering herds problem](https://en.wikipedia.org/wiki/Thundering_herd_problem). - -To achieve the desired state we gave a chance for -[Trickster](https://github.com/Comcast/trickster). This speeds-up dashboard -loading time incredible. It caches time series. In our case cache sits in -memory, but there are more choices where to store. Even when the primary goes -down and you refresh the dashboard, Trickster won't query the second node for -the time series which it has in memory cached. Trickster sits between Grafana -and Prometheus. It just talks with Prometheus API. - -![Hostinger Graphing Architecture](/assets/blog/2019-02-06/hostinger-arch.png) - -Prometheus nodes are independent while Alertmanager nodes form a cluster. If -both Alertmanagers see the same alert they will deduplicate and fire once -instead of multiple times. - -We have plans to run plenty of _blackbox\_exporters_ and monitor every Hostinger -client's website because anything that cannot be monitored cannot be assessed. - -We are looking forward to implementing more Prometheus nodes in the future so -sharding nodes between multiple Prometheus instances. This would allow us to -not have a bottleneck if one instance per region is down. diff --git a/content/blog/2019-06-18-interview-with-forgerock.md b/content/blog/2019-06-18-interview-with-forgerock.md deleted file mode 100644 index 961b2dd3..00000000 --- a/content/blog/2019-06-18-interview-with-forgerock.md +++ /dev/null @@ -1,88 +0,0 @@ ---- -title: Interview with ForgeRock -created_at: 2019-06-18 -kind: article -author_name: Simon Pasquier ---- - -*Continuing our series of interviews with users of Prometheus, Ludovic Poitou -from ForgeRock talks about their monitoring journey.* - -## Can you tell us about yourself and what ForgeRock does? - -I’m Ludovic Poitou, Director of Product Management at -[ForgeRock](https://www.forgerock.com/), based near Grenoble, France. ForgeRock -is an international identity and access management software company with more -than 500 employees, founded in Norway in 2010, now headquartered in San -Francisco, USA. We provide solutions to secure every online interaction with -customers, employees, devices and things. We have more than 800 customers from -finance companies to government services. - -## What was your pre-Prometheus monitoring experience? - -The ForgeRock Identity Platform has always offered monitoring interfaces. But -the platform is composed of 4 main products, each of them had different -options. 
For example, the Directory Services product offered monitoring -information through SNMP, JMX or LDAP, or even a RESTful API over HTTP in the -most recent versions. Other products only had REST or JMX. As a result, -monitoring the whole platform was complex and required tools that were able to -integrate those protocols. - -## Why did you decide to look at Prometheus? - -We needed to have a single and common interface for monitoring all our -products, but while keeping the existing ones for backward compatibility. - -We started to use DropWizard to collect the metrics in all products. At the -same time, we were starting to move these products to the cloud and run them in -Docker and Kubernetes. So, Prometheus became evident because of its integration -with Kubernetes, its simplicity for deployments, and the integration of -Grafana. We also looked at Graphite and while we also added support for it in -our products, it’s hardly being used by our customers. - -## How did you transition? - -Some of our products were already using the DropWizard library and we had decided -to use a common library in all products, so DropWizard was an obvious choice to -code the instrumentation. But very quickly, we faced an issue with the data -model. Prometheus interface uses dimensions, while we tend to have a -hierarchical model for metrics. We also started to use Micrometer and quickly -hit some constraints. So we ended up building a custom implementation to collect -our metrics using the Micrometer interface. We adapted DropWizard Metrics to -meet our requirements and made the adjustments to the DropWizard Prometheus -exporter. Now with a single instrumentation we can expose the metrics with -dimensions or hierarchically. Then we’ve started building sample Grafana -dashboards that our customer can install and customise to have their own -monitoring views and alerts. - -![Access Management ForgeRock's Grafana dashboard](/assets/blog/2019-06-18/access-management-grafana-dashboard.png) - -We do continue to offer the previous interfaces, but we strongly encourage our -customers to use Prometheus and Grafana. - -## What improvements have you seen since switching? - -The first benefits came from our Quality Engineering team. As they started to -test our Prometheus support and the different metrics, they started to enable -it by default on all stress and performance tests. They started to customise -the Grafana dashboards for the specific tests. Soon after, they started to -highlight and point at various metrics to explain some performance issues. - -When reproducing the problems in order to understand and fix them, our -engineering team used Prometheus as well and extended some dashboards. The -whole process gave us a better product and a much better understanding of -which metrics are important to monitor and visualise for customers. - -## What do you think the future holds for ForgeRock and Prometheus? - -ForgeRock has started an effort to offer its products and solutions as a -service. With that move, monitoring and alerting are becoming even more -critical, and of course, our monitoring infrastructure is based on Prometheus. -We currently have two levels of monitoring, one per tenant, where we use -Prometheus to collect data about one customer environment, and we can expose a -set of metrics for that customer. 
But we have also built a central Prometheus -service where metrics from all deployed tenants is pushed, so that our SRE team -can have a really good understanding of what and how all customers environments -are running. Overall I would say that Prometheus has become our main monitoring -service and it serves both our on-premise customers, and ourselves running our -solutions as a Service. diff --git a/content/blog/2019-10-10-remote-read-meets-streaming.md b/content/blog/2019-10-10-remote-read-meets-streaming.md deleted file mode 100644 index f769f852..00000000 --- a/content/blog/2019-10-10-remote-read-meets-streaming.md +++ /dev/null @@ -1,334 +0,0 @@ ---- -title: Remote Read Meets Streaming -created_at: 2019-10-10 -kind: article -author_name: Bartlomiej Plotka (@bwplotka) ---- - -The new Prometheus version 2.13.0 is available and as always, it includes many fixes and improvements. You can read what's changed [here](https://github.com/prometheus/prometheus/blob/release-2.13/CHANGELOG.md). -However, there is one feature that some projects and users were waiting for: [chunked, streamed version of remote read API](https://docs.google.com/document/d/1JqrU3NjM9HoGLSTPYOvR217f5HBKBiJTqikEB9UiJL0/edit#heading=h.3en2gbeew2sa). - -In this article I would like to present a deep dive of what we changed in the remote protocol, why it was changed and how to use it effectively. - -## Remote APIs - -Since version 1.x, Prometheus has the ability to interact directly with its storage using the [remote API](https://prometheus.io/docs/prometheus/latest/storage/#remote-storage-integrations). - -This API allows 3rd party systems to interact with metrics data through two methods: - -* **Write** - receive samples pushed by Prometheus -* **Read** - pull samples from Prometheus - -![Remote read and write architecture](/assets/blog/2019-10-08/remote_integrations.png) - -Both methods are using HTTP with messages encoded with [protobufs](https://github.com/protocolbuffers/protobuf). -The request and response for both methods are compressed using [snappy](https://github.com/google/snappy). - -### Remote Write - -This is the most popular way to replicate Prometheus data into 3rd party system. In this mode, Prometheus streams samples, -by periodically sending a batch of samples to the given endpoint. - -Remote write was recently improved massively in March with [WAL-based remote write](https://grafana.com/blog/2019/03/25/whats-new-in-prometheus-2.8-wal-based-remote-write/) which -improved the reliability and resource consumption. It is also worth to note that the remote write is supported by almost all 3rd -party integrations mentioned [here](https://prometheus.io/docs/operating/integrations/#remote-endpoints-and-storage). - -### Remote Read - -The read method is less common. It was added in [March 2017](https://github.com/prometheus/prometheus/commit/febed48703b6f82b54b4e1927c53ab6c46257c2f) (server side) and -has not seen significant development since then. - -The release of Prometheus 2.13.0 includes a fix for known resource bottlenecks in the Read API. This article will focus on these improvements. - -The key idea of the remote read is to allow querying Prometheus storage ([TSDB](https://github.com/prometheus/prometheus/tree/main/tsdb)) directly without PromQL evaluation. -It is similar to the [`Querier`](https://github.com/prometheus/prometheus/blob/91d7175eaac18b00e370965f3a8186cc40bf9f55/storage/interface.go#L53) interface -that the PromQL engine uses to retrieve data from storage. 
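As a side note, pointing a Prometheus server at such an endpoint only takes a short configuration block; a minimal sketch, assuming a hypothetical address that serves the remote read API:

```
remote_read:
  - url: "http://example.org:9090/api/v1/read"
```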
- -This essentially allows read access of time series in TSDB that Prometheus collected. The main use cases for remote read are: - -* Seamless Prometheus upgrades between different data formats of Prometheus, so having [Prometheus reading from another Prometheus](https://www.robustperception.io/accessing-data-from-prometheus-1-x-in-prometheus-2-0). -* Prometheus being able to read from 3rd party long term storage systems e.g InfluxDB. -* 3rd party system querying data from Prometheus e.g [Thanos](https://thanos.io). - -The remote read API exposes a simple HTTP endpoint that expects following protobuf payload: - -``` -message ReadRequest { - repeated Query queries = 1; -} - -message Query { - int64 start_timestamp_ms = 1; - int64 end_timestamp_ms = 2; - repeated prometheus.LabelMatcher matchers = 3; - prometheus.ReadHints hints = 4; -} -``` - -With this payload, the client can request certain series matching given `matchers` and time range with `end` and `start`. - -The response is equally simple: - -``` -message ReadResponse { - // In same order as the request's queries. - repeated QueryResult results = 1; -} - -message Sample { - double value = 1; - int64 timestamp = 2; -} - -message TimeSeries { - repeated Label labels = 1; - repeated Sample samples = 2; -} - -message QueryResult { - repeated prometheus.TimeSeries timeseries = 1; -} -``` - -Remote read returns the matched time series with **raw** samples of value and timestamp. - -## Problem Statement - -There were two key problems for such a simple remote read. It was easy to use and understand, but there were no -streaming capabilities within single HTTP request for the protobuf format we defined. Secondly, the response was -including raw samples (`float64` value and `int64` timestamp) instead of -an encoded, compressed batch of samples called "chunks" that are used to store metrics inside TSDB. - -The server algorithm for remote read without streaming was: - -1. Parse request. -1. Select metrics from TSDB. -1. For all decoded series: - * For all samples: - * Add to response protobuf -1. Marshal response. -1. Snappy compress. -1. Send back the HTTP response. - -The whole response of the remote read had to be buffered in a raw, uncompressed format in order to marshsal it in a -potentially huge protobuf message before sending it to the client. The whole response has to then be fully buffered in the client again to be able -to unmarshal it from the received protobuf. Only after that the client was able to use raw samples. - -What does it mean? It means that requests for, let's say, only 8 hours that matches 10,000 series can take up to **2.5GB** of memory allocated by both client and server each! - -Below is memory usage metric for both Prometheus and [Thanos Sidecar](https://thanos.io/components/sidecar.md/) (remote read client) during remote read request time: - -![Prometheus 2.12.0: RSS of single read 8h of 10k series](/assets/blog/2019-10-08/10kseries8hours-2.12.png) - -![Prometheus 2.12.0: Heap-only allocations of single read 8h of 10k series](/assets/blog/2019-10-08/10series8hours-2.12-allocs.png) - -It is worth to noting that querying 10,000 series is not a great idea, even for Prometheus native HTTP `query_range` endpoint, -as your browser simply will not be happy fetching, storing and rendering hundreds of megabytes of data. Additionally, -for dashboards and rendering purposes it is not practical to have that much data, as humans can't possibly read it. -That is why usually we craft queries that have no more than 20 series. 
- -This is great, but a very common technique is to compose queries in such way that query returns **aggregated** 20 series, -however underneath the query engine has to touch potentially thousands of series to evaluate the response (e.g when using [aggregators](https://prometheus.io/docs/prometheus/latest/querying/operators/#aggregation-operators)). -That is why systems like Thanos, which among other data, uses TSDB data from remote read, it's very often the case that the request is heavy. - -## Solution - -To explain the solution to this problem, it is helpful to understand how Prometheus iterates over the data when queried. -The core concept can be shown in [`Querier's`](https://github.com/prometheus/prometheus/blob/91d7175eaac18b00e370965f3a8186cc40bf9f55/storage/interface.go#L53) -`Select` method returned type called `SeriesSet`. The interface is presented below: - -``` -// SeriesSet contains a set of series. -type SeriesSet interface { - Next() bool - At() Series - Err() error -} - -// Series represents a single time series. -type Series interface { - // Labels returns the complete set of labels identifying the series. - Labels() labels.Labels - // Iterator returns a new iterator of the data of the series. - Iterator() SeriesIterator -} - -// SeriesIterator iterates over the data of a time series. -type SeriesIterator interface { - // At returns the current timestamp/value pair. - At() (t int64, v float64) - // Next advances the iterator by one. - Next() bool - Err() error -} -``` - -These sets of interfaces allow "streaming" flow inside the process. We no longer have to have a precomputed list of series that hold samples. -With this interface each `SeriesSet.Next()` implementation can fetch series on demand. -In a similar way, within each series. we can also dynamically fetch each sample respectively via `SeriesIterator.Next`. - -With this contract, Prometheus can minimize allocated memory, because the PromQL engine can iterate over samples optimally to evaluate the query. -In the same way TSDB implements `SeriesSet` in a way that fetches the series optimally from blocks stored in the filesystem one by one, minimizing allocations. - -This is important for the remote read API, as we can reuse the same pattern of streaming using iterators by sending to the -client a piece of the response in a form of few chunks for the single series. -Because protobuf has no native delimiting logic, we [`extended`](https://github.com/prometheus/prometheus/pull/5703/files#diff-7bdb1c90d5a59fc5ead16457e6d2b038R44) -proto definition to allow sending **set of small protocol buffer messages** instead of a single, huge one. We called -this mode `STREAMED_XOR_CHUNKS` remote read while old one is called `SAMPLES`. Extended protocol means that Prometheus -does not need to buffer the whole response anymore. Instead, it can work on each series sequentially and send a single frame per -each `SeriesSet.Next` or batch of `SeriesIterator.Next` iterations, potentially reusing the same memory pages for next series! - -Now, the response of `STREAMED_XOR_CHUNKS` remote read is a set of Protobuf messages (frames) as presented below: - -``` -// ChunkedReadResponse is a response when response_type equals STREAMED_XOR_CHUNKS. -// We strictly stream full series after series, optionally split by time. This means that a single frame can contain -// partition of the single series, but once a new series is started to be streamed it means that no more chunks will -// be sent for previous one. 
-message ChunkedReadResponse { - repeated prometheus.ChunkedSeries chunked_series = 1; -} - -// ChunkedSeries represents single, encoded time series. -message ChunkedSeries { - // Labels should be sorted. - repeated Label labels = 1 [(gogoproto.nullable) = false]; - // Chunks will be in start time order and may overlap. - repeated Chunk chunks = 2 [(gogoproto.nullable) = false]; -} -``` - -As you can see the frame does not include raw samples anymore. That's the second improvement we did: We send in the message -samples batched in chunks (see [this video](https://www.youtube.com/watch?v=b_pEevMAC3I) to learn more about chunks), -which are exactly the same chunks we store in the TSDB. - -We ended up with the following server algorithm: - -1. Parse request. -1. Select metrics from TSDB. -1. For all series: - * For all samples: - * Encode into chunks - * if the frame is >= 1MB; break - * Marshal `ChunkedReadResponse` message. - * Snappy compress - * Send the message - -You can find full design [here](https://docs.google.com/document/d/1JqrU3NjM9HoGLSTPYOvR217f5HBKBiJTqikEB9UiJL0/edit#). - -## Benchmarks - -How does the performance of this new approach compare to the old solution? - -Let's compare remote read characteristics between Prometheus `2.12.0` and `2.13.0`. As for the initial results presented -at the beginning of this article, I was using Prometheus as a server, and a Thanos sidecar as a client of remote read. -I was invoking testing remote read request by running gRPC call against Thanos sidecar using `grpcurl`. -Test was performed from my laptop (Lenovo X1 16GB, i7 8th) with Kubernetes in docker (using [kind](https://github.com/kubernetes-sigs/kind)). - -The data was artificially generated, and represents highly dynamic 10,000 series (worst case scenario). - -The full test bench is available in [thanosbench repo](https://github.com/thanos-io/thanosbench/blob/master/benchmarks/remote-read/README.md). - -### Memory - -#### Without streaming - -![Prometheus 2.12.0: Heap-only allocations of single read 8h of 10k series](/assets/blog/2019-10-08/10series8hours-2.12-allocs.png) - -#### With streaming - -![Prometheus 2.13.0: Heap-only allocations of single read 8h of 10k series](/assets/blog/2019-10-08/10series8hours-2.13-allocs.png) - -Reducing memory was the key item we aimed for with our solution. Instead of allocating GBs of memory, Prometheus buffers -roughly 50MB during the whole request, whereas for Thanos there is only a marginal memory use. Thanks to the streamed -Thanos gRPC StoreAPI, sidecar is now a very simple proxy. - -Additionally, I tried different time ranges and number of series, but as expected I kept seeing -a maximum of 50MB in allocations for Prometheus and nothing really visible for Thanos. This proves that our remote read -uses **constant memory per request no matter how many samples you ask for**. Allocated memory per request is also drastically less -influenced by the cardinality of the data, so number of series fetched like it used to be. - -This allowing easier capacity planning against user traffic, with help of the concurrency limit. - -### CPU - -#### Without streaming - -![Prometheus 2.12.0: CPU time of single read 8h of 10k series](/assets/blog/2019-10-08/10kseries8hours-2.12-cpu.png) - -#### With streaming - -![Prometheus 2.13.0: CPU time of single read 8h of 10k series](/assets/blog/2019-10-08/10kseries8hours-2.13-cpu.png) - -During my tests, CPU usage was also improved, with 2x less CPU time used. 
### Latency

We also managed to reduce remote read request latency, thanks to streaming and less encoding.

Remote read request latency for an 8h range with 10,000 series:

| | 2.12.0: avg time | 2.13.0: avg time |
|------|------------------|------------------|
| real | 0m34.701s | 0m8.164s |
| user | 0m7.324s | 0m8.181s |
| sys | 0m1.172s | 0m0.749s |

And with a 2h time range:

| | 2.12.0: avg time | 2.13.0: avg time |
|------|------------------|------------------|
| real | 0m10.904s | 0m4.145s |
| user | 0m6.236s | 0m4.322s |
| sys | 0m0.973s | 0m0.536s |

On top of the ~2.5x lower latency, the response is streamed immediately, whereas in the non-streamed version the client latency was 27s (`real` minus `user` time) spent purely on processing and marshaling on the Prometheus and Thanos sides.

## Compatibility

The remote read protocol was extended in a backward and forward compatible way. This is possible thanks to protobuf and the `accepted_response_types` field, which is ignored by older servers:

* Prometheus before v2.13.0 will safely ignore the `accepted_response_types` field provided by newer clients and assume `SAMPLES` mode.
* Prometheus v2.13.0 and later will default to the `SAMPLES` mode for older clients that don't provide the `accepted_response_types` parameter.

## Usage

To use the new, streamed remote read in Prometheus v2.13.0, a 3rd party system has to add `accepted_response_types = [STREAMED_XOR_CHUNKS]` to the request.

Then Prometheus will stream `ChunkedReadResponse` messages instead of the old single message. Each `ChunkedReadResponse` message is preceded by a varint-encoded size and a fixed-size, big-endian uint32 CRC32 Castagnoli checksum.

For Go it is recommended to use the [ChunkedReader](https://github.com/prometheus/prometheus/blob/48b2c9c8eae2d4a286d8e9384c2918aefd41d8de/storage/remote/chunked.go#L103) to read directly from the stream.

Note that the `storage.remote.read-sample-limit` flag no longer applies to `STREAMED_XOR_CHUNKS`; `storage.remote.read-concurrent-limit` works as before.

There is also a new option, `storage.remote.read-max-bytes-in-frame`, which controls the maximum size of each message. It is advised to keep the 1MB default, as Google recommends keeping protobuf messages [not larger than 1MB](https://developers.google.com/protocol-buffers/docs/techniques#large-data).

As mentioned before, [Thanos](https://thanos.io) gains a lot from this improvement. Streamed remote read was added in `v0.7.0`, so that or any following version will use streamed remote read automatically whenever Prometheus 2.13.0 or newer is used with the Thanos sidecar.
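To illustrate the framing, here is a simplified, hypothetical Go sketch of reading a single frame from the response body; the helper name is made up, and real code should prefer the `ChunkedReader` linked above:

```
package remoteread

import (
    "bufio"
    "encoding/binary"
    "errors"
    "hash/crc32"
    "io"
)

var castagnoli = crc32.MakeTable(crc32.Castagnoli)

// readFrame reads one frame as described above: a varint-encoded payload size,
// a big-endian uint32 CRC32 (Castagnoli) checksum of the payload, and then the
// payload itself, which is a marshaled ChunkedReadResponse message.
func readFrame(r *bufio.Reader) ([]byte, error) {
    size, err := binary.ReadUvarint(r)
    if err != nil {
        return nil, err
    }
    var crcBuf [4]byte
    if _, err := io.ReadFull(r, crcBuf[:]); err != nil {
        return nil, err
    }
    data := make([]byte, size)
    if _, err := io.ReadFull(r, data); err != nil {
        return nil, err
    }
    if crc32.Checksum(data, castagnoli) != binary.BigEndian.Uint32(crcBuf[:]) {
        return nil, errors.New("remote read: CRC32 checksum mismatch")
    }
    return data, nil
}
```

The returned bytes would then be unmarshaled into a `ChunkedReadResponse` protobuf message.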
- -## Next Steps - -Release 2.13.0 introduces extended remote read and Prometheus server side implementation, However at the moment of writing -there are still few items to do in order to fully get advantage from the extended remote read protocol: - -* Support for client side of Prometheus remote read: [In progress](https://github.com/prometheus/prometheus/issues/5926) -* Avoid re-encoding of chunks for blocks during remote read: [In progress](https://github.com/prometheus/prometheus/pull/5882) - -## Summary - -To sum up, the main benefits of chunked, streaming of remote read are: - -* Both client and server are capable of using **practically constant memory size per request**. This is because the Prometheus sends just single small frames one by one instead of the whole response during remote read. This massively helps with -capacity planning, especially for a non-compressible resource like memory. -* Prometheus server does not need to decode chunks to raw samples anymore during remote read. The same for client side for -encoding, **if** the system is reusing native TSDB XOR compression (like Thanos does). - -As always, if you have any issues or feedback, feel free to submit a ticket on GitHub or ask questions on the mailing list. diff --git a/content/blog/2021-02-17-introducing-feature-flags.md b/content/blog/2021-02-17-introducing-feature-flags.md deleted file mode 100644 index 2ebc5489..00000000 --- a/content/blog/2021-02-17-introducing-feature-flags.md +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: Introducing Feature Flags -created_at: 2021-02-17 -kind: article -author_name: Ganesh Vernekar ---- - -We have always made hard promises around stability and breaking changes following the SemVer model. That will remain to be the case. - -As we want to be bolder in experimentation, we are planning to use feature flags more. - -Starting with v2.25.0, we have introduced a new section called [disabled features](https://prometheus.io/docs/prometheus/latest/disabled_features/) which have the features hidden behind the `--enable-feature` flag. You can expect more and more features getting added to this section in the future releases. - -The features in this list are considered experimental and comes with following considerations as long as they are still behind `--enable-feature`: - -1. API specs may change if the feature has any API (web API, code interfaces, etc.). -2. The behavior of the feature may change. -3. They may break some assumption that you might have had about Prometheus. - * For example the assumption that a query does not look ahead of the evaluation time for samples, which will be broken by `@` modifier and negative offset. -4. They may be unstable but we will try to keep them stable, of course. - -These considerations allow us to be more bold with experimentation and to innovate more quickly. Once any feature gets widely used and is considered stable with respect to its API, behavior, and implementation, they may be moved from disabled features list and enabled by default . If we find any feature to be not worth it or broken, we may completely remove it. If enabling some feature is considered a big breaking change for Prometheus, it would stay disabled until the next major release. - -Keep an eye out on this list on every release, and do try them out! 
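Trying one of them out is just a matter of starting Prometheus with the corresponding flag. For example, enabling the `@` modifier feature described in the next post looks like this (a minimal sketch, assuming the binary sits in the current directory):

    ./prometheus --enable-feature=promql-at-modifier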
\ No newline at end of file diff --git a/content/blog/2021-02-18-introducing-the-@-modifier.md b/content/blog/2021-02-18-introducing-the-@-modifier.md deleted file mode 100644 index bdb201d8..00000000 --- a/content/blog/2021-02-18-introducing-the-@-modifier.md +++ /dev/null @@ -1,32 +0,0 @@ ---- -title: Introducing the '@' Modifier -created_at: 2021-02-18 -kind: article -author_name: Ganesh Vernekar ---- - -Have you ever selected the top 10 time series for something, but instead of 10 you got 100? If yes, this one is for you. Let me walk you through what the underlying problem is and how I fixed it. - -Currently, the `topk()` query only makes sense as an instant query where you get exactly `k` results, but when you run it as a range query, you can get much more than `k` results since every step is evaluated independently. This `@` modifier lets you fix the ranking for all the steps in a range query. - -In Prometheus v2.25.0, we have introduced a new PromQL modifier `@`. Similar to how `offset` modifier lets you offset the evaluation of vector selector, range vector selector, and subqueries by a fixed duration relative to the evaluation time, the `@` modifier lets you fix the evaluation for those selectors irrespective of the query evaluation time. The credits for this syntax goes to [Björn Rabenstein](https://github.com/beorn7/). - - @ - @ - @ - -The `` is a unix timestamp and described with a float literal. - -For example, the query `http_requests_total @ 1609746000` returns the value of `http_requests_total` at `2021-01-04T07:40:00+00:00`. The query `rate(http_requests_total[5m] @ 1609746000)` returns the 5-minute rate of `http_requests_total` at the same time. - -Additionally, `start()` and `end()` can also be used as values for the `@` modifier as special values. For a range query, they resolve to the start and end of the range query respectively and remain the same for all steps. For an instant query, `start()` and `end()` both resolve to the evaluation time. - -Coming back to the `topk()` fix, the following query plots the `1m` rate of `http_requests_total` of those series whose last `1h` rate was among the top 5. Hence now you can make sense of the `topk()` even as a range query where it plots exactly `k` results. - - rate(http_requests_total[1m]) # This acts like the actual selector. - and - topk(5, rate(http_requests_total[1h] @ end())) # This acts like a ranking function which filters the selector. - -Similarly, the `topk()` ranking can be replaced with other functions like `histogram_quantile()` which only makes sense as an instant query right now. `rate()` can be replaced with `_over_time()`, etc. Let us know how you use this new modifier! - -`@` modifier is disabled by default and can be enabled using the flag `--enable-feature=promql-at-modifier`. Learn more about feature flags in [this blog post](https://prometheus.io/blog/2021/02/17/introducing-feature-flags/) and find the docs for `@` modifier [here](https://prometheus.io/docs/prometheus/latest/querying/basics/#modifier). 
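As one more illustration of the special values: used in a range query, the selector below is evaluated at the start of the selected range for every step, so each series is plotted as a flat line at its initial value.

    http_requests_total @ start()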
\ No newline at end of file diff --git a/content/blog/2021-05-03-introducing-prometheus-conformance-program.md b/content/blog/2021-05-03-introducing-prometheus-conformance-program.md deleted file mode 100644 index dd3610a7..00000000 --- a/content/blog/2021-05-03-introducing-prometheus-conformance-program.md +++ /dev/null @@ -1,30 +0,0 @@ ---- -title: Introducing the Prometheus Conformance Program -created_at: 2021-05-03 -kind: article -author_name: Richard "RichiH" Hartmann ---- - -Prometheus is the standard for metric monitoring in the cloud native space and beyond. To ensure interoperability, to protect users from suprises, and to enable more parallel innovation, the Prometheus project is introducing the [Prometheus Conformance Program](https://github.com/cncf/prometheus-conformance) with the help of [CNCF](https://www.cncf.io/) to certify component compliance and Prometheus compatibility. - -The CNCF Governing Board is expected to formally review and approve the program during their next meeting. We invite the wider community to help improve our tests in this ramp-up phase. - -With the help of our [extensive and expanding test suite](https://github.com/prometheus/compliance), projects and vendors can determine the compliance to our specifications and compatibility within the Prometheus ecosystem. - -At launch, we are offering compliance tests for three components: - -* PromQL (needs manual interpretation, somewhat complete) -* Remote Read-Write (fully automated, WIP) -* OpenMetrics (partially automatic, somewhat complete, will need questionnaire) - -We plan to add more components. Tests for Prometheus Remote Read or our data storage/TSDB are likely as next additions. We explicitly invite everyone to extend and improve existing tests, and to submit new ones. - -The Prometheus Conformance Program works as follows: - -For every component, there will be a mark "foo YYYY-MM compliant", e.g. "OpenMetrics 2021-05 compliant", "PromQL 2021-05 compliant", and "Prometheus Remote Write 2021-05 compliant". Any project or vendor can submit their compliance documentation. Upon reaching 100%, the mark will be granted. - -For any complete software, there will be a mark "Prometheus x.y compatible", e.g. "Prometheus 2.26 compatible". Relevant component compliance scores are multiplied. Upon reaching 100%, the mark will be granted. - -As an example, the Prometheus Agent supports both OpenMetrics and Prometheus Remote Write, but not PromQL. As such, only compliance scores for OpenMetrics and Prometheus Remote Write are multiplied. - -Both compliant and compatible marks are valid for 2 minor releases or 12 weeks, whichever is longer. diff --git a/content/blog/2021-05-04-prometheus-conformance-remote-write-compliance.md b/content/blog/2021-05-04-prometheus-conformance-remote-write-compliance.md deleted file mode 100644 index fefbc91d..00000000 --- a/content/blog/2021-05-04-prometheus-conformance-remote-write-compliance.md +++ /dev/null @@ -1,139 +0,0 @@ ---- -title: "Prometheus Conformance Program: Remote Write Compliance Test Results" -created_at: 2021-05-05 -kind: article -author_name: Richard "RichiH" Hartmann ---- - -As [announced by CNCF](https://www.cncf.io/blog/2021/05/03/announcing-the-intent-to-form-the-prometheus-conformance-program/) and by [ourselves](https://prometheus.io/blog/2021/05/03/introducing-prometheus-conformance-program/), we're starting a Prometheus conformance program. 
- -To give everyone an overview of where the ecosystem is before running tests officially, we wanted to show off the newest addition to our happy little bunch of test suites: The Prometheus [Remote Write](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#remote_write) compliance test suite tests the sender part of the Remote Write protocol against our [specification](https://docs.google.com/document/d/1LPhVRSFkGNSuU1fBd81ulhsCPR4hkSZyyBj1SZ8fWOM). - -During Monday's [PromCon](https://promcon.io/2021-online/), [Tom Wilkie](https://twitter.com/tom_wilkie) presented the test results from the time of recording a few weeks ago. In the live section, he already had an [update](https://docs.google.com/presentation/d/1RcN58LlS3V5tYCUsftqUvNuCpCsgGR2P7-GoH1MVL0Q/edit#slide=id.gd1789c7f7c_0_0). Two days later we have two more updates: -The addition of the [observability pipeline tool Vector](https://github.com/prometheus/compliance/pull/24), as well as [new versions of existing systems](https://github.com/prometheus/compliance/pull/25). - -So, without further ado, the current results in alphabetical order are: - -| Sender | Version | Score -|--------|---------|------ -| Grafana Agent | 0.13.1 | **100%** -| Prometheus | 2.26.0 | **100%** -| OpenTelemetry Collector | 0.26.0 | **41%** -| Telegraf | 1.18.2 | **65%** -| Timber Vector | 0.13.1 | **35%** -| VictoriaMetrics Agent | 1.59.0 | **76%** - -The raw results are: - -```` ---- PASS: TestRemoteWrite/grafana (0.01s) - --- PASS: TestRemoteWrite/grafana/Counter (10.02s) - --- PASS: TestRemoteWrite/grafana/EmptyLabels (10.02s) - --- PASS: TestRemoteWrite/grafana/Gauge (10.02s) - --- PASS: TestRemoteWrite/grafana/Headers (10.02s) - --- PASS: TestRemoteWrite/grafana/Histogram (10.02s) - --- PASS: TestRemoteWrite/grafana/HonorLabels (10.02s) - --- PASS: TestRemoteWrite/grafana/InstanceLabel (10.02s) - --- PASS: TestRemoteWrite/grafana/Invalid (10.02s) - --- PASS: TestRemoteWrite/grafana/JobLabel (10.02s) - --- PASS: TestRemoteWrite/grafana/NameLabel (10.02s) - --- PASS: TestRemoteWrite/grafana/Ordering (26.12s) - --- PASS: TestRemoteWrite/grafana/RepeatedLabels (10.02s) - --- PASS: TestRemoteWrite/grafana/SortedLabels (10.02s) - --- PASS: TestRemoteWrite/grafana/Staleness (10.01s) - --- PASS: TestRemoteWrite/grafana/Summary (10.01s) - --- PASS: TestRemoteWrite/grafana/Timestamp (10.01s) - --- PASS: TestRemoteWrite/grafana/Up (10.02s) ---- PASS: TestRemoteWrite/prometheus (0.01s) - --- PASS: TestRemoteWrite/prometheus/Counter (10.02s) - --- PASS: TestRemoteWrite/prometheus/EmptyLabels (10.02s) - --- PASS: TestRemoteWrite/prometheus/Gauge (10.02s) - --- PASS: TestRemoteWrite/prometheus/Headers (10.02s) - --- PASS: TestRemoteWrite/prometheus/Histogram (10.02s) - --- PASS: TestRemoteWrite/prometheus/HonorLabels (10.02s) - --- PASS: TestRemoteWrite/prometheus/InstanceLabel (10.02s) - --- PASS: TestRemoteWrite/prometheus/Invalid (10.02s) - --- PASS: TestRemoteWrite/prometheus/JobLabel (10.02s) - --- PASS: TestRemoteWrite/prometheus/NameLabel (10.03s) - --- PASS: TestRemoteWrite/prometheus/Ordering (24.99s) - --- PASS: TestRemoteWrite/prometheus/RepeatedLabels (10.02s) - --- PASS: TestRemoteWrite/prometheus/SortedLabels (10.02s) - --- PASS: TestRemoteWrite/prometheus/Staleness (10.02s) - --- PASS: TestRemoteWrite/prometheus/Summary (10.02s) - --- PASS: TestRemoteWrite/prometheus/Timestamp (10.02s) - --- PASS: TestRemoteWrite/prometheus/Up (10.02s) ---- FAIL: TestRemoteWrite/otelcollector (0.00s) - --- FAIL: 
TestRemoteWrite/otelcollector/Counter (10.01s) - --- FAIL: TestRemoteWrite/otelcollector/Histogram (10.01s) - --- FAIL: TestRemoteWrite/otelcollector/InstanceLabel (10.01s) - --- FAIL: TestRemoteWrite/otelcollector/Invalid (10.01s) - --- FAIL: TestRemoteWrite/otelcollector/JobLabel (10.01s) - --- FAIL: TestRemoteWrite/otelcollector/Ordering (13.54s) - --- FAIL: TestRemoteWrite/otelcollector/RepeatedLabels (10.01s) - --- FAIL: TestRemoteWrite/otelcollector/Staleness (10.01s) - --- FAIL: TestRemoteWrite/otelcollector/Summary (10.01s) - --- FAIL: TestRemoteWrite/otelcollector/Up (10.01s) - --- PASS: TestRemoteWrite/otelcollector/EmptyLabels (10.01s) - --- PASS: TestRemoteWrite/otelcollector/Gauge (10.01s) - --- PASS: TestRemoteWrite/otelcollector/Headers (10.01s) - --- PASS: TestRemoteWrite/otelcollector/HonorLabels (10.01s) - --- PASS: TestRemoteWrite/otelcollector/NameLabel (10.01s) - --- PASS: TestRemoteWrite/otelcollector/SortedLabels (10.01s) - --- PASS: TestRemoteWrite/otelcollector/Timestamp (10.01s) ---- FAIL: TestRemoteWrite/telegraf (0.01s) - --- FAIL: TestRemoteWrite/telegraf/EmptyLabels (14.60s) - --- FAIL: TestRemoteWrite/telegraf/HonorLabels (14.61s) - --- FAIL: TestRemoteWrite/telegraf/Invalid (14.61s) - --- FAIL: TestRemoteWrite/telegraf/RepeatedLabels (14.61s) - --- FAIL: TestRemoteWrite/telegraf/Staleness (14.59s) - --- FAIL: TestRemoteWrite/telegraf/Up (14.60s) - --- PASS: TestRemoteWrite/telegraf/Counter (14.61s) - --- PASS: TestRemoteWrite/telegraf/Gauge (14.60s) - --- PASS: TestRemoteWrite/telegraf/Headers (14.61s) - --- PASS: TestRemoteWrite/telegraf/Histogram (14.61s) - --- PASS: TestRemoteWrite/telegraf/InstanceLabel (14.61s) - --- PASS: TestRemoteWrite/telegraf/JobLabel (14.61s) - --- PASS: TestRemoteWrite/telegraf/NameLabel (14.60s) - --- PASS: TestRemoteWrite/telegraf/Ordering (14.61s) - --- PASS: TestRemoteWrite/telegraf/SortedLabels (14.61s) - --- PASS: TestRemoteWrite/telegraf/Summary (14.60s) - --- PASS: TestRemoteWrite/telegraf/Timestamp (14.61s) ---- FAIL: TestRemoteWrite/vector (0.01s) - --- FAIL: TestRemoteWrite/vector/Counter (10.02s) - --- FAIL: TestRemoteWrite/vector/EmptyLabels (10.01s) - --- FAIL: TestRemoteWrite/vector/Headers (10.02s) - --- FAIL: TestRemoteWrite/vector/HonorLabels (10.02s) - --- FAIL: TestRemoteWrite/vector/InstanceLabel (10.02s) - --- FAIL: TestRemoteWrite/vector/Invalid (10.02s) - --- FAIL: TestRemoteWrite/vector/JobLabel (10.01s) - --- FAIL: TestRemoteWrite/vector/Ordering (13.01s) - --- FAIL: TestRemoteWrite/vector/RepeatedLabels (10.02s) - --- FAIL: TestRemoteWrite/vector/Staleness (10.02s) - --- FAIL: TestRemoteWrite/vector/Up (10.02s) - --- PASS: TestRemoteWrite/vector/Gauge (10.02s) - --- PASS: TestRemoteWrite/vector/Histogram (10.02s) - --- PASS: TestRemoteWrite/vector/NameLabel (10.02s) - --- PASS: TestRemoteWrite/vector/SortedLabels (10.02s) - --- PASS: TestRemoteWrite/vector/Summary (10.02s) - --- PASS: TestRemoteWrite/vector/Timestamp (10.02s) ---- FAIL: TestRemoteWrite/vmagent (0.01s) - --- FAIL: TestRemoteWrite/vmagent/Invalid (20.66s) - --- FAIL: TestRemoteWrite/vmagent/Ordering (22.05s) - --- FAIL: TestRemoteWrite/vmagent/RepeatedLabels (20.67s) - --- FAIL: TestRemoteWrite/vmagent/Staleness (20.67s) - --- PASS: TestRemoteWrite/vmagent/Counter (20.67s) - --- PASS: TestRemoteWrite/vmagent/EmptyLabels (20.64s) - --- PASS: TestRemoteWrite/vmagent/Gauge (20.66s) - --- PASS: TestRemoteWrite/vmagent/Headers (20.64s) - --- PASS: TestRemoteWrite/vmagent/Histogram (20.66s) - --- PASS: TestRemoteWrite/vmagent/HonorLabels 
(20.66s) - --- PASS: TestRemoteWrite/vmagent/InstanceLabel (20.66s) - --- PASS: TestRemoteWrite/vmagent/JobLabel (20.66s) - --- PASS: TestRemoteWrite/vmagent/NameLabel (20.66s) - --- PASS: TestRemoteWrite/vmagent/SortedLabels (20.66s) - --- PASS: TestRemoteWrite/vmagent/Summary (20.66s) - --- PASS: TestRemoteWrite/vmagent/Timestamp (20.67s) - --- PASS: TestRemoteWrite/vmagent/Up (20.66s) -```` - -We'll work more on improving our test suites, both by adding more tests & by adding new test targets. If you want to help us, consider adding more senders from [our list of Remote Write integrations](https://prometheus.io/docs/operating/integrations/#remote-endpoints-and-storage). diff --git a/content/blog/2021-06-10-on-ransomware-naming.md b/content/blog/2021-06-10-on-ransomware-naming.md deleted file mode 100644 index 61ee68b2..00000000 --- a/content/blog/2021-06-10-on-ransomware-naming.md +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: "On Ransomware Naming" -created_at: 2021-06-10 -kind: article -author_name: Richard "RichiH" Hartmann ---- - -As per Oscar Wilde, imitation is the sincerest form of flattery. - -The names "Prometheus" and "Thanos" have [recently been taken up by a ransomware group](https://cybleinc.com/2021/06/05/prometheus-an-emerging-apt-group-using-thanos-ransomware-to-target-organizations/). There's not much we can do about that except to inform you that this is happening. There's not much you can do either, except be aware that this is happening. - -While we do *NOT* have reason to believe that this group will try to trick anyone into downloading fake binaries of our projects, we still recommend following common supply chain & security practices. When deploying software, do it through one of these mechanisms: - -* Binary downloads from the official release pages for [Prometheus](https://github.com/prometheus/prometheus/releases) and [Thanos](https://github.com/thanos-io/thanos/releases), with verification of the provided checksums. -* Docker downloads from official project-controlled repositories: - * Prometheus: https://quay.io/repository/prometheus/prometheus and https://hub.docker.com/r/prom/prometheus - * Thanos: https://quay.io/repository/thanos/thanos and https://hub.docker.com/r/thanosio/thanos -* Binaries, images, or containers from distributions you trust -* Binaries, images, or containers from your own internal software verification and deployment teams -* Build from source yourself - -Unless you can reasonably trust the specific provenance and supply chain, you should not use the software. - -As there's a non-zero chance that the ransomware group chose the names deliberately and thus might come across this blog post: Please stop. With both the ransomware and the naming choice. diff --git a/content/blog/2021-10-14-prometheus-conformance-results.md b/content/blog/2021-10-14-prometheus-conformance-results.md deleted file mode 100644 index b0a155a2..00000000 --- a/content/blog/2021-10-14-prometheus-conformance-results.md +++ /dev/null @@ -1,84 +0,0 @@ ---- -title: "Prometheus Conformance Program: First round of results" -created_at: 2021-10-14 -kind: article -author_name: Richard "RichiH" Hartmann ---- - -Today, we're launching the [Prometheus Conformance Program](/blog/2021/05/03/introducing-prometheus-conformance-program/) with the goal of ensuring interoperability between different projects and vendors in the Prometheus monitoring space. While the legal paperwork still needs to be finalized, we ran tests, and we consider the below our first round of results. 
As part of this launch [Julius Volz updated his PromQL test results](https://promlabs.com/blog/2021/10/14/promql-vendor-compatibility-round-three). - -As a quick reminder: The program is called Prometheus **Conformance**, software can be **compliant** to specific tests, which result in a **compatibility** rating. The nomenclature might seem complex, but it allows us to speak about this topic without using endless word snakes. - -# Preamble - -## New Categories - -We found that it's quite hard to reason about what needs to be applied to what software. To help sort my thoughts, we created [an overview](https://docs.google.com/document/d/1VGMme9RgpclqF4CF2woNmgFqq0J7nqHn-l72uNmAxhA), introducing four new categories we can put software into: - -* Metrics Exposers -* Agents/Collectors -* Prometheus Storage Backends -* Full Prometheus Compatibility - -## Call for Action - -Feedback is very much welcome. Maybe counter-intuitively, we want the community, not just Prometheus-team, to shape this effort. To help with that, we will launch a WG Conformance within Prometheus. As with [WG Docs](https://docs.google.com/document/d/1k7_Ya7j5HrIgxXghTCj-26CuwPyGdAbHS0uQf0Ir2tw) and [WG Storage](https://docs.google.com/document/d/1HWL-NIfog3_pFxUny0kAHeoxd0grnqhCBcHVPZN4y3Y), those will be public and we actively invite participation. - -As we [alluded to recently](https://www.youtube.com/watch?v=CBDZKjgRiew), the maintainer/adoption ratio of Prometheus is surprisingly, or shockingly, low. In different words, we hope that the economic incentives around Prometheus Compatibility will entice vendors to assign resources in building out the tests with us. If you always wanted to contribute to Prometheus during work time, this might be the way; and a way that will have you touch a lot of highly relevant aspects of Prometheus. There's a variety of ways to [get in touch](https://prometheus.io/community/) with us. - -## Register for being tested - -You can use the [same communication channels](https://prometheus.io/community/) to get in touch with us if you want to register for being tested. Once the paperwork is in place, we will hand contact information and contract operations over to CNCF. - -# Test results - -## Full Prometheus Compatibility - -We know what tests we want to build out, but we are not there yet. As announced previously, it would be unfair to hold this against projects or vendors. As such, test shims are defined as being passed. The currently semi-manual nature of e.g. the [PromQL tests Julius ran this week](https://promlabs.com/blog/2021/10/14/promql-vendor-compatibility-round-three) mean that Julius tested sending data through Prometheus Remote Write in most cases as part of PromQL testing. We're reusing his results in more than one way here. This will be untangled soon, and more tests from different angles will keep ratcheting up the requirements and thus End User confidence. - -It makes sense to look at projects and aaS offerings in two sets. - -### Projects - -#### Passing - -* Cortex 1.10.0 -* M3 1.3.0 -* Promscale 0.6.2 -* Thanos 0.23.1 - -#### Not passing - -VictoriaMetrics 1.67.0 is not passing and [does not intend to pass](https://promlabs.com/blog/2021/10/14/promql-vendor-compatibility-round-three#victoriametrics). In the spirit of End User confidence, we decided to track their results while they position themselves as a drop-in replacement for Prometheus. 
- -### aaS - -#### Passing - -* Chronosphere -* Grafana Cloud - -#### Not passing - -* Amazon Managed Service for Prometheus -* Google Cloud Managed Service for Prometheus -* New Relic -* Sysdig Monitor - -NB: As Amazon Managed Service for Prometheus is based on Cortex just like Grafana Cloud, we expect them to pass after the next update cycle. - -## Agent/Collector - -### Passing - -* Grafana Agent 0.19.0 -* OpenTelemetry Collector 0.37.0 -* Prometheus 2.30.3 - -### Not passing - -* Telegraf 1.20.2 -* Timber Vector 0.16.1 -* VictoriaMetrics Agent 1.67.0 - -NB: We tested Vector 0.16.1 instead of 0.17.0 because there are no binary downloads for 0.17.0 and our test toolchain currently expects binaries. diff --git a/content/blog/2021-11-16-agent.md b/content/blog/2021-11-16-agent.md deleted file mode 100644 index 8bfd8d67..00000000 --- a/content/blog/2021-11-16-agent.md +++ /dev/null @@ -1,168 +0,0 @@ ---- -title: Introducing Prometheus Agent Mode, an Efficient and Cloud-Native Way for Metric Forwarding -created_at: 2021-11-16 -kind: article -author_name: Bartlomiej Plotka (@bwplotka) ---- - -> Bartek Płotka has been a Prometheus Maintainer since 2019 and Principal Software Engineer at Red Hat. Co-author of the CNCF Thanos project. CNCF Ambassador and tech lead for the CNCF TAG Observability. In his free time, he writes a book titled "Efficient Go" with O'Reilly. Opinions are my own! - -What I personally love in the Prometheus project, and one of the many reasons why I joined the team, was the laser focus on the project's goals. Prometheus was always about pushing boundaries when it comes to providing pragmatic, reliable, cheap, yet invaluable metric-based monitoring. Prometheus' ultra-stable and robust APIs, query language, and integration protocols (e.g. Remote Write and [OpenMetrics](https://openmetrics.io/)) allowed the Cloud Native Computing Foundation (CNCF) metrics ecosystem to grow on those strong foundations. Amazing things happened as a result: - -* We can see community exporters for getting metrics about virtually everything e.g. [containers](https://github.com/google/cadvisor), [eBPF](https://github.com/cloudflare/ebpf_exporter), [Minecraft server statistics](https://github.com/sladkoff/minecraft-prometheus-exporter) and even [plants' health when gardening](https://megamorf.gitlab.io/2019/07/14/monitoring-plant-health-with-prometheus/). -* Most people nowadays expect cloud-native software to have an HTTP/HTTPS `/metrics` endpoint that Prometheus can scrape. A concept developed in secret within Google and pioneered globally by the Prometheus project. -* The observability paradigm shifted. We see SREs and developers rely heavily on metrics from day one, which improves software resiliency, debuggability, and data-driven decisions! - -In the end, we hardly see Kubernetes clusters without Prometheus running there. - -The strong focus of the Prometheus community allowed other open-source projects to grow too to extend the Prometheus deployment model beyond single nodes (e.g. [Cortex](https://cortexmetrics.io/), [Thanos](https://thanos.io/) and more). Not mentioning cloud vendors adopting Prometheus' API and data model (e.g. [Amazon Managed Prometheus](https://aws.amazon.com/prometheus/), [Google Cloud Managed Prometheus](https://cloud.google.com/stackdriver/docs/managed-prometheus), [Grafana Cloud](https://grafana.com/products/cloud/) and more). 
If you are looking for a single reason why the Prometheus project is so successful, it is this: **Focusing the monitoring community on what matters**. - -In this (lengthy) blog post, I would love to introduce a new operational mode of running Prometheus called "Agent". It is built directly into the Prometheus binary. The agent mode disables some of Prometheus' usual features and optimizes the binary for scraping and remote writing to remote locations. Introducing a mode that reduces the number of features enables new usage patterns. In this blog post I will explain why it is a game-changer for certain deployments in the CNCF ecosystem. I am super excited about this! - -## History of the Forwarding Use Case - -The core design of Prometheus has been unchanged for the project's entire lifetime. Inspired by [Google's Borgmon monitoring system](https://sre.google/sre-book/practical-alerting/#the-rise-of-borgmon), you can deploy a Prometheus server alongside the applications you want to monitor, tell Prometheus how to reach them, and allow it to scrape the current values of their metrics at regular intervals. Such a collection method, which is often referred to as the "pull model", is the core principle that [allows Prometheus to be lightweight and reliable](https://prometheus.io/blog/2016/07/23/pull-does-not-scale-or-does-it/). Furthermore, it enables application instrumentation and exporters to be dead simple, as they only need to provide a simple human-readable HTTP endpoint with the current value of all tracked metrics (in OpenMetrics format). All without complex push infrastructure and non-trivial client libraries. Overall, a simplified typical Prometheus monitoring deployment looks as follows: - -![Prometheus high-level view](/assets/blog/2021-11-16/prom.png) - -This works great, and we have seen millions of successful deployments like this over the years that process dozens of millions of active series. Some of them retain data for longer periods, like two years or so. All of them allow you to query, alert on, and record metrics useful for both cluster admins and developers. - -However, the cloud-native world is constantly growing and evolving. With the growth of managed Kubernetes solutions and clusters created on-demand within seconds, we are now finally able to treat clusters as "cattle", not as "pets" (in other words, we care less about individual instances of those). In some cases, solutions do not even have the cluster notion anymore, e.g. [kcp](https://github.com/kcp-dev/kcp), [Fargate](https://aws.amazon.com/fargate/) and other platforms. - -![Yoda](/assets/blog/2021-11-16/yoda.gif) - -The other interesting use case that emerges is the notion of **Edge** clusters or networks. With industries like telecommunication, automotive and IoT devices adopting cloud-native technologies, we see more and more much smaller clusters with a restricted amount of resources. This is forcing all data (including observability) to be transferred to remote, bigger counterparts as almost nothing can be stored on those remote nodes. - -What does that mean? That means monitoring data has to be somehow aggregated, presented to users and sometimes even stored on the *global* level. This is often called a **Global-View** feature. - -Naively, we could think about implementing this by either putting Prometheus on that global level and scraping metrics across remote networks or pushing metrics directly from the application to the central location for monitoring purposes. 
Let me explain why both are generally *very* bad ideas: - -🔥 Scraping across network boundaries can be a challenge because it adds new unknowns to a monitoring pipeline. The local pull model allows Prometheus to know why exactly the metric target has problems and when. Maybe it's down, misconfigured, restarted, too slow to give us metrics (e.g. CPU saturated), not discoverable by service discovery, we don't have the credentials to access it, or DNS, the network, or the whole cluster is down. By putting our scraper outside of the network, we risk losing some of this information by introducing unreliability into scrapes that is unrelated to an individual target. On top of that, we risk losing important visibility completely if the network is temporarily down. Please don't do it. It's not worth it. (: - -🔥 Pushing metrics directly from the application to some central location is equally bad. Especially when you monitor a larger fleet, you know literally nothing when you don't see metrics from remote applications. Is the application down? Is my receiver pipeline down? Maybe the application failed to authorize? Maybe it failed to get the IP address of my remote cluster? Maybe it's too slow? Maybe the network is down? Worse, you may not even know that the data from some application targets is missing. And you don't even gain a lot as you need to track the state and status of everything that should be sending data. Such a design needs careful analysis, as it can too easily become a recipe for failure. - -> NOTE: Serverless functions and short-lived containers are often cases where we think about push from the application as the rescue. At this point, however, we are talking about events or pieces of metrics we might want to aggregate into longer-living time series. This topic is discussed [here](https://groups.google.com/g/prometheus-developers/c/FPe0LsTfo2E/m/yS7up2YzAwAJ); feel free to contribute and help us support those cases better! - -Prometheus introduced three ways to support the global view case, each with its own pros and cons. Let's briefly go through those. They are shown in orange in the diagram below: - -![Prometheus global view](/assets/blog/2021-11-16/prom-remote.png) - -* **Federation** was introduced as the first feature for aggregation purposes. It allows a global-level Prometheus server to scrape a subset of metrics from a leaf Prometheus. Such a "federation" scrape reduces some unknowns across networks because metrics exposed by federation endpoints include the original samples' timestamps. Yet, it usually suffers from the inability to federate all metrics and from data loss during longer network partitions (minutes). -* **Prometheus Remote Read** allows selecting raw metrics from a remote Prometheus server's database without a direct PromQL query. You can deploy Prometheus or other solutions (e.g. Thanos) on the global level to perform PromQL queries on this data while fetching the required metrics from multiple remote locations. This is really powerful as it allows you to store data "locally" and access it only when needed. Unfortunately, there are cons too. Without features like [Query Pushdown](https://github.com/thanos-io/thanos/issues/305), we are, in extreme cases, pulling GBs of compressed metric data to answer a single query. Also, if we have a network partition, we are temporarily blind. Last but not least, certain security guidelines do not allow ingress traffic, only egress. -* Finally, we have **Prometheus Remote Write**, which seems to be the most popular choice nowadays. 
Since the agent mode focuses on remote write use cases, let's explain it in more detail. - -### Remote Write - -The Prometheus Remote Write protocol allows us to forward (stream) all or a subset of metrics collected by Prometheus to the remote location. You can configure Prometheus to forward some metrics (if you want, with all metadata and exemplars!) to one or more locations that support the Remote Write API. In fact, Prometheus supports both ingesting and sending Remote Write, so you can deploy Prometheus on a global level to receive that stream and aggregate data cross-cluster. - -While the official [Prometheus Remote Write API specification is in review stage](https://docs.google.com/document/d/1LPhVRSFkGNSuU1fBd81ulhsCPR4hkSZyyBj1SZ8fWOM/edit), the ecosystem adopted the Remote Write protocol as the default metrics export protocol. For example, Cortex, Thanos, OpenTelemetry, and cloud services like Amazon, Google, Grafana, Logz.io, etc., all support ingesting data via Remote Write. - -The Prometheus project also offers the official compliance tests for its APIs, e.g. [remote-write sender compliance](https://github.com/prometheus/compliance/tree/main/remote_write_sender) for solutions that offer Remote Write client capabilities. It's an amazing way to quickly tell if you are correctly implementing this protocol. - -Streaming data from such a scraper enables Global View use cases by allowing you to store metrics data in a centralized location. This also enables separation of concerns, which is useful when applications are managed by different teams than the observability or monitoring pipelines. Furthermore, it is also why Remote Write is chosen by vendors who want to offload as much work from their customers as possible. - -> Wait for a second, Bartek. You just mentioned before that pushing metrics directly from the application is not the best idea! - -Sure, but the amazing part is that, even with Remote Write, Prometheus still uses a pull model to gather metrics from applications, which gives us an understanding of those different failure modes. After that, we batch samples and series and export, replicate (push) data to the Remote Write endpoints, limiting the number of monitoring unknowns that the central point has! - -It's important to note that a reliable and efficient remote-writing implementation is a non-trivial problem to solve. The Prometheus community spent around three years to come up with a stable and scalable implementation. We reimplemented the WAL (write-ahead-log) a few times, added internal queuing, sharding, smart back-offs and more. All of this is hidden from the user, who can enjoy well-performing streaming or large amounts of metrics stored in a centralized location. - -### Hands-on Remote Write Example: Katacoda Tutorial - -All of this is not new in Prometheus. Many of us already use Prometheus to scrape all required metrics and remote-write all or some of them to remote locations. - -Suppose you would like to try the hands-on experience of remote writing capabilities. In that case, we recommend the [Thanos Katacoda tutorial of remote writing metrics from Prometheus](https://katacoda.com/thanos/courses/thanos/3-receiver), which explains all steps required for Prometheus to forward all metrics to the remote location. It's **free**, just sign up for an account and enjoy the tutorial! 🤗 - -Note that this example uses Thanos in receive mode as the remote storage. Nowadays, you can use plenty of other projects that are compatible with the remote write API. 
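To make this more tangible, here is a minimal sketch of what such a forwarding setup can look like in `prometheus.yml`. The endpoint URL, credentials and relabel rule below are placeholders for illustration, not a recommendation for any particular backend:

```yaml
# Scrape locally, as usual.
scrape_configs:
  - job_name: "node"
    static_configs:
      - targets: ["localhost:9100"]

# Stream the collected samples to any Remote-Write-compatible endpoint.
remote_write:
  - url: "https://remote-storage.example.com/api/v1/write"  # placeholder endpoint
    basic_auth:
      username: "my-tenant"                                  # placeholder credentials
      password_file: "/etc/prometheus/remote-write-password"
    # Optionally forward only a subset of the scraped series.
    write_relabel_configs:
      - source_labels: [__name__]
        regex: "node_cpu_seconds_total|up"
        action: keep
```

All of the batching, WAL-based buffering, sharding and back-off logic described above sits behind this single block of configuration.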
- -So if remote writing works fine, why did we add a special Agent mode to Prometheus? - -## Prometheus Agent Mode - -From Prometheus `v2.32.0` (next release), everyone will be able to run the Prometheus binary with an experimental `--enable-feature=agent` flag. If you want to try it before the release, feel free to use [Prometheus v2.32.0-beta.0](https://github.com/prometheus/prometheus/releases/tag/v2.32.0-beta.0) or use our `quay.io/prometheus/prometheus:v2.32.0-beta.0` image. - -The Agent mode optimizes Prometheus for the remote write use case. It disables querying, alerting, and local storage, and replaces it with a customized TSDB WAL. Everything else stays the same: scraping logic, service discovery and related configuration. It can be used as a drop-in replacement for Prometheus if you want to just forward your data to a remote Prometheus server or any other Remote-Write-compliant project. In essence it looks like this: - -![Prometheus agent](/assets/blog/2021-11-16/agent.png) - -The best part about Prometheus Agent is that it's built into Prometheus. Same scraping APIs, same semantics, same configuration and discovery mechanism. - -What are the benefits of using the Agent mode if you plan not to query or alert on data locally and stream metrics outside? There are a few: - -First of all, efficiency. Our customized Agent TSDB WAL removes the data immediately after successful writes. If it cannot reach the remote endpoint, it persists the data temporarily on the disk until the remote endpoint is back online. This is currently limited to a two-hour buffer only, similar to non-agent Prometheus, [hopefully unblocked soon](https://github.com/prometheus/prometheus/issues/9607). This means that we don't need to build chunks of data in memory. We don't need to maintain a full index for querying purposes. Essentially the Agent mode uses a fraction of the resources that a normal Prometheus server would use in a similar situation. - -Does this efficiency matter? Yes! As we mentioned, every GB of memory and every CPU core used on edge clusters matters for some deployments. On the other hand, the paradigm of performing monitoring using metrics is quite mature these days. This means that the more relevant metrics with more cardinality you can ship for the same cost - the better. - -> NOTE: With the introduction of the Agent mode, the original Prometheus server mode still stays as the recommended, stable and maintained mode. Agent mode with remote storage brings additional complexity. Use with care. - -Secondly, the benefit of the new Agent mode is that it enables easier horizontal scalability for ingestion. This is something I am excited about the most. Let me explain why. - -### The Dream: Auto-Scalable Metric Ingestion - -A true auto-scalable solution for scraping would need to be based on the amount of metric targets and the number of metrics they expose. The more data we have to scrape, the more instances of Prometheus we deploy automatically. If the number of targets or their number of metrics goes down, we could scale down and remove a couple of instances. This would remove the manual burden of adjusting the sizing of Prometheus and stop the need for over-allocating Prometheus for situations where the cluster is temporarily small. - -With just Prometheus in server mode, this was hard to achieve. This is because Prometheus in server mode is stateful. Whatever is collected stays as-is in a single place. 
This means that the scale-down procedure would need to back up the collected data to existing instances before termination. Then we would have the problem of overlapping scrapes, misleading staleness markers etc. - -On top of that, we would need some global view query that is able to aggregate all samples across all instances (e.g. Thanos Query or Promxy). Last but not least, the resource usage of Prometheus in server mode depends on more things than just ingestion. There is alerting, recording, querying, compaction, remote write etc., that might need more or fewer resources independent of the number of metric targets. - -Agent mode essentially moves the discovery, scraping and remote writing to a separate microservice. This allows a focused operational model on ingestion only. As a result, Prometheus in Agent mode is more or less stateless. Yes, to avoid loss of metrics, we need to deploy an HA pair of agents and attach a persistent disk to them. But technically speaking, if we have thousands of metric targets (e.g. containers), we can deploy multiple Prometheus agents and safely change which replica is scraping which targets. This is because, in the end, all samples will be pushed to the same central storage. - -Overall, Prometheus in Agent mode enables easy horizontal auto-scaling capabilities of Prometheus-based scraping that can react to dynamic changes in metric targets. This is definitely something we will look at with the [Prometheus Kubernetes Operator](https://github.com/prometheus-operator/prometheus-operator) community going forward. - -Now let's take a look at the currently implemented state of agent mode in Prometheus. Is it ready to use? - -### Agent Mode Was Proven at Scale - -The next release of Prometheus will include Agent mode as an experimental feature. Flags, APIs and WAL format on disk might change. But the performance of the implementation is already battle-tested thanks to [Grafana Labs'](https://grafana.com/) open-source work. - -The initial implementation of our Agent's custom WAL was inspired by the current Prometheus server's TSDB WAL and created by [Robert Fratto](https://github.com/rfratto) in 2019, under the mentorship of [Tom Wilkie](https://twitter.com/tom_wilkie), Prometheus maintainer. It was then used in an open-source [Grafana Agent](https://github.com/grafana/agent) project that was since then used by many Grafana Cloud customers and community members. Given the maturity of the solution, it was time to donate the implementation to Prometheus for native integration and bigger adoption. Robert (Grafana Labs), with the help of Srikrishna (Red Hat) and the community, ported the code to the Prometheus codebase, which was merged to `main` 2 weeks ago! - -The donation process was quite smooth. Since some Prometheus maintainers contributed to this code before within the Grafana Agent, and since the new WAL is inspired by Prometheus' own WAL, it was not hard for the current Prometheus TSDB maintainers to take it under full maintenance! It also really helps that Robert is joining the Prometheus Team as a TSDB maintainer (congratulations!). - -Now, let's explain how you can use it! (: - -### How to Use Agent Mode in Detail - -From now on, if you show the help output of Prometheus (`--help` flag), you should see more or less the following: - -```bash -usage: prometheus [] - -The Prometheus monitoring server - -Flags: - -h, --help Show context-sensitive help (also try --help-long and --help-man). - (... 
other flags) - --storage.tsdb.path="data/" - Base path for metrics storage. Use with server mode only. - --storage.agent.path="data-agent/" - Base path for metrics storage. Use with agent mode only. - (... other flags) - --enable-feature= ... Comma separated feature names to enable. Valid options: agent, exemplar-storage, expand-external-labels, memory-snapshot-on-shutdown, promql-at-modifier, promql-negative-offset, remote-write-receiver, - extra-scrape-metrics, new-service-discovery-manager. See https://prometheus.io/docs/prometheus/latest/feature_flags/ for more details. -``` - -Since the Agent mode is behind a feature flag, as mentioned previously, use the `--enable-feature=agent` flag to run Prometheus in the Agent mode. Now, the rest of the flags are either for both server and Agent or only for a specific mode. You can see which flag is for which mode by checking the last sentence of a flag's help string. "Use with server mode only" means it's only for server mode. If you don't see any mention like this, it means the flag is shared. - -The Agent mode accepts the same scrape configuration with the same discovery options and remote write options. - -It also exposes a web UI with query capabilities disabled, but still showing build info, configuration, targets and service discovery information as in a normal Prometheus server. - -### Hands-on Prometheus Agent Example: Katacoda Tutorial - -Similarly to the Prometheus remote-write tutorial, if you would like to try the hands-on experience of Prometheus Agent capabilities, we recommend the [Thanos Katacoda tutorial of Prometheus Agent](https://katacoda.com/thanos/courses/thanos/4-receiver-agent), which explains how easy it is to run Prometheus Agent. - -## Summary - -I hope you found this interesting! In this post, we walked through the new use cases that emerged, like: - -* edge clusters -* limited access networks -* large number of clusters -* ephemeral and dynamic clusters - -We then explained the new Prometheus Agent mode that allows efficient forwarding of scraped metrics to remote write endpoints. - -As always, if you have any issues or feedback, feel free to [submit a ticket on GitHub or ask questions on the mailing list](https://prometheus.io/community/). - -> This blog post is part of a coordinated release between CNCF, Grafana, and Prometheus. Feel free to also read the [CNCF announcement](https://www.cncf.io/blog/) and the angle on the [Grafana Agent which underlies the Prometheus Agent](https://grafana.com/blog/2021/11/16/why-we-created-a-prometheus-agent-mode-from-the-grafana-agent). diff --git a/content/blog/2023-03-21-stringlabel.md b/content/blog/2023-03-21-stringlabel.md deleted file mode 100644 index 2db5846f..00000000 --- a/content/blog/2023-03-21-stringlabel.md +++ /dev/null @@ -1,72 +0,0 @@ ---- -title: FAQ about Prometheus 2.43 String Labels Optimization -created_at: 2023-03-21 -kind: article -author_name: Julien Pivotto (@roidelapluie) ---- - -Prometheus 2.43 has just been released, and it brings some exciting features and -enhancements. One of the significant improvements is the `stringlabels` release, -which uses a new data structure for labels. This blog post will answer some -frequently asked questions about the 2.43 release and the `stringlabels` -optimizations. - -### What is the `stringlabels` release? - -The `stringlabels` release is a Prometheus 2.43 version that uses a new data -structure for labels. It stores all the label/values in a single string, -resulting in a smaller heap size and some speedups in most cases. 
These -optimizations are not shipped in the default binaries and require compiling -Prometheus using the Go tag `stringlabels`. - -### Why didn't you go for a feature flag that we can toggle? - -We considered using a feature flag, but it would have had a memory overhead that was -not worth it. Therefore, we decided to provide a separate release with these -optimizations for those who are interested in testing and measuring the gains on -their production environment. - -### When will these optimizations be generally available? - -These optimizations will be available in the upcoming Prometheus 2.44 release -by default. - -### How do I get the 2.43 release? - -The [Prometheus 2.43 release](https://github.com/prometheus/prometheus/releases/tag/v2.43.0) is available on the official Prometheus GitHub -releases page, and users can download the binary files directly from there. -Additionally, Docker images are also available for those who prefer to use -containers. - -The stringlabels optimization is not included in these default binaries. To use -this optimization, users will need to download the [2.43.0+stringlabels -release](https://github.com/prometheus/prometheus/releases/tag/v2.43.0%2Bstringlabels) -binary or the [Docker images tagged -v2.43.0-stringlabels](https://quay.io/repository/prometheus/prometheus?tab=tags) specifically. - -### Why is the release `v2.43.0+stringlabels` and the Docker tag `v2.43.0-stringlabels`? - -In semantic versioning, the plus sign (+) is used to denote build -metadata. Therefore, the Prometheus 2.43 release with the `stringlabels` -optimization is named `2.43.0+stringlabels` to signify that it includes the -experimental `stringlabels` feature. However, Docker tags do not allow the use of -the plus sign in their names. Hence, the plus sign has been replaced with a dash -(-) to make the Docker tag `v2.43.0-stringlabels`. This allows the Docker tag to -pass the semantic versioning checks of downstream projects such as the -Prometheus Operator. - -### What are the other notable features in the Prometheus 2.43 release? - -Apart from the `stringlabels` optimizations, the Prometheus 2.43 release -brings several new features and enhancements. Some of the significant additions -include: - -* We added support for `scrape_config_files` to include scrape configs from - different files. This makes it easier to manage and organize the configuration. -* The HTTP client now includes two new config options: `no_proxy` to exclude - URLs from proxied requests and `proxy_from_environment` to read proxies from - environment variables. These features make it easier to manage the HTTP client's - behavior in different environments. - -You can learn more about features and bugfixes in the -[full changelog](https://github.com/prometheus/prometheus/releases/tag/v2.43.0). diff --git a/content/blog/2023-09-01-promcon2023-schedule.md b/content/blog/2023-09-01-promcon2023-schedule.md deleted file mode 100644 index b809c16b..00000000 --- a/content/blog/2023-09-01-promcon2023-schedule.md +++ /dev/null @@ -1,42 +0,0 @@ ---- -title: The Schedule for the PromCon Europe 2023 is Live -created_at: 2023-09-01 -kind: article -author_name: Matthias Loibl (@metalmatze) ---- - -> PromCon Europe is the eighth conference fully dedicated to the Prometheus monitoring system - -Berlin, Germany – September 1, 2023 – The CNCF and the Prometheus team released the two-day schedule for the single-track PromCon Europe 2023 conference happening in Berlin, Germany from September 28 to September 29, 2023. 
Attendees will be able to choose from 21 full-length (25min) sessions and up to 20 five-minute lightning talk sessions spanning diverse topics related to [Prometheus](https://prometheus.io/). - -Now in its 8th installment, PromCon brings together Prometheus users and developers from around the world to exchange knowledge, best practices, and experience gained through using Prometheus. The program committee reviewed 66 submissions that will provide a fresh and informative look into the most pressing topics around Prometheus today. - -"We are super excited for PromCon to be coming home to Berlin. Prometheus was started in Berlin at Soundcloud in 2012. The first PromCon was hosted in Berlin and in between moved to Munich. This year we're hosting around 300 attendees at Radialsystem in Friedrichshain, Berlin. Berlin has a vibrant Prometheus community and many of the Prometheus team members live in the neighborhood. It is a great opportunity to network and connect with the Prometheus family who are all passionate about systems and service monitoring," said Matthias Loibl, Senior Software Engineer at Polar Signals and Prometheus team member who leads this year's PromCon program committee. "It will be a great event to learn about the latest developments from the Prometheus team itself and connect to some big-scale users of Prometheus up close." - -The community-curated schedule will feature sessions from open source community members, including: - -- [Towards making Prometheus OpenTelemetry native](https://promcon.io/2023-berlin/talks/towards-making-prometheus-opentelemetry-native) -- [How to Monitor Global Tens of Thousands of Kubernetes Clusters with Thanos Federation](https://promcon.io/2023-berlin/talks/how-to-monitor-global-tens-of-thousands-of-kubernetes-clusters-with-thanos-federation) -- [Prometheus Java Client 1.0.0](https://promcon.io/2023-berlin/talks/prometheus-java-client) -- [Perses](https://promcon.io/2023-berlin/talks/perses) -- [Where's your money going? The Beginners Guide To Measuring Kubernetes Costs](https://promcon.io/2023-berlin/talks/where-your-money-going-the-beginners-guide-to-measuring-kubernetes-costs) - -For the full PromCon Europe 2023 program, please visit the [schedule](https://promcon.io/2023-berlin/schedule/). - -## Registration - -[Register](https://promcon.io/2023-berlin/register/) for the in-person standard pricing of $350 USD through September 25. The venue has space for 300 attendees so don’t wait! - -## Thank You to Our Sponsors - -PromCon Europe 2023 has been made possible thanks to the amazing community around Prometheus and support from our Diamond Sponsor [Grafana Labs](https://grafana.com/), Platinum Sponsor [Red Hat](https://www.redhat.com/) as well as many more Gold, and Startup sponsors. This year’s edition is organized by [Polar Signals](https://www.polarsignals.com/) and CNCF. 
- -## Watch the Prometheus Documentary - - - -## Contact - -Jessie Adams-Shore - The Linux Foundation - pr@cncf.io - -PromCon Organizers - promcon-organizers@googlegroups.com diff --git a/content/blog/2024-03-14-commitment-to-opentelemetry.md b/content/blog/2024-03-14-commitment-to-opentelemetry.md deleted file mode 100644 index 2c361266..00000000 --- a/content/blog/2024-03-14-commitment-to-opentelemetry.md +++ /dev/null @@ -1,64 +0,0 @@ ---- -title: Our commitment to OpenTelemetry -created_at: 2024-03-13 -kind: article -author_name: Goutham Veeramachaneni (@Gouthamve) and Carrie Edwards (@carrieedwards) ---- - -*The [OpenTelemetry project](https://opentelemetry.io/) is an Observability framework and toolkit designed to create and manage telemetry data such as traces, metrics, and logs. It is gaining widespread adoption due to its consistent specification between signals and promise to reduce vendor lock-in which is something that we’re excited about.* - -## Looking back at 2023 - -Over the past few years, we have collaborated with the OpenTelemetry community to make sure that OpenTelemetry and Prometheus support each other bidirectionally. This led to the drafting of the official specification to convert between the two systems, as well as the implementations that allow you to ingest Prometheus metrics into OpenTelemetry Collector and vice-versa. - -Since then, we have spent a significant amount of time understanding the [challenges faced by OpenTelemetry users](https://docs.google.com/document/d/1epvoO_R7JhmHYsII-GJ6Yw99Ky91dKOqOtZGqX7Bk0g/edit?usp=sharing) when storing their metrics in Prometheus and based on those, explored [how we can address them](https://docs.google.com/document/d/1NGdKqcmDExynRXgC_u1CDtotz9IUdMrq2yyIq95hl70/edit?usp=sharing). Some of the changes proposed need careful considerations to avoid breaking either side's operating promises, e.g. supporting both push and pull. At PromCon Berlin 2023, we attempted to summarize our ideas in [one of the talks](https://www.youtube.com/watch?v=mcabOH70FqU). - -At our [dev summit in Berlin](https://docs.google.com/document/d/11LC3wJcVk00l8w5P3oLQ-m3Y37iom6INAMEu2ZAGIIE/edit#bookmark=id.9kp854ea3sv4), we spent the majority of our time discussing these changes and our general stance on OpenTelemetry in depth, and the broad consensus is that we want [“to be the default store for OpenTelemetry metrics”](https://docs.google.com/document/d/11LC3wJcVk00l8w5P3oLQ-m3Y37iom6INAMEu2ZAGIIE/edit#bookmark=id.196i9ij1u7fs)! - -We’ve formed a core group of developers to lead this initiative, and we are going to release a Prometheus 3.0 in 2024 with OTel support as one of its more important features. Here’s a sneak peek at what's coming in 2024. - -## The year ahead - -### OTLP Ingestion GA - -In [Prometheus v2.47.0](https://github.com/prometheus/prometheus/releases/tag/v2.47.0), released on 6th September 2023, we added experimental support for OTLP ingestion in Prometheus. We’re constantly improving this and we plan to add support for staleness and make it a stable feature. We will also mark our support for out-of-order ingestion as stable. This involves also GA-ing our support for native / exponential histograms. - -### Support UTF-8 metric and label names - -[OpenTelemetry semantic conventions](https://github.com/open-telemetry/semantic-conventions/blob/main/docs/http/http-metrics.md) push for `“.”` to be the namespacing character. For example, `http.server.request.duration`. 
However, Prometheus currently requires a [more limited character set](https://prometheus.io/docs/instrumenting/writing_exporters/#naming), which means we convert the metric to `http_server_request_duration` when ingesting it into Prometheus. - -This causes unnecessary dissonance and we’re working on removing this limitation by adding UTF-8 support for all labels and metric names. The progress is tracked [here](https://github.com/prometheus/prometheus/issues/13095). - -### Native support for resource attributes - -OpenTelemetry differentiates between metric attributes (labels to identify the metric itself, like `http.status_code`) and resource attributes (labels to identify the source of the metrics, like `k8s.pod.name`), while Prometheus has a more flat label schema. This leads to many usability issues that are detailed [here](https://docs.google.com/document/d/1gG-eTQ4SxmfbGwkrblnUk97fWQA93umvXHEzQn2Nv7E/edit?usp=sharing). - -We’re [exploring several solutions](https://docs.google.com/document/d/1FgHxOzCQ1Rom-PjHXsgujK8x5Xx3GTiwyG__U3Gd9Tw/edit) to this problem from many fronts (Query, UX, storage, etc.), but our goal is to make it quite easy to filter and group on resource attributes. This is a work in progress, and feedback and help are wanted! - -### OTLP export in the ecosystem - -Prometheus remote write is supported by [most of the leading Observability projects and vendors](https://prometheus.io/docs/operating/integrations/#remote-endpoints-and-storage) already. However, OpenTelemetry Protocol (OTLP) is gaining prominence and we would like to support it across the Prometheus ecosystem. - -We would like to add support for it to the Prometheus server, SDKs and the exporters. This would mean that any service instrumented with the Prometheus SDKs will also be able to _push_ OTLP and it will unlock the rich Prometheus exporter ecosystem for OpenTelemetry users. - -However, we intend to keep and develop the OpenMetrics exposition format as an optimized / simplified format for Prometheus and pull-based use-cases. - -### Delta temporality - -The OpenTelemetry project also supports [Delta temporality](https://grafana.com/blog/2023/09/26/opentelemetry-metrics-a-guide-to-delta-vs.-cumulative-temporality-trade-offs/) which has some use-cases for the Observability ecosystem. We have a lot of Prometheus users still running statsd and using the statsd_exporter for various reasons. - -We would like to support the Delta temporality of OpenTelemetry in the Prometheus server and are [working towards it](https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/30479). - -## Call for contributions! - -As you can see, a lot of new and exciting things are coming to Prometheus! If working in the intersection between two of the most relevant open-source projects around observability sounds challenging and interesting to you, we'd like to have you on board! - -This year there is also a change of governance in the works that will make the process of becoming a maintainer easier than ever! If you ever wanted to have an impact on Prometheus, now is a great time to get started. - -Our first focus has always been to be as open and transparent as possible on how we are organizing all the work above so that you can also contribute. We are looking for contributors to support this initiative and help implement these features. 
Check out the [Prometheus 3.0 public board](https://github.com/orgs/prometheus/projects/9) and [Prometheus OTel support milestone](https://github.com/prometheus/prometheus/issues?q=is%3Aopen+is%3Aissue+milestone%3A%22OTEL+Support%22) to track the progress of the feature development and see ways that you can [contribute](https://github.com/prometheus/prometheus/blob/main/CONTRIBUTING.md). - -## Conclusion - -Some of the changes proposed are large and invasive or involve a fundamental departure from the original data model of Prometheus. However, we plan to introduce these gracefully so that Prometheus 3.0 will have no major breaking changes and most of the users can upgrade without impact. - -We are excited to embark on this new chapter for Prometheus and would love your feedback on the changes suggested. diff --git a/content/blog/2024-09-11-prometheus-3-beta.md b/content/blog/2024-09-11-prometheus-3-beta.md deleted file mode 100644 index 446e1948..00000000 --- a/content/blog/2024-09-11-prometheus-3-beta.md +++ /dev/null @@ -1,69 +0,0 @@ ---- -title: Prometheus 3.0 Beta Released -created_at: 2024-09-11 -kind: article -author_name: The Prometheus Team -draft: true ---- - -The Prometheus Team is proud to announce the availability of Prometheus Version 3.0-beta! -You can download it [here](https://github.com/prometheus/prometheus/releases/tag/v3.0.0-beta.0). -As is traditional with a beta release, we do **not** recommend users install Prometheus 3.0-beta on critical production systems, but we do want everyone to test it out and find bugs. - -In general, the only breaking changes are the removal of deprecated feature flags. The Prometheus team worked hard to ensure backwards-compatibility and not to break existing installations, so all of the new features described below build on top of existing functionality. Most users should be able to try Prometheus 3.0 out of the box without any configuration changes. - -# What's New - -With over 7500 commits in the 7 years since Prometheus 2.0 came out there are too many new individual features and fixes to list, but there are some big shiny and breaking changes we wanted to call out. We need everyone in the community to try them out and report any issues you might find. -The more feedback we get, the more stable the final 3.0 release can be. - -## New UI - -One of the highlights in Prometheus 3.0 is its brand new UI that is enabled by default: - -![New UI query page](/assets/blog/2024-09-11/blog_post_screenshot_tree_view-s.png) - -The UI has been completely rewritten with less clutter, a more modern look and feel, new features like a [**PromLens**](https://promlens.com/)-style tree view, and will make future maintenance easier by using a more modern technical stack. - -Learn more about the new UI in general in [Julius' detailed article on the PromLabs blog](https://promlabs.com/blog/2024/09/11/a-look-at-the-new-prometheus-3-0-ui/). -Users can temporarily enable the old UI by using the `old-ui` feature flag. -Since the new UI is not battle-tested yet, it is also very possible that there are still bugs. If you find any, please [report them on GitHub](https://github.com/prometheus/prometheus/issues/new?assignees=&labels=&projects=&template=bug_report.yml). - -## Remote Write 2.0 - -Remote-Write 2.0 iterates on the previous protocol version by adding native support for a host of new elements including metadata, exemplars, created timestamp and native histograms. 
It also uses string interning to reduce payload size and CPU usage when compressing and decompressing. More details can be found [here](https://prometheus.io/docs/specs/remote_write_spec_2_0/). - -## OpenTelemetry Support - -Prometheus intends to be the default choice for storing OpenTelemetry metrics, and 3.0 includes some big new features that make it even better as a storage backend for OpenTelemetry metrics data. - -### UTF-8 - -By default, Prometheus will allow all valid UTF-8 characters to be used in metric and label names, as well as in label values, as has already been true in version 2.x. - -Users will need to make sure their metrics producers are configured to pass UTF-8 names, and if either side does not support UTF-8, metric names will be escaped using the traditional underscore-replacement method. PromQL queries can be written with the new quoting syntax in order to retrieve UTF-8 metrics, or users can specify the `__name__` label name manually. - -Not all language bindings have been updated with support for UTF-8, but the primary Go libraries have been. - -### OTLP Ingestion - -Prometheus can be configured as a native receiver for the OTLP Metrics protocol, receiving OTLP metrics on the /api/v1/otlp/v1/metrics endpoint. - -## Native Histograms - -Native histograms are a Prometheus metric type that offers a higher-efficiency and lower-cost alternative to Classic Histograms. Rather than having to choose (and potentially have to update) bucket boundaries based on the data set, native histograms have pre-set bucket boundaries based on exponential growth. - -Native Histograms are still experimental and not yet enabled by default, and can be turned on by passing `--enable-feature=native-histograms`. Some aspects of Native Histograms, like the text format and accessor functions / operators, are still under active design. - -## Other Breaking Changes - -The following feature flags have been removed, as their functionality is now enabled by default instead. References to these flags should be removed from configs, and will be ignored by Prometheus starting with version 3.0: - -* `promql-at-modifier` -* `promql-negative-offset` -* `remote-write-receiver` -* `no-scrape-default-port` -* `new-service-discovery-manager` - -Range selections are now [left-open and right-closed](https://github.com/prometheus/prometheus/issues/13213), which avoids rare occasions where more points than intended are included in operations. - -Agent mode is now stable and has its own config flag instead of a feature flag. \ No newline at end of file diff --git a/content/blog/2024-11-19-yace-joining-prometheus-community.md b/content/blog/2024-11-19-yace-joining-prometheus-community.md deleted file mode 100644 index fcc3eba4..00000000 --- a/content/blog/2024-11-19-yace-joining-prometheus-community.md +++ /dev/null @@ -1,33 +0,0 @@ ---- -title: YACE is joining Prometheus Community -created_at: 2024-11-19 -kind: article -author_name: Thomas Peitz (@thomaspeitz) ---- - -[Yet Another CloudWatch Exporter](https://github.com/prometheus-community/yet-another-cloudwatch-exporter) (YACE) has officially joined the Prometheus community! This move will make it more accessible to users and open new opportunities for contributors to enhance and maintain the project. There's also a blog post from [Cristian Greco's point of view](https://grafana.com/blog/2024/11/19/yace-moves-to-prometheus-community/). - -## The early days - -When I first started YACE, I had no idea it would grow to this scale. 
At the time, I was working with [InVision AG](https://www.ivx.com) (not to be confused with the design app), a company focused on workforce management software. They fully supported me in open-sourcing the tool, and with the help of my teammate [Kai Forsthövel](https://github.com/kforsthoevel), YACE was brought to life. - -Our first commit was back in 2018, with one of our primary goals being to make CloudWatch metrics easy to scale and automatically detect what to measure, all while keeping the user experience simple and intuitive. InVision AG was scaling their infrastructure up and down due to machine learning workloads, and we needed something that detected new infrastructure easily. This focus on simplicity has remained a core priority. From that point on, YACE began to find its audience. - -## YACE Gains Momentum - -As YACE expanded, so did the support around it. One pivotal moment was when [Cristian Greco](https://github.com/cristiangreco) from Grafana Labs reached out. I was feeling overwhelmed and hardly keeping up when Cristian stepped in, simply asking where he could help. He quickly became the main releaser and led Grafana Labs' contributions to YACE, a turning point that made a huge impact on the project. Along with an incredible community of contributors from all over the world, they elevated YACE beyond what I could have achieved alone, shaping it into a truly global tool. YACE is no longer just my project or InVision's—it belongs to the community. - - -## Gratitude and Future Vision - -I am immensely grateful to every developer, tester, and user who has contributed to YACE's success. This journey has shown me the power of community and open source collaboration. But we're not done yet. - -It's time to take YACE even further—into the heart of the Prometheus ecosystem. Making YACE the official Amazon CloudWatch exporter for Prometheus will make it easier and more accessible for everyone. With ongoing support from Grafana Labs and my commitment to refining the user experience, we'll ensure YACE becomes an intuitive tool that anyone can use effortlessly. - -## Try out YACE on your own - -Try out **[YACE (Yet Another CloudWatch Exporter)](https://github.com/prometheus-community/yet-another-cloudwatch-exporter)** by following our step-by-step [Installation Guide](https://github.com/prometheus-community/yet-another-cloudwatch-exporter/blob/master/docs/installation.md). - -You can explore various configuration examples [here](https://github.com/prometheus-community/yet-another-cloudwatch-exporter/tree/master/examples) to get started with monitoring specific AWS services. - -Our goal is to enable easy auto-discovery across all AWS services, making it simple to monitor any dynamic infrastructure. diff --git a/content/blog/content/blog/2024-11-14-prometheus-3-0.md b/content/blog/content/blog/2024-11-14-prometheus-3-0.md deleted file mode 100644 index 3a447655..00000000 --- a/content/blog/content/blog/2024-11-14-prometheus-3-0.md +++ /dev/null @@ -1,125 +0,0 @@ ---- -title: Announcing Prometheus 3.0 -created_at: 2024-11-14 -kind: article -author_name: The Prometheus Team ---- - -Following the recent release of [Prometheus 3.0 beta](https://prometheus.io/blog/2024/09/11/prometheus-3-beta/) at PromCon in Berlin, the Prometheus Team -is excited to announce the immediate availability of Prometheus Version 3.0! - -This latest version marks a significant milestone as it is the first major release in 7 years. 
Prometheus has come a long way in that time, -evolving from a project for early adopters to becoming a standard part of the cloud native monitoring stack. Prometheus 3.0 aims to -continue that journey by adding some exciting new features while largely maintaining stability and compatibility with previous versions. - -The full 3.0 release adds some new features on top of the beta and also introduces a few additional breaking changes that we will describe in this article. - -# What's New - -Here is a summary of the exciting changes that have been released as part of the beta version, as well as what has been added since: - -## New UI - -One of the highlights in Prometheus 3.0 is its brand-new UI that is enabled by default: - -![New UI query page](/assets/blog/2024-11-14/blog_post_screenshot_tree_view-s.png) - -The UI has been completely rewritten with less clutter, a more modern look and feel, new features like a [**PromLens**](https://promlens.com/)-style tree view, -and will make future maintenance easier by using a more modern technical stack. - -Learn more about the new UI in general in [Julius' detailed article on the PromLabs blog](https://promlabs.com/blog/2024/09/11/a-look-at-the-new-prometheus-3-0-ui/). -Users can temporarily enable the old UI by using the `old-ui` feature flag. - -Since the new UI is not battle-tested yet, it is also very possible that there are still bugs. If you find any, please -[report them on GitHub](https://github.com/prometheus/prometheus/issues/new?assignees=&labels=&projects=&template=bug_report.yml). - -Since the beta, the user interface has been updated to support UTF-8 metric and label names. - -![New UTF-8 UI](/assets/blog/2024-11-14/utf8_ui.png) - -## Remote Write 2.0 - -Remote-Write 2.0 iterates on the previous protocol version by adding native support for a host of new elements including metadata, exemplars, -created timestamp and native histograms. It also uses string interning to reduce payload size and CPU usage when compressing and decompressing. -There is better handling for partial writes to provide more details to clients when this occurs. More details can be found -[here](https://prometheus.io/docs/specs/remote_write_spec_2_0/). - -## UTF-8 Support - -Prometheus now allows all valid UTF-8 characters to be used in metric and label names by default, as well as label values, -as has been true in version 2.x. - -Users will need to make sure their metrics producers are configured to pass UTF-8 names, and if either side does not support UTF-8, -metric names will be escaped using the traditional underscore-replacement method. PromQL queries can be written with the new quoting syntax -in order to retrieve UTF-8 metrics, or users can specify the `__name__` label name manually. - -Currently only the Go client library has been updated to support UTF-8, but support for other languages will be added soon. - -## OTLP Support - -In alignment with [our commitment to OpenTelemetry](https://prometheus.io/blog/2024/03/14/commitment-to-opentelemetry/), Prometheus 3.0 features -several new features to improve interoperability with OpenTelemetry. - -### OTLP Ingestion - -Prometheus can be configured as a native receiver for the OTLP Metrics protocol, receiving OTLP metrics on the `/api/v1/otlp/v1/metrics` endpoint. - -See our [guide](https://prometheus.io/docs/guides/opentelemetry) on best practices for consuming OTLP metric traffic into Prometheus. 
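To give a rough idea of what this looks like in practice, here is a minimal sketch of a Prometheus 3.0 setup for OTLP ingestion. The flag and option names reflect our understanding at the time of writing and may still evolve, and the resource attributes below are examples only:

```yaml
# Start Prometheus with the OTLP receiver enabled, for example:
#   prometheus --web.enable-otlp-receiver --config.file=prometheus.yml
# OpenTelemetry SDKs or Collectors can then push to
#   http://<prometheus-host>:9090/api/v1/otlp/v1/metrics

otlp:
  # Promote selected OTel resource attributes to regular labels so they can be
  # filtered and grouped on in PromQL (attribute names are examples only).
  promote_resource_attributes:
    - service.name
    - service.namespace
    - k8s.pod.name

storage:
  tsdb:
    # Pushed OTLP samples can arrive slightly out of order, so allowing a small
    # out-of-order window is generally recommended.
    out_of_order_time_window: 30m
```

How metric and label names are translated on the way in is covered in the next section.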
- -### UTF-8 Normalization - -With Prometheus 3.0, thanks to [UTF-8 support](#utf-8-support), users can store and query OpenTelemetry metrics without annoying changes to metric and label names like [changing dots to underscores](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/pkg/translator/prometheus). - -Notably, this means **less confusion** for users and tooling about the discrepancy between what’s defined in the OpenTelemetry semantic conventions or SDK and what’s actually queryable. - -To achieve this for OTLP ingestion, Prometheus 3.0 has experimental support for different translation strategies. See the [otlp section in the Prometheus configuration](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#:~:text=Settings%20related%20to%20the%20OTLP%20receiver%20feature) for details. - -> NOTE: While the “NoUTF8EscapingWithSuffixes” strategy allows special characters, it still adds required suffixes for the best experience. See [the proposal on the future work to enable no suffixes](https://github.com/prometheus/proposals/pull/39) in Prometheus. - -## Native Histograms - -Native histograms are a Prometheus metric type that offers a more efficient and lower-cost alternative to classic histograms. Rather than having to choose (and potentially update) bucket boundaries based on the data set, native histograms have pre-set bucket boundaries based on exponential growth. - -Native Histograms are still experimental and not yet enabled by default; they can be turned on by passing `--enable-feature=native-histograms`. Some aspects of Native Histograms, like the text format and accessor functions / operators, are still under active design. - -## Breaking Changes - -The Prometheus community strives to [not break existing features within a major release](https://prometheus.io/docs/prometheus/latest/stability/). With a new major release, we took the opportunity to clean up a few small, long-standing issues. In other words, Prometheus 3.0 contains a few breaking changes. This includes changes to feature flags, configuration files, PromQL, and scrape protocols. - -Please read the [migration guide](https://prometheus.io/docs/prometheus/3.0/migration/) to find out if your setup is affected and what actions to take. - -# Performance - -It’s impressive to see what we have accomplished in the community since Prometheus 2.0. We all love numbers, so let’s celebrate the efficiency improvements we made for both CPU and memory use in TSDB mode. Below you can see performance numbers across three Prometheus versions, measured on a node with 8 CPUs and 49 GB of allocatable memory: - -* 2.0.0 (7 years ago) -* 2.18.0 (4 years ago) -* 3.0.0 (now) - -![Memory bytes](/assets/blog/2024-11-14/memory_bytes_ui.png) - -![CPU seconds](/assets/blog/2024-11-14/cpu_seconds_ui.png) - -It’s furthermore impressive that those numbers were taken using our [prombench macrobenchmark](https://github.com/prometheus/prometheus/pull/15366) -that uses the same PromQL queries, configuration, and environment, highlighting backward compatibility and stability of the core features, even with 3.0. - -# What's Next - -There are still tons of exciting features and improvements we can make in Prometheus and the ecosystem. Here is a non-exhaustive list to get you excited and… -hopefully motivate you to contribute and join us! - -* New, more inclusive **governance** -* More **OpenTelemetry** compatibility and features -* OpenMetrics 2.0, now under Prometheus governance!
-* Native Histograms stability (and with custom buckets!) -* More optimizations! -* UTF-8 support coverage in more SDKs and tools - -# Try It Out! - -You can try out Prometheus 3.0 by downloading it from our [official binaries](https://prometheus.io/download/#prometheus) and [container images](https://quay.io/repository/prometheus/prometheus?tab=tags). - -If you are upgrading from Prometheus 2.x, check out the migration guide for more information on any adjustments you will have to make. -Please note that we strongly recommend upgrading to v2.55 before upgrading to v3.0. Rollback is possible from v3.0 to v2.55, but not to earlier versions. - -As always, we welcome feedback and contributions from the community! diff --git a/content/blog/feed.erb b/content/blog/feed.erb deleted file mode 100644 index 16b7c5b7..00000000 --- a/content/blog/feed.erb +++ /dev/null @@ -1,4 +0,0 @@ -<%= atom_feed :title => 'Prometheus Blog', :author_name => '© Prometheus Authors 2015', - :author_uri => 'https://prometheus.io/blog/', :limit => 10, - :logo => 'https://prometheus.io/assets/prometheus_logo.png', - :icon => 'https://prometheus.io/assets/favicons/favicon.ico' %> diff --git a/content/blog/index.html b/content/blog/index.html deleted file mode 100644 index 8f2d318e..00000000 --- a/content/blog/index.html +++ /dev/null @@ -1,17 +0,0 @@ ---- -title: Blog ---- -
-
- <% sorted_articles.each do |post| %> -
-

<%= link_to post[:title], post.path %>

- -
- <%= get_post_start(post) %> -
-
- <% end %> -
- <%= render 'blog_sidebar' %> -
diff --git a/content/community.md b/content/community.md deleted file mode 100644 index 22fc8ee8..00000000 --- a/content/community.md +++ /dev/null @@ -1,199 +0,0 @@ ---- -title: Community -layout: page - ---- -# Project Governance - -The Prometheus project follows the [Prometheus governance](/governance/). - -# Community connections - -Prometheus is developed in the open. Here are some of the channels we -use to communicate and contribute: - -## Slack channel - -`#prometheus` on CNCF [Slack](https://slack.cncf.io/). - -## IRC - -`#prometheus` on [irc.libera.chat](https://libera.chat/). - -## Matrix - -[`#prometheus:matrix.org`](https://app.element.io/#/room/#prometheus:matrix.org). - -## User mailing lists - -- [prometheus-announce](https://groups.google.com/forum/#!forum/prometheus-announce) ([mirror](https://www.mail-archive.com/prometheus-announce@googlegroups.com/)) - - low-traffic list for announcements like new releases. -- [prometheus-users](https://groups.google.com/forum/#!forum/prometheus-users) ([mirror](https://www.mail-archive.com/prometheus-users@googlegroups.com/)) - - for discussions around Prometheus usage and community support. - Announcements are *not* generally mirrored from - [prometheus-announce](https://groups.google.com/forum/#!forum/prometheus-announce). - -## Discourse forum - -Web-based discussion forum at [discuss.prometheus.io](https://discuss.prometheus.io/) hosted by -[Discourse](https://www.discourse.org/). - -## Calendar for public events - -We have a public calendar for events, which you can use to join us. - -If you just want to get an overview, simply use our [web view in your browser's time zone](https://calendar.google.com/calendar/u/0/embed?src=prometheus.io_bdf9qgm081nrd0fe32g3olsld0%40group.calendar.google.com). - -If you're using Google products, there's an [automagic link to add it your own Google calendar](https://calendar.google.com/calendar/render?cid=prometheus.io_bdf9qgm081nrd0fe32g3olsld0@group.calendar.google.com). - -If you're using a different calendar, there's an [.ics to add to non-Google calendars](https://calendar.google.com/calendar/ical/prometheus.io_bdf9qgm081nrd0fe32g3olsld0%40group.calendar.google.com/public/basic.ics). - -## Social - -- [@prometheus.io](https://bsky.app/profile/prometheus.io) on [Bluesky](https://bsky.app/). -- [@Prometheus](https://chaos.social/@Prometheus) on [Mastodon (chaos.social)](https://chaos.social). - -## GitHub - -To file bugs and feature requests, use the GitHub issue -tracker of the relevant [Prometheus repository](http://github.com/prometheus). -For questions and discussions, many repositories offer GitHub discussions. -Generally, the other community channels listed here are best suited to get -support or discuss overarching topics. - -*Please do not ask individual project members for support. Use the -channels above instead, where the whole community can help you and -benefit from the solutions provided. If community support is -insufficient for your situation, please refer to the [Support & -Training](/support-training) page.* - -# Contributing - -We welcome community contributions! Please see the `CONTRIBUTING.md` -file in the respective Prometheus repository for instructions on how to -submit changes. If you are planning on making more elaborate or -potentially controversial changes, please discuss them in the developers -IRC channel or on the mailing list before sending a pull request. - -We host public weekly meetings focused on Prometheus development and -contributions. 
It's meant for developers and maintainers to meet and get -unblocked, pair review, and discuss development aspects of the -Prometheus and related official projects (e.g `node_exporter`, -`alertmanager`). The document linked below contains all the details, -including how to register. - -## Slack channel - -`#prometheus-dev` on CNCF [Slack](https://slack.cncf.io/). - -## IRC - -`#prometheus-dev` on [irc.libera.chat](https://libera.chat/). - -## Matrix - -[`#prometheus-dev:matrix.org`](https://app.element.io/#/room/#prometheus-dev:matrix.org). - -## Development mailing list - -[prometheus-developers](https://groups.google.com/forum/#!forum/prometheus-developers) ([mirror](https://www.mail-archive.com/prometheus-developers@googlegroups.com/)) - - for discussions around Prometheus development. - -## Office Hours - -[Prometheus Contributor Office Hours](https://docs.google.com/document/d/1bPIsOssTswA244z5SDlws5Qwzc27hQQ2IukNmkIjmPk/edit) -- public weekly meetings focused on Prometheus development and contributions. - -# Developer summits - -Developer summits are public meetings to discuss more involved -development topics. They currently happen monthly as an online meeting. -(For details, check out the public events calendar linked in the -Community section above.) The Prometheus team curates the agenda based -on recent discussions via other channels. To propose a topic, please -send a mail to the [development mailing -list](https://groups.google.com/forum/#!forum/prometheus-developers) at -least 24 hours prior to the summit. - -As of 2024, we carry a public [rolling meeting notes -document](https://docs.google.com/document/d/1uurQCi5iVufhYHGlBZ8mJMK_freDFKPG0iYBQqJ9fvA). -You can find our historic meeting notes below. - -- [2017 developer summit notes](https://docs.google.com/document/d/1DaHFao0saZ3MDt9yuuxLaCQg8WGadO8s44i3cxSARcM) -- [2018 developer summit notes](https://docs.google.com/document/d/1-C5PycocOZEVIPrmM1hn8fBelShqtqiAmFptoG4yK70) -- [2019 developer summit notes](https://docs.google.com/document/d/1NQIX78nwBhfLZD3pAb0PK-uBKYqnkzjjVhOQ-kIaEGU) -- [2019 developer summit 2 notes](https://docs.google.com/document/d/1VVxx9DzpJPDgOZpZ5TtSHBRPuG5Fr3Vr6EFh8XuUpgs) -- [2020 virtual developer summit 1 notes](https://docs.google.com/document/d/1yuaPKLDvhJNXMF1ubsOOm5kE2_6dvCxHowBQIDs0KdU) -- [2020 virtual developer summit 2 notes](https://docs.google.com/document/d/1vhXKpCNY0k2cbm0g10uM2msXoMoH8CTwrg_dyqsFUKo) -- [2020 virtual developer summit 3 notes](https://docs.google.com/document/d/18Jbl5LC_FPLqCqU12qY8XpjVMJLCtGt-ykbyAhYXcIc) -- [2020 virtual developer summit 4 notes](https://docs.google.com/document/d/1_60pplXWF1R-utJtswJYFf8F9HBAJdDS5x4Lv9CgAJ8) -- [2020 virtual developer summit 5 notes](https://docs.google.com/document/d/1iO1QHRyABaIpc6xXB1oqu91jYL1QQibEpvQqXfQF-WA) -- [2021 virtual developer summit 1 notes](https://docs.google.com/document/d/10o4gkjgK46MdUHTowpUG_pyeON3Z5k0CbPBqQ211KOE) -- [2021-2024 developer summit rolling notes](https://docs.google.com/document/d/11LC3wJcVk00l8w5P3oLQ-m3Y37iom6INAMEu2ZAGIIE) - -## Developer Summit's Facilitator - -The Facilitator role was created to help the Prometheus team to run the -Developer Summits effectively. 
It's a rotational role (switches for -every meeting) and its responsibilities are spread across different -phases of the summit: - -### Before the summit - -Before the summit, the Facilitator's main goal is to help the -Prometheus team define the agenda and the topics to be discussed, while -making sure interested parties for the most-voted topics will be able to -attend the summit. We suggest the following tasks: - -- Two or three days before the meeting, send reminders in our public - community channels inviting people to add Agenda Topics, and - Prometheus Team members and maintainers to vote on topics they'd - like to discuss. -- One day before the meeting, reach out to "Topic owners" who - received the most votes to make sure they'll make it to the summit. - -### During the summit - -During the summit, the Facilitator is there to make sure the meeting runs -smoothly and that consensus is reached when needed. We suggest the -following tasks: - -- Start the meeting on time. Use the `@prometheus.io` account for admin meeting permissions. -- Start the recording and mention that the Code of Conduct applies. -- Select topics to be discussed based on votes and who is currently present in the meeting. -- Take notes, or find a volunteer to take notes, in the shared document. -- Strategically step in when the discussion is not moving forward or is deviating from the topic. -- Call for consensus when needed. - -### After the summit - -Once the meeting is over, the last task of the Facilitator is to find a -new Facilitator for the next summit by sending an email to the -Prometheus Team mailing list. - -# Code of Conduct - -To make Prometheus a welcoming and harassment-free experience for -everyone, we follow the [CNCF Code of -Conduct](https://github.com/cncf/foundation/blob/main/code-of-conduct.md). - -# Legal Umbrella - -Prometheus is an independent open-source project and not controlled by -any single company. To emphasize this, we joined the [Cloud Native -Computing Foundation](https://cncf.io/) in 2016 as the second project -after [Kubernetes](https://kubernetes.io/). - -CNCF logo - -# Acknowledgements - -Prometheus was started by [Matt T. Proud](http://www.matttproud.com) and -[Julius Volz](http://juliusv.com). The majority of its initial -development was sponsored by [SoundCloud](https://soundcloud.com). - -We would also like to acknowledge early contributions by engineers from -[Docker](https://www.docker.com/) and -[Boxever](http://www.boxever.com/). - -The Prometheus logo was contributed by Robin Greenwood.
diff --git a/content/css/docs.css b/content/css/docs.css deleted file mode 100644 index d8f23ab3..00000000 --- a/content/css/docs.css +++ /dev/null @@ -1,599 +0,0 @@ -body { - font-family: 'Open Sans', 'Helvetica Neue', Helvetica, sans-serif; -} - -.navbar { - margin-bottom: 0; - min-height: 60px; -} - -.navbar-brand { - margin-top: 1px; - font-family: Lato, sans-serif; - font-size: 26px; - font-weight: 300; - color: #aaa; -} - -.navbar-brand img { - height: 30px; - display: inline; - margin-top: -5px; - margin-right: 3px; -} - -.navbar-toggle { - margin-top: 22px; -} - -.navbar-jumbotron .navbar { - min-height: 83px; -} - -.navbar-jumbotron .navbar-brand { - margin-top: 12px; -} - -.navbar-jumbotron .main-nav { - margin-top: 15px; -} - -.main-nav { - margin-top: 3px; - letter-spacing: 1px; - font-family: 'Lato', sans-serif; - font-size: 15px; -} - -.main-nav > li > a { - text-transform: uppercase; -} - -.algolia-autocomplete { - margin-top: 8px; - padding-left: 15px; - padding-right: 15px; -} - -.algolia-autocomplete .algolia-docsearch-suggestion--highlight { - color: #e6522c; - background: #e6522c20; -} - -.algolia-autocomplete .algolia-docsearch-suggestion--category-header .algolia-docsearch-suggestion--category-header-lvl0 -.algolia-docsearch-suggestion--highlight, -.algolia-autocomplete .algolia-docsearch-suggestion--category-header .algolia-docsearch-suggestion--category-header-lvl1 -.algolia-docsearch-suggestion--highlight { - box-shadow: inset 0 -2px 0 0 #e6522cc0; -} - -.algolia-autocomplete .algolia-docsearch-suggestion--text .algolia-docsearch-suggestion--highlight { - box-shadow: inset 0 -2px 0 0 #e6522cc0; -} - -.searchbox { - border: none; - border-radius: 5px; - background-color: rgb(75, 75, 75); - color: white; -} - -.banner { - background-color: rgb(252, 232, 170); - width: 100%; - display: flex; - gap: 32px; - align-items: center; - justify-content: center; -} - -.banner button { - background-color: transparent; - color: rgb(36, 36, 152); - border: none; -} - -.banner button:hover { - border: rgb(36, 36, 152); -} - -.banner a { - text-decoration: underline; -} - -.banner h3 { - margin: 1.2em 0; - font-size: 16px; - font-family: Lato, sans-serif; -} - -.jumbotron { - background-color: #e6522c; - background-image: url("/assets/jumbotron-background.png"); - text-align: center; - font-family: Lato, sans-serif; - text-shadow: rgba(0, 0, 0, 0.2) 0px 2px 0px; - margin-bottom: 0px; - padding: 40px 0; -} - -.jumbotron h1 { - margin-top: 30px; - font-size: 52px; - font-weight: 300; - color: #fff; -} - -.jumbotron .subtitle { - font-weight: 300; - font-size: 32px; - color: rgba(255,255,255,0.8); - margin-bottom: 20px; -} - -.jumbotron a.btn { - border: none; - background-color: rgba(0,0,0,0.15); - color: #fff; - padding: 20px 25px 20px 25px; - margin: 15px 10px 0 10px; - text-transform: uppercase; -} - -.jumbotron a.btn:hover { - background-color: rgba(0,0,0,0.25); - color: #fff; -} - -.newsbar { - background-color: #eee; - text-align: center; - font-family: Lato, sans-serif; - padding: 25px 50px; - font-size: 24px; - font-weight: 300; - margin-bottom: 50px; -} - -.newsbar a { - font-weight: 400; -} - -.feature-item { - font-family: 'Lato', sans-serif; - font-weight: 300; - cursor: pointer; -} - -.feature-item:hover { - background-color: #fad9d1; - border-radius: 3px; -} - -.feature-item a { - text-decoration: none; - color: none; -} - -.feature-item h2 { - color: #333; - font-weight: 300; - font-size: 25px; - white-space: nowrap; - overflow: hidden; - text-overflow: ellipsis; - 
line-height: normal; -} - -.feature-item .ti { - margin-right: 5px; - color: #e6522c; -} - -.feature-item p { - font-size: 16px; - line-height: 1.8em; - text-rendering: optimizeLegibility; - -webkit-font-smoothing: antialiased; - color: #111; -} - -.top-hr { - margin-top: 30px; -} - -.quote { - margin: 25px 0 25px 0; - font-family: 'Lato', sans-serif; - font-weight: 300; - text-align: center; -} - -.quote-text { - width: 60%; - margin: auto; - font-size: 22px; - font-style: italic; - color: #be3511; -} - -.quote-source { - font-size: 16px; - margin-top: 15px; - margin-left: 50px; -} - -.open-source { - margin: 25px 0 20px 0; - font-family: 'Lato', sans-serif; - font-weight: 300; - text-align: center; -} - -.open-source h1 { - margin-top: 0; - font-weight: 300; -} - -.open-source p { - width: 50%; - margin: auto; - font-size: 20px; -} - -.open-source .github-stars { - margin-top: 20px; - width: 200px; - height: 30px; -} - -@media screen and (max-width: 365px) { - .open-source img { - width: 90%; - } -} - -.trusted-by { - font-family: 'Lato', sans-serif; - font-size: 20px; - font-weight: 300; - padding-bottom: 10px; -} - -.logos { - display: flex; - align-items: center; - justify-content: center; - flex-flow: row wrap; -} - -.logos img { - flex: 1 1 auto; - padding: 22px; - max-width: 220px; - max-height: 101px; -} - - -footer { - font-size: 12px; - color: #333; -} - -/* Downloads related styles. */ -.download h2 { - margin-top: 2em; -} - -.download-selection { - clear: both; -} - -.download .alert { - clear: both; -} - -.download-selection .btn-group { - padding-right: 1em; -} - -.downloads td, .downloads th { - white-space: nowrap; -} - -.downloads .filename { - width: 100%; -} - -.downloads .checksum { - font-family: monospace; - font-size: .6em; - vertical-align: inherit; -} - -/* Docs-related styles. */ -.side-nav-col { - z-index: 100; -} - -.side-nav { - margin-top: 20px; - padding: 5px 20px 20px 7px; -} - -.side-nav .ti { - width: 20px; - color: #555; - font-size: 1.1em; -} - -.side-nav a { - color: #333; -} - -.side-nav > li { - width: 100%; -} - -.side-nav div { - padding: 10px 15px; - background-color: #eee; -} - -.side-nav .nav-header { - display: flex; - gap: 5px; - align-items: baseline; - margin: 20px auto 15px auto; - font-size: 16px; -} - -.side-nav .nav-header a, .side-nav .nav-header { - color: #e6522c; - text-transform: uppercase; - text-decoration: none; -} - -.side-nav li.current { - border-left: 3px solid #e6522c; - margin-left: -2px; - font-weight: bold; -} - -.side-nav li ul li { - border-left: 1px solid #e9eff2; -} - -.side-nav li li li a { - padding: 5px 15px 5px 40px; -} - -/* NOTE: When updating this class name, the search box configuration at - https://github.com/algolia/docsearch-configs/blob/master/configs/prometheus.json needs to be updated as well! */ -.doc-content { - font-size: 16px; -} - -.doc-content p, .doc-content.ul { - margin: 15px 0 15px 0; - line-height: 1.5; -} - -.doc-content .alert { - margin: 15px 0 15px 0; - line-height: 1.5; - /* This is to prevent NOTE/admonition alert boxes from overlapping the table of contents at the top of a docs page. 
*/ - display: inline-block; width: 100%; -} - -.doc-content > h1 { - color: #e6522c; - font-size: 30px; - text-transform: uppercase; - margin: 40px 0 10px 0; -} - -.doc-content > h1 a { - color: #e6522c; -} - -.doc-content.blog > h1 { - text-transform: none; -} - -.doc-content.blog .sponsor-logos > a > img { - width: 250px; - display: inline-block !important; - margin: 15px 55px; -} - -.doc-content > h2 { - color: #e6522c; - font-size: 22px; -} - -.doc-content > h2 code { - color: #e6522c; - background: none; -} - -.doc-content > h3 { - font-size: 20px; - font-weight: bold; -} - -.doc-content > h4 { - font-weight: bold; - font-size: 18px; - margin-top: 20px; -} - -.doc-content a.header-anchor { - font-size: .8em; - padding-left: 8px; - text-decoration: none; -} - -.doc-content a.header-anchor:link, -.doc-content a.header-anchor:visited { - visibility: hidden; -} - -.doc-content h1:hover a.header-anchor:hover, -.doc-content h2:hover a.header-anchor:hover, -.doc-content h3:hover a.header-anchor:hover, -.doc-content h4:hover a.header-anchor:hover, -.doc-content h5:hover a.header-anchor:hover, -.doc-content h6:hover a.header-anchor:hover { - color: #000; -} - -.doc-content h1:hover a.header-anchor, -.doc-content h2:hover a.header-anchor, -.doc-content h3:hover a.header-anchor, -.doc-content h4:hover a.header-anchor, -.doc-content h5:hover a.header-anchor, -.doc-content h6:hover a.header-anchor { - color: #999; - visibility: visible; -} - -.doc-content img { - width: 90%; - margin-left: auto; - margin-right: auto; - display: block; -} - -.doc-content img.orig-size { - width: auto; - margin-left: 0; -} - -.doc-content .open-source-notice { - color: #666; - background-color: #f5f5f5; - text-align: center; - padding: 0.8em; - margin-top: 1.5em; -} - -.toc { - padding: 1em; - background-color: #f5f5f5; -} - -.toc-right { - float: right; - width: 40%; - margin: 0 0 0.5em 0.5em; -} - -.toc ul { - padding: 0 0 0 1.5em; - margin: 0; -} - -.toc a code { - color: #337ab7; - background-color: transparent; -} - -pre { - border: 1px solid #ddd; - border-left: 4px solid #e6522c; - border-radius: 0; - font-family: "Courier New", Monaco, Menlo, Consolas, monospace; - background-color: #f5f5f5; - color: #333; - padding: 15px; -} - -pre code { - white-space: pre; -} - -code { - color: #333; -} - -aside { - color: #888; - padding-bottom: 8px; - border-bottom: 1px solid #aaa; -} - -article { - margin: 10px 0 60px 0; -} - -.training > .row { - padding-bottom: 20px; - margin-bottom: 40px; -} - -.training h2 { - margin-bottom: 20px; -} - -.gray-bg { - background-color: #f5f5f5; -} - -.darkgray-bg { - background-color: #c1c1c1; -} - -.orange-bg { - background-color: #e6522c; -} - -.orange-bg:hover { - background-color: #c13e1c; -} - -.equal { - display: flex; - flex-wrap: wrap; -} - -.panel-col { - margin-bottom: 40px; -} - -.panel { - height: 100%; -} - -.btn-training { - width: 100%; -} - -.tc-col { - margin-bottom: 40px; - display: flex; - justify-content: center; - align-items: center; -} - -.tc-col .btn { - min-height: 100px; - display: flex; - justify-content: center; - align-items: center; -} - -.training-cta { - display: flex; - justify-content: center; - align-items: center; - margin-top: 40px; - margin-bottom: 40px; -} - - -.training-cta .btn { - text-transform: uppercase; - border: 0px; - padding: 25px; -} diff --git a/content/css/routing-tree-editor.css b/content/css/routing-tree-editor.css deleted file mode 100644 index 47d9c32f..00000000 --- a/content/css/routing-tree-editor.css +++ /dev/null @@ -1,39 
+0,0 @@ -.routing-table { - font: 12px sans-serif; -} - -.node circle { - stroke: #e6522c; - stroke-width: 1.5px; -} - -.node text { - font: 10px sans-serif; -} - -.link { - fill: none; - stroke: #ccc; - stroke-width: 1.5px; -} - -.form-control.label-input { - padding: 2px; - width: 450px; -} - -textarea { - border-color: #ddd; - height: 450px; - padding: 2px 0; - width: 100%; - font-family: monospace; -} - -.block { - display: block; -} - -.inline-block { - display: inline-block; -} diff --git a/content/docs/404.md b/content/docs/404.md deleted file mode 100644 index 710091e0..00000000 --- a/content/docs/404.md +++ /dev/null @@ -1,17 +0,0 @@ ---- -title: 404 -is_hidden: true ---- - -# 404 - -## Oops! The page you requested could not be found. - -Here are a few things you can try: - -* Check the URL to make sure you typed it correctly. -* Try searching for the page you're looking for in the documentation search bar. -* Browse the documentation table of contents to see if you can find the page you're looking for. -* If you're still having trouble, please contact us for assistance. - -Thank you for your patience. diff --git a/content/docs/alerting/index.md b/content/docs/alerting/index.md deleted file mode 100644 index 46e1b28f..00000000 --- a/content/docs/alerting/index.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: Alertmanager -sort_rank: 7 -nav_icon: bell ---- diff --git a/content/docs/concepts/data_model.md b/content/docs/concepts/data_model.md deleted file mode 100644 index 6fa583b0..00000000 --- a/content/docs/concepts/data_model.md +++ /dev/null @@ -1,71 +0,0 @@ ---- -title: Data model -sort_rank: 1 ---- - -# Data model - -Prometheus fundamentally stores all data as [_time -series_](http://en.wikipedia.org/wiki/Time_series): streams of timestamped -values belonging to the same metric and the same set of labeled dimensions. -Besides stored time series, Prometheus may generate temporary derived time series -as the result of queries. - -## Metric names and labels - -Every time series is uniquely identified by its metric name and optional key-value pairs called labels. - -***Metric names:*** - -* Metric names SHOULD specify the general feature of a system that is measured (e.g. `http_requests_total` - the total number of HTTP requests received). -* Metric names MAY use any UTF-8 characters. -* Metric names SHOULD match the regex `[a-zA-Z_:][a-zA-Z0-9_:]*` for the best experience and compatibility (see the warning below). Metric names outside of that set will require quoting e.g. when used in PromQL (see the [UTF-8 guide](../guides/utf8.md#querying)). - -NOTE: Colons (':') are reserved for user-defined recording rules. They SHOULD NOT be used by exporters or direct instrumentation. - -***Metric labels:*** - -Labels let you capture different instances of the same metric name. For example: all HTTP requests that used the method `POST` to the `/api/tracks` handler. We refer to this as Prometheus's "dimensional data model". The query language allows filtering and aggregation based on these dimensions. The change of any label's value, including adding or removing labels, will create a new time series. - -* Label names MAY use any UTF-8 characters. -* Label names beginning with `__` (two underscores) MUST be reserved for internal Prometheus use. -* Label names SHOULD match the regex `[a-zA-Z_][a-zA-Z0-9_]*` for the best experience and compatibility (see the warning below). Label names outside of that regex will require quoting e.g. 
when used in PromQL (see the [UTF-8 guide](../guides/utf8.md#querying)). -* Label values MAY contain any UTF-8 characters. -* Labels with an empty label value are considered equivalent to labels that do not exist. - -WARNING: The [UTF-8](../guides/utf8.md) support for metric and label names was added relatively recently in Prometheus v3.0.0. It might take time for the wider ecosystem (downstream PromQL compatible projects and vendors, tooling, third-party instrumentation, collectors, etc.) to adopt new quoting mechanisms, relaxed validation etc. For the best compatibility it's recommended to stick to the recommended ("SHOULD") character set. - -INFO: See also the [best practices for naming metrics and labels](/docs/practices/naming/). - -## Samples - -Samples form the actual time series data. Each sample consists of: - -* a float64 or [native histogram](https://prometheus.io/docs/specs/native_histograms/) value -* a millisecond-precision timestamp - -## Notation - -Given a metric name and a set of labels, time series are frequently identified -using this notation: - - {