mirror of
https://github.com/foomo/foomo-docs.git
synced 2025-10-16 12:35:40 +00:00
97 lines
18 KiB
HTML
97 lines
18 KiB
HTML
<!doctype html>
|
|
<html lang="en" dir="ltr" class="blog-wrapper blog-post-page plugin-blog plugin-id-default" data-has-hydrated="false">
|
|
<head>
|
|
<meta charset="UTF-8">
|
|
<meta name="generator" content="Docusaurus v3.0.0">
|
|
<title data-rh="true">Prometheus Is Out Of Memory. Again. | foomo project docs</title><meta data-rh="true" name="viewport" content="width=device-width,initial-scale=1"><meta data-rh="true" name="twitter:card" content="summary_large_image"><meta data-rh="true" property="og:url" content="https://www.foomo.org/blog/prometheus-cardinality-issues"><meta data-rh="true" property="og:locale" content="en"><meta data-rh="true" name="docusaurus_locale" content="en"><meta data-rh="true" name="docusaurus_tag" content="default"><meta data-rh="true" name="docsearch:language" content="en"><meta data-rh="true" name="docsearch:docusaurus_tag" content="default"><meta data-rh="true" property="og:title" content="Prometheus Is Out Of Memory. Again. | foomo project docs"><meta data-rh="true" name="description" content="The Annoyance"><meta data-rh="true" property="og:description" content="The Annoyance"><meta data-rh="true" property="og:type" content="article"><meta data-rh="true" property="article:published_time" content="2022-01-25T00:00:00.000Z"><meta data-rh="true" property="article:author" content="https://github.com/smartinov"><meta data-rh="true" property="article:tag" content="prometheus,cardinality,devops,ops,k8s,oom,memory"><link data-rh="true" rel="icon" href="/img/favicon.ico"><link data-rh="true" rel="canonical" href="https://www.foomo.org/blog/prometheus-cardinality-issues"><link data-rh="true" rel="alternate" href="https://www.foomo.org/blog/prometheus-cardinality-issues" hreflang="en"><link data-rh="true" rel="alternate" href="https://www.foomo.org/blog/prometheus-cardinality-issues" hreflang="x-default"><link data-rh="true" rel="preconnect" href="https://SUATUVZDDM-dsn.algolia.net" crossorigin="anonymous"><link rel="alternate" type="application/rss+xml" href="/blog/rss.xml" title="foomo project docs RSS Feed">
|
|
<link rel="alternate" type="application/atom+xml" href="/blog/atom.xml" title="foomo project docs Atom Feed">
|
|
|
|
|
|
|
|
<link rel="search" type="application/opensearchdescription+xml" title="foomo project docs" href="/opensearch.xml"><link rel="stylesheet" href="/assets/css/styles.78fe5ce6.css">
|
|
<script src="/assets/js/runtime~main.638e5c2c.js" defer="defer"></script>
|
|
<script src="/assets/js/main.1248442c.js" defer="defer"></script>
|
|
</head>
|
|
<body class="navigation-with-keyboard">
|
|
<script>!function(){function t(t){document.documentElement.setAttribute("data-theme",t)}var e=function(){try{return new URLSearchParams(window.location.search).get("docusaurus-theme")}catch(t){}}()||function(){try{return localStorage.getItem("theme")}catch(t){}}();t(null!==e?e:"light")}(),function(){try{const c=new URLSearchParams(window.location.search).entries();for(var[t,e]of c)if(t.startsWith("docusaurus-data-")){var a=t.replace("docusaurus-data-","data-");document.documentElement.setAttribute(a,e)}}catch(t){}}()</script><div id="__docusaurus"><div role="region" aria-label="Skip to main content"><a class="skipToContent_fXgn" href="#__docusaurus_skipToContent_fallback">Skip to main content</a></div><nav aria-label="Main" class="navbar navbar--fixed-top"><div class="navbar__inner"><div class="navbar__items"><button aria-label="Toggle navigation bar" aria-expanded="false" class="navbar__toggle clean-btn" type="button"><svg width="30" height="30" viewBox="0 0 30 30" aria-hidden="true"><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></button><a class="navbar__brand" href="/"><b class="navbar__title text--truncate">foomo</b></a><a class="navbar__item navbar__link" href="/docs/general">General</a><a class="navbar__item navbar__link" href="/docs/frontend">Frontend</a><a class="navbar__item navbar__link" href="/docs/backend">Backend</a><a class="navbar__item navbar__link" href="/docs/devops">DevOps</a><a class="navbar__item navbar__link" href="/docs/projects">Projects</a></div><div class="navbar__items navbar__items--right"><a aria-current="page" class="navbar__item navbar__link navbar__link--active" href="/blog">Blog</a><div class="navbarSearchContainer_Bca1"><button type="button" class="DocSearch DocSearch-Button" aria-label="Search"><span class="DocSearch-Button-Container"><svg width="20" height="20" class="DocSearch-Search-Icon" viewBox="0 0 20 20"><path d="M14.386 14.386l4.0877 
4.0877-4.0877-4.0877c-2.9418 2.9419-7.7115 2.9419-10.6533 0-2.9419-2.9418-2.9419-7.7115 0-10.6533 2.9418-2.9419 7.7115-2.9419 10.6533 0 2.9419 2.9418 2.9419 7.7115 0 10.6533z" stroke="currentColor" fill="none" fill-rule="evenodd" stroke-linecap="round" stroke-linejoin="round"></path></svg><span class="DocSearch-Button-Placeholder">Search</span></span><span class="DocSearch-Button-Keys"></span></button></div></div></div><div role="presentation" class="navbar-sidebar__backdrop"></div></nav><div id="__docusaurus_skipToContent_fallback" class="main-wrapper mainWrapper_z2l0"><div class="container margin-vert--lg"><div class="row"><aside class="col col--3"><nav class="sidebar_re4s thin-scrollbar" aria-label="Blog recent posts navigation"><div class="sidebarItemTitle_pO2u margin-bottom--md">Recent posts</div><ul class="sidebarItemList_Yudw clean-list"><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/blog/go-race-conditions-testing-and-coverage">Go race conditions testing and coverage</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/blog/accuracy-of-decimal-computations">Accuracy of decimal computations</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/blog/why-bundle-size-is-important">Why bundle size is important?</a></li><li class="sidebarItem__DBe"><a aria-current="page" class="sidebarItemLink_mo7H sidebarItemLinkActive_I1ZP" href="/blog/prometheus-cardinality-issues">Prometheus Is Out Of Memory. Again.</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/blog/searching-for-search-engines">The never ending search a search engine 2022-01 edition</a></li></ul></nav></aside><main class="col col--7" itemscope="" itemtype="https://schema.org/Blog"><article itemprop="blogPost" itemscope="" itemtype="https://schema.org/BlogPosting"><meta itemprop="description" content="The Annoyance"><header><h1 class="title_f1Hy" itemprop="headline">Prometheus Is Out Of Memory. 
Again.</h1><div class="container_mt6G margin-vert--md"><time datetime="2022-01-25T00:00:00.000Z" itemprop="datePublished">January 25, 2022</time></div><div class="margin-top--md margin-bottom--sm row"><div class="col col--6 authorCol_Hf19"><div class="avatar margin-bottom--sm"><a href="https://github.com/smartinov" target="_blank" rel="noopener noreferrer" class="avatar__photo-link"><img class="avatar__photo" src="https://github.com/smartinov.png" alt="Stefan Martinov" itemprop="image"></a><div class="avatar__intro" itemprop="author" itemscope="" itemtype="https://schema.org/Person"><div class="avatar__name"><a href="https://github.com/smartinov" target="_blank" rel="noopener noreferrer" itemprop="url"><span itemprop="name">Stefan Martinov</span></a></div><small class="avatar__subtitle" itemprop="description">Memelord</small></div></div></div></div></header><div id="__blog-post-container" class="markdown" itemprop="articleBody"><h2 id="the-annoyance">The Annoyance</h2>
|
|
<p>So, we've all been there. You go to your trusty grafana, search for some sweet metrics that you implemented and WHAM!
|
|
Prometheus returns us a 503, a trusty way of saying I'm not ready, and I'm probably going to die soon.
|
|
And since we're running in kubernetes I'm going to die soon, again and again.
|
|
And you're getting reports from your colleagues that prometheus is not responding.
|
|
And you can't ignore them anymore.</p>
|
|
<p><img alt="Bummer." src="/assets/images/bummer-e80d471cba23d1ee83e8463187845893.webp" width="480" height="270"></p>
|
|
<h2 id="the-problem">The Problem</h2>
|
|
<p>All right, let's check what's happening to the little guy.</p>
|
|
<pre><code class="language-bash">kubectl get pods -n monitoring
|
|
|
|
prometheus-prometheus-kube-prometheus-prometheus-0 1/2 Running 4 5m
|
|
</code></pre>
|
|
<p>It seems like it's stuck in the running state, where the container is not yet ready.
|
|
Let's describe the pod to check out what's happening.</p>
|
|
<pre><code class="language-yaml"> State: Running │
|
|
Started: Wed, 12 Jan 2022 15:12:49 +0100 │
|
|
Last State: Terminated │
|
|
Reason: OOMKilled │
|
|
Exit Code: 137 │
|
|
Started: Tue, 11 Jan 2022 17:14:41 +0100 │
|
|
Finished: Wed, 12 Jan 2022 15:12:47 +0100 │
|
|
</code></pre>
|
|
<p>So we see that the prometheus is in a running state waiting for the readiness probe to trigger, probably working on recovering from Write Ahead Log (WAL).
|
|
This could be an issue where prometheus is recovering from an error, or a restart and does not have enough memory to write everything in the WAL.
|
|
We could be running into an issue where we set the memory requests/limits lower than what prometheus requires, and Kubernetes keeps OOM-killing prometheus for wanting more memory.</p>
|
|
<p>For this case, we could give it more memory to work with to see if it recovers. We should also analyze why the prometheus WAL is getting clogged up.</p>
|
|
<p>In essence, we want to check what has changed so that we suddenly have a high memory spike in our sweet, sweet environment.</p>
|
|
<h2 id="the-source">The Source</h2>
|
|
<p><img alt="Cardinality" src="/assets/images/cardinality-5f722655c50c25a6a91c53884ad19677.webp" width="501" height="500"></p>
|
|
<p>A lot of prometheus issues revolve around cardinality.
|
|
Memory spikes that break your deployment? Cardinality.
|
|
Prometheus dragging its feet like it's Monday after the log4j (the second one ofc) zero day security breach? Cardinality.
|
|
Not getting that raise since you worked hard the past 16 years without wavering? You bet your ass it's cardinality.
|
|
So, as you can see, many of life's problems can be attributed to cardinality.</p>
|
|
<p>In short, the cardinality of your metrics is the number of combinations of all label values per metric.
|
|
For example, if our metric <code>http_request_total</code> had a label response code, and let's say we support 8 status codes, our cardinality starts off at 8.
|
|
For good measure we want to record the HTTP verb for the request. We support <code>GET POST PUT HEAD</code> which would put the cardinality to 4*8=<strong>32</strong>.
|
|
Now, if someone adds a URL to the metric label (<strong>!!VERY BAD IDEA!!</strong>, but bear with me now) and we have 2 active pages, we'd have a cardinality of 2*4*8=<strong>64</strong>.
|
|
But, imagine someone starts scraping your website for potential vulnerabilities. Imagine all the URLs that will appear, most likely only once.</p>
|
|
<pre><code class="language-text">mywebsite.com/admin.php
|
|
mywebsite.com/wp/admin.php
|
|
mywebsite.com/?utm_source=GUID
|
|
...
|
|
</code></pre>
|
|
<p>This would blow up our cardinality to kingdom come. Like you will be out of memory faster than "<a href="https://www.reddit.com/r/ProgrammerHumor/comments/a483yz/those_javascript_devs/">a new super awesome Javascript gamechanger framework</a>" is born.
|
|
Or to quote user <a href="https://www.reddit.com/user/naveen17797/">naveen17797</a> <em>Scientists predict the number of js frameworks may exceed human population by 2020,at that point of time random string generators will be used to name those frameworks.</em></p>
|
|
<p>The point of this story is, be very mindful of how you use labels and cardinality in prometheus, since that will indeed have a great impact on your prometheus performance.</p>
|
|
<h2 id="the-solution">The Solution</h2>
|
|
<p>Since this has never happened to me (never-ever) I found the following solution to be handy.
|
|
Since we can't get prometheus up and running to utilize PromQL to detect the potential issues, we have to find another way to detect high cardinality.
|
|
Therefore, we might want to get our hands dirty with some <code>kubectl exec -it -n monitoring pods/prometheus-prometheus-kube-prometheus-prometheus-0 -- sh</code>, and run the prometheus <code>tsdb</code> analysis tool.</p>
|
|
<pre><code class="language-bash">/prometheus $ promtool tsdb analyze .
|
|
</code></pre>
|
|
<p>Which produced the result.</p>
|
|
<pre><code class="language-text">> Block ID: 01FT8E8YY4THHZ2S7C3G04GJMG
|
|
> Duration: 1h59m59.997s
|
|
> Series: 564171
|
|
> Label names: 285
|
|
> Postings (unique label pairs): 21139
|
|
> Postings entries (total label pairs): 6423664
|
|
>
|
|
> ...
|
|
>
|
|
> Highest cardinality metric names:
|
|
> 11340 haproxy_server_http_responses_total
|
|
> ...
|
|
</code></pre>
|
|
<p>We see the potential issue here, where the <code>haproxy_server_http_responses_total</code> metric has a super-high cardinality, which is growing.
|
|
We need to deal with it, so that our prometheus instance can breathe again. In this particular case, the solution was updating the haproxy.</p>
|
|
<p>... or burn it, up to you.</p>
|
|
<p><img alt="Flame Thrower" src="/assets/images/flame-thrower-56bcf89132356ff53c03ca029d9d0746.webp" width="1440" height="1080"></p>
|
|
<h2 id="the-further-reading">The Further Reading</h2>
|
|
<ol>
|
|
<li><a href="https://github.com/prometheus/prometheus/blob/main/tsdb/docs/format/wal.md">WAL Definition</a></li>
|
|
<li><a href="https://ganeshvernekar.com/blog/prometheus-tsdb-wal-and-checkpoint/">WAL &amp; Checkpoints</a></li>
|
|
<li><a href="https://www.robustperception.io/using-tsdb-analyze-to-investigate-churn-and-cardinality">Using TSDB</a></li>
|
|
<li><a href="https://www.robustperception.io/which-are-my-biggest-metrics">Biggest Metrics</a></li>
|
|
<li><a href="https://www.robustperception.io/cardinality-is-key">Cardinality</a></li>
|
|
</ol></div><footer class="row docusaurus-mt-lg blogPostFooterDetailsFull_mRVl"><div class="col"><b>Tags:</b><ul class="tags_jXut padding--none margin-left--sm"><li class="tag_QGVx"><a class="tag_zVej tagRegular_sFm0" href="/blog/tags/prometheus">prometheus</a></li><li class="tag_QGVx"><a class="tag_zVej tagRegular_sFm0" href="/blog/tags/cardinality">cardinality</a></li><li class="tag_QGVx"><a class="tag_zVej tagRegular_sFm0" href="/blog/tags/devops">devops</a></li><li class="tag_QGVx"><a class="tag_zVej tagRegular_sFm0" href="/blog/tags/ops">ops</a></li><li class="tag_QGVx"><a class="tag_zVej tagRegular_sFm0" href="/blog/tags/k-8-s">k8s</a></li><li class="tag_QGVx"><a class="tag_zVej tagRegular_sFm0" href="/blog/tags/oom">oom</a></li><li class="tag_QGVx"><a class="tag_zVej tagRegular_sFm0" href="/blog/tags/memory">memory</a></li></ul></div><div class="col margin-top--sm"><a href="https://github.com/foomo/foomo-docs/tree/main/foomo/blog/2022-01-25-prometheus-cardinality-issues/index.mdx" target="_blank" rel="noopener noreferrer" class="theme-edit-this-page"><svg fill="currentColor" height="20" width="20" viewBox="0 0 40 40" class="iconEdit_Z9Sw" aria-hidden="true"><g><path d="m34.5 11.7l-3 3.1-6.3-6.3 3.1-3q0.5-0.5 1.2-0.5t1.1 0.5l3.9 3.9q0.5 0.4 0.5 1.1t-0.5 1.2z m-29.5 17.1l18.4-18.5 6.3 6.3-18.4 18.4h-6.3v-6.2z"></path></g></svg>Edit this page</a></div></footer></article><nav class="pagination-nav docusaurus-mt-lg" aria-label="Blog post page navigation"><a class="pagination-nav__link pagination-nav__link--prev" href="/blog/why-bundle-size-is-important"><div class="pagination-nav__sublabel">Newer Post</div><div class="pagination-nav__label">Why bundle size is important?</div></a><a class="pagination-nav__link pagination-nav__link--next" href="/blog/searching-for-search-engines"><div class="pagination-nav__sublabel">Older Post</div><div class="pagination-nav__label">The never ending search a search engine 2022-01 edition</div></a></nav></main><div class="col 
col--2"><div class="tableOfContents_bqdL thin-scrollbar"><ul class="table-of-contents table-of-contents__left-border"><li><a href="#the-annoyance" class="table-of-contents__link toc-highlight">The Annoyance</a></li><li><a href="#the-problem" class="table-of-contents__link toc-highlight">The Problem</a></li><li><a href="#the-source" class="table-of-contents__link toc-highlight">The Source</a></li><li><a href="#the-solution" class="table-of-contents__link toc-highlight">The Solution</a></li><li><a href="#the-further-reading" class="table-of-contents__link toc-highlight">The Further Reading</a></li></ul></div></div></div></div></div><footer class="footer"><div class="container container-fluid"><div class="row footer__links"><div class="col footer__col"><div class="footer__title">github</div><ul class="footer__items clean-list"><li class="footer__item"><a href="https://github.com/foomo" target="_blank" rel="noopener noreferrer" class="footer__link-item">https://github.com/foomo</a></li></ul></div><div class="col footer__col"><div class="footer__title">legal</div><ul class="footer__items clean-list"><li class="footer__item"><a class="footer__link-item" href="/etc/imprint">Imprint</a></li></ul></div></div><div class="footer__bottom text--center"><div class="footer__copyright">© 2024 bestbytes</div></div></div></footer></div>
|
|
</body>
|
|
</html> |