foomo-docs/blog/prometheus-cardinality-issues.html
2024-06-10 09:44:34 +00:00

97 lines
18 KiB
HTML

<!doctype html>
<html lang="en" dir="ltr" class="blog-wrapper blog-post-page plugin-blog plugin-id-default" data-has-hydrated="false">
<head>
<meta charset="UTF-8">
<meta name="generator" content="Docusaurus v3.0.0">
<title data-rh="true">Prometheus Is Out Of Memory. Again. | foomo project docs</title><meta data-rh="true" name="viewport" content="width=device-width,initial-scale=1"><meta data-rh="true" name="twitter:card" content="summary_large_image"><meta data-rh="true" property="og:url" content="https://www.foomo.org/blog/prometheus-cardinality-issues"><meta data-rh="true" property="og:locale" content="en"><meta data-rh="true" name="docusaurus_locale" content="en"><meta data-rh="true" name="docusaurus_tag" content="default"><meta data-rh="true" name="docsearch:language" content="en"><meta data-rh="true" name="docsearch:docusaurus_tag" content="default"><meta data-rh="true" property="og:title" content="Prometheus Is Out Of Memory. Again. | foomo project docs"><meta data-rh="true" name="description" content="The Annoyance"><meta data-rh="true" property="og:description" content="The Annoyance"><meta data-rh="true" property="og:type" content="article"><meta data-rh="true" property="article:published_time" content="2022-01-25T00:00:00.000Z"><meta data-rh="true" property="article:author" content="https://github.com/smartinov"><meta data-rh="true" property="article:tag" content="prometheus,cardinality,devops,ops,k8s,oom,memory"><link data-rh="true" rel="icon" href="/img/favicon.ico"><link data-rh="true" rel="canonical" href="https://www.foomo.org/blog/prometheus-cardinality-issues"><link data-rh="true" rel="alternate" href="https://www.foomo.org/blog/prometheus-cardinality-issues" hreflang="en"><link data-rh="true" rel="alternate" href="https://www.foomo.org/blog/prometheus-cardinality-issues" hreflang="x-default"><link data-rh="true" rel="preconnect" href="https://SUATUVZDDM-dsn.algolia.net" crossorigin="anonymous"><link rel="alternate" type="application/rss+xml" href="/blog/rss.xml" title="foomo project docs RSS Feed">
<link rel="alternate" type="application/atom+xml" href="/blog/atom.xml" title="foomo project docs Atom Feed">
<link rel="search" type="application/opensearchdescription+xml" title="foomo project docs" href="/opensearch.xml"><link rel="stylesheet" href="/assets/css/styles.78fe5ce6.css">
<script src="/assets/js/runtime~main.638e5c2c.js" defer="defer"></script>
<script src="/assets/js/main.1248442c.js" defer="defer"></script>
</head>
<body class="navigation-with-keyboard">
<script>!function(){function t(t){document.documentElement.setAttribute("data-theme",t)}var e=function(){try{return new URLSearchParams(window.location.search).get("docusaurus-theme")}catch(t){}}()||function(){try{return localStorage.getItem("theme")}catch(t){}}();t(null!==e?e:"light")}(),function(){try{const c=new URLSearchParams(window.location.search).entries();for(var[t,e]of c)if(t.startsWith("docusaurus-data-")){var a=t.replace("docusaurus-data-","data-");document.documentElement.setAttribute(a,e)}}catch(t){}}()</script><div id="__docusaurus"><div role="region" aria-label="Skip to main content"><a class="skipToContent_fXgn" href="#__docusaurus_skipToContent_fallback">Skip to main content</a></div><nav aria-label="Main" class="navbar navbar--fixed-top"><div class="navbar__inner"><div class="navbar__items"><button aria-label="Toggle navigation bar" aria-expanded="false" class="navbar__toggle clean-btn" type="button"><svg width="30" height="30" viewBox="0 0 30 30" aria-hidden="true"><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></button><a class="navbar__brand" href="/"><b class="navbar__title text--truncate">foomo</b></a><a class="navbar__item navbar__link" href="/docs/general">General</a><a class="navbar__item navbar__link" href="/docs/frontend">Frontend</a><a class="navbar__item navbar__link" href="/docs/backend">Backend</a><a class="navbar__item navbar__link" href="/docs/devops">DevOps</a><a class="navbar__item navbar__link" href="/docs/projects">Projects</a></div><div class="navbar__items navbar__items--right"><a aria-current="page" class="navbar__item navbar__link navbar__link--active" href="/blog">Blog</a><div class="navbarSearchContainer_Bca1"><button type="button" class="DocSearch DocSearch-Button" aria-label="Search"><span class="DocSearch-Button-Container"><svg width="20" height="20" class="DocSearch-Search-Icon" viewBox="0 0 20 20"><path d="M14.386 14.386l4.0877 
4.0877-4.0877-4.0877c-2.9418 2.9419-7.7115 2.9419-10.6533 0-2.9419-2.9418-2.9419-7.7115 0-10.6533 2.9418-2.9419 7.7115-2.9419 10.6533 0 2.9419 2.9418 2.9419 7.7115 0 10.6533z" stroke="currentColor" fill="none" fill-rule="evenodd" stroke-linecap="round" stroke-linejoin="round"></path></svg><span class="DocSearch-Button-Placeholder">Search</span></span><span class="DocSearch-Button-Keys"></span></button></div></div></div><div role="presentation" class="navbar-sidebar__backdrop"></div></nav><div id="__docusaurus_skipToContent_fallback" class="main-wrapper mainWrapper_z2l0"><div class="container margin-vert--lg"><div class="row"><aside class="col col--3"><nav class="sidebar_re4s thin-scrollbar" aria-label="Blog recent posts navigation"><div class="sidebarItemTitle_pO2u margin-bottom--md">Recent posts</div><ul class="sidebarItemList_Yudw clean-list"><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/blog/go-race-conditions-testing-and-coverage">Go race conditions testing and coverage</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/blog/accuracy-of-decimal-computations">Accuracy of decimal computations</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/blog/why-bundle-size-is-important">Why bundle size is important?</a></li><li class="sidebarItem__DBe"><a aria-current="page" class="sidebarItemLink_mo7H sidebarItemLinkActive_I1ZP" href="/blog/prometheus-cardinality-issues">Prometheus Is Out Of Memory. Again.</a></li><li class="sidebarItem__DBe"><a class="sidebarItemLink_mo7H" href="/blog/searching-for-search-engines">The never ending search a search engine 2022-01 edition</a></li></ul></nav></aside><main class="col col--7" itemscope="" itemtype="https://schema.org/Blog"><article itemprop="blogPost" itemscope="" itemtype="https://schema.org/BlogPosting"><meta itemprop="description" content="The Annoyance"><header><h1 class="title_f1Hy" itemprop="headline">Prometheus Is Out Of Memory. 
Again.</h1><div class="container_mt6G margin-vert--md"><time datetime="2022-01-25T00:00:00.000Z" itemprop="datePublished">January 25, 2022</time></div><div class="margin-top--md margin-bottom--sm row"><div class="col col--6 authorCol_Hf19"><div class="avatar margin-bottom--sm"><a href="https://github.com/smartinov" target="_blank" rel="noopener noreferrer" class="avatar__photo-link"><img class="avatar__photo" src="https://github.com/smartinov.png" alt="Stefan Martinov" itemprop="image"></a><div class="avatar__intro" itemprop="author" itemscope="" itemtype="https://schema.org/Person"><div class="avatar__name"><a href="https://github.com/smartinov" target="_blank" rel="noopener noreferrer" itemprop="url"><span itemprop="name">Stefan Martinov</span></a></div><small class="avatar__subtitle" itemprop="description">Memelord</small></div></div></div></div></header><div id="__blog-post-container" class="markdown" itemprop="articleBody"><h2 id="the-annoyance">The Annoyance</h2>
<p>So, we&#x27;ve all been there. You go to your trusty grafana, search for some sweet metrics that you implemented and WHAM!
Prometheus returns us a 503, a trusty way of saying I&#x27;m not ready, and I&#x27;m probably going to die soon.
And since we&#x27;re running in kubernetes I&#x27;m going to die soon, again and again.
And you&#x27;re getting reports from your colleagues that prometheus is not responding.
And you can&#x27;t ignore them anymore.</p>
<p><img alt="Bummer." src="/assets/images/bummer-e80d471cba23d1ee83e8463187845893.webp" width="480" height="270"></p>
<h2 id="the-problem">The Problem</h2>
<p>All right, let&#x27;s check what&#x27;s happening to the little guy.</p>
<pre><code class="language-bash">kubectl get pods -n monitoring
prometheus-prometheus-kube-prometheus-prometheus-0 1/2 Running 4 5m
</code></pre>
<p>It seems like it&#x27;s stuck in the running state, where the container is not yet ready.
Let&#x27;s describe the pod to check out what&#x27;s happening.</p>
<pre><code class="language-yaml"> State: Running │
Started: Wed, 12 Jan 2022 15:12:49 +0100 │
Last State: Terminated │
Reason: OOMKilled │
Exit Code: 137 │
Started: Tue, 11 Jan 2022 17:14:41 +0100 │
Finished: Wed, 12 Jan 2022 15:12:47 +0100 │
</code></pre>
<p>So we see that the prometheus is in a running state waiting for the readiness probe to trigger, probably working on recovering from Write Ahead Log (WAL).
This could be an issue where prometheus is recovering from an error or a restart and does not have enough memory to replay everything from the WAL.
We could be running into an issue where we set the memory requests/limits lower than prometheus requires, and the container keeps getting OOM-killed for exceeding its memory limit.</p>
<p>For this case, we could give it more memory to work with to see if it recovers. We should also analyze why the prometheus WAL is getting clogged up.</p>
<p>In essence, we want to check what has changed so that we suddenly have a high memory spike in our sweet, sweet environment.</p>
<h2 id="the-source">The Source</h2>
<p><img alt="Cardinality" src="/assets/images/cardinality-5f722655c50c25a6a91c53884ad19677.webp" width="501" height="500"></p>
<p>A lot of prometheus issues revolve around cardinality.
Memory spikes that break your deployment? Cardinality.
Prometheus dragging its feet like it&#x27;s Monday after the log4j (the second one ofc) zero day security breach? Cardinality.
Not getting that raise since you worked hard the past 16 years without wavering? You bet your ass it&#x27;s cardinality.
So, as you can see, many of life&#x27;s problems can be attributed to cardinality.</p>
<p>In short, the cardinality of a metric is the number of unique combinations of its label values.
For example, if our metric <code>http_request_total</code> had a label response code, and let&#x27;s say we support 8 status codes, our cardinality starts off at 8.
For good measure we want to record the HTTP verb for the request. We support <code>GET POST PUT HEAD</code> which would put the cardinality to 4*8=<strong>32</strong>.
Now, if someone adds a URL to the metric label (<strong>!!VERY BAD IDEA!!</strong>, but bear with me now) and we have 2 active pages, we&#x27;d have a cardinality of 2*4*8=<strong>64</strong>.
But, imagine someone starts scraping your website for potential vulnerabilities. Imagine all the URLs that will appear, most likely only once.</p>
<pre><code class="language-text">mywebsite.com/admin.php
mywebsite.com/wp/admin.php
mywebsite.com/?utm_source=GUID
...
</code></pre>
<p>This would blow up our cardinality to kingdom come. Like you will be out of memory faster than &quot;<a href="https://www.reddit.com/r/ProgrammerHumor/comments/a483yz/those_javascript_devs/">a new super awesome Javascript gamechanger framework</a>&quot; is born.
Or to quote user <a href="https://www.reddit.com/user/naveen17797/">naveen17797</a> <em>Scientists predict the number of js frameworks may exceed human population by 2020,at that point of time random string generators will be used to name those frameworks.</em></p>
<p>The point of this story is, be very mindful of how you use labels and cardinality in prometheus, since that will indeed have a great impact on your prometheus performance.</p>
<h2 id="the-solution">The Solution</h2>
<p>Since this has never happened to me (never-ever) I found the following solution to be handy.
Since we can&#x27;t get prometheus up and running to utilize PromQL to detect the potential issues, we have to find another way to detect high cardinality.
Therefore, we might want to get our hands dirty with some <code>kubectl exec -it -n monitoring pods/prometheus-prometheus-kube-prometheus-prometheus-0 -- sh</code>, and run the prometheus <code>tsdb</code> analysis tool.</p>
<pre><code class="language-bash">/prometheus $ promtool tsdb analyze .
</code></pre>
<p>Which produced the following result:</p>
<pre><code class="language-text">&gt; Block ID: 01FT8E8YY4THHZ2S7C3G04GJMG
&gt; Duration: 1h59m59.997s
&gt; Series: 564171
&gt; Label names: 285
&gt; Postings (unique label pairs): 21139
&gt; Postings entries (total label pairs): 6423664
&gt;
&gt; ...
&gt;
&gt; Highest cardinality metric names:
&gt; 11340 haproxy_server_http_responses_total
&gt; ...
</code></pre>
<p>We see the potential issue here, where the <code>haproxy_server_http_responses_total</code> metric has a super-high cardinality that keeps growing.
We need to deal with it, so that our prometheus instance can breathe again. In this particular case, the solution was updating the haproxy.</p>
<p>... or burn it, up to you.</p>
<p><img alt="Flame Thrower" src="/assets/images/flame-thrower-56bcf89132356ff53c03ca029d9d0746.webp" width="1440" height="1080"></p>
<h2 id="the-further-reading">The Further Reading</h2>
<ol>
<li><a href="https://github.com/prometheus/prometheus/blob/main/tsdb/docs/format/wal.md">WAL Definition</a></li>
<li><a href="https://ganeshvernekar.com/blog/prometheus-tsdb-wal-and-checkpoint/">WAL &amp; Checkpoints</a></li>
<li><a href="https://www.robustperception.io/using-tsdb-analyze-to-investigate-churn-and-cardinality">Using TSDB</a></li>
<li><a href="https://www.robustperception.io/which-are-my-biggest-metrics">Biggest Metrics</a></li>
<li><a href="https://www.robustperception.io/cardinality-is-key">Cardinality</a></li>
</ol></div><footer class="row docusaurus-mt-lg blogPostFooterDetailsFull_mRVl"><div class="col"><b>Tags:</b><ul class="tags_jXut padding--none margin-left--sm"><li class="tag_QGVx"><a class="tag_zVej tagRegular_sFm0" href="/blog/tags/prometheus">prometheus</a></li><li class="tag_QGVx"><a class="tag_zVej tagRegular_sFm0" href="/blog/tags/cardinality">cardinality</a></li><li class="tag_QGVx"><a class="tag_zVej tagRegular_sFm0" href="/blog/tags/devops">devops</a></li><li class="tag_QGVx"><a class="tag_zVej tagRegular_sFm0" href="/blog/tags/ops">ops</a></li><li class="tag_QGVx"><a class="tag_zVej tagRegular_sFm0" href="/blog/tags/k-8-s">k8s</a></li><li class="tag_QGVx"><a class="tag_zVej tagRegular_sFm0" href="/blog/tags/oom">oom</a></li><li class="tag_QGVx"><a class="tag_zVej tagRegular_sFm0" href="/blog/tags/memory">memory</a></li></ul></div><div class="col margin-top--sm"><a href="https://github.com/foomo/foomo-docs/tree/main/foomo/blog/2022-01-25-prometheus-cardinality-issues/index.mdx" target="_blank" rel="noopener noreferrer" class="theme-edit-this-page"><svg fill="currentColor" height="20" width="20" viewBox="0 0 40 40" class="iconEdit_Z9Sw" aria-hidden="true"><g><path d="m34.5 11.7l-3 3.1-6.3-6.3 3.1-3q0.5-0.5 1.2-0.5t1.1 0.5l3.9 3.9q0.5 0.4 0.5 1.1t-0.5 1.2z m-29.5 17.1l18.4-18.5 6.3 6.3-18.4 18.4h-6.3v-6.2z"></path></g></svg>Edit this page</a></div></footer></article><nav class="pagination-nav docusaurus-mt-lg" aria-label="Blog post page navigation"><a class="pagination-nav__link pagination-nav__link--prev" href="/blog/why-bundle-size-is-important"><div class="pagination-nav__sublabel">Newer Post</div><div class="pagination-nav__label">Why bundle size is important?</div></a><a class="pagination-nav__link pagination-nav__link--next" href="/blog/searching-for-search-engines"><div class="pagination-nav__sublabel">Older Post</div><div class="pagination-nav__label">The never ending search a search engine 2022-01 edition</div></a></nav></main><div class="col 
col--2"><div class="tableOfContents_bqdL thin-scrollbar"><ul class="table-of-contents table-of-contents__left-border"><li><a href="#the-annoyance" class="table-of-contents__link toc-highlight">The Annoyance</a></li><li><a href="#the-problem" class="table-of-contents__link toc-highlight">The Problem</a></li><li><a href="#the-source" class="table-of-contents__link toc-highlight">The Source</a></li><li><a href="#the-solution" class="table-of-contents__link toc-highlight">The Solution</a></li><li><a href="#the-further-reading" class="table-of-contents__link toc-highlight">The Further Reading</a></li></ul></div></div></div></div></div><footer class="footer"><div class="container container-fluid"><div class="row footer__links"><div class="col footer__col"><div class="footer__title">github</div><ul class="footer__items clean-list"><li class="footer__item"><a href="https://github.com/foomo" target="_blank" rel="noopener noreferrer" class="footer__link-item">https://github.com/foomo</a></li></ul></div><div class="col footer__col"><div class="footer__title">legal</div><ul class="footer__items clean-list"><li class="footer__item"><a class="footer__link-item" href="/etc/imprint">Imprint</a></li></ul></div></div><div class="footer__bottom text--center"><div class="footer__copyright">© 2024 bestbytes</div></div></div></footer></div>
</body>
</html>