mirror of
https://github.com/foomo/foomo-docs.git
synced 2025-10-16 12:35:40 +00:00
38 lines
25 KiB
HTML
38 lines
25 KiB
HTML
<!doctype html>
|
||
<html lang="en" dir="ltr">
|
||
<head>
|
||
<meta charset="UTF-8">
|
||
<meta name="viewport" content="width=device-width,initial-scale=1">
|
||
<meta name="generator" content="Docusaurus v2.0.0-beta.14">
|
||
<link rel="alternate" type="application/rss+xml" href="/blog/rss.xml" title="foomo project docs RSS Feed">
|
||
<link rel="alternate" type="application/atom+xml" href="/blog/atom.xml" title="foomo project docs Atom Feed">
|
||
<link rel="search" type="application/opensearchdescription+xml" title="foomo project docs" href="/opensearch.xml"><title data-react-helmet="true">Prometheus Is Out Of Memory. Again. | foomo project docs</title><meta data-react-helmet="true" name="twitter:card" content="summary_large_image"><meta data-react-helmet="true" property="og:url" content="https://www.foomo.org/blog/prometheus-cardinality-issues"><meta data-react-helmet="true" name="docsearch:language" content="en"><meta data-react-helmet="true" name="docsearch:docusaurus_tag" content="default"><meta data-react-helmet="true" property="og:title" content="Prometheus Is Out Of Memory. Again. | foomo project docs"><meta data-react-helmet="true" name="description" content="The Annoyance"><meta data-react-helmet="true" property="og:description" content="The Annoyance"><meta data-react-helmet="true" property="og:type" content="article"><meta data-react-helmet="true" property="article:published_time" content="2022-01-25T00:00:00.000Z"><meta data-react-helmet="true" property="article:author" content="https://github.com/smartinov"><meta data-react-helmet="true" property="article:tag" content="prometheus,cardinality,devops,ops,k8s,oom,memory"><link data-react-helmet="true" rel="icon" href="/img/favicon.ico"><link data-react-helmet="true" rel="canonical" href="https://www.foomo.org/blog/prometheus-cardinality-issues"><link data-react-helmet="true" rel="alternate" href="https://www.foomo.org/blog/prometheus-cardinality-issues" hreflang="en"><link data-react-helmet="true" rel="alternate" href="https://www.foomo.org/blog/prometheus-cardinality-issues" hreflang="x-default"><link data-react-helmet="true" rel="preconnect" href="https://SUATUVZDDM-dsn.algolia.net" crossorigin="anonymous"><link rel="stylesheet" href="/assets/css/styles.47738ace.css">
|
||
<link rel="preload" href="/assets/js/runtime~main.2340db96.js" as="script">
|
||
<link rel="preload" href="/assets/js/main.d79cb617.js" as="script">
|
||
</head>
|
||
<body>
|
||
<script>!function(){function t(t){document.documentElement.setAttribute("data-theme",t)}var e=function(){var t=null;try{t=localStorage.getItem("theme")}catch(t){}return t}();t(null!==e?e:"light")}()</script><div id="__docusaurus">
|
||
<div><a href="#" class="skipToContent_OuoZ">Skip to main content</a></div><nav class="navbar navbar--fixed-top"><div class="navbar__inner"><div class="navbar__items"><button aria-label="Navigation bar toggle" class="navbar__toggle clean-btn" type="button" tabindex="0"><svg width="30" height="30" viewBox="0 0 30 30" aria-hidden="true"><path stroke="currentColor" stroke-linecap="round" stroke-miterlimit="10" stroke-width="2" d="M4 7h22M4 15h22M4 23h22"></path></svg></button><a class="navbar__brand" href="/"><b class="navbar__title">foomo</b></a><a class="navbar__item navbar__link" href="/docs/general/intro">General</a><a class="navbar__item navbar__link" href="/docs/frontend/intro">Frontend</a><a class="navbar__item navbar__link" href="/docs/backend/intro">Backend</a><a class="navbar__item navbar__link" href="/docs/devops/intro">DevOps</a><a class="navbar__item navbar__link" href="/docs/project-management/intro">PM</a><a class="navbar__item navbar__link" href="/docs/projects/intro">Projects</a></div><div class="navbar__items navbar__items--right"><a aria-current="page" class="navbar__item navbar__link navbar__link--active" href="/blog">Blog</a><div class="searchBox_Utm0"><button type="button" class="DocSearch DocSearch-Button" aria-label="Search"><span class="DocSearch-Button-Container"><svg width="20" height="20" class="DocSearch-Search-Icon" viewBox="0 0 20 20"><path d="M14.386 14.386l4.0877 4.0877-4.0877-4.0877c-2.9418 2.9419-7.7115 2.9419-10.6533 0-2.9419-2.9418-2.9419-7.7115 0-10.6533 2.9418-2.9419 7.7115-2.9419 10.6533 0 2.9419 2.9418 2.9419 7.7115 0 10.6533z" stroke="currentColor" fill="none" fill-rule="evenodd" stroke-linecap="round" stroke-linejoin="round"></path></svg><span class="DocSearch-Button-Placeholder">Search</span></span><span class="DocSearch-Button-Keys"></span></button></div></div></div><div role="presentation" class="navbar-sidebar__backdrop"></div></nav><div class="main-wrapper blog-wrapper blog-post-page"><div class="container 
margin-vert--lg"><div class="row"><aside class="col col--3"><nav class="sidebar_q+wC thin-scrollbar" aria-label="Blog recent posts navigation"><div class="sidebarItemTitle_9G5K margin-bottom--md">Recent posts</div><ul class="sidebarItemList_6T4b"><li class="sidebarItem_cjdF"><a class="sidebarItemLink_zyXk" href="/blog/why-bundle-size-is-important">Why bundle size is important?</a></li><li class="sidebarItem_cjdF"><a aria-current="page" class="sidebarItemLink_zyXk sidebarItemLinkActive_wcJs" href="/blog/prometheus-cardinality-issues">Prometheus Is Out Of Memory. Again.</a></li><li class="sidebarItem_cjdF"><a class="sidebarItemLink_zyXk" href="/blog/searching-for-search-engines">The never ending search a search engine 2022-01 edition</a></li><li class="sidebarItem_cjdF"><a class="sidebarItemLink_zyXk" href="/blog/impact-of-3rd-party-scripts-on-performance">Impact of 3rd party scripts on performance</a></li><li class="sidebarItem_cjdF"><a class="sidebarItemLink_zyXk" href="/blog/debugging-go-map-races-in-k8s">debugging Go map races in k8s</a></li></ul></nav></aside><main class="col col--7" itemscope="" itemtype="http://schema.org/Blog"><article itemprop="blogPost" itemscope="" itemtype="http://schema.org/BlogPosting"><header><h1 class="blogPostTitle_d4p0" itemprop="headline">Prometheus Is Out Of Memory. 
Again.</h1><div class="blogPostData_-Im+ margin-vert--md"><time datetime="2022-01-25T00:00:00.000Z" itemprop="datePublished">January 25, 2022</time></div><div class="row margin-top--md margin-bottom--sm"><div class="col col--6 authorCol_8c0z"><div class="avatar margin-bottom--sm"><a href="https://github.com/smartinov" target="_blank" rel="noopener noreferrer" class="avatar__photo-link avatar__photo"><img class="image_9q7L" src="https://github.com/smartinov.png" alt="Stefan Martinov"></a><div class="avatar__intro" itemprop="author" itemscope="" itemtype="https://schema.org/Person"><div class="avatar__name"><a href="https://github.com/smartinov" target="_blank" rel="noopener noreferrer" itemprop="url"><span itemprop="name">Stefan Martinov</span></a></div><small class="avatar__subtitle" itemprop="description">Memelord</small></div></div></div></div></header><div class="markdown" itemprop="articleBody"><h2 class="anchor anchorWithStickyNavbar_y2LR" id="the-annoyance">The Annoyance<a class="hash-link" href="#the-annoyance" title="Direct link to heading"></a></h2><p>So, we've all been there. You go to your trusty grafana, search for some sweet metrics that you implemented and WHAM!
|
||
Prometheus returns us a 503, a trusty way of saying I'm not ready, and I'm probably going to die soon.
|
||
And since we're running in kubernetes I'm going to die soon, again and again.
|
||
And you're getting reports from your colleagues that prometheus is not responding.
|
||
And you can't ignore them anymore.</p><p><img alt="Bummer." src="/assets/images/bummer-e80d471cba23d1ee83e8463187845893.webp"></p><h2 class="anchor anchorWithStickyNavbar_y2LR" id="the-problem">The Problem<a class="hash-link" href="#the-problem" title="Direct link to heading"></a></h2><p>All right, let's check what's happening to the little guy.</p><div class="codeBlockContainer_J+bg language-bash theme-code-block"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#bfc7d5;background-color:#292d3e"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#bfc7d5"><span class="token plain">kubectl get pods -n monitoring</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain" style="display:inline-block"></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">prometheus-prometheus-kube-prometheus-prometheus-0 </span><span class="token number" style="color:rgb(247, 140, 108)">1</span><span class="token plain">/2 Running </span><span class="token number" style="color:rgb(247, 140, 108)">4</span><span class="token plain"> 5m</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>It seems like it's stuck in the running state, where the container is not yet ready.
|
||
Let's describe the deployment, to check out what's happening.</p><div class="codeBlockContainer_J+bg language-yaml theme-code-block"><div class="codeBlockContent_csEI yaml"><pre tabindex="0" class="prism-code language-yaml codeBlock_rtdJ thin-scrollbar" style="color:#bfc7d5;background-color:#292d3e"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">State</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> Running │</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">Started</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> Wed</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"> 12 Jan 2022 15</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain">12</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain">49 +0100 │</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">Last State</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> Terminated │</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">Reason</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> OOMKilled │</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">Exit Code</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> 137 │</span><br></span><span class="token-line" style="color:#bfc7d5"><span 
class="token plain"> </span><span class="token key atrule">Started</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> Tue</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"> 11 Jan 2022 17</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain">14</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain">41 +0100 │</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain"> </span><span class="token key atrule">Finished</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain"> Wed</span><span class="token punctuation" style="color:rgb(199, 146, 234)">,</span><span class="token plain"> 12 Jan 2022 15</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain">12</span><span class="token punctuation" style="color:rgb(199, 146, 234)">:</span><span class="token plain">47 +0100 │</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>So we see that the prometheus is in a running state waiting for the readiness probe to trigger, probably working on recovering from Write Ahead Log (WAL).
|
||
This could be an issue where prometheus is recovering from an error or a restart and does not have enough memory to replay everything from the WAL.
|
||
We could be running into an issue where we set the memory requests/limits lower than prometheus requires, and Kubernetes keeps killing prometheus for exceeding its memory limit.</p><p>For this case, we could give it more memory to work with to see if it recovers. We should also analyze why the prometheus WAL is getting clogged up.</p><p>In essence, we want to check what has changed so that we suddenly have a high memory spike in our sweet, sweet environment.</p><h2 class="anchor anchorWithStickyNavbar_y2LR" id="the-source">The Source<a class="hash-link" href="#the-source" title="Direct link to heading"></a></h2><p><img alt="Cardinality" src="/assets/images/cardinality-5f722655c50c25a6a91c53884ad19677.webp"></p><p>A lot of prometheus issues revolve around cardinality.
|
||
Memory spikes that break your deployment? Cardinality.
|
||
Prometheus dragging its feet like it's Monday after the log4j (the second one ofc) zero day security breach? Cardinality.
|
||
Not getting that raise since you worked hard the past 16 years without wavering? You bet your ass it's cardinality.
|
||
So, as you can see, many of life's problems can be attributed to cardinality.</p><p>In short, the cardinality of a metric is the number of unique combinations of its label values.
|
||
For example, if our metric <code>http_request_total</code> had a label for the response code, and let's say we support 8 status codes, our cardinality starts off at 8.
|
||
For good measure, we also want to record the HTTP verb for the request. We support <code>GET POST PUT HEAD</code>, which would bring the cardinality to 4<!-- -->*<!-- -->8=<strong>32</strong>.
|
||
Now, if someone adds a URL to the metric label (<strong>!!VERY BAD IDEA!!</strong>, but bear with me now) and we have 2 active pages, we'd have a cardinality of 2<!-- -->*<!-- -->4<!-- -->*<!-- -->8=<strong>64</strong>.
|
||
But, imagine someone starts scraping your website for potential vulnerabilities. Imagine all the URLs that will appear, most likely only once.</p><div class="codeBlockContainer_J+bg language-text theme-code-block"><div class="codeBlockContent_csEI text"><pre tabindex="0" class="prism-code language-text codeBlock_rtdJ thin-scrollbar" style="color:#bfc7d5;background-color:#292d3e"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#bfc7d5"><span class="token plain">mywebsite.com/admin.php</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">mywebsite.com/wp/admin.php</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">mywebsite.com/?utm_source=GUID</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">...</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>This would blow up our cardinality to kingdom come. Like you will be out of memory faster than "<a href="https://www.reddit.com/r/ProgrammerHumor/comments/a483yz/those_javascript_devs/" target="_blank" rel="noopener noreferrer">a new super awesome Javascript gamechanger framework</a>" is born.
|
||
Or, to quote user <a href="https://www.reddit.com/user/naveen17797/" target="_blank" rel="noopener noreferrer">naveen17797</a>: <em>Scientists predict the number of js frameworks may exceed human population by 2020,at that point of time random string generators will be used to name those frameworks.</em></p><p>The point of this story is: be very mindful of how you use labels and cardinality in prometheus, since that will indeed have a great impact on your prometheus performance.</p><h2 class="anchor anchorWithStickyNavbar_y2LR" id="the-solution">The Solution<a class="hash-link" href="#the-solution" title="Direct link to heading"></a></h2><p>Since this has never happened to me (never-ever) I found the following solution to be handy.
|
||
Since we can't get prometheus up and running to utilize PromQL to detect the potential issues, we have to find another way to detect high cardinality.
|
||
Therefore, we might want to get our hands dirty with some <code>kubectl exec -it -n monitoring pods/prometheus-prometheus-kube-prometheus-prometheus-0 -- sh</code>, and run the prometheus <code>tsdb</code> analysis tool.</p><div class="codeBlockContainer_J+bg language-bash theme-code-block"><div class="codeBlockContent_csEI bash"><pre tabindex="0" class="prism-code language-bash codeBlock_rtdJ thin-scrollbar" style="color:#bfc7d5;background-color:#292d3e"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#bfc7d5"><span class="token plain">/prometheus $ promtool tsdb analyze </span><span class="token builtin class-name" style="color:rgb(255, 203, 107)">.</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>Which produced the result.</p><div class="codeBlockContainer_J+bg language-text theme-code-block"><div class="codeBlockContent_csEI text"><pre tabindex="0" class="prism-code language-text codeBlock_rtdJ thin-scrollbar" style="color:#bfc7d5;background-color:#292d3e"><code class="codeBlockLines_1zSZ"><span class="token-line" style="color:#bfc7d5"><span class="token plain">> Block ID: 01FT8E8YY4THHZ2S7C3G04GJMG</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">> Duration: 1h59m59.997s</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">> Series: 564171</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">> Label names: 285</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">> Postings (unique label pairs): 21139</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">> Postings entries (total label pairs): 6423664</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">></span><br></span><span class="token-line" 
style="color:#bfc7d5"><span class="token plain">> ...</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">></span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">> Highest cardinality metric names:</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">> 11340 haproxy_server_http_responses_total</span><br></span><span class="token-line" style="color:#bfc7d5"><span class="token plain">> ...</span><br></span></code></pre><button type="button" aria-label="Copy code to clipboard" class="copyButton_M3SB clean-btn">Copy</button></div></div><p>We see the potential issue here, where the <code>haproxy_server_http_responses_total</code> metric is having a super-high cardinality which is growing.
|
||
We need to deal with it, so that our prometheus instance can breathe again. In this particular case, the solution was updating the haproxy.</p><p>... or burn it, up to you.</p><p><img alt="Flame Thrower" src="/assets/images/flame-thrower-56bcf89132356ff53c03ca029d9d0746.webp"></p><h2 class="anchor anchorWithStickyNavbar_y2LR" id="the-further-reading">The Further Reading<a class="hash-link" href="#the-further-reading" title="Direct link to heading"></a></h2><ol><li><a href="https://github.com/prometheus/prometheus/blob/main/tsdb/docs/format/wal.md" target="_blank" rel="noopener noreferrer">WAL Definition</a></li><li><a href="https://ganeshvernekar.com/blog/prometheus-tsdb-wal-and-checkpoint/" target="_blank" rel="noopener noreferrer">WAL & Checkpoints</a></li><li><a href="https://www.robustperception.io/using-tsdb-analyze-to-investigate-churn-and-cardinality" target="_blank" rel="noopener noreferrer">Using TSDB</a></li><li><a href="https://www.robustperception.io/which-are-my-biggest-metrics" target="_blank" rel="noopener noreferrer">Biggest Metrics</a></li><li><a href="https://www.robustperception.io/cardinality-is-key" target="_blank" rel="noopener noreferrer">Cardinality</a></li></ol></div><footer class="row docusaurus-mt-lg blogPostDetailsFull_xD8n"><div class="col"><b>Tags:</b><ul class="tags_NBRY padding--none margin-left--sm"><li class="tag_F03v"><a class="tag_WK-t tagRegular_LXbV" href="/blog/tags/prometheus">prometheus</a></li><li class="tag_F03v"><a class="tag_WK-t tagRegular_LXbV" href="/blog/tags/cardinality">cardinality</a></li><li class="tag_F03v"><a class="tag_WK-t tagRegular_LXbV" href="/blog/tags/devops">devops</a></li><li class="tag_F03v"><a class="tag_WK-t tagRegular_LXbV" href="/blog/tags/ops">ops</a></li><li class="tag_F03v"><a class="tag_WK-t tagRegular_LXbV" href="/blog/tags/k-8-s">k8s</a></li><li class="tag_F03v"><a class="tag_WK-t tagRegular_LXbV" href="/blog/tags/oom">oom</a></li><li class="tag_F03v"><a class="tag_WK-t tagRegular_LXbV" 
href="/blog/tags/memory">memory</a></li></ul></div><div class="col margin-top--sm"><a href="https://github.com/foomo/foomo-docs/tree/main/foomo/blog/2022-01-25-prometheus-cardinality-issues/index.mdx" target="_blank" rel="noreferrer noopener" class="theme-edit-this-page"><svg fill="currentColor" height="20" width="20" viewBox="0 0 40 40" class="iconEdit_mS5F" aria-hidden="true"><g><path d="m34.5 11.7l-3 3.1-6.3-6.3 3.1-3q0.5-0.5 1.2-0.5t1.1 0.5l3.9 3.9q0.5 0.4 0.5 1.1t-0.5 1.2z m-29.5 17.1l18.4-18.5 6.3 6.3-18.4 18.4h-6.3v-6.2z"></path></g></svg>Edit this page</a></div></footer></article><nav class="pagination-nav docusaurus-mt-lg" aria-label="Blog post page navigation"><div class="pagination-nav__item"><a class="pagination-nav__link" href="/blog/why-bundle-size-is-important"><div class="pagination-nav__sublabel">Newer Post</div><div class="pagination-nav__label">« <!-- -->Why bundle size is important?</div></a></div><div class="pagination-nav__item pagination-nav__item--next"><a class="pagination-nav__link" href="/blog/searching-for-search-engines"><div class="pagination-nav__sublabel">Older Post</div><div class="pagination-nav__label">The never ending search a search engine 2022-01 edition<!-- --> »</div></a></div></nav></main><div class="col col--2"><div class="tableOfContents_vrFS thin-scrollbar"><ul class="table-of-contents table-of-contents__left-border"><li><a href="#the-annoyance" class="table-of-contents__link toc-highlight">The Annoyance</a></li><li><a href="#the-problem" class="table-of-contents__link toc-highlight">The Problem</a></li><li><a href="#the-source" class="table-of-contents__link toc-highlight">The Source</a></li><li><a href="#the-solution" class="table-of-contents__link toc-highlight">The Solution</a></li><li><a href="#the-further-reading" class="table-of-contents__link toc-highlight">The Further Reading</a></li></ul></div></div></div></div></div><footer class="footer"><div class="container container-fluid"><div class="row 
footer__links"><div class="col footer__col"><div class="footer__title">legal</div><ul class="footer__items"><li class="footer__item"><a class="footer__link-item" href="/etc/imprint">Imprint</a></li></ul></div></div><div class="footer__bottom text--center"><div class="footer__copyright">© 2022 bestbytes</div></div></div></footer></div>
|
||
<script src="/assets/js/runtime~main.2340db96.js"></script>
|
||
<script src="/assets/js/main.d79cb617.js"></script>
|
||
</body>
|
||
</html> |