{"id":16554,"date":"2026-06-20T10:24:26","date_gmt":"2026-06-20T17:24:26","guid":{"rendered":"https:\/\/mattfife.com\/?p=16554"},"modified":"2026-06-19T10:31:15","modified_gmt":"2026-06-19T17:31:15","slug":"running-gemma-4-with-a-5090-on-llama-cpp","status":"publish","type":"post","link":"https:\/\/mattfife.com\/?p=16554","title":{"rendered":"Running Gemma 4 with a 5090 on llama.cpp"},"content":{"rendered":"\n<p class=\"wp-block-paragraph\">First, grab a trustworthy <a href=\"https:\/\/huggingface.co\/collections\/unsloth\/gemma-4\" data-type=\"link\" data-id=\"https:\/\/huggingface.co\/collections\/unsloth\/gemma-4\">Gemma 4 gguf model (unsloth is great)<\/a>. I have been fooling around with Q4, Q6, Q8 models of gemma-4-26B and gemma-4-31B models. <\/p>\n\n\n\n<figure class=\"wp-block-table\"><table class=\"has-fixed-layout\"><thead><tr><th>Property<\/th><th>E2B<\/th><th>E4B<\/th><th>31B Dense<\/th><\/tr><\/thead><tbody><tr><td><strong>Total Parameters<\/strong><\/td><td>2.3B effective (5.1B with embeddings)<\/td><td>4.5B effective (8B with embeddings)<\/td><td>30.7B<\/td><\/tr><tr><td><strong>Layers<\/strong><\/td><td>35<\/td><td>42<\/td><td>60<\/td><\/tr><tr><td><strong>Sliding Window<\/strong><\/td><td>512 tokens<\/td><td>512 tokens<\/td><td>1024 tokens<\/td><\/tr><tr><td><strong>Context Length<\/strong><\/td><td>128K tokens<\/td><td>128K tokens<\/td><td>256K tokens<\/td><\/tr><tr><td><strong>Vocabulary Size<\/strong><\/td><td>262K<\/td><td>262K<\/td><td>262K<\/td><\/tr><tr><td><strong>Supported Modalities<\/strong><\/td><td>Text, Image, Audio<\/td><td>Text, Image, Audio<\/td><td>Text, Image<\/td><\/tr><tr><td><strong>Vision Encoder Parameters<\/strong><\/td><td><em>~150M<\/em><\/td><td><em>~150M<\/em><\/td><td><em>~550M<\/em><\/td><\/tr><tr><td><strong>Audio Encoder Parameters<\/strong><\/td><td><em>~300M<\/em><\/td><td><em>~300M<\/em><\/td><td>No Audio<\/td><\/tr><\/tbody><\/table><\/figure>\n\n\n\n<p class=\"wp-block-paragraph\">Grab the latest version of <a 
href=\"https:\/\/github.com\/ggml-org\/llama.cpp\/blob\/master\/docs\/build.md\" data-type=\"link\" data-id=\"https:\/\/github.com\/ggml-org\/llama.cpp\/blob\/master\/docs\/build.md\">llama.cpp and compile it<\/a> with CUDA support for GPU usage (or CPU if you don&#8217;t have a CUDA-enabled GPU).<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">Then <a href=\"https:\/\/ai.google.dev\/gemma\/docs\/integrations\/llamacpp\" data-type=\"link\" data-id=\"https:\/\/ai.google.dev\/gemma\/docs\/integrations\/llamacpp\">set up your server command line<\/a><\/p>\n","protected":false},"excerpt":{"rendered":"<p>First, grab a trustworthy Gemma 4 gguf model (unsloth is great). I have been fooling around with Q4, Q6, Q8 models of gemma-4-26B and gemma-4-31B models. Property E2B E4B 31B Dense Total Parameters 2.3B effective (5.1B with embeddings) 4.5B effective (8B with embeddings) 30.7B Layers 35 42 60 Sliding Window 512 tokens 512 tokens 1024 tokens Context Length 128K tokens 128K tokens 256K tokens Vocabulary Size 262K 262K 262K Supported Modalities Text, Image, Audio Text, Image, Audio Text, Image Vision&#8230;<\/p>\n<p class=\"read-more\"><a class=\"btn btn-default\" href=\"https:\/\/mattfife.com\/?p=16554\"> Read More<span class=\"screen-reader-text\">  Read 
More<\/span><\/a><\/p>\n","protected":false},"author":2,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"_jetpack_newsletter_access":"","_jetpack_dont_email_post_to_subs":false,"_jetpack_newsletter_tier_id":0,"_jetpack_memberships_contains_paywalled_content":false,"_jetpack_memberships_contains_paid_content":false,"footnotes":"","jetpack_publicize_message":"","jetpack_publicize_feature_enabled":true,"jetpack_social_post_already_shared":true,"jetpack_social_options":{"image_generator_settings":{"template":"highway","default_image_id":0,"font":"","enabled":false},"version":2},"jetpack_post_was_ever_published":false},"categories":[28,9],"tags":[],"class_list":["post-16554","post","type-post","status-publish","format-standard","hentry","category-ai","category-cool"],"jetpack_publicize_connections":[],"jetpack_featured_media_url":"","jetpack_sharing_enabled":true,"jetpack_shortlink":"https:\/\/wp.me\/p4WECr-4j0","jetpack-related-posts":[],"_links":{"self":[{"href":"https:\/\/mattfife.com\/index.php?rest_route=\/wp\/v2\/posts\/16554","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/mattfife.com\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/mattfife.com\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/mattfife.com\/index.php?rest_route=\/wp\/v2\/users\/2"}],"replies":[{"embeddable":true,"href":"https:\/\/mattfife.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=16554"}],"version-history":[{"count":1,"href":"https:\/\/mattfife.com\/index.php?rest_route=\/wp\/v2\/posts\/16554\/revisions"}],"predecessor-version":[{"id":16555,"href":"https:\/\/mattfife.com\/index.php?rest_route=\/wp\/v2\/posts\/16554\/revisions\/16555"}],"wp:attachment":[{"href":"https:\/\/mattfife.com\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=16554"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/mattfife.com\/index.php
?rest_route=%2Fwp%2Fv2%2Fcategories&post=16554"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/mattfife.com\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=16554"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}