Cache Lines and Donuts

I read a little about threading and cache lines and realized I could improve the algorithm for drawing the rainbow-colored donut in my Wayland example. Instead of handing each pixel to the thread pool as its own task, I now hand a thread an entire row and let it work through the whole row. The theory is that the row stays in the L1 cache, so moving to the next pixel doesn’t require another fetch from RAM. With per-pixel tasks, threads end up writing pixels that sit next to each other on the same cache line, so the line keeps getting invalidated, and every pixel also costs a trip through the queue mutex instead of one trip per row. I’m still learning how to read KCachegrind’s breakdown, but the difference was immediately apparent visually.
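For a rough sense of scale: on a typical 64-byte cache line, the 4-byte ARGB pixels land 16 to a line, so with per-pixel tasks up to 16 neighboring writes can be bouncing a single line between cores. Here is a minimal sketch for checking those numbers on your own machine, assuming a standard library new enough to ship std::hardware_destructive_interference_size (it is C++17, but only recent toolchains provide it, and the value is a hint rather than a hardware guarantee):

#include <cstddef>
#include <iostream>
#include <new>

int main()
{
	// cache line size the standard library assumes for this target
	constexpr std::size_t lineSize=std::hardware_destructive_interference_size;
	constexpr std::size_t pixelSize=sizeof(unsigned int); // one ARGB pixel in the shared-memory buffer

	std::cout << lineSize << "-byte line / " << pixelSize << "-byte pixel = " << lineSize/pixelSize << " pixels per cache line\n";
}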

[Embedded comparison of the two versions: Pixels (per-pixel tasks) and Rows (per-row tasks)]

The change was small.

--- loopy_ring_old.cpp	2026-01-04 13:30:53.855125512 -0500
+++ loopy_ring_cache.cpp	2026-01-04 13:29:55.792113977 -0500
@@ -137,14 +137,16 @@
 	};
 
 	// populate the thread pool task queue
-	std::queue<std::function<void()>> shaders;
+	std::queue<std::vector<std::function<void()>>> shaders;
 	auto threadCount=std::thread::hardware_concurrency(); // use maximum number of threads
 	for (int y=0; y < HEIGHT; y++)
 	{
+		std::vector<std::function<void()>> row;
 		for (int x=0; x < WIDTH; x++)
 		{
-			shaders.push(std::bind(Donut,x,y)); // store call to lambda with the pixel position it will be working on
+			row.push_back(std::bind(Donut,x,y)); // store call to lambda with the pixel position it will be working on
 		}
+		shaders.push(row);
 	}
 
 	// create the "shaders" that will run the tasks from the task queue
@@ -155,12 +157,15 @@
 		// critical section - checking size of queue and removing an item cannot be done concurrently
 		while (!shaders.empty())
 		{
-			auto Program=shaders.front();
+			auto row=shaders.front();
 			shaders.pop();
 			// end critical section
 			shaderQueueMutex.unlock(); // unlocks A once, then B each subsequent iteration
 
-			Program(); // run the shader program (lambda) on this shader (thread)
+			for (auto &shader : row)
+			{
+				shader(); // run the shader program (lambda) on this shader (thread)
+			}
 
 			shaderQueueMutex.lock(); // lock B
 		}

Below is the Draw() function registered as the handler for Wayland’s frame callback. The pixel buffer is passed in as the user data.

void Draw(void *userData,wl_callback *callback,std::uint32_t time)
{
	wl_callback_destroy(callback);

	unsigned int *pixelData=reinterpret_cast<unsigned int*>(userData);

	auto now=std::chrono::steady_clock::now();
	static std::chrono::duration<float> elapsed(0);
	elapsed+=now-lastTick;
	lastTick=now;

	glm::vec2 r(static_cast<float>(WIDTH),static_cast<float>(HEIGHT)); // twigl's r, same as iResolution in ShaderToy
	// lambda that will be run by a task in the thread pool queue for each pixel
	// this is essentially our shader program
	auto Donut=[&r,pixelData](int x,int y) {
		glm::vec4 fragmentCoordinate(static_cast<float>(x),static_cast<float>(y),1.0f,1.0f);
		glm::vec4 o(0.0f);

		auto h=[](glm::vec3 c)->glm::vec3 {
			glm::vec3 r=glm::clamp(glm::abs(glm::mod(c.x*6.0f+glm::vec3(0.0f,4.0f,2.0f),6.0f)-3.0f)-1.0f,0.0f,1.0f);
			return c.z+c.y*(r-0.5f)*(1.0f-glm::abs(2.0f*c.z-1.0f));
		};
		glm::vec2 u=(fragmentCoordinate.xy()-r.xy()*0.5f)/r.y;
		o=glm::vec4(0.0f);
		for(float i=0.0f; i < 0.99; i+=0.01f)
		{
			glm::vec2 v=u+glm::vec2(glm::sin(i*6.28318f)*0.25f*glm::sin((elapsed.count())*0.6f),glm::cos(i*6.28318f)*0.25f*glm::cos((elapsed.count())/2*0.6f));
			float c=glm::smoothstep(0.03f,0.0f,glm::abs(glm::length(v)-0.1f));
			o+=c*glm::vec4(h(glm::vec3(i,0.8f,0.6f)),1.0f)*0.08f;
		}

		int index=(HEIGHT-y-1)*WIDTH+x; // draw bottom up since gl_FragCoord uses Cartesian coordinates, not screen coordinates
		unsigned int &argb=pixelData[index];
		argb=0;
		argb|=static_cast<unsigned int>(255) << 24;
		argb|=static_cast<unsigned int>(std::clamp(o.r*255,0.0f,255.0f)) << 16;
		argb|=static_cast<unsigned int>(std::clamp(o.g*255,0.0f,255.0f)) << 8;
		argb|=static_cast<unsigned int>(std::clamp(o.b*255,0.0f,255.0f));
	};

	// populate the thread pool task queue
	std::queue<std::vector<std::function<void()>>> shaders;
	auto threadCount=std::thread::hardware_concurrency(); // use maximum number of threads
	for (int y=0; y < HEIGHT; y++)
	{
		std::vector<std::function<void()>> row;
		for (int x=0; x < WIDTH; x++)
		{
			row.push_back(std::bind(Donut,x,y)); // store call to lambda with the pixel position it will be working on
		}
		shaders.push(row);
	}

	// create the "shaders" that will run the tasks from the task queue
	std::mutex shaderQueueMutex;
	auto Shader=[&shaders,&shaderQueueMutex]() {
		shaderQueueMutex.lock(); // lock A
		// critical section - checking size of queue and removing an item cannot be done concurrently
		while (!shaders.empty())
		{
			auto row=shaders.front();
			shaders.pop();
			// end critical section
			shaderQueueMutex.unlock(); // unlocks A once, then B each subsequent iteration

			for (auto &shader : row)
			{
				shader(); // run the shader program (lambda) on this shader (thread)
			}

			shaderQueueMutex.lock(); // lock B
		}
		shaderQueueMutex.unlock(); // unlocks B since iteration ends before unlocking
	};

	// create and start the threads that will act as shaders
	std::vector<std::jthread> threads;
	for (unsigned int threadNumber=0; threadNumber < threadCount; threadNumber++)
	{
		threads.emplace_back(Shader); // creating a std::jthread starts it
	}
	for (auto &thread : threads)
	{
		thread.join(); // wait for all threads to finish emptying the queue
	}

	wl_callback *frame=wl_surface_frame(surface);
	wl_callback_add_listener(frame,&FrameDispatch,userData); // set callback for next frame request

	wl_surface_attach(surface,sharedMemoryBuffer,0,0);
	wl_surface_damage(surface,0,0,WIDTH,HEIGHT);
	wl_surface_commit(surface);
}

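Since a row is fully described by its y value, the queue of std::function objects is not strictly necessary either. Here is a minimal sketch, not the code from this example, of the same per-row split where the threads claim rows from an atomic counter instead of a mutex-guarded queue; it assumes the Donut lambda, WIDTH, and HEIGHT from the listing above plus an extra #include <atomic>.

	// hand out rows with an atomic counter: nothing to build up front and no mutex to take per row
	std::atomic<int> nextRow(0);
	auto Shader=[&nextRow,&Donut]() {
		for (int y=nextRow.fetch_add(1); y < HEIGHT; y=nextRow.fetch_add(1))
		{
			for (int x=0; x < WIDTH; x++)
			{
				Donut(x,y); // the whole row stays on one thread, so its pixels stay warm in that core's cache
			}
		}
	};

	std::vector<std::jthread> threads;
	for (unsigned int threadNumber=0; threadNumber < std::thread::hardware_concurrency(); threadNumber++)
	{
		threads.emplace_back(Shader);
	}
	for (auto &thread : threads)
	{
		thread.join(); // the buffer must be complete before the surface is committed
	}

No std::function objects are built per frame, and the only shared state the workers touch is a single counter, which keeps the synchronization cost at one atomic increment per row.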