I read a little about threading and cache lines and realized I could improve my algorithm for drawing the rainbow-colored donut in my Wayland example. Instead of passing each pixel individually to the threads in the thread pool, I now feed a whole row to a thread and let it work the entire row. The theory is that the row stays in the L1 cache, so moving to the next pixel doesn’t require fetching anything from RAM. Working each pixel individually means threads end up writing to pixels that share a cache line, so the cores keep invalidating each other’s cached copies (false sharing) and more time goes to keeping the caches coherent. I’m still learning how to read KCachegrind’s breakdown, but the difference was immediately apparent visually.
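To put a number on it, here is a tiny stand-alone check I find helpful (my own sketch, not part of the example). It prints how many 32-bit ARGB pixels share one cache line, falling back to an assumed 64-byte line when the standard interference-size constant isn’t available; on a typical x86-64 machine that’s 16 pixels, so per-pixel tasks routinely put neighboring threads on the same line.
#include <cstddef>
#include <iostream>
#include <new> // std::hardware_destructive_interference_size

int main()
{
    constexpr std::size_t pixelBytes=sizeof(unsigned int); // one ARGB pixel in the shared-memory buffer
#ifdef __cpp_lib_hardware_interference_size
    constexpr std::size_t lineBytes=std::hardware_destructive_interference_size;
#else
    constexpr std::size_t lineBytes=64; // assumed cache line size on typical x86-64
#endif
    std::cout << "pixels per cache line: " << lineBytes/pixelBytes << '\n';
    // Per-pixel tasks let neighboring pixels in the same line land on different
    // threads, so every store bounces the line between cores (false sharing).
    // Per-row tasks keep a whole row of WIDTH pixels on a single thread.
}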
The change was small.
--- loopy_ring_old.cpp 2026-01-04 13:30:53.855125512 -0500
+++ loopy_ring_cache.cpp 2026-01-04 13:29:55.792113977 -0500
@@ -137,14 +137,16 @@
};
// populate the thread pool task queue
- std::queue<std::function<void()>> shaders;
+ std::queue<std::vector<std::function<void()>>> shaders;
auto threadCount=std::thread::hardware_concurrency(); // use maximum number of threads
for (int y=0; y < HEIGHT; y++)
{
+ std::vector<std::function<void()>> row;
for (int x=0; x < WIDTH; x++)
{
- shaders.push(std::bind(Donut,x,y)); // store call to lambda with the pixel position it will be working on
+ row.push_back(std::bind(Donut,x,y)); // store call to lambda with the pixel position it will be working on
}
+ shaders.push(row);
}
// create the "shaders" that will run the tasks from the task queue
@@ -155,12 +157,15 @@
// critical section - checking size of queue and removing an item cannot be done concurrently
while (!shaders.empty())
{
- auto Program=shaders.front();
+ auto row=shaders.front();
shaders.pop();
// end critical section
shaderQueueMutex.unlock(); // unlocks A once, then B each subsequent iteration
- Program(); // run the shader program (lambda) on this shader (thread)
+ for (auto &shader : row)
+ {
+ shader(); // run the shader program (lambda) on this shader (thread)
+ }
shaderQueueMutex.lock(); // lock B
}
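One thing to note about the diff: each row in the queue is still a vector holding one std::function per pixel, so the queue ends up with WIDTH*HEIGHT bound calls. A leaner variant (just a sketch, not what the code above does) would queue one task per row and loop over x inside it; Donut, WIDTH, and HEIGHT here are the names from the full listing below.
std::queue<std::function<void()>> shaders;
for (int y=0; y < HEIGHT; y++)
{
    // one task per row; the row's pixels are still written contiguously by a single thread
    shaders.push([Donut,y]() {
        for (int x=0; x < WIDTH; x++)
        {
            Donut(x,y);
        }
    });
}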
Below is the Draw() function called by Wayland’s frame callback. The pixel buffer is passed in as the user data.
void Draw(void *userData,wl_callback *callback,std::uint32_t time)
{
wl_callback_destroy(callback);
unsigned int *pixelData=reinterpret_cast<unsigned int*>(userData);
auto now=std::chrono::steady_clock::now();
static std::chrono::duration<float> elapsed(0);
elapsed+=now-lastTick;
lastTick=now;
glm::vec2 r(static_cast<float>(WIDTH),static_cast<float>(HEIGHT)); // twigl's r, same as iResolution in ShaderToy
// lambda that will be run by a task in the thread pool queue for each pixel
// this is essentially our shader program
auto Donut=[&r,pixelData](int x,int y) {
glm::vec4 fragmentCoordinate(static_cast<float>(x),static_cast<float>(y),1.0f,1.0f);
glm::vec4 o(0.0f);
auto h=[](glm::vec3 c)->glm::vec3 {
glm::vec3 r=glm::clamp(glm::abs(glm::mod(c.x*6.0f+glm::vec3(0.0f,4.0f,2.0f),6.0f)-3.0f)-1.0f,0.0f,1.0f);
return c.z+c.y*(r-0.5f)*(1.0f-glm::abs(2.0f*c.z-1.0f));
};
glm::vec2 u=(fragmentCoordinate.xy()-r.xy()*0.5f)/r.y; // the .xy() swizzles need GLM_FORCE_SWIZZLE defined before including glm
o=glm::vec4(0.0f);
for(float i=0.0f; i < 0.99; i+=0.01f)
{
glm::vec2 v=u+glm::vec2(glm::sin(i*6.28318f)*0.25f*glm::sin((elapsed.count())*0.6f),glm::cos(i*6.28318f)*0.25f*glm::cos((elapsed.count())/2*0.6f));
float c=glm::smoothstep(0.03f,0.0f,glm::abs(glm::length(v)-0.1f));
o+=c*glm::vec4(h(glm::vec3(i,0.8f,0.6f)),1.0f)*0.08f;
}
int index=(HEIGHT-y-1)*WIDTH+x; // draw bottom up since gl_FragCoord uses Cartesian coordinates, not screen coordinates
unsigned int &argb=static_cast<unsigned int*>(pixelData)[index];
argb=0;
argb|=static_cast<unsigned int>(255) << 24;
argb|=static_cast<unsigned int>(std::clamp(o.r*255,0.0f,255.0f)) << 16;
argb|=static_cast<unsigned int>(std::clamp(o.g*255,0.0f,255.0f)) << 8;
argb|=static_cast<unsigned int>(std::clamp(o.b*255,0.0f,255.0f));
};
// populate the thread pool task queue
std::queue<std::vector<std::function<void()>>> shaders;
auto threadCount=std::thread::hardware_concurrency(); // use maximum number of threads
for (int y=0; y < HEIGHT; y++)
{
std::vector<std::function<void()>> row;
for (int x=0; x < WIDTH; x++)
{
row.push_back(std::bind(Donut,x,y)); // store call to lambda with the pixel position it will be working on
}
shaders.push(row);
}
// create the "shaders" that will run the tasks from the task queue
std::mutex shaderQueueMutex;
auto Shader=[&shaders,&shaderQueueMutex]() {
shaderQueueMutex.lock(); // lock A
// critical section - checking size of queue and removing an item cannot be done concurrently
while (!shaders.empty())
{
auto row=shaders.front();
shaders.pop();
// end critical section
shaderQueueMutex.unlock(); // unlocks A once, then B each subsequent iteration
for (auto &shader : row)
{
shader(); // run the shader program (lambda) on this shader (thread)
}
shaderQueueMutex.lock(); // lock B
}
shaderQueueMutex.unlock(); // release the last lock taken (A if the queue was already empty, otherwise B)
};
// create and start the threads that will act as shaders
std::vector<std::jthread> threads;
for (int threadNumber=0; threadNumber < threadCount; threadNumber++)
{
threads.emplace_back(Shader); // creating a std::jthread starts it
}
for (auto &thread : threads)
{
thread.join(); // wait for all threads to finish emptying the queue
}
wl_callback *frame=wl_surface_frame(surface);
wl_callback_add_listener(frame,&FrameDispatch,userData); // set callback for next frame request
wl_surface_attach(surface,sharedMemoryBuffer,0,0);
wl_surface_damage(surface,0,0,WIDTH,HEIGHT);
wl_surface_commit(surface);
}
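For reference, here is roughly what the listing assumes is declared elsewhere in the file. These are my guesses reconstructed from the calls above (the sizes and setup are placeholders, not the actual globals):
// assumed surrounding declarations (the real values come from the Wayland setup code)
// (wayland-client.h, <chrono>, <cstdint> and <glm/glm.hpp> assumed included)
constexpr int WIDTH=800;   // placeholder resolution
constexpr int HEIGHT=600;
wl_surface *surface=nullptr;              // created during window setup
wl_buffer *sharedMemoryBuffer=nullptr;    // wl_shm buffer backing the pixel data
std::chrono::steady_clock::time_point lastTick=std::chrono::steady_clock::now();
void Draw(void *userData,wl_callback *callback,std::uint32_t time);
const wl_callback_listener FrameDispatch={Draw}; // "done" handler; fires Draw() for the next frame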