Skip to content

CS 21 25.2 Laboratory Exercise 10

Cache-Level Performance Optimization

Overview

Using a cache simulator, you will be optimizing a given C program to reduce its L1 data cache miss rate.

General Instructions

For this laboratory activity, you are to work on one HOPELEx Checkpoint Item that will aid in your understanding of the RISC-V pipelined processor.

At the end of the HOPELEx Checkpoints are take-home exercises called TakeHOPE items. These are not graded and will not be submitted, but will help you prepare for the HOPE.

HOPELEx Checkpoint

Checkpoint Task: Image Transposition

You will be changing the naive image transposition algorithm that improves its memory locality characteristics while retaining its asymptotic runtime characteristics.

The code below loads the pixel data of a .ppm image file into a matrix, transposes the matrix, then saves the output as output.ppm. Doing this results in the input image being flipped on the diagonal connecting its top-left and bottom-right corners.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

#define OUTPUT_PATH "output.ppm"

void transpose_basic(uint8_t *data, uint32_t width, uint32_t height) {
    uint8_t *temp = calloc(width * height * 3, 1);
    if (!temp) {
        fprintf(stderr, "Memory allocation failed\n");
        exit(1);
    }

    for (uint32_t rgb = 0; rgb < 3; rgb++) {
        for (uint32_t c = 0; c < width; c++) {
            for (uint32_t r = 0; r < height; r++) {
                temp[c * height * 3 + r * 3 + rgb] =
                    data[r * width * 3 + c * 3 + rgb];
            }
        }
    }

    memcpy(data, temp, width * height * 3);
    free(temp);
}

void transpose_improved(uint8_t *data,
                        uint32_t width,
                        uint32_t height) {
    // Your code here
}

void skip_comment_lines(FILE *fp) {
    int ch;

    while ((ch = fgetc(fp)) == '#') {
        fscanf(fp, " %*[^\n]\n");
    }

    ungetc(ch, fp);
}

int read_header(FILE *in,
                uint32_t *width,
                uint32_t *height,
                uint32_t *max_value) {
    char magic[3];

    if (fread(magic, 1, 3, in) != 3 || strncmp(magic, "P6\n", 3) != 0) {
        fprintf(stderr, "Invalid or missing P6 header\n");
        return -1;
    }

    skip_comment_lines(in);

    if (fscanf(in, " %u %u", width, height) != 2) {
        fprintf(stderr, "Failed to read image dimensions\n");
        return -1;
    }

    skip_comment_lines(in);

    if (fscanf(in, " %u%*c", max_value) != 1) {
        fprintf(stderr, "Failed to read max value\n");
        return -1;
    }

    return 0;
}

uint8_t *read_image_data(FILE *in, size_t size) {
    uint8_t *data = malloc(size);
    if (!data) {
        fprintf(stderr, "Failed to allocate image buffer\n");
        return NULL;
    }

    size_t read_bytes = fread(data, 1, size, in);
    if (read_bytes != size) {
        fprintf(stderr, "Image read error (%zu/%zu bytes)\n", read_bytes, size);
        free(data);
        return NULL;
    }

    return data;
}

int write_image(const char *path,
                uint8_t *data,
                uint32_t width,
                uint32_t height,
                uint32_t max_value) {
    FILE *out = fopen(path, "wb");
    if (!out) {
        fprintf(stderr, "Error opening output file\n");
        return -1;
    }

    fprintf(out, "P6\n%u %u\n%u\n", height, width, max_value);
    fwrite(data, 1, width * height * 3, out);

    fclose(out);

    return 0;
}

int main(int argc, char *argv[]) {
    if (argc != 3) {
        fprintf(stderr, "Usage: %s <filename> <0|blocksize>\n", argv[0]);
        return -1;
    }

    const char *input_path = argv[1];
    int mode = atoi(argv[2]);

    FILE *in = fopen(input_path, "rb");
    if (!in) {
        fprintf(stderr, "Error reading %s\n", input_path);
        return -1;
    }

    uint32_t width, height, max_value;

    if (read_header(in, &width, &height, &max_value) != 0) {
        fclose(in);
        return -1;
    }

    printf("%u x %u image with max value %u\n", width, height, max_value);

    size_t data_size = (size_t)width * height * 3;
    uint8_t *data = read_image_data(in, data_size);
    fclose(in);

    if (!data) {
        return -1;
    }

    if (mode == 0) {
        transpose_basic(data, width, height);
    } else {
        transpose_improved(data, width, height);
    }

    if (write_image(OUTPUT_PATH, data, width, height, max_value) != 0) {
        free(data);
        return -1;
    }

    free(data);
}

Save the code above as lab10.c.

You are expected to compile lab10.c with the -O2 optimization flag:

gcc -O2 -g lab10.c -o lab10

Download the following .ppm files and place them in the same directory as lab10:

lab10 is executed by passing in the path to the .ppm as its first argument and its execution mode as the second argument:

./lab10 <path> <mode>

A 0 for the execution mode runs the naive transposition algorithm (transpose_basic) while a nonzero integer uses the code in transpose_improved which you are to implement.

As an example, earth.ppm can be transposed using transpose_basic as follows:

./lab10 earth.ppm 0

In contrast, the command below uses transpose_improved to transpose 4k.ppm:

./lab10 4k.ppm 1

Each pixel of a PPM (Portable Pix Map) image is represented using three integers in the range \([0,255]\) that describe the intensity of its red, green, and blue components (laid out contiguously in that order). Pixel data is ordered in memory by row from left to right.

You are to use the cachegrind tool to simulate a single-level 32K cache with separate instruction and data caches. This may be done as follows:

valgrind \
  --tool=cachegrind \
  --cache-sim=yes \
  --I1=32768,8,64 \
  --D1=32768,8,64 \
  --LL=33554432,16,64 \
  ./lab10 <path> <mode>

Implement transpose_improved such that the L1 data cache exhibits at most a 4% miss rate (combined reads and writes).

Solutions that artificially inflate the hit rate will not be considered.