[施工完成] CSAPP Cachelab

背景

CSAPP:3e 的配套实验 地址 分成了两个部分,第一部分是模拟一下cache的miss,hit,evict的规则。第二部分是优化一个矩阵的转置,使得miss尽可能少。

PART A

给了一个标程csim-ref,要求实现一个程序,输出与该标程一致。

写的时候太心急了。。导致没看完作业要求就开始写了。。 然后就纠结了好久。。如果要访问的地址超过了一个block line的边界该怎么办。。

然而实际上题目里已经把这种情况排除了。。

For this this lab, you should assume that memory accesses are aligned properly, such that a single memory access never crosses block boundaries. By making this assumption, you can ignore the request sizes in the valgrind traces

这样的话。。就没什么难度了。。 LRU的替换策略用纯c去撸hashmap+list有点烦。。。干脆就写了暴力的实现。。反正是模拟题(x

  1
  2#include <getopt.h>
  3#include <stdbool.h>
  4#include <stdio.h>
  5#include <stdlib.h>
  6#include <string.h>
  7#include <unistd.h>
  8
  9#include "cachelab.h"
 10
 11const unsigned int address_space_size = 64;
 12// https://stackoverflow.com/questions/9642732/parsing-command-line-arguments-in-c
 13
 14typedef struct {
 15  bool verbose;
 16  int set_index_bits;
 17  int lines_per_set;
 18  int block_bits;
 19  char *tracefile;
 20
 21} Arg;
 22
 23void ParseArgs(int argc, char **argv, Arg *arg) {
 24  int opt;
 25  while ((opt = getopt(argc, argv, "hvs:E:b:t:")) != -1) {
 26    switch (opt) {
 27      case 'h':
 28        printf("Usage: ./csim-ref [-hv] -s <s> -E <E> -b <b> -t <tracefile>\n");
 29        break;
 30      case 'v':
 31        arg->verbose = true;
 32        break;
 33      case 's':
 34        arg->set_index_bits = atoi(optarg);
 35        break;
 36      case 'E':
 37        arg->lines_per_set = atoi(optarg);
 38        break;
 39      case 'b':
 40        arg->block_bits = atoi(optarg);
 41        break;
 42      case 't':
 43        arg->tracefile = optarg;
 44        break;
 45      default:
 46        abort();
 47    }
 48  }
 49}
 50
 51typedef unsigned long long ULL;
 52// 注意: 地址是十六进制表示。。。debug 到去世。。
 53typedef struct {
 54  ULL tag_bits;
 55  ULL index_bits;
 56  ULL offset_bits;
 57  ULL block_number;
 58
 59} AddressBits;
 60// 64 bit address,m=64
 61// tag bit t = m-(b+s)
 62// parse 64 address bits into tag bits,index bits and offset bits
 63void ParseAddress(const Arg *arg, const ULL raw_addr, AddressBits *addr) {
 64  const unsigned int index_and_block_bit_length =
 65      arg->block_bits + arg->set_index_bits;
 66  ULL block_bit_mask = (1ULL << arg->block_bits) - 1;
 67  ULL index_bit_mask = ((1ULL << arg->set_index_bits) - 1) << arg->block_bits;
 68  ULL tag_bit_mask =
 69      ((1ULL << (address_space_size - index_and_block_bit_length)) - 1)
 70      << index_and_block_bit_length;
 71  // printf("index_and_block_bit_length:%d\n",index_and_block_bit_length);
 72  // printf("mask:%lld %lld %lld\n", block_bit_mask, index_bit_mask,
 73  // tag_bit_mask);
 74  addr->tag_bits = (raw_addr & tag_bit_mask) >> index_and_block_bit_length;
 75  addr->index_bits = (raw_addr & index_bit_mask) >> arg->block_bits;
 76  addr->offset_bits = raw_addr & block_bit_mask;
 77  addr->block_number = raw_addr / (1ULL << arg->block_bits);
 78  // printf("addr:%llx tag:%lld index:%lld osset:%lld block:%lld \n", raw_addr,
 79  //        addr->tag_bits, addr->index_bits, addr->offset_bits,
 80  //        addr->block_number);
 81}
 82void TestParseAddress(const Arg *arg) {
 83  for (int addr = 0; addr < (1ULL << address_space_size); addr++) {
 84    AddressBits tmp_addr;
 85    ParseAddress(arg, addr, &tmp_addr);
 86  }
 87}
 88typedef struct {
 89  enum operation { instruction_load, data_load, data_store, data_modify } op;
 90  ULL address;
 91  int size;
 92} Trace;
 93#define MAX_NUM_OF_TRACE 300000
 94Trace trace[MAX_NUM_OF_TRACE];
 95int lines_in_trace_file = 0;
 96void ParseTraceFile(const char *file) {
 97  FILE *fptr = fopen(file, "r");
 98  if (fptr == NULL) {
 99    printf("Error!");
100    exit(1);
101  }
102
103  char op[5];
104  ULL addr;
105  int cnt = 0, siz;
106  while (fscanf(fptr, "%s %llx,%d", op, &addr, &siz) != EOF) {
107    // printf("op:%s addr:%lld size:%d\n", op, addr, siz);
108    trace[cnt].address = addr;
109    trace[cnt].size = siz;
110    if (op[0] == 'L') {
111      trace[cnt].op = data_load;
112    } else if (op[0] == 'I') {
113      trace[cnt].op = instruction_load;
114    } else if (op[0] == 'S') {
115      trace[cnt].op = data_store;
116    } else if (op[0] == 'M') {
117      trace[cnt].op = data_modify;
118    }
119    cnt++;
120  }
121  lines_in_trace_file = cnt;
122
123  fclose(fptr);
124}
125
126void TestParseTraceFile() {
127  char *file = "direct-mapped-4-bit.trace";
128  ParseTraceFile(file);
129}
130
131// 不需要真的做cache,只需要模拟hit,miss,evition 的次数即可
132
133typedef struct {
134  unsigned int valid;
135  unsigned int tag;
136  // used for LRU policy
137  // use line number of trace file as age
138  // the smallest age is the least recent used line
139  unsigned int age;
140} Line;
141
142typedef struct {
143  int num_of_lines;
144  Line *lines;
145} Set;
146
147Set *set;
148struct cache_summary_t {
149  int hits;
150  int misses;
151  int evitions;
152} cache_summary;
153char operation_table[5] = {'I', 'L', 'S', 'M'};
154
155bool CheckCacheLineHit(const AddressBits addr, Set *cur_set, const int index) {
156  bool any_of_line_hit = false;
157  for (int j = 0; j < cur_set->num_of_lines; j++) {
158    Line *cur_line = &cur_set->lines[j];
159    if (cur_line->valid == 1 && cur_line->tag == addr.tag_bits) {
160      // printf("addr.tag_bits:%lld\n", addr.tag_bits);
161      cache_summary.hits++;
162      cur_line->age = index;
163      any_of_line_hit = true;
164      break;
165    }
166  }
167  return any_of_line_hit;
168}
169
170bool CheckCacheLineEmpty(const AddressBits addr, Set *cur_set,
171                         const int index) {
172  bool any_of_line_empty = false;
173  for (int j = 0; j < cur_set->num_of_lines; j++) {
174    Line *cur_line = &cur_set->lines[j];
175    if (cur_line->valid == 0) {
176      cache_summary.misses++;
177      cur_line->valid = 1;
178      cur_line->tag = addr.tag_bits;
179      cur_line->age = index;
180
181      any_of_line_empty = true;
182      break;
183    }
184  }
185  return any_of_line_empty;
186}
187
188void NaiveLru(const AddressBits addr, Set *cur_set, const int index) {
189  int evict_line;
190  int min_age = lines_in_trace_file;
191  // printf("set[%lld]->num_of_lines:%d\n",addr.index_bits,cur_set->num_of_lines);
192  for (int j = 0; j < cur_set->num_of_lines; j++) {
193    Line *cur_line = &cur_set->lines[j];
194    // printf("cur_line age:%d\n",cur_line->age);
195    if (cur_line->age < min_age) {
196      min_age = cur_line->age;
197      evict_line = j;
198    }
199  }
200  // printf("evict_line:%d min_age:%d\n",evict_line,min_age);
201  cache_summary.misses++;
202  cache_summary.evitions++;
203  cur_set->lines[evict_line].tag = addr.tag_bits;
204  cur_set->lines[evict_line].age = index;
205}
206
207void EvicteCacheLine(const AddressBits addr, Set *cur_set, const int index) {
208  // I'm too lazy to use Doubly linked list and hashmap
209  NaiveLru(addr, cur_set, index);
210}
211
212void SimulateCache(const Arg *arg) {
213  memset(&cache_summary, 0, sizeof(cache_summary));
214  //   instruction_load, data_load, data_store, data_modify
215
216  for (int i = 0; i < lines_in_trace_file; i++) {
217    // ignore instruction load
218    if (trace[i].op == 0) continue;
219    AddressBits addr;
220    ParseAddress(arg, trace[i].address, &addr);
221    Set *cur_set = &set[addr.index_bits];
222    // data load or data store
223    if (trace[i].op <= 2) {
224      bool any_of_line_hit = CheckCacheLineHit(addr, cur_set, i);
225      if (any_of_line_hit) {
226        if (arg->verbose) {
227          printf("%c %lld,%d hit\n", operation_table[trace[i].op],
228                 trace[i].address, trace[i].size);
229        }
230        continue;
231      }
232      bool any_of_line_empty = CheckCacheLineEmpty(addr, cur_set, i);
233      if (any_of_line_empty) {
234        if (arg->verbose) {
235          printf("%c %lld,%d miss\n", operation_table[trace[i].op],
236                 trace[i].address, trace[i].size);
237        }
238        continue;
239      }
240      EvicteCacheLine(addr, cur_set, i);
241      if (arg->verbose) {
242        printf("%c %lld,%d miss eviction\n", operation_table[trace[i].op],
243               trace[i].address, trace[i].size);
244      }
245    } else {
246      // data modify
247      bool any_of_line_hit_in_data_load = CheckCacheLineHit(addr, cur_set, i);
248      if (any_of_line_hit_in_data_load) {
249        // if 'data load' hit cache, then 'data store' must also hit cache
250        cache_summary.hits++;
251        if (arg->verbose) {
252          printf("%c %lld,%d hit hit \n", operation_table[trace[i].op],
253                 trace[i].address, trace[i].size);
254        }
255        continue;
256      }
257      bool any_of_line_empty = CheckCacheLineEmpty(addr, cur_set, i);
258      if (any_of_line_empty) {
259        cache_summary.hits++;
260        if (arg->verbose) {
261          printf("%c %lld,%d miss hit\n", operation_table[trace[i].op],
262                 trace[i].address, trace[i].size);
263        }
264        continue;
265      }
266      EvicteCacheLine(addr, cur_set, i);
267      if (arg->verbose) {
268        cache_summary.hits++;
269        printf("%c %lld,%d miss eviction hit\n", operation_table[trace[i].op],
270               trace[i].address, trace[i].size);
271      }
272    }
273  }
274}
275int main(int argc, char **argv) {
276  Arg arg;
277  ParseArgs(argc, argv, &arg);
278  ParseTraceFile(arg.tracefile);
279  int total_set_num = 1 << arg.set_index_bits;
280  // printf("total_set_num:%d\n",total_set_num);
281  set = (Set *)malloc(total_set_num * sizeof(Set));
282  memset(set, 0, total_set_num * sizeof(Set));
283  for (int i = 0; i < total_set_num; i++) {
284    set[i].lines = (Line *)malloc(arg.lines_per_set * sizeof(Line));
285    memset(set[i].lines, 0, arg.lines_per_set * sizeof(Line));
286    set[i].num_of_lines = arg.lines_per_set;
287  }
288  SimulateCache(&arg);
289  printSummary(cache_summary.hits, cache_summary.misses,
290               cache_summary.evitions);
291
292  for (int i = 0; i < total_set_num; i++) {
293    free(set[i].lines);
294  }
295  free(set);
296
297  return 0;
298}
299
300

PART B

三个小题。。矩阵的size分别是:

  • 32*32
  • 64*64
  • 61*67

很重要的一点是。。输入就是这三组固定值。。。题目是允许根据不同的输入。。来用不同的实现应对的。

32*32

 1
 2int tmp, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 3    for (int i = 0; i < N; i += 8) {
 4      for (int j = 0; j < M; j++) {
 5        tmp = A[i][j];
 6        tmp1 = A[i + 1][j];
 7        tmp2 = A[i + 2][j];
 8        tmp3 = A[i + 3][j];
 9        tmp4 = A[i + 4][j];
10        tmp5 = A[i + 5][j];
11        tmp6 = A[i + 6][j];
12        tmp7 = A[i + 7][j];
13        B[j][i] = tmp;
14        B[j][i + 1] = tmp1;
15        B[j][i + 2] = tmp2;
16        B[j][i + 3] = tmp3;
17        B[j][i + 4] = tmp4;
18        B[j][i + 5] = tmp5;
19        B[j][i + 6] = tmp6;
20        B[j][i + 7] = tmp7;
21      }
22    }
23

61*67

 1
 2 int tmp[8];
 3    for (int block_i_start = 0; block_i_start < N; block_i_start += 8) {
 4      for (int block_j_start = 0; block_j_start < M; block_j_start += 8) {
 5        for (int i = block_i_start; i < min(block_i_start + 8, N); i++) {
 6          for (int j = block_j_start; j < min(M, block_j_start + 8); j++) {
 7            tmp[j - block_j_start] = A[i][j];
 8          }
 9          for (int j = block_j_start; j < min(M, block_j_start + 8); j++) {
10            B[j][i] = tmp[j - block_j_start];
11          }
12         
13        }
14      }
15    }

64*64

略难。。留个坑。。日后填

最终分数

64*64的没拿到分数。。其他部分是满分

1
2Cache Lab summary:
3                        Points   Max pts      Misses
4Csim correctness          27.0        27
5Trans perf 32x32           8.0         8         287
6Trans perf 64x64           0.0         8        4723
7Trans perf 61x67          10.0        10        1997
8          Total points    45.0        53
9

Posts in this Series