_ZN12CAssignArray11ColumnOrderEv:
.LFB1413:
        pushl   %ebp
.LCFI18:
        movl    %esp, %ebp
.LCFI19:
        subl    $16, %esp
.LCFI20:
        movl    $0, -12(%ebp)
        jmp     .L16
.L17:
        movl    $0, -8(%ebp)
        jmp     .L18
.L19:
        movl    $0, -4(%ebp)
        jmp     .L20
.L21:
        movl    8(%ebp), %eax
        movl    4(%eax), %edx
        movl    -4(%ebp), %eax
        sall    $2, %eax
        leal    (%edx,%eax), %eax
        movl    (%eax), %edx
        movl    -8(%ebp), %eax
        sall    $2, %eax
        leal    (%edx,%eax), %eax
        movl    (%eax), %edx
        movl    -12(%ebp), %eax
        sall    $2, %eax
        leal    (%edx,%eax), %eax
        movl    $0, (%eax)
        addl    $1, -4(%ebp)
.L20:
        movl    8(%ebp), %eax
        movl    (%eax), %eax
        cmpl    -4(%ebp), %eax
        jg      .L21
        addl    $1, -8(%ebp)
.L18:
        movl    8(%ebp), %eax
        movl    (%eax), %eax
        cmpl    -8(%ebp), %eax
        jg      .L19
        addl    $1, -12(%ebp)
.L16:
        movl    8(%ebp), %eax
        movl    (%eax), %eax
        cmpl    -12(%ebp), %eax
        jg      .L17
        leave
        ret
_ZN12CAssignArray8RowOrderEv:
.LFB1414:
        pushl   %ebp
.LCFI21:
        movl    %esp, %ebp
.LCFI22:
        subl    $16, %esp
.LCFI23:
        movl    $0, -12(%ebp)
        jmp     .L27
.L28:
        movl    $0, -8(%ebp)
        jmp     .L29
.L30:
        movl    $0, -4(%ebp)
        jmp     .L31
.L32:
        movl    8(%ebp), %eax
        movl    4(%eax), %edx
        movl    -12(%ebp), %eax
        sall    $2, %eax
        leal    (%edx,%eax), %eax
        movl    (%eax), %edx
        movl    -8(%ebp), %eax
        sall    $2, %eax
        leal    (%edx,%eax), %eax
        movl    (%eax), %edx
        movl    -4(%ebp), %eax
        sall    $2, %eax
        leal    (%edx,%eax), %eax
        movl    $0, (%eax)
        addl    $1, -4(%ebp)
.L31:
        movl    8(%ebp), %eax
        movl    (%eax), %eax
        cmpl    -4(%ebp), %eax
        jg      .L32
        addl    $1, -8(%ebp)
.L29:
        movl    8(%ebp), %eax
        movl    (%eax), %eax
        cmpl    -8(%ebp), %eax
        jg      .L30
        addl    $1, -12(%ebp)
.L27:
        movl    8(%ebp), %eax
        movl    (%eax), %eax
        cmpl    -12(%ebp), %eax
        jg      .L28
        leave
        ret
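The two listings differ only in which loop counter supplies which subscript of a triple-nested store. Reading the offsets off the this pointer (the loop bound at offset 0, an int*** at offset 4), a minimal C++ reconstruction looks like the following; the member names m_size and m_data are invented for illustration, not taken from the original source:

// NOTE: member names are guesses inferred from the assembly offsets.
class CAssignArray {
    int    m_size;   // offset 0: extent of each of the three dimensions
    int*** m_data;   // offset 4: m_size x m_size x m_size array of int
public:
    void ColumnOrder();
    void RowOrder();
};

// Innermost counter supplies the FIRST subscript: every store chases two
// fresh pointers and lands in a different block of ints (poor locality).
void CAssignArray::ColumnOrder() {
    for (int i = 0; i < m_size; ++i)
        for (int j = 0; j < m_size; ++j)
            for (int k = 0; k < m_size; ++k)
                m_data[k][j][i] = 0;
}

// Innermost counter supplies the LAST subscript: consecutive stores hit
// consecutive addresses within one contiguous row of ints (good locality).
void CAssignArray::RowOrder() {
    for (int i = 0; i < m_size; ++i)
        for (int j = 0; j < m_size; ++j)
            for (int k = 0; k < m_size; ++k)
                m_data[i][j][k] = 0;
}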
Both traversals were profiled with Cachegrind. The startup banner below is from the ColumnOrder run (process 28047); the RowOrder run is process 28063. Note the tool's own warning that the L2 figures on this machine should be ignored.

/valgrind --tool=cachegrind a.out
==28047== Cachegrind, an I1/D1/L2 cache profiler.
==28047== Copyright (C) 2002-2006, and GNU GPL'd, by Nicholas Nethercote et al.
==28047== Using LibVEX rev 1658, a library for dynamic binary translation.
==28047== Copyright (C) 2004-2006, and GNU GPL'd, by OpenWorks LLP.
==28047== Using valgrind-3.2.1, a dynamic binary instrumentation framework.
==28047== Copyright (C) 2000-2006, and GNU GPL'd, by Julian Seward et al.
==28047== For more details, rerun with: -v
==28047==
--28047-- warning: Unknown Intel cache config value (0x4E), ignoring
--28047-- warning: L2 cache not installed, ignore L2 results.

ColumnOrder (process 28047):

==28047==
==28047== I   refs:      3,373,725,947
==28047== I1  misses:            2,018
==28047== L2i misses:              702
==28047== I1  miss rate:          0.00%
==28047== L2i miss rate:          0.00%
==28047==
==28047== D   refs:      2,168,879,033  (1,600,528,348 rd + 568,350,685 wr)
==28047== D1  misses:      963,907,096  (  490,989,806 rd + 472,917,290 wr)
==28047== L2d misses:      322,693,036  (    1,175,544 rd + 321,517,492 wr)
==28047== D1  miss rate:          44.4% (         30.6%   +        83.2%  )
==28047== L2d miss rate:          14.8% (          0.0%   +        56.5%  )
==28047==
==28047== L2 refs:         963,909,114  (  490,991,824 rd + 472,917,290 wr)
==28047== L2 misses:       322,693,738  (    1,176,246 rd + 321,517,492 wr)
==28047== L2 miss rate:            5.8% (          0.0%   +        56.5%  )

RowOrder (process 28063):

==28063==
==28063== I   refs:      2,429,099,153
==28063== I1  misses:            1,981
==28063== L2i misses:              706
==28063== I1  miss rate:          0.00%
==28063== L2i miss rate:          0.00%
==28063==
==28063== D   refs:      1,225,150,028  (656,799,718 rd + 568,350,310 wr)
==28063== D1  misses:       31,712,405  (  1,049,070 rd +  30,663,335 wr)
==28063== L2d misses:       26,214,103  (    662,414 rd +  25,551,689 wr)
==28063== D1  miss rate:           2.5% (        0.1%   +         5.3%  )
==28063== L2d miss rate:           2.1% (        0.1%   +         4.4%  )
==28063==
==28063== L2 refs:          31,714,386  (  1,051,051 rd +  30,663,335 wr)
==28063== L2 misses:        26,214,809  (    663,120 rd +  25,551,689 wr)
==28063== L2 miss rate:            0.7% (        0.0%   +         4.4%  )

The column-order traversal misses in D1 on 44.4% of its data references; the row-order traversal, which visits addresses consecutively, misses on only 2.5%.
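Each run also writes its counts to a file named cachegrind.out.<pid>. Feeding that file to cg_annotate breaks the misses down per function, which is one way to confirm that the D1 misses above come from the traversal routines:

cg_annotate cachegrind.out.28047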
Spatial and temporal locality example: matrix multiplication

A common example is matrix multiplication:

    for i in 0..n
        for j in 0..m
            for k in 0..p
                C[i][j] = C[i][j] + A[i][k] * B[k][j];

When the matrices are large, this loop order moves data through the cache wastefully: the innermost index k strides down a column of B, so successive accesses to B[k][j] land on different cache lines. Since memory is pulled up the hierarchy in blocks of consecutive addresses, in the C programming language it is advantageous to refer to several memory addresses that share the same row (spatial locality): with the row index fixed and the second (column) index changing rapidly, the addresses are visited consecutively, because C and C++ store matrices in row-major order.

Since j selects the column in both C and B, it should be iterated in the innermost loop; that keeps the row iterators i and k fixed while j moves across each column of a row. Switching the looping order of j and k does not change the mathematical result, but it improves efficiency, and for large matrix multiplications the speedup is dramatic. (Here 'large' means, approximately, more than 100,000 elements in each matrix, or enough memory that the matrices no longer fit in the L1 and L2 caches.)
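A minimal C++ sketch of the interchange, assuming row-major matrices held as vectors of rows (the names n, m, p, A, B, C follow the pseudocode above; everything else is an illustrative choice):

#include <vector>

// i-k-j order: A[i][k] is invariant inside the j loop, and both C[i][j]
// and B[k][j] are then accessed with unit stride along a row.
void multiply(int n, int m, int p,
              const std::vector<std::vector<double> >& A,  // n x p
              const std::vector<std::vector<double> >& B,  // p x m
              std::vector<std::vector<double> >& C)        // n x m, zero-initialized
{
    for (int i = 0; i < n; ++i)
        for (int k = 0; k < p; ++k) {
            const double a = A[i][k];   // fixed while j sweeps the row
            for (int j = 0; j < m; ++j)
                C[i][j] += a * B[k][j];
        }
}

Compared with the original i-j-k nest, only the order of the two inner loops changes; every multiply-add is still performed exactly once, so the result is identical.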