The Interconnect Topologies for Manycore Processors
Bidirectional Ring Topology
- As the number of cores N increases, the number of hops and hence the latency increases.
- A single bidirectional path to move data from one core to another can easily become a bottleneck as the data transfer load increases.
- Any link failure will cause the chip to become nonfunctional.
Two-Dimensional Mesh Topology
- The power efficiency of a 2D mesh network is relatively low compared to that of a ring network.
- The 2D mesh topology is nonuniform, because the cores at the edges and corners have fewer communication channels and hence less bandwidth available to them.
Two-Dimensional Torus Topology
Other Topologies
The Ring Interconnect Architecture in Intel Xeon Phi
L2 Cache
Tag Directory
Data Transactions
The Cache Coherency Protocol
L2 Cache State | State Definition |
---|---|
M | Modified: Cache line is modified relative to memory. Only one core can have a given line in M state at a time. |
E | Exclusive: Cache line is consistent with memory. Only one core can have a cache line in E state at a time. |
S | Shared: Cache line is shared and consistent with other cores, but may not be consistent with memory. Multiple cores can have a given cache line in S state at a time. |
I | Invalid: Cache line is not present in the core's L1 or L2. |
TD State | State Definition |
---|---|
GOLS | Globally owned locally shared: Cache line is present in one or more cores but inconsistent with memory (GDDR). |
GS | Globally shared: Cache line is present in one or more cores and consistent with memory. |
GE/GM | Globally exclusive/modified: Cache line is owned by one and only one core and may or may not be consistent with the memory. The TD does not know whether the core has modified the cache line or not. |
GI | Globally invalid: Cache line is not present in any core. |
Hardware Prefetcher
The Memory Controllers
Memory Transactions Flow
Cacheable Memory Read Transaction
Managing Cache Hierarchy in Software
Use the -opt-prefetch switch to tell the code generator to insert prefetch instructions in the code and the -opt-prefetch-distance switch to globally define the L1 and L2 prefetch distances. You can also tell the compiler not to generate prefetch instructions by setting the -no-opt-prefetch compiler switch or setting -opt-prefetch=0.
34 #include <stdio.h>
35 #include <stdlib.h>
36
37 #define SIZE 1000000
38 #define ITER 20
39
40
41 typedef struct pointVal { // record of four double fields: x, y, z and value
42 double x, y, z;
43 double value;
44 }POINT;
45
46 __declspec(align(256)) static POINT a[SIZE]; // file-scope array of SIZE POINT records, declared with an align(256) request
47
48 extern double elapsedTime (void); // timer defined outside this listing; presumably returns seconds, since main() prints MB/duration as MB/s -- confirm
49
50 int main() // gather benchmark: times ITER passes of tmp[j] += a[j].x over SIZE elements and prints MB/s
51 {
52 double startTime, duration, tmp[SIZE]; // tmp: SIZE doubles with automatic storage (no initializer here)
53 int i, j;
54 //initialize both operands of the timed loop
55 for( j=0; j<SIZE;j++){
56 a[j].x=0.1; tmp[j]=0.0; // tmp is accumulated into with += on line 63, so give it a defined starting value
57 }
58
59 startTime = elapsedTime();
60
61 for(i=0; i<ITER;i++) {
62 for( j=0; j<SIZE;j++){
63 tmp[j]+=a[j].x; // reads only the x field, so consecutive j are sizeof(POINT) bytes apart in a
64 }
65 }
66 duration = elapsedTime()-startTime;
67
68 double MB = SIZE*sizeof(double)/1e+6; // bytes in SIZE doubles, expressed in units of 1e6 bytes
69 double MBps = ITER*MB/duration; // counts one MB-sized sweep per outer iteration
70 printf("DP ArraySize = %lf MB, MB/s = %lf\n", MB, MBps);
71
72 return 0;
73 }
74
Command_prompt-host >icpc -mcmodel=medium -O3 -no-opt-prefetch -mmic -vec-report3 gather.cpp gettime.cpp -o gather.out
command_prompt-mic0 >./gather.out
DP ArraySize = 8.000000 MB, MB/s = 381.794093
command_prompt-mic0 >./gather.out
DP ArraySize = 8.000000 MB, MB/s = 484.896942
command_prompt-mic0 >
Use -opt-report-phase=hlo when building the code, so that the compiler will provide you diagnostics on where it generated the software prefetch instructions, if any:
Command_prompt-host >icc -mmic -vec-report3 -O3 -c -S -unroll0 -opt-report-phase=hlo gather.cpp
62 for( j=0; j<SIZE;j++){
63 tmp[j]+=a[j].x;
64 }
vgatherpf0dps for L1 and vgatherpf1dps for L2 prefetch.
170 vprefetch0 a(%rip) #63.22 c17
171 ..LN46:
172 vprefetch0 256+a(%rip) #63.22 c21
173 .align 16,0x90
174 ..LN47:
....
177 ..LN48:
178 kmov %k1, %k2 #63.22 c1
179 ..LN49:
180 vprefetche1 512(%rsp,%rcx,8) #63.14 c1
181 ..L13: #63.22
182 ..LN50:
183 vgatherdpd a(%rdx,%zmm1), %zmm3{%k2} #63.22
184 ..LN51:
185 jkzd ..L12, %k2 # Prob 50% #63.22
186 ..LN52:
187 vgatherdpd a(%rdx,%zmm1), %zmm3{%k2} #63.22
188 ..LN53:
189 jknzd ..L13, %k2 # Prob 50% #63.22
190 ..L12: #
191 ..LN54:
192 vaddpd (%rsp,%rcx,8), %zmm3, %zmm4
194 vprefetch0 256(%rsp,%rcx,8)
196 vprefetch1 2048+a(%rdx)
197 ..LN57:
When vgatherpf0dps misses both L1 and L2, the resulting prefetch in L1 is nontemporal, but the prefetch into L2 is a normal prefetch.
Probing the Memory Subsystem
Measuring the Memory Bandwidth on Intel Xeon Phi
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <omp.h>
37
38
39 #define SIZE (180*1024*1000)
40 #define ITER 20
41
42 __declspec(align(256)) static double a[SIZE], b[SIZE], c[SIZE]; // three file-scope arrays of SIZE doubles each, declared with an align(256) request
43
44
45 extern double elapsedTime (void); // timer defined outside this listing; presumably returns seconds, since main() prints GB/duration as GB/s -- confirm
46
47 int main() // bandwidth benchmark: times ITER passes of c[j]=a[j]*b[j]+c[j] over SIZE elements and prints thread count, array size and GB/s
48 {
49 double startTime, duration;
50 int i, j;
51
52 //initialize arrays: c to 0, a and b to 1; the pragma on the next line makes this an OpenMP parallel loop
53 #pragma omp parallel for
54 for (i=0; i<SIZE;i++)
55 {
56 c[i]=0.0f;
57 b[i]=a[i]=(double)1.0f;
58 }
59
60 //measure c = a*b+c performance; only the inner sweep carries the OpenMP pragma, the repeat loop over i does not
61 startTime = elapsedTime();
62 for(i=0; i<ITER;i++) {
63 #pragma omp parallel for
64 for( j=0; j<SIZE;j++){
65 c[j]=a[j]*b[j]+c[j]; // per element: loads a[j], b[j], c[j] and stores c[j]
66 }
67 }
68 duration = elapsedTime() - startTime;
69
70 double GB = SIZE*sizeof(double)/1e+9; // bytes in one SIZE-element double array, in units of 1e9 bytes
71 double GBps = 4*ITER*GB/duration; // NOTE(review): the factor 4 presumably counts the three array reads plus one array write per pass -- confirm
72 printf("Running %d openmp threads\n", omp_get_max_threads());
73 printf("DP ArraySize = %lf MB, GB/s = %lf\n", GB*1000, GBps); // GB*1000 rescales the array size for the MB label
74
75 return 0;
76 }
SIZE*sizeof(double)/1e+9;)
.4*GB*ITER/duration
icpc -mcmodel=medium -O3 -mmic -openmp -vec-report3 bw.cpp gettime.cpp -o bw.out
-mcmodel=medium, which tells the compiler to expect the data size to be above 2GB and handle that accordingly, as it is for this case. This compile command will generate a bw.out binary that can run on the Intel Xeon Phi coprocessor.
command-prompt-host>scp bw.out mic0:/tmp
command-prompt-host>scp /opt/intel/composerxe/lib/mic/libiomp5.so mic0:/tmp
Command-prompt-host > ssh mic0
Command-prompt-mic0 > export LD_LIBRARY_PATH=/tmp:$LD_LIBRARY_PATH;
Command_prompt-mic0> export OMP_NUM_THREADS=180
Command_prompt-mic0>./bw.out
Command_prompt-mic0> ./bw.out
Running 180 openmp threads
DP ArraySize = 1475.56 MB and GBs = 159.005
Command_prompt-mic0 >