Saurav Muralidharan srvm

## cuda.h
/* Minimal declarations for CUDA support.  Testing purposes only. */

#define __constant__ __attribute__((constant))
#define __device__ __attribute__((device))
#define __global__ extern "C" __attribute__((global))
#define __host__ __attribute__((host))
#define __shared__ __attribute__((shared))
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
#define __forceinline__ __attribute__((always_inline))

## latency.txt
Latency Comparison Numbers (~2012)
----------------------------------
L1 cache reference                           0.5 ns
Branch mispredict                            5   ns
L2 cache reference                           7   ns                      14x L1 cache
Mutex lock/unlock                           25   ns
Main memory reference                      100   ns                      20x L2 cache, 200x L1 cache
Compress 1K bytes with Zippy             3,000   ns        3 us
Send 1K bytes over 1 Gbps network       10,000   ns       10 us
Read 4K randomly from SSD*             150,000   ns      150 us          ~1GB/sec SSD
	/* Minimal declarations for CUDA support. Testing purposes only. */

	#define __constant__ __attribute__((constant))
	#define __device__ __attribute__((device))
	#define __global__ extern "C" __attribute__((global))
	#define __host__ __attribute__((host))
	#define __shared__ __attribute__((shared))
	#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
	#define __forceinline__ __attribute__((always_inline))
	Latency Comparison Numbers (~2012)
	----------------------------------
	L1 cache reference 0.5 ns
	Branch mispredict 5 ns
	L2 cache reference 7 ns 14x L1 cache
	Mutex lock/unlock 25 ns
	Main memory reference 100 ns 20x L2 cache, 200x L1 cache
	Compress 1K bytes with Zippy 3,000 ns 3 us
	Send 1K bytes over 1 Gbps network 10,000 ns 10 us
	Read 4K randomly from SSD* 150,000 ns 150 us ~1GB/sec SSD