Phase 1: The Initiation¶

We'll start by checking your GPU and printing basic architecture info using Python and pycuda.

First, install PyCUDA if you haven't already:

In [1]:

Copied!

!pip install pycuda
!pip install pycuda

Collecting pycuda
  Downloading pycuda-2025.1.tar.gz (1.7 MB)
     ---------------------------------------- 0.0/1.7 MB ? eta -:--:--
     ---------------------------------------- 1.7/1.7 MB 23.0 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting pytools>=2011.2 (from pycuda)
  Downloading pytools-2025.1.2-py3-none-any.whl.metadata (3.0 kB)
Requirement already satisfied: platformdirs>=2.2.0 in c:\users\iyeng\miniconda3\envs\gpu_mode\lib\site-packages (from pycuda) (4.3.7)
Collecting mako (from pycuda)
  Downloading mako-1.3.10-py3-none-any.whl.metadata (2.9 kB)
Requirement already satisfied: typing-extensions>=4.5 in c:\users\iyeng\miniconda3\envs\gpu_mode\lib\site-packages (from pytools>=2011.2->pycuda) (4.13.1)
Requirement already satisfied: MarkupSafe>=0.9.2 in c:\users\iyeng\miniconda3\envs\gpu_mode\lib\site-packages (from mako->pycuda) (3.0.2)
Downloading pytools-2025.1.2-py3-none-any.whl (92 kB)
Downloading mako-1.3.10-py3-none-any.whl (78 kB)
Building wheels for collected packages: pycuda
  Building wheel for pycuda (pyproject.toml): started
  Building wheel for pycuda (pyproject.toml): finished with status 'done'
  Created wheel for pycuda: filename=pycuda-2025.1-cp310-cp310-win_amd64.whl size=375812 sha256=0bc865bc5414920579b87239b7e27249eda2ef17f7b4f6426c5ae2e72c9ea971
  Stored in directory: c:\users\iyeng\appdata\local\pip\cache\wheels\65\53\5f\f5f184c26b7cc503acb77f3456531a6e1fac0ce30c774b9d82
Successfully built pycuda
Installing collected packages: pytools, mako, pycuda
Successfully installed mako-1.3.10 pycuda-2025.1 pytools-2025.1.2

Step 1. GPU Architecture¶

This will:

Identify your GPU
Show you SM count (streaming multiprocessors)
Show warp size
Show max block/grid/thread limits
Reveal SIMT-style hints (like max threads per block)

In [5]:

Copied!





import pycuda.driver as cuda
import pycuda.autoinit

device = cuda.Device(0)
attrs = device.get_attributes()

print(f"GPU Name: {device.name()}")
print(f"Total Memory: {device.total_memory() / (1024 ** 3):.2f} GB")
print("\n-- GPU Architecture Attributes --")

arch_attrs = {
    "MULTIPROCESSOR_COUNT": cuda.device_attribute.MULTIPROCESSOR_COUNT,
    "MAX_THREADS_PER_BLOCK": cuda.device_attribute.MAX_THREADS_PER_BLOCK,
    "WARP_SIZE": cuda.device_attribute.WARP_SIZE,
    "MAX_BLOCK_DIM_X": cuda.device_attribute.MAX_BLOCK_DIM_X,
    "MAX_GRID_DIM_X": cuda.device_attribute.MAX_GRID_DIM_X,
    "CLOCK_RATE (KHz)": cuda.device_attribute.CLOCK_RATE
}

for name, attr in arch_attrs.items():
    print(f"{name}: {attrs.get(attr)}")
import pycuda.driver as cuda
import pycuda.autoinit

device = cuda.Device(0)
attrs = device.get_attributes()

print(f"GPU Name: {device.name()}")
print(f"Total Memory: {device.total_memory() / (1024 ** 3):.2f} GB")
print("\n-- GPU Architecture Attributes --")

arch_attrs = {
    "MULTIPROCESSOR_COUNT": cuda.device_attribute.MULTIPROCESSOR_COUNT,
    "MAX_THREADS_PER_BLOCK": cuda.device_attribute.MAX_THREADS_PER_BLOCK,
    "WARP_SIZE": cuda.device_attribute.WARP_SIZE,
    "MAX_BLOCK_DIM_X": cuda.device_attribute.MAX_BLOCK_DIM_X,
    "MAX_GRID_DIM_X": cuda.device_attribute.MAX_GRID_DIM_X,
    "CLOCK_RATE (KHz)": cuda.device_attribute.CLOCK_RATE
}

for name, attr in arch_attrs.items():
    print(f"{name}: {attrs.get(attr)}")

GPU Name: NVIDIA GeForce RTX 4080 Laptop GPU
Total Memory: 11.99 GB

-- GPU Architecture Attributes --
MULTIPROCESSOR_COUNT: 58
MAX_THREADS_PER_BLOCK: 1024
WARP_SIZE: 32
MAX_BLOCK_DIM_X: 1024
MAX_GRID_DIM_X: 2147483647
CLOCK_RATE (KHz): 1830000

STEP 2: CUDA Programming Model — Threads, Blocks, Grids, Warps¶

Next, let’s illustrate how CUDA organizes parallelism using a kernel.

We’ll write a simple vector addition example that shows:

How threads are indexed within a block and grid
The relationship between blocks, threads, and warps

This step demonstrates:

The CUDA thread hierarchy (grid, block, thread)
How to index threads globally
The connection between Python and CUDA C code
How to compile and launch kernels from Python

In [10]:

Copied!





import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

N = 16

mod = SourceModule("""
__global__ void add_vectors(float *a, float *b, float *c, int N)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    
    if (idx < N) {
        c[idx] = a[idx] + b[idx];
    }
}
""")


a = np.random.randn(N).astype(np.float32)
b = np.random.randn(N).astype(np.float32)
c = np.empty_like(a)

a_gpu = cuda.mem_alloc(a.nbytes)
b_gpu = cuda.mem_alloc(b.nbytes)
c_gpu = cuda.mem_alloc(c.nbytes)

cuda.memcpy_htod(a_gpu, a)
cuda.memcpy_htod(b_gpu, b)

func = mod.get_function("add_vectors")

block_size = 4
grid_size = (N + block_size - 1) // block_size

func(a_gpu, b_gpu, c_gpu, np.int32(N), block=(block_size,1,1), grid=(grid_size,1))

cuda.memcpy_dtoh(c, c_gpu)

print(f"Vector A:\n{a}\n")
print(f"Vector B:\n{b}\n")
print(f"Vector C (A+B):\n{c}\n")
import numpy as np
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

N = 16

mod = SourceModule("""
__global__ void add_vectors(float *a, float *b, float *c, int N)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    
    if (idx < N) {
        c[idx] = a[idx] + b[idx];
    }
}
""")


a = np.random.randn(N).astype(np.float32)
b = np.random.randn(N).astype(np.float32)
c = np.empty_like(a)

a_gpu = cuda.mem_alloc(a.nbytes)
b_gpu = cuda.mem_alloc(b.nbytes)
c_gpu = cuda.mem_alloc(c.nbytes)

cuda.memcpy_htod(a_gpu, a)
cuda.memcpy_htod(b_gpu, b)

func = mod.get_function("add_vectors")

block_size = 4
grid_size = (N + block_size - 1) // block_size

func(a_gpu, b_gpu, c_gpu, np.int32(N), block=(block_size,1,1), grid=(grid_size,1))

cuda.memcpy_dtoh(c, c_gpu)

print(f"Vector A:\n{a}\n")
print(f"Vector B:\n{b}\n")
print(f"Vector C (A+B):\n{c}\n")

Vector A:
[-0.09272769  0.36310634 -1.4122794  -1.531028   -1.5436966  -0.4410738
  0.57584506  0.63177073  0.9921369  -1.0148718   1.544412   -0.6879888
  0.1384869   0.90717006  0.20168625  0.22363763]

Vector B:
[-1.0652711   0.12170894  1.071717    1.1530142   1.4780273  -1.2505566
 -1.101104   -0.8431008   0.32481927 -0.58225757 -1.535062   -1.5423752
 -0.65916175 -0.44161093 -0.46648252  1.8605511 ]

Vector C (A+B):
[-1.1579988   0.4848153  -0.34056234 -0.37801385 -0.0656693  -1.6916304
 -0.52525896 -0.21133006  1.3169562  -1.5971293   0.00935006 -2.230364
 -0.5206748   0.46555912 -0.26479626  2.0841887 ]

C:\Users\iyeng\AppData\Local\Temp\ipykernel_31004\3374425168.py:8: UserWarning: The CUDA compiler succeeded, but said the following:
kernel.cu

  mod = SourceModule("""

STEP 3: Compilation & Runtime — nvcc, .cu, Device vs Host Code¶

Goals:

Understand how CUDA code is compiled (separating host and device code)
See the relationship between .cu files, nvcc, and Python bindings
Compile a standalone .cu file and call it from Python

What’s Really Going On?¶

Host code: runs on the CPU (e.g., your Python or C++ control logic)

Device code: runs on the GPU (your global kernels)

nvcc separates and compiles them correctly, producing PTX or binary objects

PyCUDA uses SourceModule() which auto-calls nvcc under the hood (in memory)

Let's create a vector_add.cu file¶

// vector_add.cu
extern "C" __global__ void add_vectors(float *a, float *b, float *c, int N)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx < N) {
        c[idx] = a[idx] + b[idx];
    }
}

We’ll write a .cu file and compile it to a dynamic linked library (.dll) — then call it from Python using ctypes.¶

nvcc -shared -o vector_add.dll vector_add.cu

Note: That AttributeError: 'DeviceAllocation' object has no attribute 'handle' is because on Windows with PyCUDA, we don't use .handle to get the raw device pointer.

Instead, use the int() cast, which gives you the actual pointer address in integer form.

In [ ]:

Copied!





import numpy as np
import ctypes
import pycuda.driver as cuda

N = 16
block_size = 4
grid_size = (N + block_size - 1) // block_size

lib = ctypes.CDLL("./vector_add.dll")
print("LIB::", dir(lib)) #Check the attributes of lib variable in Python
# Define argument types for safety
lib.add_vectors.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int]

a = np.random.randn(N).astype(np.float32)
b = np.random.randn(N).astype(np.float32)
c = np.empty_like(a)

a_gpu = cuda.mem_alloc(a.nbytes)
b_gpu = cuda.mem_alloc(b.nbytes)
c_gpu = cuda.mem_alloc(c.nbytes)

cuda.memcpy_htod(a_gpu, a)
cuda.memcpy_htod(b_gpu, b)

lib.add_vectors(int(a_gpu), int(b_gpu), int(c_gpu), N)

cuda.Context.synchronize()
cuda.memcpy_dtoh(c, c_gpu)

print("A:", a)
print("B:", b)
print("C = A + B:", c)
import numpy as np
import ctypes
import pycuda.driver as cuda

N = 16
block_size = 4
grid_size = (N + block_size - 1) // block_size

lib = ctypes.CDLL("./vector_add.dll")
print("LIB::", dir(lib)) #Check the attributes of lib variable in Python
# Define argument types for safety
lib.add_vectors.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int]

a = np.random.randn(N).astype(np.float32)
b = np.random.randn(N).astype(np.float32)
c = np.empty_like(a)

a_gpu = cuda.mem_alloc(a.nbytes)
b_gpu = cuda.mem_alloc(b.nbytes)
c_gpu = cuda.mem_alloc(c.nbytes)

cuda.memcpy_htod(a_gpu, a)
cuda.memcpy_htod(b_gpu, b)

lib.add_vectors(int(a_gpu), int(b_gpu), int(c_gpu), N)

cuda.Context.synchronize()
cuda.memcpy_dtoh(c, c_gpu)

print("A:", a)
print("B:", b)
print("C = A + B:", c)

LIB:: ['_FuncPtr', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_func_flags_', '_func_restype_', '_handle', '_name']
A: [ 0.78028464 -0.49346212 -0.90274906  0.9751807  -0.02011734  1.0545729
  0.40566817  0.31163436 -0.9446583   0.56412727  0.51989985 -1.3264078
 -0.55833036  0.85947335 -0.4002817   1.0153143 ]
B: [ 1.2956427   0.1763411   0.31895843 -1.928016    0.69085884 -1.1382663
 -1.5165892  -0.8581926  -0.6500315  -1.1406062   1.4036125   0.7908466
 -0.60482484 -0.04766817  0.4928366  -0.24710186]
C = A + B: [-0.38582104  0.8457861   0.41153207 -1.2528391   0.59481716 -1.5670898
 -0.72116786 -0.39626685  2.7010136  -0.80919904  0.33222598  0.09432879
  1.3553219  -0.07149178  1.6497035   0.14472684]

In [ ]: