RAG (Retrieval-Augmented Generation) is like an open-book exam for AI:
1. Get a question
2. Find the relevant page in your book
3. Answer using that page
This notebook walks through each step.
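In code, the whole idea fits in a few lines. Here is a conceptual sketch only; retrieve and generate are placeholders for the embedding search and the language-model call that the rest of the notebook builds for real.
# Conceptual sketch only -- retrieve() and generate() are stand-ins for the
# embedding search and language-model call implemented later in this notebook.
def answer_with_rag(question, knowledge_base):
    page = retrieve(question, knowledge_base)   # steps 1-2: find the most relevant "page"
    return generate(question, context=page)     # step 3: answer using that page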
%%capture
# Install packages (output suppressed)
%pip install matplotlib scikit-learn onnxruntime-gpu transformers huggingface_hub --no-cache-dir --break-system-packages
import platform, os, subprocess, sys
# llama-cpp-python needs the CUDA wheel index on JupyterHub (plain pip grabs CPU-only)
_cmd = [sys.executable, "-m", "pip", "install", "llama-cpp-python",
"--break-system-packages", "--quiet", "--no-cache-dir"]
if platform.system() != "Darwin" and os.path.exists("/dev/nvidia0"):
_cmd += ["--extra-index-url", "https://abetlen.github.io/llama-cpp-python/whl/cu121"]
# CUDA runtime libs needed for GPU inference
subprocess.check_call(
[sys.executable, "-m", "pip", "install",
"nvidia-cuda-runtime-cu12", "nvidia-cublas-cu12",
"--break-system-packages", "--quiet", "--no-cache-dir"],
stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
subprocess.check_call(_cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
import re, time, os, platform, glob, ctypes
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import PCA
import onnxruntime as ort
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer
# --- GPU detection ---
if platform.system() == "Darwin" and platform.machine() == "arm64":
N_GPU_LAYERS, DEVICE = -1, "metal"
elif os.path.exists("/dev/nvidia0"):
N_GPU_LAYERS, DEVICE = -1, "cuda"
# Preload CUDA shared libraries (required on JupyterHub before importing llama_cpp)
for pattern in ["**/libcudart.so.12*", "**/libcublas.so.12*"]:
for so in glob.glob(f"/srv/conda/envs/notebook/lib/python3.12/site-packages/nvidia/{pattern}", recursive=True):
ctypes.CDLL(so)
break
else:
N_GPU_LAYERS, DEVICE = 0, "cpu"
from llama_cpp import Llama
# --- Embedding model (~90MB, cached after first download) ---
_onnx = hf_hub_download("sentence-transformers/all-MiniLM-L6-v2", "onnx/model.onnx")
_providers = (["CUDAExecutionProvider", "CPUExecutionProvider"]
if "CUDAExecutionProvider" in ort.get_available_providers()
else ["CPUExecutionProvider"])
_session = ort.InferenceSession(_onnx, providers=_providers)
_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
def embed(texts):
"""Convert text into vectors that capture meaning."""
if isinstance(texts, str): texts = [texts]
enc = _tokenizer(texts, padding=True, truncation=True, max_length=256, return_tensors="np")
out = _session.run(None, {k: enc[k] for k in ["input_ids", "attention_mask", "token_type_ids"]})
mask = np.expand_dims(enc["attention_mask"], -1).astype(float)
emb = np.sum(out[0] * mask, axis=1) / np.maximum(mask.sum(axis=1), 1e-9)
return emb / np.linalg.norm(emb, axis=1, keepdims=True)
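# Quick illustration (not needed by the pipeline): embed() returns unit-length
# vectors, so dot products are cosine similarities. The two related sentences
# below should score noticeably higher with each other than with the third.
_demo = embed(["a tiny language model", "a small LLM", "a recipe for pancakes"])
print(np.round(_demo @ _demo.T, 2))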
def word_match(question, document):
"""Fraction of meaningful question words found in the document."""
stop = {'how','much','do','does','the','a','an','is','are','what','which',
'who','where','when','can','tell','me','about','and','or','to','in',
'of','for','that','they','it','i','want','their'}
q = set(re.sub(r'[^\w\s]','',question.lower()).split()) - stop
d = set(re.sub(r'[^\w\s]','',document.lower()).split())
return len(q & d) / len(q) if q else 0
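# Quick illustration: word_match() only counts surface overlap, so a paraphrase
# with no shared keywords scores 0 even when the meaning clearly matches.
print(word_match("price of the laptop", "The notebook computer costs $900"))  # -> 0.0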
print(f"Ready! Device: {DEVICE} | ONNX: {_session.get_providers()[0]}")Our Knowledge Base¶
Five facts the model has never seen, so we can clearly tell whether RAG helps.
documents = [
"The SmallLM course at UC Berkeley was created in Fall 2025 by the discovery program team.",
"Professor Greg Merritt designed the Ollama demo to teach students to run AI models locally.",
"The Qwen2 1.5B model uses Q4_0 quantization and requires only 894 MB of disk space.",
"TinyLlama 1.1B was trained on 3 trillion tokens despite having only 1.1 billion parameters.",
"JupyterCon 2025 hosted a workshop on teaching economics students about AI inference costs."
]
labels = ["SmallLM\nCourse", "Ollama\nDemo", "Qwen2\nModel", "TinyLlama", "JupyterCon"]
# Pre-compute document embeddings (used throughout the notebook)
doc_embeddings = embed(documents)
for i, doc in enumerate(documents):
print(f"Doc {i+1}: {doc}")Finding the Right Document¶
Given a question, which document is most relevant? We’ll compare two approaches:
| | Word Matching | Embeddings |
|---|---|---|
| How it works | Count shared words | Compare meaning vectors |
| Understands synonyms? | No | Yes |
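To make the difference concrete, here is a toy pair that is not part of the knowledge base: the question and the sentence share no meaningful keywords, yet the embeddings still recognize they are about the same thing (the exact similarity value will vary, but it should be well above zero).
q_demo = "How much does the automobile cost?"
d_demo = "The car is priced at 20,000 dollars."
sim = cosine_similarity(embed([q_demo]), embed([d_demo]))[0][0]
print(f"Word match: {word_match(q_demo, d_demo):.2f} | Embedding similarity: {sim:.2f}")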
question = "How big is the Qwen2 model file?"
wm_scores = [word_match(question, doc) for doc in documents]
em_scores = cosine_similarity(embed([question]), doc_embeddings)[0]
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 4))
for ax, scores, title in [(ax1, wm_scores, "Word Matching"), (ax2, em_scores, "Embeddings")]:
colors = ['#2ecc71' if s == max(scores) and s > 0 else '#dfe6e9' for s in scores]
ax.bar(labels, scores, color=colors, edgecolor='#636e72')
ax.set_title(title, fontsize=14, fontweight='bold')
ax.set_ylabel('Score')
ax.set_ylim(0, 1)
for i, s in enumerate(scores):
if s > 0.01:
ax.text(i, s + 0.03, f'{s:.2f}', ha='center', fontsize=10)
plt.suptitle(f'"{question}"', fontsize=13, y=1.02)
plt.tight_layout()
plt.show()
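# Side note: embed() L2-normalizes its output, so cosine similarity reduces to a
# plain dot product -- the sklearn call above and a matrix product agree (up to
# floating-point rounding).
print("Max difference vs. plain dot product:",
      np.abs(em_scores - (embed([question]) @ doc_embeddings.T)[0]).max())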
print(f"Both methods find '{labels[np.argmax(em_scores)].replace(chr(10), " ")}' -- but embeddings give a much clearer signal.")The Real Test: Different Words, Same Meaning¶
What if someone asks about “storage needs for the quantized neural network”? None of its meaningful words appear in any document -- word matching fails completely.
tricky = "What are the storage needs for the quantized neural network?"
wm_scores = [word_match(tricky, doc) for doc in documents]
em_scores = cosine_similarity(embed([tricky]), doc_embeddings)[0]
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 4))
# Word matching: all zeros
ax1.bar(labels, wm_scores, color='#dfe6e9', edgecolor='#636e72')
ax1.set_title('Word Matching', fontsize=14, fontweight='bold')
ax1.set_ylabel('Score')
ax1.set_ylim(0, 1)
ax1.text(2, 0.5, 'No matches!', fontsize=18, ha='center', color='#e74c3c', fontweight='bold')
# Embeddings: finds it
colors = ['#2ecc71' if s == max(em_scores) else '#dfe6e9' for s in em_scores]
ax2.bar(labels, em_scores, color=colors, edgecolor='#636e72')
ax2.set_title('Embeddings', fontsize=14, fontweight='bold')
ax2.set_ylabel('Similarity')
ax2.set_ylim(0, 1)
for i, s in enumerate(em_scores):
if s > 0.05:
ax2.text(i, s + 0.03, f'{s:.2f}', ha='center', fontsize=10)
plt.suptitle(f'"{tricky}"', fontsize=13, y=1.02)
plt.tight_layout()
plt.show()
print('Embeddings understood that "storage needs" = "disk space" and "quantized neural network" = "Q4_0 quantization model"')
Visualizing Meaning Space
Each document and question becomes a point. Similar meaning = nearby points.
pca = PCA(n_components=2)
doc_2d = pca.fit_transform(doc_embeddings)
# Two questions: one with matching words, one without
questions = ["How big is the Qwen2 model file?",
"What are the storage needs for the quantized neural network?"]
q_embs = embed(questions)
q_2d = pca.transform(q_embs)
fig, ax = plt.subplots(figsize=(10, 7))
# Documents
doc_colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#F38181', '#3D5A80']
for i, (x, y) in enumerate(doc_2d):
ax.scatter(x, y, s=250, c=doc_colors[i], edgecolors='black', linewidths=1.5, zorder=2)
ax.annotate(labels[i].replace('\n', ' '), (x, y), fontsize=11, fontweight='bold',
xytext=(10, 10), textcoords='offset points')
# Questions
q_labels = ['Q1: "How big is Qwen2?"', 'Q2: "storage needs for\nquantized network?"']
for i, (x, y) in enumerate(q_2d):
ax.scatter(x, y, s=350, c='gold', marker='*', edgecolors='black', linewidths=1.5, zorder=3)
ax.annotate(q_labels[i], (x, y), fontsize=9, xytext=(-20, -30), textcoords='offset points',
bbox=dict(boxstyle='round', facecolor='lightyellow', edgecolor='orange'))
best_idx = np.argmax(cosine_similarity([q_embs[i]], doc_embeddings)[0])
ax.plot([x, doc_2d[best_idx][0]], [y, doc_2d[best_idx][1]], 'g--', linewidth=1.5, zorder=1)
ax.set_title('Meaning Space: Questions Land Near Relevant Documents', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.2)
ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')
plt.tight_layout()
plt.show()
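# The scatter plot is a 2-D projection of 384-dimensional embeddings, so some
# structure is inevitably lost; this shows how much variance the two axes keep.
print(f"Variance captured by the 2 PCA dimensions: {pca.explained_variance_ratio_.sum():.0%}")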
print("Both questions point to Qwen2 Model -- even though they use completely different words!")The Full Pipeline: Retrieve + Generate¶
Now we add a language model: embeddings find the best document, and the LLM writes the answer.
# Load the language model
MODEL_DIR = "/home/jovyan/shared" # <-- change this to your model directory
t0 = time.perf_counter()
model = Llama(
model_path=os.path.join(MODEL_DIR, "qwen2-1_5b-instruct-q4_0.gguf"),
n_ctx=2048, n_gpu_layers=N_GPU_LAYERS, verbose=False, chat_format="chatml"
)
print(f"Model loaded in {time.perf_counter()-t0:.1f}s (GPU layers: {N_GPU_LAYERS})")question = "How big is the Qwen2 model file?"
# 1. Retrieve: find the best document
scores = cosine_similarity(embed([question]), doc_embeddings)[0]
best_doc = documents[np.argmax(scores)]
# 2. Generate WITH RAG (provide the retrieved document as context)
rag_prompt = f"Use this information to answer the question.\n\nInformation: {best_doc}\n\nQuestion: {question}"
with_rag = model.create_chat_completion(
messages=[{"role": "user", "content": rag_prompt}], max_tokens=100
)["choices"][0]["message"]["content"]
# 3. Generate WITHOUT RAG (no context, model guesses)
without_rag = model.create_chat_completion(
messages=[{"role": "user", "content": question}], max_tokens=100
)["choices"][0]["message"]["content"]
# Compare
print(f"Question: {question}")
print(f"Retrieved: {best_doc}")
print(f"")
print(f"WITH RAG: {with_rag}")
print(f"WITHOUT RAG: {without_rag}")Try It Yourself!¶
Change my_question below and run the cell.
Suggestions:
- “Who created the SmallLM course?”
- “How much training data did the small language model use?”
- “What conference covered AI education for economists?”
- “How can I run LLMs without the internet?”
my_question = "Who created the SmallLM course?" # <-- CHANGE THIS
# Retrieve
scores = cosine_similarity(embed([my_question]), doc_embeddings)[0]
best_idx = np.argmax(scores)
# Visualize
fig, ax = plt.subplots(figsize=(11, 3.5))
colors = ['#2ecc71' if s == max(scores) else '#dfe6e9' for s in scores]
ax.bar(labels, scores, color=colors, edgecolor='#636e72')
for i, s in enumerate(scores):
if s > 0.05: ax.text(i, s + 0.02, f'{s:.2f}', ha='center', fontsize=10)
ax.set_title(f'"{my_question}"', fontsize=13)
ax.set_ylabel('Similarity')
ax.set_ylim(0, 1)
plt.tight_layout()
plt.show()
# Generate
rag_prompt = f"Use this information to answer the question.\n\nInformation: {documents[best_idx]}\n\nQuestion: {my_question}"
answer = model.create_chat_completion(
messages=[{"role": "user", "content": rag_prompt}], max_tokens=100
)["choices"][0]["message"]["content"]
print(f"Retrieved: {documents[best_idx]}")
print(f"Answer: {answer}")Key Takeaways¶
- RAG = Retrieve + Generate -- find relevant docs first, then answer using them
- Embeddings > Word Matching -- they understand meaning, not just keywords
- RAG reduces hallucination -- the model answers from retrieved facts instead of guessing
- No retraining needed -- just add new documents to your knowledge base (see the top-k sketch below)
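Retrieval does not have to stop at one document. Here is a minimal sketch of top-k retrieval that reuses the embed function, knowledge base, and model from this notebook; k=2 and the example question are arbitrary choices, not part of the original pipeline.
def rag_answer(question, k=2):
    """Retrieve the k most similar documents and answer using all of them."""
    scores = cosine_similarity(embed([question]), doc_embeddings)[0]
    top = np.argsort(scores)[::-1][:k]                       # indices of the k best matches
    context = "\n".join(documents[i] for i in top)
    prompt = f"Use this information to answer the question.\n\nInformation: {context}\n\nQuestion: {question}"
    return model.create_chat_completion(
        messages=[{"role": "user", "content": prompt}], max_tokens=150
    )["choices"][0]["message"]["content"]

print(rag_answer("How big is the Qwen2 model file, and who designed the Ollama demo?"))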