Test Notebooks

# Ensure that your Python environment has the torch and huggingface_hub packages installed.
import torch


# Check for GPU/MPS
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using device: {device}")
Using device: cuda
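If a CUDA device was detected, a quick sanity check can confirm which GPU was assigned (a minimal sketch; assumes at least one visible CUDA device):

# Show the name of the detected GPU, if any
if device == "cuda":
    print(torch.cuda.get_device_name(0))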
# Path for Shared Hub - change this to match your JupyterHub's shared directory
# Examples: /home/jovyan/shared, /home/jovyan/shared_readwrite, /home/jovyan/_shared/course-name
shared_model_path = "/home/jovyan/shared-readwrite"
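Before downloading anything, it is worth verifying that the shared directory actually exists and is writable (a minimal sketch; the path above is site-specific):

import os

# Fail early if the shared directory is missing or read-only
assert os.path.isdir(shared_model_path), f"{shared_model_path} does not exist"
assert os.access(shared_model_path, os.W_OK), f"{shared_model_path} is not writable"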
from huggingface_hub import HfApi
# Search for GGUF models
api = HfApi()

# Find models with "gguf" in the name, sorted by downloads
models = list(api.list_models(
    search="gguf",
    sort="downloads",
    limit=20
))

print("Top 20 GGUF models by downloads:")
print("-" * 60)
for model in models:
    print(f"{model.id}")
Top 20 GGUF models by downloads:
------------------------------------------------------------
ggml-org/tinygemma3-GGUF
xtuner/llava-llama-3-8b-v1_1-gguf
unsloth/Qwen3-Coder-Next-GGUF
ggml-org/embeddinggemma-300M-GGUF
hugging-quants/Llama-3.2-1B-Instruct-Q8_0-GGUF
ggml-org/gpt-oss-120b-GGUF
mradermacher/Trinity-Nano-Base-GGUF
lmstudio-community/gemma-3-4b-it-GGUF
unsloth/GLM-4.7-Flash-GGUF
bartowski/Meta-Llama-3.1-8B-Instruct-GGUF
ggml-org/gemma-3-12b-it-GGUF
lmg-anon/vntl-llama3-8b-v2-gguf
janhq/Jan-v3-4B-base-instruct-gguf
unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF
unsloth/gpt-oss-20b-GGUF
MaziyarPanahi/Qwen3-14B-GGUF
MaziyarPanahi/Qwen3-4B-GGUF
MaziyarPanahi/Qwen3-0.6B-GGUF
unsloth/Qwen3.5-35B-A3B-GGUF
MaziyarPanahi/Qwen3-1.7B-GGUF
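To inspect one of these results in more detail, api.model_info can fetch per-repository metadata such as download counts and last-modified time (a sketch; exact attribute names may vary slightly across huggingface_hub versions):

# Fetch full metadata for the top search result
info = api.model_info(models[0].id)
print(info.id, info.downloads, info.last_modified)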
# List files in a specific repository to find available quantizations
from huggingface_hub import list_repo_files

repo_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
files = list_repo_files(repo_id)

print(f"Files in {repo_id}:")
print("-" * 60)
for f in files:
    if f.endswith(".gguf"):
        print(f)
Files in TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF:
------------------------------------------------------------
tinyllama-1.1b-chat-v1.0.Q2_K.gguf
tinyllama-1.1b-chat-v1.0.Q3_K_L.gguf
tinyllama-1.1b-chat-v1.0.Q3_K_M.gguf
tinyllama-1.1b-chat-v1.0.Q3_K_S.gguf
tinyllama-1.1b-chat-v1.0.Q4_0.gguf
tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf
tinyllama-1.1b-chat-v1.0.Q4_K_S.gguf
tinyllama-1.1b-chat-v1.0.Q5_0.gguf
tinyllama-1.1b-chat-v1.0.Q5_K_M.gguf
tinyllama-1.1b-chat-v1.0.Q5_K_S.gguf
tinyllama-1.1b-chat-v1.0.Q6_K.gguf
tinyllama-1.1b-chat-v1.0.Q8_0.gguf
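To pull one of these quantizations into the shared directory, hf_hub_download fetches a single file from the repository (a sketch assuming the Q4_K_M file listed above and the shared_model_path defined earlier; adjust the filename and destination for your setup):

from huggingface_hub import hf_hub_download

# Download a single quantization file into the shared directory
local_path = hf_hub_download(
    repo_id=repo_id,
    filename="tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
    local_dir=shared_model_path,
)
print(f"Downloaded to {local_path}")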
from llama_cpp import Llama
import os
model_directory = shared_model_path  # reuse the shared path configured above
# Define the model filename
model_name = "qwen2-1_5b-instruct-q4_0.gguf"

# Create the full path to the model
model_path = os.path.join(model_directory, model_name)
model = Llama(
    model_path=model_path,
    n_ctx=2048,          # Context window size
    n_threads=1,         # Number of CPU threads (set to None to auto-detect)
    n_gpu_layers=-1,     # -1 means send all layers to GPU
    verbose=True,        # Print model loading info
    chat_format="chatml" # Qwen uses ChatML format
)
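With the model loaded, a quick chat-completion call verifies everything works end to end (a minimal sketch; the prompt and max_tokens value are illustrative):

# Run a short chat completion as a smoke test
response = model.create_chat_completion(
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    max_tokens=64,
)
print(response["choices"][0]["message"]["content"])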