Skip to article frontmatterSkip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

Chatbot - Gradio X llama-cpp-python

# Third-party: Gradio (web UI), IPython display helpers, llama-cpp-python (local LLM).
# Stdlib: os (env vars / paths), json (chat-history persistence).
import gradio as gr
from IPython.display import IFrame, display
import os
from llama_cpp import Llama
import json
# Shut down any Gradio servers left over from a previous notebook run,
# so port 7860 is free for the demos below.
gr.close_all()

Running a Gradio App on JupyterHub

This code demonstrates how to run a Gradio app (a simple web-based interface for Python functions) inside a JupyterHub environment such as DataHub or CloudBank.

Gradio normally launches on localhost, but on JupyterHub the server runs behind a proxy — so we use the environment variable JUPYTERHUB_SERVICE_PREFIX to route the app correctly through the proxy system.

# Minimal Gradio demo: echo a greeting for whatever name is typed in.
demo1 = gr.Interface(fn=lambda x: f"Hello {x}!", inputs="text", outputs="text")

# JupyterHub fronts each user server with a proxy; the service prefix tells us
# where this server is mounted so the app's URLs resolve correctly.
prefix = os.environ.get('JUPYTERHUB_SERVICE_PREFIX', '/')
demo1.launch(
    inline=False,                     # we embed it ourselves via an IFrame below
    share=False,                      # no public share link needed on the Hub
    prevent_thread_lock=True,         # return control to the notebook kernel
    server_port=7860,
    root_path=f"{prefix}proxy/7860",  # route through jupyter-server-proxy
)

# Embed the proxied app in the notebook output, then shut the server down.
display(IFrame(src=f"{prefix}proxy/7860/", width=1000, height=600))
demo1.close()
print(prefix)

Set up the llama-cpp-python framework

and run llama-cpp-python behind the chatbot

# Load a small quantized instruction-tuned model from the Hub's shared drive.
MODEL_DIR = "/home/jovyan/shared/"
model = Llama(
    model_path=os.path.join(MODEL_DIR, "qwen2-1_5b-instruct-q4_0.gguf"),
    n_ctx=2048,        # context window, in tokens
    n_threads=None,    # let llama.cpp choose a thread count
    verbose=True,
    chat_format="chatml",  # Qwen2 models use the ChatML prompt template
)
# Define function to call the model
def chat_with_model(prompt):
    messages = [{"role": "user", "content": prompt}]
    response = model.create_chat_completion(messages=messages, max_tokens=256)
    return response["choices"][0]["message"]["content"]

# Wire the model call into a simple text-in / text-out Gradio UI.
demo2 = gr.Interface(fn=chat_with_model, inputs="text", outputs="text", title="Small Model Chat")

# Launch behind the JupyterHub proxy (same routing trick as the first demo).
hub_prefix = os.environ.get('JUPYTERHUB_SERVICE_PREFIX', '/')
demo2.launch(
    inline=False,
    share=False,
    prevent_thread_lock=True,
    server_port=7860,
    root_path=f"{hub_prefix}proxy/7860",
)

# Show the proxied app inline in the notebook, then stop the server.
display(IFrame(src=f"{hub_prefix}proxy/7860/", width=1000, height=600))
demo2.close()

Let's Build the History of the Chat

This will be a JSON file that stores the chat history.

demo2.close()  # idempotent — harmless even though the server was already closed above
# ===== Persistent History File =====
# The history is a JSON list of turn dicts: {"user": ..., "model": ...}.
HISTORY_FILE = "chat_history.json"

def load_history():
    """Load the persisted chat history from HISTORY_FILE.

    Returns:
        list: The stored conversation turns, or [] when the file is missing,
        empty, unreadable, invalid JSON, or not a JSON list.
    """
    # EAFP: attempt the read and handle each failure explicitly, instead of
    # the os.path.exists()-then-open() check, which can race with deletion.
    try:
        with open(HISTORY_FILE, "r", encoding="utf-8") as f:
            content = f.read().strip()
    except FileNotFoundError:
        return []  # no history yet — normal on first run
    except OSError:
        print("⚠️ Warning: history file could not be read, starting fresh.")
        return []
    if not content:
        return []  # empty file
    try:
        data = json.loads(content)
    except json.JSONDecodeError:
        print("⚠️ Warning: history file is corrupted or empty, resetting it.")
        return []
    # Guard against valid-JSON-but-wrong-shape content (e.g. a bare dict).
    return data if isinstance(data, list) else []

def save_history(history):
    """Overwrite HISTORY_FILE with *history* (a list of turn dicts) as pretty JSON.

    Writes UTF-8 with ensure_ascii=False so emoji and other non-ASCII chat
    text is stored verbatim instead of as \\uXXXX escapes.
    """
    with open(HISTORY_FILE, "w", encoding="utf-8") as f:
        json.dump(history, f, indent=2, ensure_ascii=False)

# Seed the in-memory history from disk once at startup.
history = load_history()

# ===== Chat Function =====
def chat_with_model(user_input):
    """Generate a reply to *user_input*, replaying and persisting the chat history."""
    # Reload from disk each call so edits made outside this process are seen.
    global history
    history = load_history()

    # Flatten the stored turns into the alternating user/assistant message
    # format the chat API expects, then append the new prompt.
    messages = [
        msg
        for turn in history
        for msg in (
            {"role": "user", "content": turn['user']},
            {"role": "assistant", "content": turn['model']},
        )
    ]
    messages.append({"role": "user", "content": user_input})

    # Generate the model's response.
    completion = model.create_chat_completion(messages=messages, max_tokens=100)
    reply = completion["choices"][0]["message"]["content"]

    # Record the new turn and write it through to disk.
    history.append({"user": user_input, "model": reply})
    save_history(history)

    return reply
# ===== Gradio Interface =====
demo3 = gr.Interface(
    fn=chat_with_model,
    inputs="text",
    outputs="text",
    title="Persistent Small Model Chat",
    description="Chat with a local llama-cpp-python model that remembers previous conversations.",
)

# Same proxy routing as before: mount the app under the user's Hub prefix.
service_prefix = os.environ.get('JUPYTERHUB_SERVICE_PREFIX', '/')
demo3.launch(
    inline=False,
    share=False,
    prevent_thread_lock=True,
    server_port=7860,
    root_path=f"{service_prefix}proxy/7860",
)

# Embed the proxied app inline in the notebook.
display(IFrame(src=f"{service_prefix}proxy/7860/", width=1000, height=600))

Now we can check the history that we have

# Pretty-print each stored turn. Use a context manager so the file handle is
# closed promptly — json.load(open(...)) leaves the handle open until GC.
with open("chat_history.json", "r", encoding="utf-8") as f:
    for turn in json.load(f):
        print(f"🧑 User: {turn['user']}\n🤖 Model: {turn['model']}\n")
# If you want to clear the history:
# open("chat_history.json", "w").write("[]")
demo3.close()
gr.close_all()