Initial commit

This commit is contained in:
Chris Proctor
2026-06-22 16:14:58 -04:00
commit 42bc2e7a50
14 changed files with 2049 additions and 0 deletions

9
.commit_template Normal file
View File

@@ -0,0 +1,9 @@
# -----------------------------------------------------------------
# Write your commit message above this line.
#
# The first line should be a quick description of what you changed.
# Then leave a blank line.
# Then write a few sentences describing an idea or a question you
# have been thinking about.

2
.gitignore vendored Normal file
View File

@@ -0,0 +1,2 @@
**/__pycache__/*
runs/*

117
babysnake/__init__.py Normal file
View File

@@ -0,0 +1,117 @@
"""BabySnake: a 4×4 grid game where an agent collects food.
State: (agent_x, agent_y, food_x, food_y) — four integers.
The agent starts with 50 energy. Each step costs 1 energy.
Collecting food restores 30 energy and adds 1 to the score.
The game ends when energy reaches 0.
"""
from random import randint, choice
from retro.game import Game
BOARD_SIZE = 4
START_ENERGY = 50
FOOD_ENERGY = 30
class Forager:
    """Player-controlled agent that walks the board collecting food.

    Arrow keys set the heading. Every turn the agent tries to advance one
    cell along that heading, pays the per-step energy and reward cost,
    collects the food if it is standing on it, and then mirrors both
    positions into ``game.state``.
    """

    name = "Forager"
    character = '@'
    color = "green_on_black"
    position = (0, 0)

    # Headings expressed as (dx, dy) offsets; UP decreases y.
    UP = (0, -1)
    DOWN = (0, 1)
    LEFT = (-1, 0)
    RIGHT = (1, 0)

    def __init__(self):
        self._direction = self.RIGHT

    def handle_keystroke(self, keystroke, game):
        """Turn the agent to face the arrow key that was pressed.

        Any keystroke other than the four arrow keys leaves the heading
        unchanged.
        """
        headings = {
            "KEY_RIGHT": self.RIGHT,
            "KEY_UP": self.UP,
            "KEY_LEFT": self.LEFT,
            "KEY_DOWN": self.DOWN,
        }
        heading = headings.get(keystroke.name)
        if heading is not None:
            self._direction = heading

    def play_turn(self, game):
        """Advance one step and settle energy, reward, score and state.

        The step cost (1 energy, 0.01 reward) is charged even when the move
        is blocked by the edge of the board. The game is ended once energy
        is no longer positive.
        """
        col, row = self.position
        step_x, step_y = self._direction
        target = (col + step_x, row + step_y)
        if game.on_board(target):
            self.position = target

        state = game.state
        state['energy'] -= 1
        state['reward'] -= 0.01

        food = game.get_agent_by_name("Food")
        if self.position == food.position:
            food.relocate(game)
            state['energy'] += FOOD_ENERGY
            state['score'] += 1
            state['reward'] += 1.0

        # Publish the (possibly relocated) positions as the observable state.
        state['agent_x'], state['agent_y'] = self.position
        state['food_x'], state['food_y'] = food.position

        if state['energy'] <= 0:
            game.end()
class Food:
    """The food item. Respawns on a random cell the forager is not on."""

    name = "Food"
    character = '*'
    color = "yellow_on_black"
    position = (0, 0)

    def relocate(self, game):
        """Move the food to a uniformly random cell not occupied by the forager.

        Arguments:
            game: The running game. Supplies ``board_size`` as
                ``(width, height)`` and the "Forager" agent through
                ``get_agent_by_name``.

        Raises:
            RuntimeError: If the forager occupies the only cell on the
                board, so there is nowhere left to place the food.
        """
        bw, bh = game.board_size
        forager = game.get_agent_by_name("Forager")
        # Enumerate the free cells up front instead of re-rolling until one
        # happens to be free: the distribution is the same, but this always
        # terminates, even when no cell is free.
        free_cells = [
            (x, y)
            for x in range(bw)
            for y in range(bh)
            if (x, y) != forager.position
        ]
        if not free_cells:
            raise RuntimeError("No free cell available to place the food")
        self.position = choice(free_cells)
def create_game():
    """Build a new BabySnake game with randomized starting positions.

    The forager is placed on a random cell, the food is relocated to a cell
    the forager is not on, and both positions are written into the game
    state before the game is returned.
    """
    player = Forager()
    snack = Food()
    width = height = BOARD_SIZE
    initial_state = {
        'score': 0,
        'reward': 0.0,
        'energy': START_ENERGY,
        'agent_x': 0,
        'agent_y': 0,
        'food_x': 0,
        'food_y': 0,
    }
    game = Game(
        [player, snack],
        initial_state,
        board_size=(width, height),
        framerate=6,
    )
    player.position = (randint(0, width - 1), randint(0, height - 1))
    snack.relocate(game)
    # Replace the placeholder zeros with the actual starting positions.
    game.state['agent_x'], game.state['agent_y'] = player.position
    game.state['food_x'], game.state['food_y'] = snack.position
    return game
if __name__ == '__main__':
    # Running this module directly starts an interactive game.
    game = create_game()
    game.play()

3
babysnake/__main__.py Normal file
View File

@@ -0,0 +1,3 @@
# Package entry point (`python -m babysnake`): build a game and play it.
from babysnake import create_game
create_game().play()

4
babysnake/pyproject.toml Normal file
View File

@@ -0,0 +1,4 @@
[tool.retro-gamer]
actions = ["KEY_RIGHT", "KEY_UP", "KEY_LEFT", "KEY_DOWN"]
reward = "reward"
character_set = ["@", "*"]

103
forager/__init__.py Normal file
View File

@@ -0,0 +1,103 @@
"""Forager: an 8×8 grid game where an agent collects food.
The agent moves in four directions collecting food that respawns on collection.
The game runs indefinitely; retro-gamer's max_turns_per_episode controls episode length.
Observation features for retro-gamer:
food_dx: (food_x - agent_x) / board_width (positive = food is to the right)
food_dy: (food_y - agent_y) / board_height (positive = food is below)
"""
from random import randint
from retro.game import Game
BOARD_SIZE = 8
class Forager:
    """The player agent.

    Arrow keys set the heading. Every turn the agent tries to advance one
    cell, pays a small reward penalty, collects the food if it is standing
    on it, and refreshes the ``food_dx`` / ``food_dy`` observation features.
    """

    name = "Forager"
    character = '@'
    color = "green_on_black"
    position = (0, 0)

    # Headings expressed as (dx, dy) offsets; UP decreases y.
    UP = (0, -1)
    DOWN = (0, 1)
    LEFT = (-1, 0)
    RIGHT = (1, 0)

    def __init__(self):
        self._direction = self.RIGHT

    def handle_keystroke(self, keystroke, game):
        """Turn the agent to face the arrow key that was pressed.

        Any keystroke other than the four arrow keys leaves the heading
        unchanged.
        """
        headings = {
            "KEY_RIGHT": self.RIGHT,
            "KEY_UP": self.UP,
            "KEY_LEFT": self.LEFT,
            "KEY_DOWN": self.DOWN,
        }
        heading = headings.get(keystroke.name)
        if heading is not None:
            self._direction = heading

    def play_turn(self, game):
        """Advance one step, score any food, and update the observation.

        The 0.01 reward penalty is charged even when the move is blocked by
        the edge of the board.
        """
        col, row = self.position
        step_x, step_y = self._direction
        target = (col + step_x, row + step_y)
        if game.on_board(target):
            self.position = target

        state = game.state
        state['reward'] -= 0.01

        food = game.get_agent_by_name("Food")
        if self.position == food.position:
            food.relocate(game)
            state['score'] += 1
            state['reward'] += 1.0

        # Offset from agent to food, scaled by the board dimensions.
        width, height = game.board_size
        agent_x, agent_y = self.position
        food_x, food_y = food.position
        state['food_dx'] = (food_x - agent_x) / width
        state['food_dy'] = (food_y - agent_y) / height
class Food:
    """The food item. Respawns on a random cell the forager is not on."""

    name = "Food"
    character = '*'
    color = "yellow_on_black"
    position = (0, 0)

    def relocate(self, game):
        """Move the food to a uniformly random cell not occupied by the forager.

        Arguments:
            game: The running game. Supplies ``board_size`` as
                ``(width, height)`` and the "Forager" agent through
                ``get_agent_by_name``.

        Raises:
            RuntimeError: If the forager occupies the only cell on the
                board, so there is nowhere left to place the food.
        """
        bw, bh = game.board_size
        forager = game.get_agent_by_name("Forager")
        # Enumerate the free cells up front instead of re-rolling until one
        # happens to be free: the distribution is the same, but this always
        # terminates, even when no cell is free.
        free_cells = [
            (x, y)
            for x in range(bw)
            for y in range(bh)
            if (x, y) != forager.position
        ]
        if not free_cells:
            raise RuntimeError("No free cell available to place the food")
        self.position = free_cells[randint(0, len(free_cells) - 1)]
def create_game():
    """Build a new Forager game with randomized starting positions.

    The forager is placed on a random cell, the food is relocated to a cell
    the forager is not on, and the initial ``food_dx`` / ``food_dy``
    features are computed before the game is returned.
    """
    player = Forager()
    snack = Food()
    width = height = BOARD_SIZE
    game = Game(
        [player, snack],
        {'score': 0, 'reward': 0.0, 'food_dx': 0.0, 'food_dy': 0.0},
        board_size=(width, height),
        framerate=12,
    )
    player.position = (randint(0, width - 1), randint(0, height - 1))
    snack.relocate(game)

    # Seed the observation features with the real starting offset.
    board_w, board_h = game.board_size
    agent_x, agent_y = player.position
    food_x, food_y = snack.position
    game.state['food_dx'] = (food_x - agent_x) / board_w
    game.state['food_dy'] = (food_y - agent_y) / board_h
    return game
if __name__ == '__main__':
    # Running this module directly starts an interactive game.
    game = create_game()
    game.play()

3
forager/__main__.py Normal file
View File

@@ -0,0 +1,3 @@
# Package entry point (`python -m forager`): build a game and play it.
from forager import create_game
create_game().play()

4
forager/pyproject.toml Normal file
View File

@@ -0,0 +1,4 @@
[tool.retro-gamer]
actions = ["KEY_RIGHT", "KEY_UP", "KEY_LEFT", "KEY_DOWN"]
reward = "reward"
character_set = ["@", "*"]

20
pyproject.toml Normal file
View File

@@ -0,0 +1,20 @@
[project]
name = "lab-reinforcement-learning"
version = "0.1.0"
description = "Reinforcement learning lab"
requires-python = ">=3.11"
dependencies = [
"retro-games>=2.3.1",
"retro-gamer>=0.1.0",
]
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[tool.hatch.build.targets.wheel]
packages = ["babysnake", "forager"]
[tool.uv.sources]
retro-games = { path = "../../packages/retro", editable = true }
retro-gamer = { path = "../../packages/retro-gamer", editable = true }

218
q_learning.py Normal file
View File

@@ -0,0 +1,218 @@
"""Q-learning agent for BabySnake.
This file contains starter code for implementing a Q-learning agent.
You need to fill in two functions:
- choose_action: select an action using an epsilon-greedy policy
- update_q: update the Q-table using the Bellman equation
Run this file to train the agent:
python q_learning.py
After training, run this to watch it play:
python -c "from q_learning import watch; watch()"
"""
import random
import babysnake
from retro.input import ProgrammaticInput
from retro.views.headless import HeadlessView
# The four actions the agent can take.
ACTIONS = ["KEY_RIGHT", "KEY_DOWN", "KEY_LEFT", "KEY_UP"]
# ---------------------------------------------------------------------------
# Environment wrapper
# ---------------------------------------------------------------------------
class BabySnakeEnv:
    """Drive a BabySnake game one action at a time, without a terminal.

    Usage:
        env = BabySnakeEnv()
        state = env.reset()                  # start a new episode
        next_state, reward, done = env.step("KEY_RIGHT")
    """

    def reset(self):
        """Start a new episode. Returns the initial state tuple."""
        keys = ProgrammaticInput()
        game = babysnake.create_game()
        game.input_source = keys
        game.view = HeadlessView()
        self._inp, self.game = keys, game
        game.start()
        # The game reports a running total; remember it so that step() can
        # return the per-step difference.
        self._prev_reward = 0.0
        return self._get_state()

    def step(self, action):
        """Take one action. Returns (next_state, reward, done).

        Arguments:
            action (str): One of ACTIONS, or None for no-op.

        Returns:
            next_state (tuple): The state after the action.
            reward (float): The reward received this step.
            done (bool): True if the episode has ended.
        """
        self._inp.press(action)
        self.game.step()
        cumulative = self.game.state['reward']
        reward = cumulative - self._prev_reward
        self._prev_reward = cumulative
        return self._get_state(), reward, not self.game.playing

    def _get_state(self):
        """Return the current state as a tuple of four integers."""
        snapshot = self.game.state
        keys = ('agent_x', 'agent_y', 'food_x', 'food_y')
        return tuple(int(snapshot[key]) for key in keys)
# ---------------------------------------------------------------------------
# Q-learning functions — fill these in!
# ---------------------------------------------------------------------------
def choose_action(q_table, state, epsilon):
    """Choose an action using an epsilon-greedy policy.

    With probability `epsilon`, return a random action from ACTIONS.
    Otherwise, return the action with the highest Q-value in `q_table`
    for the given `state`. If a (state, action) pair has not been seen
    before, treat its Q-value as 0.

    Arguments:
        q_table (dict): Maps (state, action) -> Q-value.
        state (tuple): The current state, e.g. (1, 2, 3, 0).
        epsilon (float): Exploration rate, between 0.0 and 1.0.

    Returns:
        str: One action from ACTIONS.

    Hint: random.random() returns a float in [0.0, 1.0).
          random.choice(ACTIONS) returns a random action.
          q_table.get(key, default) is handy for missing entries.
    """
    # Intentionally unimplemented: this is one of the two exercises.
    raise NotImplementedError("Fill in choose_action")
def update_q(q_table, state, action, reward, next_state, alpha, gamma):
    """Update one entry of the Q-table using the Bellman equation.

    The update rule is:

        Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))

    where:
        s, a       — the state we were in and the action we took
        r          — the reward we received
        s'         — the state we ended up in
        max_a' ... — the best possible Q-value from the new state

    Arguments:
        q_table (dict): Maps (state, action) -> Q-value (modified in place).
        state (tuple): The state before the action.
        action (str): The action taken.
        reward (float): The reward received.
        next_state (tuple): The state after the action.
        alpha (float): Learning rate (how much to update).
        gamma (float): Discount factor (how much to value future rewards).

    Returns:
        None — modifies q_table in place.

    Hint: Q-values for unseen (state, action) pairs start at 0.
    """
    # Intentionally unimplemented: this is one of the two exercises.
    raise NotImplementedError("Fill in update_q")
# ---------------------------------------------------------------------------
# Training loop
# ---------------------------------------------------------------------------
def train(
    episodes=1000,
    alpha=0.1,
    gamma=0.95,
    epsilon=1.0,
    epsilon_decay=0.995,
    epsilon_min=0.05,
):
    """Train a Q-learning agent on BabySnake.

    Each episode plays one full game, updating the Q-table after every
    step. Epsilon is decayed once per episode, and a progress line is
    printed every 100 episodes.

    Arguments:
        episodes (int): How many episodes to run.
        alpha (float): Learning rate.
        gamma (float): Discount factor.
        epsilon (float): Starting exploration rate.
        epsilon_decay (float): Multiply epsilon by this each episode.
        epsilon_min (float): Epsilon never falls below this.

    Returns:
        dict: The trained Q-table.
    """
    env = BabySnakeEnv()
    q_table = {}

    for episode in range(episodes):
        state = env.reset()
        total_reward = 0.0
        finished = not env.game.playing
        while not finished:
            action = choose_action(q_table, state, epsilon)
            next_state, reward, finished = env.step(action)
            update_q(q_table, state, action, reward, next_state, alpha, gamma)
            total_reward += reward
            state = next_state

        epsilon = max(epsilon_min, epsilon * epsilon_decay)

        # Only report on every 100th episode.
        if (episode + 1) % 100 != 0:
            continue
        print(
            f"Episode {episode + 1:5d} "
            f"reward={total_reward:6.1f} "
            f"score={env.game.state['score']} "
            f"epsilon={epsilon:.3f} "
            f"q_entries={len(q_table)}"
        )

    return q_table
def watch(q_table=None):
    """Watch the trained agent play in the terminal.

    Arguments:
        q_table (dict | None): A trained Q-table. If None, trains first.
    """
    import babysnake
    from retro.input import ProgrammaticInput

    if q_table is None:
        print("Training first...")
        q_table = train()

    keys = ProgrammaticInput()

    class PolicyInput:
        """An input source that picks actions from the Q-table."""

        def collect(self):
            # `game` is bound below, before play() first calls collect().
            snapshot = game.state
            state = tuple(
                int(snapshot[name])
                for name in ('agent_x', 'agent_y', 'food_x', 'food_y')
            )
            # max() keeps the first of several equally good actions.
            best = max(ACTIONS, key=lambda a: q_table.get((state, a), 0.0))
            keys.press(best)
            return keys.collect()

    game = babysnake.create_game()
    game.play(input_source=PolicyInput())
def _main():
    """Train an agent, report the Q-table size, then replay it in the terminal."""
    print("Training Q-learning agent on BabySnake...")
    q_table = train()
    print(f"\nDone. Q-table has {len(q_table)} entries.")
    print("\nWatching trained agent (press Enter or Escape to quit)...")
    watch(q_table)


if __name__ == '__main__':
    _main()

113
q_learning_solution.py Normal file
View File

@@ -0,0 +1,113 @@
"""Solution for q_learning.py — remove before publishing to students."""
import random
import babysnake
from retro.input import ProgrammaticInput
from retro.views.headless import HeadlessView
ACTIONS = ["KEY_RIGHT", "KEY_DOWN", "KEY_LEFT", "KEY_UP"]
class BabySnakeEnv:
    """Drive a BabySnake game one action at a time, without a terminal."""

    def reset(self):
        """Start a new headless episode and return its initial state tuple."""
        keys = ProgrammaticInput()
        game = babysnake.create_game()
        game.input_source = keys
        game.view = HeadlessView()
        self._inp, self.game = keys, game
        game.start()
        # The game reports a running total; remember it so that step() can
        # return the per-step difference.
        self._prev_reward = 0.0
        return self._get_state()

    def step(self, action):
        """Press `action`, advance one turn, and return (next_state, reward, done)."""
        self._inp.press(action)
        self.game.step()
        cumulative = self.game.state['reward']
        reward = cumulative - self._prev_reward
        self._prev_reward = cumulative
        return self._get_state(), reward, not self.game.playing

    def _get_state(self):
        """Return (agent_x, agent_y, food_x, food_y) as a tuple of ints."""
        snapshot = self.game.state
        keys = ('agent_x', 'agent_y', 'food_x', 'food_y')
        return tuple(int(snapshot[key]) for key in keys)
def choose_action(q_table, state, epsilon):
    """Pick an action with an epsilon-greedy policy.

    With probability `epsilon` a random action from ACTIONS is returned;
    otherwise the action with the highest Q-value for `state` is returned.
    Unseen (state, action) pairs count as 0.0, and ties go to the action
    that comes first in ACTIONS.
    """
    explore = random.random() < epsilon
    if explore:
        return random.choice(ACTIONS)
    return max(ACTIONS, key=lambda action: q_table.get((state, action), 0.0))
def update_q(q_table, state, action, reward, next_state, alpha, gamma):
    """Apply one Q-learning update to `q_table` in place.

    Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a)),
    with unseen (state, action) pairs counted as 0.0.
    """
    key = (state, action)
    current = q_table.get(key, 0.0)
    best_next = max(q_table.get((next_state, a), 0.0) for a in ACTIONS)
    q_table[key] = current + alpha * (reward + gamma * best_next - current)
def train(
    episodes=1000,
    alpha=0.1,
    gamma=0.95,
    epsilon=1.0,
    epsilon_decay=0.995,
    epsilon_min=0.05,
):
    """Train a Q-learning agent on BabySnake and return its Q-table.

    Each episode plays one full game, updating the Q-table after every
    step. Epsilon is multiplied by `epsilon_decay` once per episode and
    never falls below `epsilon_min`. A progress line is printed every 100
    episodes.
    """
    env = BabySnakeEnv()
    q_table = {}

    for episode in range(episodes):
        state = env.reset()
        total_reward = 0.0
        finished = not env.game.playing
        while not finished:
            action = choose_action(q_table, state, epsilon)
            next_state, reward, finished = env.step(action)
            update_q(q_table, state, action, reward, next_state, alpha, gamma)
            total_reward += reward
            state = next_state

        epsilon = max(epsilon_min, epsilon * epsilon_decay)

        # Only report on every 100th episode.
        if (episode + 1) % 100 != 0:
            continue
        print(
            f"Episode {episode + 1:5d} "
            f"reward={total_reward:6.1f} "
            f"score={env.game.state['score']} "
            f"epsilon={epsilon:.3f} "
            f"q_entries={len(q_table)}"
        )

    return q_table
def watch(q_table=None):
    """Play one game in the terminal using the greedy policy from `q_table`.

    If `q_table` is None, an agent is trained first with the default
    settings.
    """
    if q_table is None:
        print("Training first...")
        q_table = train()

    keys = ProgrammaticInput()

    class PolicyInput:
        """An input source that presses the greedy action for the current state."""

        def collect(self):
            # `game` is bound below, before play() first calls collect().
            snapshot = game.state
            state = tuple(
                int(snapshot[name])
                for name in ('agent_x', 'agent_y', 'food_x', 'food_y')
            )
            # max() keeps the first of several equally good actions.
            best = max(ACTIONS, key=lambda a: q_table.get((state, a), 0.0))
            keys.press(best)
            return keys.collect()

    game = babysnake.create_game()
    game.play(input_source=PolicyInput())
def _main():
    """Train an agent, report the Q-table size, then replay it in the terminal."""
    print("Training Q-learning agent on BabySnake...")
    q_table = train()
    print(f"\nDone. Q-table has {len(q_table)} entries.")
    print("\nWatching trained agent (press Enter or Escape to quit)...")
    watch(q_table)


if __name__ == '__main__':
    _main()

143
snake_training.md Normal file
View File

@@ -0,0 +1,143 @@
# Snake Training: Conceptual Questions
Answer each question in the space provided. Use evidence from the training log
and your observations of the agent at different checkpoints to support your answers.
---
## 1. Feature selection
In the first training attempt, the agent received the full 32×16 game board as
its input (6 × 32 × 16 = 3,072 numbers). The agent could see every character on
the board, yet it never learned to reliably find the apple after 45,000 episodes.
When we added `apple_dx` and `apple_dy` — two numbers that encode the direction
from the snake's head to the apple — performance improved dramatically within
hundreds of episodes.
**Why didn't the board encoding help the agent find the apple? What did the two
new features provide that the board encoding could not?**
*Your answer:*
---
## 2. Dimensionality reduction
In the full-board experiment, the agent processed 3,072 input values. When we
switched to the egocentric view (a 17×17 window centered on the snake's head),
the board input shrank to 17 × 17 × 6 = 1,734 values.
**How many input values did the egocentric view save compared to the full board?
What is one thing the agent gained from this change, and one thing it lost?**
*Your answer:*
---
## 3. Exploration vs. exploitation
With `epsilon_decay = 0.995`, epsilon falls from 1.0 to 0.05 by episode ~600 (0.995^600 ≈ 0.05).
With `epsilon_decay = 0.9997` (used in the final run), epsilon is still 0.55 at
episode 2,000.
**Sketch a rough curve of epsilon over time for each setting. Why does slower
decay produce a better-trained agent, even though it means the agent takes more
random actions overall?**
*Your answer:*
---
## 4. Runaway loss
In one intermediate experiment, the loss grew from around 35 to hundreds of
thousands within a few hundred episodes:
```
[ep_0300] avg_loss=48.7 avg_reward=+8.1
[ep_0500] avg_loss=347 avg_reward=+12.4
[ep_0700] avg_loss=4,102 avg_reward=+6.5
[ep_1100] avg_loss=686,000 avg_reward=-3.1
```
This happened because the learning algorithm was using MSE (mean squared error)
loss, which is *quadratic* — an error of size 2 produces a loss of 4, an error
of size 10 produces a loss of 100.
**Describe the feedback loop that caused the loss to spiral upward. Why does
Huber loss (which is linear for large errors) break this cycle?**
*Your answer:*
---
## 5. Interpreting the training curve
Look at the snake training log. The reward climbs, then dips, then climbs again:
```
[ep_1100] avg_reward=+34.5 avg_steps=57
[ep_1800] avg_reward=+4.4 avg_steps=98
[ep_3800] avg_reward=+51.2 avg_steps=33
[ep_9000] avg_reward=+246.0 avg_steps=85
```
Notice that around episode 3,800, avg_steps dropped sharply (from ~98 to 33)
at the same time reward jumped. Then by episode 9,000, steps rose again while
reward kept climbing.
**What do you think the agent was doing at each of these stages? Use the
avg_steps and avg_reward numbers to support your interpretation.**
*Your answer:*
---
## 6. Policy observation
Run these commands to watch the agent at three checkpoints:
```
retro-gamer play runs/snake --checkpoint ep_1100
retro-gamer play runs/snake --checkpoint ep_5400
retro-gamer play runs/snake --checkpoint ep_17100
```
**Describe the agent's behavior at each checkpoint. What has the agent learned
by episode 5,400 that it hadn't yet learned at episode 1,100? What does the
episode 17,100 agent do that the earlier agents do not?**
*ep_1100:*
*ep_5400:*
*ep_17100:*
---
## 7. CNN vs. MLP
In the first attempt (full board, no explicit features), we used a CNN
(`spatial = true`). In the final run (egocentric board + explicit features), we
used an MLP (`spatial = false`).
**Why might an MLP be a reasonable choice when using the egocentric view, even
though the input is still a 2D board? What does the CNN offer that the MLP does
not, and why is that less important with an egocentric observation?**
*Your answer:*
---
## 8. Hyperparameter comparison
Suppose you ran two otherwise identical training experiments:
- Run A: `learning_rate = 0.001`
- Run B: `learning_rate = 0.0001`
**Based on what you learned from the runaway loss in Question 4, predict what
would happen in each run. What does this tell you about the trade-off when
choosing a learning rate?**
*Your answer:*

102
training_log.md Normal file
View File

@@ -0,0 +1,102 @@
# Forager Training Log
Document each training attempt below. For each attempt, write your hypothesis
before you run the experiment, then fill in the evidence and analysis after.
Use `retro-gamer info runs/forager/` to see a summary of your run, and
`cat runs/forager/training.log` to see the full log.
---
## Attempt 1
### Hypothesis
*Before training, predict what will happen with the default configuration.
Will the agent learn to find the food? How quickly? What might go wrong?*
Your prediction:
### Configuration
*Copy the relevant sections of `runs/forager/config.toml` here.*
```toml
```
### Evidence
*Paste the first and last few lines of your training log, and any interesting
moments in between.*
```
```
### Analysis
*What happened? How do the numbers — avg_reward, avg_steps, epsilon, avg_loss —
tell the story of what the agent learned? Did the result match your prediction?*
---
## Attempt 2
### Hypothesis
*Based on what you observed in Attempt 1, what will you change and why?
Predict the outcome.*
### Configuration
```toml
```
### Evidence
```
```
### Analysis
---
## Attempt 3 (if needed)
### Hypothesis
### Configuration
```toml
```
### Evidence
```
```
### Analysis
---
## Final analysis
**Which attempt produced the best-trained agent? Run `retro-gamer play` on your
best run's checkpoints and describe what the agent does.**
*Your answer:*
**Compare two of your attempts. What changed between them, and how did that
change affect the training curve?**
*Your answer:*
**If you had more time, what would you try next to improve the agent further?
Refer to specific hyperparameters or configuration options.**
*Your answer:*

1208
uv.lock generated Normal file

File diff suppressed because it is too large Load Diff