Initial commit

2026-06-22 16:14:58 -04:00
commit 42bc2e7a50
14 changed files with 2049 additions and 0 deletions
--- a/.commit_template
+++ b/.commit_template
@@ -0,0 +1,9 @@
+
+
+# -----------------------------------------------------------------
+# Write your commit message above this line.
+# 
+# The first line should be a quick description of what you changed.
+# Then leave a blank line. 
+# Then write a few sentences describing an idea or a question you 
+# have been thinking about. 
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,2 @@
+**/__pycache__/*
+runs/*
--- a/babysnake/__init__.py
+++ b/babysnake/__init__.py
@@ -0,0 +1,117 @@
+"""BabySnake: a 4×4 grid game where an agent collects food.
+
+State: (agent_x, agent_y, food_x, food_y) — four integers.
+The agent starts with 50 energy. Each step costs 1 energy.
+Collecting food restores 30 energy and adds 1 to the score.
+The game ends when energy reaches 0.
+"""
+
+from random import randint, choice
+from retro.game import Game
+
+BOARD_SIZE = 4
+START_ENERGY = 50
+FOOD_ENERGY = 30
+
+class Forager:
+    """The player agent. Controlled with arrow keys."""
+    name = "Forager"
+    character = '@'
+    color = "green_on_black"
+    position = (0, 0)
+
+    UP = (0, -1)
+    DOWN = (0, 1)
+    LEFT = (-1, 0)
+    RIGHT = (1, 0)
+
+    def __init__(self):
+        self._direction = self.RIGHT
+
+    def handle_keystroke(self, keystroke, game):
+        if keystroke.name == "KEY_RIGHT":
+            self._direction = self.RIGHT
+        elif keystroke.name == "KEY_UP":
+            self._direction = self.UP
+        elif keystroke.name == "KEY_LEFT":
+            self._direction = self.LEFT
+        elif keystroke.name == "KEY_DOWN":
+            self._direction = self.DOWN
+
+    def play_turn(self, game):
+        x, y = self.position
+        dx, dy = self._direction
+        new_pos = (x + dx, y + dy)
+        if game.on_board(new_pos):
+            self.position = new_pos
+
+        game.state['energy'] -= 1
+        game.state['reward'] -= 0.01
+
+        food = game.get_agent_by_name("Food")
+        if self.position == food.position:
+            food.relocate(game)
+            game.state['energy'] += FOOD_ENERGY
+            game.state['score'] += 1
+            game.state['reward'] += 1.0
+
+        ax, ay = self.position
+        fx, fy = game.get_agent_by_name("Food").position
+        game.state['agent_x'] = ax
+        game.state['agent_y'] = ay
+        game.state['food_x'] = fx
+        game.state['food_y'] = fy
+
+        if game.state['energy'] <= 0:
+            game.end()
+
+
+class Food:
+    """The food item. Respawns at a random empty position when collected."""
+    name = "Food"
+    character = '*'
+    color = "yellow_on_black"
+    position = (0, 0)
+
+    def relocate(self, game):
+        bw, bh = game.board_size
+        forager = game.get_agent_by_name("Forager")
+        while True:
+            pos = (randint(0, bw - 1), randint(0, bh - 1))
+            if pos != forager.position:
+                self.position = pos
+                return
+
+
+def create_game():
+    """Return a fresh BabySnake game."""
+    forager = Forager()
+    food = Food()
+    bw = bh = BOARD_SIZE
+    game = Game(
+        [forager, food],
+        {
+            'score': 0,
+            'reward': 0.0,
+            'energy': START_ENERGY,
+            'agent_x': 0,
+            'agent_y': 0,
+            'food_x': 0,
+            'food_y': 0,
+        },
+        board_size=(bw, bh),
+        framerate=6,
+    )
+    forager.position = (randint(0, bw - 1), randint(0, bh - 1))
+    food.relocate(game)
+    ax, ay = forager.position
+    fx, fy = food.position
+    game.state['agent_x'] = ax
+    game.state['agent_y'] = ay
+    game.state['food_x'] = fx
+    game.state['food_y'] = fy
+    return game
+
+
+# When run as a script, build a fresh game and call play() on it.
+if __name__ == '__main__':
+    create_game().play()
--- a/babysnake/main.py
+++ b/babysnake/main.py
@@ -0,0 +1,3 @@
+# Launcher script: builds a BabySnake game and calls play() on it.
+from babysnake import create_game
+
+create_game().play()
--- a/babysnake/pyproject.toml
+++ b/babysnake/pyproject.toml
@@ -0,0 +1,4 @@
+[tool.retro-gamer]
+actions = ["KEY_RIGHT", "KEY_UP", "KEY_LEFT", "KEY_DOWN"]
+reward = "reward"
+character_set = ["@", "*"]
--- a/forager/__init__.py
+++ b/forager/__init__.py
@@ -0,0 +1,103 @@
+"""Forager: an 8×8 grid game where an agent collects food.
+
+The agent moves in four directions collecting food that respawns on collection.
+The game runs indefinitely; retro-gamer's max_turns_per_episode controls episode length.
+
+Observation features for retro-gamer:
+  food_dx: (food_x - agent_x) / board_width  (positive = food is to the right)
+  food_dy: (food_y - agent_y) / board_height (positive = food is below)
+"""
+
+from random import randint
+from retro.game import Game
+
+BOARD_SIZE = 8
+
+
+class Forager:
+    """The player agent."""
+    name = "Forager"
+    character = '@'
+    color = "green_on_black"
+    position = (0, 0)
+
+    UP = (0, -1)
+    DOWN = (0, 1)
+    LEFT = (-1, 0)
+    RIGHT = (1, 0)
+
+    def __init__(self):
+        self._direction = self.RIGHT
+
+    def handle_keystroke(self, keystroke, game):
+        if keystroke.name == "KEY_RIGHT":
+            self._direction = self.RIGHT
+        elif keystroke.name == "KEY_UP":
+            self._direction = self.UP
+        elif keystroke.name == "KEY_LEFT":
+            self._direction = self.LEFT
+        elif keystroke.name == "KEY_DOWN":
+            self._direction = self.DOWN
+
+    def play_turn(self, game):
+        x, y = self.position
+        dx, dy = self._direction
+        new_pos = (x + dx, y + dy)
+        if game.on_board(new_pos):
+            self.position = new_pos
+
+        game.state['reward'] -= 0.01
+
+        food = game.get_agent_by_name("Food")
+        if self.position == food.position:
+            food.relocate(game)
+            game.state['score'] += 1
+            game.state['reward'] += 1.0
+
+        bw, bh = game.board_size
+        ax, ay = self.position
+        fx, fy = game.get_agent_by_name("Food").position
+        game.state['food_dx'] = (fx - ax) / bw
+        game.state['food_dy'] = (fy - ay) / bh
+
+
+class Food:
+    """The food item. Respawns when collected."""
+    name = "Food"
+    character = '*'
+    color = "yellow_on_black"
+    position = (0, 0)
+
+    def relocate(self, game):
+        bw, bh = game.board_size
+        forager = game.get_agent_by_name("Forager")
+        while True:
+            pos = (randint(0, bw - 1), randint(0, bh - 1))
+            if pos != forager.position:
+                self.position = pos
+                return
+
+
+def create_game():
+    """Return a fresh Forager game."""
+    forager = Forager()
+    food = Food()
+    bw = bh = BOARD_SIZE
+    game = Game(
+        [forager, food],
+        {'score': 0, 'reward': 0.0, 'food_dx': 0.0, 'food_dy': 0.0},
+        board_size=(bw, bh),
+        framerate=12,
+    )
+    forager.position = (randint(0, bw - 1), randint(0, bh - 1))
+    food.relocate(game)
+    bw, bh = game.board_size
+    ax, ay = forager.position
+    fx, fy = food.position
+    game.state['food_dx'] = (fx - ax) / bw
+    game.state['food_dy'] = (fy - ay) / bh
+    return game
+
+
+# When run as a script, build a fresh game and call play() on it.
+if __name__ == '__main__':
+    create_game().play()
--- a/forager/main.py
+++ b/forager/main.py
@@ -0,0 +1,3 @@
+# Launcher script: builds a Forager game and calls play() on it.
+from forager import create_game
+
+create_game().play()
--- a/forager/pyproject.toml
+++ b/forager/pyproject.toml
@@ -0,0 +1,4 @@
+[tool.retro-gamer]
+actions = ["KEY_RIGHT", "KEY_UP", "KEY_LEFT", "KEY_DOWN"]
+reward = "reward"
+character_set = ["@", "*"]
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -0,0 +1,20 @@
+[project]
+name = "lab-reinforcement-learning"
+version = "0.1.0"
+description = "Reinforcement learning lab"
+requires-python = ">=3.11"
+dependencies = [
+    "retro-games>=2.3.1",
+    "retro-gamer>=0.1.0",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["babysnake", "forager"]
+
+[tool.uv.sources]
+retro-games = { path = "../../packages/retro", editable = true }
+retro-gamer = { path = "../../packages/retro-gamer", editable = true }
--- a/q_learning.py
+++ b/q_learning.py
@@ -0,0 +1,218 @@
+"""Q-learning agent for BabySnake.
+
+This file contains starter code for implementing a Q-learning agent.
+You need to fill in two functions:
+  - choose_action: select an action using an epsilon-greedy policy
+  - update_q: update the Q-table using the Bellman equation
+
+Run this file to train the agent:
+  python q_learning.py
+
+After training, run this to watch it play:
+  python -c "from q_learning import watch; watch()"
+"""
+
+import random
+import babysnake
+from retro.input import ProgrammaticInput
+from retro.views.headless import HeadlessView
+
+# The four actions the agent can take.
+ACTIONS = ["KEY_RIGHT", "KEY_DOWN", "KEY_LEFT", "KEY_UP"]
+
+
+# ---------------------------------------------------------------------------
+# Environment wrapper
+# ---------------------------------------------------------------------------
+
+class BabySnakeEnv:
+    """A simple wrapper that lets us step through BabySnake programmatically.
+
+    Usage:
+        env = BabySnakeEnv()
+        state = env.reset()           # start a new episode
+        next_state, reward, done = env.step("KEY_RIGHT")
+    """
+
+    def reset(self):
+        """Start a new episode. Returns the initial state tuple."""
+        self._inp = ProgrammaticInput()
+        self.game = babysnake.create_game()
+        self.game.input_source = self._inp
+        self.game.view = HeadlessView()
+        self.game.start()
+        self._prev_reward = 0.0
+        return self._get_state()
+
+    def step(self, action):
+        """Take one action. Returns (next_state, reward, done).
+
+        Arguments:
+            action (str): One of ACTIONS, or None for no-op.
+
+        Returns:
+            next_state (tuple): The state after the action.
+            reward (float): The reward received this step.
+            done (bool): True if the episode has ended.
+        """
+        self._inp.press(action)
+        self.game.step()
+        next_state = self._get_state()
+        reward = self.game.state['reward'] - self._prev_reward
+        self._prev_reward = self.game.state['reward']
+        done = not self.game.playing
+        return next_state, reward, done
+
+    def _get_state(self):
+        """Return the current state as a tuple of four integers."""
+        s = self.game.state
+        return (int(s['agent_x']), int(s['agent_y']),
+                int(s['food_x']),  int(s['food_y']))
+
+
+# ---------------------------------------------------------------------------
+# Q-learning functions — fill these in!
+# ---------------------------------------------------------------------------
+
+def choose_action(q_table: dict, state: tuple, epsilon: float) -> str:
+    """Choose an action using an epsilon-greedy policy.
+
+    With probability `epsilon`, return a random action from ACTIONS.
+    Otherwise, return the action with the highest Q-value in `q_table`
+    for the given `state`. If a (state, action) pair has not been seen
+    before, treat its Q-value as 0.
+
+    Arguments:
+        q_table (dict): Maps (state, action) -> Q-value.
+        state (tuple): The current state, e.g. (1, 2, 3, 0).
+        epsilon (float): Exploration rate, between 0.0 and 1.0.
+
+    Returns:
+        str: One action from ACTIONS.
+
+    Hint: random.random() returns a float in [0.0, 1.0).
+          random.choice(ACTIONS) returns a random action.
+          q_table.get(key, default) is handy for missing entries.
+    """
+    raise NotImplementedError("Fill in choose_action")
+
+
+def update_q(q_table: dict, state: tuple, action: str, reward: float, next_state: tuple, alpha: float, gamma: float) -> None:
+    """Update one entry of the Q-table using the Bellman equation.
+
+    The update rule is:
+
+        Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
+
+    where:
+        s, a       — the state we were in and the action we took
+        r          — the reward we received
+        s'         — the state we ended up in
+        max_a' ... — the best possible Q-value from the new state
+
+    Arguments:
+        q_table (dict): Maps (state, action) -> Q-value (modified in place).
+        state (tuple): The state before the action.
+        action (str): The action taken.
+        reward (float): The reward received.
+        next_state (tuple): The state after the action.
+        alpha (float): Learning rate (how much to update).
+        gamma (float): Discount factor (how much to value future rewards).
+
+    Returns:
+        None — modifies q_table in place.
+
+    Hint: Q-values for unseen (state, action) pairs start at 0.
+    """
+    raise NotImplementedError("Fill in update_q")
+
+
+# ---------------------------------------------------------------------------
+# Training loop
+# ---------------------------------------------------------------------------
+
+def train(
+    episodes=1000,
+    alpha=0.1,
+    gamma=0.95,
+    epsilon=1.0,
+    epsilon_decay=0.995,
+    epsilon_min=0.05,
+):
+    """Train a Q-learning agent on BabySnake.
+
+    Arguments:
+        episodes (int): How many episodes to run.
+        alpha (float): Learning rate.
+        gamma (float): Discount factor.
+        epsilon (float): Starting exploration rate.
+        epsilon_decay (float): Multiply epsilon by this each episode.
+        epsilon_min (float): Epsilon never falls below this.
+
+    Returns:
+        dict: The trained Q-table.
+    """
+    q_table = {}
+    env = BabySnakeEnv()
+
+    for episode in range(episodes):
+        state = env.reset()
+        total_reward = 0.0
+
+        while env.game.playing:
+            action = choose_action(q_table, state, epsilon)
+            next_state, reward, done = env.step(action)
+            update_q(q_table, state, action, reward, next_state, alpha, gamma)
+            state = next_state
+            total_reward += reward
+
+        epsilon = max(epsilon_min, epsilon * epsilon_decay)
+
+        if (episode + 1) % 100 == 0:
+            print(
+                f"Episode {episode + 1:5d}  "
+                f"reward={total_reward:6.1f}  "
+                f"score={env.game.state['score']}  "
+                f"epsilon={epsilon:.3f}  "
+                f"q_entries={len(q_table)}"
+            )
+
+    return q_table
+
+
+def watch(q_table=None):
+    """Watch the trained agent play in the terminal.
+
+    Arguments:
+        q_table (dict | None): A trained Q-table. If None, trains first.
+    """
+    import babysnake
+    from retro.input import ProgrammaticInput
+
+    if q_table is None:
+        print("Training first...")
+        q_table = train()
+
+    _inp = ProgrammaticInput()
+
+    class PolicyInput:
+        """An input source that picks actions from the Q-table."""
+        def collect(self):
+            s = game.state
+            state = (int(s['agent_x']), int(s['agent_y']),
+                     int(s['food_x']),  int(s['food_y']))
+            q_values = [q_table.get((state, a), 0.0) for a in ACTIONS]
+            best = ACTIONS[q_values.index(max(q_values))]
+            _inp.press(best)
+            return _inp.collect()
+
+    game = babysnake.create_game()
+    game.play(input_source=PolicyInput())
+
+
+# Script entry point: train with the default hyperparameters, report the
+# size of the learned Q-table, then pass the table to watch().
+if __name__ == '__main__':
+    print("Training Q-learning agent on BabySnake...")
+    q_table = train()
+    print(f"\nDone. Q-table has {len(q_table)} entries.")
+    print("\nWatching trained agent (press Enter or Escape to quit)...")
+    watch(q_table)
--- a/q_learning_solution.py
+++ b/q_learning_solution.py
@@ -0,0 +1,113 @@
+"""Solution for q_learning.py — remove before publishing to students."""
+
+import random
+import babysnake
+from retro.input import ProgrammaticInput
+from retro.views.headless import HeadlessView
+
+ACTIONS = ["KEY_RIGHT", "KEY_DOWN", "KEY_LEFT", "KEY_UP"]
+
+
+class BabySnakeEnv:
+    def reset(self):
+        self._inp = ProgrammaticInput()
+        self.game = babysnake.create_game()
+        self.game.input_source = self._inp
+        self.game.view = HeadlessView()
+        self.game.start()
+        self._prev_reward = 0.0
+        return self._get_state()
+
+    def step(self, action):
+        self._inp.press(action)
+        self.game.step()
+        next_state = self._get_state()
+        reward = self.game.state['reward'] - self._prev_reward
+        self._prev_reward = self.game.state['reward']
+        done = not self.game.playing
+        return next_state, reward, done
+
+    def _get_state(self):
+        s = self.game.state
+        return (int(s['agent_x']), int(s['agent_y']),
+                int(s['food_x']),  int(s['food_y']))
+
+
+def choose_action(q_table, state, epsilon):
+    if random.random() < epsilon:
+        return random.choice(ACTIONS)
+    q_values = [q_table.get((state, a), 0.0) for a in ACTIONS]
+    return ACTIONS[q_values.index(max(q_values))]
+
+
+def update_q(q_table, state, action, reward, next_state, alpha, gamma):
+    old_q = q_table.get((state, action), 0.0)
+    next_q_values = [q_table.get((next_state, a), 0.0) for a in ACTIONS]
+    best_next_q = max(next_q_values)
+    new_q = old_q + alpha * (reward + gamma * best_next_q - old_q)
+    q_table[(state, action)] = new_q
+
+
+def train(
+    episodes=1000,
+    alpha=0.1,
+    gamma=0.95,
+    epsilon=1.0,
+    epsilon_decay=0.995,
+    epsilon_min=0.05,
+):
+    q_table = {}
+    env = BabySnakeEnv()
+
+    for episode in range(episodes):
+        state = env.reset()
+        total_reward = 0.0
+
+        while env.game.playing:
+            action = choose_action(q_table, state, epsilon)
+            next_state, reward, done = env.step(action)
+            update_q(q_table, state, action, reward, next_state, alpha, gamma)
+            state = next_state
+            total_reward += reward
+
+        epsilon = max(epsilon_min, epsilon * epsilon_decay)
+
+        if (episode + 1) % 100 == 0:
+            print(
+                f"Episode {episode + 1:5d}  "
+                f"reward={total_reward:6.1f}  "
+                f"score={env.game.state['score']}  "
+                f"epsilon={epsilon:.3f}  "
+                f"q_entries={len(q_table)}"
+            )
+
+    return q_table
+
+
+def watch(q_table=None):
+    if q_table is None:
+        print("Training first...")
+        q_table = train()
+
+    _inp = ProgrammaticInput()
+
+    class PolicyInput:
+        def collect(self):
+            s = game.state
+            state = (int(s['agent_x']), int(s['agent_y']),
+                     int(s['food_x']),  int(s['food_y']))
+            q_values = [q_table.get((state, a), 0.0) for a in ACTIONS]
+            best = ACTIONS[q_values.index(max(q_values))]
+            _inp.press(best)
+            return _inp.collect()
+
+    game = babysnake.create_game()
+    game.play(input_source=PolicyInput())
+
+
+# Script entry point: train with the default hyperparameters, report the
+# size of the learned Q-table, then pass the table to watch().
+if __name__ == '__main__':
+    print("Training Q-learning agent on BabySnake...")
+    q_table = train()
+    print(f"\nDone. Q-table has {len(q_table)} entries.")
+    print("\nWatching trained agent (press Enter or Escape to quit)...")
+    watch(q_table)
--- a/snake_training.md
+++ b/snake_training.md
@@ -0,0 +1,143 @@
+# Snake Training: Conceptual Questions
+
+Answer each question in the space provided. Use evidence from the training log
+and your observations of the agent at different checkpoints to support your answers.
+
+---
+
+## 1. Feature selection
+
+In the first training attempt, the agent received the full 32×16 game board as
+its input (6 × 32 × 16 = 3,072 numbers). The agent could see every character on
+the board, yet it never learned to reliably find the apple after 45,000 episodes.
+
+When we added `apple_dx` and `apple_dy` — two numbers that encode the direction
+from the snake's head to the apple — performance improved dramatically within
+hundreds of episodes.
+
+**Why didn't the board encoding help the agent find the apple? What did the two
+new features provide that the board encoding could not?**
+
+*Your answer:*
+
+---
+
+## 2. Dimensionality reduction
+
+In the full-board experiment, the agent processed 3,072 input values. When we
+switched to the egocentric view (a 17×17 window centered on the snake's head),
+the board input shrank to 17 × 17 × 6 = 1,734 values.
+
+**How many input values did the egocentric view save compared to the full board?
+What is one thing the agent gained from this change, and one thing it lost?**
+
+*Your answer:*
+
+---
+
+## 3. Exploration vs. exploitation
+
+With `epsilon_decay = 0.995`, epsilon falls from 1.0 to 0.05 by episode ~600.
+With `epsilon_decay = 0.9997` (used in the final run), epsilon is still 0.55 at
+episode 2,000.
+
+**Sketch a rough curve of epsilon over time for each setting. Why does slower
+decay produce a better-trained agent, even though it means the agent takes more
+random actions overall?**
+
+*Your answer:*
+
+---
+
+## 4. Runaway loss
+
+In one intermediate experiment, the loss grew from around 35 to hundreds of
+thousands within a few hundred episodes:
+
+```
+[ep_0300]  avg_loss=48.7   avg_reward=+8.1
+[ep_0500]  avg_loss=347    avg_reward=+12.4
+[ep_0700]  avg_loss=4,102  avg_reward=+6.5
+[ep_1100]  avg_loss=686,000  avg_reward=-3.1
+```
+
+This happened because the learning algorithm was using MSE (mean squared error)
+loss, which is *quadratic* — an error of size 2 produces a loss of 4, an error
+of size 10 produces a loss of 100.
+
+**Describe the feedback loop that caused the loss to spiral upward. Why does
+Huber loss (which is linear for large errors) break this cycle?**
+
+*Your answer:*
+
+---
+
+## 5. Interpreting the training curve
+
+Look at the snake training log. The reward climbs, then dips, then climbs again:
+
+```
+[ep_1100]  avg_reward=+34.5  avg_steps=57
+[ep_1800]  avg_reward=+4.4   avg_steps=98
+[ep_3800]  avg_reward=+51.2  avg_steps=33
+[ep_9000]  avg_reward=+246.0 avg_steps=85
+```
+
+Notice that around episode 3,800, avg_steps dropped sharply (from ~98 to 33)
+at the same time reward jumped. Then by episode 9,000, steps rose again while
+reward kept climbing.
+
+**What do you think the agent was doing at each of these stages? Use the
+avg_steps and avg_reward numbers to support your interpretation.**
+
+*Your answer:*
+
+---
+
+## 6. Policy observation
+
+Run these commands to watch the agent at three checkpoints:
+
+```
+retro-gamer play runs/snake --checkpoint ep_1100
+retro-gamer play runs/snake --checkpoint ep_5400
+retro-gamer play runs/snake --checkpoint ep_17100
+```
+
+**Describe the agent's behavior at each checkpoint. What has the agent learned
+by episode 5,400 that it hadn't yet learned at episode 1,100? What does the
+episode 17,100 agent do that the earlier agents do not?**
+
+*ep_1100:*
+
+*ep_5400:*
+
+*ep_17100:*
+
+---
+
+## 7. CNN vs. MLP
+
+In the first attempt (full board, no explicit features), we used a CNN
+(`spatial = true`). In the final run (egocentric board + explicit features), we
+used an MLP (`spatial = false`).
+
+**Why might an MLP be a reasonable choice when using the egocentric view, even
+though the input is still a 2D board? What does the CNN offer that the MLP does
+not, and why is that less important with an egocentric observation?**
+
+*Your answer:*
+
+---
+
+## 8. Hyperparameter comparison
+
+Suppose you ran two otherwise identical training experiments:
+- Run A: `learning_rate = 0.001`
+- Run B: `learning_rate = 0.0001`
+
+**Based on what you learned from the runaway loss in Question 4, predict what
+would happen in each run. What does this tell you about the trade-off when
+choosing a learning rate?**
+
+*Your answer:*
--- a/training_log.md
+++ b/training_log.md
@@ -0,0 +1,102 @@
+# Forager Training Log
+
+Document each training attempt below. For each attempt, write your hypothesis
+before you run the experiment, then fill in the evidence and analysis after.
+
+Use `retro-gamer info runs/forager/` to see a summary of your run, and
+`cat runs/forager/training.log` to see the full log.
+
+---
+
+## Attempt 1
+
+### Hypothesis
+
+*Before training, predict what will happen with the default configuration.
+Will the agent learn to find the food? How quickly? What might go wrong?*
+
+Your prediction:
+
+### Configuration
+
+*Copy the relevant sections of `runs/forager/config.toml` here.*
+
+```toml
+
+```
+
+### Evidence
+
+*Paste the first and last few lines of your training log, and any interesting
+moments in between.*
+
+```
+
+```
+
+### Analysis
+
+*What happened? How do the numbers — avg_reward, avg_steps, epsilon, avg_loss —
+tell the story of what the agent learned? Did the result match your prediction?*
+
+---
+
+## Attempt 2
+
+### Hypothesis
+
+*Based on what you observed in Attempt 1, what will you change and why?
+Predict the outcome.*
+
+### Configuration
+
+```toml
+
+```
+
+### Evidence
+
+```
+
+```
+
+### Analysis
+
+---
+
+## Attempt 3 (if needed)
+
+### Hypothesis
+
+### Configuration
+
+```toml
+
+```
+
+### Evidence
+
+```
+
+```
+
+### Analysis
+
+---
+
+## Final analysis
+
+**Which attempt produced the best-trained agent? Run `retro-gamer play` on your
+best run's checkpoints and describe what the agent does.**
+
+*Your answer:*
+
+**Compare two of your attempts. What changed between them, and how did that
+change affect the training curve?**
+
+*Your answer:*
+
+**If you had more time, what would you try next to improve the agent further?
+Refer to specific hyperparameters or configuration options.**
+
+*Your answer:*
--- a/uv.lock
+++ b/uv.lock