Initial commit
This commit is contained in:
9
.commit_template
Normal file
9
.commit_template
Normal file
@@ -0,0 +1,9 @@
|
||||
|
||||
|
||||
# -----------------------------------------------------------------
|
||||
# Write your commit message above this line.
|
||||
#
|
||||
# The first line should be a quick description of what you changed.
|
||||
# Then leave a blank line.
|
||||
# Then write a few sentences describing an idea or a question you
|
||||
# have been thinking about.
|
||||
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
**/__pycache__/*
|
||||
runs/*
|
||||
117
babysnake/__init__.py
Normal file
117
babysnake/__init__.py
Normal file
@@ -0,0 +1,117 @@
|
||||
"""BabySnake: a 4×4 grid game where an agent collects food.
|
||||
|
||||
State: (agent_x, agent_y, food_x, food_y) — four integers.
|
||||
The agent starts with 50 energy. Each step costs 1 energy.
|
||||
Collecting food restores 30 energy and adds 1 to the score.
|
||||
The game ends when energy reaches 0.
|
||||
"""
|
||||
|
||||
from random import randint, choice
|
||||
from retro.game import Game
|
||||
|
||||
BOARD_SIZE = 4
|
||||
START_ENERGY = 50
|
||||
FOOD_ENERGY = 30
|
||||
|
||||
class Forager:
    """Player-controlled agent for BabySnake.

    Arrow keys set the heading. On every turn the agent tries to move one
    cell along that heading, pays the per-step costs, collects the food if
    it is standing on it, and publishes both positions to ``game.state``.
    """

    name = "Forager"
    character = '@'
    color = "green_on_black"
    position = (0, 0)

    # Headings expressed as (dx, dy) offsets.
    UP = (0, -1)
    DOWN = (0, 1)
    LEFT = (-1, 0)
    RIGHT = (1, 0)

    # Arrow-key name -> name of the heading attribute that key selects.
    _HEADING_FOR_KEY = {
        "KEY_RIGHT": "RIGHT",
        "KEY_UP": "UP",
        "KEY_LEFT": "LEFT",
        "KEY_DOWN": "DOWN",
    }

    def __init__(self):
        # The agent starts out heading right until a key says otherwise.
        self._direction = self.RIGHT

    def handle_keystroke(self, keystroke, game):
        """Turn toward the arrow key that was pressed.

        A keystroke whose ``name`` is not one of the four arrow keys leaves
        the current heading untouched. ``game`` is not used.
        """
        heading = self._HEADING_FOR_KEY.get(keystroke.name)
        if heading is not None:
            self._direction = getattr(self, heading)

    def play_turn(self, game):
        """Advance one turn: move, pay costs, eat, publish state, maybe end."""
        here_x, here_y = self.position
        step_x, step_y = self._direction
        target = (here_x + step_x, here_y + step_y)
        # A move that would leave the board is dropped; the turn still costs.
        if game.on_board(target):
            self.position = target

        state = game.state
        state['energy'] -= 1
        state['reward'] -= 0.01

        food = game.get_agent_by_name("Food")
        if self.position == food.position:
            food.relocate(game)
            state['energy'] += FOOD_ENERGY
            state['score'] += 1
            state['reward'] += 1.0

        # Publish the post-move layout (food may just have been relocated).
        state['agent_x'], state['agent_y'] = self.position
        state['food_x'], state['food_y'] = food.position

        if state['energy'] <= 0:
            game.end()
|
||||
|
||||
|
||||
class Food:
    """The food item. Respawns on a random cell the Forager is not on."""

    name = "Food"
    character = '*'
    color = "yellow_on_black"
    position = (0, 0)

    def relocate(self, game):
        """Move the food to a uniformly random cell not held by the Forager.

        The free cells are enumerated up front and one is picked directly,
        so the call always terminates. Retrying random cells until one
        differs from the Forager's never finishes when no such cell exists.

        Arguments:
            game: The running game. Must provide ``board_size`` as a
                (width, height) pair and ``get_agent_by_name("Forager")``.

        Raises:
            ValueError: If the board has no cell other than the Forager's.
        """
        bw, bh = game.board_size
        taken = game.get_agent_by_name("Forager").position
        free_cells = [
            (x, y)
            for x in range(bw)
            for y in range(bh)
            if (x, y) != taken
        ]
        if not free_cells:
            raise ValueError("no free cell available to place the food")
        self.position = choice(free_cells)
|
||||
|
||||
|
||||
def create_game():
    """Build a new BabySnake game with a randomly placed agent and food.

    Returns:
        Game: A BOARD_SIZE x BOARD_SIZE game whose state holds the starting
        energy and the initial agent and food coordinates.
    """
    size = BOARD_SIZE
    forager, food = Forager(), Food()
    initial_state = {
        'score': 0,
        'reward': 0.0,
        'energy': START_ENERGY,
        'agent_x': 0,
        'agent_y': 0,
        'food_x': 0,
        'food_y': 0,
    }
    game = Game(
        [forager, food],
        initial_state,
        board_size=(size, size),
        framerate=6,
    )

    # Place the agent first: Food.relocate avoids the Forager's cell.
    forager.position = (randint(0, size - 1), randint(0, size - 1))
    food.relocate(game)

    # Replace the placeholder coordinates with the real starting layout.
    game.state['agent_x'], game.state['agent_y'] = forager.position
    game.state['food_x'], game.state['food_y'] = food.position
    return game
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run interactively when this module is executed as a script.
    game = create_game()
    game.play()
|
||||
3
babysnake/__main__.py
Normal file
3
babysnake/__main__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from babysnake import create_game
|
||||
|
||||
# Package entry point: build a fresh game and hand control to its play loop.
game = create_game()
game.play()
|
||||
4
babysnake/pyproject.toml
Normal file
4
babysnake/pyproject.toml
Normal file
@@ -0,0 +1,4 @@
|
||||
[tool.retro-gamer]
|
||||
actions = ["KEY_RIGHT", "KEY_UP", "KEY_LEFT", "KEY_DOWN"]
|
||||
reward = "reward"
|
||||
character_set = ["@", "*"]
|
||||
103
forager/__init__.py
Normal file
103
forager/__init__.py
Normal file
@@ -0,0 +1,103 @@
|
||||
"""Forager: an 8×8 grid game where an agent collects food.
|
||||
|
||||
The agent moves in four directions collecting food that respawns on collection.
|
||||
The game runs indefinitely; retro-gamer's max_turns_per_episode controls episode length.
|
||||
|
||||
Observation features for retro-gamer:
|
||||
food_dx: (food_x - agent_x) / board_width (positive = food is to the right)
|
||||
food_dy: (food_y - agent_y) / board_height (positive = food is below)
|
||||
"""
|
||||
|
||||
from random import randint
|
||||
from retro.game import Game
|
||||
|
||||
BOARD_SIZE = 8
|
||||
|
||||
|
||||
class Forager:
    """Player agent for the Forager game.

    Arrow keys set the heading. Each turn the agent tries to move one cell
    along it, pays a small step penalty, collects the food if it is standing
    on it, and publishes the normalised offset to the food in ``game.state``.
    """

    name = "Forager"
    character = '@'
    color = "green_on_black"
    position = (0, 0)

    # Headings expressed as (dx, dy) offsets.
    UP = (0, -1)
    DOWN = (0, 1)
    LEFT = (-1, 0)
    RIGHT = (1, 0)

    # Arrow-key name -> name of the heading attribute that key selects.
    _HEADING_FOR_KEY = {
        "KEY_RIGHT": "RIGHT",
        "KEY_UP": "UP",
        "KEY_LEFT": "LEFT",
        "KEY_DOWN": "DOWN",
    }

    def __init__(self):
        # The agent starts out heading right until a key says otherwise.
        self._direction = self.RIGHT

    def handle_keystroke(self, keystroke, game):
        """Turn toward the arrow key that was pressed.

        A keystroke whose ``name`` is not one of the four arrow keys leaves
        the current heading untouched. ``game`` is not used.
        """
        heading = self._HEADING_FOR_KEY.get(keystroke.name)
        if heading is not None:
            self._direction = getattr(self, heading)

    def play_turn(self, game):
        """Advance one turn: move, pay the step cost, eat, publish features."""
        here_x, here_y = self.position
        step_x, step_y = self._direction
        target = (here_x + step_x, here_y + step_y)
        # A move that would leave the board is dropped; the turn still costs.
        if game.on_board(target):
            self.position = target

        state = game.state
        state['reward'] -= 0.01

        food = game.get_agent_by_name("Food")
        if self.position == food.position:
            food.relocate(game)
            state['score'] += 1
            state['reward'] += 1.0

        # Offset from agent to food, scaled by the board dimensions.
        width, height = game.board_size
        agent_x, agent_y = self.position
        food_x, food_y = food.position
        state['food_dx'] = (food_x - agent_x) / width
        state['food_dy'] = (food_y - agent_y) / height
|
||||
|
||||
|
||||
class Food:
    """The food item. Respawns on a random cell the Forager is not on."""

    name = "Food"
    character = '*'
    color = "yellow_on_black"
    position = (0, 0)

    def relocate(self, game):
        """Move the food to a uniformly random cell not held by the Forager.

        The free cells are enumerated up front and one is picked by index,
        so the call always terminates. Retrying random cells until one
        differs from the Forager's never finishes when no such cell exists.

        Arguments:
            game: The running game. Must provide ``board_size`` as a
                (width, height) pair and ``get_agent_by_name("Forager")``.

        Raises:
            ValueError: If the board has no cell other than the Forager's.
        """
        bw, bh = game.board_size
        taken = game.get_agent_by_name("Forager").position
        free_cells = [
            (x, y)
            for x in range(bw)
            for y in range(bh)
            if (x, y) != taken
        ]
        if not free_cells:
            raise ValueError("no free cell available to place the food")
        # Only `randint` is imported by this module, so index with it.
        self.position = free_cells[randint(0, len(free_cells) - 1)]
|
||||
|
||||
|
||||
def create_game():
    """Build a new Forager game with a randomly placed agent and food.

    Returns:
        Game: A BOARD_SIZE x BOARD_SIZE game whose ``food_dx`` / ``food_dy``
        state entries already describe the starting layout.
    """
    size = BOARD_SIZE
    forager, food = Forager(), Food()
    game = Game(
        [forager, food],
        {'score': 0, 'reward': 0.0, 'food_dx': 0.0, 'food_dy': 0.0},
        board_size=(size, size),
        framerate=12,
    )

    # Place the agent first: Food.relocate avoids the Forager's cell.
    forager.position = (randint(0, size - 1), randint(0, size - 1))
    food.relocate(game)

    # Seed the observation features so the first turn sees real values.
    width, height = game.board_size
    agent_x, agent_y = forager.position
    food_x, food_y = food.position
    game.state['food_dx'] = (food_x - agent_x) / width
    game.state['food_dy'] = (food_y - agent_y) / height
    return game
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run interactively when this module is executed as a script.
    game = create_game()
    game.play()
|
||||
3
forager/__main__.py
Normal file
3
forager/__main__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from forager import create_game
|
||||
|
||||
# Package entry point: build a fresh game and hand control to its play loop.
game = create_game()
game.play()
|
||||
4
forager/pyproject.toml
Normal file
4
forager/pyproject.toml
Normal file
@@ -0,0 +1,4 @@
|
||||
[tool.retro-gamer]
|
||||
actions = ["KEY_RIGHT", "KEY_UP", "KEY_LEFT", "KEY_DOWN"]
|
||||
reward = "reward"
|
||||
character_set = ["@", "*"]
|
||||
20
pyproject.toml
Normal file
20
pyproject.toml
Normal file
@@ -0,0 +1,20 @@
|
||||
[project]
|
||||
name = "lab-reinforcement-learning"
|
||||
version = "0.1.0"
|
||||
description = "Reinforcement learning lab"
|
||||
requires-python = ">=3.11"
|
||||
dependencies = [
|
||||
"retro-games>=2.3.1",
|
||||
"retro-gamer>=0.1.0",
|
||||
]
|
||||
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[tool.hatch.build.targets.wheel]
|
||||
packages = ["babysnake", "forager"]
|
||||
|
||||
[tool.uv.sources]
|
||||
retro-games = { path = "../../packages/retro", editable = true }
|
||||
retro-gamer = { path = "../../packages/retro-gamer", editable = true }
|
||||
218
q_learning.py
Normal file
218
q_learning.py
Normal file
@@ -0,0 +1,218 @@
|
||||
"""Q-learning agent for BabySnake.
|
||||
|
||||
This file contains starter code for implementing a Q-learning agent.
|
||||
You need to fill in two functions:
|
||||
- choose_action: select an action using an epsilon-greedy policy
|
||||
- update_q: update the Q-table using the Bellman equation
|
||||
|
||||
Run this file to train the agent:
|
||||
python q_learning.py
|
||||
|
||||
To watch the agent play (watch() trains a fresh agent first when no Q-table is passed):
|
||||
python -c "from q_learning import watch; watch()"
|
||||
"""
|
||||
|
||||
import random
|
||||
import babysnake
|
||||
from retro.input import ProgrammaticInput
|
||||
from retro.views.headless import HeadlessView
|
||||
|
||||
# The four actions the agent can take.
|
||||
ACTIONS = ["KEY_RIGHT", "KEY_DOWN", "KEY_LEFT", "KEY_UP"]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Environment wrapper
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class BabySnakeEnv:
    """Step-at-a-time wrapper around a headless BabySnake game.

    Usage:
        env = BabySnakeEnv()
        state = env.reset()                      # start a new episode
        next_state, reward, done = env.step("KEY_RIGHT")
    """

    _STATE_KEYS = ('agent_x', 'agent_y', 'food_x', 'food_y')

    def reset(self):
        """Create and start a fresh game; return its initial state tuple."""
        source = ProgrammaticInput()
        game = babysnake.create_game()
        game.input_source = source
        game.view = HeadlessView()
        game.start()

        self._inp = source
        self.game = game
        # The game keeps a running reward total; remember the last total
        # seen so that step() can report per-step differences.
        self._prev_reward = 0.0
        return self._get_state()

    def step(self, action):
        """Press ``action``, advance the game one step, and report the result.

        Arguments:
            action (str): One of ACTIONS, or None for no-op.

        Returns:
            next_state (tuple): The state after the action.
            reward (float): Change in the game's cumulative reward this step.
            done (bool): True if the game is no longer playing.
        """
        self._inp.press(action)
        self.game.step()
        observed = self._get_state()
        cumulative = self.game.state['reward']
        gained = cumulative - self._prev_reward
        self._prev_reward = cumulative
        return observed, gained, not self.game.playing

    def _get_state(self):
        """Return (agent_x, agent_y, food_x, food_y) as a tuple of ints."""
        snapshot = self.game.state
        return tuple(int(snapshot[key]) for key in self._STATE_KEYS)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Q-learning functions — fill these in!
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def choose_action(q_table, state, epsilon):
    """Pick the next action with an epsilon-greedy policy.

    Explore with probability `epsilon` by returning a random entry of
    ACTIONS. Otherwise exploit: return the action whose Q-value for `state`
    is highest in `q_table`. A (state, action) pair that is missing from
    the table counts as having a Q-value of 0.

    Arguments:
        q_table (dict): Maps (state, action) -> Q-value.
        state (tuple): The current state, e.g. (1, 2, 3, 0).
        epsilon (float): Exploration rate, between 0.0 and 1.0.

    Returns:
        str: One action from ACTIONS.

    Hints:
        random.random() returns a float in [0.0, 1.0).
        random.choice(ACTIONS) returns a random action.
        q_table.get(key, default) is handy for missing entries.
    """
    # Exercise: this body is intentionally left for the student to write.
    raise NotImplementedError("Fill in choose_action")
|
||||
|
||||
|
||||
def update_q(q_table, state, action, reward, next_state, alpha, gamma):
    """Apply one Bellman update to a single Q-table entry.

    The rule to implement is:

        Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))

    with:
        s, a        the state we were in and the action we took
        r           the reward we received
        s'          the state we ended up in
        max_a' ...  the best Q-value available from the new state

    Arguments:
        q_table (dict): Maps (state, action) -> Q-value (modified in place).
        state (tuple): The state before the action.
        action (str): The action taken.
        reward (float): The reward received.
        next_state (tuple): The state after the action.
        alpha (float): Learning rate (how much to update).
        gamma (float): Discount factor (how much to value future rewards).

    Returns:
        None. The table is modified in place.

    Hint: Q-values for unseen (state, action) pairs start at 0.
    """
    # Exercise: this body is intentionally left for the student to write.
    raise NotImplementedError("Fill in update_q")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Training loop
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def train(
    episodes=1000,
    alpha=0.1,
    gamma=0.95,
    epsilon=1.0,
    epsilon_decay=0.995,
    epsilon_min=0.05,
):
    """Train a tabular Q-learning agent on BabySnake.

    Arguments:
        episodes (int): How many episodes to run.
        alpha (float): Learning rate.
        gamma (float): Discount factor.
        epsilon (float): Starting exploration rate.
        epsilon_decay (float): Multiply epsilon by this after each episode.
        epsilon_min (float): Epsilon never falls below this.

    Returns:
        dict: The trained Q-table, mapping (state, action) -> Q-value.
    """
    q_table = {}
    env = BabySnakeEnv()

    for episode in range(episodes):
        total_reward = _run_episode(env, q_table, epsilon, alpha, gamma)

        # Anneal exploration once per episode, clamped at the floor.
        epsilon = max(epsilon_min, epsilon * epsilon_decay)

        # Progress is reported only on every 100th episode.
        if (episode + 1) % 100 != 0:
            continue
        print(
            f"Episode {episode + 1:5d} "
            f"reward={total_reward:6.1f} "
            f"score={env.game.state['score']} "
            f"epsilon={epsilon:.3f} "
            f"q_entries={len(q_table)}"
        )

    return q_table


def _run_episode(env, q_table, epsilon, alpha, gamma):
    """Play one episode to the end, learning after every step.

    Returns the sum of the per-step rewards collected in the episode.
    """
    state = env.reset()
    total_reward = 0.0
    while env.game.playing:
        action = choose_action(q_table, state, epsilon)
        next_state, reward, _done = env.step(action)
        update_q(q_table, state, action, reward, next_state, alpha, gamma)
        state = next_state
        total_reward += reward
    return total_reward
|
||||
|
||||
|
||||
def watch(q_table=None):
    """Watch the trained agent play in the terminal.

    The agent always acts greedily: on every input poll it reads the current
    state from the game, looks up the Q-value of each action (0.0 when a
    pair is missing from the table) and presses the first best action.

    Arguments:
        q_table (dict | None): A trained Q-table mapping
            (state, action) -> Q-value. If None, trains first.
    """
    # `babysnake` and `ProgrammaticInput` are already imported at module
    # level, so they are not re-imported here.
    if q_table is None:
        print("Training first...")
        q_table = train()

    _inp = ProgrammaticInput()
    # Bind the game before defining the input source whose closure reads it.
    game = babysnake.create_game()

    class PolicyInput:
        """An input source that picks actions from the Q-table."""

        def collect(self):
            s = game.state
            state = (int(s['agent_x']), int(s['agent_y']),
                     int(s['food_x']), int(s['food_y']))
            q_values = [q_table.get((state, a), 0.0) for a in ACTIONS]
            best = ACTIONS[q_values.index(max(q_values))]
            # Queue the chosen key, then hand back whatever the
            # programmatic source has collected.
            _inp.press(best)
            return _inp.collect()

    game.play(input_source=PolicyInput())
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: train with the default hyperparameters, report
    # the size of the learned table, then replay the policy in the terminal.
    print("Training Q-learning agent on BabySnake...")
    q_table = train()
    print(f"\nDone. Q-table has {len(q_table)} entries.")
    print("\nWatching trained agent (press Enter or Escape to quit)...")
    watch(q_table)
|
||||
113
q_learning_solution.py
Normal file
113
q_learning_solution.py
Normal file
@@ -0,0 +1,113 @@
|
||||
"""Solution for q_learning.py — remove before publishing to students."""
|
||||
|
||||
import random
|
||||
import babysnake
|
||||
from retro.input import ProgrammaticInput
|
||||
from retro.views.headless import HeadlessView
|
||||
|
||||
ACTIONS = ["KEY_RIGHT", "KEY_DOWN", "KEY_LEFT", "KEY_UP"]
|
||||
|
||||
|
||||
class BabySnakeEnv:
    """Step-at-a-time wrapper around a headless BabySnake game."""

    _STATE_KEYS = ('agent_x', 'agent_y', 'food_x', 'food_y')

    def reset(self):
        """Create and start a fresh game; return its initial state tuple."""
        source = ProgrammaticInput()
        game = babysnake.create_game()
        game.input_source = source
        game.view = HeadlessView()
        game.start()

        self._inp = source
        self.game = game
        # The game keeps a running reward total; remember the last total
        # seen so that step() can report per-step differences.
        self._prev_reward = 0.0
        return self._get_state()

    def step(self, action):
        """Press ``action``, advance one step, return (state, reward, done).

        ``reward`` is the change in the game's cumulative reward since the
        previous step; ``done`` is True once the game is no longer playing.
        """
        self._inp.press(action)
        self.game.step()
        observed = self._get_state()
        cumulative = self.game.state['reward']
        gained = cumulative - self._prev_reward
        self._prev_reward = cumulative
        return observed, gained, not self.game.playing

    def _get_state(self):
        """Return (agent_x, agent_y, food_x, food_y) as a tuple of ints."""
        snapshot = self.game.state
        return tuple(int(snapshot[key]) for key in self._STATE_KEYS)
|
||||
|
||||
|
||||
def choose_action(q_table, state, epsilon):
    """Return an epsilon-greedy action for ``state``.

    With probability ``epsilon`` a random entry of ACTIONS is returned.
    Otherwise the action with the highest Q-value wins; pairs missing from
    ``q_table`` count as 0.0 and ties go to the earliest entry of ACTIONS.
    """
    explore = random.random() < epsilon
    if explore:
        return random.choice(ACTIONS)

    def value_of(action):
        return q_table.get((state, action), 0.0)

    # max() keeps the first maximal element, so ties resolve in ACTIONS order.
    return max(ACTIONS, key=value_of)
|
||||
|
||||
|
||||
def update_q(q_table, state, action, reward, next_state, alpha, gamma):
    """Apply one Q-learning (Bellman) update to ``q_table`` in place.

    Q(s, a) moves a fraction ``alpha`` of the way toward
    ``reward + gamma * max_a' Q(next_state, a')``. Entries missing from the
    table are read as 0.0. Returns None.
    """
    key = (state, action)
    current = q_table.get(key, 0.0)
    best_future = max(q_table.get((next_state, a), 0.0) for a in ACTIONS)
    td_error = reward + gamma * best_future - current
    q_table[key] = current + alpha * td_error
|
||||
|
||||
|
||||
def train(
    episodes=1000,
    alpha=0.1,
    gamma=0.95,
    epsilon=1.0,
    epsilon_decay=0.995,
    epsilon_min=0.05,
):
    """Train a tabular Q-learning agent on BabySnake and return its Q-table.

    Runs ``episodes`` episodes. After each one, ``epsilon`` is multiplied by
    ``epsilon_decay`` and clamped to at least ``epsilon_min``. A progress
    line is printed on every 100th episode.
    """
    q_table = {}
    env = BabySnakeEnv()

    for episode in range(episodes):
        total_reward = _run_episode(env, q_table, epsilon, alpha, gamma)

        # Anneal exploration once per episode, clamped at the floor.
        epsilon = max(epsilon_min, epsilon * epsilon_decay)

        if (episode + 1) % 100 != 0:
            continue
        print(
            f"Episode {episode + 1:5d} "
            f"reward={total_reward:6.1f} "
            f"score={env.game.state['score']} "
            f"epsilon={epsilon:.3f} "
            f"q_entries={len(q_table)}"
        )

    return q_table


def _run_episode(env, q_table, epsilon, alpha, gamma):
    """Play one episode to the end, learning after every step.

    Returns the sum of the per-step rewards collected in the episode.
    """
    state = env.reset()
    total_reward = 0.0
    while env.game.playing:
        action = choose_action(q_table, state, epsilon)
        next_state, reward, _done = env.step(action)
        update_q(q_table, state, action, reward, next_state, alpha, gamma)
        state = next_state
        total_reward += reward
    return total_reward
|
||||
|
||||
|
||||
def watch(q_table=None):
    """Play BabySnake in the terminal, driven greedily by ``q_table``.

    When ``q_table`` is None a table is trained first with train()'s
    default settings.
    """
    if q_table is None:
        print("Training first...")
        q_table = train()

    _inp = ProgrammaticInput()

    def greedy_action(state):
        # Missing pairs count as 0.0; ties go to the earliest of ACTIONS.
        return max(ACTIONS, key=lambda a: q_table.get((state, a), 0.0))

    class PolicyInput:
        """Input source that presses the greedy action on every poll."""

        def collect(self):
            snapshot = game.state
            state = tuple(
                int(snapshot[key])
                for key in ('agent_x', 'agent_y', 'food_x', 'food_y')
            )
            _inp.press(greedy_action(state))
            return _inp.collect()

    # `game` is looked up by PolicyInput.collect at call time, after this
    # assignment has run.
    game = babysnake.create_game()
    game.play(input_source=PolicyInput())
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: train with the default hyperparameters, report
    # the size of the learned table, then replay the policy in the terminal.
    print("Training Q-learning agent on BabySnake...")
    q_table = train()
    print(f"\nDone. Q-table has {len(q_table)} entries.")
    print("\nWatching trained agent (press Enter or Escape to quit)...")
    watch(q_table)
|
||||
143
snake_training.md
Normal file
143
snake_training.md
Normal file
@@ -0,0 +1,143 @@
|
||||
# Snake Training: Conceptual Questions
|
||||
|
||||
Answer each question in the space provided. Use evidence from the training log
|
||||
and your observations of the agent at different checkpoints to support your answers.
|
||||
|
||||
---
|
||||
|
||||
## 1. Feature selection
|
||||
|
||||
In the first training attempt, the agent received the full 32×16 game board as
|
||||
its input (6 × 32 × 16 = 3,072 numbers). The agent could see every character on
|
||||
the board, yet it never learned to reliably find the apple after 45,000 episodes.
|
||||
|
||||
When we added `apple_dx` and `apple_dy` — two numbers that encode the direction
|
||||
from the snake's head to the apple — performance improved dramatically within
|
||||
hundreds of episodes.
|
||||
|
||||
**Why didn't the board encoding help the agent find the apple? What did the two
|
||||
new features provide that the board encoding could not?**
|
||||
|
||||
*Your answer:*
|
||||
|
||||
---
|
||||
|
||||
## 2. Dimensionality reduction
|
||||
|
||||
In the full-board experiment, the agent processed 3,072 input values. When we
|
||||
switched to the egocentric view (a 17×17 window centered on the snake's head),
|
||||
the board input shrank to 17 × 17 × 6 = 1,734 values.
|
||||
|
||||
**How many input values did the egocentric view save compared to the full board?
|
||||
What is one thing the agent gained from this change, and one thing it lost?**
|
||||
|
||||
*Your answer:*
|
||||
|
||||
---
|
||||
|
||||
## 3. Exploration vs. exploitation
|
||||
|
||||
With `epsilon_decay = 0.995`, epsilon falls from 1.0 to 0.05 by episode ~600.
|
||||
With `epsilon_decay = 0.9997` (used in the final run), epsilon is still 0.55 at
|
||||
episode 2,000.
|
||||
|
||||
**Sketch a rough curve of epsilon over time for each setting. Why does slower
|
||||
decay produce a better-trained agent, even though it means the agent takes more
|
||||
random actions overall?**
|
||||
|
||||
*Your answer:*
|
||||
|
||||
---
|
||||
|
||||
## 4. Runaway loss
|
||||
|
||||
In one intermediate experiment, the loss grew from around 35 to hundreds of
|
||||
thousands within a few hundred episodes:
|
||||
|
||||
```
|
||||
[ep_0300] avg_loss=48.7 avg_reward=+8.1
|
||||
[ep_0500] avg_loss=347 avg_reward=+12.4
|
||||
[ep_0700] avg_loss=4,102 avg_reward=+6.5
|
||||
[ep_1100] avg_loss=686,000 avg_reward=-3.1
|
||||
```
|
||||
|
||||
This happened because the learning algorithm was using MSE (mean squared error)
|
||||
loss, which is *quadratic* — an error of size 2 produces a loss of 4, an error
|
||||
of size 10 produces a loss of 100.
|
||||
|
||||
**Describe the feedback loop that caused the loss to spiral upward. Why does
|
||||
Huber loss (which is linear for large errors) break this cycle?**
|
||||
|
||||
*Your answer:*
|
||||
|
||||
---
|
||||
|
||||
## 5. Interpreting the training curve
|
||||
|
||||
Look at the snake training log. The reward climbs, then dips, then climbs again:
|
||||
|
||||
```
|
||||
[ep_1100] avg_reward=+34.5 avg_steps=57
|
||||
[ep_1800] avg_reward=+4.4 avg_steps=98
|
||||
[ep_3800] avg_reward=+51.2 avg_steps=33
|
||||
[ep_9000] avg_reward=+246.0 avg_steps=85
|
||||
```
|
||||
|
||||
Notice that around episode 3,800, avg_steps dropped sharply (from ~98 to 33)
|
||||
at the same time reward jumped. Then by episode 9,000, steps rose again while
|
||||
reward kept climbing.
|
||||
|
||||
**What do you think the agent was doing at each of these stages? Use the
|
||||
avg_steps and avg_reward numbers to support your interpretation.**
|
||||
|
||||
*Your answer:*
|
||||
|
||||
---
|
||||
|
||||
## 6. Policy observation
|
||||
|
||||
Run these commands to watch the agent at three checkpoints:
|
||||
|
||||
```
|
||||
retro-gamer play runs/snake --checkpoint ep_1100
|
||||
retro-gamer play runs/snake --checkpoint ep_5400
|
||||
retro-gamer play runs/snake --checkpoint ep_17100
|
||||
```
|
||||
|
||||
**Describe the agent's behavior at each checkpoint. What has the agent learned
|
||||
by episode 5,400 that it hadn't yet learned at episode 1,100? What does the
|
||||
episode 17,100 agent do that the earlier agents do not?**
|
||||
|
||||
*ep_1100:*
|
||||
|
||||
*ep_5400:*
|
||||
|
||||
*ep_17100:*
|
||||
|
||||
---
|
||||
|
||||
## 7. CNN vs. MLP
|
||||
|
||||
In the first attempt (full board, no explicit features), we used a CNN
|
||||
(`spatial = true`). In the final run (egocentric board + explicit features), we
|
||||
used an MLP (`spatial = false`).
|
||||
|
||||
**Why might an MLP be a reasonable choice when using the egocentric view, even
|
||||
though the input is still a 2D board? What does the CNN offer that the MLP does
|
||||
not, and why is that less important with an egocentric observation?**
|
||||
|
||||
*Your answer:*
|
||||
|
||||
---
|
||||
|
||||
## 8. Hyperparameter comparison
|
||||
|
||||
Suppose you ran two otherwise identical training experiments:
|
||||
- Run A: `learning_rate = 0.001`
|
||||
- Run B: `learning_rate = 0.0001`
|
||||
|
||||
**Based on what you learned from the runaway loss in Question 4, predict what
|
||||
would happen in each run. What does this tell you about the trade-off when
|
||||
choosing a learning rate?**
|
||||
|
||||
*Your answer:*
|
||||
102
training_log.md
Normal file
102
training_log.md
Normal file
@@ -0,0 +1,102 @@
|
||||
# Forager Training Log
|
||||
|
||||
Document each training attempt below. For each attempt, write your hypothesis
|
||||
before you run the experiment, then fill in the evidence and analysis after.
|
||||
|
||||
Use `retro-gamer info runs/forager/` to see a summary of your run, and
|
||||
`cat runs/forager/training.log` to see the full log.
|
||||
|
||||
---
|
||||
|
||||
## Attempt 1
|
||||
|
||||
### Hypothesis
|
||||
|
||||
*Before training, predict what will happen with the default configuration.
|
||||
Will the agent learn to find the food? How quickly? What might go wrong?*
|
||||
|
||||
Your prediction:
|
||||
|
||||
### Configuration
|
||||
|
||||
*Copy the relevant sections of `runs/forager/config.toml` here.*
|
||||
|
||||
```toml
|
||||
|
||||
```
|
||||
|
||||
### Evidence
|
||||
|
||||
*Paste the first and last few lines of your training log, and any interesting
|
||||
moments in between.*
|
||||
|
||||
```
|
||||
|
||||
```
|
||||
|
||||
### Analysis
|
||||
|
||||
*What happened? How do the numbers — avg_reward, avg_steps, epsilon, avg_loss —
|
||||
tell the story of what the agent learned? Did the result match your prediction?*
|
||||
|
||||
---
|
||||
|
||||
## Attempt 2
|
||||
|
||||
### Hypothesis
|
||||
|
||||
*Based on what you observed in Attempt 1, what will you change and why?
|
||||
Predict the outcome.*
|
||||
|
||||
### Configuration
|
||||
|
||||
```toml
|
||||
|
||||
```
|
||||
|
||||
### Evidence
|
||||
|
||||
```
|
||||
|
||||
```
|
||||
|
||||
### Analysis
|
||||
|
||||
---
|
||||
|
||||
## Attempt 3 (if needed)
|
||||
|
||||
### Hypothesis
|
||||
|
||||
### Configuration
|
||||
|
||||
```toml
|
||||
|
||||
```
|
||||
|
||||
### Evidence
|
||||
|
||||
```
|
||||
|
||||
```
|
||||
|
||||
### Analysis
|
||||
|
||||
---
|
||||
|
||||
## Final analysis
|
||||
|
||||
**Which attempt produced the best-trained agent? Run `retro-gamer play` on your
|
||||
best run's checkpoints and describe what the agent does.**
|
||||
|
||||
*Your answer:*
|
||||
|
||||
**Compare two of your attempts. What changed between them, and how did that
|
||||
change affect the training curve?**
|
||||
|
||||
*Your answer:*
|
||||
|
||||
**If you had more time, what would you try next to improve the agent further?
|
||||
Refer to specific hyperparameters or configuration options.**
|
||||
|
||||
*Your answer:*
|
||||
Reference in New Issue
Block a user