From e7f4a8a0a12e65328b3f03a5904dae574a69b7e0 Mon Sep 17 00:00:00 2001
From: Siddharth Bhatia <sidb60200@gmail.com>
Date: Tue, 15 Jun 2021 23:47:31 +0530
Subject: [PATCH 01/28] add StatsBase package

---
 Project.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Project.toml b/Project.toml
index ea30e6b..5e5a68c 100644
--- a/Project.toml
+++ b/Project.toml
@@ -12,6 +12,7 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 ReinforcementLearningBase = "e575027e-6cd6-5018-9292-cdc6200d2b44"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
+StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 
 [compat]
 Crayons = "4.0"

From fd659a730bd7ca5efa0508a2392cc23ee12717a0 Mon Sep 17 00:00:00 2001
From: Siddharth Bhatia <sidb60200@gmail.com>
Date: Tue, 15 Jun 2021 23:50:24 +0530
Subject: [PATCH 02/28] add SingleRoomUndirectedBatch

---
 src/envs/envs.jl                         |   1 +
 src/envs/single_room_undirected_batch.jl | 200 +++++++++++++++++++++++
 2 files changed, 201 insertions(+)
 create mode 100644 src/envs/single_room_undirected_batch.jl

diff --git a/src/envs/envs.jl b/src/envs/envs.jl
index 730d069..840955c 100644
--- a/src/envs/envs.jl
+++ b/src/envs/envs.jl
@@ -42,3 +42,4 @@ include("snake.jl")
 include("catcher.jl")
 include("transport.jl")
 include("collect_gems_undirected_multi_agent.jl")
+include("single_room_undirected_batch.jl")
diff --git a/src/envs/single_room_undirected_batch.jl b/src/envs/single_room_undirected_batch.jl
new file mode 100644
index 0000000..0e768d2
--- /dev/null
+++ b/src/envs/single_room_undirected_batch.jl
@@ -0,0 +1,200 @@
+module ModuleSingleRoomUndirectedBatch
+
+import Crayons
+import ..GridWorlds as GW
+import Random
+import ReinforcementLearningBase as RLBase
+import StatsBase as SB
+
+const MOVE_UP = 1
+const MOVE_DOWN = 2
+const MOVE_LEFT = 3
+const MOVE_RIGHT = 4
+
+const AGENT = 1
+const WALL = 2
+const GOAL = 3
+
+const DUMMY_CHARACTER = '⋅'
+const CHARACTERS = ('☻', '█', '♥')
+const FOREGROUND_COLORS = (:light_red, :white, :light_red)
+
+function move(action::Integer, i, j)
+    if action == MOVE_UP
+        return i - 1, j
+    elseif action == MOVE_DOWN
+        return i + 1, j
+    elseif action == MOVE_LEFT
+        return i, j - 1
+    elseif action == MOVE_RIGHT
+        return i, j + 1
+    end
+end
+
+struct SingleRoomUndirectedBatch{I, R, RNG} <: GW.AbstractGridWorld
+    tile_map::BitArray{4}
+    agent_position::Array{I, 2}
+    reward::Array{R, 1}
+    rng::Array{RNG, 1}
+    done::BitArray{1}
+    terminal_reward::R
+    goal_position::Array{I, 2}
+end
+
+function SingleRoomUndirectedBatch(; I = Int32, R = Float32, num_envs = 2, height = 8, width = 8, rng = [Random.MersenneTwister() for i in 1:num_envs])
+    tile_map = BitArray(undef, num_envs, 3, height, width)
+    agent_position = Array{I}(undef, num_envs, 2)
+    reward = Array{R}(undef, num_envs)
+    done = BitArray(undef, num_envs)
+    goal_position = Array{I}(undef, num_envs, 2)
+    terminal_reward = one(R)
+
+    inner_area = CartesianIndices((2 : height - 1, 2 : width - 1))
+
+    for env_id in 1:num_envs
+        tile_map[env_id, :, :, :] .= false
+        tile_map[env_id, WALL, 1, :] .= true
+        tile_map[env_id, WALL, height, :] .= true
+        tile_map[env_id, WALL, :, 1] .= true
+        tile_map[env_id, WALL, :, width] .= true
+
+        random_positions = SB.sample(rng[env_id], inner_area, 2, replace = false)
+
+        agent_position[env_id, 1] = random_positions[1][1]
+        agent_position[env_id, 2] = random_positions[1][2]
+        tile_map[env_id, AGENT, random_positions[1]] = true
+
+        goal_position[env_id, 1] = random_positions[2][1]
+        goal_position[env_id, 2] = random_positions[2][2]
+        tile_map[env_id, GOAL, random_positions[2]] = true
+
+        reward[env_id] = zero(R)
+        done[env_id] = false
+    end
+
+    env = SingleRoomUndirectedBatch(tile_map, agent_position, reward, rng, done, terminal_reward, goal_position)
+
+    RLBase.reset!(env)
+
+    return env
+end
+
+RLBase.state_space(env::SingleRoomUndirectedBatch, ::RLBase.InternalState, ::RLBase.DefaultPlayer) = nothing
+RLBase.state(env::SingleRoomUndirectedBatch, ::RLBase.InternalState, ::RLBase.DefaultPlayer) = copy(env.tile_map)
+
+RLBase.action_space(env::SingleRoomUndirectedBatch, player::RLBase.DefaultPlayer) = (MOVE_UP, MOVE_DOWN, MOVE_LEFT, MOVE_RIGHT)
+RLBase.reward(env::SingleRoomUndirectedBatch, ::RLBase.DefaultPlayer) = env.reward
+RLBase.is_terminated(env::SingleRoomUndirectedBatch) = env.done
+
+function RLBase.reset!(env::SingleRoomUndirectedBatch{I, R}) where {I, R}
+    tile_map = env.tile_map
+    agent_position = env.agent_position
+    goal_position = env.goal_position
+    reward = env.reward
+    done = env.done
+    rng = env.rng
+
+    num_envs = size(tile_map, 1)
+    inner_area = CartesianIndices((2 : size(tile_map, 3) - 1, 2 : size(tile_map, 4) - 1))
+
+    for env_id in 1:num_envs
+        tile_map[env_id, AGENT, agent_position[env_id, 1], agent_position[env_id, 2]] = false
+        tile_map[env_id, GOAL, goal_position[env_id, 1], goal_position[env_id, 2]] = false
+
+        random_positions = SB.sample(rng[env_id], inner_area, 2, replace = false)
+
+        agent_position[env_id, 1] = random_positions[1][1]
+        agent_position[env_id, 2] = random_positions[1][2]
+        tile_map[env_id, AGENT, random_positions[1]] = true
+
+        goal_position[env_id, 1] = random_positions[2][1]
+        goal_position[env_id, 2] = random_positions[2][2]
+        tile_map[env_id, GOAL, random_positions[2]] = true
+
+        reward[env_id] = zero(R)
+        done[env_id] = false
+    end
+
+    return nothing
+end
+
+function (env::SingleRoomUndirectedBatch{I, R})(action::Vector) where {I, R}
+    tile_map = env.tile_map
+    agent_position = env.agent_position
+    goal_position = env.goal_position
+    reward = env.reward
+    done = env.done
+    rng = env.rng
+    terminal_reward = env.terminal_reward
+
+    num_envs = size(tile_map, 1)
+
+    for env_id in 1:num_envs
+        current_position_i = agent_position[env_id, 1]
+        current_position_j = agent_position[env_id, 2]
+        next_position_i, next_position_j = move(action[env_id], current_position_i, current_position_j)
+
+        if !tile_map[env_id, WALL, next_position_i, next_position_j]
+            tile_map[env_id, AGENT, current_position_i, current_position_j] = false
+            agent_position[env_id, 1] = next_position_i
+            agent_position[env_id, 2] = next_position_j
+            tile_map[env_id, AGENT, next_position_i, next_position_j] = true
+        end
+
+        new_current_position_i = agent_position[env_id, 1]
+        new_current_position_j = agent_position[env_id, 2]
+
+        if tile_map[env_id, GOAL, new_current_position_i, new_current_position_j]
+            done[env_id] = true
+            reward[env_id] = terminal_reward
+        else
+            done[env_id] = false
+            reward[env_id] = zero(R)
+        end
+    end
+
+    return nothing
+end
+
+function Base.show(io::IO, ::MIME"text/plain", env::SingleRoomUndirectedBatch)
+    tile_map = env.tile_map
+    reward = env.reward
+    done = env.done
+
+    num_envs, num_objects, height, width = size(tile_map)
+
+    print(io, "objects = ")
+    for i in 1 : length(CHARACTERS)
+        print(io, Crayons.Crayon(foreground = FOREGROUND_COLORS[i]), CHARACTERS[i], Crayons.Crayon(reset = true))
+        if i < length(CHARACTERS)
+            print(io, ", ")
+        else
+            print(io, "\n")
+        end
+    end
+    println(io, "dummy character = ", DUMMY_CHARACTER)
+
+    for env_id in 1:num_envs
+        println(io)
+        println(io, "env_id = ", env_id)
+        for i in 1:height
+            for j in 1:width
+                idx = findfirst(@view tile_map[env_id, :, i, j])
+                if isnothing(idx)
+                    print(io, DUMMY_CHARACTER)
+                else
+                    print(io, Crayons.Crayon(foreground = FOREGROUND_COLORS[idx]), CHARACTERS[idx], Crayons.Crayon(reset = true))
+                end
+            end
+
+            println(io)
+        end
+
+        println(io, "reward = ", reward[env_id])
+        println(io, "done = ", done[env_id])
+    end
+
+    return nothing
+end
+
+end # module

From 0b8c1cba4e13d2dacb79fa0c30301b405c73d42a Mon Sep 17 00:00:00 2001
From: Siddharth Bhatia <sidb60200@gmail.com>
Date: Wed, 16 Jun 2021 15:26:37 +0530
Subject: [PATCH 03/28] add playability to SingleRoomUndirectedBatch

---
 src/GridWorlds.jl                        |  1 +
 src/envs/single_room_undirected_batch.jl | 75 ++++++++++++++++++++++++
 src/play.jl                              | 45 ++++++++++++++
 3 files changed, 121 insertions(+)
 create mode 100644 src/play.jl

diff --git a/src/GridWorlds.jl b/src/GridWorlds.jl
index d77d239..7cb077b 100644
--- a/src/GridWorlds.jl
+++ b/src/GridWorlds.jl
@@ -19,6 +19,7 @@ include("actions.jl")
 include("objects.jl")
 include("grid_world_base.jl")
 include("abstract_grid_world.jl")
+include("play.jl")
 include("envs/envs.jl")
 include("textual_rendering.jl")
 
diff --git a/src/envs/single_room_undirected_batch.jl b/src/envs/single_room_undirected_batch.jl
index 0e768d2..e0b5744 100644
--- a/src/envs/single_room_undirected_batch.jl
+++ b/src/envs/single_room_undirected_batch.jl
@@ -2,7 +2,9 @@ module ModuleSingleRoomUndirectedBatch
 
 import Crayons
 import ..GridWorlds as GW
+import ..Play
 import Random
+import REPL
 import ReinforcementLearningBase as RLBase
 import StatsBase as SB
 
@@ -197,4 +199,77 @@ function Base.show(io::IO, ::MIME"text/plain", env::SingleRoomUndirectedBatch)
     return nothing
 end
 
+get_string_key_bindings(env::GW.AbstractGridWorld) = """Key bindings:
+                                                     'q': quit
+                                                     'r': RLBase.reset!(env)
+                                                     'w': MOVE_UP
+                                                     's': MOVE_DOWN
+                                                     'a': MOVE_LEFT
+                                                     'd': MOVE_RIGHT
+                                                     """
+
+function play!(terminal::REPL.Terminals.UnixTerminal, env::SingleRoomUndirectedBatch; file_name::Union{Nothing, AbstractString} = nothing)
+    REPL.Terminals.raw!(terminal, true)
+
+    terminal_out = terminal.out_stream
+    terminal_in = terminal.in_stream
+    file = Play.open_maybe(file_name)
+
+    Play.write_io1_maybe_io2(terminal_out, file, Play.CLEAR_SCREEN)
+    Play.write_io1_maybe_io2(terminal_out, file, Play.MOVE_CURSOR_TO_ORIGIN)
+    Play.write_io1_maybe_io2(terminal_out, file, Play.HIDE_CURSOR)
+
+    num_envs = size(env.tile_map, 1)
+    chars = Array{Char}(undef, num_envs)
+
+    action_chars = ('w', 's', 'a', 'd')
+
+    char_to_action = Dict('w' => MOVE_UP,
+                          's' => MOVE_DOWN,
+                          'a' => MOVE_LEFT,
+                          'd' => MOVE_RIGHT,
+                         )
+
+    action = Array{Int}(undef, num_envs)
+
+    try
+        while true
+            Play.write_io1_maybe_io2(terminal_out, file, get_string_key_bindings(env))
+            Play.show_io1_maybe_io2(terminal_out, file, MIME("text/plain"), env)
+
+            for i in 1:num_envs
+                chars[i] = read(terminal_in, Char)
+            end
+
+            Play.write_io1_maybe_io2(terminal_out, file, Play.EMPTY_SCREEN)
+
+            if 'q' in chars
+                Play.write_io1_maybe_io2(terminal_out, file, Play.SHOW_CURSOR)
+                Play.close_maybe(file)
+                REPL.Terminals.raw!(terminal, false)
+                return nothing
+            elseif 'r' in chars
+                RLBase.reset!(env)
+            elseif all(char -> char in action_chars, chars)
+                for i in 1:num_envs
+                    action[i] = char_to_action[chars[i]]
+                end
+                env(action)
+            else
+                @warn "No procedure exists for this character sequence: $chars"
+            end
+
+            Play.write_io1_maybe_io2(terminal_out, file, "Last character sequence = $(chars)\n")
+        end
+    finally
+        Play.write_io1_maybe_io2(terminal_out, file, Play.SHOW_CURSOR)
+        Play.close_maybe(file)
+        REPL.Terminals.raw!(terminal, false)
+    end
+
+    return nothing
+end
+
+play!(env::SingleRoomUndirectedBatch; file_name = nothing) = play!(REPL.TerminalMenus.terminal, env, file_name = file_name)
+
 end # module
diff --git a/src/play.jl b/src/play.jl
new file mode 100644
index 0000000..03525e4
--- /dev/null
+++ b/src/play.jl
@@ -0,0 +1,45 @@
+module Play
+
+import REPL
+
+const ESC = Char(0x1B)
+const HIDE_CURSOR = ESC * "[?25l"
+const SHOW_CURSOR = ESC * "[?25h"
+const CLEAR_SCREEN = ESC * "[2J"
+const MOVE_CURSOR_TO_ORIGIN = ESC * "[H"
+const CLEAR_SCREEN_BEFORE_CURSOR = ESC * "[1J"
+const EMPTY_SCREEN = CLEAR_SCREEN_BEFORE_CURSOR * MOVE_CURSOR_TO_ORIGIN
+
+open_maybe(file_name::AbstractString) = open(file_name, "w")
+open_maybe(::Nothing) = nothing
+
+close_maybe(io::IO) = close(io)
+close_maybe(io::Nothing) = nothing
+
+write_maybe(io::IO, content) = write(io, content)
+write_maybe(io::Nothing, content) = 0
+write_io1_maybe_io2(io1::IO, io2::Union{Nothing, IO}, content) = write(io1, content) + write_maybe(io2, content)
+
+show_maybe(io::IO, mime::MIME, content) = show(io, mime, content)
+show_maybe(io::Nothing, mime::MIME, content) = nothing
+function show_io1_maybe_io2(io1::IO, io2::Union{Nothing, IO}, mime::MIME, content)
+    show(io1, mime, content)
+    show_maybe(io2, mime, content)
+end
+
+function replay(terminal::REPL.Terminals.UnixTerminal, file_name::AbstractString, frame_rate)
+    terminal_out = terminal.out_stream
+    delimiter = get_string_empty_screen()
+    frames = split(read(file_name, String), delimiter)
+    for frame in frames
+        write(terminal_out, frame)
+        sleep(1 / frame_rate)
+        write(terminal_out, delimiter)
+    end
+
+    return nothing
+end
+
+replay(file_name; frame_rate = 2) = replay(REPL.TerminalMenus.terminal, file_name, frame_rate)
+
+end # module

From 1bb2641de889682927be76ed9cb6db61a5bfa010 Mon Sep 17 00:00:00 2001
From: Siddharth Bhatia <sidb60200@gmail.com>
Date: Wed, 16 Jun 2021 15:33:37 +0530
Subject: [PATCH 04/28] add keyword force to RLBase.reset! method

---
 src/envs/single_room_undirected_batch.jl | 28 +++++++++++++-----------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/src/envs/single_room_undirected_batch.jl b/src/envs/single_room_undirected_batch.jl
index e0b5744..757a4b6 100644
--- a/src/envs/single_room_undirected_batch.jl
+++ b/src/envs/single_room_undirected_batch.jl
@@ -76,7 +76,7 @@ function SingleRoomUndirectedBatch(; I = Int32, R = Float32, num_envs = 2, heigh
 
     env = SingleRoomUndirectedBatch(tile_map, agent_position, reward, rng, done, terminal_reward, goal_position)
 
-    RLBase.reset!(env)
+    RLBase.reset!(env, force = true)
 
     return env
 end
@@ -88,7 +88,7 @@ RLBase.action_space(env::SingleRoomUndirectedBatch, player::RLBase.DefaultPlayer
 RLBase.reward(env::SingleRoomUndirectedBatch, ::RLBase.DefaultPlayer) = env.reward
 RLBase.is_terminated(env::SingleRoomUndirectedBatch) = env.done
 
-function RLBase.reset!(env::SingleRoomUndirectedBatch{I, R}) where {I, R}
+function RLBase.reset!(env::SingleRoomUndirectedBatch{I, R}; force = false) where {I, R}
     tile_map = env.tile_map
     agent_position = env.agent_position
     goal_position = env.goal_position
@@ -100,21 +100,23 @@ function RLBase.reset!(env::SingleRoomUndirectedBatch{I, R}) where {I, R}
     inner_area = CartesianIndices((2 : size(tile_map, 3) - 1, 2 : size(tile_map, 4) - 1))
 
     for env_id in 1:num_envs
-        tile_map[env_id, AGENT, agent_position[env_id, 1], agent_position[env_id, 2]] = false
-        tile_map[env_id, GOAL, goal_position[env_id, 1], goal_position[env_id, 2]] = false
+        if force || done[env_id]
+            tile_map[env_id, AGENT, agent_position[env_id, 1], agent_position[env_id, 2]] = false
+            tile_map[env_id, GOAL, goal_position[env_id, 1], goal_position[env_id, 2]] = false
 
-        random_positions = SB.sample(rng[env_id], inner_area, 2, replace = false)
+            random_positions = SB.sample(rng[env_id], inner_area, 2, replace = false)
 
-        agent_position[env_id, 1] = random_positions[1][1]
-        agent_position[env_id, 2] = random_positions[1][2]
-        tile_map[env_id, AGENT, random_positions[1]] = true
+            agent_position[env_id, 1] = random_positions[1][1]
+            agent_position[env_id, 2] = random_positions[1][2]
+            tile_map[env_id, AGENT, random_positions[1]] = true
 
-        goal_position[env_id, 1] = random_positions[2][1]
-        goal_position[env_id, 2] = random_positions[2][2]
-        tile_map[env_id, GOAL, random_positions[2]] = true
+            goal_position[env_id, 1] = random_positions[2][1]
+            goal_position[env_id, 2] = random_positions[2][2]
+            tile_map[env_id, GOAL, random_positions[2]] = true
 
-        reward[env_id] = zero(R)
-        done[env_id] = false
+            reward[env_id] = zero(R)
+            done[env_id] = false
+        end
     end
 
     return nothing

From 0d115af2b6e2e1535e65caea3e34b851f828ad94 Mon Sep 17 00:00:00 2001
From: Siddharth Bhatia <sidb60200@gmail.com>
Date: Wed, 16 Jun 2021 15:54:42 +0530
Subject: [PATCH 05/28] fix replay method

---
 src/play.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/play.jl b/src/play.jl
index 03525e4..5096f3f 100644
--- a/src/play.jl
+++ b/src/play.jl
@@ -29,7 +29,7 @@ end
 
 function replay(terminal::REPL.Terminals.UnixTerminal, file_name::AbstractString, frame_rate)
     terminal_out = terminal.out_stream
-    delimiter = get_string_empty_screen()
+    delimiter = EMPTY_SCREEN
     frames = split(read(file_name, String), delimiter)
     for frame in frames
         write(terminal_out, frame)

From d39fae711f0c14bbed04b3887d276b3a8cdf1242 Mon Sep 17 00:00:00 2001
From: Siddharth Bhatia <sidb60200@gmail.com>
Date: Wed, 16 Jun 2021 15:54:53 +0530
Subject: [PATCH 06/28] write characters to terminal out while playing

---
 src/envs/single_room_undirected_batch.jl | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/envs/single_room_undirected_batch.jl b/src/envs/single_room_undirected_batch.jl
index 757a4b6..f297d60 100644
--- a/src/envs/single_room_undirected_batch.jl
+++ b/src/envs/single_room_undirected_batch.jl
@@ -240,7 +240,9 @@ function play!(terminal::REPL.Terminals.UnixTerminal, env::SingleRoomUndirectedB
             Play.show_io1_maybe_io2(terminal_out, file, MIME("text/plain"), env)
 
             for i in 1:num_envs
-                chars[i] = read(terminal_in, Char)
+                c = read(terminal_in, Char)
+                chars[i] = c
+                write(terminal_out, c)
             end
 
             Play.write_io1_maybe_io2(terminal_out, file, Play.EMPTY_SCREEN)

From 20d103f99139147aca88d662c8b4f96cc7c6867c Mon Sep 17 00:00:00 2001
From: Siddharth Bhatia <sidb60200@gmail.com>
Date: Wed, 16 Jun 2021 15:55:40 +0530
Subject: [PATCH 07/28] ignore scratchpad.jl

---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index 622cef2..aea776a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,3 +15,5 @@ Manifest.toml
 # vim temporary files
 *~
 *.swp
+
+/src/scratchpad.jl

From 4c16b209464f6f418661f7bd0b22c64843d236fc Mon Sep 17 00:00:00 2001
From: Siddharth Bhatia <sidb60200@gmail.com>
Date: Wed, 16 Jun 2021 23:31:35 +0530
Subject: [PATCH 08/28] add tests for SingleRoomUndirectedBatch

---
 test/runtests.jl | 109 +++++++++++++++++++++++++++++++++--------------
 1 file changed, 76 insertions(+), 33 deletions(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index b6d8879..bfc7f1c 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -30,6 +30,8 @@ ENVS = [GW.EmptyRoomDirected,
         GW.CollectGemsUndirectedMultiAgent,
        ]
 
+BATCH_ENVS = [GW.ModuleSingleRoomUndirectedBatch.SingleRoomUndirectedBatch]
+
 const MAX_STEPS = 3000
 const NUM_RESETS = 3
 
@@ -60,45 +62,86 @@ get_terminal_returns(env::GW.Catcher) = env.terminal_reward:env.ball_reward:MAX_
 get_terminal_returns(env::GW.TransportDirected) = (GW.get_terminal_reward(env),)
 get_terminal_returns(env::GW.TransportUndirected) = (GW.get_terminal_reward(env),)
 
+get_terminal_returns(env::GW.ModuleSingleRoomUndirectedBatch.SingleRoomUndirectedBatch) = (env.terminal_reward,)
+
 Test.@testset "GridWorlds.jl" begin
-    for Env in ENVS
-        Test.@testset "$(Env)" begin
-            T = Float32
-            env = Env(T = T)
-            for _ in 1:NUM_RESETS
-                RLBase.reset!(env)
-                Test.@test RLBase.reward(env) == zero(T)
-                Test.@test RLBase.is_terminated(env) == false
-
-                total_reward = zero(T)
-                for i in 1:MAX_STEPS
-                    action = rand(RLBase.action_space(env))
-                    env(action)
-                    total_reward += RLBase.reward(env)
-
-
-                    if Env == GW.CollectGemsUndirectedMultiAgent
-                        for i in 1:GW.get_num_agents(env)
-                            agent_pos = env.agent_pos[i]
-                            Test.@test 1 ≤ agent_pos[1] ≤ GW.get_height(env)
-                            Test.@test 1 ≤ agent_pos[2] ≤ GW.get_width(env)
-                        end
-                    else
-                        Test.@test 1 ≤ GW.get_agent_pos(env)[1] ≤ GW.get_height(env)
-                        Test.@test 1 ≤ GW.get_agent_pos(env)[2] ≤ GW.get_width(env)
-                    end
+    Test.@testset "Single Environments" begin
+        for Env in ENVS
+            Test.@testset "$(Env)" begin
+                T = Float32
+                env = Env(T = T)
+                for _ in 1:NUM_RESETS
+                    RLBase.reset!(env)
+                    Test.@test RLBase.reward(env) == zero(T)
+                    Test.@test RLBase.is_terminated(env) == false
+
+                    total_reward = zero(T)
+                    for i in 1:MAX_STEPS
+                        action = rand(RLBase.action_space(env))
+                        env(action)
+                        total_reward += RLBase.reward(env)
 
-                    if RLBase.is_terminated(env)
-                        if Env == GW.Snake
-                            Test.@test (total_reward in get_terminal_returns_win(env) || total_reward in get_terminal_returns_lose(env))
+
+                        if Env == GW.CollectGemsUndirectedMultiAgent
+                            for i in 1:GW.get_num_agents(env)
+                                agent_pos = env.agent_pos[i]
+                                Test.@test 1 ≤ agent_pos[1] ≤ GW.get_height(env)
+                                Test.@test 1 ≤ agent_pos[2] ≤ GW.get_width(env)
+                            end
                         else
-                            Test.@test total_reward in get_terminal_returns(env)
+                            Test.@test 1 ≤ GW.get_agent_pos(env)[1] ≤ GW.get_height(env)
+                            Test.@test 1 ≤ GW.get_agent_pos(env)[2] ≤ GW.get_width(env)
+                        end
+
+                        if RLBase.is_terminated(env)
+                            if Env == GW.Snake
+                                Test.@test (total_reward in get_terminal_returns_win(env) || total_reward in get_terminal_returns_lose(env))
+                            else
+                                Test.@test total_reward in get_terminal_returns(env)
+                            end
+                            break
+                        end
+
+                        if i == MAX_STEPS
+                            @info "$Env not terminated after MAX_STEPS = $MAX_STEPS"
                         end
-                        break
                     end
+                end
+            end
+        end
+    end
+
+    Test.@testset "Batch Environments" begin
+        for Env in BATCH_ENVS
+            Test.@testset "$(Env)" begin
+                num_envs = 1
+                R = Float32
+                I = Int32
+                env = Env(I = I, R = R, num_envs = num_envs)
+                height = size(env.tile_map, 3)
+                width = size(env.tile_map, 4)
+                for _ in 1:NUM_RESETS
+                    RLBase.reset!(env)
+                    Test.@test RLBase.reward(env) == zeros(R, num_envs)
+                    Test.@test RLBase.is_terminated(env) == falses(num_envs)
 
-                    if i == MAX_STEPS
-                        @info "$Env not terminated after MAX_STEPS = $MAX_STEPS"
+                    total_reward = zeros(R, num_envs)
+                    for i in 1:MAX_STEPS
+                        action = [rand(RLBase.action_space(env)) for _ in 1:num_envs]
+                        env(action)
+                        total_reward .+= RLBase.reward(env)
+
+                        Test.@test 1 ≤ env.agent_position[1, 1] ≤ height
+                        Test.@test 1 ≤ env.agent_position[1, 2] ≤ width
+
+                        if RLBase.is_terminated(env)[1]
+                            Test.@test total_reward[1] in get_terminal_returns(env)
+                            break
+                        end
+
+                        if i == MAX_STEPS
+                            @info "$Env not terminated after MAX_STEPS = $MAX_STEPS"
+                        end
                     end
                 end
             end

From 342deb070bef9fbbdf25fea18209c449b24b82c0 Mon Sep 17 00:00:00 2001
From: Siddharth Bhatia <sidb60200@gmail.com>
Date: Thu, 17 Jun 2021 23:36:12 +0530
Subject: [PATCH 09/28] set state style to internal state, copy reward & done

---
 src/envs/single_room_undirected_batch.jl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/envs/single_room_undirected_batch.jl b/src/envs/single_room_undirected_batch.jl
index f297d60..cefef9a 100644
--- a/src/envs/single_room_undirected_batch.jl
+++ b/src/envs/single_room_undirected_batch.jl
@@ -81,12 +81,13 @@ function SingleRoomUndirectedBatch(; I = Int32, R = Float32, num_envs = 2, heigh
     return env
 end
 
+RLBase.StateStyle(env::SingleRoomUndirectedBatch) = RLBase.InternalState{Any}()
 RLBase.state_space(env::SingleRoomUndirectedBatch, ::RLBase.InternalState, ::RLBase.DefaultPlayer) = nothing
 RLBase.state(env::SingleRoomUndirectedBatch, ::RLBase.InternalState, ::RLBase.DefaultPlayer) = copy(env.tile_map)
 
 RLBase.action_space(env::SingleRoomUndirectedBatch, player::RLBase.DefaultPlayer) = (MOVE_UP, MOVE_DOWN, MOVE_LEFT, MOVE_RIGHT)
-RLBase.reward(env::SingleRoomUndirectedBatch, ::RLBase.DefaultPlayer) = env.reward
-RLBase.is_terminated(env::SingleRoomUndirectedBatch) = env.done
+RLBase.reward(env::SingleRoomUndirectedBatch, ::RLBase.DefaultPlayer) = copy(env.reward)
+RLBase.is_terminated(env::SingleRoomUndirectedBatch) = copy(env.done)
 
 function RLBase.reset!(env::SingleRoomUndirectedBatch{I, R}; force = false) where {I, R}
     tile_map = env.tile_map

From 4926c997388ff87c53f7b6ced95c8686f3c7f3ae Mon Sep 17 00:00:00 2001
From: Siddharth Bhatia <sidb60200@gmail.com>
Date: Thu, 17 Jun 2021 23:37:30 +0530
Subject: [PATCH 10/28] add benchmark_multi_threaded.jl

---
 benchmark/benchmark_multi_threaded.jl | 143 ++++++++++++++++++++++++++
 1 file changed, 143 insertions(+)
 create mode 100644 benchmark/benchmark_multi_threaded.jl

diff --git a/benchmark/benchmark_multi_threaded.jl b/benchmark/benchmark_multi_threaded.jl
new file mode 100644
index 0000000..6698210
--- /dev/null
+++ b/benchmark/benchmark_multi_threaded.jl
@@ -0,0 +1,143 @@
+import GridWorlds as GW
+import ReinforcementLearningBase as RLBase
+import BenchmarkTools as BT
+import Dates
+
+const STEPS_PER_RESET = 100
+const NUM_RESETS = 100
+const NUM_ENVS = 64
+
+const information = Dict()
+
+ENVS = [GW.ModuleSingleRoomUndirectedBatch.SingleRoomUndirectedBatch]
+
+function run_random_policy!(env, num_resets, steps_per_reset)
+    num_envs = size(env.tile_map, 1)
+    action = Array{eltype(RLBase.action_space(env))}(undef, num_envs)
+    for _ in 1:num_resets
+        RLBase.reset!(env, force = true)
+        for _ in 1:steps_per_reset
+            state = RLBase.state(env)
+            for i in 1:num_envs
+                action[i] = rand(RLBase.action_space(env))
+            end
+            env(action)
+            is_terminated = RLBase.is_terminated(env)
+            reward = RLBase.reward(env)
+        end
+    end
+
+    return nothing
+end
+
+function format_benchmark(str::String)
+    l = split(str, "\n")
+    deleteat!(l, (1, 4, 9))
+    return strip.(l)
+end
+
+function write_benchmarks(information, file)
+    io = open(file, "w")
+
+    write(io, "Date: " * Dates.format(Dates.now(), "yyyy_mm_dd_HH_MM_SS") * "\n")
+    write(io, "# List of Environments\n")
+
+    for Env in ENVS
+        name = Env.body.body.body.name.name
+        write(io, "  1. [$(String(name))](#$(lowercase(String(name))))\n")
+    end
+
+    write(io, "\n")
+    write(io, "# Benchmarks\n\n")
+
+    for Env in ENVS
+        name = Env.body.body.body.name.name
+        env_benchmark = information[name]
+
+        write(io, "# $(String(name))\n\n")
+
+        write(io, "#### Run uniformly random policy, NUM_RESETS = $(NUM_RESETS), STEPS_PER_RESET = $(STEPS_PER_RESET), TOTAL_STEPS = $(NUM_RESETS * STEPS_PER_RESET)\n\n")
+        for line in format_benchmark(repr("text/plain", env_benchmark[:run_random_policy]))
+            write(io, line * "\n\n")
+        end
+
+        write(io, "#### $(String(Symbol(Env)))()\n\n")
+        for line in format_benchmark(repr("text/plain", env_benchmark[:instantiation]))
+            write(io, line * "\n\n")
+        end
+
+        write(io, "#### RLBase.reset!(env)\n\n")
+        for line in format_benchmark(repr("text/plain", env_benchmark[:reset!]))
+            write(io, line * "\n\n")
+        end
+
+        write(io, "#### RLBase.state(env)\n\n")
+        for line in format_benchmark(repr("text/plain", env_benchmark[:state]))
+            write(io, line * "\n\n")
+        end
+
+        write(io, "#### RLBase.action_space(env)\n\n")
+        for line in format_benchmark(repr("text/plain", env_benchmark[:action_space]))
+            write(io, line * "\n\n")
+        end
+
+        write(io, "#### RLBase.is_terminated(env)\n\n")
+        for line in format_benchmark(repr("text/plain", env_benchmark[:is_terminated]))
+            write(io, line * "\n\n")
+        end
+
+        write(io, "#### RLBase.reward(env)\n\n")
+        for line in format_benchmark(repr("text/plain", env_benchmark[:reward]))
+            write(io, line * "\n\n")
+        end
+
+        for action in keys(env_benchmark[:action_info])
+            write(io, "#### env($action)\n\n")
+            for line in format_benchmark(repr("text/plain", env_benchmark[:action_info][action]))
+                write(io, line * "\n\n")
+            end
+        end
+
+    end
+
+    close(io)
+end
+
+# compile everything once
+for Env in ENVS
+    env = Env(num_envs = NUM_ENVS)
+    run_random_policy!(env, NUM_RESETS, STEPS_PER_RESET)
+end
+
+@info "First run (for compilation) is complete"
+
+for Env in ENVS
+
+    env = Env(num_envs = NUM_ENVS)
+
+    env_benchmark = Dict()
+
+    env_benchmark[:run_random_policy] = BT.@benchmark run_random_policy!($(Ref(env))[], $(Ref(NUM_RESETS))[], $(Ref(STEPS_PER_RESET))[])
+
+    env_benchmark[:instantiation] = BT.@benchmark $(Ref(Env))[](num_envs = $(NUM_ENVS)[])
+
+    env_benchmark[:reset!] = BT.@benchmark RLBase.reset!($(Ref(env))[], force = true)
+    env_benchmark[:state] = BT.@benchmark RLBase.state($(Ref(env))[])
+    env_benchmark[:action_space] = BT.@benchmark RLBase.action_space($(Ref(env))[])
+    env_benchmark[:is_terminated] = BT.@benchmark RLBase.is_terminated($(Ref(env))[])
+    env_benchmark[:reward] = BT.@benchmark RLBase.reward($(Ref(env))[])
+
+    action_info = Dict()
+    for action in RLBase.action_space(env)
+        actions = fill(action, NUM_ENVS)
+        action_info[Symbol(action)] = BT.@benchmark $(Ref(env))[]($(Ref(actions))[])
+    end
+    env_benchmark[:action_info] = action_info
+
+    name = Env.body.body.body.name.name
+    information[name] = env_benchmark
+
+    @info "$(name) benchmark complete"
+end
+
+write_benchmarks(information, "benchmark_multi_threaded.md")

From 7824551941d08eebd3a9187823d4c55a4f027ee9 Mon Sep 17 00:00:00 2001
From: Siddharth Bhatia <sidb60200@gmail.com>
Date: Thu, 17 Jun 2021 23:45:04 +0530
Subject: [PATCH 11/28] print NUM_ENVS

---
 benchmark/benchmark_multi_threaded.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmark/benchmark_multi_threaded.jl b/benchmark/benchmark_multi_threaded.jl
index 6698210..7c5c28d 100644
--- a/benchmark/benchmark_multi_threaded.jl
+++ b/benchmark/benchmark_multi_threaded.jl
@@ -56,7 +56,7 @@ function write_benchmarks(information, file)
 
         write(io, "# $(String(name))\n\n")
 
-        write(io, "#### Run uniformly random policy, NUM_RESETS = $(NUM_RESETS), STEPS_PER_RESET = $(STEPS_PER_RESET), TOTAL_STEPS = $(NUM_RESETS * STEPS_PER_RESET)\n\n")
+        write(io, "#### Run uniformly random policy, NUM_ENVS = $(NUM_ENVS), NUM_RESETS = $(NUM_RESETS), STEPS_PER_RESET = $(STEPS_PER_RESET), TOTAL_STEPS = $(NUM_RESETS * STEPS_PER_RESET)\n\n")
         for line in format_benchmark(repr("text/plain", env_benchmark[:run_random_policy]))
             write(io, line * "\n\n")
         end

From d711bccf95b42fcab7423c8b882a259dc955b850 Mon Sep 17 00:00:00 2001
From: Siddharth Bhatia <sidb60200@gmail.com>
Date: Mon, 21 Jun 2021 17:56:36 +0530
Subject: [PATCH 12/28] move num_envs to the last dimension

---
 src/envs/single_room_undirected_batch.jl | 76 ++++++++++++------------
 1 file changed, 38 insertions(+), 38 deletions(-)

diff --git a/src/envs/single_room_undirected_batch.jl b/src/envs/single_room_undirected_batch.jl
index cefef9a..a3676b8 100644
--- a/src/envs/single_room_undirected_batch.jl
+++ b/src/envs/single_room_undirected_batch.jl
@@ -44,31 +44,31 @@ struct SingleRoomUndirectedBatch{I, R, RNG} <: GW.AbstractGridWorld
 end
 
 function SingleRoomUndirectedBatch(; I = Int32, R = Float32, num_envs = 2, height = 8, width = 8, rng = [Random.MersenneTwister() for i in 1:num_envs])
-    tile_map = BitArray(undef, num_envs, 3, height, width)
-    agent_position = Array{I}(undef, num_envs, 2)
+    tile_map = BitArray(undef, 3, height, width, num_envs)
+    agent_position = Array{I}(undef, 2, num_envs)
     reward = Array{R}(undef, num_envs)
     done = BitArray(undef, num_envs)
-    goal_position = Array{I}(undef, num_envs, 2)
+    goal_position = Array{I}(undef, 2, num_envs)
     terminal_reward = one(R)
 
     inner_area = CartesianIndices((2 : height - 1, 2 : width - 1))
 
     for env_id in 1:num_envs
-        tile_map[env_id, :, :, :] .= false
-        tile_map[env_id, WALL, 1, :] .= true
-        tile_map[env_id, WALL, height, :] .= true
-        tile_map[env_id, WALL, :, 1] .= true
-        tile_map[env_id, WALL, :, width] .= true
+        tile_map[:, :, :, env_id] .= false
+        tile_map[WALL, 1, :, env_id] .= true
+        tile_map[WALL, height, :, env_id] .= true
+        tile_map[WALL, :, 1, env_id] .= true
+        tile_map[WALL, :, width, env_id] .= true
 
         random_positions = SB.sample(rng[env_id], inner_area, 2, replace = false)
 
-        agent_position[env_id, 1] = random_positions[1][1]
-        agent_position[env_id, 2] = random_positions[1][2]
-        tile_map[env_id, AGENT, random_positions[1]] = true
+        agent_position[1, env_id] = random_positions[1][1]
+        agent_position[2, env_id] = random_positions[1][2]
+        tile_map[AGENT, random_positions[1], env_id] = true
 
-        goal_position[env_id, 1] = random_positions[2][1]
-        goal_position[env_id, 2] = random_positions[2][2]
-        tile_map[env_id, GOAL, random_positions[2]] = true
+        goal_position[1, env_id] = random_positions[2][1]
+        goal_position[2, env_id] = random_positions[2][2]
+        tile_map[GOAL, random_positions[2], env_id] = true
 
         reward[env_id] = zero(R)
         done[env_id] = false
@@ -97,23 +97,23 @@ function RLBase.reset!(env::SingleRoomUndirectedBatch{I, R}; force = false) wher
     done = env.done
     rng = env.rng
 
-    num_envs = size(tile_map, 1)
-    inner_area = CartesianIndices((2 : size(tile_map, 3) - 1, 2 : size(tile_map, 4) - 1))
+    num_objects, height, width, num_envs = size(tile_map)
+    inner_area = CartesianIndices((2 : height - 1, 2 : width - 1))
 
     for env_id in 1:num_envs
         if force || done[env_id]
-            tile_map[env_id, AGENT, agent_position[env_id, 1], agent_position[env_id, 2]] = false
-            tile_map[env_id, GOAL, goal_position[env_id, 1], goal_position[env_id, 2]] = false
+            tile_map[AGENT, agent_position[1, env_id], agent_position[2, env_id], env_id] = false
+            tile_map[GOAL, goal_position[1, env_id], goal_position[2, env_id], env_id] = false
 
             random_positions = SB.sample(rng[env_id], inner_area, 2, replace = false)
 
-            agent_position[env_id, 1] = random_positions[1][1]
-            agent_position[env_id, 2] = random_positions[1][2]
-            tile_map[env_id, AGENT, random_positions[1]] = true
+            agent_position[1, env_id] = random_positions[1][1]
+            agent_position[2, env_id] = random_positions[1][2]
+            tile_map[AGENT, random_positions[1], env_id] = true
 
-            goal_position[env_id, 1] = random_positions[2][1]
-            goal_position[env_id, 2] = random_positions[2][2]
-            tile_map[env_id, GOAL, random_positions[2]] = true
+            goal_position[1, env_id] = random_positions[2][1]
+            goal_position[2, env_id] = random_positions[2][2]
+            tile_map[GOAL, random_positions[2], env_id] = true
 
             reward[env_id] = zero(R)
             done[env_id] = false
@@ -132,24 +132,24 @@ function (env::SingleRoomUndirectedBatch{I, R})(action::Vector) where {I, R}
     rng = env.rng
     terminal_reward = env.terminal_reward
 
-    num_envs = size(tile_map, 1)
+    num_envs = size(tile_map, 4)
 
     for env_id in 1:num_envs
-        current_position_i = agent_position[env_id, 1]
-        current_position_j = agent_position[env_id, 2]
+        current_position_i = agent_position[1, env_id]
+        current_position_j = agent_position[2, env_id]
         next_position_i, next_position_j = move(action[env_id], current_position_i, current_position_j)
 
-        if !tile_map[env_id, WALL, next_position_i, next_position_j]
-            tile_map[env_id, AGENT, current_position_i, current_position_j] = false
-            agent_position[env_id, 1] = next_position_i
-            agent_position[env_id, 2] = next_position_j
-            tile_map[env_id, AGENT, next_position_i, next_position_j] = true
+        if !tile_map[WALL, next_position_i, next_position_j, env_id]
+            tile_map[AGENT, current_position_i, current_position_j, env_id] = false
+            agent_position[1, env_id] = next_position_i
+            agent_position[2, env_id] = next_position_j
+            tile_map[AGENT, next_position_i, next_position_j, env_id] = true
         end
 
-        new_current_position_i = agent_position[env_id, 1]
-        new_current_position_j = agent_position[env_id, 2]
+        new_current_position_i = agent_position[1, env_id]
+        new_current_position_j = agent_position[2, env_id]
 
-        if tile_map[env_id, GOAL, new_current_position_i, new_current_position_j]
+        if tile_map[GOAL, new_current_position_i, new_current_position_j, env_id]
             done[env_id] = true
             reward[env_id] = terminal_reward
         else
@@ -166,7 +166,7 @@ function Base.show(io::IO, ::MIME"text/plain", env::SingleRoomUndirectedBatch)
     reward = env.reward
     done = env.done
 
-    num_envs, num_objects, height, width = size(tile_map)
+    num_objects, height, width, num_envs = size(tile_map)
 
     print(io, "objects = ")
     for i in 1 : length(CHARACTERS)
@@ -184,7 +184,7 @@ function Base.show(io::IO, ::MIME"text/plain", env::SingleRoomUndirectedBatch)
         println(io, "env_id = ", env_id)
         for i in 1:height
             for j in 1:width
-                idx = findfirst(@view tile_map[env_id, :, i, j])
+                idx = findfirst(@view tile_map[:, i, j, env_id])
                 if isnothing(idx)
                     print(io, DUMMY_CHARACTER)
                 else
@@ -222,7 +222,7 @@ function play!(terminal::REPL.Terminals.UnixTerminal, env::SingleRoomUndirectedB
     Play.write_io1_maybe_io2(terminal_out, file, Play.MOVE_CURSOR_TO_ORIGIN)
     Play.write_io1_maybe_io2(terminal_out, file, Play.HIDE_CURSOR)
 
-    num_envs = size(env.tile_map, 1)
+    num_envs = size(env.tile_map, 4)
     chars = Array{Char}(undef, num_envs)
 
     action_chars = ('w', 's', 'a', 'd')

From 2d68a0ad4c7d63ee73ce0e3d1122dad301aeb608 Mon Sep 17 00:00:00 2001
From: Siddharth Bhatia <sidb60200@gmail.com>
Date: Mon, 21 Jun 2021 18:07:06 +0530
Subject: [PATCH 13/28] update tests for batch envs

---
 test/runtests.jl | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/test/runtests.jl b/test/runtests.jl
index bfc7f1c..6419b15 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -114,12 +114,12 @@ Test.@testset "GridWorlds.jl" begin
     Test.@testset "Batch Environments" begin
         for Env in BATCH_ENVS
             Test.@testset "$(Env)" begin
-                num_envs = 1
+                num_envs = 2
                 R = Float32
                 I = Int32
                 env = Env(I = I, R = R, num_envs = num_envs)
-                height = size(env.tile_map, 3)
-                width = size(env.tile_map, 4)
+                height = size(env.tile_map, 2)
+                width = size(env.tile_map, 3)
                 for _ in 1:NUM_RESETS
                     RLBase.reset!(env)
                     Test.@test RLBase.reward(env) == zeros(R, num_envs)
@@ -131,15 +131,19 @@ Test.@testset "GridWorlds.jl" begin
                         env(action)
                         total_reward .+= RLBase.reward(env)
 
-                        Test.@test 1 ≤ env.agent_position[1, 1] ≤ height
-                        Test.@test 1 ≤ env.agent_position[1, 2] ≤ width
+                        for env_id in 1:num_envs
+                            Test.@test 1 ≤ env.agent_position[1, env_id] ≤ height
+                            Test.@test 1 ≤ env.agent_position[2, env_id] ≤ width
+                        end
 
-                        if RLBase.is_terminated(env)[1]
-                            Test.@test total_reward[1] in get_terminal_returns(env)
-                            break
+                        for env_id in 1:num_envs
+                            if RLBase.is_terminated(env)[env_id]
+                                Test.@test total_reward[env_id] in get_terminal_returns(env)
+                                total_reward[env_id] = zero(total_reward[env_id])
+                            end
                         end
 
-                        if i == MAX_STEPS
+                        if i == MAX_STEPS && !any(RLBase.is_terminated(env))
                             @info "$Env not terminated after MAX_STEPS = $MAX_STEPS"
                         end
                     end

From 3dbd497b0f5ce21c32a6d3bc3a526c96dd260835 Mon Sep 17 00:00:00 2001
From: Siddharth Bhatia <sidb60200@gmail.com>
Date: Mon, 21 Jun 2021 18:08:24 +0530
Subject: [PATCH 14/28] don't copy tile_map, reward, and done in RLBase API

---
 src/envs/single_room_undirected_batch.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/envs/single_room_undirected_batch.jl b/src/envs/single_room_undirected_batch.jl
index a3676b8..73655f1 100644
--- a/src/envs/single_room_undirected_batch.jl
+++ b/src/envs/single_room_undirected_batch.jl
@@ -83,11 +83,11 @@ end
 
 RLBase.StateStyle(env::SingleRoomUndirectedBatch) = RLBase.InternalState{Any}()
 RLBase.state_space(env::SingleRoomUndirectedBatch, ::RLBase.InternalState, ::RLBase.DefaultPlayer) = nothing
-RLBase.state(env::SingleRoomUndirectedBatch, ::RLBase.InternalState, ::RLBase.DefaultPlayer) = copy(env.tile_map)
+RLBase.state(env::SingleRoomUndirectedBatch, ::RLBase.InternalState, ::RLBase.DefaultPlayer) = env.tile_map
 
 RLBase.action_space(env::SingleRoomUndirectedBatch, player::RLBase.DefaultPlayer) = (MOVE_UP, MOVE_DOWN, MOVE_LEFT, MOVE_RIGHT)
-RLBase.reward(env::SingleRoomUndirectedBatch, ::RLBase.DefaultPlayer) = copy(env.reward)
-RLBase.is_terminated(env::SingleRoomUndirectedBatch) = copy(env.done)
+RLBase.reward(env::SingleRoomUndirectedBatch, ::RLBase.DefaultPlayer) = env.reward
+RLBase.is_terminated(env::SingleRoomUndirectedBatch) = env.done
 
 function RLBase.reset!(env::SingleRoomUndirectedBatch{I, R}; force = false) where {I, R}
     tile_map = env.tile_map

From 8ec31a06bccc8b06a324bc299090ca42f2d5c913 Mon Sep 17 00:00:00 2001
From: Siddharth Bhatia <sidb60200@gmail.com>
Date: Mon, 21 Jun 2021 18:10:56 +0530
Subject: [PATCH 15/28] remove unnecessary RLBase.DefaultPlayer

---
 src/envs/single_room_undirected_batch.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/envs/single_room_undirected_batch.jl b/src/envs/single_room_undirected_batch.jl
index 73655f1..eb8f6dc 100644
--- a/src/envs/single_room_undirected_batch.jl
+++ b/src/envs/single_room_undirected_batch.jl
@@ -82,11 +82,11 @@ function SingleRoomUndirectedBatch(; I = Int32, R = Float32, num_envs = 2, heigh
 end
 
 RLBase.StateStyle(env::SingleRoomUndirectedBatch) = RLBase.InternalState{Any}()
-RLBase.state_space(env::SingleRoomUndirectedBatch, ::RLBase.InternalState, ::RLBase.DefaultPlayer) = nothing
-RLBase.state(env::SingleRoomUndirectedBatch, ::RLBase.InternalState, ::RLBase.DefaultPlayer) = env.tile_map
+RLBase.state_space(env::SingleRoomUndirectedBatch, ::RLBase.InternalState) = nothing
+RLBase.state(env::SingleRoomUndirectedBatch, ::RLBase.InternalState) = env.tile_map
 
-RLBase.action_space(env::SingleRoomUndirectedBatch, player::RLBase.DefaultPlayer) = (MOVE_UP, MOVE_DOWN, MOVE_LEFT, MOVE_RIGHT)
-RLBase.reward(env::SingleRoomUndirectedBatch, ::RLBase.DefaultPlayer) = env.reward
+RLBase.action_space(env::SingleRoomUndirectedBatch) = (MOVE_UP, MOVE_DOWN, MOVE_LEFT, MOVE_RIGHT)
+RLBase.reward(env::SingleRoomUndirectedBatch) = env.reward
 RLBase.is_terminated(env::SingleRoomUndirectedBatch) = env.done
 
 function RLBase.reset!(env::SingleRoomUndirectedBatch{I, R}; force = false) where {I, R}

From 73406ea5b1c829578b6bd0ef76fb3ba7bafa1038 Mon Sep 17 00:00:00 2001
From: Siddharth Bhatia <sidb60200@gmail.com>
Date: Mon, 21 Jun 2021 18:12:52 +0530
Subject: [PATCH 16/28] rename benchmark_multi_threaded.jl to
 benchmark_batch.jl

---
 benchmark/{benchmark_multi_threaded.jl => benchmark_batch.jl} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename benchmark/{benchmark_multi_threaded.jl => benchmark_batch.jl} (100%)

diff --git a/benchmark/benchmark_multi_threaded.jl b/benchmark/benchmark_batch.jl
similarity index 100%
rename from benchmark/benchmark_multi_threaded.jl
rename to benchmark/benchmark_batch.jl

From 45bf86aa31e5fadfa4a115d9f6ab72ad116ab455 Mon Sep 17 00:00:00 2001
From: Siddharth Bhatia <sidb60200@gmail.com>
Date: Mon, 21 Jun 2021 18:23:59 +0530
Subject: [PATCH 17/28] fix and cleanup benchmark_batch

---
 benchmark/benchmark_batch.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmark/benchmark_batch.jl b/benchmark/benchmark_batch.jl
index 7c5c28d..d95b4ec 100644
--- a/benchmark/benchmark_batch.jl
+++ b/benchmark/benchmark_batch.jl
@@ -12,7 +12,7 @@ const information = Dict()
 ENVS = [GW.ModuleSingleRoomUndirectedBatch.SingleRoomUndirectedBatch]
 
 function run_random_policy!(env, num_resets, steps_per_reset)
-    num_envs = size(env.tile_map, 1)
+    num_envs = size(env.tile_map, 4)
     action = Array{eltype(RLBase.action_space(env))}(undef, num_envs)
     for _ in 1:num_resets
         RLBase.reset!(env, force = true)
@@ -32,7 +32,7 @@ end
 
 function format_benchmark(str::String)
     l = split(str, "\n")
-    deleteat!(l, (1, 4, 9))
+    deleteat!(l, (1, 3, 4, 5, 7, 8, 9, 10, 11))
     return strip.(l)
 end
 
@@ -140,4 +140,4 @@ for Env in ENVS
     @info "$(name) benchmark complete"
 end
 
-write_benchmarks(information, "benchmark_multi_threaded.md")
+write_benchmarks(information, "benchmark_batch.md")

From ed2b37bfc2094c1e4efa5da9d3260d0df48cefc2 Mon Sep 17 00:00:00 2001
From: Siddharth Bhatia <sidb60200@gmail.com>
Date: Mon, 21 Jun 2021 18:29:14 +0530
Subject: [PATCH 18/28] make move function type stable (huge improvement in
 performance)

---
 src/envs/single_room_undirected_batch.jl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/envs/single_room_undirected_batch.jl b/src/envs/single_room_undirected_batch.jl
index eb8f6dc..966d792 100644
--- a/src/envs/single_room_undirected_batch.jl
+++ b/src/envs/single_room_undirected_batch.jl
@@ -30,6 +30,8 @@ function move(action::Integer, i, j)
         return i, j - 1
     elseif action == MOVE_RIGHT
         return i, j + 1
+    else
+        return i, j
     end
 end
 

From 2ceedadbcec2d280ec24bc9cc8b176c243b2edf8 Mon Sep 17 00:00:00 2001
From: Siddharth Bhatia <sidb60200@gmail.com>
Date: Mon, 21 Jun 2021 19:38:54 +0530
Subject: [PATCH 19/28] add function sample_two_positions_without_replacement

---
 src/envs/single_room_undirected_batch.jl | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/envs/single_room_undirected_batch.jl b/src/envs/single_room_undirected_batch.jl
index 966d792..4b657f2 100644
--- a/src/envs/single_room_undirected_batch.jl
+++ b/src/envs/single_room_undirected_batch.jl
@@ -6,6 +6,7 @@ import ..Play
 import Random
 import REPL
 import ReinforcementLearningBase as RLBase
+import StaticArrays as SA
 import StatsBase as SB
 
 const MOVE_UP = 1
@@ -21,6 +22,17 @@ const DUMMY_CHARACTER = '⋅'
 const CHARACTERS = ('☻', '█', '♥')
 const FOREGROUND_COLORS = (:light_red, :white, :light_red)
 
+function sample_two_positions_without_replacement(rng, region)
+    length(region) < 2 && throw(ArgumentError("region must contain at least 2 positions"))
+    position1 = rand(rng, region)
+    position2 = rand(rng, region)
+    while position1 == position2  # redraw until the two positions are distinct
+        position2 = rand(rng, region)
+    end
+
+    return position1, position2
+end
+
 function move(action::Integer, i, j)
     if action == MOVE_UP
         return i - 1, j
@@ -62,7 +74,7 @@ function SingleRoomUndirectedBatch(; I = Int32, R = Float32, num_envs = 2, heigh
         tile_map[WALL, :, 1, env_id] .= true
         tile_map[WALL, :, width, env_id] .= true
 
-        random_positions = SB.sample(rng[env_id], inner_area, 2, replace = false)
+        random_positions = sample_two_positions_without_replacement(rng[env_id], inner_area)
 
         agent_position[1, env_id] = random_positions[1][1]
         agent_position[2, env_id] = random_positions[1][2]
@@ -107,7 +119,7 @@ function RLBase.reset!(env::SingleRoomUndirectedBatch{I, R}; force = false) wher
             tile_map[AGENT, agent_position[1, env_id], agent_position[2, env_id], env_id] = false
             tile_map[GOAL, goal_position[1, env_id], goal_position[2, env_id], env_id] = false
 
-            random_positions = SB.sample(rng[env_id], inner_area, 2, replace = false)
+            random_positions = sample_two_positions_without_replacement(rng[env_id], inner_area)
 
             agent_position[1, env_id] = random_positions[1][1]
             agent_position[2, env_id] = random_positions[1][2]

From 24269558ea71a344f29870f4d8b3681fae0797a7 Mon Sep 17 00:00:00 2001
From: Siddharth Bhatia <sidb60200@gmail.com>
Date: Thu, 24 Jun 2021 13:30:36 +0530
Subject: [PATCH 20/28] add DataStructures package in benchmarking code

---
 benchmark/Project.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/benchmark/Project.toml b/benchmark/Project.toml
index 2f876ff..dd589b6 100644
--- a/benchmark/Project.toml
+++ b/benchmark/Project.toml
@@ -1,5 +1,6 @@
 [deps]
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 GridWorlds = "e15a9946-cd7f-4d03-83e2-6c30bacb0043"
 Profile = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79"

From cda573ee96a0c49289005de4d4f6d19ab41ea020 Mon Sep 17 00:00:00 2001
From: Siddharth Bhatia <sidb60200@gmail.com>
Date: Thu, 24 Jun 2021 13:31:45 +0530
Subject: [PATCH 21/28] add ACTION_NAMES in ModuleSingleRoomUndirectedBatch

---
 src/envs/single_room_undirected_batch.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/envs/single_room_undirected_batch.jl b/src/envs/single_room_undirected_batch.jl
index 4b657f2..c7ed553 100644
--- a/src/envs/single_room_undirected_batch.jl
+++ b/src/envs/single_room_undirected_batch.jl
@@ -13,6 +13,7 @@ const MOVE_UP = 1
 const MOVE_DOWN = 2
 const MOVE_LEFT = 3
 const MOVE_RIGHT = 4
+const ACTION_NAMES = (:MOVE_UP, :MOVE_DOWN, :MOVE_LEFT, :MOVE_RIGHT)
 
 const AGENT = 1
 const WALL = 2

From 26d1231738a203959981612d73315a99f3445c90 Mon Sep 17 00:00:00 2001
From: Siddharth Bhatia <sidb60200@gmail.com>
Date: Thu, 24 Jun 2021 13:32:22 +0530
Subject: [PATCH 22/28] refactor benchmark_batch.jl

---
 benchmark/benchmark_batch.jl | 162 +++++++++++++++++------------------
 1 file changed, 78 insertions(+), 84 deletions(-)

diff --git a/benchmark/benchmark_batch.jl b/benchmark/benchmark_batch.jl
index d95b4ec..1b395ea 100644
--- a/benchmark/benchmark_batch.jl
+++ b/benchmark/benchmark_batch.jl
@@ -1,14 +1,14 @@
-import GridWorlds as GW
-import ReinforcementLearningBase as RLBase
 import BenchmarkTools as BT
+import DataStructures as DS
 import Dates
+import GridWorlds as GW
+import ReinforcementLearningBase as RLBase
+import Statistics
 
 const STEPS_PER_RESET = 100
 const NUM_RESETS = 100
 const NUM_ENVS = 64
 
-const information = Dict()
-
 ENVS = [GW.ModuleSingleRoomUndirectedBatch.SingleRoomUndirectedBatch]
 
 function run_random_policy!(env, num_resets, steps_per_reset)
@@ -30,114 +30,108 @@ function run_random_policy!(env, num_resets, steps_per_reset)
     return nothing
 end
 
-function format_benchmark(str::String)
-    l = split(str, "\n")
-    deleteat!(l, (1, 3, 4, 5, 7, 8, 9, 10, 11))
-    return strip.(l)
+function compile_envs(Envs)
+    for Env in Envs
+        env = Env(num_envs = NUM_ENVS)
+        run_random_policy!(env, NUM_RESETS, STEPS_PER_RESET)
+    end
+
+    @info "Compiled and ran all environments"
+
+    return nothing
 end
 
-function write_benchmarks(information, file)
-    io = open(file, "w")
+function benchmark_batch_env(Env, num_resets, steps_per_reset, num_envs)
+    benchmark = DS.OrderedDict()
 
-    write(io, "Date: " * Dates.format(Dates.now(), "yyyy_mm_dd_HH_MM_SS") * "\n")
-    write(io, "# List of Environments\n")
+    parent_module = parentmodule(Env)
 
-    for Env in ENVS
-        name = Env.body.body.body.name.name
-        write(io, "  1. [$(String(name))](#$(lowercase(String(name))))\n")
-    end
+    env = Env(num_envs = num_envs)
 
-    write(io, "\n")
-    write(io, "# Benchmarks\n\n")
+    benchmark[:random_policy] = BT.@benchmark run_random_policy!($(Ref(env))[], $(Ref(num_resets))[], $(Ref(steps_per_reset))[])
+    benchmark[:reset] = BT.@benchmark RLBase.reset!($(Ref(env))[], force = true)
+    benchmark[:state] = BT.@benchmark RLBase.state($(Ref(env))[])
 
-    for Env in ENVS
-        name = Env.body.body.body.name.name
-        env_benchmark = information[name]
+    for action in RLBase.action_space(env)
+        action_name = parent_module.ACTION_NAMES[action]
+        batch_action = fill(action, num_envs)  # one action per sub-environment; must match the batch size `env` was built with, not the global NUM_ENVS
+        benchmark[action_name] = BT.@benchmark $(Ref(env))[]($(Ref(batch_action))[])
+    end
 
-        write(io, "# $(String(name))\n\n")
+    benchmark[:action_space] = BT.@benchmark RLBase.action_space($(Ref(env))[])
+    benchmark[:is_terminated] = BT.@benchmark RLBase.is_terminated($(Ref(env))[])
+    benchmark[:reward] = BT.@benchmark RLBase.reward($(Ref(env))[])
 
-        write(io, "#### Run uniformly random policy, NUM_ENVS = $(NUM_ENVS), NUM_RESETS = $(NUM_RESETS), STEPS_PER_RESET = $(STEPS_PER_RESET), TOTAL_STEPS = $(NUM_RESETS * STEPS_PER_RESET)\n\n")
-        for line in format_benchmark(repr("text/plain", env_benchmark[:run_random_policy]))
-            write(io, line * "\n\n")
-        end
+    @info "$(nameof(Env)) benchmarked"
 
-        write(io, "#### $(String(Symbol(Env)))()\n\n")
-        for line in format_benchmark(repr("text/plain", env_benchmark[:instantiation]))
-            write(io, line * "\n\n")
-        end
+    return benchmark
+end
 
-        write(io, "#### RLBase.reset!(env)\n\n")
-        for line in format_benchmark(repr("text/plain", env_benchmark[:reset!]))
-            write(io, line * "\n\n")
-        end
+function benchmark_batch_envs(Envs, num_resets, steps_per_reset, num_envs)
+    benchmarks = DS.OrderedDict()
 
-        write(io, "#### RLBase.state(env)\n\n")
-        for line in format_benchmark(repr("text/plain", env_benchmark[:state]))
-            write(io, line * "\n\n")
-        end
+    for Env in Envs
+        benchmarks[nameof(Env)] = benchmark_batch_env(Env, num_resets, steps_per_reset, num_envs)
+    end
 
-        write(io, "#### RLBase.action_space(env)\n\n")
-        for line in format_benchmark(repr("text/plain", env_benchmark[:action_space]))
-            write(io, line * "\n\n")
-        end
+    @info "All benchmarks complete"
 
-        write(io, "#### RLBase.is_terminated(env)\n\n")
-        for line in format_benchmark(repr("text/plain", env_benchmark[:is_terminated]))
-            write(io, line * "\n\n")
-        end
+    return benchmarks
+end
 
-        write(io, "#### RLBase.reward(env)\n\n")
-        for line in format_benchmark(repr("text/plain", env_benchmark[:reward]))
-            write(io, line * "\n\n")
-        end
+function get_summary(trial::BT.Trial)
+    median_trial = BT.median(trial)
+    memory = BT.prettymemory(median_trial.memory)
+    median_time = BT.prettytime(median_trial.time)
+    return memory, median_time
+end
 
-        for action in keys(env_benchmark[:action_info])
-            write(io, "#### env($action)\n\n")
-            for line in format_benchmark(repr("text/plain", env_benchmark[:action_info][action]))
-                write(io, line * "\n\n")
-            end
-        end
+function get_table(benchmark)
+    title = "|"
+    separator = "|"
+    data = "|"
 
+    for key in keys(benchmark)
+        title = title * String(key) * "|"
+        separator = separator * ":---:|"
+        memory, median_time = get_summary(benchmark[key])
+        data = data * "$(memory)<br>$(median_time)|"
     end
 
-    close(io)
-end
-
-# compile everything once
-for Env in ENVS
-    env = Env(num_envs = NUM_ENVS)
-    run_random_policy!(env, NUM_RESETS, STEPS_PER_RESET)
+    return title, separator, data
 end
 
-@info "First run (for compilation) is complete"
+function generate_benchmark_file_batch_envs(Envs, num_resets, steps_per_reset, num_envs, file_name = nothing)
+    date = Dates.format(Dates.now(), "yyyy_mm_dd_HH_MM_SS")
 
-for Env in ENVS
+    if isnothing(file_name)
+        file_name = date * ".md"
+    end
 
-    env = Env(num_envs = NUM_ENVS)
+    benchmarks = benchmark_batch_envs(Envs, num_resets, steps_per_reset, num_envs)  # run before opening the file so a failed benchmark leaves no truncated output or open handle
 
-    env_benchmark = Dict()
+    io = open(file_name, "w")
 
-    env_benchmark[:run_random_policy] = BT.@benchmark run_random_policy!($(Ref(env))[], $(Ref(NUM_RESETS))[], $(Ref(STEPS_PER_RESET))[])
+    println(io, "Date: $(date)")
+    println(io, "## List of Environments")
 
-    env_benchmark[:instantiation] = BT.@benchmark $(Ref(Env))[](num_envs = $(NUM_ENVS)[])
+    for Env in Envs  # iterate the argument, not the global ENVS, so the index lists exactly the environments benchmarked above
+        name_string = String(nameof(Env))
+        println(io, "  1. [$(name_string)](#$(lowercase(name_string)))")
+    end
 
-    env_benchmark[:reset!] = BT.@benchmark RLBase.reset!($(Ref(env))[], force = true)
-    env_benchmark[:state] = BT.@benchmark RLBase.state($(Ref(env))[])
-    env_benchmark[:action_space] = BT.@benchmark RLBase.action_space($(Ref(env))[])
-    env_benchmark[:is_terminated] = BT.@benchmark RLBase.is_terminated($(Ref(env))[])
-    env_benchmark[:reward] = BT.@benchmark RLBase.reward($(Ref(env))[])
+    println(io)
 
-    action_info = Dict()
-    for action in RLBase.action_space(env)
-        actions = fill(action, NUM_ENVS)
-        action_info[Symbol(action)] = BT.@benchmark $(Ref(env))[]($(Ref(actions))[])
+    for key in keys(benchmarks)
+        println(io, "### " * String(key))
+        title, separator, data = get_table(benchmarks[key])
+        println(io, title)
+        println(io, separator)
+        println(io, data)
+        println(io)
     end
-    env_benchmark[:action_info] = action_info
 
-    name = Env.body.body.body.name.name
-    information[name] = env_benchmark
+    close(io)
 
-    @info "$(name) benchmark complete"
+    return nothing
 end
-
-write_benchmarks(information, "benchmark_batch.md")

From bb048ccf351307fcccc3edcc4a4af6ebee5b41df Mon Sep 17 00:00:00 2001
From: Siddharth Bhatia <sidb60200@gmail.com>
Date: Thu, 24 Jun 2021 13:33:06 +0530
Subject: [PATCH 23/28] rename benchmark_batch.jl to benchmark_utils.jl

---
 benchmark/{benchmark_batch.jl => benchmark_utils.jl} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename benchmark/{benchmark_batch.jl => benchmark_utils.jl} (100%)

diff --git a/benchmark/benchmark_batch.jl b/benchmark/benchmark_utils.jl
similarity index 100%
rename from benchmark/benchmark_batch.jl
rename to benchmark/benchmark_utils.jl

From e8854e23c4b8f85cac0293f76cd4d3e7274c2eaa Mon Sep 17 00:00:00 2001
From: Siddharth Bhatia <sidb60200@gmail.com>
Date: Thu, 24 Jun 2021 14:29:06 +0530
Subject: [PATCH 24/28] add SingleRoomUndirected

---
 src/envs/envs.jl                   |   1 +
 src/envs/single_room_undirected.jl | 267 +++++++++++++++++++++++++++++
 2 files changed, 268 insertions(+)
 create mode 100644 src/envs/single_room_undirected.jl

diff --git a/src/envs/envs.jl b/src/envs/envs.jl
index 840955c..c9efa9e 100644
--- a/src/envs/envs.jl
+++ b/src/envs/envs.jl
@@ -43,3 +43,4 @@ include("catcher.jl")
 include("transport.jl")
 include("collect_gems_undirected_multi_agent.jl")
 include("single_room_undirected_batch.jl")
+include("single_room_undirected.jl")
diff --git a/src/envs/single_room_undirected.jl b/src/envs/single_room_undirected.jl
new file mode 100644
index 0000000..fb8be6c
--- /dev/null
+++ b/src/envs/single_room_undirected.jl
@@ -0,0 +1,267 @@
+module ModuleSingleRoomUndirected
+
+import Crayons
+import ..GridWorlds as GW
+import ..Play
+import Random
+import REPL
+import ReinforcementLearningBase as RLBase
+import StaticArrays as SA
+import StatsBase as SB
+
+const MOVE_UP = 1
+const MOVE_DOWN = 2
+const MOVE_LEFT = 3
+const MOVE_RIGHT = 4
+const ACTION_NAMES = (:MOVE_UP, :MOVE_DOWN, :MOVE_LEFT, :MOVE_RIGHT)
+
+const AGENT = 1
+const WALL = 2
+const GOAL = 3
+
+const DUMMY_CHARACTER = '⋅'
+const CHARACTERS = ('☻', '█', '♥')
+const FOREGROUND_COLORS = (:light_red, :white, :light_red)
+
+function sample_two_positions_without_replacement(rng, region)
+    # Rejection sampling below can only finish if two distinct positions exist.
+    length(region) >= 2 || throw(ArgumentError("region must contain at least 2 positions"))
+    position1 = rand(rng, region)
+    position2 = rand(rng, region)
+    while position1 == position2  # redraw the second position until it differs
+        position2 = rand(rng, region)
+    end
+    return position1, position2
+end
+
+function move(action::Integer, i, j)  # returns the indices reached from (i, j) by `action`
+    if action == MOVE_UP
+        return i - 1, j  # up decreases the first index
+    elseif action == MOVE_DOWN
+        return i + 1, j  # down increases the first index
+    elseif action == MOVE_LEFT
+        return i, j - 1  # left decreases the second index
+    elseif action == MOVE_RIGHT
+        return i, j + 1  # right increases the second index
+    else
+        return i, j  # any other action value leaves the position unchanged
+    end
+end
+
+struct SingleRoomUndirected{I, R, RNG} <: GW.AbstractGridWorld
+    tile_map::BitArray{3}
+    agent_position::SA.MVector{2, I}
+    reward::Ref{R}
+    rng::RNG
+    done::Ref{Bool}
+    terminal_reward::R
+    goal_position::SA.MVector{2, I}
+end
+
+function SingleRoomUndirected(; I = Int32, R = Float32, height = 8, width = 8, rng = Random.MersenneTwister())
+    tile_map = BitArray(undef, 3, height, width)
+    agent_position = SA.MVector{2, I}(undef)
+    reward = Ref{R}()
+    done = Ref{Bool}()
+    goal_position = SA.MVector{2, I}(undef)
+    terminal_reward = one(R)
+
+    inner_area = CartesianIndices((2 : height - 1, 2 : width - 1))
+
+    tile_map[:, :, :] .= false
+    tile_map[WALL, 1, :] .= true
+    tile_map[WALL, height, :] .= true
+    tile_map[WALL, :, 1] .= true
+    tile_map[WALL, :, width] .= true
+
+    random_positions = sample_two_positions_without_replacement(rng, inner_area)
+
+    agent_position[1] = random_positions[1][1]
+    agent_position[2] = random_positions[1][2]
+    tile_map[AGENT, random_positions[1]] = true
+
+    goal_position[1] = random_positions[2][1]
+    goal_position[2] = random_positions[2][2]
+    tile_map[GOAL, random_positions[2]] = true
+
+    reward[] = zero(R)
+    done[] = false
+
+    env = SingleRoomUndirected(tile_map, agent_position, reward, rng, done, terminal_reward, goal_position)
+
+    RLBase.reset!(env)
+
+    return env
+end
+
+RLBase.StateStyle(env::SingleRoomUndirected) = RLBase.InternalState{Any}()
+RLBase.state_space(env::SingleRoomUndirected, ::RLBase.InternalState) = nothing
+RLBase.state(env::SingleRoomUndirected, ::RLBase.InternalState) = env.tile_map
+
+RLBase.action_space(env::SingleRoomUndirected) = (MOVE_UP, MOVE_DOWN, MOVE_LEFT, MOVE_RIGHT)
+RLBase.reward(env::SingleRoomUndirected) = env.reward[]
+RLBase.is_terminated(env::SingleRoomUndirected) = env.done[]
+
+function RLBase.reset!(env::SingleRoomUndirected{I, R}) where {I, R}
+    tile_map = env.tile_map
+    agent_position = env.agent_position
+    goal_position = env.goal_position
+    reward = env.reward
+    done = env.done
+    rng = env.rng
+
+    num_objects, height, width = size(tile_map)
+    inner_area = CartesianIndices((2 : height - 1, 2 : width - 1))
+
+    tile_map[AGENT, agent_position...] = false
+    tile_map[GOAL, goal_position...] = false
+
+    random_positions = sample_two_positions_without_replacement(rng, inner_area)
+
+    agent_position[1] = random_positions[1][1]
+    agent_position[2] = random_positions[1][2]
+    tile_map[AGENT, random_positions[1]] = true
+
+    goal_position[1] = random_positions[2][1]
+    goal_position[2] = random_positions[2][2]
+    tile_map[GOAL, random_positions[2]] = true
+
+    reward[] = zero(R)
+    done[] = false
+
+    return nothing
+end
+
+function (env::SingleRoomUndirected{I, R})(action) where {I, R}
+    tile_map = env.tile_map
+    agent_position = env.agent_position
+    goal_position = env.goal_position
+    reward = env.reward
+    done = env.done
+    rng = env.rng
+    terminal_reward = env.terminal_reward
+
+    current_position_i = agent_position[1]
+    current_position_j = agent_position[2]
+    next_position_i, next_position_j = move(action, current_position_i, current_position_j)
+
+    if !tile_map[WALL, next_position_i, next_position_j]
+        tile_map[AGENT, current_position_i, current_position_j] = false
+        agent_position[1] = next_position_i
+        agent_position[2] = next_position_j
+        tile_map[AGENT, next_position_i, next_position_j] = true
+    end
+
+    if tile_map[GOAL, agent_position...]
+        reward[] = terminal_reward
+        done[] = true
+    else
+        reward[] = zero(R)
+        done[] = false
+    end
+
+    return nothing
+end
+
+function Base.show(io::IO, ::MIME"text/plain", env::SingleRoomUndirected)
+    tile_map = env.tile_map
+    reward = env.reward
+    done = env.done
+
+    num_objects, height, width = size(tile_map)
+
+    print(io, "objects = ")
+    for i in 1 : length(CHARACTERS)
+        print(io, Crayons.Crayon(foreground = FOREGROUND_COLORS[i]), CHARACTERS[i], Crayons.Crayon(reset = true))
+        if i < length(CHARACTERS)
+            print(io, ", ")
+        else
+            print(io, "\n")
+        end
+    end
+    println(io, "dummy character = ", DUMMY_CHARACTER)
+
+    println(io)
+    for i in 1:height
+        for j in 1:width
+            idx = findfirst(@view tile_map[:, i, j])
+            if isnothing(idx)
+                print(io, DUMMY_CHARACTER)
+            else
+                print(io, Crayons.Crayon(foreground = FOREGROUND_COLORS[idx]), CHARACTERS[idx], Crayons.Crayon(reset = true))
+            end
+        end
+
+        println(io)
+    end
+
+    println(io, "reward = ", reward[])
+    println(io, "done = ", done[])
+
+    return nothing
+end
+
+get_string_key_bindings(env::SingleRoomUndirected) = """Key bindings:
+                                                     'q': quit
+                                                     'r': RLBase.reset!(env)
+                                                     'w': MOVE_UP
+                                                     's': MOVE_DOWN
+                                                     'a': MOVE_LEFT
+                                                     'd': MOVE_RIGHT
+                                                     """
+
+function play!(terminal::REPL.Terminals.UnixTerminal, env::SingleRoomUndirected; file_name::Union{Nothing, AbstractString} = nothing)
+    REPL.Terminals.raw!(terminal, true)  # undone by the `finally` block below
+
+    terminal_out = terminal.out_stream
+    terminal_in = terminal.in_stream
+    file = Play.open_maybe(file_name)  # NOTE(review): presumably `nothing` when file_name is nothing - confirm in Play
+
+    Play.write_io1_maybe_io2(terminal_out, file, Play.CLEAR_SCREEN)
+    Play.write_io1_maybe_io2(terminal_out, file, Play.MOVE_CURSOR_TO_ORIGIN)
+    Play.write_io1_maybe_io2(terminal_out, file, Play.HIDE_CURSOR)
+
+    action_chars = ('w', 's', 'a', 'd')
+
+    char_to_action = Dict('w' => MOVE_UP,
+                          's' => MOVE_DOWN,
+                          'a' => MOVE_LEFT,
+                          'd' => MOVE_RIGHT,
+                         )
+
+    try
+        while true
+            Play.write_io1_maybe_io2(terminal_out, file, get_string_key_bindings(env))
+            Play.show_io1_maybe_io2(terminal_out, file, MIME("text/plain"), env)
+
+            char = read(terminal_in, Char)  # one key press per loop iteration
+
+            Play.write_io1_maybe_io2(terminal_out, file, Play.EMPTY_SCREEN)
+
+            if char == 'q'
+                # Cleanup (show cursor, close the log file, leave raw mode) is done
+                # exactly once by the `finally` block, which also runs on this return;
+                # repeating it here wrote SHOW_CURSOR again after `close_maybe(file)`.
+                return nothing
+            elseif char == 'r'
+                RLBase.reset!(env)
+            elseif char in action_chars
+                env(char_to_action[char])
+            else
+                @warn "No procedure exists for this character: $char"
+            end
+
+            Play.write_io1_maybe_io2(terminal_out, file, "Last character = $(char)\n")
+        end
+    finally  # single cleanup point: reached on 'q' as well as on any exception
+        Play.write_io1_maybe_io2(terminal_out, file, Play.SHOW_CURSOR)
+        Play.close_maybe(file)
+        REPL.Terminals.raw!(terminal, false)
+    end
+
+    return nothing
+end
+
+play!(env::SingleRoomUndirected; file_name = nothing) = play!(REPL.TerminalMenus.terminal, env, file_name = file_name)
+
+end # module

From 9f3b1bde252b881207a13b21a77ddc64deab22f0 Mon Sep 17 00:00:00 2001
From: Siddharth Bhatia <sidb60200@gmail.com>
Date: Thu, 24 Jun 2021 14:44:11 +0530
Subject: [PATCH 25/28] ignore generated benchmark files

---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index aea776a..24cb5fb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,3 +17,5 @@ Manifest.toml
 *.swp
 
 /src/scratchpad.jl
+
+/benchmark/20*

From b73095834f20ff3ada203090999af3cf747ad0ee Mon Sep 17 00:00:00 2001
From: Siddharth Bhatia <sidb60200@gmail.com>
Date: Thu, 24 Jun 2021 15:24:26 +0530
Subject: [PATCH 26/28] add benchmarking for non-batch envs

---
 benchmark/benchmark_utils.jl | 116 ++++++++++++++++++++++++++++++-----
 1 file changed, 101 insertions(+), 15 deletions(-)

diff --git a/benchmark/benchmark_utils.jl b/benchmark/benchmark_utils.jl
index 1b395ea..5b03da2 100644
--- a/benchmark/benchmark_utils.jl
+++ b/benchmark/benchmark_utils.jl
@@ -9,9 +9,25 @@ const STEPS_PER_RESET = 100
 const NUM_RESETS = 100
 const NUM_ENVS = 64
 
-ENVS = [GW.ModuleSingleRoomUndirectedBatch.SingleRoomUndirectedBatch]
+ENVS = [GW.ModuleSingleRoomUndirected.SingleRoomUndirected]
+BATCH_ENVS = [GW.ModuleSingleRoomUndirectedBatch.SingleRoomUndirectedBatch]
 
-function run_random_policy!(env, num_resets, steps_per_reset)
+function run_random_policy_env!(env, num_resets, steps_per_reset)  # random-action workload for a single (non-batch) env
+    for _ in 1:num_resets
+        RLBase.reset!(env)
+        for _ in 1:steps_per_reset
+            state = RLBase.state(env)  # result unused; the call itself is part of the workload
+            action = rand(RLBase.action_space(env))  # no rng argument is passed, so this does not draw from env.rng
+            env(action)
+            is_terminated = RLBase.is_terminated(env)  # result unused; stepping continues regardless
+            reward = RLBase.reward(env)  # result unused
+        end
+    end
+
+    return nothing
+end
+
+function run_random_policy_batch_env!(env, num_resets, steps_per_reset)
     num_envs = size(env.tile_map, 4)
     action = Array{eltype(RLBase.action_space(env))}(undef, num_envs)
     for _ in 1:num_resets
@@ -30,15 +46,40 @@ function run_random_policy!(env, num_resets, steps_per_reset)
     return nothing
 end
 
-function compile_envs(Envs)
-    for Env in Envs
-        env = Env(num_envs = NUM_ENVS)
-        run_random_policy!(env, NUM_RESETS, STEPS_PER_RESET)
+# function compile_envs(Envs, num_resets, steps_per_reset)
+    # for Env in Envs
+        # env = Env()
+        # run_random_policy!(env, num_resets, steps_per_reset)
+    # end
+
+    # @info "Compiled and ran all environments"
+
+    # return nothing
+# end
+
+function benchmark_env(Env, num_resets, steps_per_reset)  # returns a dict of benchmark results for one env type
+    benchmark = DS.OrderedDict()
+
+    parent_module = parentmodule(Env)  # module defining Env; its ACTION_NAMES is indexed below
+
+    env = Env()  # built once with default arguments; every benchmark below reuses this instance
+
+    benchmark[:random_policy] = BT.@benchmark run_random_policy_env!($(Ref(env))[], $(Ref(num_resets))[], $(Ref(steps_per_reset))[])
+    benchmark[:reset] = BT.@benchmark RLBase.reset!($(Ref(env))[])
+    benchmark[:state] = BT.@benchmark RLBase.state($(Ref(env))[])
+
+    for action in RLBase.action_space(env)
+        action_name = parent_module.ACTION_NAMES[action]  # action values are used directly as indices
+        benchmark[action_name] = BT.@benchmark $(Ref(env))[]($(Ref(action))[])
     end
 
-    @info "Compiled and ran all environments"
+    benchmark[:action_space] = BT.@benchmark RLBase.action_space($(Ref(env))[])
+    benchmark[:is_terminated] = BT.@benchmark RLBase.is_terminated($(Ref(env))[])
+    benchmark[:reward] = BT.@benchmark RLBase.reward($(Ref(env))[])
 
-    return nothing
+    @info "$(nameof(Env)) benchmarked"
+
+    return benchmark
 end
 
 function benchmark_batch_env(Env, num_resets, steps_per_reset, num_envs)
@@ -48,7 +89,7 @@ function benchmark_batch_env(Env, num_resets, steps_per_reset, num_envs)
 
     env = Env(num_envs = num_envs)
 
-    benchmark[:random_policy] = BT.@benchmark run_random_policy!($(Ref(env))[], $(Ref(num_resets))[], $(Ref(steps_per_reset))[])
+    benchmark[:random_policy] = BT.@benchmark run_random_policy_batch_env!($(Ref(env))[], $(Ref(num_resets))[], $(Ref(steps_per_reset))[])
     benchmark[:reset] = BT.@benchmark RLBase.reset!($(Ref(env))[], force = true)
     benchmark[:state] = BT.@benchmark RLBase.state($(Ref(env))[])
 
@@ -67,6 +108,18 @@ function benchmark_batch_env(Env, num_resets, steps_per_reset, num_envs)
     return benchmark
 end
 
+function benchmark_envs(Envs, num_resets, steps_per_reset)  # runs benchmark_env for every env type in `Envs`
+    benchmarks = DS.OrderedDict()  # filled below: nameof(Env) => result of benchmark_env
+
+    for Env in Envs
+        benchmarks[nameof(Env)] = benchmark_env(Env, num_resets, steps_per_reset)
+    end
+
+    @info "benchmark_envs complete"
+
+    return benchmarks
+end
+
 function benchmark_batch_envs(Envs, num_resets, steps_per_reset, num_envs)
     benchmarks = DS.OrderedDict()
 
@@ -74,7 +127,7 @@ function benchmark_batch_envs(Envs, num_resets, steps_per_reset, num_envs)
         benchmarks[nameof(Env)] = benchmark_batch_env(Env, num_resets, steps_per_reset, num_envs)
     end
 
-    @info "All benchmarks complete"
+    @info "benchmark_batch_envs complete"
 
     return benchmarks
 end
@@ -101,7 +154,7 @@ function get_table(benchmark)
     return title, separator, data
 end
 
-function generate_benchmark_file_batch_envs(Envs, num_resets, steps_per_reset, num_envs, file_name = nothing)
+function generate_benchmark_file(benchmarks; file_name = nothing)
     date = Dates.format(Dates.now(), "yyyy_mm_dd_HH_MM_SS")
 
     if isnothing(file_name)
@@ -110,13 +163,11 @@ function generate_benchmark_file_batch_envs(Envs, num_resets, steps_per_reset, n
 
     io = open(file_name, "w")
 
-    benchmarks = benchmark_batch_envs(Envs, num_resets, steps_per_reset, num_envs)
-
     println(io, "Date: $(date)")
     println(io, "## List of Environments")
 
-    for Env in ENVS
-        name_string = String(nameof(Env))
+    for key in keys(benchmarks)
+        name_string = String(key)
         println(io, "  1. [$(name_string)](#$(lowercase(name_string)))")
     end
 
@@ -135,3 +186,38 @@ function generate_benchmark_file_batch_envs(Envs, num_resets, steps_per_reset, n
 
     return nothing
 end
+
+# function generate_benchmark_file_batch_envs(Envs, num_resets, steps_per_reset, num_envs; file_name = nothing)
+    # date = Dates.format(Dates.now(), "yyyy_mm_dd_HH_MM_SS")
+
+    # if isnothing(file_name)
+        # file_name = date * ".md"
+    # end
+
+    # io = open(file_name, "w")
+
+    # benchmarks = benchmark_batch_envs(Envs, num_resets, steps_per_reset, num_envs)
+
+    # println(io, "Date: $(date)")
+    # println(io, "## List of Environments")
+
+    # for Env in Envs
+        # name_string = String(nameof(Env))
+        # println(io, "  1. [$(name_string)](#$(lowercase(name_string)))")
+    # end
+
+    # println(io)
+
+    # for key in keys(benchmarks)
+        # println(io, "### " * String(key))
+        # title, separator, data = get_table(benchmarks[key])
+        # println(io, title)
+        # println(io, separator)
+        # println(io, data)
+        # println(io)
+    # end
+
+    # close(io)
+
+    # return nothing
+# end

From e2f313fe87ab8fb1901e1a8e94001439d9bb0c83 Mon Sep 17 00:00:00 2001
From: Siddharth Bhatia <sidb60200@gmail.com>
Date: Thu, 24 Jun 2021 15:55:27 +0530
Subject: [PATCH 27/28] make SingleRoomUndirected mutable and improve
 performance

---
 src/envs/single_room_undirected.jl | 95 +++++++++++-------------------
 1 file changed, 35 insertions(+), 60 deletions(-)

diff --git a/src/envs/single_room_undirected.jl b/src/envs/single_room_undirected.jl
index fb8be6c..7be382f 100644
--- a/src/envs/single_room_undirected.jl
+++ b/src/envs/single_room_undirected.jl
@@ -48,23 +48,18 @@ function move(action::Integer, i, j)
     end
 end
 
-struct SingleRoomUndirected{I, R, RNG} <: GW.AbstractGridWorld
+mutable struct SingleRoomUndirected{R, RNG} <: GW.AbstractGridWorld
     tile_map::BitArray{3}
-    agent_position::SA.MVector{2, I}
-    reward::Ref{R}
+    agent_position::CartesianIndex{2}
+    reward::R
     rng::RNG
-    done::Ref{Bool}
+    done::Bool
     terminal_reward::R
-    goal_position::SA.MVector{2, I}
+    goal_position::CartesianIndex{2}
 end
 
-function SingleRoomUndirected(; I = Int32, R = Float32, height = 8, width = 8, rng = Random.MersenneTwister())
+function SingleRoomUndirected(; R = Float32, height = 8, width = 8, rng = Random.MersenneTwister())
     tile_map = BitArray(undef, 3, height, width)
-    agent_position = SA.MVector{2, I}(undef)
-    reward = Ref{R}()
-    done = Ref{Bool}()
-    goal_position = SA.MVector{2, I}(undef)
-    terminal_reward = one(R)
 
     inner_area = CartesianIndices((2 : height - 1, 2 : width - 1))
 
@@ -74,18 +69,14 @@ function SingleRoomUndirected(; I = Int32, R = Float32, height = 8, width = 8, r
     tile_map[WALL, :, 1] .= true
     tile_map[WALL, :, width] .= true
 
-    random_positions = sample_two_positions_without_replacement(rng, inner_area)
-
-    agent_position[1] = random_positions[1][1]
-    agent_position[2] = random_positions[1][2]
-    tile_map[AGENT, random_positions[1]] = true
+    agent_position, goal_position = sample_two_positions_without_replacement(rng, inner_area)
 
-    goal_position[1] = random_positions[2][1]
-    goal_position[2] = random_positions[2][2]
-    tile_map[GOAL, random_positions[2]] = true
+    tile_map[AGENT, agent_position] = true
+    tile_map[GOAL, goal_position] = true
 
-    reward[] = zero(R)
-    done[] = false
+    reward = zero(R)
+    done = false
+    terminal_reward = one(R)
 
     env = SingleRoomUndirected(tile_map, agent_position, reward, rng, done, terminal_reward, goal_position)
 
@@ -102,62 +93,48 @@ RLBase.action_space(env::SingleRoomUndirected) = (MOVE_UP, MOVE_DOWN, MOVE_LEFT,
 RLBase.reward(env::SingleRoomUndirected) = env.reward[]
 RLBase.is_terminated(env::SingleRoomUndirected) = env.done[]
 
-function RLBase.reset!(env::SingleRoomUndirected{I, R}) where {I, R}
+function RLBase.reset!(env::SingleRoomUndirected{R}) where {R}
     tile_map = env.tile_map
-    agent_position = env.agent_position
-    goal_position = env.goal_position
-    reward = env.reward
-    done = env.done
     rng = env.rng
 
     num_objects, height, width = size(tile_map)
     inner_area = CartesianIndices((2 : height - 1, 2 : width - 1))
 
-    tile_map[AGENT, agent_position...] = false
-    tile_map[GOAL, goal_position...] = false
+    tile_map[AGENT, env.agent_position] = false
+    tile_map[GOAL, env.goal_position] = false
 
-    random_positions = sample_two_positions_without_replacement(rng, inner_area)
+    new_agent_position, new_goal_position = sample_two_positions_without_replacement(rng, inner_area)
 
-    agent_position[1] = random_positions[1][1]
-    agent_position[2] = random_positions[1][2]
-    tile_map[AGENT, random_positions[1]] = true
+    env.agent_position = new_agent_position
+    tile_map[AGENT, new_agent_position] = true
 
-    goal_position[1] = random_positions[2][1]
-    goal_position[2] = random_positions[2][2]
-    tile_map[GOAL, random_positions[2]] = true
+    env.goal_position = new_goal_position
+    tile_map[GOAL, new_goal_position] = true
 
-    reward[] = zero(R)
-    done[] = false
+    env.reward = zero(R)
+    env.done = false
 
     return nothing
 end
 
-function (env::SingleRoomUndirected{I, R})(action) where {I, R}
+function (env::SingleRoomUndirected{R})(action) where {R}
     tile_map = env.tile_map
     agent_position = env.agent_position
-    goal_position = env.goal_position
-    reward = env.reward
-    done = env.done
-    rng = env.rng
-    terminal_reward = env.terminal_reward
 
-    current_position_i = agent_position[1]
-    current_position_j = agent_position[2]
-    next_position_i, next_position_j = move(action, current_position_i, current_position_j)
+    new_agent_position = CartesianIndex(move(action, agent_position.I...))
 
-    if !tile_map[WALL, next_position_i, next_position_j]
-        tile_map[AGENT, current_position_i, current_position_j] = false
-        agent_position[1] = next_position_i
-        agent_position[2] = next_position_j
-        tile_map[AGENT, next_position_i, next_position_j] = true
+    if !tile_map[WALL, new_agent_position]  # a wall blocks the move; the agent then stays put
+        tile_map[AGENT, agent_position] = false
+        env.agent_position = new_agent_position
+        tile_map[AGENT, new_agent_position] = true
     end
 
-    if tile_map[GOAL, agent_position...]
-        reward[] = terminal_reward
-        done[] = true
+    if tile_map[GOAL, env.agent_position]  # re-read the field: it may have changed above
+        env.reward = env.terminal_reward
+        env.done = true  # must assign the field; a bare `done = true` only binds a local
     else
-        reward[] = zero(R)
-        done[] = false
+        env.reward = zero(R)
+        env.done = false  # must assign the field; a bare `done = false` only binds a local
     end
 
     return nothing
@@ -165,8 +142,6 @@ end
 
 function Base.show(io::IO, ::MIME"text/plain", env::SingleRoomUndirected)
     tile_map = env.tile_map
-    reward = env.reward
-    done = env.done
 
     num_objects, height, width = size(tile_map)
 
@@ -195,8 +170,8 @@ function Base.show(io::IO, ::MIME"text/plain", env::SingleRoomUndirected)
         println(io)
     end
 
-    println(io, "reward = ", reward[])
-    println(io, "done = ", done[])
+    println(io, "reward = ", env.reward)
+    println(io, "done = ", env.done)
 
     return nothing
 end

From 6d114c66ee57fec610b539d3a2e5d146b99dfaa0 Mon Sep 17 00:00:00 2001
From: Siddharth Bhatia <sidb60200@gmail.com>
Date: Thu, 24 Jun 2021 16:05:09 +0530
Subject: [PATCH 28/28] remove constants NUM_RESETS, STEPS_PER_RESET,
 NUM_ENVS

---
 benchmark/benchmark_utils.jl | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/benchmark/benchmark_utils.jl b/benchmark/benchmark_utils.jl
index 5b03da2..8e21b2d 100644
--- a/benchmark/benchmark_utils.jl
+++ b/benchmark/benchmark_utils.jl
@@ -5,10 +5,6 @@ import GridWorlds as GW
 import ReinforcementLearningBase as RLBase
 import Statistics
 
-const STEPS_PER_RESET = 100
-const NUM_RESETS = 100
-const NUM_ENVS = 64
-
 ENVS = [GW.ModuleSingleRoomUndirected.SingleRoomUndirected]
 BATCH_ENVS = [GW.ModuleSingleRoomUndirectedBatch.SingleRoomUndirectedBatch]
 
@@ -95,7 +91,7 @@ function benchmark_batch_env(Env, num_resets, steps_per_reset, num_envs)
 
     for action in RLBase.action_space(env)
         action_name = parent_module.ACTION_NAMES[action]
-        batch_action = fill(action, NUM_ENVS)
+        batch_action = fill(action, num_envs)
         benchmark[action_name] = BT.@benchmark $(Ref(env))[]($(Ref(batch_action))[])
     end