# An implementation of Q-Learning as applied to the house example in
#   note 8; calculations done with ints to keep down the detail

# Every room in the house, in the order used for table rows and columns.
ROOMS = %i[a b c d e f].freeze

# Reward table, indexed as REWARD[source][destination].
#   nil     - there is no legal move from source to destination
#   Integer - the move is legal and earns that reward (100 for any move
#             that ends in :f, 0 for every other legal move)
# Staying in place is legal for every room. The table and each of its rows
# are frozen so the rules cannot be changed by accident while learning runs.
REWARD = begin
           # Legal moves to a *different* room (plus the rewarded :f -> :f stay).
           moves = {
             a: { e: 0 },
             b: { d: 0, f: 100 },
             c: { d: 0 },
             d: { b: 0, c: 0, e: 0 },
             e: { a: 0, d: 0, f: 100 },
             f: { b: 0, e: 0, f: 100 }
           }
           table = ROOMS.to_h do |src|
             # Start with "stay put" worth 0 and everything else illegal, then
             # overlay the explicit moves (which also raises :f -> :f to 100).
             row = ROOMS.to_h { |dest| [dest, src == dest ? 0 : nil] }
             [src, row.merge(moves.fetch(src)).freeze]
           end
           table.freeze
         end

# Learning rate: the fraction of the newly observed error folded into a
# Q-value on each update.
ALPHA = 0.3
# Discount factor applied to the best Q-value available from the destination.
DISCOUNT = 0.8

# Table of learned Q-values for the house example: one Integer per
# (source room, destination room) pair, all starting at 0.
#
# Reads the file-level constants ROOMS, REWARD, ALPHA and DISCOUNT.
class HouseMatrix
  # Builds the table with every entry set to 0.
  def initialize
    @q = ROOMS.to_h { |src| [src, ROOMS.to_h { |dest| [dest, 0] }] }
  end

  # Prints the raw Q-values as a grid: one row per source room, one column
  # per destination room.
  #
  # @param title [String] heading printed above the grid
  def dump(title)
    puts "\n#{title}"
    puts "     #{ROOMS.join('   ')}"
    ROOMS.each do |src|
      cells = ROOMS.map { |dest| format('%3d', @q[src][dest]) }
      puts "#{src}: #{cells.join(' ')}"
    end
  end

  # Prints the Q-values divided by the largest entry in the table, so the
  # best-valued move shows as 1.000.
  #
  # @param title [String] heading; " (normalized)" is appended to it
  def dump_normalized(title)
    puts "\n#{title} (normalized)"
    puts "     #{ROOMS.join('      ')}"
    scale = ROOMS.map { |room| @q[room].values.max }.max.to_f
    # Before any learning the largest entry is 0 and 0 / 0.0 is NaN; divide
    # by 1 instead so an untrained table prints as all zeros.
    scale = 1.0 if scale.zero?
    ROOMS.each do |src|
      cells = ROOMS.map { |dest| format('%5.3f', @q[src][dest] / scale) }
      puts "#{src}: #{cells.join('  ')}"
    end
  end

  # Applies one Q-learning step for the move source_room -> dest_room:
  #
  #   q <- q + ALPHA * (reward + DISCOUNT * best q from dest_room - q)
  #
  # with the result rounded to an Integer. Moves that REWARD marks as illegal
  # (nil) leave the table untouched.
  #
  # @param source_room [Symbol] room being left; must be one of ROOMS
  # @param dest_room [Symbol] room being entered; must be one of ROOMS
  # @return [Integer, nil] the new Q-value, or nil when the move is illegal
  def update(source_room, dest_room)
    reward = REWARD[source_room][dest_room]
    # Only nil means "illegal": a reward of 0 is still truthy in Ruby.
    return unless reward

    current = @q[source_room][dest_room]
    best_from_dest = ROOMS.map { |next_room| @q[dest_room][next_room] }.max
    @q[source_room][dest_room] =
      (current + ALPHA * (reward + DISCOUNT * best_from_dest - current)).round
  end

  # Attempts an update for a uniformly random (source, destination) pair.
  # Illegal pairs are drawn too; for those the call changes nothing.
  #
  # @return [Integer, nil] whatever #update returns for the sampled pair
  def update_random
    update(ROOMS.sample, ROOMS.sample)
  end
end

# Demo driver: replays the hand-picked updates shown on the slides, printing
# the table after each group, then lets random updates run and prints
# snapshots along the way.
q = HouseMatrix.new

q.dump 'initial matrix'

# First slide: every move out of b, e and f that is tried on the slide.
[%i[b b], %i[b d], %i[b f], %i[e e], %i[e d], %i[e f], %i[f f]].each do |src, dest|
  q.update(src, dest)
end

q.dump 'slide 27 - after moving to room f from adjacent rooms'

# Second slide: all moves out of d.
%i[c d b e].each { |dest| q.update(:d, dest) }

q.dump 'slide 28 - after considering all moves from d'

# Third slide: revisit b now that d has learned values.
%i[d f].each { |dest| q.update(:b, dest) }

q.dump 'slide 29 - b to d, f'

# Random phase. Each step draws a random room pair, so the counts below are
# attempts: pairs that are not legal moves leave the table unchanged.
100.times { q.update_random }

q.dump 'After 100 updates'
q.dump_normalized 'After 100 updates'

# 900 further attempts bring the total to 1000.
900.times { q.update_random }

q.dump 'After 900 more updates'

q.dump_normalized 'After 900 more updates (1000 total)'

# Two more rounds of 1000 attempts, with a normalized snapshot after each.
['After 2000 total updates', 'After 3000 total updates'].each do |title|
  1000.times { q.update_random }

  q.dump_normalized title
end