# frozen_string_literal: true

# An implementation of Q-Learning as applied to the house example in note 8;
# calculations done with ints to keep down the detail.

# The rooms of the house; used as both row (source) and column (destination)
# keys of every matrix in this file.
ROOMS = %i[a b c d e f].freeze

# REWARD[src][dest] captures both the value of moving from src to dest
# (100 for any move that ends in :f, 0 otherwise) and whether the move is
# legal at all: a nil entry means there is no such move.
REWARD = begin
  m = ROOMS.to_h { |src| [src, ROOMS.to_h { |dest| [dest, nil] }] }
  # Staying in the same room is always allowed.
  ROOMS.each { |r| m[r][r] = 0 }
  m[:a][:e] = 0
  m[:b][:d] = 0
  m[:b][:f] = 100
  m[:c][:d] = 0
  m[:d][:b] = m[:d][:c] = m[:d][:e] = 0
  m[:e][:a] = m[:e][:d] = 0
  m[:e][:f] = 100
  m[:f][:b] = m[:f][:e] = 0
  m[:f][:f] = 100
  # The table is read-only; freeze the rows and the outer hash.
  m.each_value(&:freeze)
  m.freeze
end

# Step size applied to the correction term in HouseMatrix#update.
ALPHA = 0.3
# Weight given to the best Q-value reachable from the destination room.
DISCOUNT = 0.8

# Q-matrix for the house example: @q[src][dest] holds the current integer
# estimate for moving from room src to room dest. All entries start at 0.
class HouseMatrix
  def initialize
    @q = ROOMS.to_h { |src_room| [src_room, ROOMS.to_h { |dest| [dest, 0] }] }
  end

  # Prints the raw integer Q-matrix to stdout under the given title,
  # one row per source room.
  def dump(title)
    puts "\n#{title}"
    puts " #{ROOMS.join(' ')}"
    ROOMS.each do |src|
      destinations = ROOMS.map { |dest| format('%3d', @q[src][dest]) }
      puts "#{src}: #{destinations.join(' ')}"
    end
  end

  # Prints the Q-matrix to stdout with every entry divided by the largest
  # entry in the matrix.
  #
  # When the largest entry is 0 (e.g. a freshly created matrix) the values
  # are printed unscaled, i.e. as 0.000, instead of dividing by zero and
  # printing NaN.
  def dump_normalized(title)
    puts "\n#{title} (normalized)"
    puts " #{ROOMS.join(' ')}"
    largest = ROOMS.map { |r| @q[r].values.max }.max
    scale = largest.zero? ? 1.0 : largest.to_f
    ROOMS.each do |src|
      destinations = ROOMS.map { |dest| format('%5.3f', @q[src][dest] / scale) }
      puts "#{src}: #{destinations.join(' ')}"
    end
  end

  # Updates @q on the basis of moving from source_room to dest_room.
  # Uses the reward matrix to evaluate legality of moves: an illegal move
  # (nil reward) leaves @q untouched and returns nil. Otherwise the entry is
  # moved a fraction ALPHA of the way towards
  # reward + DISCOUNT * (best Q-value out of dest_room), rounded to an
  # integer, and the new value is returned.
  def update(source_room, dest_room)
    reward = REWARD[source_room][dest_room]
    return unless reward

    current = @q[source_room][dest_room]
    max_from_dest = @q[dest_room].values.max
    @q[source_room][dest_room] =
      (current + ALPHA * (reward + DISCOUNT * max_from_dest - current)).round
  end

  # Applies #update to a randomly chosen (source, destination) pair; pairs
  # that are not legal moves are silently ignored by #update.
  #
  # random: the random number source handed to Array#sample. Defaults to
  # Random (the process-wide generator); pass a seeded Random instance to
  # make a run reproducible.
  def update_random(random: Random)
    update(ROOMS.sample(random: random), ROOMS.sample(random: random))
  end
end

q = HouseMatrix.new
q.dump 'initial matrix'

# first slide
q.update(:b, :b)
q.update(:b, :d)
q.update(:b, :f)
q.update(:e, :e)
q.update(:e, :d)
q.update(:e, :f)
q.update(:f, :f)
q.dump 'slide 27 - after moving to room f from adjacent rooms'

# second slide
q.update(:d, :c)
q.update(:d, :d)
q.update(:d, :b)
q.update(:d, :e)
q.dump 'slide 28 - after considering all moves from d'

# third slide
q.update(:b, :d)
q.update(:b, :f)
q.dump 'slide 29 - b to d, f'

# run 100 times
100.times { q.update_random }
q.dump 'After 100 updates'
q.dump_normalized 'After 100 updates'

# run another 900 times
900.times { q.update_random }
q.dump 'After 900 more updates'
q.dump_normalized 'After 900 more updates (1000 total)'

1000.times { q.update_random }
q.dump_normalized 'After 2000 total updates'

1000.times { q.update_random }
q.dump_normalized 'After 3000 total updates'