基于“蘑菇书”的强化学习知识点（十）：第二章的代码：value_iteration.ipynb及其涉及的其他代码的更新以及注解（gym版本＞= 0.26）（一）

第二章的代码：value_iteration.ipynb及其涉及的其他代码的更新以及注解（gym版本＞= 0.26）（一）

- 摘要
摘要

本系列基于蘑菇书 EasyRL 的内容，对其中的疑难知识点进行详细的讲解与分析。具体内容请阅读蘑菇书 EasyRL！
#!/usr/bin/env python

# simple_grid.py
# based on frozen_lake.py
# adapted by Frans Oliehoek.
# 
import sys
from contextlib import closing

import numpy as np
from io import StringIO
#from six import StringIO, b
import gym
from gym import utils
from gym import Env, spaces
from gym.utils import seeding

# 随机采样函数
def categorical_sample(prob_n, np_random):
    """Draw one category index from a discrete probability distribution.

    Used in this file both to pick the initial state and to pick which of the
    possible transitions of a (state, action) pair actually happens.

    Args:
        prob_n: Sequence (list, tuple, array, ...) of per-category
            probabilities, e.g. ``[0.2, 0.5, 0.3]``.
        np_random: Random source exposing a ``random()`` method; its result is
            used as one uniform draw (presumably from ``[0.0, 1.0)``).

    Returns:
        The index of the first category whose cumulative probability is
        strictly greater than the draw.
    """
    # Running totals turn the probabilities into interval upper bounds,
    # e.g. [0.2, 0.5, 0.3] -> [0.2, 0.7, 1.0], so category 1 owns [0.2, 0.7).
    upper_bounds = np.cumsum(np.asarray(prob_n))
    draw = np_random.random()
    # For a draw of 0.65 the mask is [False, True, True]; argmax returns the
    # position of the first True, i.e. category 1.
    exceeds_draw = upper_bounds > draw
    return exceeds_draw.argmax()


class DiscreteEnv(Env):
    """Table-driven Gym environment with finite state and action sets.

    The dynamics are fully described by a transition table ``P`` and an
    initial state distribution ``isd``; ``reset()`` and ``step()`` simply
    sample from them.

    Constructor arguments:
        nS: number of states
        nA: number of actions
        P: transition table (*)
        isd: initial state distribution (**)

    (*) dictionary of lists, where
      P[s][a] == [(probability, nextstate, reward, done), ...]
    (**) list or array of length nS

    Example entry of ``P``::

        P[0][1] = [(0.8, 4, 0, False), (0.1, 1, 0, False), (0.1, 0, 0, False)]

    i.e. from state 0 under action 1 the agent reaches state 4 with
    probability 0.8, state 1 with 0.1 and stays in 0 with 0.1, always with
    reward 0 and without ending the episode.

    NOTE(review): ``reset()`` returns a bare int and ``step()`` returns a
    4-tuple; confirm callers do not expect the ``(obs, info)`` /
    5-tuple convention of newer Gym releases.
    """

    def __init__(self, nS, nA, P, isd):
        # Sizes of the state and action sets (e.g. 16 states for a 4x4 grid,
        # 4 actions for LEFT/DOWN/RIGHT/UP).
        self.nS = nS
        self.nA = nA

        # Dynamics of the MDP and the distribution the first state is drawn from.
        self.P = P
        self.isd = isd

        # Most recent action, kept only so that rendering can display it.
        self.lastaction = None

        # Discrete spaces covering 0..nA-1 and 0..nS-1.
        self.action_space = spaces.Discrete(self.nA)
        self.observation_space = spaces.Discrete(self.nS)

        # Create the random generator first: sampling the start state needs it.
        self.seed()
        self.s = categorical_sample(self.isd, self.np_random)

    def seed(self, seed=None):
        """Install a fresh random generator and return ``[seed]``.

        Args:
            seed: Optional seed forwarded to ``seeding.np_random``.

        Returns:
            A one-element list holding the seed reported by
            ``seeding.np_random``.
        """
        generator, used_seed = seeding.np_random(seed)
        self.np_random = generator
        return [used_seed]

    def reset(self):
        """Start a new episode and return the initial state as an ``int``.

        The state is re-sampled from ``isd`` and the remembered last action is
        cleared.
        """
        self.lastaction = None
        self.s = categorical_sample(self.isd, self.np_random)
        return int(self.s)

    def step(self, a):
        """Apply action ``a`` in the current state and sample one outcome.

        Args:
            a: Action index used as key into ``P[self.s]``.

        Returns:
            ``(next_state, reward, done, info)`` where ``info`` is
            ``{"prob": p}`` with ``p`` the probability of the sampled
            transition.
        """
        # Every outcome is a (probability, next_state, reward, done) tuple.
        outcomes = self.P[self.s][a]
        weights = [outcome[0] for outcome in outcomes]
        chosen = categorical_sample(weights, self.np_random)
        prob, next_state, reward, done = outcomes[chosen]

        self.s = next_state
        self.lastaction = a  # remembered for render()
        return (int(next_state), reward, done, {"prob": prob})

    # Layout of the transition table P:
    #
    #     P = {
    #         0: {
    #             0: [(0.8, 0, -1, False), (0.1, 1, -1, False), (0.1, 4, -1, False)],
    #             1: [(0.7, 4, -1, False), (0.3, 0, -1, False)],
    #             ...
    #         },
    #         1: {...},
    #         ...
    #     }
    #
    # - prob: transition probability; the entries of one P[s][a] list are
    #   expected to sum to 1.
    # - next_state: state reached by this outcome.
    # - reward: immediate reward of this outcome.
    # - done: whether this outcome ends the episode.
    # One action may list several outcomes, which is how stochastic moves are
    # expressed.

# Action indices. Consecutive indices are perpendicular directions, which the
# environment relies on when it computes sideways drift as (a - 1) % 4 and
# (a + 1) % 4.
LEFT, DOWN, RIGHT, UP = range(4)

# Predefined maps, one string per grid row.
#   S: start, .: normal pavement, H: pothole, G: goal
MAPS = {
    "theAlley": ["S...H...H...G"],
    "walkInThePark": [
        "S.......",
        ".....H..",
        "........",
        "......H.",
        "........",
        "...H...G",
    ],
    # Placeholder entry: contains no rows.
    "1Dtest": [],
    "4x4": [
        "S...",
        ".H.H",
        "...H",
        "H..G",
    ],
    "8x8": [
        "S.......",
        "........",
        "...H....",
        ".....H..",
        "...H....",
        ".HH...H.",
        ".H..H.H.",
        "...H...G",
    ],
}

# Probability of tripping when an action is taken while standing on a pothole.
POTHOLE_PROB = 0.2
# Reward received when the agent trips in a pothole (the episode then ends).
BROKEN_LEG_PENALTY = -5
# Reward for every move that does not land on the goal.
SLEEP_DEPRIVATION_PENALTY = -0.0
# Reward for a move that lands on the goal.
REWARD = 10

def generate_random_map(size=8, p=0.8):
    """Generate a random square map that has a path from start to goal.

    Candidate maps are drawn with ``np.random.choice`` until one passes a
    depth-first reachability check. The start ``S`` is always the top-left
    cell and the goal ``G`` the bottom-right cell.

    Args:
        size: Side length of the grid (the map has ``size x size`` cells).
        p: Probability that a cell is walkable pavement (``'.'``); ``1 - p``
            is the probability of a pothole (``'H'``). Values above 1 are
            clamped to 1.

    Returns:
        A list of ``size`` strings of length ``size``, e.g.
        ``["S..", ".H.", "..G"]``.

    Raises:
        ValueError: If ``size < 2`` or ``p <= 0``. No map drawn with such
            arguments can ever pass the reachability check, so sampling would
            otherwise loop forever.
    """
    if size < 2:
        raise ValueError("size must be at least 2, got {!r}".format(size))
    # Clamp once; the value does not change between attempts.
    p = min(1, p)
    if not p > 0:
        raise ValueError("p must be greater than 0, got {!r}".format(p))

    def is_valid(res):
        """Return True if 'G' is reachable from (0, 0) without crossing 'H' or '#'."""
        frontier, discovered = [(0, 0)], set()
        while frontier:
            r, c = frontier.pop()  # stack => depth-first order
            if (r, c) in discovered:
                continue
            discovered.add((r, c))
            # down, right, up, left
            for dr, dc in ((1, 0), (0, 1), (-1, 0), (0, -1)):
                r_new, c_new = r + dr, c + dc
                if not (0 <= r_new < size and 0 <= c_new < size):
                    continue
                if res[r_new][c_new] == 'G':
                    return True
                if res[r_new][c_new] not in '#H':
                    frontier.append((r_new, c_new))
        return False

    while True:
        res = np.random.choice(['.', 'H'], (size, size), p=[p, 1 - p])
        res[0][0] = 'S'
        res[-1][-1] = 'G'
        if is_valid(res):
            return ["".join(x) for x in res]


class DrunkenWalkEnv(DiscreteEnv):
    """Grid world in which a drunk agent walks from the party (S) to bed (G).

    A simple grid environment, completely based on the code of 'FrozenLake',
    credits to the original authors.

    You're finding your way home (G) after a great party which was happening
    at (S). Unfortunately, due to recreational intoxication you find yourself
    only moving into the intended direction 80% of the time, and perpendicular
    to that the other 20% (10% to each side).

    To make matters worse, the local community has been cutting the budgets
    for pavement maintenance, which means that the way home is full of
    potholes, which are very likely to make you trip.

        S...
        .H.H
        ...H
        H..G

    S : starting point
    . : normal pavement
    H : pothole, you have a POTHOLE_PROB chance of tripping
    G : goal, time for bed

    Dynamics as implemented by the transition table built in ``__init__``:

    * On ``S`` or ``.``: 0.8 intended direction, 0.1 for each perpendicular
      direction. Moves that would leave the grid keep the agent in place.
    * On ``H``: with probability POTHOLE_PROB the agent trips, i.e. it stays
      on the cell, receives BROKEN_LEG_PENALTY and the episode ends; with
      probability 1 - POTHOLE_PROB it moves in the intended direction (no
      sideways drift in this case).
    * On ``G``: every action is an absorbing self-loop with reward 0.

    A move that lands on ``G`` yields REWARD and ends the episode; every other
    move yields SLEEP_DEPRIVATION_PENALTY.

    The class inherits ``step()`` and ``reset()`` from ``DiscreteEnv`` and
    only has to supply ``nS``, ``nA``, ``P`` and ``isd``.
    """

    # Render modes accepted by render(): 'human' writes to stdout,
    # 'ansi' returns the rendering as a string.
    metadata = {'render.modes': ['human', 'ansi']}

    def __init__(self, desc=None, map_name="4x4",is_slippery=True):
        """Build the map and all transition probabilities.

        The constructed ``nS``, ``nA``, ``P`` and ``isd`` are passed on to
        ``DiscreteEnv``.

        Args:
            desc: Optional map given as a list of equally long strings.
            map_name: Key into ``MAPS`` used when ``desc`` is None. If both
                ``desc`` and ``map_name`` are None a random map is generated.
            is_slippery: Not used by this implementation; the walk is always
                stochastic.
        """
        if desc is None and map_name is None:
            desc = generate_random_map()
        elif desc is None:
            desc = MAPS[map_name]

        # dtype='c' turns the list of strings into a 2-D array of single
        # bytes, so a cell can be read as desc[row, col] and compared against
        # byte literals such as b'G'.
        self.desc = desc = np.asarray(desc, dtype='c')
        self.nrow, self.ncol = nrow, ncol = desc.shape

        # Bounds of every reward that appears in the transition table below
        # (the 0 is the reward of the absorbing goal self-loop).
        possible_rewards = (REWARD, SLEEP_DEPRIVATION_PENALTY, BROKEN_LEG_PENALTY, 0)
        self.reward_range = (min(possible_rewards), max(possible_rewards))

        nA = 4             # LEFT, DOWN, RIGHT, UP
        nS = nrow * ncol   # one state per grid cell

        # Initial state distribution: uniform over all cells marked 'S'
        # (probability 1 on the start cell when there is exactly one).
        isd = np.array(desc == b'S').astype('float64').ravel()
        isd /= isd.sum()

        # We need to pass 'P' to DiscreteEnv:
        # P dictionary dict of dicts of lists, where
        # P[s][a] == [(probability, nextstate, reward, done), ...]
        P = {s: {a: [] for a in range(nA)} for s in range(nS)}

        def convert_rc_to_s(row, col):
            """Map grid coordinates to the flat state index (row-major)."""
            return row * ncol + col

        def intended_destination(row, col, a):
            """Return the cell reached by moving in direction ``a``.

            Moves are clamped at the border, e.g. LEFT from (0, 0) stays at
            (0, 0).
            """
            if a == LEFT:
                col = max(col - 1, 0)
            elif a == DOWN:
                row = min(row + 1, nrow - 1)
            elif a == RIGHT:
                col = min(col + 1, ncol - 1)
            elif a == UP:
                row = max(row - 1, 0)
            return (row, col)

        def construct_transition_for_intended(row, col, a, prob, li):
            """Append the transition towards ``intended_destination(row, col, a)``.

            ``li`` is the transition list being filled, which may belong to a
            different action ``b`` (sideways drift).
            """
            newrow, newcol = intended_destination(row, col, a)
            newstate = convert_rc_to_s(newrow, newcol)
            reached_goal = bytes(desc[newrow, newcol]) == b'G'
            rew = REWARD if reached_goal else SLEEP_DEPRIVATION_PENALTY
            li.append((prob, newstate, rew, reached_goal))

        # THIS IS WHERE THE MATRIX OF TRANSITION PROBABILITIES IS COMPUTED.
        for row in range(nrow):
            for col in range(ncol):
                # specify transitions for s=(row, col)
                s = convert_rc_to_s(row, col)
                letter = bytes(desc[row, col])
                for a in range(nA):
                    # specify transitions for action a
                    li = P[s][a]
                    if letter == b'G':
                        # We are at the goal ('G').
                        # Gym never uses transitions *from* the goal, because
                        # 'done' is already returned when moving TO it. For
                        # value iteration on P, however, it is essential that
                        # the goal is absorbing with 0 reward ever after, so
                        # this must be the ONLY transition of a goal state.
                        li.append((1.0, s, 0, True))
                    elif letter == b'H':
                        # We are at a pothole ('H'): we trip with prob.
                        # POTHOLE_PROB, otherwise we move as intended.
                        li.append((POTHOLE_PROB, s, BROKEN_LEG_PENALTY, True))
                        construct_transition_for_intended(row, col, a, 1.0 - POTHOLE_PROB, li)
                    else:
                        # We are at normal pavement ('.') or the start ('S'):
                        # with prob. 0.8 we move as intended ...
                        construct_transition_for_intended(row, col, a, 0.8, li)
                        # ... and with prob. 0.1 each we drift sideways.
                        for b in [(a - 1) % 4, (a + 1) % 4]:
                            construct_transition_for_intended(row, col, b, 0.1, li)

        # The parent constructor stores nS/nA/P/isd, creates the action and
        # observation spaces, seeds the generator and samples the first state.
        super(DrunkenWalkEnv, self).__init__(nS, nA, P, isd)

    def action_to_string(self, action_index):
        """Return the display name of an action index.

        Index 0 maps to "Left", 1 to "Down", 2 to "Right" and 3 to "Up".
        """
        return ["Left", "Down", "Right", "Up"][action_index]

    def render(self, mode='human'):
        """Render the grid as text with the agent's cell highlighted in red.

        Args:
            mode: ``'ansi'`` collects the output in a string and returns it;
                any other value writes to ``sys.stdout``.

        Returns:
            The rendered string for ``mode='ansi'``, otherwise None.
        """
        to_string = mode == 'ansi'
        outfile = StringIO() if to_string else sys.stdout

        # Recover grid coordinates from the flat state index.
        row, col = self.s // self.ncol, self.s % self.ncol
        desc = [[c.decode('utf-8') for c in line] for line in self.desc.tolist()]
        desc[row][col] = utils.colorize(desc[row][col], "red", highlight=True)

        if self.lastaction is not None:
            outfile.write(" (last action was '{action}')\n".format(action=self.action_to_string(self.lastaction)))
        else:
            outfile.write("\n")
        outfile.write("\n".join(''.join(line) for line in desc) + "\n")

        # Only the private StringIO buffer may be closed; closing on any
        # "non-human" mode would close sys.stdout for unknown mode names.
        if to_string:
            with closing(outfile):
                return outfile.getvalue()
if __name__ == "__main__":
    # Alternative map: DrunkenWalkEnv(map_name="walkInThePark")
    env = DrunkenWalkEnv(map_name="theAlley")
    n_states, n_actions = env.observation_space.n, env.action_space.n


# Worked example for the "4x4" map:
#
#     S...
#     .H.H
#     ...H
#     H..G
#
# 1. Initial setup
#    - The start S is at (0, 0) and the goal G at (3, 3).
#    - The number of states is rows * columns = 16.
#    - The initial state distribution isd puts probability 1 on state index 0,
#      the start cell (0, 0).
# 2. Building the transition table
#    - The cell (1, 1) is a pothole (H). For an action a taken there:
#        - with probability 0.2 the agent trips, stays at (1, 1) and receives
#          the negative reward;
#        - with probability 0.8 it moves in the intended direction
#          (for a = RIGHT the destination is (1, 2)).
#    - On normal pavement (.), e.g. at (0, 2) with a = DOWN:
#        - with probability 0.8 the agent moves to (1, 2);
#        - with probability 0.1 it drifts to (0, 1);
#        - with probability 0.1 it drifts to (0, 3).
# 3. Calling the parent constructor
#    - Once the table is complete, DiscreteEnv.__init__ sets up the action
#      space, the observation space and the random generator, after which
#      step(action) and reset() can be used.
# 4. Rendering
#    - render() prints the map as text, highlights the current state
#      (e.g. state 0 at (0, 0)) and shows the last action, if any.
基于“蘑菇书”的强化学习知识点（十）：第二章的代码：value_iteration.ipynb及其涉及的其他代码的更新以及注解（gym版本 ＞= 0.26）（一）

第二章的代码：value_iteration.ipynb及其涉及的其他代码的更新以及注解（gym版本 ＞= 0.26）（一）

摘要

悦读

基于“蘑菇书”的强化学习知识点（十）：第二章的代码：value_iteration.ipynb及其涉及的其他代码的更新以及注解（gym版本＞= 0.26）（一）

第二章的代码：value_iteration.ipynb及其涉及的其他代码的更新以及注解（gym版本＞= 0.26）（一）