
Implementing a tic-tac-toe mini-game in Python (trained with Monte Carlo tree search)

Posted: 2020-07-20 11:13:47


If you need the full source code, please like, follow and bookmark, then leave a comment or send me a private message.

Monte Carlo tree search (MCTS) is the collective name for a family of algorithms suited to zero-sum games in deterministic environments. Here we use MCTS to train a tic-tac-toe player.
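At the heart of MCTS selection is the UCB (upper confidence bound) rule: each child of a node is scored by its average rollout reward plus an exploration bonus, and the highest-scoring child is followed. The sketch below only illustrates that formula; the function name ucb_score is made up for this example, while the full listing later computes the same quantity inside best_child.

import math

def ucb_score(child_quality, child_visits, parent_visits, c=1 / math.sqrt(2.0)):
    # Exploitation: average rollout reward of this child so far
    exploitation = child_quality / child_visits
    # Exploration: large when the child has been visited rarely relative to its parent
    exploration = c * math.sqrt(2.0 * math.log(parent_visits) / child_visits)
    return exploitation + exploration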

For training, the simulation count per move is set on the order of 2000, i.e., from each state the search plays rollouts all the way to a terminal board. A rollout that reaches a winning leaf backs up a reward of 1; in the listing below a losing leaf backs up -0.5, a draw backs up 0, and the computation budget is actually set to 4000 simulations.
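Each move of the program is therefore one full search made of the classic four MCTS phases: selection, expansion, simulation, and backpropagation. The outline below is a schematic only; it reuses the tree_policy, default_policy, backup and best_child functions defined in the listing that follows, and the budget argument stands in for the simulation count mentioned above.

def run_one_search(root, budget=2000):
    # Repeat the four MCTS phases until the simulation budget is used up
    for _ in range(budget):
        leaf = tree_policy(root)       # selection + expansion
        reward = default_policy(leaf)  # simulation: random rollout to a terminal board
        backup(leaf, reward)           # backpropagation of the rollout reward to the root
    # Finally pick the child with the best average value (no exploration bonus)
    return best_child(root, False)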


Part of the code is given below.

# 深度强化学习——原理、算法与PyTorch实战, 代码名称: 代31-例8.6-基于蒙特卡洛树的井字棋实例.py
import numpy as np
import sys
import math
import random


# Initial (empty) environment
class environment():
    def __init__(self):
        self.start_env = np.array([[0] * 3] * 3)


class State(object):
    def __init__(self):
        self.current_env = [[]]
        self.current_value = 0
        self.current_round_index = 0
        self.cumulative_choices = [[]]
        self.available_choice = [[]]

    # Whether the game has ended (board full, or three in a row for either player)
    def is_end(self):
        tiaojian = True
        for i in range(0, 3):
            for j in range(0, 3):
                if self.current_env[i][j] == 0:
                    tiaojian = False
        for i in range(0, 3):
            if (np.array(self.current_env)[i] == np.array([1, 1, 1])).all() or \
                    (np.array(self.current_env)[i] == np.array([2, 2, 2])).all():
                tiaojian = True
        if (np.array(self.current_env)[:, 0] == np.array([1, 1, 1])).all() or \
                (np.array(self.current_env)[:, 0] == np.array([2, 2, 2])).all() or \
                (np.array(self.current_env)[:, 1] == np.array([1, 1, 1])).all() or \
                (np.array(self.current_env)[:, 1] == np.array([2, 2, 2])).all() or \
                (np.array(self.current_env)[:, 2] == np.array([1, 1, 1])).all() or \
                (np.array(self.current_env)[:, 2] == np.array([2, 2, 2])).all():
            tiaojian = True
        elif np.array(self.current_env)[0, 0] == np.array(self.current_env)[1, 1] == np.array(self.current_env)[2, 2] != 0:
            tiaojian = True
        elif np.array(self.current_env)[0, 2] == np.array(self.current_env)[1, 1] == np.array(self.current_env)[2, 0] != 0:
            tiaojian = True
        return tiaojian

    # Whether the program (player 1) has won
    def i_win(self):
        tiaojian = False
        for i in range(0, 3):
            if (np.array(self.current_env)[i] == np.array([1, 1, 1])).all():
                tiaojian = True
        if (np.array(self.current_env)[:, 0] == np.array([1, 1, 1])).all() or \
                (np.array(self.current_env)[:, 1] == np.array([1, 1, 1])).all() or \
                (np.array(self.current_env)[:, 2] == np.array([1, 1, 1])).all():
            tiaojian = True
        if np.array(self.current_env)[0, 0] == np.array(self.current_env)[1, 1] == np.array(self.current_env)[2, 2] == 1:
            tiaojian = True
        if np.array(self.current_env)[0, 2] == np.array(self.current_env)[1, 1] == np.array(self.current_env)[2, 0] == 1:
            tiaojian = True
        return tiaojian

    # Whether the program (player 1) has lost
    def i_lose(self):
        tiaojian = False
        for i in range(0, 3):
            if (np.array(self.current_env)[i] == np.array([2, 2, 2])).all():
                tiaojian = True
        if (np.array(self.current_env)[:, 0] == np.array([2, 2, 2])).all() or \
                (np.array(self.current_env)[:, 1] == np.array([2, 2, 2])).all() or \
                (np.array(self.current_env)[:, 2] == np.array([2, 2, 2])).all():
            tiaojian = True
        if np.array(self.current_env)[0, 0] == np.array(self.current_env)[1, 1] == np.array(self.current_env)[2, 2] == 2:
            tiaojian = True
        if np.array(self.current_env)[0, 2] == np.array(self.current_env)[1, 1] == np.array(self.current_env)[2, 0] == 2:
            tiaojian = True
        return tiaojian

    # Set / get the available actions
    def set_available_choice(self, choice):
        self.available_choice = choice

    def get_available_choice(self):
        return self.available_choice

    # Set / get the current board
    def get_current_env(self):
        return self.current_env

    def set_current_env(self, env):
        self.current_env = env

    # Set / get the accumulated reward
    def get_current_value(self):
        return self.current_value

    def set_current_value(self, value):
        self.current_value = value

    def get_current_round_index(self):
        return self.current_round_index

    def set_current_round_index(self, turn):
        self.current_round_index = turn

    # Set / get the accumulated choices
    def get_cumulative_choices(self):
        return self.cumulative_choices

    def set_cumulative_choices(self, choices):
        self.cumulative_choices = choices

    # Whether the state is terminal
    def is_terminal(self):
        return self.is_end()

    # Reward of this state
    def compute_reward(self):
        return self.current_value

    # Apply a random available action and return the resulting state
    def get_next_state_with_random_choice(self):
        a = np.array([[0] * 3] * 3)
        b = [0] * len(self.available_choice)
        random_choice = random.choice([choice for choice in self.available_choice])
        next_state = State()
        next_state.set_current_round_index(self.current_round_index + 1)
        next_state.set_cumulative_choices(self.cumulative_choices + [random_choice])
        for i in range(0, len(self.available_choice)):
            b[i] = self.available_choice[i]
        next_state.available_choice = b
        next_state.available_choice.remove(random_choice)
        # Even rounds belong to the program (mark 1), odd rounds to the opponent (mark 2)
        if next_state.current_round_index != 0 and next_state.current_round_index % 2 == 0:
            for i in range(0, 3):
                for j in range(0, 3):
                    a[i][j] = self.current_env[i][j]
            a[random_choice[0]][random_choice[1]] = 1
            next_state.set_current_env(a)
        if next_state.current_round_index != 0 and next_state.current_round_index % 2 == 1:
            for i in range(0, 3):
                for j in range(0, 3):
                    a[i][j] = self.current_env[i][j]
            a[random_choice[0]][random_choice[1]] = 2
            next_state.set_current_env(a)
        # Terminal values: win 1, loss -0.5, otherwise 0
        if next_state.i_win():
            next_state.set_current_value(1)
        if next_state.i_lose():
            next_state.set_current_value(-0.5)
        if next_state.i_lose() != True and next_state.i_win() != True:
            next_state.set_current_value(0)
        return next_state

    def __repr__(self):
        return "State: {}, value: {}, choices: {}".format(hash(self), self.current_value, self.available_choice)


# Search-tree node
class Node(object):
    def __init__(self):
        self.env = [[]]
        self.parent = None
        self.children = []
        self.visit_times = 0
        self.quality_value = 0.0
        self.state = None

    # Number of empty squares, i.e. the number of possible child nodes
    def avanum(self):
        num = 0
        a = self.get_state().current_env
        for i in range(0, 3):
            for j in range(0, 3):
                if a[i][j] == 0:
                    num += 1
        return num

    def set_state(self, state):
        self.state = state

    def get_state(self):
        return self.state

    def get_parent(self):
        return self.parent

    def set_parent(self, parent):
        self.parent = parent

    def get_children(self):
        return self.children

    def get_visit_times(self):
        return self.visit_times

    def set_visit_times(self, times):
        self.visit_times = times

    def visit_times_add_one(self):
        self.visit_times += 1

    def get_quality_value(self):
        return self.quality_value

    def set_quality_value(self, value):
        self.quality_value = value

    def quality_value_add_n(self, n):
        self.quality_value += n

    def is_all_expand(self):
        return len(self.children) == self.avanum()

    def add_child(self, sub_node):
        sub_node.set_parent(self)
        self.children.append(sub_node)

    def __repr__(self):
        return "Node: {}, Q/N: {}/{}, state: {}".format(hash(self), self.quality_value, self.visit_times, self.state)


# *************************************
# Tree policy: descend with UCB until a node can be expanded or a terminal node is reached
def tree_policy(node):
    while node.get_state().is_terminal() == False:
        if node.is_all_expand():
            node = best_child(node, True)
        else:
            # Return the new sub node
            sub_node = expand(node)
            return sub_node
    # Return the leaf node
    return node


# Default policy: random rollout from the node to a terminal state
def default_policy(node):
    # Get the state of the game
    current_state = node.get_state()
    # Run until the game is over
    while current_state.is_terminal() == False:
        # Pick one random action to play and get the next state
        current_state = current_state.get_next_state_with_random_choice()
    final_state_reward = current_state.compute_reward()
    return final_state_reward


# Expansion: add a child whose board differs from all existing children
def expand(node):
    tried_sub_node_states = [sub_node.get_state().current_env for sub_node in node.get_children()]
    # Sample until we get a state whose action differs from the tried ones
    noin = False
    while noin == False:
        noin = True
        new_state = node.get_state().get_next_state_with_random_choice()
        for i in range(0, len(tried_sub_node_states)):
            if (new_state.current_env == tried_sub_node_states[i]).all():
                noin = False
    sub_node = Node()
    sub_node.set_state(new_state)
    node.add_child(sub_node)
    return sub_node


def best_child(node, is_exploration):
    best_score = -sys.maxsize
    best_sub_node = None
    # Travel all sub nodes to find the best one
    for sub_node in node.get_children():
        # Ignore exploration for inference
        if is_exploration:
            C = 1 / math.sqrt(2.0)
        else:
            C = 0.0
        # UCB = quality / times + C * sqrt(2 * ln(total_times) / times)
        left = sub_node.get_quality_value() / sub_node.get_visit_times()
        right = 2.0 * math.log(node.get_visit_times()) / sub_node.get_visit_times()
        score = left + C * math.sqrt(right)
        if score > best_score:
            best_sub_node = sub_node
            best_score = score
    return best_sub_node


# Backpropagation
def backup(node, reward):
    # Update until the root node
    while node != None:
        # Update the visit times
        node.visit_times_add_one()
        # Update the quality value
        node.quality_value_add_n(reward)
        # Move to the parent node
        node = node.parent


# Monte Carlo tree search for one move
def monte_carlo_tree_search(node):
    computation_budget = 4000
    # Run as many simulations as the computation budget allows
    for i in range(computation_budget):
        # 1. Find the best node to expand
        expand_node = tree_policy(node)
        # 2. Random rollout to get a reward
        reward = default_policy(expand_node)
        # 3. Update all nodes on the path with the reward
        backup(expand_node, reward)
    # 4. Get the best next node
    best_next_node = best_child(node, False)
    a = [[sub_node.quality_value, sub_node.get_state().current_env] for sub_node in node.get_children()]
    print(a)
    return best_next_node


# *************************************
def main():
    # Create the initial state and root node
    init_state = State()
    init_state.set_current_env(np.array([[0] * 3] * 3))
    init_state.set_current_round_index(1)
    init_state.set_available_choice([[0, 0], [0, 1], [0, 2], [1, 0], [1, 1], [1, 2], [2, 0], [2, 1], [2, 2]])
    init_node = Node()
    init_node.state = init_state
    init_env = environment()
    current_node = init_node
    # Alternate moves until the game ends
    d = 0
    while (current_node.get_state().is_terminal() != True):
        if d % 2 == 0:
            print("Play round: {}".format(d + 1))
            print("Hello, here is my move. Come and play against me!")
            current_node = monte_carlo_tree_search(current_node)
            print(current_node.get_state().current_env)
        else:
            new = Node()
            bb = State()
            new.set_state(bb)
            print("Your turn, please enter the board")
            n = 3
            a = [[0] * n] * n
            for i in range(n):
                a[i] = input().split(" ")
            for i in range(0, 3):
                for j in range(0, 3):
                    a[i][j] = int(a[i][j])
            # Rebuild the search state from the board the human just entered
            bb.set_current_env(np.array(a))
            bb.set_available_choice([[i, j] for i in range(3) for j in range(3) if a[i][j] == 0])
            bb.current_round_index = current_node.get_state().current_round_index + 1
            current_node = new
        d += 1
    if current_node.get_state().i_win():
        print("I win! Better luck next time")
    if current_node.get_state().i_lose():
        print("I lost, time to turn up my strength")
    if current_node.get_state().i_win() != True and current_node.get_state().i_lose() != True:
        print("It's a draw, not bad at all")


if __name__ == "__main__":
    main()
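When it is your turn, the script expects you to type the whole 3x3 board back in: three lines, each with three digits separated by a single space, where 0 is an empty square, 1 is one of the program's marks and 2 is one of yours, including the move you have just made. A hypothetical exchange (the program's opening move shown here is only illustrative) might look like this:

Your turn, please enter the board
0 0 1
0 2 0
0 0 0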
