HS-RL-Environment/environment.py at develop · Gloriel621/HS-RL-Environment · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
from state import State
from data import Sellers, Gold_Amount

import copy
import numpy as np
from collections import OrderedDict


class Environment:
    """Trading environment with seller (exchange) and buyer (order) actions.

    Actions ``0 .. NUM_SELLERS - 1`` exchange one kind of goods for another
    at a seller; actions ``NUM_SELLERS .. num_actions - 1`` deliver goods to
    a buyer in return for gold.  ``reset()`` must be called before the first
    ``step()`` because the per-episode attributes (``state``, ``reward``,
    ``done``, ...) are only created there.
    """

    NUM_SELLERS = 7      # actions 0 ~ 6
    NUM_BUYERS = 7       # actions 7 ~ 13
    SALE_REWARD = 10     # reward added for every completed buyer order
    TARGET_SOLVED = 35   # episode ends once this many orders are completed
    MAX_STEPS = 500      # episode ends after this many state-changing steps

    def __init__(self):
        self.num_actions = self.NUM_SELLERS + self.NUM_BUYERS
        self.init_state = State()
        self.init_sellers = Sellers
        self.gold_amount = Gold_Amount

    def reset(self):
        """Start a new episode from a deep copy of the initial state.

        Returns:
            The fresh state object (also stored as ``self.state``).
        """
        self.done = False
        self.num_steps = 0
        self.reward = 0
        self.num_solved = 0

        self.state = copy.deepcopy(self.init_state)
        self.infeasible = np.zeros(self.num_actions)

        return self.state

    def _goods_name(self, goods: int):
        """Return the key stored at position ``goods`` of the hand mapping."""
        return list(self.state.hand)[goods]

    def _can_buy(self, buyer, goods_name) -> bool:
        """Tell whether ``buyer`` has an open order for ``goods_name`` that
        the current hand can fully satisfy."""
        if goods_name not in buyer.keys():
            return False
        required = buyer[goods_name]
        return required > 0 and self.state.hand[goods_name] >= required

    # actions 0 ~ 6
    def Sell(self, goods: int, action: int):
        """Exchange the selected goods at seller ``action``.

        Args:
            goods: index into the hand of the goods to give away.
            action: seller index.

        Every seller entry is read as ``(source, target, amount)``.  For each
        entry whose source is the selected goods, as many whole batches of
        ``amount`` as the hand allows are converted into one ``target`` each.
        Nothing happens when the seller has no entry for the goods or the
        hand holds less than one batch.
        """
        goods_name = self._goods_name(goods)

        for source, target, amount in (
                (entry[0], entry[1], entry[2])
                for entry in self.init_sellers[action]):
            if source != goods_name:
                continue

            batches = self.state.hand[goods_name] // amount
            self.state.hand[target] += batches
            self.state.hand[goods_name] -= batches * amount

    # actions 7 ~ 13
    def Buy(self, goods: int, action: int):
        """Deliver the selected goods to buyer ``action`` for gold.

        Args:
            goods: index into the hand of the goods to deliver.
            action: buyer index (already shifted to start at 0).

        The call is a no-op unless the buyer still requests a positive
        amount of the goods and the hand holds at least that amount.  Without
        this guard an already-completed or unaffordable order would still pay
        gold and reward, and could push the hand below zero.
        """
        buyer = self.state.buyers[action]
        goods_name = self._goods_name(goods)

        if not self._can_buy(buyer, goods_name):
            return

        # Gold payouts are indexed by the position of the goods in the
        # buyer's mapping.
        payout_index = list(buyer).index(goods_name)

        self.state.hand["gold"] += self.gold_amount[action][payout_index]
        self.state.hand[goods_name] -= buyer[goods_name]
        buyer[goods_name] = 0  # mark the order as completed
        self.reward += self.SALE_REWARD
        self.num_solved += 1

    def return_available_hand(self):
        """Return a mask over the hand: entries greater than 0 become 1.

        Works on a copy so the array produced by ``hand_to_numpy()`` is
        never modified.
        """
        available_hand = np.array(self.state.hand_to_numpy())
        available_hand[available_hand > 0] = 1

        return available_hand

    def return_available_action(self, goods_int):
        """Return a 0/1 array marking the actions usable for one goods.

        Args:
            goods_int: index into the hand of the goods chosen for exchange.

        Returns:
            Array of length ``num_actions``.  A seller slot is 1 when the
            seller has an entry for the goods and the hand covers its amount;
            a buyer slot (offset by ``NUM_SELLERS``) is 1 when the buyer has
            an open order for the goods that the hand can satisfy.
        """
        available_action_list = np.zeros(self.num_actions)
        goods_name = self._goods_name(goods_int)
        held = self.state.hand

        for action in range(self.NUM_SELLERS):  # sellers
            if any(entry[0] == goods_name and held[entry[0]] >= entry[2]
                   for entry in self.init_sellers[action]):
                available_action_list[action] = 1

        for action in range(self.NUM_BUYERS):  # buyers
            if self._can_buy(self.state.buyers[action], goods_name):
                available_action_list[action + self.NUM_SELLERS] = 1

        return available_action_list

    def step(self, goods: int, action: int):
        """Apply one action to the selected goods.

        Args:
            goods: index into the hand of the goods to act on.
            action: ``0 .. NUM_SELLERS - 1`` for sellers, higher values for
                buyers.

        Returns:
            ``(state, reward, done)`` where ``reward`` is the total reward
            accumulated since the last ``reset()``.
        """
        prev_state = copy.deepcopy(self.state)

        if action >= self.NUM_SELLERS:
            self.Buy(goods, action - self.NUM_SELLERS)
        else:
            self.Sell(goods, action)

        # Only actions that actually changed something count as a step.
        unchanged = (prev_state.hand == self.state.hand
                     and prev_state.buyers == self.state.buyers)
        if not unchanged:
            self.num_steps += 1

        if (self.num_solved >= self.TARGET_SOLVED
                or self.num_steps >= self.MAX_STEPS):
            self.done = True

        return self.state, self.reward, self.done

    def render(self):
        """Rendering is not implemented; does nothing."""
        return