diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1c36090
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.history
+.vscode
\ No newline at end of file
diff --git a/Train.xlsx b/Train.xlsx
index 87b7d98..02158b7 100644
Binary files a/Train.xlsx and b/Train.xlsx differ
diff --git a/entropy.py b/entropy.py
new file mode 100644
index 0000000..d3da65d
--- /dev/null
+++ b/entropy.py
@@ -0,0 +1,98 @@
+import pandas as pd
+from math import log2
+def test_split(index, value, dataset):
+    left, right = list(), list()
+    for row in dataset:
+        if row[index] < value:
+            left.append(row)
+        else:
+            right.append(row)
+    return left, right
+
+# Calculate the entropy of a split dataset
+def entropy(groups, classes):
+    # count all samples at split point
+    n_instances = float(sum([len(group) for group in groups]))
+    # sum the weighted entropy of each group
+    e = 0.0
+    for group in groups:
+        size = float(len(group))
+        # avoid divide by zero
+        if size == 0:
+            continue
+        score = 0.0
+        # accumulate p * log2(p) over the classes present in the group
+        for class_val in classes:
+            p = [row[-1] for row in group].count(class_val) / size
+            try:
+                score += (p * log2(p))
+            except ValueError:  # log2(0) when p == 0; the term contributes nothing
+                continue
+
+        # weight the group score by its relative size
+        e += (- score) * (size/n_instances)
+    return e
+
+# Select the best split point for a dataset
+def get_split(dataset):
+    class_values = list(set(row[-1] for row in dataset))
+    b_index, b_value, b_score, b_groups = 999, 999, 999, None
+    for index in range(len(dataset[0])-1):
+        for row in dataset:
+            groups = test_split(index, row[index], dataset)
+            e = entropy(groups, class_values)
+            if e < b_score:
+                b_index, b_value, b_score, b_groups = index, row[index], e, groups
+    return {'index':b_index, 'value':b_value, 'groups':b_groups}
+
+# Create a terminal node value
+def to_terminal(group):
+    outcomes = [row[-1] for row in group]
+    return max(set(outcomes), key=outcomes.count)
+
+# Create child splits for a node or make terminal
+def split(node, max_depth, min_size, depth):
+    left, right = node['groups']
+    del(node['groups'])
+    # check for a no split
+    if not left or not right:
+        node['left'] = node['right'] = to_terminal(left + right)
+        return
+    # check for max depth
+    if depth >= max_depth:
+        node['left'], node['right'] = to_terminal(left), to_terminal(right)
+        return
+    # process left child
+    if len(left) <= min_size:
+        node['left'] = to_terminal(left)
+    else:
+        node['left'] = get_split(left)
+        split(node['left'], max_depth, min_size, depth+1)
+    # process right child
+    if len(right) <= min_size:
+        node['right'] = to_terminal(right)
+    else:
+        node['right'] = get_split(right)
+        split(node['right'], max_depth, min_size, depth+1)
+
+# Build a decision tree
+def build_tree(train, max_depth, min_size):
+    root = get_split(train)
+    split(root, max_depth, min_size, 1)
+    return root
+
+# Print a decision tree
+def print_tree(node, depth=0):
+    if isinstance(node, dict):
+        print('%s[X%d < %.3f]' % ((depth*' ', (node['index']+1), node['value'])))
+        print_tree(node['left'], depth+1)
+        print_tree(node['right'], depth+1)
+    else:
+        print('%s[%s]' % ((depth*' ', node)))
+
+
+if __name__ == "__main__":
+    dataset = pd.read_excel('Train.xlsx', sheet_name="Sheet1").to_numpy()
+    mxdp = input("max depth")
+    tree = build_tree(dataset, int(mxdp), 1)
+    print_tree(tree)
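
A minimal usage sketch for the new entropy.py (assumes the module is importable from the working directory; the toy dataset, class labels, and expected entropy value below are illustrative assumptions, not taken from Train.xlsx):

    # smoke_test.py -- hypothetical example, not part of the commit
    from entropy import build_tree, print_tree, entropy

    # Each row is [feature_1, feature_2, class_label]; as in Train.xlsx,
    # build_tree expects the class label in the last column.
    toy = [
        [2.7, 1.1, 0],
        [1.3, 1.8, 0],
        [3.6, 2.4, 0],
        [7.5, 3.2, 1],
        [9.0, 3.0, 1],
        [7.4, 0.5, 1],
    ]

    # Entropy of the unsplit data as a single group: 3 of 6 rows per class,
    # so -2 * 0.5 * log2(0.5) = 1.0 bit.
    print(entropy([toy], [0, 1]))

    # Build and print a shallow tree on the toy data.
    tree = build_tree(toy, max_depth=2, min_size=1)
    print_tree(tree)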