entropy done
parent 51f4670f60
commit bec8fe4850
@@ -0,0 +1,2 @@
.history
.vscode
BIN Train.xlsx
Binary file not shown.
@@ -0,0 +1,98 @@
import pandas as pd
from math import log2
# Split the dataset on an attribute index and a threshold value
def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right
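
# Example with a made-up toy dataset: test_split(0, 3, [[1, 0], [2, 0], [3, 1], [4, 1]])
# returns ([[1, 0], [2, 0]], [[3, 1], [4, 1]]) -- rows with value < 3 go left.
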
# Calculate the entropy for a split dataset
def entropy(groups, classes):
    # count all samples at split point
    n_instances = float(sum([len(group) for group in groups]))
    # sum the weighted entropy of each group
    e = 0.0
    for group in groups:
        size = float(len(group))
        # avoid divide by zero
        if size == 0:
            continue
        score = 0.0
        # score the group based on the proportion of each class
        for class_val in classes:
            p = [row[-1] for row in group].count(class_val) / size
            try:
                score += p * log2(p)
            except ValueError:
                # log2(0) is undefined; a class absent from the group adds nothing
                continue
        # weight the group score by its relative size
        e += (-score) * (size / n_instances)
    return e
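
# Worked example with made-up groups: for groups ([[1, 0], [2, 0], [3, 1]], [[4, 1], [5, 1]])
# and classes [0, 1], the left group has class proportions 2/3 and 1/3, so its entropy
# is -(2/3*log2(2/3) + 1/3*log2(1/3)) ~= 0.918, while the pure right group scores 0.
# Weighted by group size: 3/5 * 0.918 + 2/5 * 0 ~= 0.551 = entropy(groups, [0, 1]).
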
# Select the best split point for a dataset
def get_split(dataset):
    class_values = list(set(row[-1] for row in dataset))
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    for index in range(len(dataset[0]) - 1):
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            e = entropy(groups, class_values)
            if e < b_score:
                b_index, b_value, b_score, b_groups = index, row[index], e, groups
    return {'index': b_index, 'value': b_value, 'groups': b_groups}
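
# Example with a made-up toy dataset: for [[1, 0], [2, 0], [3, 1], [4, 1]] the exhaustive
# search finds that splitting feature 0 at value 3 gives two pure groups (entropy 0), so
# get_split returns {'index': 0, 'value': 3, 'groups': ([[1, 0], [2, 0]], [[3, 1], [4, 1]])}.
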
# Create a terminal node value (the most common class in the group)
def to_terminal(group):
    outcomes = [row[-1] for row in group]
    return max(set(outcomes), key=outcomes.count)
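
# Example (illustrative): to_terminal([[1, 0], [2, 0], [3, 1]]) returns 0, the
# majority class among the group's last-column labels.
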
# Create child splits for a node or make terminal
def split(node, max_depth, min_size, depth):
    left, right = node['groups']
    del node['groups']
    # check for a no split
    if not left or not right:
        node['left'] = node['right'] = to_terminal(left + right)
        return
    # check for max depth
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return
    # process left child
    if len(left) <= min_size:
        node['left'] = to_terminal(left)
    else:
        node['left'] = get_split(left)
        split(node['left'], max_depth, min_size, depth + 1)
    # process right child
    if len(right) <= min_size:
        node['right'] = to_terminal(right)
    else:
        node['right'] = get_split(right)
        split(node['right'], max_depth, min_size, depth + 1)
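
# Once the recursion finishes, every internal node is a dict with keys 'index',
# 'value', 'left' and 'right', and every leaf is a bare class value; print_tree
# below relies on that shape.
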
# Build a decision tree
def build_tree(train, max_depth, min_size):
    root = get_split(train)
    split(root, max_depth, min_size, 1)
    return root

# Print a decision tree
def print_tree(node, depth=0):
    if isinstance(node, dict):
        print('%s[X%d < %.3f]' % (depth * ' ', node['index'] + 1, node['value']))
        print_tree(node['left'], depth + 1)
        print_tree(node['right'], depth + 1)
    else:
        print('%s[%s]' % (depth * ' ', node))

if __name__ == "__main__":
    # load the training data (reading .xlsx needs an Excel engine such as openpyxl)
    dataset = pd.read_excel('Train.xlsx', sheet_name="Sheet1").to_numpy()
    mxdp = input("max depth: ")
    tree = build_tree(dataset, int(mxdp), 1)
    print_tree(tree)
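
    # No prediction helper exists in this file; a minimal sketch of how the node
    # dicts could be walked is (hypothetical, for illustration only):
    # def predict(node, row):
    #     if not isinstance(node, dict):
    #         return node  # leaf: a bare class value
    #     if row[node['index']] < node['value']:
    #         return predict(node['left'], row)
    #     return predict(node['right'], row)
    # print(predict(tree, dataset[0]))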