commit 51f4670f60f56023707e793f2cd2a5d422463b64 Author: amirmoghi3 Date: Fri Dec 25 16:33:49 2020 +0330 Initial Commit diff --git a/.history/app_20201225152542.py b/.history/app_20201225152542.py new file mode 100644 index 0000000..e69de29 diff --git a/.history/app_20201225152608.py b/.history/app_20201225152608.py new file mode 100644 index 0000000..49e2a2b --- /dev/null +++ b/.history/app_20201225152608.py @@ -0,0 +1,21 @@ +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225152810.py b/.history/app_20201225152810.py new file mode 100644 index 0000000..3c4ace5 --- /dev/null +++ b/.history/app_20201225152810.py @@ -0,0 +1,21 @@ +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225152934.py b/.history/app_20201225152934.py new file mode 100644 index 0000000..852e422 --- /dev/null +++ b/.history/app_20201225152934.py @@ -0,0 +1,22 @@ +def gini_index(groups, classes): + # count all samples at split point + print(classes) + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153009.py b/.history/app_20201225153009.py new file mode 100644 index 0000000..5a31d44 --- /dev/null +++ b/.history/app_20201225153009.py @@ -0,0 +1,22 @@ +def gini_index(groups, classes): + print(classes) + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153016.py b/.history/app_20201225153016.py new file mode 100644 index 0000000..9384f10 --- /dev/null +++ b/.history/app_20201225153016.py @@ -0,0 +1,22 @@ +def gini_index(groups, classes): + + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153034.py b/.history/app_20201225153034.py new file mode 100644 index 0000000..ee7278c --- /dev/null +++ b/.history/app_20201225153034.py @@ -0,0 +1,23 @@ +def gini_index(groups, classes): + print(groups) + print(classes) + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153041.py b/.history/app_20201225153041.py new file mode 100644 index 0000000..7ac7a9f --- /dev/null +++ b/.history/app_20201225153041.py @@ -0,0 +1,24 @@ +def gini_index(groups, classes): + + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + print(groups) + print(classes) + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153057.py b/.history/app_20201225153057.py new file mode 100644 index 0000000..42f868f --- /dev/null +++ b/.history/app_20201225153057.py @@ -0,0 +1,24 @@ +def gini_index(groups, classes): + + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + print(groups) + print(classes) + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153107.py b/.history/app_20201225153107.py new file mode 100644 index 0000000..7ac7a9f --- /dev/null +++ b/.history/app_20201225153107.py @@ -0,0 +1,24 @@ +def gini_index(groups, classes): + + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + print(groups) + print(classes) + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153134.py b/.history/app_20201225153134.py new file mode 100644 index 0000000..5056bad --- /dev/null +++ b/.history/app_20201225153134.py @@ -0,0 +1,24 @@ +def gini_index(groups, classes): + + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + print(groups) + print(classes) + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153219.py b/.history/app_20201225153219.py new file mode 100644 index 0000000..29bb710 --- /dev/null +++ b/.history/app_20201225153219.py @@ -0,0 +1,22 @@ +def gini_index(groups, classes): + + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153710.py b/.history/app_20201225153710.py new file mode 100644 index 0000000..752a1a7 --- /dev/null +++ b/.history/app_20201225153710.py @@ -0,0 +1,24 @@ +def gini_index(groups, classes): + + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + for row in group: + print(row[-1]) + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153815.py b/.history/app_20201225153815.py new file mode 100644 index 0000000..752a1a7 --- /dev/null +++ b/.history/app_20201225153815.py @@ -0,0 +1,24 @@ +def gini_index(groups, classes): + + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + for row in group: + print(row[-1]) + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153906.py b/.history/app_20201225153906.py new file mode 100644 index 0000000..c5550ad --- /dev/null +++ b/.history/app_20201225153906.py @@ -0,0 +1,22 @@ +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# test Gini values +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153913.py b/.history/app_20201225153913.py new file mode 100644 index 0000000..386c39f --- /dev/null +++ b/.history/app_20201225153913.py @@ -0,0 +1,23 @@ +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# test Gini values +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153932.py b/.history/app_20201225153932.py new file mode 100644 index 0000000..05c1067 --- /dev/null +++ b/.history/app_20201225153932.py @@ -0,0 +1,24 @@ +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + for row in group: + print(row[-1]) + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# test Gini values +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225154020.py b/.history/app_20201225154020.py new file mode 100644 index 0000000..05c1067 --- /dev/null +++ b/.history/app_20201225154020.py @@ -0,0 +1,24 @@ +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + for row in group: + print(row[-1]) + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# test Gini values +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225154028.py b/.history/app_20201225154028.py new file mode 100644 index 0000000..7573e2c --- /dev/null +++ b/.history/app_20201225154028.py @@ -0,0 +1,24 @@ +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + for row in group: + print(row[-1]) + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# test Gini values +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225154033.py b/.history/app_20201225154033.py new file mode 100644 index 0000000..05c1067 --- /dev/null +++ b/.history/app_20201225154033.py @@ -0,0 +1,24 @@ +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + for row in group: + print(row[-1]) + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# test Gini values +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225154115.py b/.history/app_20201225154115.py new file mode 100644 index 0000000..0f5fa34 --- /dev/null +++ b/.history/app_20201225154115.py @@ -0,0 +1,24 @@ +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + for row in group: + print(row[-1]) + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# test Gini values +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225154141.py b/.history/app_20201225154141.py new file mode 100644 index 0000000..f77015a --- /dev/null +++ b/.history/app_20201225154141.py @@ -0,0 +1,24 @@ +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + for row in group: + print(row[-1]) + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# test Gini values +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225154223.py b/.history/app_20201225154223.py new file mode 100644 index 0000000..99f846b --- /dev/null +++ b/.history/app_20201225154223.py @@ -0,0 +1,23 @@ +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# test Gini values +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225154225.py b/.history/app_20201225154225.py new file mode 100644 index 0000000..c5550ad --- /dev/null +++ b/.history/app_20201225154225.py @@ -0,0 +1,22 @@ +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# test Gini values +print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) +print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225154640.py b/.history/app_20201225154640.py new file mode 100644 index 0000000..81873ae --- /dev/null +++ b/.history/app_20201225154640.py @@ -0,0 +1,54 @@ +def test_split(index, value, dataset): + left, right = list(), list() + for row in dataset: + if row[index] < value: + left.append(row) + else: + right.append(row) + return left, right + +# Calculate the Gini index for a split dataset +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# Select the best split point for a dataset +def get_split(dataset): + class_values = list(set(row[-1] for row in dataset)) + b_index, b_value, b_score, b_groups = 999, 999, 999, None + for index in range(len(dataset[0])-1): + for row in dataset: + groups = test_split(index, row[index], dataset) + gini = gini_index(groups, class_values) + print('X%d < %.3f Gini=%.3f' % ((index+1), row[index], gini)) + if gini < b_score: + b_index, b_value, b_score, b_groups = index, row[index], gini, groups + return {'index':b_index, 'value':b_value, 'groups':b_groups} + +dataset = [[2.771244718,1.784783929,0], + [1.728571309,1.169761413,0], + [3.678319846,2.81281357,0], + [3.961043357,2.61995032,0], + [2.999208922,2.209014212,0], + [7.497545867,3.162953546,1], + [9.00220326,3.339047188,1], + [7.444542326,0.476683375,1], + [10.12493903,3.234550982,1], + [6.642287351,3.319983761,1]] +split = get_split(dataset) +print('Split: [X%d < %.3f]' % ((split['index']+1), split['value'])) \ No newline at end of file diff --git a/.history/app_20201225161344.py b/.history/app_20201225161344.py new file mode 100644 index 0000000..076cce8 --- /dev/null +++ b/.history/app_20201225161344.py @@ -0,0 +1,48 @@ +import pandas as pd + + +def test_split(index, value, dataset): + left, right = list(), list() + for row in dataset: + if row[index] < value: + left.append(row) + else: + right.append(row) + return left, right + +# Calculate the Gini index for a split dataset +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# Select the best split point for a dataset +def get_split(dataset): + class_values = list(set(row[-1] for row in dataset)) + b_index, b_value, b_score, b_groups = 999, 999, 999, None + for index in range(len(dataset[0])-1): + for row in dataset: + groups = test_split(index, row[index], dataset) + gini = gini_index(groups, class_values) + print('X%d < %.3f Gini=%.3f' % ((index+1), row[index], gini)) + if gini < b_score: + b_index, b_value, b_score, b_groups = index, row[index], gini, groups + return {'index':b_index, 'value':b_value, 'groups':b_groups} + +dataset = pd.read_excel('dataset2.xls', sheet_name="forestfires").to_numpy() +split = get_split(dataset) +print('Split: [X%d < %.3f]' % ((split['index']+1), split['value'])) \ No newline at end of file diff --git a/.history/app_20201225161529.py b/.history/app_20201225161529.py new file mode 100644 index 0000000..d5303c5 --- /dev/null +++ b/.history/app_20201225161529.py @@ -0,0 +1,48 @@ +import pandas as pd + + +def test_split(index, value, dataset): + left, right = list(), list() + for row in dataset: + if row[index] < value: + left.append(row) + else: + right.append(row) + return left, right + +# Calculate the Gini index for a split dataset +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# Select the best split point for a dataset +def get_split(dataset): + class_values = list(set(row[-1] for row in dataset)) + b_index, b_value, b_score, b_groups = 999, 999, 999, None + for index in range(len(dataset[0])-1): + for row in dataset: + groups = test_split(index, row[index], dataset) + gini = gini_index(groups, class_values) + print('X%d < %.3f Gini=%.3f' % ((index+1), row[index], gini)) + if gini < b_score: + b_index, b_value, b_score, b_groups = index, row[index], gini, groups + return {'index':b_index, 'value':b_value, 'groups':b_groups} + +dataset = pd.read_excel('train.xls', sheet_name="Sheet1").to_numpy() +split = get_split(dataset) +print('Split: [X%d < %.3f]' % ((split['index']+1), split['value'])) \ No newline at end of file diff --git a/.history/app_20201225161542.py b/.history/app_20201225161542.py new file mode 100644 index 0000000..8b04984 --- /dev/null +++ b/.history/app_20201225161542.py @@ -0,0 +1,48 @@ +import pandas as pd + + +def test_split(index, value, dataset): + left, right = list(), list() + for row in dataset: + if row[index] < value: + left.append(row) + else: + right.append(row) + return left, right + +# Calculate the Gini index for a split dataset +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# Select the best split point for a dataset +def get_split(dataset): + class_values = list(set(row[-1] for row in dataset)) + b_index, b_value, b_score, b_groups = 999, 999, 999, None + for index in range(len(dataset[0])-1): + for row in dataset: + groups = test_split(index, row[index], dataset) + gini = gini_index(groups, class_values) + print('X%d < %.3f Gini=%.3f' % ((index+1), row[index], gini)) + if gini < b_score: + b_index, b_value, b_score, b_groups = index, row[index], gini, groups + return {'index':b_index, 'value':b_value, 'groups':b_groups} + +dataset = pd.read_excel('Train.xlsx', sheet_name="Sheet1").to_numpy() +split = get_split(dataset) +print('Split: [X%d < %.3f]' % ((split['index']+1), split['value'])) \ No newline at end of file diff --git a/.history/gini_20201225161914.py b/.history/gini_20201225161914.py new file mode 100644 index 0000000..8b04984 --- /dev/null +++ b/.history/gini_20201225161914.py @@ -0,0 +1,48 @@ +import pandas as pd + + +def test_split(index, value, dataset): + left, right = list(), list() + for row in dataset: + if row[index] < value: + left.append(row) + else: + right.append(row) + return left, right + +# Calculate the Gini index for a split dataset +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# Select the best split point for a dataset +def get_split(dataset): + class_values = list(set(row[-1] for row in dataset)) + b_index, b_value, b_score, b_groups = 999, 999, 999, None + for index in range(len(dataset[0])-1): + for row in dataset: + groups = test_split(index, row[index], dataset) + gini = gini_index(groups, class_values) + print('X%d < %.3f Gini=%.3f' % ((index+1), row[index], gini)) + if gini < b_score: + b_index, b_value, b_score, b_groups = index, row[index], gini, groups + return {'index':b_index, 'value':b_value, 'groups':b_groups} + +dataset = pd.read_excel('Train.xlsx', sheet_name="Sheet1").to_numpy() +split = get_split(dataset) +print('Split: [X%d < %.3f]' % ((split['index']+1), split['value'])) \ No newline at end of file diff --git a/.history/gini_20201225162123.py b/.history/gini_20201225162123.py new file mode 100644 index 0000000..f0964db --- /dev/null +++ b/.history/gini_20201225162123.py @@ -0,0 +1,90 @@ +import pandas as pd +def test_split(index, value, dataset): + left, right = list(), list() + for row in dataset: + if row[index] < value: + left.append(row) + else: + right.append(row) + return left, right + +# Calculate the Gini index for a split dataset +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# Select the best split point for a dataset +def get_split(dataset): + class_values = list(set(row[-1] for row in dataset)) + b_index, b_value, b_score, b_groups = 999, 999, 999, None + for index in range(len(dataset[0])-1): + for row in dataset: + groups = test_split(index, row[index], dataset) + gini = gini_index(groups, class_values) + if gini < b_score: + b_index, b_value, b_score, b_groups = index, row[index], gini, groups + return {'index':b_index, 'value':b_value, 'groups':b_groups} + +# Create a terminal node value +def to_terminal(group): + outcomes = [row[-1] for row in group] + return max(set(outcomes), key=outcomes.count) + +# Create child splits for a node or make terminal +def split(node, max_depth, min_size, depth): + left, right = node['groups'] + del(node['groups']) + # check for a no split + if not left or not right: + node['left'] = node['right'] = to_terminal(left + right) + return + # check for max depth + if depth >= max_depth: + node['left'], node['right'] = to_terminal(left), to_terminal(right) + return + # process left child + if len(left) <= min_size: + node['left'] = to_terminal(left) + else: + node['left'] = get_split(left) + split(node['left'], max_depth, min_size, depth+1) + # process right child + if len(right) <= min_size: + node['right'] = to_terminal(right) + else: + node['right'] = get_split(right) + split(node['right'], max_depth, min_size, depth+1) + +# Build a decision tree +def build_tree(train, max_depth, min_size): + root = get_split(train) + split(root, max_depth, min_size, 1) + return root + +# Print a decision tree +def print_tree(node, depth=0): + if isinstance(node, dict): + print('%s[X%d < %.3f]' % ((depth*' ', (node['index']+1), node['value']))) + print_tree(node['left'], depth+1) + print_tree(node['right'], depth+1) + else: + print('%s[%s]' % ((depth*' ', node))) + +dataset = pd.read_excel('Train.xlsx', sheet_name="Sheet1").to_numpy() +tree = build_tree(dataset, 1, 1) +print_tree(tree) \ No newline at end of file diff --git a/.history/gini_20201225162607.py b/.history/gini_20201225162607.py new file mode 100644 index 0000000..65bac69 --- /dev/null +++ b/.history/gini_20201225162607.py @@ -0,0 +1,90 @@ +import pandas as pd +def test_split(index, value, dataset): + left, right = list(), list() + for row in dataset: + if row[index] < value: + left.append(row) + else: + right.append(row) + return left, right + +# Calculate the Gini index for a split dataset +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# Select the best split point for a dataset +def get_split(dataset): + class_values = list(set(row[-1] for row in dataset)) + b_index, b_value, b_score, b_groups = 999, 999, 999, None + for index in range(len(dataset[0])-1): + for row in dataset: + groups = test_split(index, row[index], dataset) + gini = gini_index(groups, class_values) + if gini < b_score: + b_index, b_value, b_score, b_groups = index, row[index], gini, groups + return {'index':b_index, 'value':b_value, 'groups':b_groups} + +# Create a terminal node value +def to_terminal(group): + outcomes = [row[-1] for row in group] + return max(set(outcomes), key=outcomes.count) + +# Create child splits for a node or make terminal +def split(node, max_depth, min_size, depth): + left, right = node['groups'] + del(node['groups']) + # check for a no split + if not left or not right: + node['left'] = node['right'] = to_terminal(left + right) + return + # check for max depth + if depth >= max_depth: + node['left'], node['right'] = to_terminal(left), to_terminal(right) + return + # process left child + if len(left) <= min_size: + node['left'] = to_terminal(left) + else: + node['left'] = get_split(left) + split(node['left'], max_depth, min_size, depth+1) + # process right child + if len(right) <= min_size: + node['right'] = to_terminal(right) + else: + node['right'] = get_split(right) + split(node['right'], max_depth, min_size, depth+1) + +# Build a decision tree +def build_tree(train, max_depth, min_size): + root = get_split(train) + split(root, max_depth, min_size, 1) + return root + +# Print a decision tree +def print_tree(node, depth=0): + if isinstance(node, dict): + print('%s[X%d < %.3f]' % ((depth*' ', (node['index']+1), node['value']))) + print_tree(node['left'], depth+1) + print_tree(node['right'], depth+1) + else: + print('%s[%s]' % ((depth*' ', node))) + +dataset = pd.read_excel('Train.xlsx', sheet_name="Sheet1").to_numpy() +tree = build_tree(dataset, 5, 1) +print_tree(tree) \ No newline at end of file diff --git a/.history/gini_20201225162804.py b/.history/gini_20201225162804.py new file mode 100644 index 0000000..65bac69 --- /dev/null +++ b/.history/gini_20201225162804.py @@ -0,0 +1,90 @@ +import pandas as pd +def test_split(index, value, dataset): + left, right = list(), list() + for row in dataset: + if row[index] < value: + left.append(row) + else: + right.append(row) + return left, right + +# Calculate the Gini index for a split dataset +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# Select the best split point for a dataset +def get_split(dataset): + class_values = list(set(row[-1] for row in dataset)) + b_index, b_value, b_score, b_groups = 999, 999, 999, None + for index in range(len(dataset[0])-1): + for row in dataset: + groups = test_split(index, row[index], dataset) + gini = gini_index(groups, class_values) + if gini < b_score: + b_index, b_value, b_score, b_groups = index, row[index], gini, groups + return {'index':b_index, 'value':b_value, 'groups':b_groups} + +# Create a terminal node value +def to_terminal(group): + outcomes = [row[-1] for row in group] + return max(set(outcomes), key=outcomes.count) + +# Create child splits for a node or make terminal +def split(node, max_depth, min_size, depth): + left, right = node['groups'] + del(node['groups']) + # check for a no split + if not left or not right: + node['left'] = node['right'] = to_terminal(left + right) + return + # check for max depth + if depth >= max_depth: + node['left'], node['right'] = to_terminal(left), to_terminal(right) + return + # process left child + if len(left) <= min_size: + node['left'] = to_terminal(left) + else: + node['left'] = get_split(left) + split(node['left'], max_depth, min_size, depth+1) + # process right child + if len(right) <= min_size: + node['right'] = to_terminal(right) + else: + node['right'] = get_split(right) + split(node['right'], max_depth, min_size, depth+1) + +# Build a decision tree +def build_tree(train, max_depth, min_size): + root = get_split(train) + split(root, max_depth, min_size, 1) + return root + +# Print a decision tree +def print_tree(node, depth=0): + if isinstance(node, dict): + print('%s[X%d < %.3f]' % ((depth*' ', (node['index']+1), node['value']))) + print_tree(node['left'], depth+1) + print_tree(node['right'], depth+1) + else: + print('%s[%s]' % ((depth*' ', node))) + +dataset = pd.read_excel('Train.xlsx', sheet_name="Sheet1").to_numpy() +tree = build_tree(dataset, 5, 1) +print_tree(tree) \ No newline at end of file diff --git a/.history/gini_20201225162859.py b/.history/gini_20201225162859.py new file mode 100644 index 0000000..9fab08c --- /dev/null +++ b/.history/gini_20201225162859.py @@ -0,0 +1,90 @@ +import pandas as pd +def test_split(index, value, dataset): + left, right = list(), list() + for row in dataset: + if row[index] < value: + left.append(row) + else: + right.append(row) + return left, right + +# Calculate the Gini index for a split dataset +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# Select the best split point for a dataset +def get_split(dataset): + class_values = list(set(row[-1] for row in dataset)) + b_index, b_value, b_score, b_groups = 999, 999, 999, None + for index in range(len(dataset[0])-1): + for row in dataset: + groups = test_split(index, row[index], dataset) + gini = gini_index(groups, class_values) + if gini < b_score: + b_index, b_value, b_score, b_groups = index, row[index], gini, groups + return {'index':b_index, 'value':b_value, 'groups':b_groups} + +# Create a terminal node value +def to_terminal(group): + outcomes = [row[-1] for row in group] + return max(set(outcomes), key=outcomes.count) + +# Create child splits for a node or make terminal +def split(node, max_depth, min_size, depth): + left, right = node['groups'] + del(node['groups']) + # check for a no split + if not left or not right: + node['left'] = node['right'] = to_terminal(left + right) + return + # check for max depth + if depth >= max_depth: + node['left'], node['right'] = to_terminal(left), to_terminal(right) + return + # process left child + if len(left) <= min_size: + node['left'] = to_terminal(left) + else: + node['left'] = get_split(left) + split(node['left'], max_depth, min_size, depth+1) + # process right child + if len(right) <= min_size: + node['right'] = to_terminal(right) + else: + node['right'] = get_split(right) + split(node['right'], max_depth, min_size, depth+1) + +# Build a decision tree +def build_tree(train, max_depth, min_size): + root = get_split(train) + split(root, max_depth, min_size, 1) + return root + +# Print a decision tree +def print_tree(node, depth=0): + if isinstance(node, dict): + print('%s[X%d < %.3f]' % ((depth*' ', (node['index']+1), node['value']))) + print_tree(node['left'], depth+1) + print_tree(node['right'], depth+1) + else: + print('%s[%s]' % ((depth*' ', node))) + +dataset = pd.read_excel('Train.xlsx', sheet_name="Sheet1").to_numpy() +tree = build_tree(dataset, 5, 0) +print_tree(tree) \ No newline at end of file diff --git a/.history/gini_20201225163004.py b/.history/gini_20201225163004.py new file mode 100644 index 0000000..184a8fd --- /dev/null +++ b/.history/gini_20201225163004.py @@ -0,0 +1,92 @@ +import pandas as pd +def test_split(index, value, dataset): + left, right = list(), list() + for row in dataset: + if row[index] < value: + left.append(row) + else: + right.append(row) + return left, right + +# Calculate the Gini index for a split dataset +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# Select the best split point for a dataset +def get_split(dataset): + class_values = list(set(row[-1] for row in dataset)) + b_index, b_value, b_score, b_groups = 999, 999, 999, None + for index in range(len(dataset[0])-1): + for row in dataset: + groups = test_split(index, row[index], dataset) + gini = gini_index(groups, class_values) + if gini < b_score: + b_index, b_value, b_score, b_groups = index, row[index], gini, groups + return {'index':b_index, 'value':b_value, 'groups':b_groups} + +# Create a terminal node value +def to_terminal(group): + outcomes = [row[-1] for row in group] + return max(set(outcomes), key=outcomes.count) + +# Create child splits for a node or make terminal +def split(node, max_depth, min_size, depth): + left, right = node['groups'] + del(node['groups']) + # check for a no split + if not left or not right: + node['left'] = node['right'] = to_terminal(left + right) + return + # check for max depth + if depth >= max_depth: + node['left'], node['right'] = to_terminal(left), to_terminal(right) + return + # process left child + if len(left) <= min_size: + node['left'] = to_terminal(left) + else: + node['left'] = get_split(left) + split(node['left'], max_depth, min_size, depth+1) + # process right child + if len(right) <= min_size: + node['right'] = to_terminal(right) + else: + node['right'] = get_split(right) + split(node['right'], max_depth, min_size, depth+1) + +# Build a decision tree +def build_tree(train, max_depth, min_size): + root = get_split(train) + split(root, max_depth, min_size, 1) + return root + +# Print a decision tree +def print_tree(node, depth=0): + if isinstance(node, dict): + print('%s[X%d < %.3f]' % ((depth*' ', (node['index']+1), node['value']))) + print_tree(node['left'], depth+1) + print_tree(node['right'], depth+1) + else: + print('%s[%s]' % ((depth*' ', node))) + + +if __name__ == "__main__": + dataset = pd.read_excel('Train.xlsx', sheet_name="Sheet1").to_numpy() + tree = build_tree(dataset, 5, 0) + print_tree(tree) \ No newline at end of file diff --git a/.history/gini_20201225163146.py b/.history/gini_20201225163146.py new file mode 100644 index 0000000..ccfe822 --- /dev/null +++ b/.history/gini_20201225163146.py @@ -0,0 +1,92 @@ +import pandas as pd +def test_split(index, value, dataset): + left, right = list(), list() + for row in dataset: + if row[index] < value: + left.append(row) + else: + right.append(row) + return left, right + +# Calculate the Gini index for a split dataset +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# Select the best split point for a dataset +def get_split(dataset): + class_values = list(set(row[-1] for row in dataset)) + b_index, b_value, b_score, b_groups = 999, 999, 999, None + for index in range(len(dataset[0])-1): + for row in dataset: + groups = test_split(index, row[index], dataset) + gini = gini_index(groups, class_values) + if gini < b_score: + b_index, b_value, b_score, b_groups = index, row[index], gini, groups + return {'index':b_index, 'value':b_value, 'groups':b_groups} + +# Create a terminal node value +def to_terminal(group): + outcomes = [row[-1] for row in group] + return max(set(outcomes), key=outcomes.count) + +# Create child splits for a node or make terminal +def split(node, max_depth, min_size, depth): + left, right = node['groups'] + del(node['groups']) + # check for a no split + if not left or not right: + node['left'] = node['right'] = to_terminal(left + right) + return + # check for max depth + if depth >= max_depth: + node['left'], node['right'] = to_terminal(left), to_terminal(right) + return + # process left child + if len(left) <= min_size: + node['left'] = to_terminal(left) + else: + node['left'] = get_split(left) + split(node['left'], max_depth, min_size, depth+1) + # process right child + if len(right) <= min_size: + node['right'] = to_terminal(right) + else: + node['right'] = get_split(right) + split(node['right'], max_depth, min_size, depth+1) + +# Build a decision tree +def build_tree(train, max_depth, min_size): + root = get_split(train) + split(root, max_depth, min_size, 1) + return root + +# Print a decision tree +def print_tree(node, depth=0): + if isinstance(node, dict): + print('%s[X%d < %.3f]' % ((depth*' ', (node['index']+1), node['value']))) + print_tree(node['left'], depth+1) + print_tree(node['right'], depth+1) + else: + print('%s[%s]' % ((depth*' ', node))) + + +if __name__ == "__main__": + dataset = pd.read_excel('Train.xlsx', sheet_name="Sheet1").to_numpy() + tree = build_tree(dataset, 1, 1) + print_tree(tree) \ No newline at end of file diff --git a/.history/gini_20201225163343.py b/.history/gini_20201225163343.py new file mode 100644 index 0000000..ccfe822 --- /dev/null +++ b/.history/gini_20201225163343.py @@ -0,0 +1,92 @@ +import pandas as pd +def test_split(index, value, dataset): + left, right = list(), list() + for row in dataset: + if row[index] < value: + left.append(row) + else: + right.append(row) + return left, right + +# Calculate the Gini index for a split dataset +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# Select the best split point for a dataset +def get_split(dataset): + class_values = list(set(row[-1] for row in dataset)) + b_index, b_value, b_score, b_groups = 999, 999, 999, None + for index in range(len(dataset[0])-1): + for row in dataset: + groups = test_split(index, row[index], dataset) + gini = gini_index(groups, class_values) + if gini < b_score: + b_index, b_value, b_score, b_groups = index, row[index], gini, groups + return {'index':b_index, 'value':b_value, 'groups':b_groups} + +# Create a terminal node value +def to_terminal(group): + outcomes = [row[-1] for row in group] + return max(set(outcomes), key=outcomes.count) + +# Create child splits for a node or make terminal +def split(node, max_depth, min_size, depth): + left, right = node['groups'] + del(node['groups']) + # check for a no split + if not left or not right: + node['left'] = node['right'] = to_terminal(left + right) + return + # check for max depth + if depth >= max_depth: + node['left'], node['right'] = to_terminal(left), to_terminal(right) + return + # process left child + if len(left) <= min_size: + node['left'] = to_terminal(left) + else: + node['left'] = get_split(left) + split(node['left'], max_depth, min_size, depth+1) + # process right child + if len(right) <= min_size: + node['right'] = to_terminal(right) + else: + node['right'] = get_split(right) + split(node['right'], max_depth, min_size, depth+1) + +# Build a decision tree +def build_tree(train, max_depth, min_size): + root = get_split(train) + split(root, max_depth, min_size, 1) + return root + +# Print a decision tree +def print_tree(node, depth=0): + if isinstance(node, dict): + print('%s[X%d < %.3f]' % ((depth*' ', (node['index']+1), node['value']))) + print_tree(node['left'], depth+1) + print_tree(node['right'], depth+1) + else: + print('%s[%s]' % ((depth*' ', node))) + + +if __name__ == "__main__": + dataset = pd.read_excel('Train.xlsx', sheet_name="Sheet1").to_numpy() + tree = build_tree(dataset, 1, 1) + print_tree(tree) \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..17e15f2 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,15 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python: Current File", + "type": "python", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal" + } + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..c8a63ca --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "python.pythonPath": "C:\\Users\\user\\AppData\\Local\\Programs\\Python\\Python38-32\\python.exe" +} \ No newline at end of file diff --git a/Train.xlsx b/Train.xlsx new file mode 100644 index 0000000..87b7d98 Binary files /dev/null and b/Train.xlsx differ diff --git a/gini.py b/gini.py new file mode 100644 index 0000000..ccfe822 --- /dev/null +++ b/gini.py @@ -0,0 +1,92 @@ +import pandas as pd +def test_split(index, value, dataset): + left, right = list(), list() + for row in dataset: + if row[index] < value: + left.append(row) + else: + right.append(row) + return left, right + +# Calculate the Gini index for a split dataset +def gini_index(groups, classes): + # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) + # sum weighted Gini index for each group + gini = 0.0 + for group in groups: + size = float(len(group)) + # avoid divide by zero + if size == 0: + continue + score = 0.0 + # score the group based on the score for each class + for class_val in classes: + p = [row[-1] for row in group].count(class_val) / size + score += p * p + # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) + return gini + +# Select the best split point for a dataset +def get_split(dataset): + class_values = list(set(row[-1] for row in dataset)) + b_index, b_value, b_score, b_groups = 999, 999, 999, None + for index in range(len(dataset[0])-1): + for row in dataset: + groups = test_split(index, row[index], dataset) + gini = gini_index(groups, class_values) + if gini < b_score: + b_index, b_value, b_score, b_groups = index, row[index], gini, groups + return {'index':b_index, 'value':b_value, 'groups':b_groups} + +# Create a terminal node value +def to_terminal(group): + outcomes = [row[-1] for row in group] + return max(set(outcomes), key=outcomes.count) + +# Create child splits for a node or make terminal +def split(node, max_depth, min_size, depth): + left, right = node['groups'] + del(node['groups']) + # check for a no split + if not left or not right: + node['left'] = node['right'] = to_terminal(left + right) + return + # check for max depth + if depth >= max_depth: + node['left'], node['right'] = to_terminal(left), to_terminal(right) + return + # process left child + if len(left) <= min_size: + node['left'] = to_terminal(left) + else: + node['left'] = get_split(left) + split(node['left'], max_depth, min_size, depth+1) + # process right child + if len(right) <= min_size: + node['right'] = to_terminal(right) + else: + node['right'] = get_split(right) + split(node['right'], max_depth, min_size, depth+1) + +# Build a decision tree +def build_tree(train, max_depth, min_size): + root = get_split(train) + split(root, max_depth, min_size, 1) + return root + +# Print a decision tree +def print_tree(node, depth=0): + if isinstance(node, dict): + print('%s[X%d < %.3f]' % ((depth*' ', (node['index']+1), node['value']))) + print_tree(node['left'], depth+1) + print_tree(node['right'], depth+1) + else: + print('%s[%s]' % ((depth*' ', node))) + + +if __name__ == "__main__": + dataset = pd.read_excel('Train.xlsx', sheet_name="Sheet1").to_numpy() + tree = build_tree(dataset, 1, 1) + print_tree(tree) \ No newline at end of file