diff --git a/.history/app_20201225152542.py b/.history/app_20201225152542.py deleted file mode 100644 index e69de29..0000000 diff --git a/.history/app_20201225152608.py b/.history/app_20201225152608.py deleted file mode 100644 index 49e2a2b..0000000 --- a/.history/app_20201225152608.py +++ /dev/null @@ -1,21 +0,0 @@ -def gini_index(groups, classes): - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) -print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225152810.py b/.history/app_20201225152810.py deleted file mode 100644 index 3c4ace5..0000000 --- a/.history/app_20201225152810.py +++ /dev/null @@ -1,21 +0,0 @@ -def gini_index(groups, classes): - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) -print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225152934.py b/.history/app_20201225152934.py deleted file mode 100644 index 852e422..0000000 --- a/.history/app_20201225152934.py +++ /dev/null @@ -1,22 +0,0 @@ -def gini_index(groups, classes): - # count all samples at split point - print(classes) - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) -print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153009.py b/.history/app_20201225153009.py deleted file mode 100644 index 5a31d44..0000000 --- a/.history/app_20201225153009.py +++ /dev/null @@ -1,22 +0,0 @@ -def gini_index(groups, classes): - print(classes) - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) -print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153016.py b/.history/app_20201225153016.py deleted file mode 100644 index 9384f10..0000000 --- a/.history/app_20201225153016.py +++ /dev/null @@ -1,22 +0,0 @@ -def gini_index(groups, classes): - - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) -print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153034.py b/.history/app_20201225153034.py deleted file mode 100644 index ee7278c..0000000 --- a/.history/app_20201225153034.py +++ /dev/null @@ -1,23 +0,0 @@ -def gini_index(groups, classes): - print(groups) - print(classes) - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) -print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153041.py b/.history/app_20201225153041.py deleted file mode 100644 index 7ac7a9f..0000000 --- a/.history/app_20201225153041.py +++ /dev/null @@ -1,24 +0,0 @@ -def gini_index(groups, classes): - - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - print(groups) - print(classes) - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) -print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153057.py b/.history/app_20201225153057.py deleted file mode 100644 index 42f868f..0000000 --- a/.history/app_20201225153057.py +++ /dev/null @@ -1,24 +0,0 @@ -def gini_index(groups, classes): - - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - print(groups) - print(classes) - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) -print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153107.py b/.history/app_20201225153107.py deleted file mode 100644 index 7ac7a9f..0000000 --- a/.history/app_20201225153107.py +++ /dev/null @@ -1,24 +0,0 @@ -def gini_index(groups, classes): - - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - print(groups) - print(classes) - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) -print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153134.py b/.history/app_20201225153134.py deleted file mode 100644 index 5056bad..0000000 --- a/.history/app_20201225153134.py +++ /dev/null @@ -1,24 +0,0 @@ -def gini_index(groups, classes): - - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - print(groups) - print(classes) - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) -print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153219.py b/.history/app_20201225153219.py deleted file mode 100644 index 29bb710..0000000 --- a/.history/app_20201225153219.py +++ /dev/null @@ -1,22 +0,0 @@ -def gini_index(groups, classes): - - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) -print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153710.py b/.history/app_20201225153710.py deleted file mode 100644 index 752a1a7..0000000 --- a/.history/app_20201225153710.py +++ /dev/null @@ -1,24 +0,0 @@ -def gini_index(groups, classes): - - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - for row in group: - print(row[-1]) - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) -print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153815.py b/.history/app_20201225153815.py deleted file mode 100644 index 752a1a7..0000000 --- a/.history/app_20201225153815.py +++ /dev/null @@ -1,24 +0,0 @@ -def gini_index(groups, classes): - - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - for row in group: - print(row[-1]) - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) -print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153906.py b/.history/app_20201225153906.py deleted file mode 100644 index c5550ad..0000000 --- a/.history/app_20201225153906.py +++ /dev/null @@ -1,22 +0,0 @@ -def gini_index(groups, classes): - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -# test Gini values -print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) -print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153913.py b/.history/app_20201225153913.py deleted file mode 100644 index 386c39f..0000000 --- a/.history/app_20201225153913.py +++ /dev/null @@ -1,23 +0,0 @@ -def gini_index(groups, classes): - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -# test Gini values -print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) -print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225153932.py b/.history/app_20201225153932.py deleted file mode 100644 index 05c1067..0000000 --- a/.history/app_20201225153932.py +++ /dev/null @@ -1,24 +0,0 @@ -def gini_index(groups, classes): - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - for row in group: - print(row[-1]) - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -# test Gini values -print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) -print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225154020.py b/.history/app_20201225154020.py deleted file mode 100644 index 05c1067..0000000 --- a/.history/app_20201225154020.py +++ /dev/null @@ -1,24 +0,0 @@ -def gini_index(groups, classes): - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - for row in group: - print(row[-1]) - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -# test Gini values -print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) -print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225154028.py b/.history/app_20201225154028.py deleted file mode 100644 index 7573e2c..0000000 --- a/.history/app_20201225154028.py +++ /dev/null @@ -1,24 +0,0 @@ -def gini_index(groups, classes): - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - for row in group: - print(row[-1]) - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -# test Gini values -print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) -print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225154033.py b/.history/app_20201225154033.py deleted file mode 100644 index 05c1067..0000000 --- a/.history/app_20201225154033.py +++ /dev/null @@ -1,24 +0,0 @@ -def gini_index(groups, classes): - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - for row in group: - print(row[-1]) - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -# test Gini values -print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) -print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225154115.py b/.history/app_20201225154115.py deleted file mode 100644 index 0f5fa34..0000000 --- a/.history/app_20201225154115.py +++ /dev/null @@ -1,24 +0,0 @@ -def gini_index(groups, classes): - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - for row in group: - print(row[-1]) - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -# test Gini values -print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) -print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225154141.py b/.history/app_20201225154141.py deleted file mode 100644 index f77015a..0000000 --- a/.history/app_20201225154141.py +++ /dev/null @@ -1,24 +0,0 @@ -def gini_index(groups, classes): - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - for row in group: - print(row[-1]) - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -# test Gini values -print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) -print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225154223.py b/.history/app_20201225154223.py deleted file mode 100644 index 99f846b..0000000 --- a/.history/app_20201225154223.py +++ /dev/null @@ -1,23 +0,0 @@ -def gini_index(groups, classes): - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -# test Gini values -print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) -print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225154225.py b/.history/app_20201225154225.py deleted file mode 100644 index c5550ad..0000000 --- a/.history/app_20201225154225.py +++ /dev/null @@ -1,22 +0,0 @@ -def gini_index(groups, classes): - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -# test Gini values -print(gini_index([[[1, 1], [1, 0]], [[1, 1], [1, 0]]], [0, 1])) -print(gini_index([[[1, 0], [1, 0]], [[1, 1], [1, 1]]], [0, 1])) \ No newline at end of file diff --git a/.history/app_20201225154640.py b/.history/app_20201225154640.py deleted file mode 100644 index 81873ae..0000000 --- a/.history/app_20201225154640.py +++ /dev/null @@ -1,54 +0,0 @@ -def test_split(index, value, dataset): - left, right = list(), list() - for row in dataset: - if row[index] < value: - left.append(row) - else: - right.append(row) - return left, right - -# Calculate the Gini index for a split dataset -def gini_index(groups, classes): - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -# Select the best split point for a dataset -def get_split(dataset): - class_values = list(set(row[-1] for row in dataset)) - b_index, b_value, b_score, b_groups = 999, 999, 999, None - for index in range(len(dataset[0])-1): - for row in dataset: - groups = test_split(index, row[index], dataset) - gini = gini_index(groups, class_values) - print('X%d < %.3f Gini=%.3f' % ((index+1), row[index], gini)) - if gini < b_score: - b_index, b_value, b_score, b_groups = index, row[index], gini, groups - return {'index':b_index, 'value':b_value, 'groups':b_groups} - -dataset = [[2.771244718,1.784783929,0], - [1.728571309,1.169761413,0], - [3.678319846,2.81281357,0], - [3.961043357,2.61995032,0], - [2.999208922,2.209014212,0], - [7.497545867,3.162953546,1], - [9.00220326,3.339047188,1], - [7.444542326,0.476683375,1], - [10.12493903,3.234550982,1], - [6.642287351,3.319983761,1]] -split = get_split(dataset) -print('Split: [X%d < %.3f]' % ((split['index']+1), split['value'])) \ No newline at end of file diff --git a/.history/app_20201225161344.py b/.history/app_20201225161344.py deleted file mode 100644 index 076cce8..0000000 --- a/.history/app_20201225161344.py +++ /dev/null @@ -1,48 +0,0 @@ -import pandas as pd - - -def test_split(index, value, dataset): - left, right = list(), list() - for row in dataset: - if row[index] < value: - left.append(row) - else: - right.append(row) - return left, right - -# Calculate the Gini index for a split dataset -def gini_index(groups, classes): - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -# Select the best split point for a dataset -def get_split(dataset): - class_values = list(set(row[-1] for row in dataset)) - b_index, b_value, b_score, b_groups = 999, 999, 999, None - for index in range(len(dataset[0])-1): - for row in dataset: - groups = test_split(index, row[index], dataset) - gini = gini_index(groups, class_values) - print('X%d < %.3f Gini=%.3f' % ((index+1), row[index], gini)) - if gini < b_score: - b_index, b_value, b_score, b_groups = index, row[index], gini, groups - return {'index':b_index, 'value':b_value, 'groups':b_groups} - -dataset = pd.read_excel('dataset2.xls', sheet_name="forestfires").to_numpy() -split = get_split(dataset) -print('Split: [X%d < %.3f]' % ((split['index']+1), split['value'])) \ No newline at end of file diff --git a/.history/app_20201225161529.py b/.history/app_20201225161529.py deleted file mode 100644 index d5303c5..0000000 --- a/.history/app_20201225161529.py +++ /dev/null @@ -1,48 +0,0 @@ -import pandas as pd - - -def test_split(index, value, dataset): - left, right = list(), list() - for row in dataset: - if row[index] < value: - left.append(row) - else: - right.append(row) - return left, right - -# Calculate the Gini index for a split dataset -def gini_index(groups, classes): - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -# Select the best split point for a dataset -def get_split(dataset): - class_values = list(set(row[-1] for row in dataset)) - b_index, b_value, b_score, b_groups = 999, 999, 999, None - for index in range(len(dataset[0])-1): - for row in dataset: - groups = test_split(index, row[index], dataset) - gini = gini_index(groups, class_values) - print('X%d < %.3f Gini=%.3f' % ((index+1), row[index], gini)) - if gini < b_score: - b_index, b_value, b_score, b_groups = index, row[index], gini, groups - return {'index':b_index, 'value':b_value, 'groups':b_groups} - -dataset = pd.read_excel('train.xls', sheet_name="Sheet1").to_numpy() -split = get_split(dataset) -print('Split: [X%d < %.3f]' % ((split['index']+1), split['value'])) \ No newline at end of file diff --git a/.history/app_20201225161542.py b/.history/app_20201225161542.py deleted file mode 100644 index 8b04984..0000000 --- a/.history/app_20201225161542.py +++ /dev/null @@ -1,48 +0,0 @@ -import pandas as pd - - -def test_split(index, value, dataset): - left, right = list(), list() - for row in dataset: - if row[index] < value: - left.append(row) - else: - right.append(row) - return left, right - -# Calculate the Gini index for a split dataset -def gini_index(groups, classes): - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -# Select the best split point for a dataset -def get_split(dataset): - class_values = list(set(row[-1] for row in dataset)) - b_index, b_value, b_score, b_groups = 999, 999, 999, None - for index in range(len(dataset[0])-1): - for row in dataset: - groups = test_split(index, row[index], dataset) - gini = gini_index(groups, class_values) - print('X%d < %.3f Gini=%.3f' % ((index+1), row[index], gini)) - if gini < b_score: - b_index, b_value, b_score, b_groups = index, row[index], gini, groups - return {'index':b_index, 'value':b_value, 'groups':b_groups} - -dataset = pd.read_excel('Train.xlsx', sheet_name="Sheet1").to_numpy() -split = get_split(dataset) -print('Split: [X%d < %.3f]' % ((split['index']+1), split['value'])) \ No newline at end of file diff --git a/.history/gini_20201225161914.py b/.history/gini_20201225161914.py deleted file mode 100644 index 8b04984..0000000 --- a/.history/gini_20201225161914.py +++ /dev/null @@ -1,48 +0,0 @@ -import pandas as pd - - -def test_split(index, value, dataset): - left, right = list(), list() - for row in dataset: - if row[index] < value: - left.append(row) - else: - right.append(row) - return left, right - -# Calculate the Gini index for a split dataset -def gini_index(groups, classes): - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -# Select the best split point for a dataset -def get_split(dataset): - class_values = list(set(row[-1] for row in dataset)) - b_index, b_value, b_score, b_groups = 999, 999, 999, None - for index in range(len(dataset[0])-1): - for row in dataset: - groups = test_split(index, row[index], dataset) - gini = gini_index(groups, class_values) - print('X%d < %.3f Gini=%.3f' % ((index+1), row[index], gini)) - if gini < b_score: - b_index, b_value, b_score, b_groups = index, row[index], gini, groups - return {'index':b_index, 'value':b_value, 'groups':b_groups} - -dataset = pd.read_excel('Train.xlsx', sheet_name="Sheet1").to_numpy() -split = get_split(dataset) -print('Split: [X%d < %.3f]' % ((split['index']+1), split['value'])) \ No newline at end of file diff --git a/.history/gini_20201225162123.py b/.history/gini_20201225162123.py deleted file mode 100644 index f0964db..0000000 --- a/.history/gini_20201225162123.py +++ /dev/null @@ -1,90 +0,0 @@ -import pandas as pd -def test_split(index, value, dataset): - left, right = list(), list() - for row in dataset: - if row[index] < value: - left.append(row) - else: - right.append(row) - return left, right - -# Calculate the Gini index for a split dataset -def gini_index(groups, classes): - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -# Select the best split point for a dataset -def get_split(dataset): - class_values = list(set(row[-1] for row in dataset)) - b_index, b_value, b_score, b_groups = 999, 999, 999, None - for index in range(len(dataset[0])-1): - for row in dataset: - groups = test_split(index, row[index], dataset) - gini = gini_index(groups, class_values) - if gini < b_score: - b_index, b_value, b_score, b_groups = index, row[index], gini, groups - return {'index':b_index, 'value':b_value, 'groups':b_groups} - -# Create a terminal node value -def to_terminal(group): - outcomes = [row[-1] for row in group] - return max(set(outcomes), key=outcomes.count) - -# Create child splits for a node or make terminal -def split(node, max_depth, min_size, depth): - left, right = node['groups'] - del(node['groups']) - # check for a no split - if not left or not right: - node['left'] = node['right'] = to_terminal(left + right) - return - # check for max depth - if depth >= max_depth: - node['left'], node['right'] = to_terminal(left), to_terminal(right) - return - # process left child - if len(left) <= min_size: - node['left'] = to_terminal(left) - else: - node['left'] = get_split(left) - split(node['left'], max_depth, min_size, depth+1) - # process right child - if len(right) <= min_size: - node['right'] = to_terminal(right) - else: - node['right'] = get_split(right) - split(node['right'], max_depth, min_size, depth+1) - -# Build a decision tree -def build_tree(train, max_depth, min_size): - root = get_split(train) - split(root, max_depth, min_size, 1) - return root - -# Print a decision tree -def print_tree(node, depth=0): - if isinstance(node, dict): - print('%s[X%d < %.3f]' % ((depth*' ', (node['index']+1), node['value']))) - print_tree(node['left'], depth+1) - print_tree(node['right'], depth+1) - else: - print('%s[%s]' % ((depth*' ', node))) - -dataset = pd.read_excel('Train.xlsx', sheet_name="Sheet1").to_numpy() -tree = build_tree(dataset, 1, 1) -print_tree(tree) \ No newline at end of file diff --git a/.history/gini_20201225162607.py b/.history/gini_20201225162607.py deleted file mode 100644 index 65bac69..0000000 --- a/.history/gini_20201225162607.py +++ /dev/null @@ -1,90 +0,0 @@ -import pandas as pd -def test_split(index, value, dataset): - left, right = list(), list() - for row in dataset: - if row[index] < value: - left.append(row) - else: - right.append(row) - return left, right - -# Calculate the Gini index for a split dataset -def gini_index(groups, classes): - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -# Select the best split point for a dataset -def get_split(dataset): - class_values = list(set(row[-1] for row in dataset)) - b_index, b_value, b_score, b_groups = 999, 999, 999, None - for index in range(len(dataset[0])-1): - for row in dataset: - groups = test_split(index, row[index], dataset) - gini = gini_index(groups, class_values) - if gini < b_score: - b_index, b_value, b_score, b_groups = index, row[index], gini, groups - return {'index':b_index, 'value':b_value, 'groups':b_groups} - -# Create a terminal node value -def to_terminal(group): - outcomes = [row[-1] for row in group] - return max(set(outcomes), key=outcomes.count) - -# Create child splits for a node or make terminal -def split(node, max_depth, min_size, depth): - left, right = node['groups'] - del(node['groups']) - # check for a no split - if not left or not right: - node['left'] = node['right'] = to_terminal(left + right) - return - # check for max depth - if depth >= max_depth: - node['left'], node['right'] = to_terminal(left), to_terminal(right) - return - # process left child - if len(left) <= min_size: - node['left'] = to_terminal(left) - else: - node['left'] = get_split(left) - split(node['left'], max_depth, min_size, depth+1) - # process right child - if len(right) <= min_size: - node['right'] = to_terminal(right) - else: - node['right'] = get_split(right) - split(node['right'], max_depth, min_size, depth+1) - -# Build a decision tree -def build_tree(train, max_depth, min_size): - root = get_split(train) - split(root, max_depth, min_size, 1) - return root - -# Print a decision tree -def print_tree(node, depth=0): - if isinstance(node, dict): - print('%s[X%d < %.3f]' % ((depth*' ', (node['index']+1), node['value']))) - print_tree(node['left'], depth+1) - print_tree(node['right'], depth+1) - else: - print('%s[%s]' % ((depth*' ', node))) - -dataset = pd.read_excel('Train.xlsx', sheet_name="Sheet1").to_numpy() -tree = build_tree(dataset, 5, 1) -print_tree(tree) \ No newline at end of file diff --git a/.history/gini_20201225162804.py b/.history/gini_20201225162804.py deleted file mode 100644 index 65bac69..0000000 --- a/.history/gini_20201225162804.py +++ /dev/null @@ -1,90 +0,0 @@ -import pandas as pd -def test_split(index, value, dataset): - left, right = list(), list() - for row in dataset: - if row[index] < value: - left.append(row) - else: - right.append(row) - return left, right - -# Calculate the Gini index for a split dataset -def gini_index(groups, classes): - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -# Select the best split point for a dataset -def get_split(dataset): - class_values = list(set(row[-1] for row in dataset)) - b_index, b_value, b_score, b_groups = 999, 999, 999, None - for index in range(len(dataset[0])-1): - for row in dataset: - groups = test_split(index, row[index], dataset) - gini = gini_index(groups, class_values) - if gini < b_score: - b_index, b_value, b_score, b_groups = index, row[index], gini, groups - return {'index':b_index, 'value':b_value, 'groups':b_groups} - -# Create a terminal node value -def to_terminal(group): - outcomes = [row[-1] for row in group] - return max(set(outcomes), key=outcomes.count) - -# Create child splits for a node or make terminal -def split(node, max_depth, min_size, depth): - left, right = node['groups'] - del(node['groups']) - # check for a no split - if not left or not right: - node['left'] = node['right'] = to_terminal(left + right) - return - # check for max depth - if depth >= max_depth: - node['left'], node['right'] = to_terminal(left), to_terminal(right) - return - # process left child - if len(left) <= min_size: - node['left'] = to_terminal(left) - else: - node['left'] = get_split(left) - split(node['left'], max_depth, min_size, depth+1) - # process right child - if len(right) <= min_size: - node['right'] = to_terminal(right) - else: - node['right'] = get_split(right) - split(node['right'], max_depth, min_size, depth+1) - -# Build a decision tree -def build_tree(train, max_depth, min_size): - root = get_split(train) - split(root, max_depth, min_size, 1) - return root - -# Print a decision tree -def print_tree(node, depth=0): - if isinstance(node, dict): - print('%s[X%d < %.3f]' % ((depth*' ', (node['index']+1), node['value']))) - print_tree(node['left'], depth+1) - print_tree(node['right'], depth+1) - else: - print('%s[%s]' % ((depth*' ', node))) - -dataset = pd.read_excel('Train.xlsx', sheet_name="Sheet1").to_numpy() -tree = build_tree(dataset, 5, 1) -print_tree(tree) \ No newline at end of file diff --git a/.history/gini_20201225162859.py b/.history/gini_20201225162859.py deleted file mode 100644 index 9fab08c..0000000 --- a/.history/gini_20201225162859.py +++ /dev/null @@ -1,90 +0,0 @@ -import pandas as pd -def test_split(index, value, dataset): - left, right = list(), list() - for row in dataset: - if row[index] < value: - left.append(row) - else: - right.append(row) - return left, right - -# Calculate the Gini index for a split dataset -def gini_index(groups, classes): - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -# Select the best split point for a dataset -def get_split(dataset): - class_values = list(set(row[-1] for row in dataset)) - b_index, b_value, b_score, b_groups = 999, 999, 999, None - for index in range(len(dataset[0])-1): - for row in dataset: - groups = test_split(index, row[index], dataset) - gini = gini_index(groups, class_values) - if gini < b_score: - b_index, b_value, b_score, b_groups = index, row[index], gini, groups - return {'index':b_index, 'value':b_value, 'groups':b_groups} - -# Create a terminal node value -def to_terminal(group): - outcomes = [row[-1] for row in group] - return max(set(outcomes), key=outcomes.count) - -# Create child splits for a node or make terminal -def split(node, max_depth, min_size, depth): - left, right = node['groups'] - del(node['groups']) - # check for a no split - if not left or not right: - node['left'] = node['right'] = to_terminal(left + right) - return - # check for max depth - if depth >= max_depth: - node['left'], node['right'] = to_terminal(left), to_terminal(right) - return - # process left child - if len(left) <= min_size: - node['left'] = to_terminal(left) - else: - node['left'] = get_split(left) - split(node['left'], max_depth, min_size, depth+1) - # process right child - if len(right) <= min_size: - node['right'] = to_terminal(right) - else: - node['right'] = get_split(right) - split(node['right'], max_depth, min_size, depth+1) - -# Build a decision tree -def build_tree(train, max_depth, min_size): - root = get_split(train) - split(root, max_depth, min_size, 1) - return root - -# Print a decision tree -def print_tree(node, depth=0): - if isinstance(node, dict): - print('%s[X%d < %.3f]' % ((depth*' ', (node['index']+1), node['value']))) - print_tree(node['left'], depth+1) - print_tree(node['right'], depth+1) - else: - print('%s[%s]' % ((depth*' ', node))) - -dataset = pd.read_excel('Train.xlsx', sheet_name="Sheet1").to_numpy() -tree = build_tree(dataset, 5, 0) -print_tree(tree) \ No newline at end of file diff --git a/.history/gini_20201225163004.py b/.history/gini_20201225163004.py deleted file mode 100644 index 184a8fd..0000000 --- a/.history/gini_20201225163004.py +++ /dev/null @@ -1,92 +0,0 @@ -import pandas as pd -def test_split(index, value, dataset): - left, right = list(), list() - for row in dataset: - if row[index] < value: - left.append(row) - else: - right.append(row) - return left, right - -# Calculate the Gini index for a split dataset -def gini_index(groups, classes): - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -# Select the best split point for a dataset -def get_split(dataset): - class_values = list(set(row[-1] for row in dataset)) - b_index, b_value, b_score, b_groups = 999, 999, 999, None - for index in range(len(dataset[0])-1): - for row in dataset: - groups = test_split(index, row[index], dataset) - gini = gini_index(groups, class_values) - if gini < b_score: - b_index, b_value, b_score, b_groups = index, row[index], gini, groups - return {'index':b_index, 'value':b_value, 'groups':b_groups} - -# Create a terminal node value -def to_terminal(group): - outcomes = [row[-1] for row in group] - return max(set(outcomes), key=outcomes.count) - -# Create child splits for a node or make terminal -def split(node, max_depth, min_size, depth): - left, right = node['groups'] - del(node['groups']) - # check for a no split - if not left or not right: - node['left'] = node['right'] = to_terminal(left + right) - return - # check for max depth - if depth >= max_depth: - node['left'], node['right'] = to_terminal(left), to_terminal(right) - return - # process left child - if len(left) <= min_size: - node['left'] = to_terminal(left) - else: - node['left'] = get_split(left) - split(node['left'], max_depth, min_size, depth+1) - # process right child - if len(right) <= min_size: - node['right'] = to_terminal(right) - else: - node['right'] = get_split(right) - split(node['right'], max_depth, min_size, depth+1) - -# Build a decision tree -def build_tree(train, max_depth, min_size): - root = get_split(train) - split(root, max_depth, min_size, 1) - return root - -# Print a decision tree -def print_tree(node, depth=0): - if isinstance(node, dict): - print('%s[X%d < %.3f]' % ((depth*' ', (node['index']+1), node['value']))) - print_tree(node['left'], depth+1) - print_tree(node['right'], depth+1) - else: - print('%s[%s]' % ((depth*' ', node))) - - -if __name__ == "__main__": - dataset = pd.read_excel('Train.xlsx', sheet_name="Sheet1").to_numpy() - tree = build_tree(dataset, 5, 0) - print_tree(tree) \ No newline at end of file diff --git a/.history/gini_20201225163146.py b/.history/gini_20201225163146.py deleted file mode 100644 index ccfe822..0000000 --- a/.history/gini_20201225163146.py +++ /dev/null @@ -1,92 +0,0 @@ -import pandas as pd -def test_split(index, value, dataset): - left, right = list(), list() - for row in dataset: - if row[index] < value: - left.append(row) - else: - right.append(row) - return left, right - -# Calculate the Gini index for a split dataset -def gini_index(groups, classes): - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -# Select the best split point for a dataset -def get_split(dataset): - class_values = list(set(row[-1] for row in dataset)) - b_index, b_value, b_score, b_groups = 999, 999, 999, None - for index in range(len(dataset[0])-1): - for row in dataset: - groups = test_split(index, row[index], dataset) - gini = gini_index(groups, class_values) - if gini < b_score: - b_index, b_value, b_score, b_groups = index, row[index], gini, groups - return {'index':b_index, 'value':b_value, 'groups':b_groups} - -# Create a terminal node value -def to_terminal(group): - outcomes = [row[-1] for row in group] - return max(set(outcomes), key=outcomes.count) - -# Create child splits for a node or make terminal -def split(node, max_depth, min_size, depth): - left, right = node['groups'] - del(node['groups']) - # check for a no split - if not left or not right: - node['left'] = node['right'] = to_terminal(left + right) - return - # check for max depth - if depth >= max_depth: - node['left'], node['right'] = to_terminal(left), to_terminal(right) - return - # process left child - if len(left) <= min_size: - node['left'] = to_terminal(left) - else: - node['left'] = get_split(left) - split(node['left'], max_depth, min_size, depth+1) - # process right child - if len(right) <= min_size: - node['right'] = to_terminal(right) - else: - node['right'] = get_split(right) - split(node['right'], max_depth, min_size, depth+1) - -# Build a decision tree -def build_tree(train, max_depth, min_size): - root = get_split(train) - split(root, max_depth, min_size, 1) - return root - -# Print a decision tree -def print_tree(node, depth=0): - if isinstance(node, dict): - print('%s[X%d < %.3f]' % ((depth*' ', (node['index']+1), node['value']))) - print_tree(node['left'], depth+1) - print_tree(node['right'], depth+1) - else: - print('%s[%s]' % ((depth*' ', node))) - - -if __name__ == "__main__": - dataset = pd.read_excel('Train.xlsx', sheet_name="Sheet1").to_numpy() - tree = build_tree(dataset, 1, 1) - print_tree(tree) \ No newline at end of file diff --git a/.history/gini_20201225163343.py b/.history/gini_20201225163343.py deleted file mode 100644 index ccfe822..0000000 --- a/.history/gini_20201225163343.py +++ /dev/null @@ -1,92 +0,0 @@ -import pandas as pd -def test_split(index, value, dataset): - left, right = list(), list() - for row in dataset: - if row[index] < value: - left.append(row) - else: - right.append(row) - return left, right - -# Calculate the Gini index for a split dataset -def gini_index(groups, classes): - # count all samples at split point - n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group - gini = 0.0 - for group in groups: - size = float(len(group)) - # avoid divide by zero - if size == 0: - continue - score = 0.0 - # score the group based on the score for each class - for class_val in classes: - p = [row[-1] for row in group].count(class_val) / size - score += p * p - # weight the group score by its relative size - gini += (1.0 - score) * (size / n_instances) - return gini - -# Select the best split point for a dataset -def get_split(dataset): - class_values = list(set(row[-1] for row in dataset)) - b_index, b_value, b_score, b_groups = 999, 999, 999, None - for index in range(len(dataset[0])-1): - for row in dataset: - groups = test_split(index, row[index], dataset) - gini = gini_index(groups, class_values) - if gini < b_score: - b_index, b_value, b_score, b_groups = index, row[index], gini, groups - return {'index':b_index, 'value':b_value, 'groups':b_groups} - -# Create a terminal node value -def to_terminal(group): - outcomes = [row[-1] for row in group] - return max(set(outcomes), key=outcomes.count) - -# Create child splits for a node or make terminal -def split(node, max_depth, min_size, depth): - left, right = node['groups'] - del(node['groups']) - # check for a no split - if not left or not right: - node['left'] = node['right'] = to_terminal(left + right) - return - # check for max depth - if depth >= max_depth: - node['left'], node['right'] = to_terminal(left), to_terminal(right) - return - # process left child - if len(left) <= min_size: - node['left'] = to_terminal(left) - else: - node['left'] = get_split(left) - split(node['left'], max_depth, min_size, depth+1) - # process right child - if len(right) <= min_size: - node['right'] = to_terminal(right) - else: - node['right'] = get_split(right) - split(node['right'], max_depth, min_size, depth+1) - -# Build a decision tree -def build_tree(train, max_depth, min_size): - root = get_split(train) - split(root, max_depth, min_size, 1) - return root - -# Print a decision tree -def print_tree(node, depth=0): - if isinstance(node, dict): - print('%s[X%d < %.3f]' % ((depth*' ', (node['index']+1), node['value']))) - print_tree(node['left'], depth+1) - print_tree(node['right'], depth+1) - else: - print('%s[%s]' % ((depth*' ', node))) - - -if __name__ == "__main__": - dataset = pd.read_excel('Train.xlsx', sheet_name="Sheet1").to_numpy() - tree = build_tree(dataset, 1, 1) - print_tree(tree) \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json deleted file mode 100644 index 17e15f2..0000000 --- a/.vscode/launch.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - // Use IntelliSense to learn about possible attributes. - // Hover to view descriptions of existing attributes. - // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 - "version": "0.2.0", - "configurations": [ - { - "name": "Python: Current File", - "type": "python", - "request": "launch", - "program": "${file}", - "console": "integratedTerminal" - } - ] -} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index c8a63ca..0000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,3 +0,0 @@ -{ - "python.pythonPath": "C:\\Users\\user\\AppData\\Local\\Programs\\Python\\Python38-32\\python.exe" -} \ No newline at end of file diff --git a/gini.py b/gini.py index ccfe822..5e15bd0 100644 --- a/gini.py +++ b/gini.py @@ -8,27 +8,27 @@ def test_split(index, value, dataset): right.append(row) return left, right -# Calculate the Gini index for a split dataset + def gini_index(groups, classes): - # count all samples at split point + n_instances = float(sum([len(group) for group in groups])) - # sum weighted Gini index for each group + gini = 0.0 for group in groups: size = float(len(group)) - # avoid divide by zero + if size == 0: continue score = 0.0 - # score the group based on the score for each class + for class_val in classes: p = [row[-1] for row in group].count(class_val) / size score += p * p - # weight the group score by its relative size + gini += (1.0 - score) * (size / n_instances) return gini -# Select the best split point for a dataset + def get_split(dataset): class_values = list(set(row[-1] for row in dataset)) b_index, b_value, b_score, b_groups = 999, 999, 999, None @@ -40,43 +40,43 @@ def get_split(dataset): b_index, b_value, b_score, b_groups = index, row[index], gini, groups return {'index':b_index, 'value':b_value, 'groups':b_groups} -# Create a terminal node value + def to_terminal(group): outcomes = [row[-1] for row in group] return max(set(outcomes), key=outcomes.count) -# Create child splits for a node or make terminal + def split(node, max_depth, min_size, depth): left, right = node['groups'] del(node['groups']) - # check for a no split + if not left or not right: node['left'] = node['right'] = to_terminal(left + right) return - # check for max depth + if depth >= max_depth: node['left'], node['right'] = to_terminal(left), to_terminal(right) return - # process left child + if len(left) <= min_size: node['left'] = to_terminal(left) else: node['left'] = get_split(left) split(node['left'], max_depth, min_size, depth+1) - # process right child + if len(right) <= min_size: node['right'] = to_terminal(right) else: node['right'] = get_split(right) split(node['right'], max_depth, min_size, depth+1) -# Build a decision tree + def build_tree(train, max_depth, min_size): root = get_split(train) split(root, max_depth, min_size, 1) return root -# Print a decision tree + def print_tree(node, depth=0): if isinstance(node, dict): print('%s[X%d < %.3f]' % ((depth*' ', (node['index']+1), node['value'])))